rebase on timer query patch

Chad Brokaw 2022-05-09 22:39:59 -04:00
parent ba7f85731c
commit c749addf6c
45 changed files with 887 additions and 494 deletions

Cargo.lock generated
View file

@ -925,6 +925,7 @@ dependencies = [
"block",
"bytemuck",
"cocoa-foundation",
"foreign-types",
"metal",
"objc",
"raw-window-handle 0.3.4",

View file

@ -28,3 +28,4 @@ metal = "0.22"
objc = "0.2.5"
block = "0.1.6"
cocoa-foundation = "0.1"
foreign-types = "0.3.2"

View file

@ -1,4 +1,4 @@
use piet_gpu_hal::{include_shader, BindType};
use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor};
use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};
fn main() {
@ -20,9 +20,9 @@ fn main() {
let mut cmd_buf = session.cmd_buf().unwrap();
cmd_buf.begin();
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
cmd_buf.write_timestamp(&query_pool, 1);
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
pass.end();
cmd_buf.finish_timestamps(&query_pool);
cmd_buf.host_barrier();
cmd_buf.finish();

View file

@ -17,7 +17,8 @@
//! The generic trait for backends to implement.
use crate::{
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams,
BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
MapMode, SamplerParams,
};
pub trait Device: Sized {
@ -159,14 +160,32 @@ pub trait Device: Sized {
unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
}
/// The trait implemented by backend command buffer implementations.
///
/// Valid encoding is represented by a state machine (currently not validated
/// but it is easy to imagine there might be at least debug validation). Most
/// methods are only valid in a particular state, and some move it to another
/// state.
pub trait CmdBuf<D: Device> {
/// Begin encoding.
///
/// State: init -> ready
unsafe fn begin(&mut self);
/// State: ready -> finished
unsafe fn finish(&mut self);
/// Return true if the command buffer is suitable for reuse.
unsafe fn reset(&mut self) -> bool;
/// Begin a compute pass.
///
/// State: ready -> in_compute_pass
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor);
/// Dispatch
///
/// State: in_compute_pass
unsafe fn dispatch(
&mut self,
pipeline: &D::Pipeline,
@ -175,6 +194,9 @@ pub trait CmdBuf<D: Device> {
workgroup_size: (u32, u32, u32),
);
/// State: in_compute_pass -> ready
unsafe fn end_compute_pass(&mut self);
/// Insert an execution and memory barrier.
///
/// Compute kernels (and other actions) after this barrier may read from buffers
@ -202,16 +224,16 @@ pub trait CmdBuf<D: Device> {
/// This is readily supported in Vulkan, but for portability it is remarkably
/// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
/// kernel, or organize the code not to need it.
unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option<u64>);
unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer);
unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);
unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer);
unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);
unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image);
// low portability, dx12 doesn't support it natively
unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);
unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image);
/// Reset the query pool.
///
@ -227,7 +249,7 @@ pub trait CmdBuf<D: Device> {
unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}
/// Begin a labeled section for debugging and profiling purposes.
unsafe fn begin_debug_label(&mut self, label: &str) {}
unsafe fn begin_debug_label(&mut self, _label: &str) {}
/// End a section opened by `begin_debug_label`.
unsafe fn end_debug_label(&mut self) {}
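The state machine documented on the trait maps directly onto the call order a backend sees. A minimal sketch, assuming `cb` is some concrete backend `CmdBuf` and that a pipeline, descriptor set, and pass descriptor already exist (the names here are illustrative, not part of this commit):

unsafe {
    cb.begin();                                   // init -> ready
    cb.begin_compute_pass(&desc);                 // ready -> in_compute_pass
    cb.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
    cb.end_compute_pass();                        // in_compute_pass -> ready
    cb.finish();                                  // ready -> finished
}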

View file

@ -21,7 +21,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use smallvec::SmallVec;
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat};
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
use self::{
descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
@ -76,6 +76,7 @@ pub struct CmdBuf {
c: wrappers::GraphicsCommandList,
allocator: CommandAllocator,
needs_reset: bool,
end_query: Option<(wrappers::QueryHeap, u32)>,
}
pub struct Pipeline {
@ -360,6 +361,7 @@ impl crate::backend::Device for Dx12Device {
c,
allocator,
needs_reset: false,
end_query: None,
})
}
}
@ -388,11 +390,10 @@ impl crate::backend::Device for Dx12Device {
let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
let ts0 = buf[0];
let tsp = (self.ts_freq as f64).recip();
let result = buf[1..]
let result = buf
.iter()
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
.map(|ts| *ts as f64 * tsp)
.collect();
Ok(result)
}
@ -610,6 +611,16 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
}
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
if let Some((pool, start, end)) = &desc.timer_queries {
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Dx12(pool) = pool {
self.write_timestamp(pool, *start);
self.end_query = Some((pool.heap.clone(), *end));
}
}
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
@ -628,6 +639,12 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
.dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
}
unsafe fn end_compute_pass(&mut self) {
if let Some((heap, end)) = self.end_query.take() {
self.c.end_timing_query(&heap, end);
}
}
unsafe fn memory_barrier(&mut self) {
// See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
// The "proper" way to do this would be to name the actual buffers participating
@ -666,7 +683,7 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
self.memory_barrier();
}
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
let (gpu_ref, heap) = buffer
.gpu_ref
@ -684,23 +701,23 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
);
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
// TODO: consider using copy_resource here (if sizes match)
let size = src.size.min(dst.size);
self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
}
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
self.c
.copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
}
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
self.c
.copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
}
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
self.c.copy_resource(&src.resource, &dst.resource);
}

View file

@ -79,7 +79,6 @@ pub struct Blob(pub ComPtr<d3dcommon::ID3DBlob>);
#[derive(Clone)]
pub struct ShaderByteCode {
pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
blob: Option<Blob>,
}
#[derive(Clone)]
@ -741,7 +740,6 @@ impl ShaderByteCode {
BytecodeLength: blob.0.GetBufferSize(),
pShaderBytecode: blob.0.GetBufferPointer(),
},
blob: Some(blob),
}
}
@ -810,7 +808,6 @@ impl ShaderByteCode {
BytecodeLength: bytecode.len(),
pShaderBytecode: bytecode.as_ptr() as *const _,
},
blob: None,
}
}
}

View file

@ -13,7 +13,7 @@ use std::sync::{Arc, Mutex, Weak};
use bytemuck::Pod;
use smallvec::SmallVec;
use crate::{mux, BackendType, BufWrite, ImageFormat, MapMode};
use crate::{mux, BackendType, BufWrite, ComputePassDescriptor, ImageFormat, MapMode};
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};
@ -135,6 +135,11 @@ pub struct BufReadGuard<'a> {
size: u64,
}
/// A sub-object of a command buffer for a sequence of compute dispatches.
pub struct ComputePass<'a> {
cmd_buf: &'a mut CmdBuf,
}
impl Session {
/// Create a new session, choosing the best backend.
pub fn new(device: mux::Device) -> Session {
@ -370,8 +375,17 @@ impl Session {
///
/// This should be called after waiting on the command buffer that wrote the
/// timer queries.
///
/// The returned vector is one shorter than the number of timer queries in the
/// pool; the first value is subtracted off. It would likely be better to return
/// the raw timestamps, but that change should be made consistently.
pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
self.0.device.fetch_query_pool(pool)
let result = self.0.device.fetch_query_pool(pool)?;
// Subtract off first timestamp.
Ok(result[1..]
.iter()
.map(|ts| *ts as f64 - result[0])
.collect())
}
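In other words, the hub subtracts the first raw timestamp before returning. A tiny illustration with made-up values (not part of the API): raw readings of 10.0, 10.5, and 11.25 seconds come back as 0.5 and 1.25.

// Hypothetical helper mirroring the subtraction above, for illustration only.
fn relative_to_first(raw: &[f64]) -> Vec<f64> {
    raw[1..].iter().map(|ts| ts - raw[0]).collect()
}
assert_eq!(relative_to_first(&[10.0, 10.5, 11.25]), vec![0.5, 1.25]);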
#[doc(hidden)]
@ -471,23 +485,10 @@ impl CmdBuf {
self.cmd_buf().finish();
}
/// Dispatch a compute shader.
///
/// Request a compute shader to be run, using the pipeline to specify the
/// code, and the descriptor set to address the resources read and written.
///
/// Both the workgroup count (number of workgroups) and the workgroup size
/// (number of threads in a workgroup) must be specified here, though not
/// all back-ends require the latter info.
pub unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
descriptor_set: &DescriptorSet,
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
self.cmd_buf()
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
/// Begin a compute pass.
pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) -> ComputePass {
self.cmd_buf().begin_compute_pass(desc);
ComputePass { cmd_buf: self }
}
/// Insert an execution and memory barrier.
@ -582,13 +583,6 @@ impl CmdBuf {
self.cmd_buf().reset_query_pool(pool);
}
/// Write a timestamp.
///
/// The query index must be less than the size of the query pool on creation.
pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
self.cmd_buf().write_timestamp(pool, query);
}
/// Prepare the timestamps for reading. This isn't required on Vulkan but
/// is required on (at least) DX12.
///
@ -692,6 +686,51 @@ impl Drop for SubmittedCmdBuf {
}
}
impl<'a> ComputePass<'a> {
/// Dispatch a compute shader.
///
/// Request a compute shader to be run, using the pipeline to specify the
/// code, and the descriptor set to address the resources read and written.
///
/// Both the workgroup count (number of workgroups) and the workgroup size
/// (number of threads in a workgroup) must be specified here, though not
/// all back-ends require the latter info.
pub unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
descriptor_set: &DescriptorSet,
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
self.cmd_buf
.cmd_buf()
.dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
}
/// Add a memory barrier.
///
/// Inserts a memory barrier in the compute encoder. This is a convenience
/// function for calling the same function on the underlying command buffer,
/// avoiding borrow check issues.
pub unsafe fn memory_barrier(&mut self) {
self.cmd_buf.memory_barrier();
}
/// Begin a labeled section for debugging and profiling purposes.
pub unsafe fn begin_debug_label(&mut self, label: &str) {
self.cmd_buf.begin_debug_label(label);
}
/// End a section opened by `begin_debug_label`.
pub unsafe fn end_debug_label(&mut self) {
self.cmd_buf.end_debug_label();
}
pub unsafe fn end(self) {
self.cmd_buf.cmd_buf().end_compute_pass();
}
}
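Putting the pieces together at the hub level, recording a timed pass follows the same shape as the example change at the top of this commit. A minimal sketch, assuming a `session`, `pipeline`, `descriptor_set`, and `query_pool` are already set up:

let mut cmd_buf = session.cmd_buf()?;
unsafe {
    cmd_buf.begin();
    cmd_buf.reset_query_pool(&query_pool);
    // Queries 0 and 1 bracket the pass.
    let mut pass =
        cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
    pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
    pass.end();
    cmd_buf.finish_timestamps(&query_pool);
    cmd_buf.host_barrier();
    cmd_buf.finish();
}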
impl Drop for BufferInner {
fn drop(&mut self) {
if let Some(session) = Weak::upgrade(&self.session) {

View file

@ -21,8 +21,8 @@ pub use crate::mux::{
};
pub use bufwrite::BufWrite;
pub use hub::{
BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
Session, SubmittedCmdBuf,
BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
RetainResource, Session, SubmittedCmdBuf,
};
// TODO: because these are conditionally included, "cargo fmt" does not
@ -189,3 +189,23 @@ pub struct WorkgroupLimits {
/// dimension.
pub max_invocations: u32,
}
/// Options for creating a compute pass.
#[derive(Default)]
pub struct ComputePassDescriptor<'a> {
// Maybe label should go here? It does in wgpu and wgpu_hal.
/// Timer query parameters.
///
/// To record timer queries for a compute pass, set the query pool, start
/// query index, and end query index here. The indices must be less than
/// the size of the query pool.
timer_queries: Option<(&'a QueryPool, u32, u32)>,
}
impl<'a> ComputePassDescriptor<'a> {
pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor {
ComputePassDescriptor {
timer_queries: Some((pool, start_query, end_query)),
}
}
}
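Since the descriptor derives `Default`, a pass that records no timestamps can be opened with a default descriptor, while a timed pass supplies the pool and the two query indices. A short sketch (illustrative use, assuming a `query_pool` exists):

// Untimed pass:
let plain = ComputePassDescriptor::default();
// Timed pass: the start query is written when the pass begins,
// the end query when it ends.
let timed = ComputePassDescriptor::timer(&query_pool, 0, 1);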

View file

@ -15,25 +15,32 @@
// Also licensed under MIT license, at your choice.
mod clear;
mod timer;
mod util;
use std::mem;
use std::sync::{Arc, Mutex};
use block::Block;
use cocoa_foundation::base::id;
use cocoa_foundation::foundation::{NSInteger, NSUInteger};
use foreign_types::ForeignType;
use objc::rc::autoreleasepool;
use objc::runtime::{Object, BOOL, YES};
use objc::{class, msg_send, sel, sel_impl};
use metal::{CGFloat, MTLFeatureSet};
use metal::{CGFloat, CommandBufferRef, MTLFeatureSet};
use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits};
use crate::{
BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits,
};
use util::*;
use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration};
pub struct MtlInstance;
pub struct MtlDevice {
@ -41,6 +48,18 @@ pub struct MtlDevice {
cmd_queue: Arc<Mutex<metal::CommandQueue>>,
gpu_info: GpuInfo,
helpers: Arc<Helpers>,
timer_set: Option<CounterSet>,
counter_style: CounterStyle,
}
/// Type of counter sampling.
///
/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum CounterStyle {
None,
Stage,
Command,
}
pub struct MtlSurface {
@ -81,9 +100,22 @@ pub struct Semaphore;
pub struct CmdBuf {
cmd_buf: metal::CommandBuffer,
helpers: Arc<Helpers>,
cur_encoder: Encoder,
time_calibration: Arc<Mutex<TimeCalibration>>,
counter_style: CounterStyle,
}
pub struct QueryPool;
enum Encoder {
None,
Compute(metal::ComputeCommandEncoder, Option<(id, u32)>),
Blit(metal::BlitCommandEncoder),
}
#[derive(Default)]
pub struct QueryPool {
counter_sample_buf: Option<CounterSampleBuffer>,
calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>,
}
pub struct Pipeline(metal::ComputePipelineState);
@ -209,18 +241,43 @@ impl MtlDevice {
let helpers = Arc::new(Helpers {
clear_pipeline: clear::make_clear_pipeline(&device),
});
// Timer stuff
let timer_set = CounterSet::get_timer_counter_set(&device);
let counter_style = if timer_set.is_some() {
if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) {
CounterStyle::Stage
} else if device
.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
{
CounterStyle::Command
} else {
CounterStyle::None
}
} else {
CounterStyle::None
};
MtlDevice {
device,
cmd_queue: Arc::new(Mutex::new(cmd_queue)),
gpu_info,
helpers,
timer_set,
counter_style,
}
}
pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
let cmd_buf = raw_cmd_buf;
let helpers = self.helpers.clone();
CmdBuf { cmd_buf, helpers }
let cur_encoder = Encoder::None;
let time_calibration = Default::default();
CmdBuf {
cmd_buf,
helpers,
cur_encoder,
time_calibration,
counter_style: self.counter_style,
}
}
pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
@ -330,11 +387,35 @@ impl crate::backend::Device for MtlDevice {
fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
let cmd_queue = self.cmd_queue.lock().unwrap();
// A discussion about autorelease pools.
//
// Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
// you can have any two of correctness, ergonomics, and performance. Here we've
// chosen the first two, using the pattern of a fine grained autorelease pool
// to give the Obj-C object Rust-like lifetime semantics whenever objects are
// created as autorelease (by convention, this is any object creation with an
// Obj-C method name that doesn't begin with "new" or "alloc").
//
// To gain back some of the performance, we'd need a way to wrap an autorelease
// pool around a larger chunk of work - that could be one frame of rendering,
// but for tests that iterate a number of command buffer submissions, it would
// need to wrap that whole loop.
//
// In any case, this way, the caller doesn't need to worry, and the performance
// hit might not be so bad (perhaps we should measure).
// consider new_command_buffer_with_unretained_references for performance
let cmd_buf = cmd_queue.new_command_buffer();
let cmd_buf = autoreleasepool(|| cmd_buf.to_owned());
let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
let helpers = self.helpers.clone();
Ok(CmdBuf { cmd_buf, helpers })
let cur_encoder = Encoder::None;
let time_calibration = Default::default();
Ok(CmdBuf {
cmd_buf,
helpers,
cur_encoder,
time_calibration,
counter_style: self.counter_style,
})
}
unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
@ -342,12 +423,31 @@ impl crate::backend::Device for MtlDevice {
}
fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
// TODO
Ok(QueryPool)
if let Some(timer_set) = &self.timer_set {
let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
.ok_or("error creating timer query pool")?;
return Ok(QueryPool {
counter_sample_buf: Some(pool),
calibration: Default::default(),
});
}
Ok(QueryPool::default())
}
unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
// TODO
if let Some(raw) = &pool.counter_sample_buf {
let resolved = raw.resolve();
let calibration = pool.calibration.lock().unwrap();
if let Some(calibration) = &*calibration {
let calibration = calibration.lock().unwrap();
let result = resolved
.iter()
.map(|time_ns| calibration.correlate(*time_ns))
.collect();
return Ok(result);
}
}
// Maybe should return None indicating it wasn't successful? But that might break.
Ok(Vec::new())
}
@ -358,7 +458,37 @@ impl crate::backend::Device for MtlDevice {
_signal_semaphores: &[&Self::Semaphore],
fence: Option<&mut Self::Fence>,
) -> Result<(), Error> {
unsafe fn add_scheduled_handler(
cmd_buf: &metal::CommandBufferRef,
block: &Block<(&CommandBufferRef,), ()>,
) {
msg_send![cmd_buf, addScheduledHandler: block]
}
for cmd_buf in cmd_bufs {
let time_calibration = cmd_buf.time_calibration.clone();
let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
let device: id = msg_send![buffer, device];
let mut time_calibration = time_calibration.lock().unwrap();
let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _;
let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
// TODO: only do this if supported.
let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
})
.copy();
add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
let time_calibration = cmd_buf.time_calibration.clone();
let completed_block =
block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
let device: id = msg_send![buffer, device];
let mut time_calibration = time_calibration.lock().unwrap();
let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _;
let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _;
// TODO: only do this if supported.
let () =
msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
})
.copy();
cmd_buf.cmd_buf.add_completed_handler(&completed_block);
cmd_buf.cmd_buf.commit();
}
if let Some(last_cmd_buf) = cmd_bufs.last() {
@ -439,12 +569,70 @@ impl crate::backend::Device for MtlDevice {
impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
unsafe fn begin(&mut self) {}
unsafe fn finish(&mut self) {}
unsafe fn finish(&mut self) {
self.flush_encoder();
}
unsafe fn reset(&mut self) -> bool {
false
}
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
// TODO: we might want to get better about validation but the following
// assert is likely to trigger, and also a case can be made that
// validation should be done at the hub level, for consistency.
//debug_assert!(matches!(self.cur_encoder, Encoder::None));
self.flush_encoder();
autoreleasepool(|| {
let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) {
(Some(queries), CounterStyle::Stage) => {
let descriptor: id =
msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
let attachments: id = msg_send![descriptor, sampleBufferAttachments];
let index: NSUInteger = 0;
let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
// Here we break the hub/mux separation a bit, for expedience
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
if let Some(sample_buf) = &query_pool.counter_sample_buf {
let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
}
}
let start_index = queries.1 as NSUInteger;
let end_index = queries.2 as NSInteger;
let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
(
msg_send![
self.cmd_buf,
computeCommandEncoderWithDescriptor: descriptor
],
None,
)
}
(Some(queries), CounterStyle::Command) => {
let encoder = self.cmd_buf.new_compute_command_encoder();
#[allow(irrefutable_let_patterns)]
let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
if let Some(sample_buf) = &query_pool.counter_sample_buf {
let sample_index = queries.1 as NSUInteger;
let sample_buf = sample_buf.id();
let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
Some((sample_buf, queries.2))
} else {
None
}
} else {
None
};
(encoder, end_query)
}
_ => (self.cmd_buf.new_compute_command_encoder(), None),
};
self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query);
});
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
@ -452,7 +640,7 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32),
) {
let encoder = self.cmd_buf.new_compute_command_encoder();
let encoder = self.compute_command_encoder();
encoder.set_compute_pipeline_state(&pipeline.0);
let mut buf_ix = 0;
for buffer in &descriptor_set.buffers {
@ -475,7 +663,11 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
depth: workgroup_size.2 as u64,
};
encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
encoder.end_encoding();
}
unsafe fn end_compute_pass(&mut self) {
// TODO: might validate that we are in a compute encoder state
self.flush_encoder();
}
unsafe fn memory_barrier(&mut self) {
@ -494,22 +686,23 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
// I think these are being tracked.
}
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let size = size.unwrap_or(buffer.size);
let encoder = self.cmd_buf.new_compute_command_encoder();
clear::encode_clear(&encoder, &self.helpers.clear_pipeline, &buffer.buffer, size);
encoder.end_encoding()
let _ = self.compute_command_encoder();
// Getting this directly is a workaround for a borrow checker issue.
if let Encoder::Compute(e, _) = &self.cur_encoder {
clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
}
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
let encoder = self.blit_command_encoder();
let size = src.size.min(dst.size);
encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size);
encoder.end_encoding();
}
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
let encoder = self.blit_command_encoder();
assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4);
let bytes_per_row = (src.width * 4) as NSUInteger;
let src_size = metal::MTLSize {
@ -530,11 +723,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
bytes_per_row * src.height as NSUInteger,
metal::MTLBlitOption::empty(),
);
encoder.end_encoding();
}
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
let encoder = self.blit_command_encoder();
assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4);
let bytes_per_row = (dst.width * 4) as NSUInteger;
let src_size = metal::MTLSize {
@ -555,11 +747,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
origin,
metal::MTLBlitOption::empty(),
);
encoder.end_encoding();
}
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
let encoder = self.cmd_buf.new_blit_command_encoder();
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
let encoder = self.blit_command_encoder();
let src_size = metal::MTLSize {
width: src.width.min(dst.width) as NSUInteger,
height: src.width.min(dst.height) as NSUInteger,
@ -577,15 +768,79 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
0,
origin,
);
encoder.end_encoding();
}
unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {}
unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
let mut calibration = pool.calibration.lock().unwrap();
*calibration = Some(self.time_calibration.clone());
}
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
// TODO
// This is really a PITA because it's pretty different from Vulkan.
// See https://developer.apple.com/documentation/metal/counter_sampling
if let Some(buf) = &pool.counter_sample_buf {
if matches!(self.cur_encoder, Encoder::None) {
self.cur_encoder =
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
}
let sample_index = query as NSUInteger;
if self.counter_style == CounterStyle::Command {
match &self.cur_encoder {
Encoder::Compute(e, _) => {
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
}
Encoder::None => unreachable!(),
_ => todo!(),
}
} else if self.counter_style == CounterStyle::Stage {
match &self.cur_encoder {
Encoder::Compute(_e, _) => {
println!("write_timestamp is not supported for stage-style encoders");
}
_ => (),
}
}
}
}
}
impl CmdBuf {
fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
if !matches!(self.cur_encoder, Encoder::Compute(..)) {
self.flush_encoder();
self.cur_encoder =
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
}
if let Encoder::Compute(e, _) = &self.cur_encoder {
e
} else {
unreachable!()
}
}
fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder {
if !matches!(self.cur_encoder, Encoder::Blit(_)) {
self.flush_encoder();
self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned());
}
if let Encoder::Blit(e) = &self.cur_encoder {
e
} else {
unreachable!()
}
}
fn flush_encoder(&mut self) {
match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
Encoder::Compute(e, Some((sample_buf, end_query))) => {
let sample_index = end_query as NSUInteger;
unsafe {
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
}
e.end_encoding();
}
Encoder::Compute(e, None) => e.end_encoding(),
Encoder::Blit(e) => e.end_encoding(),
Encoder::None => (),
}
}
}

View file

@ -0,0 +1,172 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
//! Support for timer queries.
//!
//! Likely some of this should be upstreamed into metal-rs.
use std::{ffi::CStr, ptr::null_mut};
use cocoa_foundation::{
base::id,
foundation::{NSRange, NSUInteger},
};
use metal::{DeviceRef, MTLStorageMode};
use objc::{class, msg_send, sel, sel_impl};
pub struct CounterSampleBuffer {
id: id,
count: u64,
}
pub struct CounterSet {
id: id,
}
#[derive(Default)]
pub struct TimeCalibration {
pub cpu_start_ts: u64,
pub gpu_start_ts: u64,
pub cpu_end_ts: u64,
pub gpu_end_ts: u64,
}
impl Drop for CounterSampleBuffer {
fn drop(&mut self) {
unsafe { msg_send![self.id, release] }
}
}
impl Clone for CounterSampleBuffer {
fn clone(&self) -> CounterSampleBuffer {
unsafe {
CounterSampleBuffer {
id: msg_send![self.id, retain],
count: self.count,
}
}
}
}
impl CounterSampleBuffer {
pub fn id(&self) -> id {
self.id
}
}
impl Drop for CounterSet {
fn drop(&mut self) {
unsafe { msg_send![self.id, release] }
}
}
impl CounterSet {
pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
unsafe {
// TODO: version check
let sets: id = msg_send!(device, counterSets);
let count: NSUInteger = msg_send![sets, count];
for i in 0..count {
let set: id = msg_send![sets, objectAtIndex: i];
let name: id = msg_send![set, name];
let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]);
if name_cstr.to_bytes() == b"timestamp" {
return Some(CounterSet { id: set });
}
}
None
}
}
}
// copied from metal-rs; should be in common utilities maybe?
fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
let bytes = unsafe {
let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
bytes as *const u8
};
let len: NSUInteger = unsafe { msg_send![nsstr, length] };
unsafe {
let bytes = std::slice::from_raw_parts(bytes, len as usize);
std::str::from_utf8(bytes).unwrap()
}
}
impl CounterSampleBuffer {
pub fn new(
device: &DeviceRef,
count: u64,
counter_set: &CounterSet,
) -> Option<CounterSampleBuffer> {
unsafe {
let desc_cls = class!(MTLCounterSampleBufferDescriptor);
let descriptor: id = msg_send![desc_cls, alloc];
let _: id = msg_send![descriptor, init];
let count = count as NSUInteger;
let () = msg_send![descriptor, setSampleCount: count];
let () = msg_send![descriptor, setCounterSet: counter_set.id];
let () = msg_send![
descriptor,
setStorageMode: MTLStorageMode::Shared as NSUInteger
];
let mut error: id = null_mut();
let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
let () = msg_send![descriptor, release];
if !error.is_null() {
let description = msg_send![error, localizedDescription];
println!(
"error allocating sample buffer, code = {}",
nsstring_as_str(description)
);
let () = msg_send![error, release];
return None;
}
Some(CounterSampleBuffer { id: buf, count })
}
}
// Read the timestamps.
//
// Safety: the lifetime of the returned slice is wrong, it's actually autoreleased.
pub unsafe fn resolve(&self) -> &[u64] {
let range = NSRange::new(0, self.count);
let data: id = msg_send![self.id, resolveCounterRange: range];
if data.is_null() {
&[]
} else {
let bytes: *const u64 = msg_send![data, bytes];
std::slice::from_raw_parts(bytes, self.count as usize)
}
}
}
impl TimeCalibration {
/// Convert GPU timestamp into CPU time base.
///
/// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps
pub fn correlate(&self, raw_ts: u64) -> f64 {
let delta_cpu = self.cpu_end_ts - self.cpu_start_ts;
let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
let adj_ts = if delta_gpu > 0 {
let scale = delta_cpu as f64 / delta_gpu as f64;
self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
} else {
// Default is ns on Apple Silicon; on other hardware this will be wrong
raw_ts as f64
};
adj_ts * 1e-9
}
}
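A small worked example of the interpolation, with made-up calibration values: here the GPU counter ticks twice per nanosecond of CPU time, so a raw GPU timestamp halfway through the calibration span maps to the midpoint of the CPU span and is then scaled from nanoseconds to seconds.

let cal = TimeCalibration {
    cpu_start_ts: 1_000_000,
    gpu_start_ts: 0,
    cpu_end_ts: 2_000_000,
    gpu_end_ts: 2_000_000,
};
// 1_000_000 GPU ticks -> 1_500_000 ns of CPU time -> 1.5e-3 s
assert!((cal.correlate(1_000_000) - 1.5e-3).abs() < 1e-12);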

View file

@ -35,6 +35,7 @@ use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
use crate::backend::Device as DeviceTrait;
use crate::BackendType;
use crate::BindType;
use crate::ComputePassDescriptor;
use crate::ImageFormat;
use crate::MapMode;
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};
@ -658,6 +659,14 @@ impl CmdBuf {
}
}
pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
mux_match! { self;
CmdBuf::Vk(c) => c.begin_compute_pass(desc),
CmdBuf::Dx12(c) => c.begin_compute_pass(desc),
CmdBuf::Mtl(c) => c.begin_compute_pass(desc),
}
}
/// Dispatch a compute shader.
///
/// Note that both the number of workgroups (`workgroup_count`) and the number of
@ -680,6 +689,14 @@ impl CmdBuf {
}
}
pub unsafe fn end_compute_pass(&mut self) {
mux_match! { self;
CmdBuf::Vk(c) => c.end_compute_pass(),
CmdBuf::Dx12(c) => c.end_compute_pass(),
CmdBuf::Mtl(c) => c.end_compute_pass(),
}
}
pub unsafe fn memory_barrier(&mut self) {
mux_match! { self;
CmdBuf::Vk(c) => c.memory_barrier(),

View file

@ -15,7 +15,7 @@ use smallvec::SmallVec;
use crate::backend::Device as DeviceTrait;
use crate::{
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
WorkgroupLimits,
WorkgroupLimits, ComputePassDescriptor,
};
pub struct VkInstance {
@ -92,6 +92,7 @@ pub struct CmdBuf {
cmd_buf: vk::CommandBuffer,
cmd_pool: vk::CommandPool,
device: Arc<RawDevice>,
end_query: Option<(vk::QueryPool, u32)>,
}
pub struct QueryPool {
@ -738,6 +739,7 @@ impl crate::backend::Device for VkDevice {
cmd_buf,
cmd_pool,
device: self.device.clone(),
end_query: None,
})
}
}
@ -770,11 +772,10 @@ impl crate::backend::Device for VkDevice {
// results (Windows 10, AMD 5700 XT).
let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
let ts0 = buf[0];
let tsp = self.timestamp_period as f64 * 1e-9;
let result = buf[1..]
let result = buf
.iter()
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp)
.map(|ts| *ts as f64 * tsp)
.collect();
Ok(result)
}
@ -902,6 +903,16 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
true
}
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
if let Some((pool, start, end)) = &desc.timer_queries {
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Vk(pool) = pool {
self.write_timestamp_raw(pool.pool, *start);
self.end_query = Some((pool.pool, *end));
}
}
}
unsafe fn dispatch(
&mut self,
pipeline: &Pipeline,
@ -931,6 +942,12 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn end_compute_pass(&mut self) {
if let Some((pool, end)) = self.end_query.take() {
self.write_timestamp_raw(pool, end);
}
}
/// Insert a pipeline barrier for all memory accesses.
unsafe fn memory_barrier(&mut self) {
let device = &self.device.device;
@ -995,13 +1012,13 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) {
unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let device = &self.device.device;
let size = size.unwrap_or(vk::WHOLE_SIZE);
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
}
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) {
unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
let device = &self.device.device;
let size = src.size.min(dst.size);
device.cmd_copy_buffer(
@ -1012,7 +1029,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) {
unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
let device = &self.device.device;
device.cmd_copy_image_to_buffer(
self.cmd_buf,
@ -1035,7 +1052,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) {
unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
let device = &self.device.device;
device.cmd_copy_buffer_to_image(
self.cmd_buf,
@ -1058,7 +1075,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
);
}
unsafe fn blit_image(&self, src: &Image, dst: &Image) {
unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
let device = &self.device.device;
device.cmd_blit_image(
self.cmd_buf,
@ -1106,13 +1123,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
}
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
let device = &self.device.device;
device.cmd_write_timestamp(
self.cmd_buf,
vk::PipelineStageFlags::COMPUTE_SHADER,
pool.pool,
query,
);
self.write_timestamp_raw(pool.pool, query);
}
unsafe fn begin_debug_label(&mut self, label: &str) {
@ -1130,6 +1141,18 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
}
}
impl CmdBuf {
unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
let device = &self.device.device;
device.cmd_write_timestamp(
self.cmd_buf,
vk::PipelineStageFlags::COMPUTE_SHADER,
pool,
query,
);
}
}
impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
fn add_buffers(&mut self, buffers: &[&Buffer]) {
self.buffers.extend(buffers.iter().map(|b| b.buffer));

View file

@ -6,7 +6,7 @@ use clap::{App, Arg};
use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session};
use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
const WIDTH: usize = 2048;
const HEIGHT: usize = 1536;
@ -243,7 +243,11 @@ fn main() -> Result<(), Error> {
if matches.is_present("flip") {
scale = -scale;
}
test_scenes::render_svg(&mut ctx, input, scale);
let xml_str = std::fs::read_to_string(input).unwrap();
let start = std::time::Instant::now();
let svg = PicoSvg::load(&xml_str, scale).unwrap();
println!("parsing time: {:?}", start.elapsed());
test_scenes::render_svg(&mut ctx, &svg);
} else {
test_scenes::render_scene(&mut ctx);
}

View file

@ -2,7 +2,7 @@ use piet::kurbo::Point;
use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
use clap::{App, Arg};
@ -29,6 +29,25 @@ fn main() -> Result<(), Error> {
)
.get_matches();
// Collect SVG if input
let svg = match matches.value_of("INPUT") {
Some(file) => {
let mut scale = matches
.value_of("scale")
.map(|scale| scale.parse().unwrap())
.unwrap_or(8.0);
if matches.is_present("flip") {
scale = -scale;
}
let xml_str = std::fs::read_to_string(file).unwrap();
let start = std::time::Instant::now();
let svg = PicoSvg::load(&xml_str, scale).unwrap();
println!("parsing time: {:?}", start.elapsed());
Some(svg)
}
None => None,
};
let event_loop = EventLoop::new();
let window = WindowBuilder::new()
.with_inner_size(winit::dpi::LogicalSize {
@ -51,7 +70,7 @@ fn main() -> Result<(), Error> {
.map(|_| session.create_semaphore())
.collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(8))
.map(|_| session.create_query_pool(12))
.collect::<Result<Vec<_>, Error>>()?;
let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
@ -93,29 +112,23 @@ fn main() -> Result<(), Error> {
if !ts.is_empty() {
info_string = format!(
"{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
ts[6] * 1e3,
ts[10] * 1e3,
ts[0] * 1e3,
(ts[1] - ts[0]) * 1e3,
(ts[2] - ts[1]) * 1e3,
(ts[3] - ts[2]) * 1e3,
(ts[4] - ts[3]) * 1e3,
(ts[5] - ts[4]) * 1e3,
(ts[6] - ts[5]) * 1e3,
(ts[8] - ts[7]) * 1e3,
(ts[10] - ts[9]) * 1e3,
);
}
}
let mut ctx = PietGpuRenderContext::new();
if let Some(input) = matches.value_of("INPUT") {
let mut scale = matches
.value_of("scale")
.map(|scale| scale.parse().unwrap())
.unwrap_or(8.0);
if matches.is_present("flip") {
scale = -scale;
}
test_scenes::render_svg(&mut ctx, input, scale);
} else {
let test_blend = false;
if let Some(svg) = &svg {
test_scenes::render_svg(&mut ctx, svg);
} else if test_blend {
use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
let blends = [
Blend::new(Normal, SrcOver),
@ -151,6 +164,8 @@ fn main() -> Result<(), Error> {
let blend = blends[mode % blends.len()];
test_scenes::render_blend_test(&mut ctx, current_frame, blend);
info_string = format!("{:?}", blend);
} else {
test_scenes::render_anim_frame(&mut ctx, current_frame);
}
render_info_string(&mut ctx, &info_string);
if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {

View file

@ -306,7 +306,7 @@ void main() {
is_blend = (blend != BlendComp_default);
}
include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
|| (is_clip && is_blend);
|| is_blend;
}
if (include_tile) {
uint el_slice = el_ix / 32;

Binary file not shown.

Binary file not shown.

View file

@ -931,23 +931,14 @@ void comp_main()
{
_1701 = _1692;
}
bool _1708;
if (!_1701)
{
_1708 = is_clip && is_blend;
}
else
{
_1708 = _1701;
}
include_tile = _1708;
include_tile = _1701 || is_blend;
}
if (include_tile)
{
uint el_slice = el_ix / 32u;
uint el_mask = 1u << (el_ix & 31u);
uint _1728;
InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1728);
uint _1723;
InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
}
}
GroupMemoryBarrierWithGroupSync();
@ -976,9 +967,9 @@ void comp_main()
{
uint param_25 = element_ref_ix;
bool param_26 = mem_ok;
TileRef _1805 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
Alloc param_27 = read_tile_alloc(param_25, param_26);
TileRef param_28 = _1805;
TileRef param_28 = _1800;
Tile tile_1 = Tile_read(param_27, param_28);
uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
@ -993,11 +984,11 @@ void comp_main()
Alloc param_29 = cmd_alloc;
CmdRef param_30 = cmd_ref;
uint param_31 = cmd_limit;
bool _1853 = alloc_cmd(param_29, param_30, param_31);
bool _1848 = alloc_cmd(param_29, param_30, param_31);
cmd_alloc = param_29;
cmd_ref = param_30;
cmd_limit = param_31;
if (!_1853)
if (!_1848)
{
break;
}
@ -1008,10 +999,10 @@ void comp_main()
write_fill(param_32, param_33, param_34, param_35);
cmd_ref = param_33;
uint rgba = _1372.Load(dd_1 * 4 + 0);
CmdColor _1876 = { rgba };
CmdColor _1871 = { rgba };
Alloc param_36 = cmd_alloc;
CmdRef param_37 = cmd_ref;
CmdColor param_38 = _1876;
CmdColor param_38 = _1871;
Cmd_Color_write(param_36, param_37, param_38);
cmd_ref.offset += 8u;
break;
@ -1021,11 +1012,11 @@ void comp_main()
Alloc param_39 = cmd_alloc;
CmdRef param_40 = cmd_ref;
uint param_41 = cmd_limit;
bool _1894 = alloc_cmd(param_39, param_40, param_41);
bool _1889 = alloc_cmd(param_39, param_40, param_41);
cmd_alloc = param_39;
cmd_ref = param_40;
cmd_limit = param_41;
if (!_1894)
if (!_1889)
{
break;
}
@ -1052,11 +1043,11 @@ void comp_main()
Alloc param_49 = cmd_alloc;
CmdRef param_50 = cmd_ref;
uint param_51 = cmd_limit;
bool _1958 = alloc_cmd(param_49, param_50, param_51);
bool _1953 = alloc_cmd(param_49, param_50, param_51);
cmd_alloc = param_49;
cmd_ref = param_50;
cmd_limit = param_51;
if (!_1958)
if (!_1953)
{
break;
}
@ -1086,11 +1077,11 @@ void comp_main()
Alloc param_59 = cmd_alloc;
CmdRef param_60 = cmd_ref;
uint param_61 = cmd_limit;
bool _2064 = alloc_cmd(param_59, param_60, param_61);
bool _2059 = alloc_cmd(param_59, param_60, param_61);
cmd_alloc = param_59;
cmd_ref = param_60;
cmd_limit = param_61;
if (!_2064)
if (!_2059)
{
break;
}
@ -1103,27 +1094,27 @@ void comp_main()
uint index = _1372.Load(dd_1 * 4 + 0);
uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
CmdImage _2103 = { index, offset_1 };
CmdImage _2098 = { index, offset_1 };
Alloc param_66 = cmd_alloc;
CmdRef param_67 = cmd_ref;
CmdImage param_68 = _2103;
CmdImage param_68 = _2098;
Cmd_Image_write(param_66, param_67, param_68);
cmd_ref.offset += 12u;
break;
}
case 5u:
{
bool _2117 = tile_1.tile.offset == 0u;
bool _2123;
if (_2117)
bool _2112 = tile_1.tile.offset == 0u;
bool _2118;
if (_2112)
{
_2123 = tile_1.backdrop == 0;
_2118 = tile_1.backdrop == 0;
}
else
{
_2123 = _2117;
_2118 = _2112;
}
if (_2123)
if (_2118)
{
clip_zero_depth = clip_depth + 1u;
}
@ -1132,11 +1123,11 @@ void comp_main()
Alloc param_69 = cmd_alloc;
CmdRef param_70 = cmd_ref;
uint param_71 = cmd_limit;
bool _2135 = alloc_cmd(param_69, param_70, param_71);
bool _2130 = alloc_cmd(param_69, param_70, param_71);
cmd_alloc = param_69;
cmd_ref = param_70;
cmd_limit = param_71;
if (!_2135)
if (!_2130)
{
break;
}
@ -1154,11 +1145,11 @@ void comp_main()
Alloc param_74 = cmd_alloc;
CmdRef param_75 = cmd_ref;
uint param_76 = cmd_limit;
bool _2163 = alloc_cmd(param_74, param_75, param_76);
bool _2158 = alloc_cmd(param_74, param_75, param_76);
cmd_alloc = param_74;
cmd_ref = param_75;
cmd_limit = param_76;
if (!_2163)
if (!_2158)
{
break;
}
@ -1169,10 +1160,10 @@ void comp_main()
write_fill(param_77, param_78, param_79, param_80);
cmd_ref = param_78;
uint blend_1 = _1372.Load(dd_1 * 4 + 0);
CmdEndClip _2186 = { blend_1 };
CmdEndClip _2181 = { blend_1 };
Alloc param_81 = cmd_alloc;
CmdRef param_82 = cmd_ref;
CmdEndClip param_83 = _2186;
CmdEndClip param_83 = _2181;
Cmd_EndClip_write(param_81, param_82, param_83);
cmd_ref.offset += 8u;
break;
@ -1207,17 +1198,17 @@ void comp_main()
break;
}
}
bool _2233 = (bin_tile_x + tile_x) < _1005.Load(8);
bool _2242;
if (_2233)
bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
bool _2237;
if (_2228)
{
_2242 = (bin_tile_y + tile_y) < _1005.Load(12);
_2237 = (bin_tile_y + tile_y) < _1005.Load(12);
}
else
{
_2242 = _2233;
_2237 = _2228;
}
if (_2242)
if (_2237)
{
Alloc param_84 = cmd_alloc;
CmdRef param_85 = cmd_ref;

View file

@ -954,22 +954,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
{
_1701 = _1692;
}
bool _1708;
if (!_1701)
{
_1708 = is_clip && is_blend;
}
else
{
_1708 = _1701;
}
include_tile = _1708;
include_tile = _1701 || is_blend;
}
if (include_tile)
{
uint el_slice = el_ix / 32u;
uint el_mask = 1u << (el_ix & 31u);
uint _1728 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
@ -1014,11 +1005,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_29 = cmd_alloc;
CmdRef param_30 = cmd_ref;
uint param_31 = cmd_limit;
bool _1853 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
cmd_alloc = param_29;
cmd_ref = param_30;
cmd_limit = param_31;
if (!_1853)
if (!_1848)
{
break;
}
@ -1041,11 +1032,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_39 = cmd_alloc;
CmdRef param_40 = cmd_ref;
uint param_41 = cmd_limit;
bool _1894 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
cmd_alloc = param_39;
cmd_ref = param_40;
cmd_limit = param_41;
if (!_1894)
if (!_1889)
{
break;
}
@ -1072,11 +1063,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_49 = cmd_alloc;
CmdRef param_50 = cmd_ref;
uint param_51 = cmd_limit;
bool _1958 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
cmd_alloc = param_49;
cmd_ref = param_50;
cmd_limit = param_51;
if (!_1958)
if (!_1953)
{
break;
}
@ -1106,11 +1097,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_59 = cmd_alloc;
CmdRef param_60 = cmd_ref;
uint param_61 = cmd_limit;
bool _2064 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
cmd_alloc = param_59;
cmd_ref = param_60;
cmd_limit = param_61;
if (!_2064)
if (!_2059)
{
break;
}
@ -1132,17 +1123,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
}
case 5u:
{
bool _2117 = tile_1.tile.offset == 0u;
bool _2123;
if (_2117)
bool _2112 = tile_1.tile.offset == 0u;
bool _2118;
if (_2112)
{
_2123 = tile_1.backdrop == 0;
_2118 = tile_1.backdrop == 0;
}
else
{
_2123 = _2117;
_2118 = _2112;
}
if (_2123)
if (_2118)
{
clip_zero_depth = clip_depth + 1u;
}
@ -1151,11 +1142,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_69 = cmd_alloc;
CmdRef param_70 = cmd_ref;
uint param_71 = cmd_limit;
bool _2135 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
cmd_alloc = param_69;
cmd_ref = param_70;
cmd_limit = param_71;
if (!_2135)
if (!_2130)
{
break;
}
@ -1173,11 +1164,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_74 = cmd_alloc;
CmdRef param_75 = cmd_ref;
uint param_76 = cmd_limit;
bool _2163 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
cmd_alloc = param_74;
cmd_ref = param_75;
cmd_limit = param_76;
if (!_2163)
if (!_2158)
{
break;
}
@ -1225,17 +1216,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
break;
}
}
bool _2233 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
bool _2242;
if (_2233)
bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
bool _2237;
if (_2228)
{
_2242 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
_2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
}
else
{
_2242 = _2233;
_2237 = _2228;
}
if (_2242)
if (_2237)
{
Alloc param_84 = cmd_alloc;
CmdRef param_85 = cmd_ref;

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -37,147 +37,6 @@ pub struct Encoder {
n_clip: u32,
}
#[derive(Copy, Clone, Debug)]
pub struct EncodedSceneRef<'a, T: Copy + Pod> {
pub transform_stream: &'a [T],
pub tag_stream: &'a [u8],
pub pathseg_stream: &'a [u8],
pub linewidth_stream: &'a [f32],
pub drawtag_stream: &'a [u32],
pub drawdata_stream: &'a [u8],
pub n_path: u32,
pub n_pathseg: u32,
pub n_clip: u32,
pub ramp_data: &'a [u32],
}
impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
/// Return a config for the element processing pipeline.
///
/// This does not include further pipeline processing. Also returns the
/// beginning of free memory.
pub fn stage_config(&self) -> (Config, usize) {
// Layout of scene buffer
let drawtag_offset = 0;
let n_drawobj = self.n_drawobj();
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_stream.len();
let n_trans = self.transform_stream.len();
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let n_linewidth = self.linewidth_stream.len();
let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
let n_pathtag = self.tag_stream.len();
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
let config = Config {
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path,
n_clip: self.n_clip,
trans_offset: trans_offset as u32,
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
..Default::default()
};
(config, alloc)
}
pub fn write_scene(&self, buf: &mut BufWrite) {
buf.extend_slice(&self.drawtag_stream);
let n_drawobj = self.drawtag_stream.len();
buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE);
buf.extend_slice(&self.drawdata_stream);
buf.extend_slice(&self.transform_stream);
let n_trans = self.transform_stream.len();
buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE);
buf.extend_slice(&self.linewidth_stream);
buf.extend_slice(&self.tag_stream);
let n_pathtag = self.tag_stream.len();
buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize));
buf.extend_slice(&self.pathseg_stream);
}
/// The number of draw objects in the draw object stream.
pub(crate) fn n_drawobj(&self) -> usize {
self.drawtag_stream.len()
}
/// The number of paths.
pub(crate) fn n_path(&self) -> u32 {
self.n_path
}
/// The number of path segments.
pub(crate) fn n_pathseg(&self) -> u32 {
self.n_pathseg
}
pub(crate) fn n_transform(&self) -> usize {
self.transform_stream.len()
}
/// The number of tags in the path stream.
pub(crate) fn n_pathtag(&self) -> usize {
self.tag_stream.len()
}
pub(crate) fn n_clip(&self) -> u32 {
self.n_clip
}
}
/// A scene fragment encoding a glyph.
///
/// This is a reduced version of the full encoder.
@ -471,21 +330,6 @@ impl Encoder {
self.n_path += glyph.n_path;
self.n_pathseg += glyph.n_pathseg;
}
pub(crate) fn scene_ref(&self) -> EncodedSceneRef<stages::Transform> {
EncodedSceneRef {
transform_stream: &self.transform_stream,
tag_stream: &self.tag_stream,
pathseg_stream: &self.pathseg_stream,
linewidth_stream: &self.linewidth_stream,
drawtag_stream: &self.drawtag_stream,
drawdata_stream: &self.drawdata_stream,
n_path: self.n_path,
n_pathseg: self.n_pathseg,
n_clip: self.n_clip,
ramp_data: &[],
}
}
}
fn align_up(x: usize, align: usize) -> usize {

View file

@ -10,10 +10,7 @@ mod text;
use std::convert::TryInto;
use bytemuck::Pod;
pub use blend::{Blend, BlendMode, CompositionMode};
pub use encoder::EncodedSceneRef;
pub use render_ctx::PietGpuRenderContext;
pub use gradient::Colrv1RadialGradient;
@@ -21,11 +18,11 @@ use piet::kurbo::Vec2;
use piet::{ImageFormat, RenderContext};
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image,
ImageLayout, Pipeline, QueryPool, Session,
include_shader, BindType, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor, DescriptorSet,
Error, Image, ImageLayout, Pipeline, QueryPool, Session,
};
use pico_svg::PicoSvg;
pub use pico_svg::PicoSvg;
use stages::{ClipBinding, ElementBinding, ElementCode};
use crate::stages::{ClipCode, Config, ElementStage};
@@ -358,27 +355,16 @@ impl Renderer {
render_ctx: &mut PietGpuRenderContext,
buf_ix: usize,
) -> Result<(), Error> {
let mut scene = render_ctx.encoded_scene();
let ramp_data = render_ctx.get_ramp_data();
scene.ramp_data = &ramp_data;
self.upload_scene(&scene, buf_ix)
}
pub fn upload_scene<T: Copy + Pod>(
&mut self,
scene: &EncodedSceneRef<T>,
buf_ix: usize,
) -> Result<(), Error> {
let (mut config, mut alloc) = scene.stage_config();
let n_drawobj = scene.n_drawobj();
let (mut config, mut alloc) = render_ctx.stage_config();
let n_drawobj = render_ctx.n_drawobj();
// TODO: be more consistent in size types
let n_path = scene.n_path() as usize;
let n_path = render_ctx.n_path() as usize;
self.n_paths = n_path;
self.n_transform = scene.n_transform();
self.n_drawobj = scene.n_drawobj();
self.n_pathseg = scene.n_pathseg() as usize;
self.n_pathtag = scene.n_pathtag();
self.n_clip = scene.n_clip();
self.n_transform = render_ctx.n_transform();
self.n_drawobj = render_ctx.n_drawobj();
self.n_pathseg = render_ctx.n_pathseg() as usize;
self.n_pathtag = render_ctx.n_pathtag();
self.n_clip = render_ctx.n_clip();
// These constants depend on encoding and may need to be updated.
// Perhaps we can plumb these from piet-gpu-derive?
@@ -402,18 +388,19 @@ impl Renderer {
// TODO: reallocate scene buffer if size is inadequate
{
let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
scene.write_scene(&mut mapped_scene);
render_ctx.write_scene(&mut mapped_scene);
}
self.config_bufs[buf_ix].write(&[config])?;
self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
// Upload gradient data.
if !scene.ramp_data.is_empty() {
let ramp_data = render_ctx.get_ramp_data();
if !ramp_data.is_empty() {
assert!(
self.gradient_bufs[buf_ix].size() as usize
>= std::mem::size_of_val(&*scene.ramp_data)
>= std::mem::size_of_val(&*ramp_data)
);
self.gradient_bufs[buf_ix].write(scene.ramp_data)?;
self.gradient_bufs[buf_ix].write(&ramp_data)?;
}
}
Ok(())
@@ -437,10 +424,10 @@ impl Renderer {
cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients);
cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.begin_debug_label("Element bounding box calculation");
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
self.element_stage.record(
cmd_buf,
&mut pass,
&self.element_code,
&self.element_bindings[buf_ix],
self.n_transform as u64,
@@ -448,56 +435,59 @@ impl Renderer {
self.n_pathtag as u32,
self.n_drawobj as u64,
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Clip bounding box calculation");
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
pass.begin_debug_label("Clip bounding box calculation");
self.clip_binding
.record(cmd_buf, &self.clip_code, self.n_clip as u32);
cmd_buf.end_debug_label();
cmd_buf.begin_debug_label("Element binning");
cmd_buf.dispatch(
.record(&mut pass, &self.clip_code, self.n_clip as u32);
pass.end_debug_label();
pass.begin_debug_label("Element binning");
pass.dispatch(
&self.bin_pipeline,
&self.bin_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1),
(256, 1, 1),
);
cmd_buf.end_debug_label();
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Tile allocation");
cmd_buf.dispatch(
pass.end_debug_label();
pass.memory_barrier();
pass.begin_debug_label("Tile allocation");
pass.dispatch(
&self.tile_pipeline,
&self.tile_ds[buf_ix],
(((self.n_paths + 255) / 256) as u32, 1, 1),
(256, 1, 1),
);
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 2);
cmd_buf.memory_barrier();
pass.end_debug_label();
pass.end();
cmd_buf.begin_debug_label("Path flattening");
cmd_buf.dispatch(
cmd_buf.memory_barrier();
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
pass.dispatch(
&self.path_pipeline,
&self.path_ds,
(((self.n_pathseg + 31) / 32) as u32, 1, 1),
(32, 1, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Backdrop propagation");
cmd_buf.dispatch(
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7));
pass.dispatch(
&self.backdrop_pipeline,
&self.backdrop_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1),
(256, self.backdrop_y, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 4);
// TODO: redo query accounting
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Coarse raster");
cmd_buf.dispatch(
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
pass.dispatch(
&self.coarse_pipeline,
&self.coarse_ds[buf_ix],
(
@@ -507,11 +497,13 @@ impl Renderer {
),
(256, 1, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 6);
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Fine raster");
cmd_buf.dispatch(
let mut pass =
cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11));
pass.dispatch(
&self.k4_pipeline,
&self.k4_ds,
(
@@ -521,8 +513,8 @@ impl Renderer {
),
(8, 4, 1),
);
pass.end();
cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 7);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}
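// Illustrative sketch, not part of this patch, condensing the pattern introduced
// above: each stage now runs in its own compute pass, and the timer queries are
// supplied through the pass descriptor instead of explicit write_timestamp calls.
// The pipelines, descriptor sets, and workgroup counts below are placeholders,
// not the renderer's real fields.
unsafe fn record_two_timed_stages(
    cmd_buf: &mut CmdBuf,
    query_pool: &QueryPool,
    first: (&Pipeline, &DescriptorSet),
    second: (&Pipeline, &DescriptorSet),
    n_wg: u32,
) {
    // First stage: queries 0 and 1 bracket the whole pass.
    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(query_pool, 0, 1));
    pass.dispatch(first.0, first.1, (n_wg, 1, 1), (256, 1, 1));
    pass.end();
    // Make the first stage's writes visible to the second.
    cmd_buf.memory_barrier();
    // Second stage: queries 2 and 3.
    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(query_pool, 2, 3));
    pass.dispatch(second.0, second.1, (n_wg, 1, 1), (256, 1, 1));
    pass.end();
}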

View file

@@ -1,6 +1,6 @@
use std::borrow::Cow;
use crate::encoder::{EncodedSceneRef, GlyphEncoder};
use crate::encoder::GlyphEncoder;
use crate::stages::{Config, Transform};
use crate::MAX_BLEND_STACK;
use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape};
@@ -97,10 +97,6 @@ impl PietGpuRenderContext {
self.new_encoder.stage_config()
}
pub fn encoded_scene(&self) -> EncodedSceneRef<crate::stages::Transform> {
self.new_encoder.scene_ref()
}
/// Number of draw objects.
///
/// This is for the new element processing pipeline. It's not necessarily the

View file

@@ -26,7 +26,7 @@ use bytemuck::{Pod, Zeroable};
pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
use piet_gpu_hal::{Buffer, CmdBuf, Session};
use piet_gpu_hal::{Buffer, ComputePass, Session};
pub use transform::{
Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE,
};
@@ -140,7 +140,7 @@ impl ElementStage {
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &ElementCode,
binding: &ElementBinding,
n_transform: u64,
@@ -149,14 +149,14 @@
n_drawobj: u64,
) {
self.transform_stage.record(
cmd_buf,
pass,
&code.transform_code,
&binding.transform_binding,
n_transform,
);
// No memory barrier needed here; path has at least one before pathseg
self.path_stage.record(
cmd_buf,
pass,
&code.path_code,
&binding.path_binding,
n_paths,
@@ -164,6 +164,6 @@
);
// No memory barrier needed here; draw has at least one before draw_leaf
self.draw_stage
.record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj);
.record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
}
}

View file

@@ -16,7 +16,7 @@
//! The clip processing stage (includes substages).
use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session};
use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
// Note that this isn't the code/stage/binding pattern of most of the other stages
// in the new element processing pipeline. We want to move those temporary buffers
@@ -69,26 +69,26 @@ impl ClipBinding {
/// Record the clip dispatches.
///
/// Assumes memory barrier on entry. Provides memory barrier on exit.
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) {
pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) {
let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
if n_wg_reduce > 0 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&self.reduce_ds,
(n_wg_reduce, 1, 1),
(CLIP_PART_SIZE, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
}
let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
if n_wg > 0 {
cmd_buf.dispatch(
pass.dispatch(
&code.leaf_pipeline,
&self.leaf_ds,
(n_wg, 1, 1),
(CLIP_PART_SIZE, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
}
}
}
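// Illustrative sketch, not part of this patch, of the caller's side of the barrier
// contract documented above: a memory barrier is issued before the pass begins (not
// shown), and because record() ends with its own barrier, a following dispatch in
// the same pass can read the clip results directly. The consumer pipeline,
// descriptor set, and workgroup count are placeholders.
unsafe fn clip_then_consume(
    pass: &mut ComputePass,
    binding: &ClipBinding,
    code: &ClipCode,
    n_clip: u32,
    consumer_pipeline: &Pipeline,
    consumer_ds: &DescriptorSet,
    n_wg: u32,
) {
    binding.record(pass, code, n_clip);
    // No extra barrier needed here: record() provides one on exit.
    pass.dispatch(consumer_pipeline, consumer_ds, (n_wg, 1, 1), (256, 1, 1));
}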

View file

@@ -19,7 +19,7 @@
use bytemuck::{Pod, Zeroable};
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};
/// The output element of the draw object stage.
@@ -130,7 +130,7 @@ impl DrawStage {
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &DrawCode,
binding: &DrawBinding,
size: u64,
@@ -140,22 +140,22 @@
}
let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
if n_workgroups > 1 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&binding.reduce_ds,
(n_workgroups as u32, 1, 1),
(DRAW_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.root_pipeline,
&self.root_ds,
(1, 1, 1),
(DRAW_WG as u32, 1, 1),
);
}
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.leaf_pipeline,
&binding.leaf_ds,
(n_workgroups as u32, 1, 1),

View file

@@ -17,7 +17,7 @@
//! The path stage (includes substages).
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};
pub struct PathCode {
@@ -148,7 +148,7 @@ impl PathStage {
/// those are consumed. Result is written without barrier.
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &PathCode,
binding: &PathBinding,
n_paths: u32,
@@ -166,15 +166,15 @@ impl PathStage {
let reduce_part_tags = REDUCE_PART_SIZE * 4;
let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags;
if n_wg_tag_reduce > 1 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&binding.reduce_ds,
(n_wg_tag_reduce, 1, 1),
(REDUCE_WG, 1, 1),
);
// I think we can skip root if n_wg_tag_reduce == 2
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.tag_root_pipeline,
&self.tag_root_ds,
(1, 1, 1),
@@ -183,15 +183,15 @@ impl PathStage {
// No barrier needed here; clear doesn't depend on path tags
}
let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG;
cmd_buf.dispatch(
pass.dispatch(
&code.clear_pipeline,
&binding.clear_ds,
(n_wg_clear, 1, 1),
(CLEAR_WG, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE;
cmd_buf.dispatch(
pass.dispatch(
&code.pathseg_pipeline,
&binding.path_ds,
(n_wg_pathseg, 1, 1),

View file

@@ -20,7 +20,7 @@ use bytemuck::{Pod, Zeroable};
use piet::kurbo::Affine;
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
};
/// An affine transform.
@@ -132,7 +132,7 @@ impl TransformStage {
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
pass: &mut ComputePass,
code: &TransformCode,
binding: &TransformBinding,
size: u64,
@@ -142,22 +142,22 @@
}
let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
if n_workgroups > 1 {
cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&binding.reduce_ds,
(n_workgroups as u32, 1, 1),
(TRANSFORM_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
pass.memory_barrier();
pass.dispatch(
&code.root_pipeline,
&self.root_ds,
(1, 1, 1),
(TRANSFORM_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
pass.memory_barrier();
}
cmd_buf.dispatch(
pass.dispatch(
&code.leaf_pipeline,
&binding.leaf_ds,
(n_workgroups as u32, 1, 1),

View file

@@ -21,12 +21,7 @@ pub fn render_blend_test(rc: &mut PietGpuRenderContext, i: usize, blend: Blend)
rc.restore().unwrap();
}
pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
let xml_str = std::fs::read_to_string(filename).unwrap();
let start = std::time::Instant::now();
let svg = PicoSvg::load(&xml_str, scale).unwrap();
println!("parsing time: {:?}", start.elapsed());
pub fn render_svg(rc: &mut impl RenderContext, svg: &PicoSvg) {
let start = std::time::Instant::now();
svg.render(rc);
println!("flattening and encoding time: {:?}", start.elapsed());

View file

@@ -16,11 +16,11 @@
//! Utilities (and a benchmark) for clearing buffers with compute shaders.
use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::config::Config;
use crate::runner::{Commands, Runner};
use crate::runner::Runner;
use crate::test_result::TestResult;
const WG_SIZE: u64 = 256;
@@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
let mut pass = commands.compute_pass(0, 1);
stage.record(&mut pass, &code, &binding);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
@@ -108,17 +108,12 @@ impl ClearStage {
ClearBinding { descriptor_set }
}
pub unsafe fn record(
&self,
commands: &mut Commands,
code: &ClearCode,
bindings: &ClearBinding,
) {
pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) {
let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
// An issue: for clearing large buffers (>16M), we need to check the
// number of workgroups against the (dynamically detected) limit, and
// potentially issue multiple dispatches.
commands.cmd_buf.dispatch(
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),

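// Illustrative sketch, not part of this patch, of the follow-up flagged in the
// comment above: once n_workgroups can exceed the device's per-dispatch limit,
// the clear has to be split into several dispatches. The limit value and the
// per-chunk offset plumbing are hypothetical; the current clear shader takes no
// offset, so this shows the shape of the fix rather than a drop-in implementation.
unsafe fn dispatch_clear_chunked(
    pass: &mut ComputePass,
    pipeline: &Pipeline,
    descriptor_set: &DescriptorSet,
    n_elements: u64,
    max_wg_per_dispatch: u64,
) {
    let total_wg = (n_elements + WG_SIZE - 1) / WG_SIZE;
    let mut first_wg = 0;
    while first_wg < total_wg {
        let chunk_wg = (total_wg - first_wg).min(max_wg_per_dispatch);
        // A real version would also bind this chunk's element offset here.
        pass.dispatch(
            pipeline,
            descriptor_set,
            (chunk_wg as u32, 1, 1),
            (WG_SIZE as u32, 1, 1),
        );
        first_wg += chunk_wg;
    }
}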
View file

@@ -58,11 +58,11 @@ pub unsafe fn clip_test(runner: &mut Runner, config: &Config) -> TestResult {
let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf);
let mut commands = runner.commands();
commands.write_timestamp(0);
commands.upload(&memory);
binding.record(&mut commands.cmd_buf, &code, n_clip as u32);
let mut pass = commands.compute_pass(0, 1);
binding.record(&mut pass, &code, n_clip as u32);
pass.end();
commands.download(&memory);
commands.write_timestamp(1);
runner.submit(commands);
let dst = memory.map_read(..);
if let Some(failure) = data.verify(&dst) {

View file

@@ -77,9 +77,9 @@ pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult {
let n_iter = config.n_iter;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands.cmd_buf, &code, &binding, n_tag);
commands.write_timestamp(1);
let mut pass = commands.compute_pass(0, 1);
stage.record(&mut pass, &code, &binding, n_tag);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&memory);

View file

@@ -45,9 +45,7 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestResult
for i in 0..n_iter {
let mut commands = runner.commands();
// Might clear only buckets to save time.
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
commands.write_timestamp(1);
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&mem_buf);
@@ -107,12 +105,14 @@ impl LinkedListStage {
commands.cmd_buf.clear_buffer(out_buf, None);
commands.cmd_buf.memory_barrier();
let n_workgroups = N_BUCKETS / WG_SIZE;
commands.cmd_buf.dispatch(
let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
pass.end();
}
}

View file

@@ -59,9 +59,7 @@ pub unsafe fn run_message_passing_test(
let mut failures = 0;
for _ in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
commands.write_timestamp(1);
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
total_elapsed += runner.submit(commands);
@@ -128,11 +126,13 @@ impl MessagePassingStage {
commands.cmd_buf.clear_buffer(&self.data_buf, None);
commands.cmd_buf.clear_buffer(out_buf, None);
commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch(
let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(256, 1, 1),
(256, 1, 1),
);
pass.end();
}
}

View file

@@ -105,15 +105,15 @@ pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut commands = runner.commands();
commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf);
commands.cmd_buf.memory_barrier();
commands.write_timestamp(0);
let mut pass = commands.compute_pass(0, 1);
stage.record(
&mut commands.cmd_buf,
&mut pass,
&code,
&binding,
path_data.n_path,
path_data.tags.len() as u32,
);
commands.write_timestamp(1);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&memory);

View file

@@ -85,9 +85,7 @@ pub unsafe fn run_prefix_test(
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
@@ -159,12 +157,14 @@ impl PrefixStage {
let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
commands.cmd_buf.clear_buffer(&self.state_buf, None);
commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch(
let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
pass.end();
// One thing that's missing here is registering the buffers so
// they can be safely dropped by Rust code before the execution
// of the command buffer completes.

View file

@@ -66,9 +66,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResult
let mut commands = runner.commands();
commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
commands.cmd_buf.memory_barrier();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
@@ -175,33 +173,35 @@ impl PrefixTreeStage {
code: &PrefixTreeCode,
bindings: &PrefixTreeBinding,
) {
let mut pass = commands.compute_pass(0, 1);
let n = self.tmp_bufs.len();
for i in 0..n {
let n_workgroups = self.sizes[i + 1];
commands.cmd_buf.dispatch(
pass.dispatch(
&code.reduce_pipeline,
&bindings.descriptor_sets[i],
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
commands.cmd_buf.memory_barrier();
pass.memory_barrier();
}
commands.cmd_buf.dispatch(
pass.dispatch(
&code.root_pipeline,
&bindings.descriptor_sets[n],
(1, 1, 1),
(WG_SIZE as u32, 1, 1),
);
for i in (0..n).rev() {
commands.cmd_buf.memory_barrier();
pass.memory_barrier();
let n_workgroups = self.sizes[i + 1];
commands.cmd_buf.dispatch(
pass.dispatch(
&code.scan_pipeline,
&bindings.descriptor_sets[2 * n - i],
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
}
pass.end();
}
}
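// Illustrative sketch, not part of this patch: record() above is a classic tree
// scan (reduce up the levels, run the root, then scan back down), with sizes[i]
// holding the element count at level i. The construction below is one plausible
// way such a table is built; the real code may differ, and elems_per_wg is a
// placeholder for however many elements one workgroup reduces.
fn level_sizes_sketch(n_elements: u64, elems_per_wg: u64) -> Vec<u64> {
    let mut sizes = vec![n_elements];
    while *sizes.last().unwrap() > 1 {
        let last = *sizes.last().unwrap();
        // Each level holds one partial result per workgroup of the level below.
        sizes.push((last + elems_per_wg - 1) / elems_per_wg);
    }
    sizes
}
// For example, level_sizes_sketch(1 << 24, 512) == [16777216, 32768, 64, 1]:
// two reduce dispatches on the way up, one root dispatch, two scans on the way down.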

View file

@@ -20,8 +20,8 @@ use std::ops::RangeBounds;
use bytemuck::Pod;
use piet_gpu_hal::{
BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags,
QueryPool, Session,
BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass,
ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session,
};
pub struct Runner {
@@ -118,8 +118,14 @@ impl Runner {
}
impl Commands {
pub unsafe fn write_timestamp(&mut self, query: u32) {
self.cmd_buf.write_timestamp(&self.query_pool, query);
/// Start a compute pass with timer queries.
pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
self.cmd_buf
.begin_compute_pass(&ComputePassDescriptor::timer(
&self.query_pool,
start_query,
end_query,
))
}
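// Illustrative sketch, not part of this patch, of how the tests use this helper:
// open a timed pass, record work into it, and end the pass before download and
// submit. Pipeline and DescriptorSet are assumed to be in scope here; the pipeline,
// descriptor set, and workgroup counts come from the individual test.
unsafe fn timed_dispatch_example(
    commands: &mut Commands,
    pipeline: &Pipeline,
    descriptor_set: &DescriptorSet,
    n_wg: u32,
) {
    let mut pass = commands.compute_pass(0, 1);
    pass.dispatch(pipeline, descriptor_set, (n_wg, 1, 1), (256, 1, 1));
    pass.end();
    // runner.submit(commands) then returns the measured time, which the tests
    // accumulate into total_elapsed.
}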
pub unsafe fn upload(&mut self, buf: &BufStage) {

View file

@@ -61,9 +61,9 @@ pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult
let n_iter = config.n_iter;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands.cmd_buf, &code, &binding, n_elements);
commands.write_timestamp(1);
let mut pass = commands.compute_pass(0, 1);
stage.record(&mut pass, &code, &binding, n_elements);
pass.end();
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&memory);