rebase on timer query patch

Chad Brokaw 2022-05-09 22:39:59 -04:00
parent ba7f85731c
commit c749addf6c
45 changed files with 887 additions and 494 deletions

Cargo.lock generated

@@ -925,6 +925,7 @@ dependencies = [
 "block",
 "bytemuck",
 "cocoa-foundation",
+"foreign-types",
 "metal",
 "objc",
 "raw-window-handle 0.3.4",

piet-gpu-hal/Cargo.toml

@@ -28,3 +28,4 @@ metal = "0.22"
 objc = "0.2.5"
 block = "0.1.6"
 cocoa-foundation = "0.1"
+foreign-types = "0.3.2"

piet-gpu-hal/examples/collatz.rs

@@ -1,4 +1,4 @@
-use piet_gpu_hal::{include_shader, BindType};
+use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor};
 use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session};

 fn main() {
@@ -20,9 +20,9 @@ fn main() {
     let mut cmd_buf = session.cmd_buf().unwrap();
     cmd_buf.begin();
     cmd_buf.reset_query_pool(&query_pool);
-    cmd_buf.write_timestamp(&query_pool, 0);
-    cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
-    cmd_buf.write_timestamp(&query_pool, 1);
+    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
+    pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1));
+    pass.end();
     cmd_buf.finish_timestamps(&query_pool);
     cmd_buf.host_barrier();
     cmd_buf.finish();
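For orientation, a hedged sketch of how the rest of this example might submit the work and read the timer queries back; `session`, `query_pool`, and `cmd_buf` are the values set up above, and error handling is left as unwraps:

    unsafe {
        let submitted = session.run_cmd_buf(cmd_buf, &[], &[]).unwrap();
        submitted.wait().unwrap();
        // After this patch, fetch_query_pool subtracts off the first timestamp,
        // so each entry is an elapsed time in seconds relative to query 0.
        let timestamps = session.fetch_query_pool(&query_pool).unwrap();
        println!("dispatch took {:.3} ms", timestamps[0] * 1e3);
    }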

piet-gpu-hal/src/backend.rs

@@ -17,7 +17,8 @@
 //! The generic trait for backends to implement.

 use crate::{
-    BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams,
+    BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout,
+    MapMode, SamplerParams,
 };

 pub trait Device: Sized {
@@ -159,14 +160,32 @@ pub trait Device: Sized {
     unsafe fn create_sampler(&self, params: SamplerParams) -> Result<Self::Sampler, Error>;
 }

+/// The trait implemented by backend command buffer implementations.
+///
+/// Valid encoding is represented by a state machine (currently not validated
+/// but it is easy to imagine there might be at least debug validation). Most
+/// methods are only valid in a particular state, and some move it to another
+/// state.
 pub trait CmdBuf<D: Device> {
+    /// Begin encoding.
+    ///
+    /// State: init -> ready
     unsafe fn begin(&mut self);

+    /// State: ready -> finished
     unsafe fn finish(&mut self);

     /// Return true if the command buffer is suitable for reuse.
     unsafe fn reset(&mut self) -> bool;

+    /// Begin a compute pass.
+    ///
+    /// State: ready -> in_compute_pass
+    unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor);
+
+    /// Dispatch
+    ///
+    /// State: in_compute_pass
     unsafe fn dispatch(
         &mut self,
         pipeline: &D::Pipeline,
@@ -175,6 +194,9 @@ pub trait CmdBuf<D: Device> {
         workgroup_size: (u32, u32, u32),
     );

+    /// State: in_compute_pass -> ready
+    unsafe fn end_compute_pass(&mut self);
+
     /// Insert an execution and memory barrier.
     ///
     /// Compute kernels (and other actions) after this barrier may read from buffers
@@ -202,16 +224,16 @@ pub trait CmdBuf<D: Device> {
     /// This is readily supported in Vulkan, but for portability it is remarkably
     /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute
     /// kernel, or organize the code not to need it.
-    unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option<u64>);
+    unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option<u64>);

-    unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer);
+    unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer);

-    unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer);
+    unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer);

-    unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image);
+    unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image);

     // low portability, dx12 doesn't support it natively
-    unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image);
+    unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image);

     /// Reset the query pool.
     ///
@@ -227,7 +249,7 @@ pub trait CmdBuf<D: Device> {
     unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {}

     /// Begin a labeled section for debugging and profiling purposes.
-    unsafe fn begin_debug_label(&mut self, label: &str) {}
+    unsafe fn begin_debug_label(&mut self, _label: &str) {}

     /// End a section opened by `begin_debug_label`.
     unsafe fn end_debug_label(&mut self) {}
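As a rough illustration of the state machine described in the comments above (nothing enforces it yet), a conforming encoding sequence against some backend's CmdBuf might look like the following; `cmd_buf`, `pipeline`, and `descriptor_set` are assumed to have been created from the corresponding Device:

    unsafe {
        cmd_buf.begin();                                                // init -> ready
        cmd_buf.begin_compute_pass(&ComputePassDescriptor::default());  // ready -> in_compute_pass
        cmd_buf.dispatch(&pipeline, &descriptor_set, (64, 1, 1), (256, 1, 1));
        cmd_buf.end_compute_pass();                                     // in_compute_pass -> ready
        cmd_buf.finish();                                               // ready -> finished
    }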

piet-gpu-hal/src/dx12.rs

@ -21,7 +21,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use smallvec::SmallVec; use smallvec::SmallVec;
use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat}; use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor};
use self::{ use self::{
descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned}, descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned},
@ -76,6 +76,7 @@ pub struct CmdBuf {
c: wrappers::GraphicsCommandList, c: wrappers::GraphicsCommandList,
allocator: CommandAllocator, allocator: CommandAllocator,
needs_reset: bool, needs_reset: bool,
end_query: Option<(wrappers::QueryHeap, u32)>,
} }
pub struct Pipeline { pub struct Pipeline {
@ -360,6 +361,7 @@ impl crate::backend::Device for Dx12Device {
c, c,
allocator, allocator,
needs_reset: false, needs_reset: false,
end_query: None,
}) })
} }
} }
@ -388,11 +390,10 @@ impl crate::backend::Device for Dx12Device {
let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?; let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size); std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size);
self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?; self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?;
let ts0 = buf[0];
let tsp = (self.ts_freq as f64).recip(); let tsp = (self.ts_freq as f64).recip();
let result = buf[1..] let result = buf
.iter() .iter()
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp) .map(|ts| *ts as f64 * tsp)
.collect(); .collect();
Ok(result) Ok(result)
} }
@ -610,6 +611,16 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok() self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok()
} }
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
if let Some((pool, start, end)) = &desc.timer_queries {
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Dx12(pool) = pool {
self.write_timestamp(pool, *start);
self.end_query = Some((pool.heap.clone(), *end));
}
}
}
unsafe fn dispatch( unsafe fn dispatch(
&mut self, &mut self,
pipeline: &Pipeline, pipeline: &Pipeline,
@ -628,6 +639,12 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
.dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2); .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2);
} }
unsafe fn end_compute_pass(&mut self) {
if let Some((heap, end)) = self.end_query.take() {
self.c.end_timing_query(&heap, end);
}
}
unsafe fn memory_barrier(&mut self) { unsafe fn memory_barrier(&mut self) {
// See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend. // See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend.
// The "proper" way to do this would be to name the actual buffers participating // The "proper" way to do this would be to name the actual buffers participating
@ -666,7 +683,7 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
self.memory_barrier(); self.memory_barrier();
} }
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) { unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let cpu_ref = buffer.cpu_ref.as_ref().unwrap(); let cpu_ref = buffer.cpu_ref.as_ref().unwrap();
let (gpu_ref, heap) = buffer let (gpu_ref, heap) = buffer
.gpu_ref .gpu_ref
@ -684,23 +701,23 @@ impl crate::backend::CmdBuf<Dx12Device> for CmdBuf {
); );
} }
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
// TODO: consider using copy_resource here (if sizes match) // TODO: consider using copy_resource here (if sizes match)
let size = src.size.min(dst.size); let size = src.size.min(dst.size);
self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size); self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size);
} }
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
self.c self.c
.copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1); .copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1);
} }
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
self.c self.c
.copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1); .copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1);
} }
unsafe fn blit_image(&self, src: &Image, dst: &Image) { unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
self.c.copy_resource(&src.resource, &dst.resource); self.c.copy_resource(&src.resource, &dst.resource);
} }

piet-gpu-hal/src/dx12/wrappers.rs

@@ -79,7 +79,6 @@ pub struct Blob(pub ComPtr<d3dcommon::ID3DBlob>);
 #[derive(Clone)]
 pub struct ShaderByteCode {
     pub bytecode: d3d12::D3D12_SHADER_BYTECODE,
-    blob: Option<Blob>,
 }

 #[derive(Clone)]
@@ -741,7 +740,6 @@ impl ShaderByteCode {
                 BytecodeLength: blob.0.GetBufferSize(),
                 pShaderBytecode: blob.0.GetBufferPointer(),
             },
-            blob: Some(blob),
         }
     }
@@ -810,7 +808,6 @@ impl ShaderByteCode {
                 BytecodeLength: bytecode.len(),
                 pShaderBytecode: bytecode.as_ptr() as *const _,
             },
-            blob: None,
         }
     }
 }

piet-gpu-hal/src/hub.rs

@@ -13,7 +13,7 @@ use std::sync::{Arc, Mutex, Weak};
 use bytemuck::Pod;
 use smallvec::SmallVec;

-use crate::{mux, BackendType, BufWrite, ImageFormat, MapMode};
+use crate::{mux, BackendType, BufWrite, ComputePassDescriptor, ImageFormat, MapMode};
 use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams};

@@ -135,6 +135,11 @@ pub struct BufReadGuard<'a> {
     size: u64,
 }

+/// A sub-object of a command buffer for a sequence of compute dispatches.
+pub struct ComputePass<'a> {
+    cmd_buf: &'a mut CmdBuf,
+}
+
 impl Session {
     /// Create a new session, choosing the best backend.
     pub fn new(device: mux::Device) -> Session {
@@ -370,8 +375,17 @@ impl Session {
     ///
     /// This should be called after waiting on the command buffer that wrote the
     /// timer queries.
+    ///
+    /// The returned vector is one shorter than the number of timer queries in the
+    /// pool; the first value is subtracted off. It would likely be better to return
+    /// the raw timestamps, but that change should be made consistently.
     pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> {
-        self.0.device.fetch_query_pool(pool)
+        let result = self.0.device.fetch_query_pool(pool)?;
+        // Subtract off first timestamp.
+        Ok(result[1..]
+            .iter()
+            .map(|ts| *ts as f64 - result[0])
+            .collect())
     }

     #[doc(hidden)]
@@ -471,23 +485,10 @@ impl CmdBuf {
         self.cmd_buf().finish();
     }

-    /// Dispatch a compute shader.
-    ///
-    /// Request a compute shader to be run, using the pipeline to specify the
-    /// code, and the descriptor set to address the resources read and written.
-    ///
-    /// Both the workgroup count (number of workgroups) and the workgroup size
-    /// (number of threads in a workgroup) must be specified here, though not
-    /// all back-ends require the latter info.
-    pub unsafe fn dispatch(
-        &mut self,
-        pipeline: &Pipeline,
-        descriptor_set: &DescriptorSet,
-        workgroup_count: (u32, u32, u32),
-        workgroup_size: (u32, u32, u32),
-    ) {
-        self.cmd_buf()
-            .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+    /// Begin a compute pass.
+    pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) -> ComputePass {
+        self.cmd_buf().begin_compute_pass(desc);
+        ComputePass { cmd_buf: self }
     }

     /// Insert an execution and memory barrier.
@@ -582,13 +583,6 @@ impl CmdBuf {
         self.cmd_buf().reset_query_pool(pool);
     }

-    /// Write a timestamp.
-    ///
-    /// The query index must be less than the size of the query pool on creation.
-    pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
-        self.cmd_buf().write_timestamp(pool, query);
-    }
-
     /// Prepare the timestamps for reading. This isn't required on Vulkan but
     /// is required on (at least) DX12.
     ///
@@ -692,6 +686,51 @@ impl Drop for SubmittedCmdBuf {
     }
 }

+impl<'a> ComputePass<'a> {
+    /// Dispatch a compute shader.
+    ///
+    /// Request a compute shader to be run, using the pipeline to specify the
+    /// code, and the descriptor set to address the resources read and written.
+    ///
+    /// Both the workgroup count (number of workgroups) and the workgroup size
+    /// (number of threads in a workgroup) must be specified here, though not
+    /// all back-ends require the latter info.
+    pub unsafe fn dispatch(
+        &mut self,
+        pipeline: &Pipeline,
+        descriptor_set: &DescriptorSet,
+        workgroup_count: (u32, u32, u32),
+        workgroup_size: (u32, u32, u32),
+    ) {
+        self.cmd_buf
+            .cmd_buf()
+            .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size);
+    }
+
+    /// Add a memory barrier.
+    ///
+    /// Inserts a memory barrier in the compute encoder. This is a convenience
+    /// function for calling the same function on the underlying command buffer,
+    /// avoiding borrow check issues.
+    pub unsafe fn memory_barrier(&mut self) {
+        self.cmd_buf.memory_barrier();
+    }
+
+    /// Begin a labeled section for debugging and profiling purposes.
+    pub unsafe fn begin_debug_label(&mut self, label: &str) {
+        self.cmd_buf.begin_debug_label(label);
+    }
+
+    /// End a section opened by `begin_debug_label`.
+    pub unsafe fn end_debug_label(&mut self) {
+        self.cmd_buf.end_debug_label();
+    }
+
+    pub unsafe fn end(self) {
+        self.cmd_buf.cmd_buf().end_compute_pass();
+    }
+}
+
 impl Drop for BufferInner {
     fn drop(&mut self) {
         if let Some(session) = Weak::upgrade(&self.session) {
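A hedged sketch of how a caller might use the new hub-level ComputePass for two dependent dispatches; `session` is assumed, and `reduce_pipeline`, `scan_pipeline`, their descriptor sets, and `num_wgs` are stand-in names, not part of this patch:

    unsafe {
        let mut cmd_buf = session.cmd_buf().unwrap();
        cmd_buf.begin();
        let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::default());
        pass.begin_debug_label("prefix sum");
        pass.dispatch(&reduce_pipeline, &reduce_ds, (num_wgs, 1, 1), (256, 1, 1));
        pass.memory_barrier();
        pass.dispatch(&scan_pipeline, &scan_ds, (num_wgs, 1, 1), (256, 1, 1));
        pass.end_debug_label();
        pass.end();
        cmd_buf.finish();
        // Submit with session.run_cmd_buf as before.
    }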

piet-gpu-hal/src/lib.rs

@@ -21,8 +21,8 @@ pub use crate::mux::{
 };
 pub use bufwrite::BufWrite;
 pub use hub::{
-    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource,
-    Session, SubmittedCmdBuf,
+    BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image,
+    RetainResource, Session, SubmittedCmdBuf,
 };

 // TODO: because these are conditionally included, "cargo fmt" does not
@@ -189,3 +189,23 @@ pub struct WorkgroupLimits {
     /// dimension.
     pub max_invocations: u32,
 }
+
+/// Options for creating a compute pass.
+#[derive(Default)]
+pub struct ComputePassDescriptor<'a> {
+    // Maybe label should go here? It does in wgpu and wgpu_hal.
+    /// Timer query parameters.
+    ///
+    /// To record timer queries for a compute pass, set the query pool, start
+    /// query index, and end query index here. The indices must be less than
+    /// the size of the query pool.
+    timer_queries: Option<(&'a QueryPool, u32, u32)>,
+}
+
+impl<'a> ComputePassDescriptor<'a> {
+    pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor {
+        ComputePassDescriptor {
+            timer_queries: Some((pool, start_query, end_query)),
+        }
+    }
+}
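A small hedged example of building the descriptor defined above; `query_pool` is assumed to have at least two queries, since the indices must be less than the pool size:

    // No timer queries; same as ComputePassDescriptor::default().
    let plain = ComputePassDescriptor::default();
    // Record the pass start into query 0 and the pass end into query 1.
    let timed = ComputePassDescriptor::timer(&query_pool, 0, 1);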

piet-gpu-hal/src/metal.rs

@ -15,25 +15,32 @@
// Also licensed under MIT license, at your choice. // Also licensed under MIT license, at your choice.
mod clear; mod clear;
mod timer;
mod util; mod util;
use std::mem; use std::mem;
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use block::Block;
use cocoa_foundation::base::id; use cocoa_foundation::base::id;
use cocoa_foundation::foundation::{NSInteger, NSUInteger}; use cocoa_foundation::foundation::{NSInteger, NSUInteger};
use foreign_types::ForeignType;
use objc::rc::autoreleasepool; use objc::rc::autoreleasepool;
use objc::runtime::{Object, BOOL, YES}; use objc::runtime::{Object, BOOL, YES};
use objc::{class, msg_send, sel, sel_impl}; use objc::{class, msg_send, sel, sel_impl};
use metal::{CGFloat, MTLFeatureSet}; use metal::{CGFloat, CommandBufferRef, MTLFeatureSet};
use raw_window_handle::{HasRawWindowHandle, RawWindowHandle}; use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits}; use crate::{
BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits,
};
use util::*; use util::*;
use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration};
pub struct MtlInstance; pub struct MtlInstance;
pub struct MtlDevice { pub struct MtlDevice {
@ -41,6 +48,18 @@ pub struct MtlDevice {
cmd_queue: Arc<Mutex<metal::CommandQueue>>, cmd_queue: Arc<Mutex<metal::CommandQueue>>,
gpu_info: GpuInfo, gpu_info: GpuInfo,
helpers: Arc<Helpers>, helpers: Arc<Helpers>,
timer_set: Option<CounterSet>,
counter_style: CounterStyle,
}
/// Type of counter sampling.
///
/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum CounterStyle {
None,
Stage,
Command,
} }
pub struct MtlSurface { pub struct MtlSurface {
@ -81,9 +100,22 @@ pub struct Semaphore;
pub struct CmdBuf { pub struct CmdBuf {
cmd_buf: metal::CommandBuffer, cmd_buf: metal::CommandBuffer,
helpers: Arc<Helpers>, helpers: Arc<Helpers>,
cur_encoder: Encoder,
time_calibration: Arc<Mutex<TimeCalibration>>,
counter_style: CounterStyle,
} }
pub struct QueryPool; enum Encoder {
None,
Compute(metal::ComputeCommandEncoder, Option<(id, u32)>),
Blit(metal::BlitCommandEncoder),
}
#[derive(Default)]
pub struct QueryPool {
counter_sample_buf: Option<CounterSampleBuffer>,
calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>,
}
pub struct Pipeline(metal::ComputePipelineState); pub struct Pipeline(metal::ComputePipelineState);
@ -209,18 +241,43 @@ impl MtlDevice {
let helpers = Arc::new(Helpers { let helpers = Arc::new(Helpers {
clear_pipeline: clear::make_clear_pipeline(&device), clear_pipeline: clear::make_clear_pipeline(&device),
}); });
// Timer stuff
let timer_set = CounterSet::get_timer_counter_set(&device);
let counter_style = if timer_set.is_some() {
if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) {
CounterStyle::Stage
} else if device
.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary)
{
CounterStyle::Command
} else {
CounterStyle::None
}
} else {
CounterStyle::None
};
MtlDevice { MtlDevice {
device, device,
cmd_queue: Arc::new(Mutex::new(cmd_queue)), cmd_queue: Arc::new(Mutex::new(cmd_queue)),
gpu_info, gpu_info,
helpers, helpers,
timer_set,
counter_style,
} }
} }
pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf { pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf {
let cmd_buf = raw_cmd_buf; let cmd_buf = raw_cmd_buf;
let helpers = self.helpers.clone(); let helpers = self.helpers.clone();
CmdBuf { cmd_buf, helpers } let cur_encoder = Encoder::None;
let time_calibration = Default::default();
CmdBuf {
cmd_buf,
helpers,
cur_encoder,
time_calibration,
counter_style: self.counter_style,
}
} }
pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image { pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image {
@ -330,11 +387,35 @@ impl crate::backend::Device for MtlDevice {
fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> { fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> {
let cmd_queue = self.cmd_queue.lock().unwrap(); let cmd_queue = self.cmd_queue.lock().unwrap();
// A discussion about autorelease pools.
//
// Autorelease pools are a sore point in Rust/Objective-C interop. Basically,
// you can have any two of correctness, ergonomics, and performance. Here we've
// chosen the first two, using the pattern of a fine grained autorelease pool
// to give the Obj-C object Rust-like lifetime semantics whenever objects are
// created as autorelease (by convention, this is any object creation with an
// Obj-C method name that doesn't begin with "new" or "alloc").
//
// To gain back some of the performance, we'd need a way to wrap an autorelease
// pool over a chunk of work - that could be one frame of rendering, but for
// tests that iterate a number of command buffer submissions, it would need to
// be around that. On non-mac platforms, it would be a no-op.
//
// In any case, this way, the caller doesn't need to worry, and the performance
// hit might not be so bad (perhaps we should measure).
// consider new_command_buffer_with_unretained_references for performance // consider new_command_buffer_with_unretained_references for performance
let cmd_buf = cmd_queue.new_command_buffer(); let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());
let cmd_buf = autoreleasepool(|| cmd_buf.to_owned());
let helpers = self.helpers.clone(); let helpers = self.helpers.clone();
Ok(CmdBuf { cmd_buf, helpers }) let cur_encoder = Encoder::None;
let time_calibration = Default::default();
Ok(CmdBuf {
cmd_buf,
helpers,
cur_encoder,
time_calibration,
counter_style: self.counter_style,
})
} }
unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> { unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> {
@ -342,12 +423,31 @@ impl crate::backend::Device for MtlDevice {
} }
fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> { fn create_query_pool(&self, n_queries: u32) -> Result<Self::QueryPool, Error> {
// TODO if let Some(timer_set) = &self.timer_set {
Ok(QueryPool) let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set)
.ok_or("error creating timer query pool")?;
return Ok(QueryPool {
counter_sample_buf: Some(pool),
calibration: Default::default(),
});
}
Ok(QueryPool::default())
} }
unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> { unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result<Vec<f64>, Error> {
// TODO if let Some(raw) = &pool.counter_sample_buf {
let resolved = raw.resolve();
let calibration = pool.calibration.lock().unwrap();
if let Some(calibration) = &*calibration {
let calibration = calibration.lock().unwrap();
let result = resolved
.iter()
.map(|time_ns| calibration.correlate(*time_ns))
.collect();
return Ok(result);
}
}
// Maybe should return None indicating it wasn't successful? But that might break.
Ok(Vec::new()) Ok(Vec::new())
} }
@ -358,7 +458,37 @@ impl crate::backend::Device for MtlDevice {
_signal_semaphores: &[&Self::Semaphore], _signal_semaphores: &[&Self::Semaphore],
fence: Option<&mut Self::Fence>, fence: Option<&mut Self::Fence>,
) -> Result<(), Error> { ) -> Result<(), Error> {
unsafe fn add_scheduled_handler(
cmd_buf: &metal::CommandBufferRef,
block: &Block<(&CommandBufferRef,), ()>,
) {
msg_send![cmd_buf, addScheduledHandler: block]
}
for cmd_buf in cmd_bufs { for cmd_buf in cmd_bufs {
let time_calibration = cmd_buf.time_calibration.clone();
let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
let device: id = msg_send![buffer, device];
let mut time_calibration = time_calibration.lock().unwrap();
let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _;
let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _;
// TODO: only do this if supported.
let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
})
.copy();
add_scheduled_handler(&cmd_buf.cmd_buf, &start_block);
let time_calibration = cmd_buf.time_calibration.clone();
let completed_block =
block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| {
let device: id = msg_send![buffer, device];
let mut time_calibration = time_calibration.lock().unwrap();
let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _;
let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _;
// TODO: only do this if supported.
let () =
msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr];
})
.copy();
cmd_buf.cmd_buf.add_completed_handler(&completed_block);
cmd_buf.cmd_buf.commit(); cmd_buf.cmd_buf.commit();
} }
if let Some(last_cmd_buf) = cmd_bufs.last() { if let Some(last_cmd_buf) = cmd_bufs.last() {
@ -439,12 +569,70 @@ impl crate::backend::Device for MtlDevice {
impl crate::backend::CmdBuf<MtlDevice> for CmdBuf { impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
unsafe fn begin(&mut self) {} unsafe fn begin(&mut self) {}
unsafe fn finish(&mut self) {} unsafe fn finish(&mut self) {
self.flush_encoder();
}
unsafe fn reset(&mut self) -> bool { unsafe fn reset(&mut self) -> bool {
false false
} }
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
// TODO: we might want to get better about validation but the following
// assert is likely to trigger, and also a case can be made that
// validation should be done at the hub level, for consistency.
//debug_assert!(matches!(self.cur_encoder, Encoder::None));
self.flush_encoder();
autoreleasepool(|| {
let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) {
(Some(queries), CounterStyle::Stage) => {
let descriptor: id =
msg_send![class!(MTLComputePassDescriptor), computePassDescriptor];
let attachments: id = msg_send![descriptor, sampleBufferAttachments];
let index: NSUInteger = 0;
let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index];
// Here we break the hub/mux separation a bit, for expedience
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
if let Some(sample_buf) = &query_pool.counter_sample_buf {
let () = msg_send![attachment, setSampleBuffer: sample_buf.id()];
}
}
let start_index = queries.1 as NSUInteger;
let end_index = queries.2 as NSInteger;
let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index];
let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index];
(
msg_send![
self.cmd_buf,
computeCommandEncoderWithDescriptor: descriptor
],
None,
)
}
(Some(queries), CounterStyle::Command) => {
let encoder = self.cmd_buf.new_compute_command_encoder();
#[allow(irrefutable_let_patterns)]
let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 {
if let Some(sample_buf) = &query_pool.counter_sample_buf {
let sample_index = queries.1 as NSUInteger;
let sample_buf = sample_buf.id();
let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
Some((sample_buf, queries.2))
} else {
None
}
} else {
None
};
(encoder, end_query)
}
_ => (self.cmd_buf.new_compute_command_encoder(), None),
};
self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query);
});
}
unsafe fn dispatch( unsafe fn dispatch(
&mut self, &mut self,
pipeline: &Pipeline, pipeline: &Pipeline,
@ -452,7 +640,7 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
workgroup_count: (u32, u32, u32), workgroup_count: (u32, u32, u32),
workgroup_size: (u32, u32, u32), workgroup_size: (u32, u32, u32),
) { ) {
let encoder = self.cmd_buf.new_compute_command_encoder(); let encoder = self.compute_command_encoder();
encoder.set_compute_pipeline_state(&pipeline.0); encoder.set_compute_pipeline_state(&pipeline.0);
let mut buf_ix = 0; let mut buf_ix = 0;
for buffer in &descriptor_set.buffers { for buffer in &descriptor_set.buffers {
@ -475,7 +663,11 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
depth: workgroup_size.2 as u64, depth: workgroup_size.2 as u64,
}; };
encoder.dispatch_thread_groups(workgroup_count, workgroup_size); encoder.dispatch_thread_groups(workgroup_count, workgroup_size);
encoder.end_encoding(); }
unsafe fn end_compute_pass(&mut self) {
// TODO: might validate that we are in a compute encoder state
self.flush_encoder();
} }
unsafe fn memory_barrier(&mut self) { unsafe fn memory_barrier(&mut self) {
@ -494,22 +686,23 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
// I think these are being tracked. // I think these are being tracked.
} }
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) { unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let size = size.unwrap_or(buffer.size); let size = size.unwrap_or(buffer.size);
let encoder = self.cmd_buf.new_compute_command_encoder(); let _ = self.compute_command_encoder();
clear::encode_clear(&encoder, &self.helpers.clear_pipeline, &buffer.buffer, size); // Getting this directly is a workaround for a borrow checker issue.
encoder.end_encoding() if let Encoder::Compute(e, _) = &self.cur_encoder {
clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size);
}
} }
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
let encoder = self.cmd_buf.new_blit_command_encoder(); let encoder = self.blit_command_encoder();
let size = src.size.min(dst.size); let size = src.size.min(dst.size);
encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size); encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size);
encoder.end_encoding();
} }
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
let encoder = self.cmd_buf.new_blit_command_encoder(); let encoder = self.blit_command_encoder();
assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4); assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4);
let bytes_per_row = (src.width * 4) as NSUInteger; let bytes_per_row = (src.width * 4) as NSUInteger;
let src_size = metal::MTLSize { let src_size = metal::MTLSize {
@ -530,11 +723,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
bytes_per_row * src.height as NSUInteger, bytes_per_row * src.height as NSUInteger,
metal::MTLBlitOption::empty(), metal::MTLBlitOption::empty(),
); );
encoder.end_encoding();
} }
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
let encoder = self.cmd_buf.new_blit_command_encoder(); let encoder = self.blit_command_encoder();
assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4); assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4);
let bytes_per_row = (dst.width * 4) as NSUInteger; let bytes_per_row = (dst.width * 4) as NSUInteger;
let src_size = metal::MTLSize { let src_size = metal::MTLSize {
@ -555,11 +747,10 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
origin, origin,
metal::MTLBlitOption::empty(), metal::MTLBlitOption::empty(),
); );
encoder.end_encoding();
} }
unsafe fn blit_image(&self, src: &Image, dst: &Image) { unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
let encoder = self.cmd_buf.new_blit_command_encoder(); let encoder = self.blit_command_encoder();
let src_size = metal::MTLSize { let src_size = metal::MTLSize {
width: src.width.min(dst.width) as NSUInteger, width: src.width.min(dst.width) as NSUInteger,
height: src.width.min(dst.height) as NSUInteger, height: src.width.min(dst.height) as NSUInteger,
@ -577,15 +768,79 @@ impl crate::backend::CmdBuf<MtlDevice> for CmdBuf {
0, 0,
origin, origin,
); );
encoder.end_encoding();
} }
unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {} unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {
let mut calibration = pool.calibration.lock().unwrap();
*calibration = Some(self.time_calibration.clone());
}
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
// TODO if let Some(buf) = &pool.counter_sample_buf {
// This really a PITA because it's pretty different than Vulkan. if matches!(self.cur_encoder, Encoder::None) {
// See https://developer.apple.com/documentation/metal/counter_sampling self.cur_encoder =
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
}
let sample_index = query as NSUInteger;
if self.counter_style == CounterStyle::Command {
match &self.cur_encoder {
Encoder::Compute(e, _) => {
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true];
}
Encoder::None => unreachable!(),
_ => todo!(),
}
} else if self.counter_style == CounterStyle::Stage {
match &self.cur_encoder {
Encoder::Compute(_e, _) => {
println!("write_timestamp is not supported for stage-style encoders");
}
_ => (),
}
}
}
}
}
impl CmdBuf {
fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder {
if !matches!(self.cur_encoder, Encoder::Compute(..)) {
self.flush_encoder();
self.cur_encoder =
Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None);
}
if let Encoder::Compute(e, _) = &self.cur_encoder {
e
} else {
unreachable!()
}
}
fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder {
if !matches!(self.cur_encoder, Encoder::Blit(_)) {
self.flush_encoder();
self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned());
}
if let Encoder::Blit(e) = &self.cur_encoder {
e
} else {
unreachable!()
}
}
fn flush_encoder(&mut self) {
match std::mem::replace(&mut self.cur_encoder, Encoder::None) {
Encoder::Compute(e, Some((sample_buf, end_query))) => {
let sample_index = end_query as NSUInteger;
unsafe {
let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true];
}
e.end_encoding();
}
Encoder::Compute(e, None) => e.end_encoding(),
Encoder::Blit(e) => e.end_encoding(),
Encoder::None => (),
}
} }
} }
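The autorelease-pool comment above is easy to miss inside the diff; a hedged sketch of the two strategies it contrasts (macOS only, `cmd_queue` assumed to be a metal::CommandQueue):

    // Fine-grained pool, as used in this patch: one pool per autoreleased
    // object creation, retaining the object before the pool drains.
    let cmd_buf = objc::rc::autoreleasepool(|| cmd_queue.new_command_buffer().to_owned());

    // Coarser-grained alternative mentioned in the comment (hypothetical):
    // wrap a whole frame or test iteration, and make it a no-op off macOS.
    objc::rc::autoreleasepool(|| {
        // encode, submit, and wait for a frame's worth of work here
    });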

piet-gpu-hal/src/metal/timer.rs

@ -0,0 +1,172 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
//! Support for timer queries.
//!
//! Likely some of this should be upstreamed into metal-rs.
use std::{ffi::CStr, ptr::null_mut};
use cocoa_foundation::{
base::id,
foundation::{NSRange, NSUInteger},
};
use metal::{DeviceRef, MTLStorageMode};
use objc::{class, msg_send, sel, sel_impl};
pub struct CounterSampleBuffer {
id: id,
count: u64,
}
pub struct CounterSet {
id: id,
}
#[derive(Default)]
pub struct TimeCalibration {
pub cpu_start_ts: u64,
pub gpu_start_ts: u64,
pub cpu_end_ts: u64,
pub gpu_end_ts: u64,
}
impl Drop for CounterSampleBuffer {
fn drop(&mut self) {
unsafe { msg_send![self.id, release] }
}
}
impl Clone for CounterSampleBuffer {
fn clone(&self) -> CounterSampleBuffer {
unsafe {
CounterSampleBuffer {
id: msg_send![self.id, retain],
count: self.count,
}
}
}
}
impl CounterSampleBuffer {
pub fn id(&self) -> id {
self.id
}
}
impl Drop for CounterSet {
fn drop(&mut self) {
unsafe { msg_send![self.id, release] }
}
}
impl CounterSet {
pub fn get_timer_counter_set(device: &DeviceRef) -> Option<CounterSet> {
unsafe {
// TODO: version check
let sets: id = msg_send!(device, counterSets);
let count: NSUInteger = msg_send![sets, count];
for i in 0..count {
let set: id = msg_send![sets, objectAtIndex: i];
let name: id = msg_send![set, name];
let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]);
if name_cstr.to_bytes() == b"timestamp" {
return Some(CounterSet { id: set });
}
}
None
}
}
}
// copied from metal-rs; should be in common utilities maybe?
fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str {
let bytes = unsafe {
let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String];
bytes as *const u8
};
let len: NSUInteger = unsafe { msg_send![nsstr, length] };
unsafe {
let bytes = std::slice::from_raw_parts(bytes, len as usize);
std::str::from_utf8(bytes).unwrap()
}
}
impl CounterSampleBuffer {
pub fn new(
device: &DeviceRef,
count: u64,
counter_set: &CounterSet,
) -> Option<CounterSampleBuffer> {
unsafe {
let desc_cls = class!(MTLCounterSampleBufferDescriptor);
let descriptor: id = msg_send![desc_cls, alloc];
let _: id = msg_send![descriptor, init];
let count = count as NSUInteger;
let () = msg_send![descriptor, setSampleCount: count];
let () = msg_send![descriptor, setCounterSet: counter_set.id];
let () = msg_send![
descriptor,
setStorageMode: MTLStorageMode::Shared as NSUInteger
];
let mut error: id = null_mut();
let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error];
let () = msg_send![descriptor, release];
if !error.is_null() {
let description = msg_send![error, localizedDescription];
println!(
"error allocating sample buffer, code = {}",
nsstring_as_str(description)
);
let () = msg_send![error, release];
return None;
}
Some(CounterSampleBuffer { id: buf, count })
}
}
// Read the timestamps.
//
// Safety: the lifetime of the returned slice is wrong, it's actually autoreleased.
pub unsafe fn resolve(&self) -> &[u64] {
let range = NSRange::new(0, self.count);
let data: id = msg_send![self.id, resolveCounterRange: range];
if data.is_null() {
&[]
} else {
let bytes: *const u64 = msg_send![data, bytes];
std::slice::from_raw_parts(bytes, self.count as usize)
}
}
}
impl TimeCalibration {
/// Convert GPU timestamp into CPU time base.
///
/// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps
pub fn correlate(&self, raw_ts: u64) -> f64 {
let delta_cpu = self.cpu_end_ts - self.cpu_start_ts;
let delta_gpu = self.gpu_end_ts - self.gpu_start_ts;
let adj_ts = if delta_gpu > 0 {
let scale = delta_cpu as f64 / delta_gpu as f64;
self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale
} else {
// Default is ns on Apple Silicon; on other hardware this will be wrong
raw_ts as f64
};
adj_ts * 1e-9
}
}
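To make the correlation arithmetic above concrete, a hedged example with made-up numbers (the field values are purely illustrative):

    let calib = TimeCalibration {
        cpu_start_ts: 1_000_000_000, // ns at scheduling time
        gpu_start_ts: 500_000,       // GPU ticks at scheduling time
        cpu_end_ts: 1_010_000_000,   // 10 ms later on the CPU clock
        gpu_end_ts: 1_500_000,       // GPU ticks at completion
    };
    // delta_cpu = 1e7 ns and delta_gpu = 1e6 ticks, so scale = 10 ns per tick;
    // a raw GPU timestamp halfway through the range maps to 5 ms after the
    // CPU start, reported in seconds.
    assert!((calib.correlate(1_000_000) - 1.005).abs() < 1e-9);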

piet-gpu-hal/src/mux.rs

@@ -35,6 +35,7 @@ use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait;
 use crate::backend::Device as DeviceTrait;
 use crate::BackendType;
 use crate::BindType;
+use crate::ComputePassDescriptor;
 use crate::ImageFormat;
 use crate::MapMode;
 use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags};
@@ -658,6 +659,14 @@ impl CmdBuf {
         }
     }

+    pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
+        mux_match! { self;
+            CmdBuf::Vk(c) => c.begin_compute_pass(desc),
+            CmdBuf::Dx12(c) => c.begin_compute_pass(desc),
+            CmdBuf::Mtl(c) => c.begin_compute_pass(desc),
+        }
+    }
+
     /// Dispatch a compute shader.
     ///
     /// Note that both the number of workgroups (`workgroup_count`) and the number of
@@ -680,6 +689,14 @@ impl CmdBuf {
         }
     }

+    pub unsafe fn end_compute_pass(&mut self) {
+        mux_match! { self;
+            CmdBuf::Vk(c) => c.end_compute_pass(),
+            CmdBuf::Dx12(c) => c.end_compute_pass(),
+            CmdBuf::Mtl(c) => c.end_compute_pass(),
+        }
+    }
+
     pub unsafe fn memory_barrier(&mut self) {
         mux_match! { self;
             CmdBuf::Vk(c) => c.memory_barrier(),

piet-gpu-hal/src/vulkan.rs

@ -15,7 +15,7 @@ use smallvec::SmallVec;
use crate::backend::Device as DeviceTrait; use crate::backend::Device as DeviceTrait;
use crate::{ use crate::{
BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize, BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize,
WorkgroupLimits, WorkgroupLimits, ComputePassDescriptor,
}; };
pub struct VkInstance { pub struct VkInstance {
@ -92,6 +92,7 @@ pub struct CmdBuf {
cmd_buf: vk::CommandBuffer, cmd_buf: vk::CommandBuffer,
cmd_pool: vk::CommandPool, cmd_pool: vk::CommandPool,
device: Arc<RawDevice>, device: Arc<RawDevice>,
end_query: Option<(vk::QueryPool, u32)>,
} }
pub struct QueryPool { pub struct QueryPool {
@ -738,6 +739,7 @@ impl crate::backend::Device for VkDevice {
cmd_buf, cmd_buf,
cmd_pool, cmd_pool,
device: self.device.clone(), device: self.device.clone(),
end_query: None,
}) })
} }
} }
@ -770,11 +772,10 @@ impl crate::backend::Device for VkDevice {
// results (Windows 10, AMD 5700 XT). // results (Windows 10, AMD 5700 XT).
let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT; let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT;
device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?; device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?;
let ts0 = buf[0];
let tsp = self.timestamp_period as f64 * 1e-9; let tsp = self.timestamp_period as f64 * 1e-9;
let result = buf[1..] let result = buf
.iter() .iter()
.map(|ts| ts.wrapping_sub(ts0) as f64 * tsp) .map(|ts| *ts as f64 * tsp)
.collect(); .collect();
Ok(result) Ok(result)
} }
@ -902,6 +903,16 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
true true
} }
unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) {
if let Some((pool, start, end)) = &desc.timer_queries {
#[allow(irrefutable_let_patterns)]
if let crate::hub::QueryPool::Vk(pool) = pool {
self.write_timestamp_raw(pool.pool, *start);
self.end_query = Some((pool.pool, *end));
}
}
}
unsafe fn dispatch( unsafe fn dispatch(
&mut self, &mut self,
pipeline: &Pipeline, pipeline: &Pipeline,
@ -931,6 +942,12 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn end_compute_pass(&mut self) {
if let Some((pool, end)) = self.end_query.take() {
self.write_timestamp_raw(pool, end);
}
}
/// Insert a pipeline barrier for all memory accesses. /// Insert a pipeline barrier for all memory accesses.
unsafe fn memory_barrier(&mut self) { unsafe fn memory_barrier(&mut self) {
let device = &self.device.device; let device = &self.device.device;
@ -995,13 +1012,13 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) { unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) {
let device = &self.device.device; let device = &self.device.device;
let size = size.unwrap_or(vk::WHOLE_SIZE); let size = size.unwrap_or(vk::WHOLE_SIZE);
device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0); device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0);
} }
unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) {
let device = &self.device.device; let device = &self.device.device;
let size = src.size.min(dst.size); let size = src.size.min(dst.size);
device.cmd_copy_buffer( device.cmd_copy_buffer(
@ -1012,7 +1029,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) {
let device = &self.device.device; let device = &self.device.device;
device.cmd_copy_image_to_buffer( device.cmd_copy_image_to_buffer(
self.cmd_buf, self.cmd_buf,
@ -1035,7 +1052,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) {
let device = &self.device.device; let device = &self.device.device;
device.cmd_copy_buffer_to_image( device.cmd_copy_buffer_to_image(
self.cmd_buf, self.cmd_buf,
@ -1058,7 +1075,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
); );
} }
unsafe fn blit_image(&self, src: &Image, dst: &Image) { unsafe fn blit_image(&mut self, src: &Image, dst: &Image) {
let device = &self.device.device; let device = &self.device.device;
device.cmd_blit_image( device.cmd_blit_image(
self.cmd_buf, self.cmd_buf,
@ -1106,13 +1123,7 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
} }
unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) {
let device = &self.device.device; self.write_timestamp_raw(pool.pool, query);
device.cmd_write_timestamp(
self.cmd_buf,
vk::PipelineStageFlags::COMPUTE_SHADER,
pool.pool,
query,
);
} }
unsafe fn begin_debug_label(&mut self, label: &str) { unsafe fn begin_debug_label(&mut self, label: &str) {
@ -1130,6 +1141,18 @@ impl crate::backend::CmdBuf<VkDevice> for CmdBuf {
} }
} }
impl CmdBuf {
unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) {
let device = &self.device.device;
device.cmd_write_timestamp(
self.cmd_buf,
vk::PipelineStageFlags::COMPUTE_SHADER,
pool,
query,
);
}
}
impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder { impl crate::backend::DescriptorSetBuilder<VkDevice> for DescriptorSetBuilder {
fn add_buffers(&mut self, buffers: &[&Buffer]) { fn add_buffers(&mut self, buffers: &[&Buffer]) {
self.buffers.extend(buffers.iter().map(|b| b.buffer)); self.buffers.extend(buffers.iter().map(|b| b.buffer));

piet-gpu/bin/cli.rs

@@ -6,7 +6,7 @@ use clap::{App, Arg};
 use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session};

-use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
+use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};

 const WIDTH: usize = 2048;
 const HEIGHT: usize = 1536;
@@ -243,7 +243,11 @@ fn main() -> Result<(), Error> {
         if matches.is_present("flip") {
             scale = -scale;
         }
-        test_scenes::render_svg(&mut ctx, input, scale);
+        let xml_str = std::fs::read_to_string(input).unwrap();
+        let start = std::time::Instant::now();
+        let svg = PicoSvg::load(&xml_str, scale).unwrap();
+        println!("parsing time: {:?}", start.elapsed());
+        test_scenes::render_svg(&mut ctx, &svg);
     } else {
         test_scenes::render_scene(&mut ctx);
     }

piet-gpu/bin/winit.rs

@ -2,7 +2,7 @@ use piet::kurbo::Point;
use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder}; use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf}; use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer}; use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
use clap::{App, Arg}; use clap::{App, Arg};
@ -29,6 +29,25 @@ fn main() -> Result<(), Error> {
) )
.get_matches(); .get_matches();
// Collect SVG if input
let svg = match matches.value_of("INPUT") {
Some(file) => {
let mut scale = matches
.value_of("scale")
.map(|scale| scale.parse().unwrap())
.unwrap_or(8.0);
if matches.is_present("flip") {
scale = -scale;
}
let xml_str = std::fs::read_to_string(file).unwrap();
let start = std::time::Instant::now();
let svg = PicoSvg::load(&xml_str, scale).unwrap();
println!("parsing time: {:?}", start.elapsed());
Some(svg)
}
None => None,
};
let event_loop = EventLoop::new(); let event_loop = EventLoop::new();
let window = WindowBuilder::new() let window = WindowBuilder::new()
.with_inner_size(winit::dpi::LogicalSize { .with_inner_size(winit::dpi::LogicalSize {
@ -51,7 +70,7 @@ fn main() -> Result<(), Error> {
.map(|_| session.create_semaphore()) .map(|_| session.create_semaphore())
.collect::<Result<Vec<_>, Error>>()?; .collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES) let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(8)) .map(|_| session.create_query_pool(12))
.collect::<Result<Vec<_>, Error>>()?; .collect::<Result<Vec<_>, Error>>()?;
let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default(); let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default(); let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
@ -93,29 +112,23 @@ fn main() -> Result<(), Error> {
if !ts.is_empty() { if !ts.is_empty() {
info_string = format!( info_string = format!(
"{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms", "{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
ts[6] * 1e3, ts[10] * 1e3,
ts[0] * 1e3, ts[0] * 1e3,
(ts[1] - ts[0]) * 1e3, (ts[1] - ts[0]) * 1e3,
(ts[2] - ts[1]) * 1e3, (ts[2] - ts[1]) * 1e3,
(ts[3] - ts[2]) * 1e3,
(ts[4] - ts[3]) * 1e3, (ts[4] - ts[3]) * 1e3,
(ts[5] - ts[4]) * 1e3,
(ts[6] - ts[5]) * 1e3, (ts[6] - ts[5]) * 1e3,
(ts[8] - ts[7]) * 1e3,
(ts[10] - ts[9]) * 1e3,
); );
} }
} }
let mut ctx = PietGpuRenderContext::new(); let mut ctx = PietGpuRenderContext::new();
if let Some(input) = matches.value_of("INPUT") { let test_blend = false;
let mut scale = matches if let Some(svg) = &svg {
.value_of("scale") test_scenes::render_svg(&mut ctx, svg);
.map(|scale| scale.parse().unwrap()) } else if test_blend {
.unwrap_or(8.0);
if matches.is_present("flip") {
scale = -scale;
}
test_scenes::render_svg(&mut ctx, input, scale);
} else {
use piet_gpu::{Blend, BlendMode::*, CompositionMode::*}; use piet_gpu::{Blend, BlendMode::*, CompositionMode::*};
let blends = [ let blends = [
Blend::new(Normal, SrcOver), Blend::new(Normal, SrcOver),
@ -151,6 +164,8 @@ fn main() -> Result<(), Error> {
let blend = blends[mode % blends.len()]; let blend = blends[mode % blends.len()];
test_scenes::render_blend_test(&mut ctx, current_frame, blend); test_scenes::render_blend_test(&mut ctx, current_frame, blend);
info_string = format!("{:?}", blend); info_string = format!("{:?}", blend);
} else {
test_scenes::render_anim_frame(&mut ctx, current_frame);
} }
render_info_string(&mut ctx, &info_string); render_info_string(&mut ctx, &info_string);
if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) { if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {
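The reshuffled indices above reflect that each compute pass now records a (start, end) timestamp pair; a hedged helper that makes the pairing explicit (the stage-to-index mapping itself lives in the renderer, not here):

    // Elapsed milliseconds between two entries of the fetched query results.
    fn stage_ms(ts: &[f64], start: usize, end: usize) -> f64 {
        (ts[end] - ts[start]) * 1e3
    }
    // e.g. the "bd" column above is stage_ms(&ts, 3, 4) and "r" is stage_ms(&ts, 9, 10).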

piet-gpu/shader/coarse.comp

@@ -306,7 +306,7 @@ void main() {
             is_blend = (blend != BlendComp_default);
         }
         include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
-            || (is_clip && is_blend);
+            || is_blend;
     }
     if (include_tile) {
         uint el_slice = el_ix / 32;

Binary file not shown.

Binary file not shown.

piet-gpu/shader/gen/coarse.hlsl

@ -931,23 +931,14 @@ void comp_main()
{ {
_1701 = _1692; _1701 = _1692;
} }
bool _1708; include_tile = _1701 || is_blend;
if (!_1701)
{
_1708 = is_clip && is_blend;
}
else
{
_1708 = _1701;
}
include_tile = _1708;
} }
if (include_tile) if (include_tile)
{ {
uint el_slice = el_ix / 32u; uint el_slice = el_ix / 32u;
uint el_mask = 1u << (el_ix & 31u); uint el_mask = 1u << (el_ix & 31u);
uint _1728; uint _1723;
InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1728); InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723);
} }
} }
GroupMemoryBarrierWithGroupSync(); GroupMemoryBarrierWithGroupSync();
@ -976,9 +967,9 @@ void comp_main()
{ {
uint param_25 = element_ref_ix; uint param_25 = element_ref_ix;
bool param_26 = mem_ok; bool param_26 = mem_ok;
TileRef _1805 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
Alloc param_27 = read_tile_alloc(param_25, param_26); Alloc param_27 = read_tile_alloc(param_25, param_26);
TileRef param_28 = _1805; TileRef param_28 = _1800;
Tile tile_1 = Tile_read(param_27, param_28); Tile tile_1 = Tile_read(param_27, param_28);
uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8); uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8);
@ -993,11 +984,11 @@ void comp_main()
Alloc param_29 = cmd_alloc; Alloc param_29 = cmd_alloc;
CmdRef param_30 = cmd_ref; CmdRef param_30 = cmd_ref;
uint param_31 = cmd_limit; uint param_31 = cmd_limit;
bool _1853 = alloc_cmd(param_29, param_30, param_31); bool _1848 = alloc_cmd(param_29, param_30, param_31);
cmd_alloc = param_29; cmd_alloc = param_29;
cmd_ref = param_30; cmd_ref = param_30;
cmd_limit = param_31; cmd_limit = param_31;
if (!_1853) if (!_1848)
{ {
break; break;
} }
@ -1008,10 +999,10 @@ void comp_main()
write_fill(param_32, param_33, param_34, param_35); write_fill(param_32, param_33, param_34, param_35);
cmd_ref = param_33; cmd_ref = param_33;
uint rgba = _1372.Load(dd_1 * 4 + 0); uint rgba = _1372.Load(dd_1 * 4 + 0);
CmdColor _1876 = { rgba }; CmdColor _1871 = { rgba };
Alloc param_36 = cmd_alloc; Alloc param_36 = cmd_alloc;
CmdRef param_37 = cmd_ref; CmdRef param_37 = cmd_ref;
CmdColor param_38 = _1876; CmdColor param_38 = _1871;
Cmd_Color_write(param_36, param_37, param_38); Cmd_Color_write(param_36, param_37, param_38);
cmd_ref.offset += 8u; cmd_ref.offset += 8u;
break; break;
@ -1021,11 +1012,11 @@ void comp_main()
Alloc param_39 = cmd_alloc; Alloc param_39 = cmd_alloc;
CmdRef param_40 = cmd_ref; CmdRef param_40 = cmd_ref;
uint param_41 = cmd_limit; uint param_41 = cmd_limit;
bool _1894 = alloc_cmd(param_39, param_40, param_41); bool _1889 = alloc_cmd(param_39, param_40, param_41);
cmd_alloc = param_39; cmd_alloc = param_39;
cmd_ref = param_40; cmd_ref = param_40;
cmd_limit = param_41; cmd_limit = param_41;
if (!_1894) if (!_1889)
{ {
break; break;
} }
@ -1052,11 +1043,11 @@ void comp_main()
Alloc param_49 = cmd_alloc; Alloc param_49 = cmd_alloc;
CmdRef param_50 = cmd_ref; CmdRef param_50 = cmd_ref;
uint param_51 = cmd_limit; uint param_51 = cmd_limit;
bool _1958 = alloc_cmd(param_49, param_50, param_51); bool _1953 = alloc_cmd(param_49, param_50, param_51);
cmd_alloc = param_49; cmd_alloc = param_49;
cmd_ref = param_50; cmd_ref = param_50;
cmd_limit = param_51; cmd_limit = param_51;
if (!_1958) if (!_1953)
{ {
break; break;
} }
@ -1086,11 +1077,11 @@ void comp_main()
Alloc param_59 = cmd_alloc; Alloc param_59 = cmd_alloc;
CmdRef param_60 = cmd_ref; CmdRef param_60 = cmd_ref;
uint param_61 = cmd_limit; uint param_61 = cmd_limit;
bool _2064 = alloc_cmd(param_59, param_60, param_61); bool _2059 = alloc_cmd(param_59, param_60, param_61);
cmd_alloc = param_59; cmd_alloc = param_59;
cmd_ref = param_60; cmd_ref = param_60;
cmd_limit = param_61; cmd_limit = param_61;
if (!_2064) if (!_2059)
{ {
break; break;
} }
@ -1103,27 +1094,27 @@ void comp_main()
uint index = _1372.Load(dd_1 * 4 + 0); uint index = _1372.Load(dd_1 * 4 + 0);
uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0); uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0);
int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
CmdImage _2103 = { index, offset_1 }; CmdImage _2098 = { index, offset_1 };
Alloc param_66 = cmd_alloc; Alloc param_66 = cmd_alloc;
CmdRef param_67 = cmd_ref; CmdRef param_67 = cmd_ref;
CmdImage param_68 = _2103; CmdImage param_68 = _2098;
Cmd_Image_write(param_66, param_67, param_68); Cmd_Image_write(param_66, param_67, param_68);
cmd_ref.offset += 12u; cmd_ref.offset += 12u;
break; break;
} }
case 5u: case 5u:
{ {
bool _2117 = tile_1.tile.offset == 0u; bool _2112 = tile_1.tile.offset == 0u;
bool _2123; bool _2118;
if (_2117) if (_2112)
{ {
_2123 = tile_1.backdrop == 0; _2118 = tile_1.backdrop == 0;
} }
else else
{ {
_2123 = _2117; _2118 = _2112;
} }
if (_2123) if (_2118)
{ {
clip_zero_depth = clip_depth + 1u; clip_zero_depth = clip_depth + 1u;
} }
@ -1132,11 +1123,11 @@ void comp_main()
Alloc param_69 = cmd_alloc; Alloc param_69 = cmd_alloc;
CmdRef param_70 = cmd_ref; CmdRef param_70 = cmd_ref;
uint param_71 = cmd_limit; uint param_71 = cmd_limit;
bool _2135 = alloc_cmd(param_69, param_70, param_71); bool _2130 = alloc_cmd(param_69, param_70, param_71);
cmd_alloc = param_69; cmd_alloc = param_69;
cmd_ref = param_70; cmd_ref = param_70;
cmd_limit = param_71; cmd_limit = param_71;
if (!_2135) if (!_2130)
{ {
break; break;
} }
@ -1154,11 +1145,11 @@ void comp_main()
Alloc param_74 = cmd_alloc; Alloc param_74 = cmd_alloc;
CmdRef param_75 = cmd_ref; CmdRef param_75 = cmd_ref;
uint param_76 = cmd_limit; uint param_76 = cmd_limit;
bool _2163 = alloc_cmd(param_74, param_75, param_76); bool _2158 = alloc_cmd(param_74, param_75, param_76);
cmd_alloc = param_74; cmd_alloc = param_74;
cmd_ref = param_75; cmd_ref = param_75;
cmd_limit = param_76; cmd_limit = param_76;
if (!_2163) if (!_2158)
{ {
break; break;
} }
@ -1169,10 +1160,10 @@ void comp_main()
write_fill(param_77, param_78, param_79, param_80); write_fill(param_77, param_78, param_79, param_80);
cmd_ref = param_78; cmd_ref = param_78;
uint blend_1 = _1372.Load(dd_1 * 4 + 0); uint blend_1 = _1372.Load(dd_1 * 4 + 0);
CmdEndClip _2186 = { blend_1 }; CmdEndClip _2181 = { blend_1 };
Alloc param_81 = cmd_alloc; Alloc param_81 = cmd_alloc;
CmdRef param_82 = cmd_ref; CmdRef param_82 = cmd_ref;
CmdEndClip param_83 = _2186; CmdEndClip param_83 = _2181;
Cmd_EndClip_write(param_81, param_82, param_83); Cmd_EndClip_write(param_81, param_82, param_83);
cmd_ref.offset += 8u; cmd_ref.offset += 8u;
break; break;
@ -1207,17 +1198,17 @@ void comp_main()
break; break;
} }
} }
bool _2233 = (bin_tile_x + tile_x) < _1005.Load(8); bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8);
bool _2242; bool _2237;
if (_2233) if (_2228)
{ {
_2242 = (bin_tile_y + tile_y) < _1005.Load(12); _2237 = (bin_tile_y + tile_y) < _1005.Load(12);
} }
else else
{ {
_2242 = _2233; _2237 = _2228;
} }
if (_2242) if (_2237)
{ {
Alloc param_84 = cmd_alloc; Alloc param_84 = cmd_alloc;
CmdRef param_85 = cmd_ref; CmdRef param_85 = cmd_ref;

View file

@ -954,22 +954,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
{ {
_1701 = _1692; _1701 = _1692;
} }
bool _1708; include_tile = _1701 || is_blend;
if (!_1701)
{
_1708 = is_clip && is_blend;
}
else
{
_1708 = _1701;
}
include_tile = _1708;
} }
if (include_tile) if (include_tile)
{ {
uint el_slice = el_ix / 32u; uint el_slice = el_ix / 32u;
uint el_mask = 1u << (el_ix & 31u); uint el_mask = 1u << (el_ix & 31u);
uint _1728 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
} }
} }
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
@ -1014,11 +1005,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_29 = cmd_alloc; Alloc param_29 = cmd_alloc;
CmdRef param_30 = cmd_ref; CmdRef param_30 = cmd_ref;
uint param_31 = cmd_limit; uint param_31 = cmd_limit;
bool _1853 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize); bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
cmd_alloc = param_29; cmd_alloc = param_29;
cmd_ref = param_30; cmd_ref = param_30;
cmd_limit = param_31; cmd_limit = param_31;
if (!_1853) if (!_1848)
{ {
break; break;
} }
@ -1041,11 +1032,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_39 = cmd_alloc; Alloc param_39 = cmd_alloc;
CmdRef param_40 = cmd_ref; CmdRef param_40 = cmd_ref;
uint param_41 = cmd_limit; uint param_41 = cmd_limit;
bool _1894 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize); bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
cmd_alloc = param_39; cmd_alloc = param_39;
cmd_ref = param_40; cmd_ref = param_40;
cmd_limit = param_41; cmd_limit = param_41;
if (!_1894) if (!_1889)
{ {
break; break;
} }
@ -1072,11 +1063,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_49 = cmd_alloc; Alloc param_49 = cmd_alloc;
CmdRef param_50 = cmd_ref; CmdRef param_50 = cmd_ref;
uint param_51 = cmd_limit; uint param_51 = cmd_limit;
bool _1958 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize); bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
cmd_alloc = param_49; cmd_alloc = param_49;
cmd_ref = param_50; cmd_ref = param_50;
cmd_limit = param_51; cmd_limit = param_51;
if (!_1958) if (!_1953)
{ {
break; break;
} }
@ -1106,11 +1097,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_59 = cmd_alloc; Alloc param_59 = cmd_alloc;
CmdRef param_60 = cmd_ref; CmdRef param_60 = cmd_ref;
uint param_61 = cmd_limit; uint param_61 = cmd_limit;
bool _2064 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize); bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
cmd_alloc = param_59; cmd_alloc = param_59;
cmd_ref = param_60; cmd_ref = param_60;
cmd_limit = param_61; cmd_limit = param_61;
if (!_2064) if (!_2059)
{ {
break; break;
} }
@ -1132,17 +1123,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
} }
case 5u: case 5u:
{ {
bool _2117 = tile_1.tile.offset == 0u; bool _2112 = tile_1.tile.offset == 0u;
bool _2123; bool _2118;
if (_2117) if (_2112)
{ {
_2123 = tile_1.backdrop == 0; _2118 = tile_1.backdrop == 0;
} }
else else
{ {
_2123 = _2117; _2118 = _2112;
} }
if (_2123) if (_2118)
{ {
clip_zero_depth = clip_depth + 1u; clip_zero_depth = clip_depth + 1u;
} }
@ -1151,11 +1142,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_69 = cmd_alloc; Alloc param_69 = cmd_alloc;
CmdRef param_70 = cmd_ref; CmdRef param_70 = cmd_ref;
uint param_71 = cmd_limit; uint param_71 = cmd_limit;
bool _2135 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize); bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
cmd_alloc = param_69; cmd_alloc = param_69;
cmd_ref = param_70; cmd_ref = param_70;
cmd_limit = param_71; cmd_limit = param_71;
if (!_2135) if (!_2130)
{ {
break; break;
} }
@ -1173,11 +1164,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
Alloc param_74 = cmd_alloc; Alloc param_74 = cmd_alloc;
CmdRef param_75 = cmd_ref; CmdRef param_75 = cmd_ref;
uint param_76 = cmd_limit; uint param_76 = cmd_limit;
bool _2163 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize); bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
cmd_alloc = param_74; cmd_alloc = param_74;
cmd_ref = param_75; cmd_ref = param_75;
cmd_limit = param_76; cmd_limit = param_76;
if (!_2163) if (!_2158)
{ {
break; break;
} }
@ -1225,17 +1216,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M
break; break;
} }
} }
bool _2233 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles; bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
bool _2242; bool _2237;
if (_2233) if (_2228)
{ {
_2242 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles; _2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
} }
else else
{ {
_2242 = _2233; _2237 = _2228;
} }
if (_2242) if (_2237)
{ {
Alloc param_84 = cmd_alloc; Alloc param_84 = cmd_alloc;
CmdRef param_85 = cmd_ref; CmdRef param_85 = cmd_ref;

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -37,147 +37,6 @@ pub struct Encoder {
n_clip: u32, n_clip: u32,
} }
#[derive(Copy, Clone, Debug)]
pub struct EncodedSceneRef<'a, T: Copy + Pod> {
pub transform_stream: &'a [T],
pub tag_stream: &'a [u8],
pub pathseg_stream: &'a [u8],
pub linewidth_stream: &'a [f32],
pub drawtag_stream: &'a [u32],
pub drawdata_stream: &'a [u8],
pub n_path: u32,
pub n_pathseg: u32,
pub n_clip: u32,
pub ramp_data: &'a [u32],
}
impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
/// Return a config for the element processing pipeline.
///
/// This does not include further pipeline processing. Also returns the
/// beginning of free memory.
pub fn stage_config(&self) -> (Config, usize) {
// Layout of scene buffer
let drawtag_offset = 0;
let n_drawobj = self.n_drawobj();
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_stream.len();
let n_trans = self.transform_stream.len();
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let n_linewidth = self.linewidth_stream.len();
let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
let n_pathtag = self.tag_stream.len();
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
let config = Config {
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path,
n_clip: self.n_clip,
trans_offset: trans_offset as u32,
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
..Default::default()
};
(config, alloc)
}
pub fn write_scene(&self, buf: &mut BufWrite) {
buf.extend_slice(&self.drawtag_stream);
let n_drawobj = self.drawtag_stream.len();
buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE);
buf.extend_slice(&self.drawdata_stream);
buf.extend_slice(&self.transform_stream);
let n_trans = self.transform_stream.len();
buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE);
buf.extend_slice(&self.linewidth_stream);
buf.extend_slice(&self.tag_stream);
let n_pathtag = self.tag_stream.len();
buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize));
buf.extend_slice(&self.pathseg_stream);
}
/// The number of draw objects in the draw object stream.
pub(crate) fn n_drawobj(&self) -> usize {
self.drawtag_stream.len()
}
/// The number of paths.
pub(crate) fn n_path(&self) -> u32 {
self.n_path
}
/// The number of path segments.
pub(crate) fn n_pathseg(&self) -> u32 {
self.n_pathseg
}
pub(crate) fn n_transform(&self) -> usize {
self.transform_stream.len()
}
/// The number of tags in the path stream.
pub(crate) fn n_pathtag(&self) -> usize {
self.tag_stream.len()
}
pub(crate) fn n_clip(&self) -> u32 {
self.n_clip
}
}
/// A scene fragment encoding a glyph. /// A scene fragment encoding a glyph.
/// ///
/// This is a reduced version of the full encoder. /// This is a reduced version of the full encoder.
@ -471,21 +330,6 @@ impl Encoder {
self.n_path += glyph.n_path; self.n_path += glyph.n_path;
self.n_pathseg += glyph.n_pathseg; self.n_pathseg += glyph.n_pathseg;
} }
pub(crate) fn scene_ref(&self) -> EncodedSceneRef<stages::Transform> {
EncodedSceneRef {
transform_stream: &self.transform_stream,
tag_stream: &self.tag_stream,
pathseg_stream: &self.pathseg_stream,
linewidth_stream: &self.linewidth_stream,
drawtag_stream: &self.drawtag_stream,
drawdata_stream: &self.drawdata_stream,
n_path: self.n_path,
n_pathseg: self.n_pathseg,
n_clip: self.n_clip,
ramp_data: &[],
}
}
} }
fn align_up(x: usize, align: usize) -> usize { fn align_up(x: usize, align: usize) -> usize {

View file

@ -10,10 +10,7 @@ mod text;
use std::convert::TryInto; use std::convert::TryInto;
use bytemuck::Pod;
pub use blend::{Blend, BlendMode, CompositionMode}; pub use blend::{Blend, BlendMode, CompositionMode};
pub use encoder::EncodedSceneRef;
pub use render_ctx::PietGpuRenderContext; pub use render_ctx::PietGpuRenderContext;
pub use gradient::Colrv1RadialGradient; pub use gradient::Colrv1RadialGradient;
@ -21,11 +18,11 @@ use piet::kurbo::Vec2;
use piet::{ImageFormat, RenderContext}; use piet::{ImageFormat, RenderContext};
use piet_gpu_hal::{ use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, include_shader, BindType, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor, DescriptorSet,
ImageLayout, Pipeline, QueryPool, Session, Error, Image, ImageLayout, Pipeline, QueryPool, Session,
}; };
use pico_svg::PicoSvg; pub use pico_svg::PicoSvg;
use stages::{ClipBinding, ElementBinding, ElementCode}; use stages::{ClipBinding, ElementBinding, ElementCode};
use crate::stages::{ClipCode, Config, ElementStage}; use crate::stages::{ClipCode, Config, ElementStage};
@ -358,27 +355,16 @@ impl Renderer {
render_ctx: &mut PietGpuRenderContext, render_ctx: &mut PietGpuRenderContext,
buf_ix: usize, buf_ix: usize,
) -> Result<(), Error> { ) -> Result<(), Error> {
let mut scene = render_ctx.encoded_scene(); let (mut config, mut alloc) = render_ctx.stage_config();
let ramp_data = render_ctx.get_ramp_data(); let n_drawobj = render_ctx.n_drawobj();
scene.ramp_data = &ramp_data;
self.upload_scene(&scene, buf_ix)
}
pub fn upload_scene<T: Copy + Pod>(
&mut self,
scene: &EncodedSceneRef<T>,
buf_ix: usize,
) -> Result<(), Error> {
let (mut config, mut alloc) = scene.stage_config();
let n_drawobj = scene.n_drawobj();
// TODO: be more consistent in size types // TODO: be more consistent in size types
let n_path = scene.n_path() as usize; let n_path = render_ctx.n_path() as usize;
self.n_paths = n_path; self.n_paths = n_path;
self.n_transform = scene.n_transform(); self.n_transform = render_ctx.n_transform();
self.n_drawobj = scene.n_drawobj(); self.n_drawobj = render_ctx.n_drawobj();
self.n_pathseg = scene.n_pathseg() as usize; self.n_pathseg = render_ctx.n_pathseg() as usize;
self.n_pathtag = scene.n_pathtag(); self.n_pathtag = render_ctx.n_pathtag();
self.n_clip = scene.n_clip(); self.n_clip = render_ctx.n_clip();
// These constants depend on encoding and may need to be updated. // These constants depend on encoding and may need to be updated.
// Perhaps we can plumb these from piet-gpu-derive? // Perhaps we can plumb these from piet-gpu-derive?
@ -402,18 +388,19 @@ impl Renderer {
// TODO: reallocate scene buffer if size is inadequate // TODO: reallocate scene buffer if size is inadequate
{ {
let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?; let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
scene.write_scene(&mut mapped_scene); render_ctx.write_scene(&mut mapped_scene);
} }
self.config_bufs[buf_ix].write(&[config])?; self.config_bufs[buf_ix].write(&[config])?;
self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?; self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
// Upload gradient data. // Upload gradient data.
if !scene.ramp_data.is_empty() { let ramp_data = render_ctx.get_ramp_data();
if !ramp_data.is_empty() {
assert!( assert!(
self.gradient_bufs[buf_ix].size() as usize self.gradient_bufs[buf_ix].size() as usize
>= std::mem::size_of_val(&*scene.ramp_data) >= std::mem::size_of_val(&*ramp_data)
); );
self.gradient_bufs[buf_ix].write(scene.ramp_data)?; self.gradient_bufs[buf_ix].write(&ramp_data)?;
} }
} }
Ok(()) Ok(())
@ -437,10 +424,10 @@ impl Renderer {
cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients); cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients);
cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General); cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General);
cmd_buf.reset_query_pool(&query_pool); cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.begin_debug_label("Element bounding box calculation"); cmd_buf.begin_debug_label("Element bounding box calculation");
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
self.element_stage.record( self.element_stage.record(
cmd_buf, &mut pass,
&self.element_code, &self.element_code,
&self.element_bindings[buf_ix], &self.element_bindings[buf_ix],
self.n_transform as u64, self.n_transform as u64,
@ -448,56 +435,59 @@ impl Renderer {
self.n_pathtag as u32, self.n_pathtag as u32,
self.n_drawobj as u64, self.n_drawobj as u64,
); );
pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Clip bounding box calculation"); let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3));
pass.begin_debug_label("Clip bounding box calculation");
self.clip_binding self.clip_binding
.record(cmd_buf, &self.clip_code, self.n_clip as u32); .record(&mut pass, &self.clip_code, self.n_clip as u32);
cmd_buf.end_debug_label(); pass.end_debug_label();
cmd_buf.begin_debug_label("Element binning"); pass.begin_debug_label("Element binning");
cmd_buf.dispatch( pass.dispatch(
&self.bin_pipeline, &self.bin_pipeline,
&self.bin_ds, &self.bin_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1), (((self.n_paths + 255) / 256) as u32, 1, 1),
(256, 1, 1), (256, 1, 1),
); );
cmd_buf.end_debug_label(); pass.end_debug_label();
cmd_buf.memory_barrier(); pass.memory_barrier();
cmd_buf.begin_debug_label("Tile allocation"); pass.begin_debug_label("Tile allocation");
cmd_buf.dispatch( pass.dispatch(
&self.tile_pipeline, &self.tile_pipeline,
&self.tile_ds[buf_ix], &self.tile_ds[buf_ix],
(((self.n_paths + 255) / 256) as u32, 1, 1), (((self.n_paths + 255) / 256) as u32, 1, 1),
(256, 1, 1), (256, 1, 1),
); );
cmd_buf.end_debug_label(); pass.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 2); pass.end();
cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Path flattening"); cmd_buf.begin_debug_label("Path flattening");
cmd_buf.dispatch( cmd_buf.memory_barrier();
let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5));
pass.dispatch(
&self.path_pipeline, &self.path_pipeline,
&self.path_ds, &self.path_ds,
(((self.n_pathseg + 31) / 32) as u32, 1, 1), (((self.n_pathseg + 31) / 32) as u32, 1, 1),
(32, 1, 1), (32, 1, 1),
); );
pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Backdrop propagation"); cmd_buf.begin_debug_label("Backdrop propagation");
cmd_buf.dispatch( let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7));
pass.dispatch(
&self.backdrop_pipeline, &self.backdrop_pipeline,
&self.backdrop_ds, &self.backdrop_ds,
(((self.n_paths + 255) / 256) as u32, 1, 1), (((self.n_paths + 255) / 256) as u32, 1, 1),
(256, self.backdrop_y, 1), (256, self.backdrop_y, 1),
); );
pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 4);
// TODO: redo query accounting // TODO: redo query accounting
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Coarse raster"); cmd_buf.begin_debug_label("Coarse raster");
cmd_buf.dispatch( let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9));
pass.dispatch(
&self.coarse_pipeline, &self.coarse_pipeline,
&self.coarse_ds[buf_ix], &self.coarse_ds[buf_ix],
( (
@ -507,11 +497,13 @@ impl Renderer {
), ),
(256, 1, 1), (256, 1, 1),
); );
pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 6);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.begin_debug_label("Fine raster"); cmd_buf.begin_debug_label("Fine raster");
cmd_buf.dispatch( let mut pass =
cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11));
pass.dispatch(
&self.k4_pipeline, &self.k4_pipeline,
&self.k4_ds, &self.k4_ds,
( (
@ -521,8 +513,8 @@ impl Renderer {
), ),
(8, 4, 1), (8, 4, 1),
); );
pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.write_timestamp(&query_pool, 7);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
} }
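Note: the hunks above all follow the same shape — the write_timestamp calls that bracketed each stage are replaced by a compute pass that carries its own timer-query pair. A minimal sketch of that per-stage pattern, assuming a cmd_buf, query_pool, pipeline, and descriptor_set already in scope inside the renderer's unsafe record method (the label text, workgroup counts, and the 0/1 query indices are placeholders, not the renderer's actual values):

    // One stage = one compute pass, bracketed by debug labels and followed by a barrier.
    cmd_buf.begin_debug_label("example stage");
    let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1));
    pass.dispatch(&pipeline, &descriptor_set, (n_workgroups, 1, 1), (256, 1, 1));
    pass.end();
    cmd_buf.end_debug_label();
    cmd_buf.memory_barrier();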

View file

@ -1,6 +1,6 @@
use std::borrow::Cow; use std::borrow::Cow;
use crate::encoder::{EncodedSceneRef, GlyphEncoder}; use crate::encoder::GlyphEncoder;
use crate::stages::{Config, Transform}; use crate::stages::{Config, Transform};
use crate::MAX_BLEND_STACK; use crate::MAX_BLEND_STACK;
use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape}; use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape};
@ -97,10 +97,6 @@ impl PietGpuRenderContext {
self.new_encoder.stage_config() self.new_encoder.stage_config()
} }
pub fn encoded_scene(&self) -> EncodedSceneRef<crate::stages::Transform> {
self.new_encoder.scene_ref()
}
/// Number of draw objects. /// Number of draw objects.
/// ///
/// This is for the new element processing pipeline. It's not necessarily the /// This is for the new element processing pipeline. It's not necessarily the

View file

@ -26,7 +26,7 @@ use bytemuck::{Pod, Zeroable};
pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE}; pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE};
pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE}; pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE};
pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE}; pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE};
use piet_gpu_hal::{Buffer, CmdBuf, Session}; use piet_gpu_hal::{Buffer, ComputePass, Session};
pub use transform::{ pub use transform::{
Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE, Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE,
}; };
@ -140,7 +140,7 @@ impl ElementStage {
pub unsafe fn record( pub unsafe fn record(
&self, &self,
cmd_buf: &mut CmdBuf, pass: &mut ComputePass,
code: &ElementCode, code: &ElementCode,
binding: &ElementBinding, binding: &ElementBinding,
n_transform: u64, n_transform: u64,
@ -149,14 +149,14 @@ impl ElementStage {
n_drawobj: u64, n_drawobj: u64,
) { ) {
self.transform_stage.record( self.transform_stage.record(
cmd_buf, pass,
&code.transform_code, &code.transform_code,
&binding.transform_binding, &binding.transform_binding,
n_transform, n_transform,
); );
// No memory barrier needed here; path has at least one before pathseg // No memory barrier needed here; path has at least one before pathseg
self.path_stage.record( self.path_stage.record(
cmd_buf, pass,
&code.path_code, &code.path_code,
&binding.path_binding, &binding.path_binding,
n_paths, n_paths,
@ -164,6 +164,6 @@ impl ElementStage {
); );
// No memory barrier needed here; draw has at least one before draw_leaf // No memory barrier needed here; draw has at least one before draw_leaf
self.draw_stage self.draw_stage
.record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj); .record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
} }
} }

View file

@ -16,7 +16,7 @@
//! The clip processing stage (includes substages). //! The clip processing stage (includes substages).
use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session}; use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session};
// Note that this isn't the code/stage/binding pattern of most of the other stages // Note that this isn't the code/stage/binding pattern of most of the other stages
// in the new element processing pipeline. We want to move those temporary buffers // in the new element processing pipeline. We want to move those temporary buffers
@ -69,26 +69,26 @@ impl ClipBinding {
/// Record the clip dispatches. /// Record the clip dispatches.
/// ///
/// Assumes memory barrier on entry. Provides memory barrier on exit. /// Assumes memory barrier on entry. Provides memory barrier on exit.
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) { pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) {
let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE; let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE;
if n_wg_reduce > 0 { if n_wg_reduce > 0 {
cmd_buf.dispatch( pass.dispatch(
&code.reduce_pipeline, &code.reduce_pipeline,
&self.reduce_ds, &self.reduce_ds,
(n_wg_reduce, 1, 1), (n_wg_reduce, 1, 1),
(CLIP_PART_SIZE, 1, 1), (CLIP_PART_SIZE, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
} }
let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE; let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE;
if n_wg > 0 { if n_wg > 0 {
cmd_buf.dispatch( pass.dispatch(
&code.leaf_pipeline, &code.leaf_pipeline,
&self.leaf_ds, &self.leaf_ds,
(n_wg, 1, 1), (n_wg, 1, 1),
(CLIP_PART_SIZE, 1, 1), (CLIP_PART_SIZE, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
} }
} }
} }
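Note: ClipBinding::record keeps its barrier contract (memory barrier assumed on entry, provided on exit) but now records into a ComputePass rather than a CmdBuf. A minimal calling sketch, assuming an already-built code/binding pair and the Commands test wrapper that appears later in this diff (the 0/1 query indices are arbitrary placeholders):

    commands.cmd_buf.memory_barrier(); // entry barrier is the caller's responsibility
    let mut pass = commands.compute_pass(0, 1);
    binding.record(&mut pass, &code, n_clip as u32);
    pass.end();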

View file

@ -19,7 +19,7 @@
use bytemuck::{Pod, Zeroable}; use bytemuck::{Pod, Zeroable};
use piet_gpu_hal::{ use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
}; };
/// The output element of the draw object stage. /// The output element of the draw object stage.
@ -130,7 +130,7 @@ impl DrawStage {
pub unsafe fn record( pub unsafe fn record(
&self, &self,
cmd_buf: &mut CmdBuf, pass: &mut ComputePass,
code: &DrawCode, code: &DrawCode,
binding: &DrawBinding, binding: &DrawBinding,
size: u64, size: u64,
@ -140,22 +140,22 @@ impl DrawStage {
} }
let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE; let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE;
if n_workgroups > 1 { if n_workgroups > 1 {
cmd_buf.dispatch( pass.dispatch(
&code.reduce_pipeline, &code.reduce_pipeline,
&binding.reduce_ds, &binding.reduce_ds,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(DRAW_WG as u32, 1, 1), (DRAW_WG as u32, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
cmd_buf.dispatch( pass.dispatch(
&code.root_pipeline, &code.root_pipeline,
&self.root_ds, &self.root_ds,
(1, 1, 1), (1, 1, 1),
(DRAW_WG as u32, 1, 1), (DRAW_WG as u32, 1, 1),
); );
} }
cmd_buf.memory_barrier(); pass.memory_barrier();
cmd_buf.dispatch( pass.dispatch(
&code.leaf_pipeline, &code.leaf_pipeline,
&binding.leaf_ds, &binding.leaf_ds,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),

View file

@ -17,7 +17,7 @@
//! The path stage (includes substages). //! The path stage (includes substages).
use piet_gpu_hal::{ use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
}; };
pub struct PathCode { pub struct PathCode {
@ -148,7 +148,7 @@ impl PathStage {
/// those are consumed. Result is written without barrier. /// those are consumed. Result is written without barrier.
pub unsafe fn record( pub unsafe fn record(
&self, &self,
cmd_buf: &mut CmdBuf, pass: &mut ComputePass,
code: &PathCode, code: &PathCode,
binding: &PathBinding, binding: &PathBinding,
n_paths: u32, n_paths: u32,
@ -166,15 +166,15 @@ impl PathStage {
let reduce_part_tags = REDUCE_PART_SIZE * 4; let reduce_part_tags = REDUCE_PART_SIZE * 4;
let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags; let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags;
if n_wg_tag_reduce > 1 { if n_wg_tag_reduce > 1 {
cmd_buf.dispatch( pass.dispatch(
&code.reduce_pipeline, &code.reduce_pipeline,
&binding.reduce_ds, &binding.reduce_ds,
(n_wg_tag_reduce, 1, 1), (n_wg_tag_reduce, 1, 1),
(REDUCE_WG, 1, 1), (REDUCE_WG, 1, 1),
); );
// I think we can skip root if n_wg_tag_reduce == 2 // I think we can skip root if n_wg_tag_reduce == 2
cmd_buf.memory_barrier(); pass.memory_barrier();
cmd_buf.dispatch( pass.dispatch(
&code.tag_root_pipeline, &code.tag_root_pipeline,
&self.tag_root_ds, &self.tag_root_ds,
(1, 1, 1), (1, 1, 1),
@ -183,15 +183,15 @@ impl PathStage {
// No barrier needed here; clear doesn't depend on path tags // No barrier needed here; clear doesn't depend on path tags
} }
let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG; let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG;
cmd_buf.dispatch( pass.dispatch(
&code.clear_pipeline, &code.clear_pipeline,
&binding.clear_ds, &binding.clear_ds,
(n_wg_clear, 1, 1), (n_wg_clear, 1, 1),
(CLEAR_WG, 1, 1), (CLEAR_WG, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE; let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE;
cmd_buf.dispatch( pass.dispatch(
&code.pathseg_pipeline, &code.pathseg_pipeline,
&binding.path_ds, &binding.path_ds,
(n_wg_pathseg, 1, 1), (n_wg_pathseg, 1, 1),

View file

@ -20,7 +20,7 @@ use bytemuck::{Pod, Zeroable};
use piet::kurbo::Affine; use piet::kurbo::Affine;
use piet_gpu_hal::{ use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session,
}; };
/// An affine transform. /// An affine transform.
@ -132,7 +132,7 @@ impl TransformStage {
pub unsafe fn record( pub unsafe fn record(
&self, &self,
cmd_buf: &mut CmdBuf, pass: &mut ComputePass,
code: &TransformCode, code: &TransformCode,
binding: &TransformBinding, binding: &TransformBinding,
size: u64, size: u64,
@ -142,22 +142,22 @@ impl TransformStage {
} }
let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE; let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
if n_workgroups > 1 { if n_workgroups > 1 {
cmd_buf.dispatch( pass.dispatch(
&code.reduce_pipeline, &code.reduce_pipeline,
&binding.reduce_ds, &binding.reduce_ds,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(TRANSFORM_WG as u32, 1, 1), (TRANSFORM_WG as u32, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
cmd_buf.dispatch( pass.dispatch(
&code.root_pipeline, &code.root_pipeline,
&self.root_ds, &self.root_ds,
(1, 1, 1), (1, 1, 1),
(TRANSFORM_WG as u32, 1, 1), (TRANSFORM_WG as u32, 1, 1),
); );
cmd_buf.memory_barrier(); pass.memory_barrier();
} }
cmd_buf.dispatch( pass.dispatch(
&code.leaf_pipeline, &code.leaf_pipeline,
&binding.leaf_ds, &binding.leaf_ds,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),

View file

@ -21,12 +21,7 @@ pub fn render_blend_test(rc: &mut PietGpuRenderContext, i: usize, blend: Blend)
rc.restore().unwrap(); rc.restore().unwrap();
} }
pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) { pub fn render_svg(rc: &mut impl RenderContext, svg: &PicoSvg) {
let xml_str = std::fs::read_to_string(filename).unwrap();
let start = std::time::Instant::now();
let svg = PicoSvg::load(&xml_str, scale).unwrap();
println!("parsing time: {:?}", start.elapsed());
let start = std::time::Instant::now(); let start = std::time::Instant::now();
svg.render(rc); svg.render(rc);
println!("flattening and encoding time: {:?}", start.elapsed()); println!("flattening and encoding time: {:?}", start.elapsed());

View file

@ -16,11 +16,11 @@
//! Utilities (and a benchmark) for clearing buffers with compute shaders. //! Utilities (and a benchmark) for clearing buffers with compute shaders.
use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet}; use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline}; use piet_gpu_hal::{Buffer, Pipeline};
use crate::config::Config; use crate::config::Config;
use crate::runner::{Commands, Runner}; use crate::runner::Runner;
use crate::test_result::TestResult; use crate::test_result::TestResult;
const WG_SIZE: u64 = 256; const WG_SIZE: u64 = 256;
@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
let mut total_elapsed = 0.0; let mut total_elapsed = 0.0;
for i in 0..n_iter { for i in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0); let mut pass = commands.compute_pass(0, 1);
stage.record(&mut commands, &code, &binding); stage.record(&mut pass, &code, &binding);
commands.write_timestamp(1); pass.end();
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
@ -108,17 +108,12 @@ impl ClearStage {
ClearBinding { descriptor_set } ClearBinding { descriptor_set }
} }
pub unsafe fn record( pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) {
&self,
commands: &mut Commands,
code: &ClearCode,
bindings: &ClearBinding,
) {
let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE; let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
// An issue: for clearing large buffers (>16M), we need to check the // An issue: for clearing large buffers (>16M), we need to check the
// number of workgroups against the (dynamically detected) limit, and // number of workgroups against the (dynamically detected) limit, and
// potentially issue multiple dispatches. // potentially issue multiple dispatches.
commands.cmd_buf.dispatch( pass.dispatch(
&code.pipeline, &code.pipeline,
&bindings.descriptor_set, &bindings.descriptor_set,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),

View file

@ -58,11 +58,11 @@ pub unsafe fn clip_test(runner: &mut Runner, config: &Config) -> TestResult {
let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf); let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf);
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0);
commands.upload(&memory); commands.upload(&memory);
binding.record(&mut commands.cmd_buf, &code, n_clip as u32); let mut pass = commands.compute_pass(0, 1);
binding.record(&mut pass, &code, n_clip as u32);
pass.end();
commands.download(&memory); commands.download(&memory);
commands.write_timestamp(1);
runner.submit(commands); runner.submit(commands);
let dst = memory.map_read(..); let dst = memory.map_read(..);
if let Some(failure) = data.verify(&dst) { if let Some(failure) = data.verify(&dst) {

View file

@ -77,9 +77,9 @@ pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult {
let n_iter = config.n_iter; let n_iter = config.n_iter;
for i in 0..n_iter { for i in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0); let mut pass = commands.compute_pass(0, 1);
stage.record(&mut commands.cmd_buf, &code, &binding, n_tag); stage.record(&mut pass, &code, &binding, n_tag);
commands.write_timestamp(1); pass.end();
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&memory); commands.download(&memory);

View file

@ -45,9 +45,7 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR
for i in 0..n_iter { for i in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
// Might clear only buckets to save time. // Might clear only buckets to save time.
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf); stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
commands.write_timestamp(1);
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&mem_buf); commands.download(&mem_buf);
@ -107,12 +105,14 @@ impl LinkedListStage {
commands.cmd_buf.clear_buffer(out_buf, None); commands.cmd_buf.clear_buffer(out_buf, None);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
let n_workgroups = N_BUCKETS / WG_SIZE; let n_workgroups = N_BUCKETS / WG_SIZE;
commands.cmd_buf.dispatch( let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline, &code.pipeline,
&bindings.descriptor_set, &bindings.descriptor_set,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1), (WG_SIZE as u32, 1, 1),
); );
pass.end();
} }
} }

View file

@ -59,9 +59,7 @@ pub unsafe fn run_message_passing_test(
let mut failures = 0; let mut failures = 0;
for _ in 0..n_iter { for _ in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &out_buf.dev_buf); stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
commands.write_timestamp(1);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
total_elapsed += runner.submit(commands); total_elapsed += runner.submit(commands);
@ -128,11 +126,13 @@ impl MessagePassingStage {
commands.cmd_buf.clear_buffer(&self.data_buf, None); commands.cmd_buf.clear_buffer(&self.data_buf, None);
commands.cmd_buf.clear_buffer(out_buf, None); commands.cmd_buf.clear_buffer(out_buf, None);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch( let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline, &code.pipeline,
&bindings.descriptor_set, &bindings.descriptor_set,
(256, 1, 1), (256, 1, 1),
(256, 1, 1), (256, 1, 1),
); );
pass.end();
} }
} }

View file

@ -105,15 +105,15 @@ pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf); commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.write_timestamp(0); let mut pass = commands.compute_pass(0, 1);
stage.record( stage.record(
&mut commands.cmd_buf, &mut pass,
&code, &code,
&binding, &binding,
path_data.n_path, path_data.n_path,
path_data.tags.len() as u32, path_data.tags.len() as u32,
); );
commands.write_timestamp(1); pass.end();
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&memory); commands.download(&memory);

View file

@ -85,9 +85,7 @@ pub unsafe fn run_prefix_test(
let mut total_elapsed = 0.0; let mut total_elapsed = 0.0;
for i in 0..n_iter { for i in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding); stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
@ -159,12 +157,14 @@ impl PrefixStage {
let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG; let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
commands.cmd_buf.clear_buffer(&self.state_buf, None); commands.cmd_buf.clear_buffer(&self.state_buf, None);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch( let mut pass = commands.compute_pass(0, 1);
pass.dispatch(
&code.pipeline, &code.pipeline,
&bindings.descriptor_set, &bindings.descriptor_set,
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1), (WG_SIZE as u32, 1, 1),
); );
pass.end();
// One thing that's missing here is registering the buffers so // One thing that's missing here is registering the buffers so
// they can be safely dropped by Rust code before the execution // they can be safely dropped by Rust code before the execution
// of the command buffer completes. // of the command buffer completes.

View file

@ -66,9 +66,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf); commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding); stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
@ -175,33 +173,35 @@ impl PrefixTreeStage {
code: &PrefixTreeCode, code: &PrefixTreeCode,
bindings: &PrefixTreeBinding, bindings: &PrefixTreeBinding,
) { ) {
let mut pass = commands.compute_pass(0, 1);
let n = self.tmp_bufs.len(); let n = self.tmp_bufs.len();
for i in 0..n { for i in 0..n {
let n_workgroups = self.sizes[i + 1]; let n_workgroups = self.sizes[i + 1];
commands.cmd_buf.dispatch( pass.dispatch(
&code.reduce_pipeline, &code.reduce_pipeline,
&bindings.descriptor_sets[i], &bindings.descriptor_sets[i],
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1), (WG_SIZE as u32, 1, 1),
); );
commands.cmd_buf.memory_barrier(); pass.memory_barrier();
} }
commands.cmd_buf.dispatch( pass.dispatch(
&code.root_pipeline, &code.root_pipeline,
&bindings.descriptor_sets[n], &bindings.descriptor_sets[n],
(1, 1, 1), (1, 1, 1),
(WG_SIZE as u32, 1, 1), (WG_SIZE as u32, 1, 1),
); );
for i in (0..n).rev() { for i in (0..n).rev() {
commands.cmd_buf.memory_barrier(); pass.memory_barrier();
let n_workgroups = self.sizes[i + 1]; let n_workgroups = self.sizes[i + 1];
commands.cmd_buf.dispatch( pass.dispatch(
&code.scan_pipeline, &code.scan_pipeline,
&bindings.descriptor_sets[2 * n - i], &bindings.descriptor_sets[2 * n - i],
(n_workgroups as u32, 1, 1), (n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1), (WG_SIZE as u32, 1, 1),
); );
} }
pass.end();
} }
} }

View file

@ -20,8 +20,8 @@ use std::ops::RangeBounds;
use bytemuck::Pod; use bytemuck::Pod;
use piet_gpu_hal::{ use piet_gpu_hal::{
BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags, BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass,
QueryPool, Session, ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session,
}; };
pub struct Runner { pub struct Runner {
@ -118,8 +118,14 @@ impl Runner {
} }
impl Commands { impl Commands {
pub unsafe fn write_timestamp(&mut self, query: u32) { /// Start a compute pass with timer queries.
self.cmd_buf.write_timestamp(&self.query_pool, query); pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass {
self.cmd_buf
.begin_compute_pass(&ComputePassDescriptor::timer(
&self.query_pool,
start_query,
end_query,
))
} }
pub unsafe fn upload(&mut self, buf: &BufStage) { pub unsafe fn upload(&mut self, buf: &BufStage) {
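Note: Commands now hands out a ComputePass wrapping a timer-query pair instead of exposing write_timestamp directly. A minimal sketch of one test iteration using it, assuming a stage/code/binding triple and an out_buf staging buffer as in the tests above (the 0/1 indices are placeholders):

    let mut commands = runner.commands();
    let mut pass = commands.compute_pass(0, 1);
    stage.record(&mut pass, &code, &binding);
    pass.end();
    commands.cmd_buf.memory_barrier();
    commands.download(&out_buf);
    let elapsed = runner.submit(commands);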

View file

@ -61,9 +61,9 @@ pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult
let n_iter = config.n_iter; let n_iter = config.n_iter;
for i in 0..n_iter { for i in 0..n_iter {
let mut commands = runner.commands(); let mut commands = runner.commands();
commands.write_timestamp(0); let mut pass = commands.compute_pass(0, 1);
stage.record(&mut commands.cmd_buf, &code, &binding, n_elements); stage.record(&mut pass, &code, &binding, n_elements);
commands.write_timestamp(1); pass.end();
if i == 0 || config.verify_all { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&memory); commands.download(&memory);