diff --git a/Cargo.lock b/Cargo.lock index 4e98c9e..cb6b76a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -925,6 +925,7 @@ dependencies = [ "block", "bytemuck", "cocoa-foundation", + "foreign-types", "metal", "objc", "raw-window-handle 0.3.4", diff --git a/piet-gpu-hal/Cargo.toml b/piet-gpu-hal/Cargo.toml index 29b51bd..f9b844a 100644 --- a/piet-gpu-hal/Cargo.toml +++ b/piet-gpu-hal/Cargo.toml @@ -28,3 +28,4 @@ metal = "0.22" objc = "0.2.5" block = "0.1.6" cocoa-foundation = "0.1" +foreign-types = "0.3.2" diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs index dae5b31..7aff938 100644 --- a/piet-gpu-hal/examples/collatz.rs +++ b/piet-gpu-hal/examples/collatz.rs @@ -1,4 +1,4 @@ -use piet_gpu_hal::{include_shader, BindType}; +use piet_gpu_hal::{include_shader, BindType, ComputePassDescriptor}; use piet_gpu_hal::{BufferUsage, Instance, InstanceFlags, Session}; fn main() { @@ -20,9 +20,9 @@ fn main() { let mut cmd_buf = session.cmd_buf().unwrap(); cmd_buf.begin(); cmd_buf.reset_query_pool(&query_pool); - cmd_buf.write_timestamp(&query_pool, 0); - cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1)); - cmd_buf.write_timestamp(&query_pool, 1); + let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1)); + pass.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1)); + pass.end(); cmd_buf.finish_timestamps(&query_pool); cmd_buf.host_barrier(); cmd_buf.finish(); diff --git a/piet-gpu-hal/src/backend.rs b/piet-gpu-hal/src/backend.rs index 02ac7cb..f2c67a1 100644 --- a/piet-gpu-hal/src/backend.rs +++ b/piet-gpu-hal/src/backend.rs @@ -17,7 +17,8 @@ //! The generic trait for backends to implement. use crate::{ - BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, + BindType, BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, ImageLayout, + MapMode, SamplerParams, }; pub trait Device: Sized { @@ -159,14 +160,32 @@ pub trait Device: Sized { unsafe fn create_sampler(&self, params: SamplerParams) -> Result; } +/// The trait implemented by backend command buffer implementations. +/// +/// Valid encoding is represented by a state machine (currently not validated +/// but it is easy to imagine there might be at least debug validation). Most +/// methods are only valid in a particular state, and some move it to another +/// state. pub trait CmdBuf { + /// Begin encoding. + /// + /// State: init -> ready unsafe fn begin(&mut self); + /// State: ready -> finished unsafe fn finish(&mut self); /// Return true if the command buffer is suitable for reuse. unsafe fn reset(&mut self) -> bool; + /// Begin a compute pass. + /// + /// State: ready -> in_compute_pass + unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor); + + /// Dispatch + /// + /// State: in_compute_pass unsafe fn dispatch( &mut self, pipeline: &D::Pipeline, @@ -175,6 +194,9 @@ pub trait CmdBuf { workgroup_size: (u32, u32, u32), ); + /// State: in_compute_pass -> ready + unsafe fn end_compute_pass(&mut self); + /// Insert an execution and memory barrier. /// /// Compute kernels (and other actions) after this barrier may read from buffers @@ -202,16 +224,16 @@ pub trait CmdBuf { /// This is readily supported in Vulkan, but for portability it is remarkably /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute /// kernel, or organize the code not to need it. 
- unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option); + unsafe fn clear_buffer(&mut self, buffer: &D::Buffer, size: Option); - unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer); + unsafe fn copy_buffer(&mut self, src: &D::Buffer, dst: &D::Buffer); - unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer); + unsafe fn copy_image_to_buffer(&mut self, src: &D::Image, dst: &D::Buffer); - unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image); + unsafe fn copy_buffer_to_image(&mut self, src: &D::Buffer, dst: &D::Image); // low portability, dx12 doesn't support it natively - unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image); + unsafe fn blit_image(&mut self, src: &D::Image, dst: &D::Image); /// Reset the query pool. /// @@ -227,7 +249,7 @@ pub trait CmdBuf { unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {} /// Begin a labeled section for debugging and profiling purposes. - unsafe fn begin_debug_label(&mut self, label: &str) {} + unsafe fn begin_debug_label(&mut self, _label: &str) {} /// End a section opened by `begin_debug_label`. unsafe fn end_debug_label(&mut self) {} diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs index 78ad449..c5e1e04 100644 --- a/piet-gpu-hal/src/dx12.rs +++ b/piet-gpu-hal/src/dx12.rs @@ -21,7 +21,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle}; use smallvec::SmallVec; -use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat}; +use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, MapMode, WorkgroupLimits, ImageFormat, ComputePassDescriptor}; use self::{ descriptor::{CpuHeapRefOwned, DescriptorPool, GpuHeapRefOwned}, @@ -76,6 +76,7 @@ pub struct CmdBuf { c: wrappers::GraphicsCommandList, allocator: CommandAllocator, needs_reset: bool, + end_query: Option<(wrappers::QueryHeap, u32)>, } pub struct Pipeline { @@ -360,6 +361,7 @@ impl crate::backend::Device for Dx12Device { c, allocator, needs_reset: false, + end_query: None, }) } } @@ -388,11 +390,10 @@ impl crate::backend::Device for Dx12Device { let mapped = self.map_buffer(&pool.buf, 0, size as u64, MapMode::Read)?; std::ptr::copy_nonoverlapping(mapped, buf.as_mut_ptr() as *mut u8, size); self.unmap_buffer(&pool.buf, 0, size as u64, MapMode::Read)?; - let ts0 = buf[0]; let tsp = (self.ts_freq as f64).recip(); - let result = buf[1..] + let result = buf .iter() - .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp) + .map(|ts| *ts as f64 * tsp) .collect(); Ok(result) } @@ -610,6 +611,16 @@ impl crate::backend::CmdBuf for CmdBuf { self.allocator.reset().is_ok() && self.c.reset(&self.allocator, None).is_ok() } + unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) { + if let Some((pool, start, end)) = &desc.timer_queries { + #[allow(irrefutable_let_patterns)] + if let crate::hub::QueryPool::Dx12(pool) = pool { + self.write_timestamp(pool, *start); + self.end_query = Some((pool.heap.clone(), *end)); + } + } + } + unsafe fn dispatch( &mut self, pipeline: &Pipeline, @@ -628,6 +639,12 @@ impl crate::backend::CmdBuf for CmdBuf { .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2); } + unsafe fn end_compute_pass(&mut self) { + if let Some((heap, end)) = self.end_query.take() { + self.c.end_timing_query(&heap, end); + } + } + unsafe fn memory_barrier(&mut self) { // See comments in CommandBuffer::pipeline_barrier in gfx-hal dx12 backend. 
// The "proper" way to do this would be to name the actual buffers participating @@ -666,7 +683,7 @@ impl crate::backend::CmdBuf for CmdBuf { self.memory_barrier(); } - unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option<u64>) { + unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option<u64>) { let cpu_ref = buffer.cpu_ref.as_ref().unwrap(); let (gpu_ref, heap) = buffer .gpu_ref @@ -684,23 +701,23 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { + unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) { // TODO: consider using copy_resource here (if sizes match) let size = src.size.min(dst.size); self.c.copy_buffer(&dst.resource, 0, &src.resource, 0, size); } - unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { + unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) { self.c .copy_texture_to_buffer(&src.resource, &dst.resource, src.size.0, src.size.1); } - unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { + unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) { self.c .copy_buffer_to_texture(&src.resource, &dst.resource, dst.size.0, dst.size.1); } - unsafe fn blit_image(&self, src: &Image, dst: &Image) { + unsafe fn blit_image(&mut self, src: &Image, dst: &Image) { self.c.copy_resource(&src.resource, &dst.resource); } diff --git a/piet-gpu-hal/src/dx12/wrappers.rs b/piet-gpu-hal/src/dx12/wrappers.rs index 4bbb86c..9a3fb90 100644 --- a/piet-gpu-hal/src/dx12/wrappers.rs +++ b/piet-gpu-hal/src/dx12/wrappers.rs @@ -79,7 +79,6 @@ pub struct Blob(pub ComPtr<d3dcommon::ID3DBlob>); #[derive(Clone)] pub struct ShaderByteCode { pub bytecode: d3d12::D3D12_SHADER_BYTECODE, - blob: Option<Blob>, } #[derive(Clone)] @@ -741,7 +740,6 @@ impl ShaderByteCode { BytecodeLength: blob.0.GetBufferSize(), pShaderBytecode: blob.0.GetBufferPointer(), }, - blob: Some(blob), } } @@ -810,7 +808,6 @@ impl ShaderByteCode { BytecodeLength: bytecode.len(), pShaderBytecode: bytecode.as_ptr() as *const _, }, - blob: None, } } } diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs index cc09832..ea17754 100644 --- a/piet-gpu-hal/src/hub.rs +++ b/piet-gpu-hal/src/hub.rs @@ -13,7 +13,7 @@ use std::sync::{Arc, Mutex, Weak}; use bytemuck::Pod; use smallvec::SmallVec; -use crate::{mux, BackendType, BufWrite, ImageFormat, MapMode}; +use crate::{mux, BackendType, BufWrite, ComputePassDescriptor, ImageFormat, MapMode}; use crate::{BindType, BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams}; @@ -135,6 +135,11 @@ pub struct BufReadGuard<'a> { size: u64, } +/// A sub-object of a command buffer for a sequence of compute dispatches. +pub struct ComputePass<'a> { + cmd_buf: &'a mut CmdBuf, +} + impl Session { /// Create a new session, choosing the best backend. pub fn new(device: mux::Device) -> Session { @@ -370,8 +375,17 @@ impl Session { /// /// This should be called after waiting on the command buffer that wrote the /// timer queries. + /// + /// The returned vector is one shorter than the number of timer queries in the + /// pool; the first value is subtracted off. It would likely be better to return + /// the raw timestamps, but that change should be made consistently. pub unsafe fn fetch_query_pool(&self, pool: &QueryPool) -> Result<Vec<f64>, Error> { - self.0.device.fetch_query_pool(pool) + let result = self.0.device.fetch_query_pool(pool)?; + // Subtract off first timestamp. + Ok(result[1..]
+ .iter() + .map(|ts| *ts as f64 - result[0]) + .collect()) } #[doc(hidden)] @@ -471,23 +485,10 @@ impl CmdBuf { self.cmd_buf().finish(); } - /// Dispatch a compute shader. - /// - /// Request a compute shader to be run, using the pipeline to specify the - /// code, and the descriptor set to address the resources read and written. - /// - /// Both the workgroup count (number of workgroups) and the workgroup size - /// (number of threads in a workgroup) must be specified here, though not - /// all back-ends require the latter info. - pub unsafe fn dispatch( - &mut self, - pipeline: &Pipeline, - descriptor_set: &DescriptorSet, - workgroup_count: (u32, u32, u32), - workgroup_size: (u32, u32, u32), - ) { - self.cmd_buf() - .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size); + /// Begin a compute pass. + pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) -> ComputePass { + self.cmd_buf().begin_compute_pass(desc); + ComputePass { cmd_buf: self } } /// Insert an execution and memory barrier. @@ -582,13 +583,6 @@ impl CmdBuf { self.cmd_buf().reset_query_pool(pool); } - /// Write a timestamp. - /// - /// The query index must be less than the size of the query pool on creation. - pub unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { - self.cmd_buf().write_timestamp(pool, query); - } - /// Prepare the timestamps for reading. This isn't required on Vulkan but /// is required on (at least) DX12. /// @@ -692,6 +686,51 @@ impl Drop for SubmittedCmdBuf { } } +impl<'a> ComputePass<'a> { + /// Dispatch a compute shader. + /// + /// Request a compute shader to be run, using the pipeline to specify the + /// code, and the descriptor set to address the resources read and written. + /// + /// Both the workgroup count (number of workgroups) and the workgroup size + /// (number of threads in a workgroup) must be specified here, though not + /// all back-ends require the latter info. + pub unsafe fn dispatch( + &mut self, + pipeline: &Pipeline, + descriptor_set: &DescriptorSet, + workgroup_count: (u32, u32, u32), + workgroup_size: (u32, u32, u32), + ) { + self.cmd_buf + .cmd_buf() + .dispatch(pipeline, descriptor_set, workgroup_count, workgroup_size); + } + + /// Add a memory barrier. + /// + /// Inserts a memory barrier in the compute encoder. This is a convenience + /// function for calling the same function on the underlying command buffer, + /// avoiding borrow check issues. + pub unsafe fn memory_barrier(&mut self) { + self.cmd_buf.memory_barrier(); + } + + /// Begin a labeled section for debugging and profiling purposes. + pub unsafe fn begin_debug_label(&mut self, label: &str) { + self.cmd_buf.begin_debug_label(label); + } + + /// End a section opened by `begin_debug_label`. 
+ pub unsafe fn end_debug_label(&mut self) { + self.cmd_buf.end_debug_label(); + } + + pub unsafe fn end(self) { + self.cmd_buf.cmd_buf().end_compute_pass(); + } +} + impl Drop for BufferInner { fn drop(&mut self) { if let Some(session) = Weak::upgrade(&self.session) { diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs index fab7d65..a1073f4 100644 --- a/piet-gpu-hal/src/lib.rs +++ b/piet-gpu-hal/src/lib.rs @@ -21,8 +21,8 @@ pub use crate::mux::{ }; pub use bufwrite::BufWrite; pub use hub::{ - BufReadGuard, BufWriteGuard, Buffer, CmdBuf, DescriptorSetBuilder, Image, RetainResource, - Session, SubmittedCmdBuf, + BufReadGuard, BufWriteGuard, Buffer, CmdBuf, ComputePass, DescriptorSetBuilder, Image, + RetainResource, Session, SubmittedCmdBuf, }; // TODO: because these are conditionally included, "cargo fmt" does not @@ -189,3 +189,23 @@ pub struct WorkgroupLimits { /// dimension. pub max_invocations: u32, } + +/// Options for creating a compute pass. +#[derive(Default)] +pub struct ComputePassDescriptor<'a> { + // Maybe label should go here? It does in wgpu and wgpu_hal. + /// Timer query parameters. + /// + /// To record timer queries for a compute pass, set the query pool, start + /// query index, and end query index here. The indices must be less than + /// the size of the query pool. + timer_queries: Option<(&'a QueryPool, u32, u32)>, +} + +impl<'a> ComputePassDescriptor<'a> { + pub fn timer(pool: &'a QueryPool, start_query: u32, end_query: u32) -> ComputePassDescriptor { + ComputePassDescriptor { + timer_queries: Some((pool, start_query, end_query)), + } + } +} diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs index e3157d4..307def8 100644 --- a/piet-gpu-hal/src/metal.rs +++ b/piet-gpu-hal/src/metal.rs @@ -15,25 +15,32 @@ // Also licensed under MIT license, at your choice. mod clear; +mod timer; mod util; use std::mem; use std::sync::{Arc, Mutex}; +use block::Block; use cocoa_foundation::base::id; use cocoa_foundation::foundation::{NSInteger, NSUInteger}; +use foreign_types::ForeignType; use objc::rc::autoreleasepool; use objc::runtime::{Object, BOOL, YES}; use objc::{class, msg_send, sel, sel_impl}; -use metal::{CGFloat, MTLFeatureSet}; +use metal::{CGFloat, CommandBufferRef, MTLFeatureSet}; use raw_window_handle::{HasRawWindowHandle, RawWindowHandle}; -use crate::{BufferUsage, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits}; +use crate::{ + BufferUsage, ComputePassDescriptor, Error, GpuInfo, ImageFormat, MapMode, WorkgroupLimits, +}; use util::*; + +use self::timer::{CounterSampleBuffer, CounterSet, TimeCalibration}; + pub struct MtlInstance; pub struct MtlDevice { @@ -41,6 +48,18 @@ pub struct MtlDevice { cmd_queue: Arc<Mutex<metal::CommandQueue>>, gpu_info: GpuInfo, helpers: Arc<Helpers>, + timer_set: Option<CounterSet>, + counter_style: CounterStyle, +} + +/// Type of counter sampling.
+/// +/// See https://developer.apple.com/documentation/metal/counter_sampling/sampling_gpu_data_into_counter_sample_buffers +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +enum CounterStyle { + None, + Stage, + Command, } pub struct MtlSurface { @@ -81,9 +100,22 @@ pub struct Semaphore; pub struct CmdBuf { cmd_buf: metal::CommandBuffer, helpers: Arc<Helpers>, + cur_encoder: Encoder, + time_calibration: Arc<Mutex<TimeCalibration>>, + counter_style: CounterStyle, } -pub struct QueryPool; +enum Encoder { + None, + Compute(metal::ComputeCommandEncoder, Option<(id, u32)>), + Blit(metal::BlitCommandEncoder), +} + +#[derive(Default)] +pub struct QueryPool { + counter_sample_buf: Option<CounterSampleBuffer>, + calibration: Arc<Mutex<Option<Arc<Mutex<TimeCalibration>>>>>, +} pub struct Pipeline(metal::ComputePipelineState); @@ -209,18 +241,43 @@ impl MtlDevice { let helpers = Arc::new(Helpers { clear_pipeline: clear::make_clear_pipeline(&device), }); + // Timer stuff + let timer_set = CounterSet::get_timer_counter_set(&device); + let counter_style = if timer_set.is_some() { + if device.supports_counter_sampling(metal::MTLCounterSamplingPoint::AtStageBoundary) { + CounterStyle::Stage + } else if device + .supports_counter_sampling(metal::MTLCounterSamplingPoint::AtDispatchBoundary) + { + CounterStyle::Command + } else { + CounterStyle::None + } + } else { + CounterStyle::None + }; MtlDevice { device, cmd_queue: Arc::new(Mutex::new(cmd_queue)), gpu_info, helpers, + timer_set, + counter_style, } } pub fn cmd_buf_from_raw_mtl(&self, raw_cmd_buf: metal::CommandBuffer) -> CmdBuf { let cmd_buf = raw_cmd_buf; let helpers = self.helpers.clone(); - CmdBuf { cmd_buf, helpers } + let cur_encoder = Encoder::None; + let time_calibration = Default::default(); + CmdBuf { + cmd_buf, + helpers, + cur_encoder, + time_calibration, + counter_style: self.counter_style, + } } pub fn image_from_raw_mtl(&self, texture: metal::Texture, width: u32, height: u32) -> Image { @@ -330,11 +387,35 @@ impl crate::backend::Device for MtlDevice { fn create_cmd_buf(&self) -> Result<Self::CmdBuf, Error> { let cmd_queue = self.cmd_queue.lock().unwrap(); + // A discussion about autorelease pools. + // + // Autorelease pools are a sore point in Rust/Objective-C interop. Basically, + // you can have any two of correctness, ergonomics, and performance. Here we've + // chosen the first two, using the pattern of a fine grained autorelease pool + // to give the Obj-C object Rust-like lifetime semantics whenever objects are + // created as autorelease (by convention, this is any object creation with an + // Obj-C method name that doesn't begin with "new" or "alloc"). + // + // To gain back some of the performance, we'd need a way to wrap an autorelease + // pool over a chunk of work - that could be one frame of rendering, but for + // tests that iterate a number of command buffer submissions, it would need to + // be around that. On non-mac platforms, it would be a no-op. + // + // In any case, this way, the caller doesn't need to worry, and the performance + // hit might not be so bad (perhaps we should measure).
+ // consider new_command_buffer_with_unretained_references for performance - let cmd_buf = cmd_queue.new_command_buffer(); - let cmd_buf = autoreleasepool(|| cmd_buf.to_owned()); + let cmd_buf = autoreleasepool(|| cmd_queue.new_command_buffer().to_owned()); let helpers = self.helpers.clone(); - Ok(CmdBuf { cmd_buf, helpers }) + let cur_encoder = Encoder::None; + let time_calibration = Default::default(); + Ok(CmdBuf { + cmd_buf, + helpers, + cur_encoder, + time_calibration, + counter_style: self.counter_style, + }) } unsafe fn destroy_cmd_buf(&self, _cmd_buf: Self::CmdBuf) -> Result<(), Error> { @@ -342,12 +423,31 @@ impl crate::backend::Device for MtlDevice { } fn create_query_pool(&self, n_queries: u32) -> Result { - // TODO - Ok(QueryPool) + if let Some(timer_set) = &self.timer_set { + let pool = CounterSampleBuffer::new(&self.device, n_queries as u64, timer_set) + .ok_or("error creating timer query pool")?; + return Ok(QueryPool { + counter_sample_buf: Some(pool), + calibration: Default::default(), + }); + } + Ok(QueryPool::default()) } unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result, Error> { - // TODO + if let Some(raw) = &pool.counter_sample_buf { + let resolved = raw.resolve(); + let calibration = pool.calibration.lock().unwrap(); + if let Some(calibration) = &*calibration { + let calibration = calibration.lock().unwrap(); + let result = resolved + .iter() + .map(|time_ns| calibration.correlate(*time_ns)) + .collect(); + return Ok(result); + } + } + // Maybe should return None indicating it wasn't successful? But that might break. Ok(Vec::new()) } @@ -358,7 +458,37 @@ impl crate::backend::Device for MtlDevice { _signal_semaphores: &[&Self::Semaphore], fence: Option<&mut Self::Fence>, ) -> Result<(), Error> { + unsafe fn add_scheduled_handler( + cmd_buf: &metal::CommandBufferRef, + block: &Block<(&CommandBufferRef,), ()>, + ) { + msg_send![cmd_buf, addScheduledHandler: block] + } for cmd_buf in cmd_bufs { + let time_calibration = cmd_buf.time_calibration.clone(); + let start_block = block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| { + let device: id = msg_send![buffer, device]; + let mut time_calibration = time_calibration.lock().unwrap(); + let cpu_ts_ptr = &mut time_calibration.cpu_start_ts as *mut _; + let gpu_ts_ptr = &mut time_calibration.gpu_start_ts as *mut _; + // TODO: only do this if supported. + let () = msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr]; + }) + .copy(); + add_scheduled_handler(&cmd_buf.cmd_buf, &start_block); + let time_calibration = cmd_buf.time_calibration.clone(); + let completed_block = + block::ConcreteBlock::new(move |buffer: &metal::CommandBufferRef| { + let device: id = msg_send![buffer, device]; + let mut time_calibration = time_calibration.lock().unwrap(); + let cpu_ts_ptr = &mut time_calibration.cpu_end_ts as *mut _; + let gpu_ts_ptr = &mut time_calibration.gpu_end_ts as *mut _; + // TODO: only do this if supported. 
+ let () = + msg_send![device, sampleTimestamps: cpu_ts_ptr gpuTimestamp: gpu_ts_ptr]; + }) + .copy(); + cmd_buf.cmd_buf.add_completed_handler(&completed_block); cmd_buf.cmd_buf.commit(); } if let Some(last_cmd_buf) = cmd_bufs.last() { @@ -439,12 +569,70 @@ impl crate::backend::Device for MtlDevice { impl crate::backend::CmdBuf for CmdBuf { unsafe fn begin(&mut self) {} - unsafe fn finish(&mut self) {} + unsafe fn finish(&mut self) { + self.flush_encoder(); + } unsafe fn reset(&mut self) -> bool { false } + unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) { + // TODO: we might want to get better about validation but the following + // assert is likely to trigger, and also a case can be made that + // validation should be done at the hub level, for consistency. + //debug_assert!(matches!(self.cur_encoder, Encoder::None)); + self.flush_encoder(); + autoreleasepool(|| { + let (encoder, end_query) = match (&desc.timer_queries, self.counter_style) { + (Some(queries), CounterStyle::Stage) => { + let descriptor: id = + msg_send![class!(MTLComputePassDescriptor), computePassDescriptor]; + let attachments: id = msg_send![descriptor, sampleBufferAttachments]; + let index: NSUInteger = 0; + let attachment: id = msg_send![attachments, objectAtIndexedSubscript: index]; + // Here we break the hub/mux separation a bit, for expedience + #[allow(irrefutable_let_patterns)] + if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 { + if let Some(sample_buf) = &query_pool.counter_sample_buf { + let () = msg_send![attachment, setSampleBuffer: sample_buf.id()]; + } + } + let start_index = queries.1 as NSUInteger; + let end_index = queries.2 as NSInteger; + let () = msg_send![attachment, setStartOfEncoderSampleIndex: start_index]; + let () = msg_send![attachment, setEndOfEncoderSampleIndex: end_index]; + ( + msg_send![ + self.cmd_buf, + computeCommandEncoderWithDescriptor: descriptor + ], + None, + ) + } + (Some(queries), CounterStyle::Command) => { + let encoder = self.cmd_buf.new_compute_command_encoder(); + #[allow(irrefutable_let_patterns)] + let end_query = if let crate::hub::QueryPool::Mtl(query_pool) = queries.0 { + if let Some(sample_buf) = &query_pool.counter_sample_buf { + let sample_index = queries.1 as NSUInteger; + let sample_buf = sample_buf.id(); + let () = msg_send![encoder, sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true]; + Some((sample_buf, queries.2)) + } else { + None + } + } else { + None + }; + (encoder, end_query) + } + _ => (self.cmd_buf.new_compute_command_encoder(), None), + }; + self.cur_encoder = Encoder::Compute(encoder.to_owned(), end_query); + }); + } + unsafe fn dispatch( &mut self, pipeline: &Pipeline, @@ -452,7 +640,7 @@ impl crate::backend::CmdBuf for CmdBuf { workgroup_count: (u32, u32, u32), workgroup_size: (u32, u32, u32), ) { - let encoder = self.cmd_buf.new_compute_command_encoder(); + let encoder = self.compute_command_encoder(); encoder.set_compute_pipeline_state(&pipeline.0); let mut buf_ix = 0; for buffer in &descriptor_set.buffers { @@ -475,7 +663,11 @@ impl crate::backend::CmdBuf for CmdBuf { depth: workgroup_size.2 as u64, }; encoder.dispatch_thread_groups(workgroup_count, workgroup_size); - encoder.end_encoding(); + } + + unsafe fn end_compute_pass(&mut self) { + // TODO: might validate that we are in a compute encoder state + self.flush_encoder(); } unsafe fn memory_barrier(&mut self) { @@ -494,22 +686,23 @@ impl crate::backend::CmdBuf for CmdBuf { // I think these are being tracked. 
} - unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option) { + unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option) { let size = size.unwrap_or(buffer.size); - let encoder = self.cmd_buf.new_compute_command_encoder(); - clear::encode_clear(&encoder, &self.helpers.clear_pipeline, &buffer.buffer, size); - encoder.end_encoding() + let _ = self.compute_command_encoder(); + // Getting this directly is a workaround for a borrow checker issue. + if let Encoder::Compute(e, _) = &self.cur_encoder { + clear::encode_clear(e, &self.helpers.clear_pipeline, &buffer.buffer, size); + } } - unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { - let encoder = self.cmd_buf.new_blit_command_encoder(); + unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) { + let encoder = self.blit_command_encoder(); let size = src.size.min(dst.size); encoder.copy_from_buffer(&src.buffer, 0, &dst.buffer, 0, size); - encoder.end_encoding(); } - unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { - let encoder = self.cmd_buf.new_blit_command_encoder(); + unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) { + let encoder = self.blit_command_encoder(); assert_eq!(dst.size, (src.width as u64) * (src.height as u64) * 4); let bytes_per_row = (src.width * 4) as NSUInteger; let src_size = metal::MTLSize { @@ -530,11 +723,10 @@ impl crate::backend::CmdBuf for CmdBuf { bytes_per_row * src.height as NSUInteger, metal::MTLBlitOption::empty(), ); - encoder.end_encoding(); } - unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { - let encoder = self.cmd_buf.new_blit_command_encoder(); + unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) { + let encoder = self.blit_command_encoder(); assert_eq!(src.size, (dst.width as u64) * (dst.height as u64) * 4); let bytes_per_row = (dst.width * 4) as NSUInteger; let src_size = metal::MTLSize { @@ -555,11 +747,10 @@ impl crate::backend::CmdBuf for CmdBuf { origin, metal::MTLBlitOption::empty(), ); - encoder.end_encoding(); } - unsafe fn blit_image(&self, src: &Image, dst: &Image) { - let encoder = self.cmd_buf.new_blit_command_encoder(); + unsafe fn blit_image(&mut self, src: &Image, dst: &Image) { + let encoder = self.blit_command_encoder(); let src_size = metal::MTLSize { width: src.width.min(dst.width) as NSUInteger, height: src.width.min(dst.height) as NSUInteger, @@ -577,15 +768,79 @@ impl crate::backend::CmdBuf for CmdBuf { 0, origin, ); - encoder.end_encoding(); } - unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {} + unsafe fn reset_query_pool(&mut self, pool: &QueryPool) { + let mut calibration = pool.calibration.lock().unwrap(); + *calibration = Some(self.time_calibration.clone()); + } unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { - // TODO - // This really a PITA because it's pretty different than Vulkan. 
- // See https://developer.apple.com/documentation/metal/counter_sampling + if let Some(buf) = &pool.counter_sample_buf { + if matches!(self.cur_encoder, Encoder::None) { + self.cur_encoder = + Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None); + } + let sample_index = query as NSUInteger; + if self.counter_style == CounterStyle::Command { + match &self.cur_encoder { + Encoder::Compute(e, _) => { + let () = msg_send![e.as_ptr(), sampleCountersInBuffer: buf.id() atSampleIndex: sample_index withBarrier: true]; + } + Encoder::None => unreachable!(), + _ => todo!(), + } + } else if self.counter_style == CounterStyle::Stage { + match &self.cur_encoder { + Encoder::Compute(_e, _) => { + println!("write_timestamp is not supported for stage-style encoders"); + } + _ => (), + } + } + } + } +} + +impl CmdBuf { + fn compute_command_encoder(&mut self) -> &metal::ComputeCommandEncoder { + if !matches!(self.cur_encoder, Encoder::Compute(..)) { + self.flush_encoder(); + self.cur_encoder = + Encoder::Compute(self.cmd_buf.new_compute_command_encoder().to_owned(), None); + } + if let Encoder::Compute(e, _) = &self.cur_encoder { + e + } else { + unreachable!() + } + } + + fn blit_command_encoder(&mut self) -> &metal::BlitCommandEncoder { + if !matches!(self.cur_encoder, Encoder::Blit(_)) { + self.flush_encoder(); + self.cur_encoder = Encoder::Blit(self.cmd_buf.new_blit_command_encoder().to_owned()); + } + if let Encoder::Blit(e) = &self.cur_encoder { + e + } else { + unreachable!() + } + } + + fn flush_encoder(&mut self) { + match std::mem::replace(&mut self.cur_encoder, Encoder::None) { + Encoder::Compute(e, Some((sample_buf, end_query))) => { + let sample_index = end_query as NSUInteger; + unsafe { + let () = msg_send![e.as_ptr(), sampleCountersInBuffer: sample_buf atSampleIndex: sample_index withBarrier: true]; + } + e.end_encoding(); + } + Encoder::Compute(e, None) => e.end_encoding(), + Encoder::Blit(e) => e.end_encoding(), + Encoder::None => (), + } } } diff --git a/piet-gpu-hal/src/metal/timer.rs b/piet-gpu-hal/src/metal/timer.rs new file mode 100644 index 0000000..65c8026 --- /dev/null +++ b/piet-gpu-hal/src/metal/timer.rs @@ -0,0 +1,172 @@ +// Copyright 2021 The piet-gpu authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +//! Support for timer queries. +//! +//! Likely some of this should be upstreamed into metal-rs. 
+ +use std::{ffi::CStr, ptr::null_mut}; + +use cocoa_foundation::{ + base::id, + foundation::{NSRange, NSUInteger}, +}; +use metal::{DeviceRef, MTLStorageMode}; +use objc::{class, msg_send, sel, sel_impl}; + +pub struct CounterSampleBuffer { + id: id, + count: u64, +} + +pub struct CounterSet { + id: id, +} + +#[derive(Default)] +pub struct TimeCalibration { + pub cpu_start_ts: u64, + pub gpu_start_ts: u64, + pub cpu_end_ts: u64, + pub gpu_end_ts: u64, +} + +impl Drop for CounterSampleBuffer { + fn drop(&mut self) { + unsafe { msg_send![self.id, release] } + } +} + +impl Clone for CounterSampleBuffer { + fn clone(&self) -> CounterSampleBuffer { + unsafe { + CounterSampleBuffer { + id: msg_send![self.id, retain], + count: self.count, + } + } + } +} + +impl CounterSampleBuffer { + pub fn id(&self) -> id { + self.id + } +} + +impl Drop for CounterSet { + fn drop(&mut self) { + unsafe { msg_send![self.id, release] } + } +} + +impl CounterSet { + pub fn get_timer_counter_set(device: &DeviceRef) -> Option { + unsafe { + // TODO: version check + let sets: id = msg_send!(device, counterSets); + let count: NSUInteger = msg_send![sets, count]; + for i in 0..count { + let set: id = msg_send![sets, objectAtIndex: i]; + let name: id = msg_send![set, name]; + let name_cstr = CStr::from_ptr(msg_send![name, UTF8String]); + if name_cstr.to_bytes() == b"timestamp" { + return Some(CounterSet { id: set }); + } + } + None + } + } +} + +// copied from metal-rs; should be in common utilities maybe? +fn nsstring_as_str(nsstr: &objc::runtime::Object) -> &str { + let bytes = unsafe { + let bytes: *const std::os::raw::c_char = msg_send![nsstr, UTF8String]; + bytes as *const u8 + }; + let len: NSUInteger = unsafe { msg_send![nsstr, length] }; + unsafe { + let bytes = std::slice::from_raw_parts(bytes, len as usize); + std::str::from_utf8(bytes).unwrap() + } +} + +impl CounterSampleBuffer { + pub fn new( + device: &DeviceRef, + count: u64, + counter_set: &CounterSet, + ) -> Option { + unsafe { + let desc_cls = class!(MTLCounterSampleBufferDescriptor); + let descriptor: id = msg_send![desc_cls, alloc]; + let _: id = msg_send![descriptor, init]; + let count = count as NSUInteger; + let () = msg_send![descriptor, setSampleCount: count]; + let () = msg_send![descriptor, setCounterSet: counter_set.id]; + let () = msg_send![ + descriptor, + setStorageMode: MTLStorageMode::Shared as NSUInteger + ]; + let mut error: id = null_mut(); + let buf: id = msg_send![device, newCounterSampleBufferWithDescriptor: descriptor error: &mut error]; + let () = msg_send![descriptor, release]; + if !error.is_null() { + let description = msg_send![error, localizedDescription]; + println!( + "error allocating sample buffer, code = {}", + nsstring_as_str(description) + ); + let () = msg_send![error, release]; + return None; + } + Some(CounterSampleBuffer { id: buf, count }) + } + } + + // Read the timestamps. + // + // Safety: the lifetime of the returned slice is wrong, it's actually autoreleased. + pub unsafe fn resolve(&self) -> &[u64] { + let range = NSRange::new(0, self.count); + let data: id = msg_send![self.id, resolveCounterRange: range]; + if data.is_null() { + &[] + } else { + let bytes: *const u64 = msg_send![data, bytes]; + std::slice::from_raw_parts(bytes, self.count as usize) + } + } +} + +impl TimeCalibration { + /// Convert GPU timestamp into CPU time base. 
+ /// + /// See https://developer.apple.com/documentation/metal/performance_tuning/correlating_cpu_and_gpu_timestamps + pub fn correlate(&self, raw_ts: u64) -> f64 { + let delta_cpu = self.cpu_end_ts - self.cpu_start_ts; + let delta_gpu = self.gpu_end_ts - self.gpu_start_ts; + let adj_ts = if delta_gpu > 0 { + let scale = delta_cpu as f64 / delta_gpu as f64; + self.cpu_start_ts as f64 + (raw_ts as f64 - self.gpu_start_ts as f64) * scale + } else { + // Default is ns on Apple Silicon; on other hardware this will be wrong + raw_ts as f64 + }; + adj_ts * 1e-9 + } +} diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs index af1702d..9795193 100644 --- a/piet-gpu-hal/src/mux.rs +++ b/piet-gpu-hal/src/mux.rs @@ -35,6 +35,7 @@ use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait; use crate::backend::Device as DeviceTrait; use crate::BackendType; use crate::BindType; +use crate::ComputePassDescriptor; use crate::ImageFormat; use crate::MapMode; use crate::{BufferUsage, Error, GpuInfo, ImageLayout, InstanceFlags}; @@ -658,6 +659,14 @@ impl CmdBuf { } } + pub unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) { + mux_match! { self; + CmdBuf::Vk(c) => c.begin_compute_pass(desc), + CmdBuf::Dx12(c) => c.begin_compute_pass(desc), + CmdBuf::Mtl(c) => c.begin_compute_pass(desc), + } + } + /// Dispatch a compute shader. /// /// Note that both the number of workgroups (`workgroup_count`) and the number of @@ -680,6 +689,14 @@ impl CmdBuf { } } + pub unsafe fn end_compute_pass(&mut self) { + mux_match! { self; + CmdBuf::Vk(c) => c.end_compute_pass(), + CmdBuf::Dx12(c) => c.end_compute_pass(), + CmdBuf::Mtl(c) => c.end_compute_pass(), + } + } + pub unsafe fn memory_barrier(&mut self) { mux_match! { self; CmdBuf::Vk(c) => c.memory_barrier(), diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs index 8392899..504d947 100644 --- a/piet-gpu-hal/src/vulkan.rs +++ b/piet-gpu-hal/src/vulkan.rs @@ -15,7 +15,7 @@ use smallvec::SmallVec; use crate::backend::Device as DeviceTrait; use crate::{ BindType, BufferUsage, Error, GpuInfo, ImageFormat, ImageLayout, MapMode, SamplerParams, SubgroupSize, - WorkgroupLimits, + WorkgroupLimits, ComputePassDescriptor, }; pub struct VkInstance { @@ -92,6 +92,7 @@ pub struct CmdBuf { cmd_buf: vk::CommandBuffer, cmd_pool: vk::CommandPool, device: Arc, + end_query: Option<(vk::QueryPool, u32)>, } pub struct QueryPool { @@ -738,6 +739,7 @@ impl crate::backend::Device for VkDevice { cmd_buf, cmd_pool, device: self.device.clone(), + end_query: None, }) } } @@ -770,11 +772,10 @@ impl crate::backend::Device for VkDevice { // results (Windows 10, AMD 5700 XT). let flags = vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT; device.get_query_pool_results(pool.pool, 0, pool.n_queries, &mut buf, flags)?; - let ts0 = buf[0]; let tsp = self.timestamp_period as f64 * 1e-9; - let result = buf[1..] 
+ let result = buf .iter() - .map(|ts| ts.wrapping_sub(ts0) as f64 * tsp) + .map(|ts| *ts as f64 * tsp) .collect(); Ok(result) } @@ -902,6 +903,16 @@ impl crate::backend::CmdBuf for CmdBuf { true } + unsafe fn begin_compute_pass(&mut self, desc: &ComputePassDescriptor) { + if let Some((pool, start, end)) = &desc.timer_queries { + #[allow(irrefutable_let_patterns)] + if let crate::hub::QueryPool::Vk(pool) = pool { + self.write_timestamp_raw(pool.pool, *start); + self.end_query = Some((pool.pool, *end)); + } + } + } + unsafe fn dispatch( &mut self, pipeline: &Pipeline, @@ -931,6 +942,12 @@ impl crate::backend::CmdBuf for CmdBuf { ); } + unsafe fn end_compute_pass(&mut self) { + if let Some((pool, end)) = self.end_query.take() { + self.write_timestamp_raw(pool, end); + } + } + /// Insert a pipeline barrier for all memory accesses. unsafe fn memory_barrier(&mut self) { let device = &self.device.device; @@ -995,13 +1012,13 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn clear_buffer(&self, buffer: &Buffer, size: Option) { + unsafe fn clear_buffer(&mut self, buffer: &Buffer, size: Option) { let device = &self.device.device; let size = size.unwrap_or(vk::WHOLE_SIZE); device.cmd_fill_buffer(self.cmd_buf, buffer.buffer, 0, size, 0); } - unsafe fn copy_buffer(&self, src: &Buffer, dst: &Buffer) { + unsafe fn copy_buffer(&mut self, src: &Buffer, dst: &Buffer) { let device = &self.device.device; let size = src.size.min(dst.size); device.cmd_copy_buffer( @@ -1012,7 +1029,7 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn copy_image_to_buffer(&self, src: &Image, dst: &Buffer) { + unsafe fn copy_image_to_buffer(&mut self, src: &Image, dst: &Buffer) { let device = &self.device.device; device.cmd_copy_image_to_buffer( self.cmd_buf, @@ -1035,7 +1052,7 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn copy_buffer_to_image(&self, src: &Buffer, dst: &Image) { + unsafe fn copy_buffer_to_image(&mut self, src: &Buffer, dst: &Image) { let device = &self.device.device; device.cmd_copy_buffer_to_image( self.cmd_buf, @@ -1058,7 +1075,7 @@ impl crate::backend::CmdBuf for CmdBuf { ); } - unsafe fn blit_image(&self, src: &Image, dst: &Image) { + unsafe fn blit_image(&mut self, src: &Image, dst: &Image) { let device = &self.device.device; device.cmd_blit_image( self.cmd_buf, @@ -1106,13 +1123,7 @@ impl crate::backend::CmdBuf for CmdBuf { } unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { - let device = &self.device.device; - device.cmd_write_timestamp( - self.cmd_buf, - vk::PipelineStageFlags::COMPUTE_SHADER, - pool.pool, - query, - ); + self.write_timestamp_raw(pool.pool, query); } unsafe fn begin_debug_label(&mut self, label: &str) { @@ -1130,6 +1141,18 @@ impl crate::backend::CmdBuf for CmdBuf { } } +impl CmdBuf { + unsafe fn write_timestamp_raw(&mut self, pool: vk::QueryPool, query: u32) { + let device = &self.device.device; + device.cmd_write_timestamp( + self.cmd_buf, + vk::PipelineStageFlags::COMPUTE_SHADER, + pool, + query, + ); + } +} + impl crate::backend::DescriptorSetBuilder for DescriptorSetBuilder { fn add_buffers(&mut self, buffers: &[&Buffer]) { self.buffers.extend(buffers.iter().map(|b| b.buffer)); diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index 70023af..abe6ae1 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -6,7 +6,7 @@ use clap::{App, Arg}; use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session}; -use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer}; +use piet_gpu::{test_scenes, PicoSvg, 
PietGpuRenderContext, Renderer}; const WIDTH: usize = 2048; const HEIGHT: usize = 1536; @@ -243,7 +243,11 @@ fn main() -> Result<(), Error> { if matches.is_present("flip") { scale = -scale; } - test_scenes::render_svg(&mut ctx, input, scale); + let xml_str = std::fs::read_to_string(input).unwrap(); + let start = std::time::Instant::now(); + let svg = PicoSvg::load(&xml_str, scale).unwrap(); + println!("parsing time: {:?}", start.elapsed()); + test_scenes::render_svg(&mut ctx, &svg); } else { test_scenes::render_scene(&mut ctx); } diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs index 3ca0742..1642026 100644 --- a/piet-gpu/bin/winit.rs +++ b/piet-gpu/bin/winit.rs @@ -2,7 +2,7 @@ use piet::kurbo::Point; use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder}; use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf}; -use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer}; +use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer}; use clap::{App, Arg}; @@ -29,6 +29,25 @@ fn main() -> Result<(), Error> { ) .get_matches(); + // Collect SVG if input + let svg = match matches.value_of("INPUT") { + Some(file) => { + let mut scale = matches + .value_of("scale") + .map(|scale| scale.parse().unwrap()) + .unwrap_or(8.0); + if matches.is_present("flip") { + scale = -scale; + } + let xml_str = std::fs::read_to_string(file).unwrap(); + let start = std::time::Instant::now(); + let svg = PicoSvg::load(&xml_str, scale).unwrap(); + println!("parsing time: {:?}", start.elapsed()); + Some(svg) + } + None => None, + }; + let event_loop = EventLoop::new(); let window = WindowBuilder::new() .with_inner_size(winit::dpi::LogicalSize { @@ -51,7 +70,7 @@ fn main() -> Result<(), Error> { .map(|_| session.create_semaphore()) .collect::<Result<Vec<_>, Error>>()?; let query_pools = (0..NUM_FRAMES) - .map(|_| session.create_query_pool(8)) + .map(|_| session.create_query_pool(12)) .collect::<Result<Vec<_>, Error>>()?; let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default(); let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default(); @@ -93,29 +112,23 @@ fn main() -> Result<(), Error> { if !ts.is_empty() { info_string = format!( "{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms", - ts[6] * 1e3, + ts[10] * 1e3, ts[0] * 1e3, (ts[1] - ts[0]) * 1e3, (ts[2] - ts[1]) * 1e3, - (ts[3] - ts[2]) * 1e3, (ts[4] - ts[3]) * 1e3, - (ts[5] - ts[4]) * 1e3, (ts[6] - ts[5]) * 1e3, + (ts[8] - ts[7]) * 1e3, + (ts[10] - ts[9]) * 1e3, ); } } let mut ctx = PietGpuRenderContext::new(); - if let Some(input) = matches.value_of("INPUT") { - let mut scale = matches - .value_of("scale") - .map(|scale| scale.parse().unwrap()) - .unwrap_or(8.0); - if matches.is_present("flip") { - scale = -scale; - } - test_scenes::render_svg(&mut ctx, input, scale); - } else { + let test_blend = false; + if let Some(svg) = &svg { + test_scenes::render_svg(&mut ctx, svg); + } else if test_blend { use piet_gpu::{Blend, BlendMode::*, CompositionMode::*}; let blends = [ Blend::new(Normal, SrcOver), @@ -151,6 +164,8 @@ fn main() -> Result<(), Error> { let blend = blends[mode % blends.len()]; test_scenes::render_blend_test(&mut ctx, current_frame, blend); info_string = format!("{:?}", blend); + } else { + test_scenes::render_anim_frame(&mut ctx, current_frame); } render_info_string(&mut ctx, &info_string); if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) { diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index adbedfd..3abb2e0 100644 ---
a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -306,7 +306,7 @@ void main() { is_blend = (blend != BlendComp_default); } include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip - || (is_clip && is_blend); + || is_blend; } if (include_tile) { uint el_slice = el_ix / 32; diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil index 4a4f073..3050aa8 100644 Binary files a/piet-gpu/shader/gen/binning.dxil and b/piet-gpu/shader/gen/binning.dxil differ diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil index 879b7c8..fdab444 100644 Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl index 1e610ec..04529bb 100644 --- a/piet-gpu/shader/gen/coarse.hlsl +++ b/piet-gpu/shader/gen/coarse.hlsl @@ -931,23 +931,14 @@ void comp_main() { _1701 = _1692; } - bool _1708; - if (!_1701) - { - _1708 = is_clip && is_blend; - } - else - { - _1708 = _1701; - } - include_tile = _1708; + include_tile = _1701 || is_blend; } if (include_tile) { uint el_slice = el_ix / 32u; uint el_mask = 1u << (el_ix & 31u); - uint _1728; - InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1728); + uint _1723; + InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723); } } GroupMemoryBarrierWithGroupSync(); @@ -976,9 +967,9 @@ void comp_main() { uint param_25 = element_ref_ix; bool param_26 = mem_ok; - TileRef _1805 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; Alloc param_27 = read_tile_alloc(param_25, param_26); - TileRef param_28 = _1805; + TileRef param_28 = _1800; Tile tile_1 = Tile_read(param_27, param_28); uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8); @@ -993,11 +984,11 @@ void comp_main() Alloc param_29 = cmd_alloc; CmdRef param_30 = cmd_ref; uint param_31 = cmd_limit; - bool _1853 = alloc_cmd(param_29, param_30, param_31); + bool _1848 = alloc_cmd(param_29, param_30, param_31); cmd_alloc = param_29; cmd_ref = param_30; cmd_limit = param_31; - if (!_1853) + if (!_1848) { break; } @@ -1008,10 +999,10 @@ void comp_main() write_fill(param_32, param_33, param_34, param_35); cmd_ref = param_33; uint rgba = _1372.Load(dd_1 * 4 + 0); - CmdColor _1876 = { rgba }; + CmdColor _1871 = { rgba }; Alloc param_36 = cmd_alloc; CmdRef param_37 = cmd_ref; - CmdColor param_38 = _1876; + CmdColor param_38 = _1871; Cmd_Color_write(param_36, param_37, param_38); cmd_ref.offset += 8u; break; @@ -1021,11 +1012,11 @@ void comp_main() Alloc param_39 = cmd_alloc; CmdRef param_40 = cmd_ref; uint param_41 = cmd_limit; - bool _1894 = alloc_cmd(param_39, param_40, param_41); + bool _1889 = alloc_cmd(param_39, param_40, param_41); cmd_alloc = param_39; cmd_ref = param_40; cmd_limit = param_41; - if (!_1894) + if (!_1889) { break; } @@ -1052,11 +1043,11 @@ void comp_main() Alloc param_49 = cmd_alloc; CmdRef param_50 = cmd_ref; uint param_51 = cmd_limit; - bool _1958 = alloc_cmd(param_49, param_50, param_51); + bool _1953 = alloc_cmd(param_49, param_50, param_51); cmd_alloc = param_49; cmd_ref = param_50; cmd_limit = param_51; - if (!_1958) + if (!_1953) { break; } @@ -1086,11 +1077,11 @@ void comp_main() Alloc param_59 = cmd_alloc; CmdRef param_60 = cmd_ref; uint param_61 = cmd_limit; - 
bool _2064 = alloc_cmd(param_59, param_60, param_61); + bool _2059 = alloc_cmd(param_59, param_60, param_61); cmd_alloc = param_59; cmd_ref = param_60; cmd_limit = param_61; - if (!_2064) + if (!_2059) { break; } @@ -1103,27 +1094,27 @@ void comp_main() uint index = _1372.Load(dd_1 * 4 + 0); uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0); int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); - CmdImage _2103 = { index, offset_1 }; + CmdImage _2098 = { index, offset_1 }; Alloc param_66 = cmd_alloc; CmdRef param_67 = cmd_ref; - CmdImage param_68 = _2103; + CmdImage param_68 = _2098; Cmd_Image_write(param_66, param_67, param_68); cmd_ref.offset += 12u; break; } case 5u: { - bool _2117 = tile_1.tile.offset == 0u; - bool _2123; - if (_2117) + bool _2112 = tile_1.tile.offset == 0u; + bool _2118; + if (_2112) { - _2123 = tile_1.backdrop == 0; + _2118 = tile_1.backdrop == 0; } else { - _2123 = _2117; + _2118 = _2112; } - if (_2123) + if (_2118) { clip_zero_depth = clip_depth + 1u; } @@ -1132,11 +1123,11 @@ void comp_main() Alloc param_69 = cmd_alloc; CmdRef param_70 = cmd_ref; uint param_71 = cmd_limit; - bool _2135 = alloc_cmd(param_69, param_70, param_71); + bool _2130 = alloc_cmd(param_69, param_70, param_71); cmd_alloc = param_69; cmd_ref = param_70; cmd_limit = param_71; - if (!_2135) + if (!_2130) { break; } @@ -1154,11 +1145,11 @@ void comp_main() Alloc param_74 = cmd_alloc; CmdRef param_75 = cmd_ref; uint param_76 = cmd_limit; - bool _2163 = alloc_cmd(param_74, param_75, param_76); + bool _2158 = alloc_cmd(param_74, param_75, param_76); cmd_alloc = param_74; cmd_ref = param_75; cmd_limit = param_76; - if (!_2163) + if (!_2158) { break; } @@ -1169,10 +1160,10 @@ void comp_main() write_fill(param_77, param_78, param_79, param_80); cmd_ref = param_78; uint blend_1 = _1372.Load(dd_1 * 4 + 0); - CmdEndClip _2186 = { blend_1 }; + CmdEndClip _2181 = { blend_1 }; Alloc param_81 = cmd_alloc; CmdRef param_82 = cmd_ref; - CmdEndClip param_83 = _2186; + CmdEndClip param_83 = _2181; Cmd_EndClip_write(param_81, param_82, param_83); cmd_ref.offset += 8u; break; @@ -1207,17 +1198,17 @@ void comp_main() break; } } - bool _2233 = (bin_tile_x + tile_x) < _1005.Load(8); - bool _2242; - if (_2233) + bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8); + bool _2237; + if (_2228) { - _2242 = (bin_tile_y + tile_y) < _1005.Load(12); + _2237 = (bin_tile_y + tile_y) < _1005.Load(12); } else { - _2242 = _2233; + _2237 = _2228; } - if (_2242) + if (_2237) { Alloc param_84 = cmd_alloc; CmdRef param_85 = cmd_ref; diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl index abd636b..55812d4 100644 --- a/piet-gpu/shader/gen/coarse.msl +++ b/piet-gpu/shader/gen/coarse.msl @@ -954,22 +954,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { _1701 = _1692; } - bool _1708; - if (!_1701) - { - _1708 = is_clip && is_blend; - } - else - { - _1708 = _1701; - } - include_tile = _1708; + include_tile = _1701 || is_blend; } if (include_tile) { uint el_slice = el_ix / 32u; uint el_mask = 1u << (el_ix & 31u); - uint _1728 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); + uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); } } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -1014,11 +1005,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_29 = cmd_alloc; CmdRef 
param_30 = cmd_ref; uint param_31 = cmd_limit; - bool _1853 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize); + bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize); cmd_alloc = param_29; cmd_ref = param_30; cmd_limit = param_31; - if (!_1853) + if (!_1848) { break; } @@ -1041,11 +1032,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_39 = cmd_alloc; CmdRef param_40 = cmd_ref; uint param_41 = cmd_limit; - bool _1894 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize); + bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize); cmd_alloc = param_39; cmd_ref = param_40; cmd_limit = param_41; - if (!_1894) + if (!_1889) { break; } @@ -1072,11 +1063,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_49 = cmd_alloc; CmdRef param_50 = cmd_ref; uint param_51 = cmd_limit; - bool _1958 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize); + bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize); cmd_alloc = param_49; cmd_ref = param_50; cmd_limit = param_51; - if (!_1958) + if (!_1953) { break; } @@ -1106,11 +1097,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_59 = cmd_alloc; CmdRef param_60 = cmd_ref; uint param_61 = cmd_limit; - bool _2064 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize); + bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize); cmd_alloc = param_59; cmd_ref = param_60; cmd_limit = param_61; - if (!_2064) + if (!_2059) { break; } @@ -1132,17 +1123,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } case 5u: { - bool _2117 = tile_1.tile.offset == 0u; - bool _2123; - if (_2117) + bool _2112 = tile_1.tile.offset == 0u; + bool _2118; + if (_2112) { - _2123 = tile_1.backdrop == 0; + _2118 = tile_1.backdrop == 0; } else { - _2123 = _2117; + _2118 = _2112; } - if (_2123) + if (_2118) { clip_zero_depth = clip_depth + 1u; } @@ -1151,11 +1142,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_69 = cmd_alloc; CmdRef param_70 = cmd_ref; uint param_71 = cmd_limit; - bool _2135 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize); + bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize); cmd_alloc = param_69; cmd_ref = param_70; cmd_limit = param_71; - if (!_2135) + if (!_2130) { break; } @@ -1173,11 +1164,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_74 = cmd_alloc; CmdRef param_75 = cmd_ref; uint param_76 = cmd_limit; - bool _2163 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize); + bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize); cmd_alloc = param_74; cmd_ref = param_75; cmd_limit = param_76; - if (!_2163) + if (!_2158) { break; } @@ -1225,17 +1216,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M break; } } - bool _2233 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles; - bool _2242; - if (_2233) + bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles; + bool _2237; + if (_2228) { - _2242 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles; + _2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles; } else { - _2242 = _2233; + _2237 = _2228; } - if (_2242) + if (_2237) { Alloc param_84 = cmd_alloc; CmdRef param_85 = cmd_ref; diff 
--git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv index fdc10a0..6d33ee7 100644 Binary files a/piet-gpu/shader/gen/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil index 6353f19..200f169 100644 Binary files a/piet-gpu/shader/gen/draw_leaf.dxil and b/piet-gpu/shader/gen/draw_leaf.dxil differ diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil index c101fc8..be69aad 100644 Binary files a/piet-gpu/shader/gen/draw_reduce.dxil and b/piet-gpu/shader/gen/draw_reduce.dxil differ diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil index 873fa29..4ea23f7 100644 Binary files a/piet-gpu/shader/gen/draw_root.dxil and b/piet-gpu/shader/gen/draw_root.dxil differ diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil index 9f8080b..e6eccc1 100644 Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil index a594d50..046045f 100644 Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil index 7b130e0..7759910 100644 Binary files a/piet-gpu/shader/gen/tile_alloc.dxil and b/piet-gpu/shader/gen/tile_alloc.dxil differ diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs index 8f21485..2f4b85e 100644 --- a/piet-gpu/src/encoder.rs +++ b/piet-gpu/src/encoder.rs @@ -37,147 +37,6 @@ pub struct Encoder { n_clip: u32, } -#[derive(Copy, Clone, Debug)] -pub struct EncodedSceneRef<'a, T: Copy + Pod> { - pub transform_stream: &'a [T], - pub tag_stream: &'a [u8], - pub pathseg_stream: &'a [u8], - pub linewidth_stream: &'a [f32], - pub drawtag_stream: &'a [u32], - pub drawdata_stream: &'a [u8], - pub n_path: u32, - pub n_pathseg: u32, - pub n_clip: u32, - pub ramp_data: &'a [u32], -} - -impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> { - /// Return a config for the element processing pipeline. - /// - /// This does not include further pipeline processing. Also returns the - /// beginning of free memory. 
- pub fn stage_config(&self) -> (Config, usize) { - // Layout of scene buffer - let drawtag_offset = 0; - let n_drawobj = self.n_drawobj(); - let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize); - let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE; - let trans_offset = drawdata_offset + self.drawdata_stream.len(); - let n_trans = self.transform_stream.len(); - let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize); - let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE; - let n_linewidth = self.linewidth_stream.len(); - let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE; - let n_pathtag = self.tag_stream.len(); - let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize); - let pathseg_offset = pathtag_offset + n_pathtag_padded; - - // Layout of memory - let mut alloc = 0; - let trans_alloc = alloc; - alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE; - let pathseg_alloc = alloc; - alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE; - let path_bbox_alloc = alloc; - let n_path = self.n_path as usize; - alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE; - let drawmonoid_alloc = alloc; - alloc += n_drawobj_padded * DRAWMONOID_SIZE; - let anno_alloc = alloc; - alloc += n_drawobj * ANNOTATED_SIZE; - let clip_alloc = alloc; - let n_clip = self.n_clip as usize; - const CLIP_SIZE: usize = 4; - alloc += n_clip * CLIP_SIZE; - let clip_bic_alloc = alloc; - const CLIP_BIC_SIZE: usize = 8; - // This can round down, as we only reduce the prefix - alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE; - let clip_stack_alloc = alloc; - const CLIP_EL_SIZE: usize = 20; - alloc += n_clip * CLIP_EL_SIZE; - let clip_bbox_alloc = alloc; - const CLIP_BBOX_SIZE: usize = 16; - alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE; - let draw_bbox_alloc = alloc; - alloc += n_drawobj * DRAW_BBOX_SIZE; - let drawinfo_alloc = alloc; - // TODO: not optimized; it can be accumulated during encoding or summed from drawtags - const MAX_DRAWINFO_SIZE: usize = 44; - alloc += n_drawobj * MAX_DRAWINFO_SIZE; - - let config = Config { - n_elements: n_drawobj as u32, - n_pathseg: self.n_pathseg, - pathseg_alloc: pathseg_alloc as u32, - anno_alloc: anno_alloc as u32, - trans_alloc: trans_alloc as u32, - path_bbox_alloc: path_bbox_alloc as u32, - drawmonoid_alloc: drawmonoid_alloc as u32, - clip_alloc: clip_alloc as u32, - clip_bic_alloc: clip_bic_alloc as u32, - clip_stack_alloc: clip_stack_alloc as u32, - clip_bbox_alloc: clip_bbox_alloc as u32, - draw_bbox_alloc: draw_bbox_alloc as u32, - drawinfo_alloc: drawinfo_alloc as u32, - n_trans: n_trans as u32, - n_path: self.n_path, - n_clip: self.n_clip, - trans_offset: trans_offset as u32, - linewidth_offset: linewidth_offset as u32, - pathtag_offset: pathtag_offset as u32, - pathseg_offset: pathseg_offset as u32, - drawtag_offset: drawtag_offset as u32, - drawdata_offset: drawdata_offset as u32, - ..Default::default() - }; - (config, alloc) - } - - pub fn write_scene(&self, buf: &mut BufWrite) { - buf.extend_slice(&self.drawtag_stream); - let n_drawobj = self.drawtag_stream.len(); - buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE); - buf.extend_slice(&self.drawdata_stream); - buf.extend_slice(&self.transform_stream); - let n_trans = self.transform_stream.len(); - buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE); - buf.extend_slice(&self.linewidth_stream); - 
buf.extend_slice(&self.tag_stream); - let n_pathtag = self.tag_stream.len(); - buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize)); - buf.extend_slice(&self.pathseg_stream); - } - - /// The number of draw objects in the draw object stream. - pub(crate) fn n_drawobj(&self) -> usize { - self.drawtag_stream.len() - } - - /// The number of paths. - pub(crate) fn n_path(&self) -> u32 { - self.n_path - } - - /// The number of path segments. - pub(crate) fn n_pathseg(&self) -> u32 { - self.n_pathseg - } - - pub(crate) fn n_transform(&self) -> usize { - self.transform_stream.len() - } - - /// The number of tags in the path stream. - pub(crate) fn n_pathtag(&self) -> usize { - self.tag_stream.len() - } - - pub(crate) fn n_clip(&self) -> u32 { - self.n_clip - } -} - /// A scene fragment encoding a glyph. /// /// This is a reduced version of the full encoder. @@ -471,21 +330,6 @@ impl Encoder { self.n_path += glyph.n_path; self.n_pathseg += glyph.n_pathseg; } - - pub(crate) fn scene_ref(&self) -> EncodedSceneRef { - EncodedSceneRef { - transform_stream: &self.transform_stream, - tag_stream: &self.tag_stream, - pathseg_stream: &self.pathseg_stream, - linewidth_stream: &self.linewidth_stream, - drawtag_stream: &self.drawtag_stream, - drawdata_stream: &self.drawdata_stream, - n_path: self.n_path, - n_pathseg: self.n_pathseg, - n_clip: self.n_clip, - ramp_data: &[], - } - } } fn align_up(x: usize, align: usize) -> usize { diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index b3ead90..773007d 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -10,10 +10,7 @@ mod text; use std::convert::TryInto; -use bytemuck::Pod; - pub use blend::{Blend, BlendMode, CompositionMode}; -pub use encoder::EncodedSceneRef; pub use render_ctx::PietGpuRenderContext; pub use gradient::Colrv1RadialGradient; @@ -21,11 +18,11 @@ use piet::kurbo::Vec2; use piet::{ImageFormat, RenderContext}; use piet_gpu_hal::{ - include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, - ImageLayout, Pipeline, QueryPool, Session, + include_shader, BindType, Buffer, BufferUsage, CmdBuf, ComputePassDescriptor, DescriptorSet, + Error, Image, ImageLayout, Pipeline, QueryPool, Session, }; -use pico_svg::PicoSvg; +pub use pico_svg::PicoSvg; use stages::{ClipBinding, ElementBinding, ElementCode}; use crate::stages::{ClipCode, Config, ElementStage}; @@ -358,27 +355,16 @@ impl Renderer { render_ctx: &mut PietGpuRenderContext, buf_ix: usize, ) -> Result<(), Error> { - let mut scene = render_ctx.encoded_scene(); - let ramp_data = render_ctx.get_ramp_data(); - scene.ramp_data = &ramp_data; - self.upload_scene(&scene, buf_ix) - } - - pub fn upload_scene( - &mut self, - scene: &EncodedSceneRef, - buf_ix: usize, - ) -> Result<(), Error> { - let (mut config, mut alloc) = scene.stage_config(); - let n_drawobj = scene.n_drawobj(); + let (mut config, mut alloc) = render_ctx.stage_config(); + let n_drawobj = render_ctx.n_drawobj(); // TODO: be more consistent in size types - let n_path = scene.n_path() as usize; + let n_path = render_ctx.n_path() as usize; self.n_paths = n_path; - self.n_transform = scene.n_transform(); - self.n_drawobj = scene.n_drawobj(); - self.n_pathseg = scene.n_pathseg() as usize; - self.n_pathtag = scene.n_pathtag(); - self.n_clip = scene.n_clip(); + self.n_transform = render_ctx.n_transform(); + self.n_drawobj = render_ctx.n_drawobj(); + self.n_pathseg = render_ctx.n_pathseg() as usize; + self.n_pathtag = render_ctx.n_pathtag(); + self.n_clip = render_ctx.n_clip(); // These constants depend 
on encoding and may need to be updated. // Perhaps we can plumb these from piet-gpu-derive? @@ -402,18 +388,19 @@ impl Renderer { // TODO: reallocate scene buffer if size is inadequate { let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?; - scene.write_scene(&mut mapped_scene); + render_ctx.write_scene(&mut mapped_scene); } self.config_bufs[buf_ix].write(&[config])?; self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?; // Upload gradient data. - if !scene.ramp_data.is_empty() { + let ramp_data = render_ctx.get_ramp_data(); + if !ramp_data.is_empty() { assert!( self.gradient_bufs[buf_ix].size() as usize - >= std::mem::size_of_val(&*scene.ramp_data) + >= std::mem::size_of_val(&*ramp_data) ); - self.gradient_bufs[buf_ix].write(scene.ramp_data)?; + self.gradient_bufs[buf_ix].write(&ramp_data)?; } } Ok(()) @@ -437,10 +424,10 @@ impl Renderer { cmd_buf.copy_buffer_to_image(&self.gradient_bufs[buf_ix], &self.gradients); cmd_buf.image_barrier(&self.gradients, ImageLayout::BlitDst, ImageLayout::General); cmd_buf.reset_query_pool(&query_pool); - cmd_buf.write_timestamp(&query_pool, 0); cmd_buf.begin_debug_label("Element bounding box calculation"); + let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 0, 1)); self.element_stage.record( - cmd_buf, + &mut pass, &self.element_code, &self.element_bindings[buf_ix], self.n_transform as u64, @@ -448,56 +435,59 @@ impl Renderer { self.n_pathtag as u32, self.n_drawobj as u64, ); + pass.end(); cmd_buf.end_debug_label(); - cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); - cmd_buf.begin_debug_label("Clip bounding box calculation"); + let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 2, 3)); + pass.begin_debug_label("Clip bounding box calculation"); self.clip_binding - .record(cmd_buf, &self.clip_code, self.n_clip as u32); - cmd_buf.end_debug_label(); - cmd_buf.begin_debug_label("Element binning"); - cmd_buf.dispatch( + .record(&mut pass, &self.clip_code, self.n_clip as u32); + pass.end_debug_label(); + pass.begin_debug_label("Element binning"); + pass.dispatch( &self.bin_pipeline, &self.bin_ds, (((self.n_paths + 255) / 256) as u32, 1, 1), (256, 1, 1), ); - cmd_buf.end_debug_label(); - cmd_buf.memory_barrier(); - cmd_buf.begin_debug_label("Tile allocation"); - cmd_buf.dispatch( + pass.end_debug_label(); + pass.memory_barrier(); + pass.begin_debug_label("Tile allocation"); + pass.dispatch( &self.tile_pipeline, &self.tile_ds[buf_ix], (((self.n_paths + 255) / 256) as u32, 1, 1), (256, 1, 1), ); - cmd_buf.end_debug_label(); - cmd_buf.write_timestamp(&query_pool, 2); - cmd_buf.memory_barrier(); + pass.end_debug_label(); + pass.end(); cmd_buf.begin_debug_label("Path flattening"); - cmd_buf.dispatch( + cmd_buf.memory_barrier(); + let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 4, 5)); + pass.dispatch( &self.path_pipeline, &self.path_ds, (((self.n_pathseg + 31) / 32) as u32, 1, 1), (32, 1, 1), ); + pass.end(); cmd_buf.end_debug_label(); - cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.memory_barrier(); cmd_buf.begin_debug_label("Backdrop propagation"); - cmd_buf.dispatch( + let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 6, 7)); + pass.dispatch( &self.backdrop_pipeline, &self.backdrop_ds, (((self.n_paths + 255) / 256) as u32, 1, 1), (256, self.backdrop_y, 1), ); + pass.end(); cmd_buf.end_debug_label(); - cmd_buf.write_timestamp(&query_pool, 4); // TODO: redo query accounting - 
cmd_buf.write_timestamp(&query_pool, 5); cmd_buf.memory_barrier(); cmd_buf.begin_debug_label("Coarse raster"); - cmd_buf.dispatch( + let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 8, 9)); + pass.dispatch( &self.coarse_pipeline, &self.coarse_ds[buf_ix], ( @@ -507,11 +497,13 @@ impl Renderer { ), (256, 1, 1), ); + pass.end(); cmd_buf.end_debug_label(); - cmd_buf.write_timestamp(&query_pool, 6); cmd_buf.memory_barrier(); cmd_buf.begin_debug_label("Fine raster"); - cmd_buf.dispatch( + let mut pass = + cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11)); + pass.dispatch( &self.k4_pipeline, &self.k4_ds, ( @@ -521,8 +513,8 @@ impl Renderer { ), (8, 4, 1), ); + pass.end(); cmd_buf.end_debug_label(); - cmd_buf.write_timestamp(&query_pool, 7); cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); } diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index ad608ca..dca03eb 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; -use crate::encoder::{EncodedSceneRef, GlyphEncoder}; +use crate::encoder::GlyphEncoder; use crate::stages::{Config, Transform}; use crate::MAX_BLEND_STACK; use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape}; @@ -97,10 +97,6 @@ impl PietGpuRenderContext { self.new_encoder.stage_config() } - pub fn encoded_scene(&self) -> EncodedSceneRef { - self.new_encoder.scene_ref() - } - /// Number of draw objects. /// /// This is for the new element processing pipeline. It's not necessarily the diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs index 52b8bf1..5442ba3 100644 --- a/piet-gpu/src/stages.rs +++ b/piet-gpu/src/stages.rs @@ -26,7 +26,7 @@ use bytemuck::{Pod, Zeroable}; pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE}; pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE}; pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE}; -use piet_gpu_hal::{Buffer, CmdBuf, Session}; +use piet_gpu_hal::{Buffer, ComputePass, Session}; pub use transform::{ Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE, }; @@ -140,7 +140,7 @@ impl ElementStage { pub unsafe fn record( &self, - cmd_buf: &mut CmdBuf, + pass: &mut ComputePass, code: &ElementCode, binding: &ElementBinding, n_transform: u64, @@ -149,14 +149,14 @@ impl ElementStage { n_drawobj: u64, ) { self.transform_stage.record( - cmd_buf, + pass, &code.transform_code, &binding.transform_binding, n_transform, ); // No memory barrier needed here; path has at least one before pathseg self.path_stage.record( - cmd_buf, + pass, &code.path_code, &binding.path_binding, n_paths, @@ -164,6 +164,6 @@ impl ElementStage { ); // No memory barrier needed here; draw has at least one before draw_leaf self.draw_stage - .record(cmd_buf, &code.draw_code, &binding.draw_binding, n_drawobj); + .record(pass, &code.draw_code, &binding.draw_binding, n_drawobj); } } diff --git a/piet-gpu/src/stages/clip.rs b/piet-gpu/src/stages/clip.rs index e4bc3db..2fd195b 100644 --- a/piet-gpu/src/stages/clip.rs +++ b/piet-gpu/src/stages/clip.rs @@ -16,7 +16,7 @@ //! The clip processing stage (includes substages). 
-use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session}; +use piet_gpu_hal::{include_shader, BindType, Buffer, ComputePass, DescriptorSet, Pipeline, Session}; // Note that this isn't the code/stage/binding pattern of most of the other stages // in the new element processing pipeline. We want to move those temporary buffers @@ -69,26 +69,26 @@ impl ClipBinding { /// Record the clip dispatches. /// /// Assumes memory barrier on entry. Provides memory barrier on exit. - pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) { + pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClipCode, n_clip: u32) { let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE; if n_wg_reduce > 0 { - cmd_buf.dispatch( + pass.dispatch( &code.reduce_pipeline, &self.reduce_ds, (n_wg_reduce, 1, 1), (CLIP_PART_SIZE, 1, 1), ); - cmd_buf.memory_barrier(); + pass.memory_barrier(); } let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE; if n_wg > 0 { - cmd_buf.dispatch( + pass.dispatch( &code.leaf_pipeline, &self.leaf_ds, (n_wg, 1, 1), (CLIP_PART_SIZE, 1, 1), ); - cmd_buf.memory_barrier(); + pass.memory_barrier(); } } } diff --git a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs index 21312a4..f0ee2b6 100644 --- a/piet-gpu/src/stages/draw.rs +++ b/piet-gpu/src/stages/draw.rs @@ -19,7 +19,7 @@ use bytemuck::{Pod, Zeroable}; use piet_gpu_hal::{ - include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, + include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session, }; /// The output element of the draw object stage. @@ -130,7 +130,7 @@ impl DrawStage { pub unsafe fn record( &self, - cmd_buf: &mut CmdBuf, + pass: &mut ComputePass, code: &DrawCode, binding: &DrawBinding, size: u64, @@ -140,22 +140,22 @@ impl DrawStage { } let n_workgroups = (size + DRAW_PART_SIZE - 1) / DRAW_PART_SIZE; if n_workgroups > 1 { - cmd_buf.dispatch( + pass.dispatch( &code.reduce_pipeline, &binding.reduce_ds, (n_workgroups as u32, 1, 1), (DRAW_WG as u32, 1, 1), ); - cmd_buf.memory_barrier(); - cmd_buf.dispatch( + pass.memory_barrier(); + pass.dispatch( &code.root_pipeline, &self.root_ds, (1, 1, 1), (DRAW_WG as u32, 1, 1), ); } - cmd_buf.memory_barrier(); - cmd_buf.dispatch( + pass.memory_barrier(); + pass.dispatch( &code.leaf_pipeline, &binding.leaf_ds, (n_workgroups as u32, 1, 1), diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs index 6c524a2..be33041 100644 --- a/piet-gpu/src/stages/path.rs +++ b/piet-gpu/src/stages/path.rs @@ -17,7 +17,7 @@ //! The path stage (includes substages). use piet_gpu_hal::{ - include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, + include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session, }; pub struct PathCode { @@ -148,7 +148,7 @@ impl PathStage { /// those are consumed. Result is written without barrier. 
pub unsafe fn record( &self, - cmd_buf: &mut CmdBuf, + pass: &mut ComputePass, code: &PathCode, binding: &PathBinding, n_paths: u32, @@ -166,15 +166,15 @@ impl PathStage { let reduce_part_tags = REDUCE_PART_SIZE * 4; let n_wg_tag_reduce = (n_tags + reduce_part_tags - 1) / reduce_part_tags; if n_wg_tag_reduce > 1 { - cmd_buf.dispatch( + pass.dispatch( &code.reduce_pipeline, &binding.reduce_ds, (n_wg_tag_reduce, 1, 1), (REDUCE_WG, 1, 1), ); // I think we can skip root if n_wg_tag_reduce == 2 - cmd_buf.memory_barrier(); - cmd_buf.dispatch( + pass.memory_barrier(); + pass.dispatch( &code.tag_root_pipeline, &self.tag_root_ds, (1, 1, 1), @@ -183,15 +183,15 @@ impl PathStage { // No barrier needed here; clear doesn't depend on path tags } let n_wg_clear = (n_paths + CLEAR_WG - 1) / CLEAR_WG; - cmd_buf.dispatch( + pass.dispatch( &code.clear_pipeline, &binding.clear_ds, (n_wg_clear, 1, 1), (CLEAR_WG, 1, 1), ); - cmd_buf.memory_barrier(); + pass.memory_barrier(); let n_wg_pathseg = (n_tags + SCAN_PART_SIZE - 1) / SCAN_PART_SIZE; - cmd_buf.dispatch( + pass.dispatch( &code.pathseg_pipeline, &binding.path_ds, (n_wg_pathseg, 1, 1), diff --git a/piet-gpu/src/stages/transform.rs b/piet-gpu/src/stages/transform.rs index b21712f..8de7cee 100644 --- a/piet-gpu/src/stages/transform.rs +++ b/piet-gpu/src/stages/transform.rs @@ -20,7 +20,7 @@ use bytemuck::{Pod, Zeroable}; use piet::kurbo::Affine; use piet_gpu_hal::{ - include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, + include_shader, BindType, Buffer, BufferUsage, ComputePass, DescriptorSet, Pipeline, Session, }; /// An affine transform. @@ -132,7 +132,7 @@ impl TransformStage { pub unsafe fn record( &self, - cmd_buf: &mut CmdBuf, + pass: &mut ComputePass, code: &TransformCode, binding: &TransformBinding, size: u64, @@ -142,22 +142,22 @@ impl TransformStage { } let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE; if n_workgroups > 1 { - cmd_buf.dispatch( + pass.dispatch( &code.reduce_pipeline, &binding.reduce_ds, (n_workgroups as u32, 1, 1), (TRANSFORM_WG as u32, 1, 1), ); - cmd_buf.memory_barrier(); - cmd_buf.dispatch( + pass.memory_barrier(); + pass.dispatch( &code.root_pipeline, &self.root_ds, (1, 1, 1), (TRANSFORM_WG as u32, 1, 1), ); - cmd_buf.memory_barrier(); + pass.memory_barrier(); } - cmd_buf.dispatch( + pass.dispatch( &code.leaf_pipeline, &binding.leaf_ds, (n_workgroups as u32, 1, 1), diff --git a/piet-gpu/src/test_scenes.rs b/piet-gpu/src/test_scenes.rs index cf5a50d..bfd2af2 100644 --- a/piet-gpu/src/test_scenes.rs +++ b/piet-gpu/src/test_scenes.rs @@ -21,12 +21,7 @@ pub fn render_blend_test(rc: &mut PietGpuRenderContext, i: usize, blend: Blend) rc.restore().unwrap(); } -pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) { - let xml_str = std::fs::read_to_string(filename).unwrap(); - let start = std::time::Instant::now(); - let svg = PicoSvg::load(&xml_str, scale).unwrap(); - println!("parsing time: {:?}", start.elapsed()); - +pub fn render_svg(rc: &mut impl RenderContext, svg: &PicoSvg) { let start = std::time::Instant::now(); svg.render(rc); println!("flattening and encoding time: {:?}", start.elapsed()); diff --git a/tests/src/clear.rs b/tests/src/clear.rs index fc6f063..af4b8ea 100644 --- a/tests/src/clear.rs +++ b/tests/src/clear.rs @@ -16,11 +16,11 @@ //! Utilities (and a benchmark) for clearing buffers with compute shaders. 
-use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet}; +use piet_gpu_hal::{include_shader, BindType, BufferUsage, ComputePass, DescriptorSet}; use piet_gpu_hal::{Buffer, Pipeline}; use crate::config::Config; -use crate::runner::{Commands, Runner}; +use crate::runner::Runner; use crate::test_result::TestResult; const WG_SIZE: u64 = 256; @@ -52,9 +52,9 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult let mut total_elapsed = 0.0; for i in 0..n_iter { let mut commands = runner.commands(); - commands.write_timestamp(0); - stage.record(&mut commands, &code, &binding); - commands.write_timestamp(1); + let mut pass = commands.compute_pass(0, 1); + stage.record(&mut pass, &code, &binding); + pass.end(); if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); @@ -108,17 +108,12 @@ impl ClearStage { ClearBinding { descriptor_set } } - pub unsafe fn record( - &self, - commands: &mut Commands, - code: &ClearCode, - bindings: &ClearBinding, - ) { + pub unsafe fn record(&self, pass: &mut ComputePass, code: &ClearCode, bindings: &ClearBinding) { let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE; // An issue: for clearing large buffers (>16M), we need to check the // number of workgroups against the (dynamically detected) limit, and // potentially issue multiple dispatches. - commands.cmd_buf.dispatch( + pass.dispatch( &code.pipeline, &bindings.descriptor_set, (n_workgroups as u32, 1, 1), diff --git a/tests/src/clip.rs b/tests/src/clip.rs index 4a38949..b1f8613 100644 --- a/tests/src/clip.rs +++ b/tests/src/clip.rs @@ -58,11 +58,11 @@ pub unsafe fn clip_test(runner: &mut Runner, config: &Config) -> TestResult { let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf); let mut commands = runner.commands(); - commands.write_timestamp(0); commands.upload(&memory); - binding.record(&mut commands.cmd_buf, &code, n_clip as u32); + let mut pass = commands.compute_pass(0, 1); + binding.record(&mut pass, &code, n_clip as u32); + pass.end(); commands.download(&memory); - commands.write_timestamp(1); runner.submit(commands); let dst = memory.map_read(..); if let Some(failure) = data.verify(&dst) { diff --git a/tests/src/draw.rs b/tests/src/draw.rs index 4372da4..dc82572 100644 --- a/tests/src/draw.rs +++ b/tests/src/draw.rs @@ -77,9 +77,9 @@ pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult { let n_iter = config.n_iter; for i in 0..n_iter { let mut commands = runner.commands(); - commands.write_timestamp(0); - stage.record(&mut commands.cmd_buf, &code, &binding, n_tag); - commands.write_timestamp(1); + let mut pass = commands.compute_pass(0, 1); + stage.record(&mut pass, &code, &binding, n_tag); + pass.end(); if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&memory); diff --git a/tests/src/linkedlist.rs b/tests/src/linkedlist.rs index 5767806..e24adcb 100644 --- a/tests/src/linkedlist.rs +++ b/tests/src/linkedlist.rs @@ -45,9 +45,7 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR for i in 0..n_iter { let mut commands = runner.commands(); // Might clear only buckets to save time. 
- commands.write_timestamp(0); stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf); - commands.write_timestamp(1); if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&mem_buf); @@ -107,12 +105,14 @@ impl LinkedListStage { commands.cmd_buf.clear_buffer(out_buf, None); commands.cmd_buf.memory_barrier(); let n_workgroups = N_BUCKETS / WG_SIZE; - commands.cmd_buf.dispatch( + let mut pass = commands.compute_pass(0, 1); + pass.dispatch( &code.pipeline, &bindings.descriptor_set, (n_workgroups as u32, 1, 1), (WG_SIZE as u32, 1, 1), ); + pass.end(); } } diff --git a/tests/src/message_passing.rs b/tests/src/message_passing.rs index c5d989b..39e71dc 100644 --- a/tests/src/message_passing.rs +++ b/tests/src/message_passing.rs @@ -59,9 +59,7 @@ pub unsafe fn run_message_passing_test( let mut failures = 0; for _ in 0..n_iter { let mut commands = runner.commands(); - commands.write_timestamp(0); stage.record(&mut commands, &code, &binding, &out_buf.dev_buf); - commands.write_timestamp(1); commands.cmd_buf.memory_barrier(); commands.download(&out_buf); total_elapsed += runner.submit(commands); @@ -128,11 +126,13 @@ impl MessagePassingStage { commands.cmd_buf.clear_buffer(&self.data_buf, None); commands.cmd_buf.clear_buffer(out_buf, None); commands.cmd_buf.memory_barrier(); - commands.cmd_buf.dispatch( + let mut pass = commands.compute_pass(0, 1); + pass.dispatch( &code.pipeline, &bindings.descriptor_set, (256, 1, 1), (256, 1, 1), ); + pass.end(); } } diff --git a/tests/src/path.rs b/tests/src/path.rs index bf72c68..9d794e1 100644 --- a/tests/src/path.rs +++ b/tests/src/path.rs @@ -105,15 +105,15 @@ pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult { let mut commands = runner.commands(); commands.cmd_buf.copy_buffer(&memory_init, &memory.dev_buf); commands.cmd_buf.memory_barrier(); - commands.write_timestamp(0); + let mut pass = commands.compute_pass(0, 1); stage.record( - &mut commands.cmd_buf, + &mut pass, &code, &binding, path_data.n_path, path_data.tags.len() as u32, ); - commands.write_timestamp(1); + pass.end(); if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&memory); diff --git a/tests/src/prefix.rs b/tests/src/prefix.rs index 4174d8d..dbaf256 100644 --- a/tests/src/prefix.rs +++ b/tests/src/prefix.rs @@ -85,9 +85,7 @@ pub unsafe fn run_prefix_test( let mut total_elapsed = 0.0; for i in 0..n_iter { let mut commands = runner.commands(); - commands.write_timestamp(0); stage.record(&mut commands, &code, &binding); - commands.write_timestamp(1); if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); @@ -159,12 +157,14 @@ impl PrefixStage { let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG; commands.cmd_buf.clear_buffer(&self.state_buf, None); commands.cmd_buf.memory_barrier(); - commands.cmd_buf.dispatch( + let mut pass = commands.compute_pass(0, 1); + pass.dispatch( &code.pipeline, &bindings.descriptor_set, (n_workgroups as u32, 1, 1), (WG_SIZE as u32, 1, 1), ); + pass.end(); // One thing that's missing here is registering the buffers so // they can be safely dropped by Rust code before the execution // of the command buffer completes. 
diff --git a/tests/src/prefix_tree.rs b/tests/src/prefix_tree.rs index 24be2af..3c9c813 100644 --- a/tests/src/prefix_tree.rs +++ b/tests/src/prefix_tree.rs @@ -66,9 +66,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul let mut commands = runner.commands(); commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf); commands.cmd_buf.memory_barrier(); - commands.write_timestamp(0); stage.record(&mut commands, &code, &binding); - commands.write_timestamp(1); if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); @@ -175,33 +173,35 @@ impl PrefixTreeStage { code: &PrefixTreeCode, bindings: &PrefixTreeBinding, ) { + let mut pass = commands.compute_pass(0, 1); let n = self.tmp_bufs.len(); for i in 0..n { let n_workgroups = self.sizes[i + 1]; - commands.cmd_buf.dispatch( + pass.dispatch( &code.reduce_pipeline, &bindings.descriptor_sets[i], (n_workgroups as u32, 1, 1), (WG_SIZE as u32, 1, 1), ); - commands.cmd_buf.memory_barrier(); + pass.memory_barrier(); } - commands.cmd_buf.dispatch( + pass.dispatch( &code.root_pipeline, &bindings.descriptor_sets[n], (1, 1, 1), (WG_SIZE as u32, 1, 1), ); for i in (0..n).rev() { - commands.cmd_buf.memory_barrier(); + pass.memory_barrier(); let n_workgroups = self.sizes[i + 1]; - commands.cmd_buf.dispatch( + pass.dispatch( &code.scan_pipeline, &bindings.descriptor_sets[2 * n - i], (n_workgroups as u32, 1, 1), (WG_SIZE as u32, 1, 1), ); } + pass.end(); } } diff --git a/tests/src/runner.rs b/tests/src/runner.rs index 1fd6774..3ba8223 100644 --- a/tests/src/runner.rs +++ b/tests/src/runner.rs @@ -20,8 +20,8 @@ use std::ops::RangeBounds; use bytemuck::Pod; use piet_gpu_hal::{ - BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags, - QueryPool, Session, + BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, ComputePass, + ComputePassDescriptor, Instance, InstanceFlags, QueryPool, Session, }; pub struct Runner { @@ -118,8 +118,14 @@ impl Runner { } impl Commands { - pub unsafe fn write_timestamp(&mut self, query: u32) { - self.cmd_buf.write_timestamp(&self.query_pool, query); + /// Start a compute pass with timer queries. + pub unsafe fn compute_pass(&mut self, start_query: u32, end_query: u32) -> ComputePass { + self.cmd_buf + .begin_compute_pass(&ComputePassDescriptor::timer( + &self.query_pool, + start_query, + end_query, + )) } pub unsafe fn upload(&mut self, buf: &BufStage) { diff --git a/tests/src/transform.rs b/tests/src/transform.rs index 6edcc3f..43bfc67 100644 --- a/tests/src/transform.rs +++ b/tests/src/transform.rs @@ -61,9 +61,9 @@ pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult let n_iter = config.n_iter; for i in 0..n_iter { let mut commands = runner.commands(); - commands.write_timestamp(0); - stage.record(&mut commands.cmd_buf, &code, &binding, n_elements); - commands.write_timestamp(1); + let mut pass = commands.compute_pass(0, 1); + stage.record(&mut pass, &code, &binding, n_elements); + pass.end(); if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&memory);