diff --git a/.gitignore b/.gitignore index 2a2be18..2731039 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ /target Cargo.lock +/trace.json examples/assets/downloads/* !examples/assets/downloads/.tracked diff --git a/Cargo.toml b/Cargo.toml index f0b6e13..27fc784 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,15 +50,18 @@ futures-intrusive = "0.5.0" parking_lot = "0.12" smallvec = "1.8.0" vello_encoding = { path = "crates/encoding" } +wgpu-profiler = { workspace = true, optional = true } [workspace.dependencies] bytemuck = { version = "1.12.1", features = ["derive"] } fello = { git = "https://github.com/dfrg/fount", rev = "58a284eaae67512fb61cf76177c5d33238d79cb1" } peniko = { git = "https://github.com/linebender/peniko", rev = "cafdac9a211a0fb2fec5656bd663d1ac770bcc81" } -wgpu = "0.16" # NOTE: Make sure to keep this in sync with the version badge in README.md +wgpu = "0.16" # NOTE: Make sure to keep this in sync with the version badge in README.md + # Used for examples clap = "4.1.0" anyhow = "1.0" -instant = { version = "0.1.12", features = [ "wasm-bindgen" ] } +instant = { version = "0.1.12", features = ["wasm-bindgen"] } pollster = "0.3.0" +wgpu-profiler = "0.12.1" diff --git a/examples/headless/src/main.rs b/examples/headless/src/main.rs index 5a49d05..5f672b1 100644 --- a/examples/headless/src/main.rs +++ b/examples/headless/src/main.rs @@ -89,6 +89,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> { device, &RendererOptions { surface_format: None, + timestamp_period: queue.get_timestamp_period(), }, ) .or_else(|_| bail!("Got non-Send/Sync error from creating renderer"))?; diff --git a/examples/with_bevy/src/main.rs b/examples/with_bevy/src/main.rs index d4d57d6..2e3dd91 100644 --- a/examples/with_bevy/src/main.rs +++ b/examples/with_bevy/src/main.rs @@ -22,11 +22,14 @@ struct VelloRenderer(Renderer); impl FromWorld for VelloRenderer { fn from_world(world: &mut World) -> Self { let device = world.get_resource::().unwrap(); + let queue = world.get_resource::().unwrap(); + VelloRenderer( Renderer::new( device.wgpu_device(), &RendererOptions { surface_format: None, + timestamp_period: queue.0.get_timestamp_period(), }, ) .unwrap(), diff --git a/examples/with_winit/Cargo.toml b/examples/with_winit/Cargo.toml index 24c157e..3fb7410 100644 --- a/examples/with_winit/Cargo.toml +++ b/examples/with_winit/Cargo.toml @@ -20,12 +20,13 @@ name = "with_winit_bin" path = "src/main.rs" [dependencies] -vello = { path = "../../", features = ["buffer_labels"] } +vello = { path = "../../", features = ["buffer_labels", "wgpu-profiler"] } scenes = { path = "../scenes" } anyhow = { workspace = true } clap = { workspace = true, features = ["derive"] } instant = { workspace = true } pollster = { workspace = true } +wgpu-profiler = { workspace = true } wgpu = { workspace = true } winit = "0.28.1" diff --git a/examples/with_winit/src/lib.rs b/examples/with_winit/src/lib.rs index 8a5dacc..961c5fc 100644 --- a/examples/with_winit/src/lib.rs +++ b/examples/with_winit/src/lib.rs @@ -14,7 +14,7 @@ // // Also licensed under MIT license, at your choice. -use instant::Instant; +use instant::{Duration, Instant}; use std::collections::HashSet; use anyhow::Result; @@ -83,6 +83,7 @@ fn run( &render_cx.devices[id].device, &RendererOptions { surface_format: Some(render_state.surface.format), + timestamp_period: render_cx.devices[id].queue.get_timestamp_period(), }, ) .expect("Could create renderer"), @@ -121,7 +122,9 @@ fn run( if let Some(set_scene) = args.scene { scene_ix = set_scene; } + let mut profile_stored = None; let mut prev_scene_ix = scene_ix - 1; + let mut profile_taken = Instant::now(); // _event_loop is used on non-wasm platforms to create new windows event_loop.run(move |event, _event_loop, control_flow| match event { Event::WindowEvent { @@ -163,6 +166,29 @@ fn run( Some(VirtualKeyCode::C) => { stats.clear_min_and_max(); } + Some(VirtualKeyCode::P) => { + if let Some(renderer) = &renderers[render_state.surface.dev_id] { + if let Some(profile_result) = &renderer + .profile_result + .as_ref() + .or(profile_stored.as_ref()) + { + // There can be empty results if the required features aren't supported + if !profile_result.is_empty() { + let path = std::path::Path::new("trace.json"); + match wgpu_profiler::chrometrace::write_chrometrace( + path, + profile_result, + ) { + Ok(()) => { + println!("Wrote trace to path {path:?}") + } + Err(e) => eprintln!("Failed to write trace {e}"), + } + } + } + } + } Some(VirtualKeyCode::V) => { vsync_on = !vsync_on; render_cx.set_present_mode( @@ -342,6 +368,25 @@ fn run( complexity_shown.then_some(scene_complexity).flatten(), vsync_on, ); + if let Some(profiling_result) = renderers[render_state.surface.dev_id] + .as_mut() + .and_then(|it| it.profile_result.take()) + { + if profile_stored.is_none() || profile_taken.elapsed() > Duration::from_secs(1) + { + profile_stored = Some(profiling_result); + profile_taken = Instant::now(); + } + } + if let Some(profiling_result) = profile_stored.as_ref() { + stats::draw_gpu_profiling( + &mut builder, + scene_params.text, + width as f64, + height as f64, + profiling_result, + ) + } } let surface_texture = render_state .surface @@ -438,6 +483,9 @@ fn run( &render_cx.devices[id].device, &RendererOptions { surface_format: Some(render_state.surface.format), + timestamp_period: render_cx.devices[id] + .queue + .get_timestamp_period(), }, ) .expect("Could create renderer") diff --git a/examples/with_winit/src/stats.rs b/examples/with_winit/src/stats.rs index 694ecdf..a18d0d4 100644 --- a/examples/with_winit/src/stats.rs +++ b/examples/with_winit/src/stats.rs @@ -15,12 +15,13 @@ // Also licensed under MIT license, at your choice. use scenes::SimpleText; -use std::collections::VecDeque; +use std::{collections::VecDeque, time::Duration}; use vello::{ - kurbo::{Affine, PathEl, Rect}, + kurbo::{Affine, Line, PathEl, Rect}, peniko::{Brush, Color, Fill, Stroke}, BumpAllocators, SceneBuilder, }; +use wgpu_profiler::GpuTimerScopeResult; const SLIDING_WINDOW_SIZE: usize = 100; @@ -247,3 +248,204 @@ impl Stats { fn round_up(n: usize, f: usize) -> usize { n - 1 - (n - 1) % f + f } + +const COLORS: &[Color] = &[ + Color::AQUA, + Color::RED, + Color::ALICE_BLUE, + Color::YELLOW, + Color::GREEN, + Color::BLUE, + Color::ORANGE, + Color::WHITE, +]; + +pub fn draw_gpu_profiling( + sb: &mut SceneBuilder, + text: &mut SimpleText, + viewport_width: f64, + viewport_height: f64, + profiles: &[GpuTimerScopeResult], +) { + if profiles.is_empty() { + return; + } + let width = (viewport_width * 0.3).clamp(150., 450.); + let height = width * 1.5; + let y_offset = viewport_height - height; + let offset = Affine::translate((0., y_offset)); + + // Draw the background + sb.fill( + Fill::NonZero, + offset, + &Brush::Solid(Color::rgba8(0, 0, 0, 200)), + None, + &Rect::new(0., 0., width, height), + ); + // Find the range of the samples, so we can normalise them + let mut min = f64::MAX; + let mut max = f64::MIN; + let mut max_depth = 0; + let mut depth = 0; + let mut count = 0; + traverse_profiling(profiles, &mut |profile, stage| { + match stage { + TraversalStage::Enter => { + count += 1; + min = min.min(profile.time.start); + max = max.max(profile.time.end); + max_depth = max_depth.max(depth); + // Apply a higher depth to the children + depth += 1; + } + TraversalStage::Leave => depth -= 1, + } + }); + let total_time = max - min; + { + let labels = [ + format!("GPU Time: {:.2?}", Duration::from_secs_f64(total_time)), + "Press P to save a trace".to_string(), + ]; + + // height / 5 is dedicated to the text labels and the rest is filled by the frame time. + let text_height = height * 0.2 / (1 + labels.len()) as f64; + let left_margin = width * 0.01; + let text_size = (text_height * 0.9) as f32; + for (i, label) in labels.iter().enumerate() { + text.add( + sb, + None, + text_size, + Some(&Brush::Solid(Color::WHITE)), + offset * Affine::translate((left_margin, (i + 1) as f64 * text_height)), + label, + ); + } + + let text_size = (text_height * 0.9) as f32; + for (i, label) in labels.iter().enumerate() { + text.add( + sb, + None, + text_size, + Some(&Brush::Solid(Color::WHITE)), + offset * Affine::translate((left_margin, (i + 1) as f64 * text_height)), + label, + ); + } + } + let timeline_start_y = height * 0.21; + let timeline_range_y = height * 0.78; + let timeline_range_end = timeline_start_y + timeline_range_y; + + // Add 6 items worth of margin + let text_height = timeline_range_y / (6 + count) as f64; + let left_margin = width * 0.35; + let mut cur_text_y = timeline_start_y; + let mut cur_index = 0; + let mut depth = 0; + // Leave 1 bar's worth of margin + let depth_width = width * 0.28 / (max_depth + 1) as f64; + let depth_size = depth_width * 0.8; + traverse_profiling(profiles, &mut |profile, stage| { + if let TraversalStage::Enter = stage { + let start_normalised = + ((profile.time.start - min) / total_time) * timeline_range_y + timeline_start_y; + let end_normalised = + ((profile.time.end - min) / total_time) * timeline_range_y + timeline_start_y; + + let color = COLORS[cur_index % COLORS.len()]; + let x = width * 0.01 + (depth as f64 * depth_width); + sb.fill( + Fill::NonZero, + offset, + &Brush::Solid(color), + None, + &Rect::new(x, start_normalised, x + depth_size, end_normalised), + ); + + let mut text_start = start_normalised; + let nested = !profile.nested_scopes.is_empty(); + if nested { + // If we have children, leave some more space for them + text_start -= text_height * 0.7; + } + let this_time = profile.time.end - profile.time.start; + // Highlight as important if more than 10% of the total time, or more than 1ms + let slow = this_time * 20. >= total_time || this_time >= 0.001; + let text_y = text_start + // Ensure that we don't overlap the previous item + .max(cur_text_y) + // Ensure that all remaining items can fit + .min(timeline_range_end - (count - cur_index) as f64 * text_height); + let (text_height, text_color) = if slow { + (text_height, Color::WHITE) + } else { + (text_height * 0.6, Color::LIGHT_GRAY) + }; + let text_size = (text_height * 0.9) as f32; + // Text is specified by the baseline, but the y positions all refer to the top of the text + cur_text_y = text_y + text_height; + let label = format!( + "{:.2?} - {:.30}", + Duration::from_secs_f64(this_time), + profile.label + ); + sb.fill( + Fill::NonZero, + offset, + &Brush::Solid(color), + None, + &Rect::new( + width * 0.31, + cur_text_y - text_size as f64 * 0.7, + width * 0.34, + cur_text_y, + ), + ); + text.add( + sb, + None, + text_size, + Some(&Brush::Solid(text_color)), + offset * Affine::translate((left_margin, cur_text_y)), + &label, + ); + if !nested && slow { + sb.stroke( + &Stroke::new(2.), + offset, + &Brush::Solid(color), + None, + &Line::new( + (x + depth_size, (end_normalised + start_normalised) / 2.), + (width * 0.31, cur_text_y - text_size as f64 * 0.35), + ), + ); + } + cur_index += 1; + // Higher depth applies only to the children + depth += 1; + } else { + depth -= 1; + } + }); +} + +enum TraversalStage { + Enter, + Leave, +} + +fn traverse_profiling( + profiles: &[GpuTimerScopeResult], + callback: &mut impl FnMut(&GpuTimerScopeResult, TraversalStage), +) { + for profile in profiles { + callback(profile, TraversalStage::Enter); + traverse_profiling(&profile.nested_scopes, &mut *callback); + callback(profile, TraversalStage::Leave); + } +} diff --git a/src/engine.rs b/src/engine.rs index 4a910e6..59650f9 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -22,8 +22,8 @@ use std::{ }; use wgpu::{ - BindGroup, BindGroupLayout, Buffer, BufferUsages, ComputePipeline, Device, Queue, Texture, - TextureAspect, TextureUsages, TextureView, TextureViewDimension, + BindGroup, BindGroupLayout, Buffer, BufferUsages, CommandEncoderDescriptor, ComputePipeline, + Device, Queue, Texture, TextureAspect, TextureUsages, TextureView, TextureViewDimension, }; pub type Error = Box; @@ -46,6 +46,7 @@ pub struct Engine { struct Shader { pipeline: ComputePipeline, bind_group_layout: BindGroupLayout, + label: &'static str, } #[derive(Default)] @@ -238,6 +239,7 @@ impl Engine { let shader = Shader { pipeline, bind_group_layout, + label, }; let id = self.shaders.len(); self.shaders.push(shader); @@ -250,11 +252,16 @@ impl Engine { queue: &Queue, recording: &Recording, external_resources: &[ExternalResource], + label: &'static str, + #[cfg(feature = "wgpu-profiler")] profiler: &mut wgpu_profiler::GpuProfiler, ) -> Result<(), Error> { let mut free_bufs: HashSet = Default::default(); let mut free_images: HashSet = Default::default(); - let mut encoder = device.create_command_encoder(&Default::default()); + let mut encoder = + device.create_command_encoder(&CommandEncoderDescriptor { label: Some(label) }); + #[cfg(feature = "wgpu-profiler")] + profiler.begin_scope(label, &mut encoder, device); for command in &recording.commands { match command { Command::Upload(buf_proxy, bytes) => { @@ -366,9 +373,13 @@ impl Engine { &mut self.pool, )?; let mut cpass = encoder.begin_compute_pass(&Default::default()); + #[cfg(feature = "wgpu-profiler")] + profiler.begin_scope(shader.label, &mut cpass, device); cpass.set_pipeline(&shader.pipeline); cpass.set_bind_group(0, &bind_group, &[]); cpass.dispatch_workgroups(wg_size.0, wg_size.1, wg_size.2); + #[cfg(feature = "wgpu-profiler")] + profiler.end_scope(&mut cpass); } Command::Download(proxy) => { let src_buf = self @@ -407,6 +418,8 @@ impl Engine { } } } + #[cfg(feature = "wgpu-profiler")] + profiler.end_scope(&mut encoder); queue.submit(Some(encoder.finish())); for id in free_bufs { if let Some(buf) = self.bind_map.buf_map.remove(&id) { diff --git a/src/lib.rs b/src/lib.rs index 43be77a..9668282 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -40,6 +40,8 @@ use shaders::FullShaders; /// Temporary export, used in with_winit for stats pub use vello_encoding::BumpAllocators; use wgpu::{Device, Queue, SurfaceTexture, TextureFormat, TextureView}; +#[cfg(feature = "wgpu-profiler")] +use wgpu_profiler::GpuProfiler; /// Catch-all error type. pub type Error = Box; @@ -53,6 +55,10 @@ pub struct Renderer { shaders: FullShaders, blit: Option, target: Option, + #[cfg(feature = "wgpu-profiler")] + profiler: GpuProfiler, + #[cfg(feature = "wgpu-profiler")] + pub profile_result: Option>, } /// Parameters used in a single render that are configurable by the client. @@ -70,6 +76,9 @@ pub struct RendererOptions { /// The format of the texture used for surfaces with this renderer/device /// If None, the renderer cannot be used with surfaces pub surface_format: Option, + /// The timestamp period from [`wgpu::Queue::get_timestamp_period`] + /// Used when the wgpu-profiler feature is enabled + pub timestamp_period: f32, } impl Renderer { @@ -85,6 +94,11 @@ impl Renderer { shaders, blit, target: None, + // Use 3 pending frames + #[cfg(feature = "wgpu-profiler")] + profiler: GpuProfiler::new(3, render_options.timestamp_period, device.features()), + #[cfg(feature = "wgpu-profiler")] + profile_result: None, }) } @@ -106,8 +120,15 @@ impl Renderer { *target.as_image().unwrap(), texture, )]; - self.engine - .run_recording(device, queue, &recording, &external_resources)?; + self.engine.run_recording( + device, + queue, + &recording, + &external_resources, + "render_to_texture", + #[cfg(feature = "wgpu-profiler")] + &mut self.profiler, + )?; Ok(()) } @@ -217,7 +238,15 @@ impl Renderer { let recording = render.render_encoding_coarse(encoding, &self.shaders, params, true); let target = render.out_image(); let bump_buf = render.bump_buf(); - self.engine.run_recording(device, queue, &recording, &[])?; + self.engine.run_recording( + device, + queue, + &recording, + &[], + "t_async_coarse", + #[cfg(feature = "wgpu-profiler")] + &mut self.profiler, + )?; let mut bump: Option = None; if let Some(bump_buf) = self.engine.get_download(bump_buf) { @@ -239,8 +268,15 @@ impl Renderer { let mut recording = Recording::default(); render.record_fine(&self.shaders, &mut recording); let external_resources = [ExternalResource::Image(target, texture)]; - self.engine - .run_recording(device, queue, &recording, &external_resources)?; + self.engine.run_recording( + device, + queue, + &recording, + &external_resources, + "t_async_fine", + #[cfg(feature = "wgpu-profiler")] + &mut self.profiler, + )?; Ok(bump) } @@ -301,8 +337,16 @@ impl Renderer { render_pass.set_bind_group(0, &bind_group, &[]); render_pass.draw(0..6, 0..1); } + #[cfg(feature = "wgpu-profiler")] + self.profiler.resolve_queries(&mut encoder); queue.submit(Some(encoder.finish())); self.target = Some(target); + #[cfg(feature = "wgpu-profiler")] + self.profiler.end_frame().unwrap(); + #[cfg(feature = "wgpu-profiler")] + if let Some(result) = self.profiler.process_finished_frame() { + self.profile_result = Some(result); + } Ok(bump) } } diff --git a/src/util.rs b/src/util.rs index add0ea2..c2fc3b8 100644 --- a/src/util.rs +++ b/src/util.rs @@ -134,12 +134,16 @@ impl RenderContext { .await?; let features = adapter.features(); let limits = Limits::default(); + let mut maybe_features = wgpu::Features::CLEAR_TEXTURE; + #[cfg(feature = "wgpu-profiler")] + { + maybe_features |= wgpu_profiler::GpuProfiler::ALL_WGPU_TIMER_FEATURES; + }; let (device, queue) = adapter .request_device( &wgpu::DeviceDescriptor { label: None, - features: features - & (wgpu::Features::TIMESTAMP_QUERY | wgpu::Features::CLEAR_TEXTURE), + features: features & maybe_features, limits, }, None,