Add GPU profiling (#304)

* Add GPU profiling

* Fix conditional compilation for `headless`

* Add full profiling

* Productionise

* Fix macOS trace file

* Try to make it easier to tell what's important

* Resolve CI issues
This commit is contained in:
Daniel McNab 2023-06-01 16:10:27 +01:00 committed by GitHub
parent 03545e5d9a
commit 6d57093cc2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 336 additions and 16 deletions

1
.gitignore vendored
View file

@ -1,5 +1,6 @@
/target
Cargo.lock
/trace.json
examples/assets/downloads/*
!examples/assets/downloads/.tracked

View file

@ -50,15 +50,18 @@ futures-intrusive = "0.5.0"
parking_lot = "0.12"
smallvec = "1.8.0"
vello_encoding = { path = "crates/encoding" }
wgpu-profiler = { workspace = true, optional = true }
[workspace.dependencies]
bytemuck = { version = "1.12.1", features = ["derive"] }
fello = { git = "https://github.com/dfrg/fount", rev = "58a284eaae67512fb61cf76177c5d33238d79cb1" }
peniko = { git = "https://github.com/linebender/peniko", rev = "cafdac9a211a0fb2fec5656bd663d1ac770bcc81" }
wgpu = "0.16" # NOTE: Make sure to keep this in sync with the version badge in README.md
wgpu = "0.16" # NOTE: Make sure to keep this in sync with the version badge in README.md
# Used for examples
clap = "4.1.0"
anyhow = "1.0"
instant = { version = "0.1.12", features = [ "wasm-bindgen" ] }
instant = { version = "0.1.12", features = ["wasm-bindgen"] }
pollster = "0.3.0"
wgpu-profiler = "0.12.1"

View file

@ -89,6 +89,7 @@ async fn render(mut scenes: SceneSet, index: usize, args: &Args) -> Result<()> {
device,
&RendererOptions {
surface_format: None,
timestamp_period: queue.get_timestamp_period(),
},
)
.or_else(|_| bail!("Got non-Send/Sync error from creating renderer"))?;

View file

@ -22,11 +22,14 @@ struct VelloRenderer(Renderer);
impl FromWorld for VelloRenderer {
fn from_world(world: &mut World) -> Self {
let device = world.get_resource::<RenderDevice>().unwrap();
let queue = world.get_resource::<RenderQueue>().unwrap();
VelloRenderer(
Renderer::new(
device.wgpu_device(),
&RendererOptions {
surface_format: None,
timestamp_period: queue.0.get_timestamp_period(),
},
)
.unwrap(),

View file

@ -20,12 +20,13 @@ name = "with_winit_bin"
path = "src/main.rs"
[dependencies]
vello = { path = "../../", features = ["buffer_labels"] }
vello = { path = "../../", features = ["buffer_labels", "wgpu-profiler"] }
scenes = { path = "../scenes" }
anyhow = { workspace = true }
clap = { workspace = true, features = ["derive"] }
instant = { workspace = true }
pollster = { workspace = true }
wgpu-profiler = { workspace = true }
wgpu = { workspace = true }
winit = "0.28.1"

View file

@ -14,7 +14,7 @@
//
// Also licensed under MIT license, at your choice.
use instant::Instant;
use instant::{Duration, Instant};
use std::collections::HashSet;
use anyhow::Result;
@ -83,6 +83,7 @@ fn run(
&render_cx.devices[id].device,
&RendererOptions {
surface_format: Some(render_state.surface.format),
timestamp_period: render_cx.devices[id].queue.get_timestamp_period(),
},
)
.expect("Could create renderer"),
@ -121,7 +122,9 @@ fn run(
if let Some(set_scene) = args.scene {
scene_ix = set_scene;
}
let mut profile_stored = None;
let mut prev_scene_ix = scene_ix - 1;
let mut profile_taken = Instant::now();
// _event_loop is used on non-wasm platforms to create new windows
event_loop.run(move |event, _event_loop, control_flow| match event {
Event::WindowEvent {
@ -163,6 +166,29 @@ fn run(
Some(VirtualKeyCode::C) => {
stats.clear_min_and_max();
}
Some(VirtualKeyCode::P) => {
if let Some(renderer) = &renderers[render_state.surface.dev_id] {
if let Some(profile_result) = &renderer
.profile_result
.as_ref()
.or(profile_stored.as_ref())
{
// There can be empty results if the required features aren't supported
if !profile_result.is_empty() {
let path = std::path::Path::new("trace.json");
match wgpu_profiler::chrometrace::write_chrometrace(
path,
profile_result,
) {
Ok(()) => {
println!("Wrote trace to path {path:?}")
}
Err(e) => eprintln!("Failed to write trace {e}"),
}
}
}
}
}
Some(VirtualKeyCode::V) => {
vsync_on = !vsync_on;
render_cx.set_present_mode(
@ -342,6 +368,25 @@ fn run(
complexity_shown.then_some(scene_complexity).flatten(),
vsync_on,
);
if let Some(profiling_result) = renderers[render_state.surface.dev_id]
.as_mut()
.and_then(|it| it.profile_result.take())
{
if profile_stored.is_none() || profile_taken.elapsed() > Duration::from_secs(1)
{
profile_stored = Some(profiling_result);
profile_taken = Instant::now();
}
}
if let Some(profiling_result) = profile_stored.as_ref() {
stats::draw_gpu_profiling(
&mut builder,
scene_params.text,
width as f64,
height as f64,
profiling_result,
)
}
}
let surface_texture = render_state
.surface
@ -438,6 +483,9 @@ fn run(
&render_cx.devices[id].device,
&RendererOptions {
surface_format: Some(render_state.surface.format),
timestamp_period: render_cx.devices[id]
.queue
.get_timestamp_period(),
},
)
.expect("Could create renderer")

View file

@ -15,12 +15,13 @@
// Also licensed under MIT license, at your choice.
use scenes::SimpleText;
use std::collections::VecDeque;
use std::{collections::VecDeque, time::Duration};
use vello::{
kurbo::{Affine, PathEl, Rect},
kurbo::{Affine, Line, PathEl, Rect},
peniko::{Brush, Color, Fill, Stroke},
BumpAllocators, SceneBuilder,
};
use wgpu_profiler::GpuTimerScopeResult;
const SLIDING_WINDOW_SIZE: usize = 100;
@ -247,3 +248,204 @@ impl Stats {
/// Rounds `n` up to a positive multiple of `f`.
///
/// For `n >= 1` this returns the smallest multiple of `f` that is greater
/// than or equal to `n` (a value that is already a multiple is returned
/// unchanged). `n == 0` is treated like `n == 1` and yields `f`, so the
/// result is never zero.
///
/// # Panics
///
/// Panics if `f` is zero (remainder by zero).
fn round_up(n: usize, f: usize) -> usize {
    // `n - 1` would underflow for `n == 0`: a panic in debug builds and a
    // wrapped, meaningless value in release builds. Saturating keeps every
    // result for `n >= 1` unchanged while making zero well-defined.
    let below = n.saturating_sub(1);
    below - below % f + f
}
/// Palette for the GPU profiling overlay.
///
/// `draw_gpu_profiling` indexes this table with the running scope index
/// modulo its length, so the colours repeat once every entry has been used.
const COLORS: &[Color] = &[
Color::AQUA,
Color::RED,
Color::ALICE_BLUE,
Color::YELLOW,
Color::GREEN,
Color::BLUE,
Color::ORANGE,
Color::WHITE,
];
/// Draws the GPU profiling overlay along the left edge of the viewport,
/// aligned to its bottom.
///
/// The overlay is a translucent black panel containing:
/// - a header with the total GPU time and a hint about saving a trace;
/// - a vertical timeline in which every scope in `profiles` (nested scopes
///   included) is a coloured bar spanning its start and end time, indented
///   by nesting depth;
/// - one label per scope with its duration and (truncated) name. Scopes that
///   are considered slow get larger, brighter text, and slow leaf scopes are
///   connected to their bar with a line.
///
/// Nothing is drawn when `profiles` is empty.
///
/// # Parameters
///
/// - `sb`: scene builder that receives the overlay geometry.
/// - `text`: text helper used to emit the labels.
/// - `viewport_width`, `viewport_height`: viewport size, in the same units
///   as the scene coordinates.
/// - `profiles`: root timer scopes for a frame; their times are treated as
///   seconds.
pub fn draw_gpu_profiling(
    sb: &mut SceneBuilder,
    text: &mut SimpleText,
    viewport_width: f64,
    viewport_height: f64,
    profiles: &[GpuTimerScopeResult],
) {
    if profiles.is_empty() {
        return;
    }
    let width = (viewport_width * 0.3).clamp(150., 450.);
    let height = width * 1.5;
    let y_offset = viewport_height - height;
    let offset = Affine::translate((0., y_offset));

    // Draw the background
    sb.fill(
        Fill::NonZero,
        offset,
        &Brush::Solid(Color::rgba8(0, 0, 0, 200)),
        None,
        &Rect::new(0., 0., width, height),
    );

    // Find the range of the samples, so we can normalise them
    let mut min = f64::MAX;
    let mut max = f64::MIN;
    let mut max_depth = 0;
    let mut depth = 0;
    let mut count = 0;
    traverse_profiling(profiles, &mut |profile, stage| {
        match stage {
            TraversalStage::Enter => {
                count += 1;
                min = min.min(profile.time.start);
                max = max.max(profile.time.end);
                max_depth = max_depth.max(depth);
                // Apply a higher depth to the children
                depth += 1;
            }
            TraversalStage::Leave => depth -= 1,
        }
    });
    let total_time = max - min;

    {
        let labels = [
            format!("GPU Time: {:.2?}", Duration::from_secs_f64(total_time)),
            "Press P to save a trace".to_string(),
        ];

        // height / 5 is dedicated to the text labels and the rest is filled by the frame time.
        let text_height = height * 0.2 / (1 + labels.len()) as f64;
        let left_margin = width * 0.01;
        let text_size = (text_height * 0.9) as f32;
        // Each label is emitted exactly once; drawing the same text twice at
        // the same position only adds redundant overdraw.
        for (i, label) in labels.iter().enumerate() {
            text.add(
                sb,
                None,
                text_size,
                Some(&Brush::Solid(Color::WHITE)),
                offset * Affine::translate((left_margin, (i + 1) as f64 * text_height)),
                label,
            );
        }
    }

    let timeline_start_y = height * 0.21;
    let timeline_range_y = height * 0.78;
    let timeline_range_end = timeline_start_y + timeline_range_y;

    // Maps a timestamp to a y position on the timeline. If every scope has
    // the same start and end time, `total_time` is zero and the division
    // would produce NaN coordinates, so everything collapses onto the start
    // of the timeline instead.
    let to_timeline_y = |time: f64| {
        if total_time > 0. {
            ((time - min) / total_time) * timeline_range_y + timeline_start_y
        } else {
            timeline_start_y
        }
    };

    // Add 6 items worth of margin
    let text_height = timeline_range_y / (6 + count) as f64;
    let left_margin = width * 0.35;
    let mut cur_text_y = timeline_start_y;
    let mut cur_index = 0;
    let mut depth = 0;
    // Leave 1 bar's worth of margin
    let depth_width = width * 0.28 / (max_depth + 1) as f64;
    let depth_size = depth_width * 0.8;
    traverse_profiling(profiles, &mut |profile, stage| {
        if let TraversalStage::Enter = stage {
            let start_normalised = to_timeline_y(profile.time.start);
            let end_normalised = to_timeline_y(profile.time.end);

            let color = COLORS[cur_index % COLORS.len()];
            let x = width * 0.01 + (depth as f64 * depth_width);
            sb.fill(
                Fill::NonZero,
                offset,
                &Brush::Solid(color),
                None,
                &Rect::new(x, start_normalised, x + depth_size, end_normalised),
            );

            let mut text_start = start_normalised;
            let nested = !profile.nested_scopes.is_empty();
            if nested {
                // If we have children, leave some more space for them
                text_start -= text_height * 0.7;
            }
            let this_time = profile.time.end - profile.time.start;
            // Highlight as important if at least 5% of the total time, or at least 1ms
            let slow = this_time * 20. >= total_time || this_time >= 0.001;
            let text_y = text_start
                // Ensure that we don't overlap the previous item
                .max(cur_text_y)
                // Ensure that all remaining items can fit
                .min(timeline_range_end - (count - cur_index) as f64 * text_height);
            let (text_height, text_color) = if slow {
                (text_height, Color::WHITE)
            } else {
                (text_height * 0.6, Color::LIGHT_GRAY)
            };
            let text_size = (text_height * 0.9) as f32;
            // Text is specified by the baseline, but the y positions all refer to the top of the text
            cur_text_y = text_y + text_height;
            let label = format!(
                "{:.2?} - {:.30}",
                Duration::from_secs_f64(this_time),
                profile.label
            );
            // Colour swatch next to the label, matching the scope's bar
            sb.fill(
                Fill::NonZero,
                offset,
                &Brush::Solid(color),
                None,
                &Rect::new(
                    width * 0.31,
                    cur_text_y - text_size as f64 * 0.7,
                    width * 0.34,
                    cur_text_y,
                ),
            );
            text.add(
                sb,
                None,
                text_size,
                Some(&Brush::Solid(text_color)),
                offset * Affine::translate((left_margin, cur_text_y)),
                &label,
            );
            if !nested && slow {
                // Connect the middle of the bar to the middle of the swatch
                sb.stroke(
                    &Stroke::new(2.),
                    offset,
                    &Brush::Solid(color),
                    None,
                    &Line::new(
                        (x + depth_size, (end_normalised + start_normalised) / 2.),
                        (width * 0.31, cur_text_y - text_size as f64 * 0.35),
                    ),
                );
            }
            cur_index += 1;
            // Higher depth applies only to the children
            depth += 1;
        } else {
            depth -= 1;
        }
    });
}
/// Tells a `traverse_profiling` callback which side of a scope it is on.
enum TraversalStage {
/// The scope is being visited, before any of its nested scopes.
Enter,
/// All of the scope's nested scopes have been visited.
Leave,
}
/// Walks a forest of profiling scopes depth-first.
///
/// `callback` is invoked twice per scope: once with
/// [`TraversalStage::Enter`] before the scope's `nested_scopes` are walked,
/// and once with [`TraversalStage::Leave`] after they have all been walked.
/// Sibling scopes are visited in slice order.
fn traverse_profiling(
    profiles: &[GpuTimerScopeResult],
    callback: &mut impl FnMut(&GpuTimerScopeResult, TraversalStage),
) {
    profiles.iter().for_each(|scope| {
        callback(scope, TraversalStage::Enter);
        // Reborrow so the same callback can be handed down and used again
        // once the recursive walk returns.
        traverse_profiling(&scope.nested_scopes, &mut *callback);
        callback(scope, TraversalStage::Leave);
    });
}

View file

@ -22,8 +22,8 @@ use std::{
};
use wgpu::{
BindGroup, BindGroupLayout, Buffer, BufferUsages, ComputePipeline, Device, Queue, Texture,
TextureAspect, TextureUsages, TextureView, TextureViewDimension,
BindGroup, BindGroupLayout, Buffer, BufferUsages, CommandEncoderDescriptor, ComputePipeline,
Device, Queue, Texture, TextureAspect, TextureUsages, TextureView, TextureViewDimension,
};
pub type Error = Box<dyn std::error::Error>;
@ -46,6 +46,7 @@ pub struct Engine {
struct Shader {
pipeline: ComputePipeline,
bind_group_layout: BindGroupLayout,
label: &'static str,
}
#[derive(Default)]
@ -238,6 +239,7 @@ impl Engine {
let shader = Shader {
pipeline,
bind_group_layout,
label,
};
let id = self.shaders.len();
self.shaders.push(shader);
@ -250,11 +252,16 @@ impl Engine {
queue: &Queue,
recording: &Recording,
external_resources: &[ExternalResource],
label: &'static str,
#[cfg(feature = "wgpu-profiler")] profiler: &mut wgpu_profiler::GpuProfiler,
) -> Result<(), Error> {
let mut free_bufs: HashSet<Id> = Default::default();
let mut free_images: HashSet<Id> = Default::default();
let mut encoder = device.create_command_encoder(&Default::default());
let mut encoder =
device.create_command_encoder(&CommandEncoderDescriptor { label: Some(label) });
#[cfg(feature = "wgpu-profiler")]
profiler.begin_scope(label, &mut encoder, device);
for command in &recording.commands {
match command {
Command::Upload(buf_proxy, bytes) => {
@ -366,9 +373,13 @@ impl Engine {
&mut self.pool,
)?;
let mut cpass = encoder.begin_compute_pass(&Default::default());
#[cfg(feature = "wgpu-profiler")]
profiler.begin_scope(shader.label, &mut cpass, device);
cpass.set_pipeline(&shader.pipeline);
cpass.set_bind_group(0, &bind_group, &[]);
cpass.dispatch_workgroups(wg_size.0, wg_size.1, wg_size.2);
#[cfg(feature = "wgpu-profiler")]
profiler.end_scope(&mut cpass);
}
Command::Download(proxy) => {
let src_buf = self
@ -407,6 +418,8 @@ impl Engine {
}
}
}
#[cfg(feature = "wgpu-profiler")]
profiler.end_scope(&mut encoder);
queue.submit(Some(encoder.finish()));
for id in free_bufs {
if let Some(buf) = self.bind_map.buf_map.remove(&id) {

View file

@ -40,6 +40,8 @@ use shaders::FullShaders;
/// Temporary export, used in with_winit for stats
pub use vello_encoding::BumpAllocators;
use wgpu::{Device, Queue, SurfaceTexture, TextureFormat, TextureView};
#[cfg(feature = "wgpu-profiler")]
use wgpu_profiler::GpuProfiler;
/// Catch-all error type.
pub type Error = Box<dyn std::error::Error>;
@ -53,6 +55,10 @@ pub struct Renderer {
shaders: FullShaders,
blit: Option<BlitPipeline>,
target: Option<TargetTexture>,
#[cfg(feature = "wgpu-profiler")]
profiler: GpuProfiler,
#[cfg(feature = "wgpu-profiler")]
pub profile_result: Option<Vec<wgpu_profiler::GpuTimerScopeResult>>,
}
/// Parameters used in a single render that are configurable by the client.
@ -70,6 +76,9 @@ pub struct RendererOptions {
/// The format of the texture used for surfaces with this renderer/device
/// If None, the renderer cannot be used with surfaces
pub surface_format: Option<TextureFormat>,
/// The timestamp period from [`wgpu::Queue::get_timestamp_period`]
/// Used when the wgpu-profiler feature is enabled
pub timestamp_period: f32,
}
impl Renderer {
@ -85,6 +94,11 @@ impl Renderer {
shaders,
blit,
target: None,
// Use 3 pending frames
#[cfg(feature = "wgpu-profiler")]
profiler: GpuProfiler::new(3, render_options.timestamp_period, device.features()),
#[cfg(feature = "wgpu-profiler")]
profile_result: None,
})
}
@ -106,8 +120,15 @@ impl Renderer {
*target.as_image().unwrap(),
texture,
)];
self.engine
.run_recording(device, queue, &recording, &external_resources)?;
self.engine.run_recording(
device,
queue,
&recording,
&external_resources,
"render_to_texture",
#[cfg(feature = "wgpu-profiler")]
&mut self.profiler,
)?;
Ok(())
}
@ -217,7 +238,15 @@ impl Renderer {
let recording = render.render_encoding_coarse(encoding, &self.shaders, params, true);
let target = render.out_image();
let bump_buf = render.bump_buf();
self.engine.run_recording(device, queue, &recording, &[])?;
self.engine.run_recording(
device,
queue,
&recording,
&[],
"t_async_coarse",
#[cfg(feature = "wgpu-profiler")]
&mut self.profiler,
)?;
let mut bump: Option<BumpAllocators> = None;
if let Some(bump_buf) = self.engine.get_download(bump_buf) {
@ -239,8 +268,15 @@ impl Renderer {
let mut recording = Recording::default();
render.record_fine(&self.shaders, &mut recording);
let external_resources = [ExternalResource::Image(target, texture)];
self.engine
.run_recording(device, queue, &recording, &external_resources)?;
self.engine.run_recording(
device,
queue,
&recording,
&external_resources,
"t_async_fine",
#[cfg(feature = "wgpu-profiler")]
&mut self.profiler,
)?;
Ok(bump)
}
@ -301,8 +337,16 @@ impl Renderer {
render_pass.set_bind_group(0, &bind_group, &[]);
render_pass.draw(0..6, 0..1);
}
#[cfg(feature = "wgpu-profiler")]
self.profiler.resolve_queries(&mut encoder);
queue.submit(Some(encoder.finish()));
self.target = Some(target);
#[cfg(feature = "wgpu-profiler")]
self.profiler.end_frame().unwrap();
#[cfg(feature = "wgpu-profiler")]
if let Some(result) = self.profiler.process_finished_frame() {
self.profile_result = Some(result);
}
Ok(bump)
}
}

View file

@ -134,12 +134,16 @@ impl RenderContext {
.await?;
let features = adapter.features();
let limits = Limits::default();
let mut maybe_features = wgpu::Features::CLEAR_TEXTURE;
#[cfg(feature = "wgpu-profiler")]
{
maybe_features |= wgpu_profiler::GpuProfiler::ALL_WGPU_TIMER_FEATURES;
};
let (device, queue) = adapter
.request_device(
&wgpu::DeviceDescriptor {
label: None,
features: features
& (wgpu::Features::TIMESTAMP_QUERY | wgpu::Features::CLEAR_TEXTURE),
features: features & maybe_features,
limits,
},
None,