diff --git a/examples/with_winit/src/main.rs b/examples/with_winit/src/main.rs index 9a28f01..d3daf97 100644 --- a/examples/with_winit/src/main.rs +++ b/examples/with_winit/src/main.rs @@ -22,6 +22,7 @@ use std::{borrow::Cow, path::PathBuf, time::Instant}; use clap::Parser; use vello::{ + block_on_wgpu, kurbo::{Affine, Vec2}, util::RenderContext, Renderer, Scene, SceneBuilder, @@ -187,6 +188,24 @@ async fn run(event_loop: EventLoop<UserEvent>, window: Window, args: Args) { .surface .get_current_texture() .expect("failed to get surface texture"); + #[cfg(not(target_arch = "wasm32"))] + { + block_on_wgpu( + &device_handle.device, + renderer.render_to_surface_async( + &device_handle.device, + &device_handle.queue, + &scene, + &surface_texture, + width, + height, + ), + ) + .expect("failed to render to surface"); + } + // Note: in the wasm case, we're currently not running the robust + // pipeline, as it requires more async wiring for the readback. + #[cfg(target_arch = "wasm32")] renderer .render_to_surface( &device_handle.device, @@ -198,7 +217,7 @@ async fn run(event_loop: EventLoop<UserEvent>, window: Window, args: Args) { ) .expect("failed to render to surface"); surface_texture.present(); - device_handle.device.poll(wgpu::Maintain::Wait); + device_handle.device.poll(wgpu::Maintain::Poll); } Event::UserEvent(event) => match event { #[cfg(not(target_arch = "wasm32"))] diff --git a/src/engine.rs b/src/engine.rs index d3750a6..d3f152e 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -16,17 +16,14 @@ use std::{ borrow::Cow, - collections::{hash_map::Entry, HashMap}, + collections::{hash_map::Entry, HashMap, HashSet}, num::{NonZeroU32, NonZeroU64}, sync::atomic::{AtomicU64, Ordering}, }; -use futures_intrusive::channel::shared::GenericOneshotReceiver; -use parking_lot::RawMutex; use wgpu::{ - util::DeviceExt, BindGroup, BindGroupLayout, Buffer, BufferAsyncError, BufferSlice, - BufferUsages, BufferView, ComputePipeline, Device, Queue, Texture, TextureAspect, - TextureFormat, TextureUsages, TextureView, TextureViewDimension, + util::DeviceExt, BindGroup, BindGroupLayout, Buffer, BufferUsages, ComputePipeline, Device, + Queue, Texture, TextureAspect, TextureFormat, TextureUsages, TextureView, TextureViewDimension, }; pub type Error = Box<dyn std::error::Error>; @@ -42,6 +39,8 @@ static ID_COUNTER: AtomicU64 = AtomicU64::new(0); pub struct Engine { shaders: Vec<Shader>, pool: ResourcePool, + bind_map: BindMap, + downloads: HashMap<Id, Buffer>, } struct Shader { @@ -96,11 +95,8 @@ pub enum Command { Dispatch(ShaderId, (u32, u32, u32), Vec<ResourceProxy>), Download(BufProxy), Clear(BufProxy, u64, Option<NonZeroU64>), -} - -#[derive(Default)] -pub struct Downloads { - buf_map: HashMap<Id, Buffer>, + FreeBuf(BufProxy), + FreeImage(ImageProxy), } /// The type of resource that will be bound to a slot in a shader.
@@ -149,6 +145,8 @@ impl Engine { Engine { shaders: vec![], pool: Default::default(), + bind_map: Default::default(), + downloads: Default::default(), } } @@ -249,9 +247,9 @@ impl Engine { queue: &Queue, recording: &Recording, external_resources: &[ExternalResource], - ) -> Result<Downloads, Error> { - let mut bind_map = BindMap::default(); - let mut downloads = Downloads::default(); + ) -> Result<(), Error> { + let mut free_bufs: HashSet<Id> = Default::default(); + let mut free_images: HashSet<Id> = Default::default(); let mut encoder = device.create_command_encoder(&Default::default()); for command in &recording.commands { @@ -259,18 +257,22 @@ impl Engine { Command::Upload(buf_proxy, bytes) => { let usage = BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE; - let buf = self.pool.get_buf(buf_proxy, usage, device); + let buf = self + .pool + .get_buf(buf_proxy.size, buf_proxy.name, usage, device); // TODO: if buffer is newly created, might be better to make it mapped at creation // and copy. However, we expect reuse will be most common. queue.write_buffer(&buf, 0, bytes); - bind_map.insert_buf(buf_proxy, buf); + self.bind_map.insert_buf(buf_proxy, buf); } Command::UploadUniform(buf_proxy, bytes) => { let usage = BufferUsages::UNIFORM | BufferUsages::COPY_DST; // Same consideration as above - let buf = self.pool.get_buf(buf_proxy, usage, device); + let buf = self + .pool + .get_buf(buf_proxy.size, buf_proxy.name, usage, device); queue.write_buffer(&buf, 0, bytes); - bind_map.insert_buf(buf_proxy, buf); + self.bind_map.insert_buf(buf_proxy, buf); } Command::UploadImage(image_proxy, bytes) => { let buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { @@ -322,12 +324,13 @@ impl Engine { depth_or_array_layers: 1, }, ); - bind_map.insert_image(image_proxy.id, texture, texture_view) + self.bind_map + .insert_image(image_proxy.id, texture, texture_view) } Command::Dispatch(shader_id, wg_size, bindings) => { // println!("dispatching {:?} with {} bindings", wg_size, bindings.len()); let shader = &self.shaders[shader_id.0]; - let bind_group = bind_map.create_bind_group( + let bind_group = self.bind_map.create_bind_group( device, &shader.bind_group_layout, bindings, @@ -340,18 +343,20 @@ impl Engine { cpass.dispatch_workgroups(wg_size.0, wg_size.1, wg_size.2); } Command::Download(proxy) => { - let src_buf = bind_map.buf_map.get(&proxy.id).ok_or("buffer not in map")?; - let buf = device.create_buffer(&wgpu::BufferDescriptor { - label: Some(proxy.name), - size: proxy.size, - usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, - mapped_at_creation: false, - }); + let src_buf = self + .bind_map + .buf_map + .get(&proxy.id) + .ok_or("buffer not in map")?; + let usage = BufferUsages::MAP_READ | BufferUsages::COPY_DST; + let buf = self.pool.get_buf(proxy.size, "download", usage, device); encoder.copy_buffer_to_buffer(&src_buf.buffer, 0, &buf, 0, proxy.size); - downloads.buf_map.insert(proxy.id, buf); + self.downloads.insert(proxy.id, buf); } Command::Clear(proxy, offset, size) => { - let buffer = bind_map.get_or_create(*proxy, device, &mut self.pool)?; + let buffer = self + .bind_map + .get_or_create(*proxy, device, &mut self.pool)?; #[cfg(not(target_arch = "wasm32"))] encoder.clear_buffer(buffer, *offset, *size); #[cfg(target_arch = "wasm32")] @@ -366,11 +371,42 @@ impl Engine { queue.write_buffer(buffer, *offset, &zeros); } } + Command::FreeBuf(proxy) => { + free_bufs.insert(proxy.id); + } + Command::FreeImage(proxy) => { + free_images.insert(proxy.id); + } } }
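+ // Note: the free commands only record ids here; the underlying buffers and images are returned to the pool after the submission below, so a free issued mid-recording does not invalidate later dispatches in the same recording.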
queue.submit(Some(encoder.finish())); - self.pool.reap_bindmap(bind_map); - Ok(downloads) + for id in free_bufs { + if let Some(buf) = self.bind_map.buf_map.remove(&id) { + let props = BufferProperties { + size: buf.buffer.size(), + usages: buf.buffer.usage(), + #[cfg(feature = "buffer_labels")] + name: buf.label, + }; + self.pool.bufs.entry(props).or_default().push(buf.buffer); + } + } + for id in free_images { + if let Some((texture, view)) = self.bind_map.image_map.remove(&id) { + // TODO: have a pool to avoid needless re-allocation + drop(texture); + drop(view); + } + } + Ok(()) + } + + pub fn get_download(&self, buf: BufProxy) -> Option<&Buffer> { + self.downloads.get(&buf.id) + } + + pub fn free_download(&mut self, buf: BufProxy) { + self.downloads.remove(&buf.id); } } @@ -418,6 +454,10 @@ impl Recording { )); } + /// Prepare a buffer for downloading. + /// + /// Currently this copies to a download buffer. The original buffer can be freed + /// immediately after. pub fn download(&mut self, buf: BufProxy) { self.push(Command::Download(buf)); } @@ -425,6 +465,21 @@ impl Recording { pub fn clear_all(&mut self, buf: BufProxy) { self.push(Command::Clear(buf, 0, None)); } + + pub fn free_buf(&mut self, buf: BufProxy) { + self.push(Command::FreeBuf(buf)); + } + + pub fn free_image(&mut self, image: ImageProxy) { + self.push(Command::FreeImage(image)); + } + + pub fn free_resource(&mut self, resource: ResourceProxy) { + match resource { + ResourceProxy::Buf(buf) => self.free_buf(buf), + ResourceProxy::Image(image) => self.free_image(image), + } + } } impl BufProxy { @@ -565,7 +620,7 @@ impl BindMap { if let Entry::Vacant(v) = self.buf_map.entry(proxy.id) { let usage = BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE; - let buf = pool.get_buf(&proxy, usage, device); + let buf = pool.get_buf(proxy.size, proxy.name, usage, device); v.insert(BindMapBuffer { buffer: buf, label: proxy.name, @@ -647,7 +702,7 @@ impl BindMap { Entry::Occupied(occupied) => Ok(&occupied.into_mut().buffer), Entry::Vacant(vacant) => { let usage = BufferUsages::COPY_SRC | BufferUsages::COPY_DST | BufferUsages::STORAGE; - let buf = pool.get_buf(&proxy, usage, device); + let buf = pool.get_buf(proxy.size, proxy.name, usage, device); Ok(&vacant .insert(BindMapBuffer { buffer: buf, @@ -659,53 +714,23 @@ impl BindMap { } } -pub struct DownloadsMapped<'a>( - HashMap< - Id, - ( - BufferSlice<'a>, - GenericOneshotReceiver<RawMutex, Result<(), BufferAsyncError>>, - ), - >, -); - -impl Downloads { - // Discussion: should API change so we get one buffer, rather than mapping all? - pub fn map(&self) -> DownloadsMapped { - let mut map = HashMap::new(); - for (id, buf) in &self.buf_map { - let buf_slice = buf.slice(..); - let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel(); - buf_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap()); - map.insert(*id, (buf_slice, receiver)); - } - DownloadsMapped(map) - } -} - -impl<'a> DownloadsMapped<'a> { - pub async fn get_mapped(&self, proxy: BufProxy) -> Result<BufferView, Error> { - let (slice, recv) = self.0.get(&proxy.id).ok_or("buffer not in map")?; - if let Some(recv_result) = recv.receive().await { - recv_result?; - } else { - return Err("channel was closed".into()); - } - Ok(slice.get_mapped_range()) - } -} - const SIZE_CLASS_BITS: u32 = 1; impl ResourcePool { /// Get a buffer from the pool or create one.
- fn get_buf(&mut self, proxy: &BufProxy, usage: BufferUsages, device: &Device) -> Buffer { - let rounded_size = Self::size_class(proxy.size, SIZE_CLASS_BITS); + fn get_buf( + &mut self, + size: u64, + name: &'static str, + usage: BufferUsages, + device: &Device, + ) -> Buffer { + let rounded_size = Self::size_class(size, SIZE_CLASS_BITS); let props = BufferProperties { size: rounded_size, usages: usage, #[cfg(feature = "buffer_labels")] - name: proxy.name, + name, }; if let Some(buf_vec) = self.bufs.get_mut(&props) { if let Some(buf) = buf_vec.pop() { @@ -714,7 +739,7 @@ impl ResourcePool { } device.create_buffer(&wgpu::BufferDescriptor { #[cfg(feature = "buffer_labels")] - label: Some(proxy.name), + label: Some(name), #[cfg(not(feature = "buffer_labels"))] label: None, size: rounded_size, @@ -723,19 +748,6 @@ impl ResourcePool { }) } - fn reap_bindmap(&mut self, bind_map: BindMap) { - for (_id, buf) in bind_map.buf_map { - let size = buf.buffer.size(); - let props = BufferProperties { - size, - usages: buf.buffer.usage(), - #[cfg(feature = "buffer_labels")] - name: buf.label, - }; - self.bufs.entry(props).or_default().push(buf.buffer); - } - } - /// Quantize a size up to the nearest size class. fn size_class(x: u64, bits: u32) -> u64 { if x > 1 << bits { diff --git a/src/lib.rs b/src/lib.rs index 6dc7bb2..7217311 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,9 +29,11 @@ pub mod encoding; pub mod glyph; pub mod util; +use render::Render; pub use scene::{Scene, SceneBuilder, SceneFragment}; +pub use util::block_on_wgpu; -use engine::{Engine, ExternalResource}; +use engine::{Engine, ExternalResource, Recording}; use shaders::FullShaders; use wgpu::{Device, Queue, SurfaceTexture, TextureFormat, TextureView}; @@ -83,8 +85,7 @@ impl Renderer { *target.as_image().unwrap(), texture, )]; - let _ = self - .engine + self.engine .run_recording(device, queue, &recording, &external_resources)?; Ok(()) } @@ -164,6 +165,105 @@ impl Renderer { self.shaders = shaders; Ok(()) } + + /// Renders a scene to the target texture. + /// + /// The texture is assumed to be of the specified dimensions and have been created with + /// the [wgpu::TextureFormat::Rgba8Unorm] format and the [wgpu::TextureUsages::STORAGE_BINDING] + /// flag set. + pub async fn render_to_texture_async( + &mut self, + device: &Device, + queue: &Queue, + scene: &Scene, + texture: &TextureView, + width: u32, + height: u32, + ) -> Result<()> { + let mut render = Render::new(); + let encoding = scene.data(); + let recording = render.render_encoding_coarse(encoding, &self.shaders, width, height, true); + let target = render.out_image(); + let bump_buf = render.bump_buf(); + self.engine.run_recording(device, queue, &recording, &[])?; + if let Some(bump_buf) = self.engine.get_download(bump_buf) { + let buf_slice = bump_buf.slice(..); + let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel(); + buf_slice.map_async(wgpu::MapMode::Read, move |v| sender.send(v).unwrap()); + if let Some(recv_result) = receiver.receive().await { + recv_result?; + } else { + return Err("channel was closed".into()); + } + let mapped = buf_slice.get_mapped_range(); + println!("{:?}", bytemuck::cast_slice::<_, u32>(&mapped)); + } + // TODO: apply logic to determine whether we need to rerun coarse, and also + // allocate the blend stack as needed. + self.engine.free_download(bump_buf); + // Maybe clear to reuse allocation?
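+ // The bump buffer readback above is the hook where overflow of the bump-allocated buffers would be detected before committing to the fine stage.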
+ let mut recording = Recording::default(); + render.record_fine(&self.shaders, &mut recording); + let external_resources = [ExternalResource::Image(target, texture)]; + self.engine + .run_recording(device, queue, &recording, &external_resources)?; + Ok(()) + } + + pub async fn render_to_surface_async( + &mut self, + device: &Device, + queue: &Queue, + scene: &Scene, + surface: &SurfaceTexture, + width: u32, + height: u32, + ) -> Result<()> { + let mut target = self + .target + .take() + .unwrap_or_else(|| TargetTexture::new(device, width, height)); + // TODO: implement clever resizing semantics here to avoid thrashing the memory allocator + // during resize, specifically on metal. + if target.width != width || target.height != height { + target = TargetTexture::new(device, width, height); + } + self.render_to_texture_async(device, queue, scene, &target.view, width, height) + .await?; + let mut encoder = + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); + { + let surface_view = surface + .texture + .create_view(&wgpu::TextureViewDescriptor::default()); + let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { + label: None, + layout: &self.blit.bind_layout, + entries: &[wgpu::BindGroupEntry { + binding: 0, + resource: wgpu::BindingResource::TextureView(&target.view), + }], + }); + let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor { + label: None, + color_attachments: &[Some(wgpu::RenderPassColorAttachment { + view: &surface_view, + resolve_target: None, + ops: wgpu::Operations { + load: wgpu::LoadOp::Clear(wgpu::Color::default()), + store: true, + }, + })], + depth_stencil_attachment: None, + }); + render_pass.set_pipeline(&self.blit.pipeline); + render_pass.set_bind_group(0, &bind_group, &[]); + render_pass.draw(0..6, 0..1); + } + queue.submit(Some(encoder.finish())); + self.target = Some(target); + Ok(()) + } } struct TargetTexture { diff --git a/src/render.rs b/src/render.rs index 9136931..5874dbc 100644 --- a/src/render.rs +++ b/src/render.rs @@ -9,6 +9,34 @@ use crate::{ Scene, }; +/// State for a render in progress. +pub struct Render { + /// Size of binning and info combined buffer in u32 units + binning_info_size: u32, + /// Size of tiles buf in tiles + tiles_size: u32, + /// Size of segments buf in segments + segments_size: u32, + /// Size of per-tile command list in u32 units + ptcl_size: u32, + width_in_tiles: u32, + height_in_tiles: u32, + fine: Option<FineResources>, +} + +/// Resources produced by pipeline, needed for fine rasterization. +struct FineResources { + config_buf: ResourceProxy, + bump_buf: ResourceProxy, + tile_buf: ResourceProxy, + segments_buf: ResourceProxy, + ptcl_buf: ResourceProxy, + gradient_image: ResourceProxy, + info_bin_data_buf: ResourceProxy, + + out_image: ImageProxy, +} + const TAG_MONOID_SIZE: u64 = 12; const TAG_MONOID_FULL_SIZE: u64 = 20; const PATH_BBOX_SIZE: u64 = 24; @@ -157,288 +185,389 @@ pub fn render_full( render_encoding_full(scene.data(), shaders, width, height) } +/// Create a single recording with both coarse and fine render stages. +/// +/// This function is not recommended when the scene can be complex, as it does not +/// implement robust dynamic memory.
pub fn render_encoding_full( encoding: &Encoding, shaders: &FullShaders, width: u32, height: u32, ) -> (Recording, ResourceProxy) { - use crate::encoding::{resource::ResourceCache, PackedEncoding}; - let mut recording = Recording::default(); - let mut resources = ResourceCache::new(); - let mut packed = PackedEncoding::default(); - packed.pack(encoding, &mut resources); - let (ramp_data, ramps_width, ramps_height) = resources.ramps(packed.resources).unwrap(); - let gradient_image = if encoding.patches.is_empty() { - ResourceProxy::new_image(1, 1, ImageFormat::Rgba8) - } else { - let data: &[u8] = bytemuck::cast_slice(ramp_data); - ResourceProxy::Image(recording.upload_image( - ramps_width, - ramps_height, - ImageFormat::Rgba8, - data, - )) - }; - // TODO: calculate for real when we do rectangles - let n_pathtag = encoding.path_tags.len(); - let pathtag_padded = align_up(encoding.path_tags.len(), 4 * shaders::PATHTAG_REDUCE_WG); - let n_paths = encoding.n_paths; - let n_drawobj = n_paths; - let n_clip = encoding.n_clips; - - let new_width = next_multiple_of(width, 16); - let new_height = next_multiple_of(height, 16); - - let info_size = packed.layout.bin_data_start; - let config = crate::encoding::Config { - width_in_tiles: new_width / 16, - height_in_tiles: new_height / 16, - target_width: width, - target_height: height, - binning_size: ((1 << 20) / 4) - info_size, - tiles_size: (1 << 24) / TILE_SIZE as u32, - segments_size: (1 << 26) / SEGMENT_SIZE as u32, - ptcl_size: (1 << 25) / 4, - layout: packed.layout, - }; - // println!("{:?}", config); - let scene_buf = ResourceProxy::Buf(recording.upload("scene", packed.data)); - let config_buf = - ResourceProxy::Buf(recording.upload_uniform("config", bytemuck::bytes_of(&config))); - let info_bin_data_buf = ResourceProxy::new_buf( - (info_size + config.binning_size) as u64 * 4, - "info_bin_data_buf", - ); - let tile_buf = ResourceProxy::new_buf(config.tiles_size as u64 * TILE_SIZE, "tile_buf"); - let segments_buf = - ResourceProxy::new_buf(config.segments_size as u64 * SEGMENT_SIZE, "segments_buf"); - let ptcl_buf = ResourceProxy::new_buf(config.ptcl_size as u64 * 4, "ptcl_buf"); - - let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize); - let pathtag_large = pathtag_wgs > shaders::PATHTAG_REDUCE_WG as usize; - let reduced_size = if pathtag_large { - align_up(pathtag_wgs, shaders::PATHTAG_REDUCE_WG) - } else { - pathtag_wgs - }; - let reduced_buf = - ResourceProxy::new_buf(reduced_size as u64 * TAG_MONOID_FULL_SIZE, "reduced_buf"); - // TODO: really only need pathtag_wgs - 1 - recording.dispatch( - shaders.pathtag_reduce, - (pathtag_wgs as u32, 1, 1), - [config_buf, scene_buf, reduced_buf], - ); - let mut pathtag_parent = reduced_buf; - if pathtag_large { - let reduced2_size = shaders::PATHTAG_REDUCE_WG as usize; - let reduced2_buf = - ResourceProxy::new_buf(reduced2_size as u64 * TAG_MONOID_FULL_SIZE, "reduced2_buf"); - recording.dispatch( - shaders.pathtag_reduce2, - (reduced2_size as u32, 1, 1), - [reduced_buf, reduced2_buf], - ); - let reduced_scan_buf = ResourceProxy::new_buf( - pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE, - "reduced_scan_buf", - ); - recording.dispatch( - shaders.pathtag_scan1, - (reduced_size as u32 / shaders::PATHTAG_REDUCE_WG, 1, 1), - [reduced_buf, reduced2_buf, reduced_scan_buf], - ); - pathtag_parent = reduced_scan_buf; - } - - let tagmonoid_buf = ResourceProxy::new_buf( - pathtag_wgs as u64 * shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE, - "tagmonoid_buf", - ); - let pathtag_scan = if 
pathtag_large { - shaders.pathtag_scan_large - } else { - shaders.pathtag_scan - }; - recording.dispatch( - pathtag_scan, - (pathtag_wgs as u32, 1, 1), - [config_buf, scene_buf, pathtag_parent, tagmonoid_buf], - ); - let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG; - let path_bbox_buf = ResourceProxy::new_buf(n_paths as u64 * PATH_BBOX_SIZE, "path_bbox_buf"); - recording.dispatch( - shaders.bbox_clear, - (drawobj_wgs, 1, 1), - [config_buf, path_bbox_buf], - ); - let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE, "cubic_buf"); - let path_coarse_wgs = - (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG; - recording.dispatch( - shaders.pathseg, - (path_coarse_wgs, 1, 1), - [ - config_buf, - scene_buf, - tagmonoid_buf, - path_bbox_buf, - cubic_buf, - ], - ); - let draw_reduced_buf = - ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE, "draw_reduced_buf"); - recording.dispatch( - shaders.draw_reduce, - (drawobj_wgs, 1, 1), - [config_buf, scene_buf, draw_reduced_buf], - ); - let draw_monoid_buf = - ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE, "draw_monoid_buf"); - let clip_inp_buf = - ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_INP_SIZE, "clip_inp_buf"); - recording.dispatch( - shaders.draw_leaf, - (drawobj_wgs, 1, 1), - [ - config_buf, - scene_buf, - draw_reduced_buf, - path_bbox_buf, - draw_monoid_buf, - info_bin_data_buf, - clip_inp_buf, - ], - ); - let clip_el_buf = ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_EL_SIZE, "clip_el_buf"); - let clip_bic_buf = ResourceProxy::new_buf( - (n_clip / shaders::CLIP_REDUCE_WG) as u64 * CLIP_BIC_SIZE, - "clip_bic_buf", - ); - let clip_wg_reduce = n_clip.saturating_sub(1) / shaders::CLIP_REDUCE_WG; - if clip_wg_reduce > 0 { - recording.dispatch( - shaders.clip_reduce, - (clip_wg_reduce, 1, 1), - [ - config_buf, - clip_inp_buf, - path_bbox_buf, - clip_bic_buf, - clip_el_buf, - ], - ); - } - let clip_wg = (n_clip + shaders::CLIP_REDUCE_WG - 1) / shaders::CLIP_REDUCE_WG; - let clip_bbox_buf = ResourceProxy::new_buf(n_clip as u64 * CLIP_BBOX_SIZE, "clip_bbox_buf"); - if clip_wg > 0 { - recording.dispatch( - shaders.clip_leaf, - (clip_wg, 1, 1), - [ - config_buf, - clip_inp_buf, - path_bbox_buf, - clip_bic_buf, - clip_el_buf, - draw_monoid_buf, - clip_bbox_buf, - ], - ); - } - let draw_bbox_buf = ResourceProxy::new_buf(n_paths as u64 * DRAW_BBOX_SIZE, "draw_bbox_buf"); - let bump_buf = BufProxy::new(BUMP_SIZE, "bump_buf"); - let width_in_bins = (config.width_in_tiles + 15) / 16; - let height_in_bins = (config.height_in_tiles + 15) / 16; - let bin_header_buf = ResourceProxy::new_buf( - (256 * drawobj_wgs) as u64 * BIN_HEADER_SIZE, - "bin_header_buf", - ); - recording.clear_all(bump_buf); - let bump_buf = ResourceProxy::Buf(bump_buf); - recording.dispatch( - shaders.binning, - (drawobj_wgs, 1, 1), - [ - config_buf, - draw_monoid_buf, - path_bbox_buf, - clip_bbox_buf, - draw_bbox_buf, - bump_buf, - info_bin_data_buf, - bin_header_buf, - ], - ); - // Note: this only needs to be rounded up because of the workaround to store the tile_offset - // in storage rather than workgroup memory. 
- let n_path_aligned = align_up(n_paths as usize, 256); - let path_buf = ResourceProxy::new_buf(n_path_aligned as u64 * PATH_SIZE, "path_buf"); - let path_wgs = (n_paths + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG; - recording.dispatch( - shaders.tile_alloc, - (path_wgs, 1, 1), - [ - config_buf, - scene_buf, - draw_bbox_buf, - bump_buf, - path_buf, - tile_buf, - ], - ); - recording.dispatch( - shaders.path_coarse, - (path_coarse_wgs, 1, 1), - [ - config_buf, - scene_buf, - tagmonoid_buf, - cubic_buf, - path_buf, - bump_buf, - tile_buf, - segments_buf, - ], - ); - recording.dispatch( - shaders.backdrop, - (path_wgs, 1, 1), - [config_buf, path_buf, tile_buf], - ); - recording.dispatch( - shaders.coarse, - (width_in_bins, height_in_bins, 1), - [ - config_buf, - scene_buf, - draw_monoid_buf, - bin_header_buf, - info_bin_data_buf, - path_buf, - tile_buf, - bump_buf, - ptcl_buf, - ], - ); - let out_image = ImageProxy::new(width, height, ImageFormat::Rgba8); - recording.dispatch( - shaders.fine, - (config.width_in_tiles, config.height_in_tiles, 1), - [ - config_buf, - tile_buf, - segments_buf, - ResourceProxy::Image(out_image), - ptcl_buf, - gradient_image, - info_bin_data_buf, - ], - ); - (recording, ResourceProxy::Image(out_image)) + let mut render = Render::new(); + let mut recording = render.render_encoding_coarse(encoding, shaders, width, height, false); + let out_image = render.out_image(); + render.record_fine(shaders, &mut recording); + (recording, out_image.into()) } pub fn align_up(len: usize, alignment: u32) -> usize { len + (len.wrapping_neg() & (alignment as usize - 1)) } + +impl Render { + pub fn new() -> Self { + // These sizes are adequate for paris-30k but should probably be dialed down. + Render { + binning_info_size: (1 << 20) / 4, + tiles_size: (1 << 24) / TILE_SIZE as u32, + segments_size: (1 << 26) / SEGMENT_SIZE as u32, + ptcl_size: (1 << 25) / 4, + width_in_tiles: 0, + height_in_tiles: 0, + fine: None, + } + } + + /// Prepare a recording for the coarse rasterization phase. + /// + /// The `robust` parameter controls whether we're preparing for readback + /// of the atomic bump buffer, for robust dynamic memory.
+ pub fn render_encoding_coarse( + &mut self, + encoding: &Encoding, + shaders: &FullShaders, + width: u32, + height: u32, + robust: bool, + ) -> Recording { + use crate::encoding::{resource::ResourceCache, PackedEncoding}; + let mut recording = Recording::default(); + let mut resources = ResourceCache::new(); + let mut packed = PackedEncoding::default(); + packed.pack(encoding, &mut resources); + let (ramp_data, ramps_width, ramps_height) = resources.ramps(packed.resources).unwrap(); + let gradient_image = if encoding.patches.is_empty() { + ResourceProxy::new_image(1, 1, ImageFormat::Rgba8) + } else { + let data: &[u8] = bytemuck::cast_slice(ramp_data); + ResourceProxy::Image(recording.upload_image( + ramps_width, + ramps_height, + ImageFormat::Rgba8, + data, + )) + }; + // TODO: calculate for real when we do rectangles + let n_pathtag = encoding.path_tags.len(); + let pathtag_padded = align_up(encoding.path_tags.len(), 4 * shaders::PATHTAG_REDUCE_WG); + let n_paths = encoding.n_paths; + let n_drawobj = n_paths; + let n_clip = encoding.n_clips; + + let new_width = next_multiple_of(width, 16); + let new_height = next_multiple_of(height, 16); + + let info_size = packed.layout.bin_data_start; + let config = crate::encoding::Config { + width_in_tiles: new_width / 16, + height_in_tiles: new_height / 16, + target_width: width, + target_height: height, + binning_size: self.binning_info_size - info_size, + tiles_size: self.tiles_size, + segments_size: self.segments_size, + ptcl_size: self.ptcl_size, + layout: packed.layout, + }; + // println!("{:?}", config); + let scene_buf = ResourceProxy::Buf(recording.upload("scene", packed.data)); + let config_buf = + ResourceProxy::Buf(recording.upload_uniform("config", bytemuck::bytes_of(&config))); + let info_bin_data_buf = ResourceProxy::new_buf( + (info_size + config.binning_size) as u64 * 4, + "info_bin_data_buf", + ); + let tile_buf = ResourceProxy::new_buf(config.tiles_size as u64 * TILE_SIZE, "tile_buf"); + let segments_buf = + ResourceProxy::new_buf(config.segments_size as u64 * SEGMENT_SIZE, "segments_buf"); + let ptcl_buf = ResourceProxy::new_buf(config.ptcl_size as u64 * 4, "ptcl_buf"); + + let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize); + let pathtag_large = pathtag_wgs > shaders::PATHTAG_REDUCE_WG as usize; + let reduced_size = if pathtag_large { + align_up(pathtag_wgs, shaders::PATHTAG_REDUCE_WG) + } else { + pathtag_wgs + }; + let reduced_buf = + ResourceProxy::new_buf(reduced_size as u64 * TAG_MONOID_FULL_SIZE, "reduced_buf"); + // TODO: really only need pathtag_wgs - 1 + recording.dispatch( + shaders.pathtag_reduce, + (pathtag_wgs as u32, 1, 1), + [config_buf, scene_buf, reduced_buf], + ); + let mut pathtag_parent = reduced_buf; + let mut large_pathtag_bufs = None; + if pathtag_large { + let reduced2_size = shaders::PATHTAG_REDUCE_WG as usize; + let reduced2_buf = + ResourceProxy::new_buf(reduced2_size as u64 * TAG_MONOID_FULL_SIZE, "reduced2_buf"); + recording.dispatch( + shaders.pathtag_reduce2, + (reduced2_size as u32, 1, 1), + [reduced_buf, reduced2_buf], + ); + let reduced_scan_buf = ResourceProxy::new_buf( + pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE, + "reduced_scan_buf", + ); + recording.dispatch( + shaders.pathtag_scan1, + (reduced_size as u32 / shaders::PATHTAG_REDUCE_WG, 1, 1), + [reduced_buf, reduced2_buf, reduced_scan_buf], + ); + pathtag_parent = reduced_scan_buf; + large_pathtag_bufs = Some((reduced2_buf, reduced_scan_buf)); + } + + let tagmonoid_buf = ResourceProxy::new_buf( + pathtag_wgs as u64 
* shaders::PATHTAG_REDUCE_WG as u64 * TAG_MONOID_FULL_SIZE, + "tagmonoid_buf", + ); + let pathtag_scan = if pathtag_large { + shaders.pathtag_scan_large + } else { + shaders.pathtag_scan + }; + recording.dispatch( + pathtag_scan, + (pathtag_wgs as u32, 1, 1), + [config_buf, scene_buf, pathtag_parent, tagmonoid_buf], + ); + recording.free_resource(reduced_buf); + if let Some((reduced2, reduced_scan)) = large_pathtag_bufs { + recording.free_resource(reduced2); + recording.free_resource(reduced_scan); + } + let drawobj_wgs = (n_drawobj + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG; + let path_bbox_buf = + ResourceProxy::new_buf(n_paths as u64 * PATH_BBOX_SIZE, "path_bbox_buf"); + recording.dispatch( + shaders.bbox_clear, + (drawobj_wgs, 1, 1), + [config_buf, path_bbox_buf], + ); + let cubic_buf = ResourceProxy::new_buf(n_pathtag as u64 * CUBIC_SIZE, "cubic_buf"); + let path_coarse_wgs = + (n_pathtag as u32 + shaders::PATH_COARSE_WG - 1) / shaders::PATH_COARSE_WG; + recording.dispatch( + shaders.pathseg, + (path_coarse_wgs, 1, 1), + [ + config_buf, + scene_buf, + tagmonoid_buf, + path_bbox_buf, + cubic_buf, + ], + ); + let draw_reduced_buf = + ResourceProxy::new_buf(drawobj_wgs as u64 * DRAWMONOID_SIZE, "draw_reduced_buf"); + recording.dispatch( + shaders.draw_reduce, + (drawobj_wgs, 1, 1), + [config_buf, scene_buf, draw_reduced_buf], + ); + let draw_monoid_buf = + ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE, "draw_monoid_buf"); + let clip_inp_buf = + ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_INP_SIZE, "clip_inp_buf"); + recording.dispatch( + shaders.draw_leaf, + (drawobj_wgs, 1, 1), + [ + config_buf, + scene_buf, + draw_reduced_buf, + path_bbox_buf, + draw_monoid_buf, + info_bin_data_buf, + clip_inp_buf, + ], + ); + recording.free_resource(draw_reduced_buf); + let clip_el_buf = + ResourceProxy::new_buf(encoding.n_clips as u64 * CLIP_EL_SIZE, "clip_el_buf"); + let clip_bic_buf = ResourceProxy::new_buf( + (n_clip / shaders::CLIP_REDUCE_WG) as u64 * CLIP_BIC_SIZE, + "clip_bic_buf", + ); + let clip_wg_reduce = n_clip.saturating_sub(1) / shaders::CLIP_REDUCE_WG; + if clip_wg_reduce > 0 { + recording.dispatch( + shaders.clip_reduce, + (clip_wg_reduce, 1, 1), + [ + config_buf, + clip_inp_buf, + path_bbox_buf, + clip_bic_buf, + clip_el_buf, + ], + ); + } + let clip_wg = (n_clip + shaders::CLIP_REDUCE_WG - 1) / shaders::CLIP_REDUCE_WG; + let clip_bbox_buf = ResourceProxy::new_buf(n_clip as u64 * CLIP_BBOX_SIZE, "clip_bbox_buf"); + if clip_wg > 0 { + recording.dispatch( + shaders.clip_leaf, + (clip_wg, 1, 1), + [ + config_buf, + clip_inp_buf, + path_bbox_buf, + clip_bic_buf, + clip_el_buf, + draw_monoid_buf, + clip_bbox_buf, + ], + ); + } + recording.free_resource(clip_inp_buf); + recording.free_resource(clip_bic_buf); + recording.free_resource(clip_el_buf); + let draw_bbox_buf = + ResourceProxy::new_buf(n_paths as u64 * DRAW_BBOX_SIZE, "draw_bbox_buf"); + let bump_buf = BufProxy::new(BUMP_SIZE, "bump_buf"); + let width_in_bins = (config.width_in_tiles + 15) / 16; + let height_in_bins = (config.height_in_tiles + 15) / 16; + let bin_header_buf = ResourceProxy::new_buf( + (256 * drawobj_wgs) as u64 * BIN_HEADER_SIZE, + "bin_header_buf", + ); + recording.clear_all(bump_buf); + let bump_buf = ResourceProxy::Buf(bump_buf); + recording.dispatch( + shaders.binning, + (drawobj_wgs, 1, 1), + [ + config_buf, + draw_monoid_buf, + path_bbox_buf, + clip_bbox_buf, + draw_bbox_buf, + bump_buf, + info_bin_data_buf, + bin_header_buf, + ], + );
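+ // binning is the last stage to read the path and clip bounding-box buffers, so they can be returned to the pool here.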
+ recording.free_resource(path_bbox_buf); + recording.free_resource(clip_bbox_buf); + // Note: this only needs to be rounded up because of the workaround to store the tile_offset + // in storage rather than workgroup memory. + let n_path_aligned = align_up(n_paths as usize, 256); + let path_buf = ResourceProxy::new_buf(n_path_aligned as u64 * PATH_SIZE, "path_buf"); + let path_wgs = (n_paths + shaders::PATH_BBOX_WG - 1) / shaders::PATH_BBOX_WG; + recording.dispatch( + shaders.tile_alloc, + (path_wgs, 1, 1), + [ + config_buf, + scene_buf, + draw_bbox_buf, + bump_buf, + path_buf, + tile_buf, + ], + ); + recording.free_resource(draw_bbox_buf); + recording.dispatch( + shaders.path_coarse, + (path_coarse_wgs, 1, 1), + [ + config_buf, + scene_buf, + tagmonoid_buf, + cubic_buf, + path_buf, + bump_buf, + tile_buf, + segments_buf, + ], + ); + recording.free_resource(tagmonoid_buf); + recording.free_resource(cubic_buf); + recording.dispatch( + shaders.backdrop, + (path_wgs, 1, 1), + [config_buf, path_buf, tile_buf], + ); + recording.dispatch( + shaders.coarse, + (width_in_bins, height_in_bins, 1), + [ + config_buf, + scene_buf, + draw_monoid_buf, + bin_header_buf, + info_bin_data_buf, + path_buf, + tile_buf, + bump_buf, + ptcl_buf, + ], + ); + recording.free_resource(scene_buf); + recording.free_resource(draw_monoid_buf); + recording.free_resource(bin_header_buf); + recording.free_resource(path_buf); + let out_image = ImageProxy::new(width, height, ImageFormat::Rgba8); + self.width_in_tiles = config.width_in_tiles; + self.height_in_tiles = config.height_in_tiles; + self.fine = Some(FineResources { + config_buf, + bump_buf, + tile_buf, + segments_buf, + ptcl_buf, + gradient_image, + info_bin_data_buf, + out_image, + }); + if robust { + recording.download(*bump_buf.as_buf().unwrap()); + } + recording.free_resource(bump_buf); + recording + } + + /// Run fine rasterization assuming the coarse phase succeeded. + pub fn record_fine(&mut self, shaders: &FullShaders, recording: &mut Recording) { + let fine = self.fine.take().unwrap(); + recording.dispatch( + shaders.fine, + (self.width_in_tiles, self.height_in_tiles, 1), + [ + fine.config_buf, + fine.tile_buf, + fine.segments_buf, + ResourceProxy::Image(fine.out_image), + fine.ptcl_buf, + fine.gradient_image, + fine.info_bin_data_buf, + ], + ); + recording.free_resource(fine.config_buf); + recording.free_resource(fine.tile_buf); + recording.free_resource(fine.segments_buf); + recording.free_resource(fine.ptcl_buf); + recording.free_resource(fine.gradient_image); + recording.free_resource(fine.info_bin_data_buf); + } + + /// Get the output image. + /// + /// This is going away, as the caller will add the output image to the bind + /// map. + pub fn out_image(&self) -> ImageProxy { + self.fine.as_ref().unwrap().out_image + } + + pub fn bump_buf(&self) -> BufProxy { + *self.fine.as_ref().unwrap().bump_buf.as_buf().unwrap() + } +} diff --git a/src/util.rs b/src/util.rs index 0245d11..a75f804 100644 --- a/src/util.rs +++ b/src/util.rs @@ -16,6 +16,8 @@ //! Simple helpers for managing wgpu state and surfaces. +use std::future::Future; + use super::Result; use raw_window_handle::{HasRawDisplayHandle, HasRawWindowHandle}; @@ -132,3 +134,27 @@ pub struct RenderSurface { pub config: SurfaceConfiguration, pub dev_id: usize, } + +struct NullWake; + +impl std::task::Wake for NullWake { + fn wake(self: std::sync::Arc<Self>) {} +} + +/// Block on a future, polling the device as needed. +/// +/// This will deadlock if the future is awaiting anything other than GPU progress.
+pub fn block_on_wgpu<F: Future>(device: &Device, mut fut: F) -> F::Output { + let waker = std::task::Waker::from(std::sync::Arc::new(NullWake)); + let mut context = std::task::Context::from_waker(&waker); + // Same logic as `pin_mut!` macro from `pin_utils`. + let mut fut = unsafe { std::pin::Pin::new_unchecked(&mut fut) }; + loop { + match fut.as_mut().poll(&mut context) { + std::task::Poll::Pending => { + device.poll(wgpu::Maintain::Wait); + } + std::task::Poll::Ready(item) => break item, + } + } +}
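A note on usage: `block_on_wgpu` is what lets the native path in the winit example drive `render_to_surface_async` (and the `map_async` readback inside `render_to_texture_async`) to completion without an async runtime. Below is a minimal sketch of the same readback pattern in isolation; the `read_back` helper is illustrative and not part of this patch, and it assumes a buffer created with `MAP_READ | COPY_DST` whose contents have already been written by the GPU.

```rust
use vello::block_on_wgpu;

// Illustrative helper (not part of the patch): read back the contents of a
// buffer created with BufferUsages::MAP_READ | BufferUsages::COPY_DST.
fn read_back(device: &wgpu::Device, buf: &wgpu::Buffer) -> Result<Vec<u8>, wgpu::BufferAsyncError> {
    let slice = buf.slice(..);
    let (sender, receiver) = futures_intrusive::channel::shared::oneshot_channel();
    slice.map_async(wgpu::MapMode::Read, move |res| sender.send(res).unwrap());
    // block_on_wgpu polls the device while the future is pending, so the
    // map_async callback fires without an external executor.
    block_on_wgpu(device, async move { receiver.receive().await.expect("channel closed") })?;
    let data = slice.get_mapped_range().to_vec();
    buf.unmap();
    Ok(data)
}
```

This mirrors what `render_to_texture_async` does for the bump buffer, with the deadlock caveat above: the future must only be waiting on GPU progress.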