diff --git a/.github/workflows/push-shader.yml b/.github/workflows/push-shader.yml
index 6cdfe38..274147d 100644
--- a/.github/workflows/push-shader.yml
+++ b/.github/workflows/push-shader.yml
@@ -20,7 +20,7 @@ jobs:
           git switch main
           git config user.name "Commit by GitHub Action"
           git config user.email "nobody@example.com"
-          git merge dev -m "merge from dev branch"
+          git merge dev -m "merge from dev branch - ${{ github.ref_name }}"
           sed -i '' '/shader\/gen/d' .gitignore
           git add .gitignore
           git rm -r --ignore-unmatch piet-gpu/shader/gen
diff --git a/piet-gpu/bin/android.rs b/piet-gpu/bin/android.rs
index 75c7045..d1c4749 100644
--- a/piet-gpu/bin/android.rs
+++ b/piet-gpu/bin/android.rs
@@ -20,7 +20,7 @@ use piet_gpu_hal::{
 use piet::kurbo::Point;
 use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
 
-use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer};
+use piet_gpu::{test_scenes, PietGpuRenderContext, RenderDriver, Renderer};
 
 #[cfg_attr(target_os = "android", ndk_glue::main(backtrace = "on"))]
 fn main() {
@@ -34,12 +34,9 @@ struct MyHandle {
 // State required to render and present the contents
 struct GfxState {
     session: Session,
-    renderer: Renderer,
+    render_driver: RenderDriver,
     swapchain: Swapchain,
     current_frame: usize,
-    submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES],
-    cmd_bufs: [Option<CmdBuf>; NUM_FRAMES],
-    query_pools: Vec<QueryPool>,
     present_semaphores: Vec<Semaphore>,
 }
@@ -110,22 +107,15 @@ impl GfxState {
         let present_semaphores = (0..NUM_FRAMES)
             .map(|_| session.create_semaphore())
             .collect::<Result<Vec<_>, Error>>()?;
-        let query_pools = (0..NUM_FRAMES)
-            .map(|_| session.create_query_pool(Renderer::QUERY_POOL_SIZE))
-            .collect::<Result<Vec<_>, Error>>()?;
-        let submitted = Default::default();
-        let cmd_bufs = Default::default();
         let renderer = Renderer::new(&session, width, height, NUM_FRAMES)?;
+        let render_driver = RenderDriver::new(&session, NUM_FRAMES, renderer);
 
         Ok(GfxState {
             session,
-            renderer,
+            render_driver,
             swapchain,
             current_frame,
-            submitted,
-            cmd_bufs,
-            query_pools,
             present_semaphores,
         })
     }
@@ -137,51 +127,47 @@ impl GfxState {
         let frame_idx = self.current_frame % NUM_FRAMES;
         let mut info_string = String::new();
-        if let Some(submitted) = self.submitted[frame_idx].take() {
-            self.cmd_bufs[frame_idx] = submitted.wait().unwrap();
-            let ts = self
-                .session
-                .fetch_query_pool(&self.query_pools[frame_idx])
-                .unwrap();
-            info_string = format!("{:.1}ms", ts.last().unwrap() * 1e3);
-            println!("render time: {:?}", ts);
+        if self.current_frame >= NUM_FRAMES {
+            let stats = self
+                .render_driver
+                .get_timing_stats(&self.session, frame_idx);
+            info_string = stats.short_summary();
+            println!("{}", info_string);
         }
 
         let mut ctx = PietGpuRenderContext::new();
         test_scenes::render_anim_frame(&mut ctx, self.current_frame);
         //test_scenes::render_tiger(&mut ctx);
         render_info_string(&mut ctx, &info_string);
-        if let Err(e) = self.renderer.upload_render_ctx(&mut ctx, frame_idx) {
+        if let Err(e) = self
+            .render_driver
+            .upload_render_ctx(&self.session, &mut ctx)
+        {
             println!("error in uploading: {}", e);
         }
 
         let (image_idx, acquisition_semaphore) = self.swapchain.next().unwrap();
         let swap_image = self.swapchain.image(image_idx);
-        let query_pool = &self.query_pools[frame_idx];
-        let mut cmd_buf = self.cmd_bufs[frame_idx]
-            .take()
-            .unwrap_or_else(|| self.session.cmd_buf().unwrap());
-        cmd_buf.begin();
-        self.renderer.record(&mut cmd_buf, &query_pool, frame_idx);
+        self.render_driver.run_coarse(&self.session).unwrap();
+        let target = self.render_driver.record_fine(&self.session).unwrap();
+        let cmd_buf = target.cmd_buf;
 
         // Image -> Swapchain
         cmd_buf.image_barrier(&swap_image, ImageLayout::Undefined, ImageLayout::BlitDst);
-        cmd_buf.blit_image(&self.renderer.image_dev, &swap_image);
+        cmd_buf.blit_image(target.image, &swap_image);
         cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
-        cmd_buf.finish();
-        self.submitted[frame_idx] = Some(
-            self.session
-                .run_cmd_buf(
-                    cmd_buf,
-                    &[&acquisition_semaphore],
-                    &[&self.present_semaphores[frame_idx]],
-                )
-                .unwrap(),
-        );
+        self.render_driver
+            .submit(
+                &self.session,
+                &[&acquisition_semaphore],
+                &[&self.present_semaphores[frame_idx]],
+            )
+            .unwrap();
 
         self.swapchain
             .present(image_idx, &[&self.present_semaphores[frame_idx]])
            .unwrap();
+        self.render_driver.next_buffer();
         self.current_frame += 1;
     }
 }
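Note on the new API: all three binaries converge on the same per-frame pattern, encode → upload → run_coarse → record_fine → blit → submit → next_buffer. A minimal sketch of one steady-state frame, using only the `RenderDriver` method names that appear in this diff; the swapchain/semaphore plumbing is elided, and treating the driver methods as safe wrappers is an assumption:

```rust
use piet_gpu::{test_scenes, PietGpuRenderContext, RenderDriver};
use piet_gpu_hal::{Error, Session};

const NUM_FRAMES: usize = 2;

// One steady-state frame, minus swapchain/semaphore plumbing.
fn render_frame(
    session: &Session,
    render_driver: &mut RenderDriver,
    current_frame: usize,
) -> Result<(), Error> {
    let frame_idx = current_frame % NUM_FRAMES;
    // Only read timings once this slot's previous submission has retired,
    // i.e. after the first NUM_FRAMES frames.
    if current_frame >= NUM_FRAMES {
        let stats = render_driver.get_timing_stats(session, frame_idx);
        println!("{}", stats.short_summary());
    }
    let mut ctx = PietGpuRenderContext::new();
    test_scenes::render_anim_frame(&mut ctx, current_frame);
    render_driver.upload_render_ctx(session, &mut ctx)?;
    render_driver.run_coarse(session)?;
    let _target = render_driver.record_fine(session)?;
    // ...blit target.image to the swapchain image here...
    render_driver.submit(session, &[], &[])?;
    render_driver.next_buffer();
    Ok(())
}
```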
diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 79914bf..df86158 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -6,7 +6,7 @@ use clap::{App, Arg};
 
 use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session};
 
-use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
+use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, RenderDriver, Renderer};
 
 const WIDTH: usize = 2048;
 const HEIGHT: usize = 1536;
@@ -231,9 +231,6 @@ fn main() -> Result<(), Error> {
         let device = instance.device(None)?;
         let session = Session::new(device);
 
-        let mut cmd_buf = session.cmd_buf()?;
-        let query_pool = session.create_query_pool(Renderer::QUERY_POOL_SIZE)?;
-
         let mut ctx = PietGpuRenderContext::new();
         if let Some(input) = matches.value_of("INPUT") {
             let mut scale = matches
@@ -253,40 +250,22 @@ fn main() -> Result<(), Error> {
             test_scenes::render_blend_grid(&mut ctx);
         }
 
-        let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, 1)?;
-        renderer.upload_render_ctx(&mut ctx, 0)?;
+        let renderer = Renderer::new(&session, WIDTH, HEIGHT, 1)?;
+        let mut render_driver = RenderDriver::new(&session, 1, renderer);
+        let start = std::time::Instant::now();
+        render_driver.upload_render_ctx(&session, &mut ctx)?;
         let image_usage = BufferUsage::MAP_READ | BufferUsage::COPY_DST;
         let image_buf = session.create_buffer((WIDTH * HEIGHT * 4) as u64, image_usage)?;
-        cmd_buf.begin();
-        renderer.record(&mut cmd_buf, &query_pool, 0);
-        cmd_buf.copy_image_to_buffer(&renderer.image_dev, &image_buf);
-        cmd_buf.finish_timestamps(&query_pool);
-        cmd_buf.host_barrier();
-        cmd_buf.finish();
-        let start = std::time::Instant::now();
-        let submitted = session.run_cmd_buf(cmd_buf, &[], &[])?;
-        submitted.wait()?;
+        render_driver.run_coarse(&session)?;
+        let target = render_driver.record_fine(&session)?;
+        target
+            .cmd_buf
+            .copy_image_to_buffer(target.image, &image_buf);
+        render_driver.submit(&session, &[], &[])?;
+        render_driver.wait(&session);
         println!("elapsed = {:?}", start.elapsed());
-        let ts = session.fetch_query_pool(&query_pool).unwrap();
-        if !ts.is_empty() {
-            println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
-            println!(
-                "Tile allocation kernel time: {:.3}ms",
-                (ts[1] - ts[0]) * 1e3
-            );
-            println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
-            println!("Backdrop kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
-            println!("Binning kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3);
-            println!("Coarse raster kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
-            println!("Render kernel time: {:.3}ms", (ts[6] - ts[5]) * 1e3);
-        }
-
-        /*
-        let mut data: Vec<u32> = Default::default();
-        renderer.memory_buf_dev.read(&mut data).unwrap();
-        piet_gpu::dump_k1_data(&data[2..]);
-        */
+        render_driver.get_timing_stats(&session, 0).print_summary();
 
         let mut img_data: Vec<u8> = Default::default();
         // Note: because png can use a `&[u8]` slice, we could avoid an extra copy
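The removed per-kernel printouts computed stage times as differences of adjacent cumulative timestamps; that arithmetic now lives behind get_timing_stats / print_summary. A hypothetical helper showing the same math (the function and label list are illustrative, not part of the crate):

```rust
/// Convert cumulative GPU timestamps (seconds) into per-stage durations (ms),
/// mirroring the removed println! block: stage i takes ts[i] - ts[i - 1].
fn stage_times_ms<'a>(ts: &[f64], labels: &'a [&'a str]) -> Vec<(&'a str, f64)> {
    let mut prev = 0.0;
    ts.iter()
        .zip(labels)
        .map(|(&t, &label)| {
            let dt_ms = (t - prev) * 1e3;
            prev = t;
            (label, dt_ms)
        })
        .collect()
}
```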
diff --git a/piet-gpu/bin/winit.rs b/piet-gpu/bin/winit.rs
index 8f84da4..8438371 100644
--- a/piet-gpu/bin/winit.rs
+++ b/piet-gpu/bin/winit.rs
@@ -1,8 +1,8 @@
 use piet::kurbo::Point;
 use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
-use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf};
+use piet_gpu_hal::{Error, ImageLayout, Instance, Session};
 
-use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer};
+use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, RenderDriver, Renderer};
 
 use clap::{App, Arg};
@@ -69,13 +69,9 @@ fn main() -> Result<(), Error> {
         let present_semaphores = (0..NUM_FRAMES)
             .map(|_| session.create_semaphore())
             .collect::<Result<Vec<_>, Error>>()?;
-        let query_pools = (0..NUM_FRAMES)
-            .map(|_| session.create_query_pool(Renderer::QUERY_POOL_SIZE))
-            .collect::<Result<Vec<_>, Error>>()?;
-        let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
-        let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
 
-        let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?;
+        let renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?;
+        let mut render_driver = RenderDriver::new(&session, NUM_FRAMES, renderer);
 
         let mut mode = 0usize;
         event_loop.run(move |event, _, control_flow| {
@@ -106,26 +102,13 @@ fn main() -> Result<(), Error> {
                 Event::RedrawRequested(window_id) if window_id == window.id() => {
                     let frame_idx = current_frame % NUM_FRAMES;
 
-                    if let Some(submitted) = submitted[frame_idx].take() {
-                        cmd_bufs[frame_idx] = submitted.wait().unwrap();
-                        let ts = session.fetch_query_pool(&query_pools[frame_idx]).unwrap();
-                        if !ts.is_empty() {
-                            info_string = format!(
-                                "{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
-                                ts[10] * 1e3,
-                                ts[0] * 1e3,
-                                (ts[1] - ts[0]) * 1e3,
-                                (ts[2] - ts[1]) * 1e3,
-                                (ts[4] - ts[3]) * 1e3,
-                                (ts[6] - ts[5]) * 1e3,
-                                (ts[8] - ts[7]) * 1e3,
-                                (ts[10] - ts[9]) * 1e3,
-                            );
-                        }
+                    if current_frame >= NUM_FRAMES {
+                        let stats = render_driver.get_timing_stats(&session, frame_idx);
+                        info_string = stats.short_summary();
                     }
 
                     let mut ctx = PietGpuRenderContext::new();
-                    let test_blend = true;
+                    let test_blend = false;
                     if let Some(svg) = &svg {
                         test_scenes::render_svg(&mut ctx, svg);
                     } else if test_blend {
@@ -168,16 +151,15 @@ fn main() -> Result<(), Error> {
                         test_scenes::render_anim_frame(&mut ctx, current_frame);
                     }
                     render_info_string(&mut ctx, &info_string);
-                    if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) {
+                    if let Err(e) = render_driver.upload_render_ctx(&session, &mut ctx) {
                         println!("error in uploading: {}", e);
                     }
 
                     let (image_idx, acquisition_semaphore) = swapchain.next().unwrap();
                     let swap_image = swapchain.image(image_idx);
-                    let query_pool = &query_pools[frame_idx];
-                    let mut cmd_buf = cmd_bufs[frame_idx].take().unwrap_or_else(|| session.cmd_buf().unwrap());
-                    cmd_buf.begin();
-                    renderer.record(&mut cmd_buf, &query_pool, frame_idx);
+                    render_driver.run_coarse(&session).unwrap();
+                    let target = render_driver.record_fine(&session).unwrap();
+                    let cmd_buf = target.cmd_buf;
 
                     // Image -> Swapchain
                     cmd_buf.image_barrier(
                         &swap_image,
@@ -185,32 +167,25 @@ fn main() -> Result<(), Error> {
                         ImageLayout::Undefined,
                         ImageLayout::BlitDst,
                     );
-                    cmd_buf.blit_image(&renderer.image_dev, &swap_image);
+                    cmd_buf.blit_image(target.image, &swap_image);
                     cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
-                    cmd_buf.finish();
-
-                    submitted[frame_idx] = Some(session
-                        .run_cmd_buf(
-                            cmd_buf,
+                    render_driver
+                        .submit(
+                            &session,
                             &[&acquisition_semaphore],
                             &[&present_semaphores[frame_idx]],
                         )
-                        .unwrap());
+                        .unwrap();
 
                     swapchain
                         .present(image_idx, &[&present_semaphores[frame_idx]])
                         .unwrap();
+                    render_driver.next_buffer();
 
                     current_frame += 1;
                 }
                 Event::LoopDestroyed => {
-                    for cmd_buf in &mut submitted {
-                        // Wait for command list submission, otherwise dropping of renderer may
-                        // cause validation errors (and possibly crashes).
-                        if let Some(cmd_buf) = cmd_buf.take() {
-                            cmd_buf.wait().unwrap();
-                        }
-                    }
+                    render_driver.wait_all(&session);
                 }
                 _ => (),
             }
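Both windowed binaries gate the stats read on `current_frame >= NUM_FRAMES`: with a ring of NUM_FRAMES buffers in flight, slot `frame % NUM_FRAMES` only holds completed results once the loop has wrapped at least once. A toy model of the indexing, just to make the invariant explicit (the helper is illustrative):

```rust
const NUM_FRAMES: usize = 2;

/// Returns the frame whose results live in this slot, if any: the slot was
/// last used NUM_FRAMES frames ago.
fn completed_frame_in_slot(current_frame: usize) -> Option<usize> {
    current_frame.checked_sub(NUM_FRAMES)
}

fn main() {
    for frame in 0..4 {
        let slot = frame % NUM_FRAMES;
        match completed_frame_in_slot(frame) {
            Some(prev) => println!("frame {frame}: slot {slot} holds timings of frame {prev}"),
            None => println!("frame {frame}: slot {slot} has no completed work yet"),
        }
    }
}
```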
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index 4a45d28..60f3783 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -45,12 +45,15 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
 shared uint sh_row_width[BACKDROP_WG];
 
 void main() {
+    if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
+        return;
+    }
+
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
     // Work assignment: 1 thread : 1 path element
     uint row_count = 0;
-    bool mem_ok = mem_error == NO_ERROR;
     if (gl_LocalInvocationID.y == 0) {
         if (element_ix < conf.n_elements) {
             // Possible TODO: it's not necessary to process backdrops of stroked paths.
@@ -68,7 +71,7 @@ void main() {
                 row_count = 0;
             }
             Alloc path_alloc = new_alloc(
-                path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+                path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
             sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
@@ -98,7 +101,7 @@ void main() {
             }
         }
         uint width = sh_row_width[el_ix];
-        if (width > 0 && mem_ok) {
+        if (width > 0) {
             // Process one row sequentially
             // Read backdrop value per tile and prefix sum it
             Alloc tiles_alloc = sh_row_alloc[el_ix];
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index 9b04400..7485eee 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -32,8 +32,7 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf {
 // Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
-shared Alloc sh_chunk_alloc[N_TILE];
-shared bool sh_alloc_failed;
+shared uint sh_chunk_offset[N_TILE];
 
 DrawMonoid load_draw_monoid(uint element_ix) {
     uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix;
@@ -84,10 +83,6 @@ void main() {
     for (uint i = 0; i < N_SLICE; i++) {
         bitmaps[i][gl_LocalInvocationID.x] = 0;
     }
-    if (gl_LocalInvocationID.x == 0) {
-        sh_alloc_failed = false;
-    }
-    barrier();
 
     // Read inputs and determine coverage of bins
     uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
@@ -148,26 +143,18 @@ void main() {
         count[i][gl_LocalInvocationID.x] = element_count;
     }
     // element_count is number of elements covering bin for this invocation.
-    Alloc chunk_alloc = new_alloc(0, 0, true);
+    uint chunk_offset = 0;
    if (element_count != 0) {
-        // TODO: aggregate atomic adds (subgroup is probably fastest)
-        MallocResult chunk = malloc(element_count * BinInstance_size);
-        chunk_alloc = chunk.alloc;
-        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
-        if (chunk.failed) {
-            sh_alloc_failed = true;
-        }
+        chunk_offset = malloc_stage(element_count * BinInstance_size, conf.mem_size, STAGE_BINNING);
+        sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
     }
     // Note: it might be more efficient for reading to do this in the
     // other order (each bin is a contiguous sequence of partitions)
     uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
     write_mem(conf.bin_alloc, out_ix, element_count);
-    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
+    write_mem(conf.bin_alloc, out_ix + 1, chunk_offset);
 
     barrier();
-    if (sh_alloc_failed || mem_error != NO_ERROR) {
-        return;
-    }
 
     // Use similar strategy as Laine & Karras paper; loop over bbox of bins
     // touched by this element
@@ -181,9 +168,10 @@ void main() {
                 if (my_slice > 0) {
                     idx += count[my_slice - 1][bin_ix];
                 }
-                Alloc out_alloc = sh_chunk_alloc[bin_ix];
-                uint out_offset = out_alloc.offset + idx * BinInstance_size;
-                BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
+                uint chunk_offset = sh_chunk_offset[bin_ix];
+                if (chunk_offset != MALLOC_FAILED) {
+                    memory[(chunk_offset >> 2) + idx] = element_ix;
+                }
             }
             x++;
             if (x == x1) {
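binning.comp is the first stage converted to the new malloc_stage scheme: failures are recorded per stage in the mem_error bitmask instead of aborting the dispatch. A sketch of how the host side can decode that bitmask after reading back the header; the struct layout and STAGE_* bits mirror this diff, while the decode helper itself is illustrative:

```rust
// Mirrors the MemoryHeader layout and STAGE_* bits defined in this diff.
#[repr(C)]
#[derive(Clone, Copy, Debug)]
struct MemoryHeader {
    mem_offset: u32,
    mem_error: u32,
    blend_offset: u32,
}

const STAGE_BINNING: u32 = 1 << 0;
const STAGE_TILE_ALLOC: u32 = 1 << 1;
const STAGE_PATH_COARSE: u32 = 1 << 2;
const STAGE_COARSE: u32 = 1 << 3;

/// Names of the stages whose allocations failed, for diagnostics.
fn failed_stages(header: &MemoryHeader) -> Vec<&'static str> {
    [
        (STAGE_BINNING, "binning"),
        (STAGE_TILE_ALLOC, "tile_alloc"),
        (STAGE_PATH_COARSE, "path_coarse"),
        (STAGE_COARSE, "coarse"),
    ]
    .into_iter()
    .filter(|&(bit, _)| header.mem_error & bit != 0)
    .map(|(_, name)| name)
    .collect()
}
```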
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index fc6df21..edc61b2 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -72,49 +72,62 @@ void write_tile_alloc(uint el_ix, Alloc a) {
 
 Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
     // All memory.
-    return new_alloc(0, memory.length() * 4, mem_ok);
+    return new_alloc(0, conf.mem_size, mem_ok);
 }
 #endif
 
 // The maximum number of commands per annotated element.
 #define ANNO_COMMANDS 2
 
-// Perhaps cmd_alloc should be a global? This is a style question.
-bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
+// All writes to the output must be gated by mem_ok.
+bool mem_ok = true;
+
+// Perhaps cmd allocations should be a global? This is a style question.
+void alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
     if (cmd_ref.offset < cmd_limit) {
-        return true;
+        return;
     }
-    MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC);
-    if (new_cmd.failed) {
-        return false;
+    uint new_cmd = malloc_stage(PTCL_INITIAL_ALLOC, conf.mem_size, STAGE_COARSE);
+    if (new_cmd == MALLOC_FAILED) {
+        mem_ok = false;
     }
-    CmdJump jump = CmdJump(new_cmd.alloc.offset);
-    Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
-    cmd_alloc = new_cmd.alloc;
-    cmd_ref = CmdRef(cmd_alloc.offset);
+    if (mem_ok) {
+        CmdJump jump = CmdJump(new_cmd);
+        Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
+    }
+    cmd_alloc = new_alloc(new_cmd, PTCL_INITIAL_ALLOC, true);
+    cmd_ref = CmdRef(new_cmd);
     // Reserve space for the maximum number of commands and a potential jump.
-    cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
-    return true;
+    cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
 }
 
 void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) {
     if (linewidth < 0.0) {
         if (tile.tile.offset != 0) {
             CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
-            Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
+            if (mem_ok) {
+                Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
+            }
             cmd_ref.offset += 4 + CmdFill_size;
         } else {
-            Cmd_Solid_write(alloc, cmd_ref);
+            if (mem_ok) {
+                Cmd_Solid_write(alloc, cmd_ref);
+            }
             cmd_ref.offset += 4;
         }
     } else {
         CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
-        Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
+        if (mem_ok) {
+            Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
+        }
         cmd_ref.offset += 4 + CmdStroke_size;
     }
 }
 
 void main() {
+    if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
+        return;
+    }
     // Could use either linear or 2d layouts for both dispatch and
     // invocations within the workgroup. We'll use variables to abstract.
     uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
@@ -161,7 +174,6 @@ void main() {
     uint drawtag_start = conf.drawtag_offset >> 2;
     uint drawdata_start = conf.drawdata_offset >> 2;
     uint drawinfo_start = conf.drawinfo_alloc.offset >> 2;
-    bool mem_ok = mem_error == NO_ERROR;
     while (true) {
         for (uint i = 0; i < N_SLICE; i++) {
             sh_bitmaps[i][th_ix] = 0;
@@ -176,7 +188,7 @@ void main() {
                 uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
                 count = read_mem(conf.bin_alloc, in_ix);
                 uint offset = read_mem(conf.bin_alloc, in_ix + 1);
-                sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, mem_ok);
+                sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, true);
             }
             // prefix sum of counts
             for (uint i = 0; i < LG_N_PART_READ; i++) {
@@ -200,7 +212,7 @@ void main() {
             }
             // use binary search to find element to read
             uint ix = rd_ix + th_ix;
-            if (ix >= wr_ix && ix < ready_ix && mem_ok) {
+            if (ix >= wr_ix && ix < ready_ix) {
                 uint part_ix = 0;
                 for (uint i = 0; i < LG_N_PART_READ; i++) {
                     uint probe = part_ix + (uint(N_PART_READ / 2) >> i);
@@ -257,7 +269,7 @@ void main() {
                     uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
                     sh_tile_base[th_ix] = base;
                     Alloc path_alloc = new_alloc(path.tiles.offset,
-                        (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+                        (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
                     write_tile_alloc(th_ix, path_alloc);
                     break;
                 default:
@@ -293,27 +305,25 @@ void main() {
                 uint x = sh_tile_x0[el_ix] + seq_ix % width;
                 uint y = sh_tile_y0[el_ix] + seq_ix / width;
                 bool include_tile = false;
-                if (mem_ok) {
-                    Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok),
-                                          TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
-                    bool is_clip = (tag & 1) != 0;
-                    // Always include the tile if it contains a path segment.
-                    // For draws, include the tile if it is solid.
-                    // For clips, include the tile if it is empty - this way, logic
-                    // below will suppress the drawing of inner elements.
-                    // For blends, include the tile if
-                    // (blend_mode, composition_mode) != (Normal, SrcOver)
-                    bool is_blend = false;
-                    if (is_clip) {
-                        uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
-                        uint scene_offset = memory[drawmonoid_base + 2];
-                        uint dd = drawdata_start + (scene_offset >> 2);
-                        uint blend = scene[dd];
-                        is_blend = (blend != BlendComp_clip);
-                    }
-                    include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
-                        || is_blend;
+                Tile tile = Tile_read(read_tile_alloc(el_ix, true),
+                                      TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
+                bool is_clip = (tag & 1) != 0;
+                // Always include the tile if it contains a path segment.
+                // For draws, include the tile if it is solid.
+                // For clips, include the tile if it is empty - this way, logic
+                // below will suppress the drawing of inner elements.
+                // For blends, include the tile if
+                // (blend_mode, composition_mode) != (Normal, SrcOver)
+                bool is_blend = false;
+                if (is_clip) {
+                    uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
+                    uint scene_offset = memory[drawmonoid_base + 2];
+                    uint dd = drawdata_start + (scene_offset >> 2);
+                    uint blend = scene[dd];
+                    is_blend = (blend != BlendComp_clip);
                 }
+                include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
+                    || is_blend;
                 if (include_tile) {
                     uint el_slice = el_ix / 32;
                     uint el_mask = 1u << (el_ix & 31);
@@ -327,7 +337,7 @@ void main() {
         // through the draw objects.
         uint slice_ix = 0;
         uint bitmap = sh_bitmaps[0][th_ix];
-        while (mem_ok) {
+        while (true) {
            if (bitmap == 0) {
                 slice_ix++;
                 if (slice_ix == N_SLICE) {
@@ -347,7 +357,7 @@ void main() {
 
             uint drawtag = scene[drawtag_start + element_ix];
             if (clip_zero_depth == 0) {
-                Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
+                Tile tile = Tile_read(read_tile_alloc(element_ref_ix, true),
                                       TileRef(sh_tile_base[element_ref_ix] +
                                               (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                 uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
@@ -358,18 +368,16 @@ void main() {
                 switch (drawtag) {
                 case Drawtag_FillColor:
                     float linewidth = uintBitsToFloat(memory[di]);
-                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                        break;
-                    }
+                    alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                     write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                     uint rgba = scene[dd];
-                    Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
+                    if (mem_ok) {
+                        Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
+                    }
                     cmd_ref.offset += 4 + CmdColor_size;
                     break;
                 case Drawtag_FillLinGradient:
-                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                        break;
-                    }
+                    alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                     linewidth = uintBitsToFloat(memory[di]);
                     write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                     CmdLinGrad cmd_lin;
@@ -377,13 +385,13 @@ void main() {
                     cmd_lin.line_x = uintBitsToFloat(memory[di + 1]);
                     cmd_lin.line_y = uintBitsToFloat(memory[di + 2]);
                     cmd_lin.line_c = uintBitsToFloat(memory[di + 3]);
-                    Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
+                    if (mem_ok) {
+                        Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
+                    }
                     cmd_ref.offset += 4 + CmdLinGrad_size;
                     break;
                 case Drawtag_FillRadGradient:
-                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                        break;
-                    }
+                    alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                     linewidth = uintBitsToFloat(memory[di]);
                     write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                     CmdRadGrad cmd_rad;
@@ -396,29 +404,31 @@ void main() {
                     cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8]));
                     cmd_rad.ra = uintBitsToFloat(memory[di + 9]);
                     cmd_rad.roff = uintBitsToFloat(memory[di + 10]);
-                    Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
+                    if (mem_ok) {
+                        Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
+                    }
                     cmd_ref.offset += 4 + CmdRadGrad_size;
                     break;
                 case Drawtag_FillImage:
+                    alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                     linewidth = uintBitsToFloat(memory[di]);
-                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                        break;
-                    }
                     write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                     uint index = scene[dd];
                     uint raw1 = scene[dd + 1];
                     ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
-                    Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
+                    if (mem_ok) {
+                        Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
+                    }
                     cmd_ref.offset += 4 + CmdImage_size;
                     break;
                 case Drawtag_BeginClip:
                     if (tile.tile.offset == 0 && tile.backdrop == 0) {
                         clip_zero_depth = clip_depth + 1;
                     } else {
-                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                            break;
+                        alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
+                        if (mem_ok) {
+                            Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                         }
-                        Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                         cmd_ref.offset += 4;
                         render_blend_depth++;
                         max_blend_depth = max(max_blend_depth, render_blend_depth);
@@ -427,12 +437,11 @@ void main() {
                     break;
                 case Drawtag_EndClip:
                     clip_depth--;
-                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                        break;
-                    }
                     write_fill(cmd_alloc, cmd_ref, tile, -1.0);
                     uint blend = scene[dd];
-                    Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
+                    if (mem_ok) {
+                        Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
+                    }
                     cmd_ref.offset += 4 + CmdEndClip_size;
                     render_blend_depth--;
                     break;
@@ -459,11 +468,13 @@ void main() {
             break;
         }
         if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
-            Cmd_End_write(cmd_alloc, cmd_ref);
+            if (mem_ok) {
+                Cmd_End_write(cmd_alloc, cmd_ref);
+            }
             if (max_blend_depth > BLEND_STACK_SPLIT) {
                 uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
-                MallocResult scratch = malloc(scratch_size);
-                alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc);
+                uint scratch = atomicAdd(blend_offset, scratch_size);
+                write_mem(scratch_alloc, scratch_alloc.offset >> 2, scratch);
             }
         }
     }
diff --git a/piet-gpu/shader/image.png b/piet-gpu/shader/image.png
new file mode 100644
index 0000000..5cb8adc
Binary files /dev/null and b/piet-gpu/shader/image.png differ
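coarse.comp switches from "return false on allocation failure" to a soft-failure scheme: offsets keep advancing so the command-list arithmetic stays consistent, while every write is gated on a global mem_ok flag. A CPU-side analogue of the idea, not the shader code itself, assuming a plain bump allocator:

```rust
/// CPU-side analogue of malloc_stage + mem_ok gating: on overflow we record
/// the failure and keep going, so offset arithmetic stays consistent and the
/// caller can size a retry, but nothing is written past the end.
struct BumpAlloc {
    buf: Vec<u32>,
    offset: usize,
    mem_ok: bool,
}

impl BumpAlloc {
    fn alloc(&mut self, words: usize) -> usize {
        let offset = self.offset;
        self.offset += words;
        if self.offset > self.buf.len() {
            self.mem_ok = false; // failure is sticky, like the mem_error bit
        }
        offset
    }

    fn write(&mut self, ix: usize, value: u32) {
        // All writes are gated, mirroring `if (mem_ok) { Cmd_*_write(...) }`.
        if self.mem_ok {
            self.buf[ix] = value;
        }
    }
}
```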
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index 86751d4..09d0448 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -14,6 +14,7 @@
 // higher quality antialiasing among other things).
 #define DO_SRGB_CONVERSION 0
 
+// TODO: the binding of the main buffer can be readonly
 #include "mem.h"
 #include "setup.h"
 
@@ -24,19 +25,23 @@
 #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y)
 layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in;
 
-layout(set = 0, binding = 1) restrict readonly buffer ConfigBuf {
+layout(binding = 1) restrict readonly buffer ConfigBuf {
     Config conf;
 };
 
+layout(binding = 2) buffer BlendBuf {
+    uint blend_mem[];
+};
+
 #ifdef GRAY
-layout(r8, set = 0, binding = 2) uniform restrict writeonly image2D image;
+layout(r8, binding = 3) uniform restrict writeonly image2D image;
 #else
-layout(rgba8, set = 0, binding = 2) uniform restrict writeonly image2D image;
+layout(rgba8, binding = 3) uniform restrict writeonly image2D image;
 #endif
 
-layout(rgba8, set = 0, binding = 3) uniform restrict readonly image2D image_atlas;
+layout(rgba8, binding = 4) uniform restrict readonly image2D image_atlas;
 
-layout(rgba8, set = 0, binding = 4) uniform restrict readonly image2D gradients;
+layout(rgba8, binding = 5) uniform restrict readonly image2D gradients;
 
 #include "ptcl.h"
 #include "tile.h"
@@ -114,8 +119,9 @@ void main() {
     mediump float area[CHUNK];
     uint clip_depth = 0;
 
-    bool mem_ok = mem_error == NO_ERROR;
-    while (mem_ok) {
+    // Previously we would early-out if there was a memory failure, so we wouldn't try to read corrupt
+    // tiles. But now we assume this is checked CPU-side before launching fine rasterization.
+    while (true) {
         uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
         if (tag == Cmd_End) {
             break;
@@ -129,7 +135,7 @@ void main() {
                 df[k] = 1e9;
             TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
             do {
-                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
+                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
                 vec2 line_vec = seg.vector;
                 for (uint k = 0; k < CHUNK; k++) {
                     vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
@@ -151,7 +157,7 @@ void main() {
             tile_seg_ref = TileSegRef(fill.tile_ref);
             // Calculate coverage based on backdrop + coverage of each line segment
             do {
-                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref);
+                TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
                 for (uint k = 0; k < CHUNK; k++) {
                     vec2 my_xy = xy + vec2(chunk_offset(k));
                     vec2 start = seg.origin - my_xy;
@@ -248,7 +254,7 @@ void main() {
                     uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
                                    CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
                     for (uint k = 0; k < CHUNK; k++) {
-                        memory[base_ix + k] = packsRGB(vec4(rgba[k]));
+                        blend_mem[base_ix + k] = packsRGB(vec4(rgba[k]));
                         rgba[k] = vec4(0.0);
                     }
                 }
@@ -268,7 +274,7 @@ void main() {
                     if (clip_depth < BLEND_STACK_SPLIT) {
                         bg_rgba = blend_stack[clip_depth][k];
                     } else {
-                        bg_rgba = memory[base_ix + k];
+                        bg_rgba = blend_mem[base_ix + k];
                     }
                     mediump vec4 bg = unpacksRGB(bg_rgba);
                     mediump vec4 fg = rgba[k] * area[k];
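kernel4.comp now spills deep blend stacks into a dedicated BlendBuf binding instead of the general memory buffer, and coarse.comp bump-allocates that scratch with atomicAdd(blend_offset, scratch_size). A sketch of the sizing arithmetic from coarse.comp; the concrete values of TILE_WIDTH_PX/TILE_HEIGHT_PX/CLIP_STATE_SIZE below are assumptions for the example (the authoritative definitions live in the shader headers):

```rust
// Constants as assumed for this example only.
const TILE_WIDTH_PX: u64 = 16;
const TILE_HEIGHT_PX: u64 = 16;
const CLIP_STATE_SIZE: u64 = 2;
const BLEND_STACK_SPLIT: u64 = 4;

/// Scratch bytes one tile needs when its blend stack exceeds the in-register
/// split, following coarse.comp: depth * tile area * clip state * 4 bytes.
fn tile_blend_scratch_bytes(max_blend_depth: u64) -> u64 {
    if max_blend_depth <= BLEND_STACK_SPLIT {
        0
    } else {
        max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4
    }
}
```

After the frame, the host can compare the accumulated blend_offset in the header against blend_size() and grow the buffer with realloc_blend (both appear later in this diff).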
diff --git a/piet-gpu/shader/mem.h b/piet-gpu/shader/mem.h
index 9e81f04..d79ed16 100644
--- a/piet-gpu/shader/mem.h
+++ b/piet-gpu/shader/mem.h
@@ -3,27 +3,23 @@
 layout(set = 0, binding = 0) buffer Memory {
     // offset into memory of the next allocation, initialized by the user.
     uint mem_offset;
-    // mem_error tracks the status of memory accesses, initialized to NO_ERROR
-    // by the user. ERR_MALLOC_FAILED is reported for insufficient memory.
-    // If MEM_DEBUG is defined the following errors are reported:
-    // - ERR_OUT_OF_BOUNDS is reported for out of bounds writes.
-    // - ERR_UNALIGNED_ACCESS for memory access not aligned to 32-bit words.
+    // mem_error is a bitmask of stages that have failed allocation.
     uint mem_error;
+    // offset into blend memory of allocations for blend stack.
+    uint blend_offset;
     uint[] memory;
 };
 
 // Uncomment this line to add the size field to Alloc and enable memory checks.
 // Note that the Config struct in setup.h grows size fields as well.
-//#define MEM_DEBUG
 
-#define NO_ERROR 0
-#define ERR_MALLOC_FAILED 1
-#define ERR_OUT_OF_BOUNDS 2
-#define ERR_UNALIGNED_ACCESS 3
+// This setting is not working and the mechanism will be removed.
+//#define MEM_DEBUG
 
 #ifdef MEM_DEBUG
 #define Alloc_size 16
 #else
+// TODO: this seems wrong
 #define Alloc_size 8
 #endif
@@ -37,12 +33,6 @@ struct Alloc {
 #endif
 };
 
-struct MallocResult {
-    Alloc alloc;
-    // failed is true if the allocation overflowed memory.
-    bool failed;
-};
-
 // new_alloc synthesizes an Alloc from an offset and size.
 Alloc new_alloc(uint offset, uint size, bool mem_ok) {
     Alloc a;
@@ -57,24 +47,32 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok) {
     return a;
 }
 
-// malloc allocates size bytes of memory.
-MallocResult malloc(uint size) {
-    MallocResult r;
+#define STAGE_BINNING (1u << 0)
+#define STAGE_TILE_ALLOC (1u << 1)
+#define STAGE_PATH_COARSE (1u << 2)
+#define STAGE_COARSE (1u << 3)
+
+// Allocations in main memory will never be 0, and this might be slightly
+// faster to test against than some other value.
+#define MALLOC_FAILED 0
+
+// Check that previous dependent stages have succeeded.
+bool check_deps(uint dep_stage) {
+    // TODO: this should be an atomic relaxed load, but that involves
+    // bringing in "memory scope semantics"
+    return (atomicOr(mem_error, 0) & dep_stage) == 0;
+}
+
+// Allocate size bytes of memory, offset in bytes.
+// Note: with a bit of rearrangement of header files, we could make the
+// mem_size argument go away (it comes from the config binding).
+uint malloc_stage(uint size, uint mem_size, uint stage) {
     uint offset = atomicAdd(mem_offset, size);
-    r.failed = offset + size > memory.length() * 4;
-    r.alloc = new_alloc(offset, size, !r.failed);
-    if (r.failed) {
-        atomicMax(mem_error, ERR_MALLOC_FAILED);
-        return r;
+    if (offset + size > mem_size) {
+        atomicOr(mem_error, stage);
+        offset = MALLOC_FAILED;
     }
-#ifdef MEM_DEBUG
-    if ((size & 3) != 0) {
-        r.failed = true;
-        atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
-        return r;
-    }
-#endif
-    return r;
+    return offset;
 }
 
 // touch_mem checks whether access to the memory word at offset is valid.
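check_deps reads the error mask with atomicOr(mem_error, 0) because this dialect of GLSL has no plain relaxed atomic load without pulling in memory-scope semantics; or-ing with zero is a read that changes nothing. The Rust equivalent, for comparison, is just a relaxed load:

```rust
use std::sync::atomic::{AtomicU32, Ordering};

/// Equivalent of the shader's check_deps: true if none of the stages we
/// depend on have recorded an allocation failure.
fn check_deps(mem_error: &AtomicU32, dep_stages: u32) -> bool {
    // Rust can express this directly as a relaxed load; the shader spells
    // it atomicOr(mem_error, 0) for lack of one.
    mem_error.load(Ordering::Relaxed) & dep_stages == 0
}
```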
diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
index c6d3815..39b26b2 100644
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@@ -87,7 +87,13 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
     return SubdivResult(val, a0, a2);
 }
 
+// All writes to the output must be gated by mem_ok.
+bool mem_ok = true;
+
 void main() {
+    if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
+        return;
+    }
     uint element_ix = gl_GlobalInvocationID.x;
     PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);
 
@@ -95,24 +101,10 @@ void main() {
     if (element_ix < conf.n_pathseg) {
         tag = PathSeg_tag(conf.pathseg_alloc, ref);
     }
-    bool mem_ok = mem_error == NO_ERROR;
     switch (tag.tag) {
     case PathSeg_Cubic:
         PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);
-        // Affine transform is now applied in pathseg
-        /*
-        uint trans_ix = cubic.trans_ix;
-        if (trans_ix > 0) {
-            TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (trans_ix - 1) * TransformSeg_size);
-            TransformSeg trans = TransformSeg_read(conf.trans_alloc, trans_ref);
-            cubic.p0 = trans.mat.xy * cubic.p0.x + trans.mat.zw * cubic.p0.y + trans.translate;
-            cubic.p1 = trans.mat.xy * cubic.p1.x + trans.mat.zw * cubic.p1.y + trans.translate;
-            cubic.p2 = trans.mat.xy * cubic.p2.x + trans.mat.zw * cubic.p2.y + trans.translate;
-            cubic.p3 = trans.mat.xy * cubic.p3.x + trans.mat.zw * cubic.p3.y + trans.translate;
-        }
-        */
-
         vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
         float err = err_v.x * err_v.x + err_v.y * err_v.y;
         // The number of quadratics.
@@ -140,7 +132,7 @@ void main() {
         uint path_ix = cubic.path_ix;
         Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
         Alloc path_alloc =
-            new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+            new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
         ivec4 bbox = ivec4(path.bbox);
         vec2 p0 = cubic.p0;
         qp0 = cubic.p0;
@@ -199,11 +191,12 @@ void main() {
             // TODO: can be tighter, use c to bound width
             uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
             // Consider using subgroups to aggregate atomic add.
-            MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
-            if (tile_alloc.failed || !mem_ok) {
-                return;
+            uint malloc_size = n_tile_alloc * TileSeg_size;
+            uint tile_offset = malloc_stage(malloc_size, conf.mem_size, STAGE_PATH_COARSE);
+            if (tile_offset == MALLOC_FAILED) {
+                mem_ok = false;
             }
-            uint tile_offset = tile_alloc.alloc.offset;
+            Alloc tile_alloc = new_alloc(tile_offset, malloc_size, true);
 
             TileSeg tile_seg;
 
@@ -221,9 +214,7 @@ void main() {
                     int backdrop = p1.y < p0.y ? 1 : -1;
                     TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
                     uint tile_el = tile_ref.offset >> 2;
-                    if (touch_mem(path_alloc, tile_el + 1)) {
-                        atomicAdd(memory[tile_el + 1], backdrop);
-                    }
+                    atomicAdd(memory[tile_el + 1], backdrop);
                 }
 
                 // next_xray is the xray for the next scanline; the line segment intersects
@@ -247,9 +238,7 @@ void main() {
                     TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
                     uint tile_el = tile_ref.offset >> 2;
                     uint old = 0;
-                    if (touch_mem(path_alloc, tile_el)) {
-                        old = atomicExchange(memory[tile_el], tile_offset);
-                    }
+                    old = atomicExchange(memory[tile_el], tile_offset);
                     tile_seg.origin = p0;
                     tile_seg.vector = p1 - p0;
                     float y_edge = 0.0;
@@ -276,7 +265,9 @@ void main() {
                     }
                     tile_seg.y_edge = y_edge;
                     tile_seg.next.offset = old;
-                    TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg);
+                    if (mem_ok) {
+                        TileSeg_write(tile_alloc, TileSegRef(tile_offset), tile_seg);
+                    }
                     tile_offset += TileSeg_size;
                 }
                 xc += b;
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index 21206e5..eb9f9ea 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -31,8 +31,9 @@
 // to memory for the overflow.
 #define BLEND_STACK_SPLIT 4
 
-#ifdef ERR_MALLOC_FAILED
+#ifdef MALLOC_FAILED
 struct Config {
+    uint mem_size; // in bytes
     uint n_elements; // paths
     uint n_pathseg;
     uint width_in_tiles;
diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp
index 0fec2ce..63ced91 100644
--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@@ -29,7 +29,7 @@ layout(binding = 2) readonly buffer SceneBuf {
 #define SY (1.0 / float(TILE_HEIGHT_PX))
 
 shared uint sh_tile_count[TILE_ALLOC_WG];
-shared MallocResult sh_tile_alloc;
+shared uint sh_tile_offset;
 
 vec4 load_draw_bbox(uint draw_ix) {
     uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
@@ -42,6 +42,9 @@ vec4 load_draw_bbox(uint draw_ix) {
 }
 
 void main() {
+    if (!check_deps(STAGE_BINNING)) {
+        return;
+    }
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
     // At the moment, element_ix == path_ix. The clip-intersected bounding boxes
@@ -86,27 +89,24 @@ void main() {
         sh_tile_count[th_ix] = total_tile_count;
     }
     if (th_ix == TILE_ALLOC_WG - 1) {
-        sh_tile_alloc = malloc(total_tile_count * Tile_size);
+        sh_tile_offset = malloc_stage(total_tile_count * Tile_size, conf.mem_size, STAGE_TILE_ALLOC);
     }
     barrier();
-    MallocResult alloc_start = sh_tile_alloc;
-    if (alloc_start.failed || mem_error != NO_ERROR) {
+    uint offset_start = sh_tile_offset;
+    if (offset_start == MALLOC_FAILED) {
         return;
     }
 
     if (element_ix < conf.n_elements) {
         uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
-        Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count);
-        path.tiles = TileRef(tiles_alloc.offset);
+        path.tiles = TileRef(offset_start + Tile_size * tile_subix);
         Path_write(conf.tile_alloc, path_ref, path);
     }
 
     // Zero out allocated tiles efficiently
     uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
-    uint start_ix = alloc_start.alloc.offset >> 2;
+    uint start_ix = offset_start >> 2;
     for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
-        // Note: this interleaving is faster than using Tile_write
-        // by a significant amount.
-        write_mem(alloc_start.alloc, start_ix + i, 0);
+        memory[start_ix + i] = 0;
     }
 }
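tile_alloc.comp zeroes freshly allocated tile memory with a workgroup-strided loop: each invocation clears every TILE_ALLOC_WG-th word, which keeps the stores coalesced on the GPU. A sequential model of the access pattern:

```rust
/// Sequential model of the workgroup-strided zeroing loop in tile_alloc.comp:
/// invocation th_ix writes words th_ix, th_ix + wg, th_ix + 2*wg, ...
fn zero_tiles(memory: &mut [u32], start_ix: usize, total_count: usize, wg: usize) {
    for th_ix in 0..wg {
        let mut i = th_ix;
        while i < total_count {
            memory[start_ix + i] = 0;
            i += wg;
        }
    }
}
```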
diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs
index d0ef1eb..bddb6f4 100644
--- a/piet-gpu/src/encoder.rs
+++ b/piet-gpu/src/encoder.rs
@@ -16,13 +16,12 @@
 
 //! Low-level scene encoding.
 
-use crate::Blend;
+use crate::{Blend, SceneStats, DRAWTAG_SIZE, TRANSFORM_SIZE};
 use bytemuck::{Pod, Zeroable};
 use piet_gpu_hal::BufWrite;
 
 use crate::stages::{
-    self, Config, PathEncoder, Transform, CLIP_PART_SIZE, DRAW_PART_SIZE, PATHSEG_PART_SIZE,
-    TRANSFORM_PART_SIZE,
+    self, PathEncoder, Transform, DRAW_PART_SIZE, PATHSEG_PART_SIZE, TRANSFORM_PART_SIZE,
 };
 
 pub struct Encoder {
@@ -52,86 +51,19 @@ pub struct EncodedSceneRef<'a, T: Copy + Pod> {
 }
 
 impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
-    /// Return a config for the element processing pipeline.
-    ///
-    /// This does not include further pipeline processing. Also returns the
-    /// beginning of free memory.
-    pub fn stage_config(&self) -> (Config, usize) {
-        // Layout of scene buffer
-        let drawtag_offset = 0;
-        let n_drawobj = self.n_drawobj();
-        let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
-        let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
-        let trans_offset = drawdata_offset + self.drawdata_stream.len();
-        let n_trans = self.transform_stream.len();
-        let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
-        let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
-        let n_linewidth = self.linewidth_stream.len();
-        let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
-        let n_pathtag = self.tag_stream.len();
-        let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
-        let pathseg_offset = pathtag_offset + n_pathtag_padded;
+    pub(crate) fn stats(&self) -> SceneStats {
+        SceneStats {
+            n_drawobj: self.drawtag_stream.len(),
+            drawdata_len: self.drawdata_stream.len(),
+            n_transform: self.transform_stream.len(),
+            linewidth_len: std::mem::size_of_val(self.linewidth_stream),
+            pathseg_len: self.pathseg_stream.len(),
+            n_pathtag: self.tag_stream.len(),
 
-        // Layout of memory
-        let mut alloc = 0;
-        let trans_alloc = alloc;
-        alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
-        let pathseg_alloc = alloc;
-        alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
-        let path_bbox_alloc = alloc;
-        let n_path = self.n_path as usize;
-        alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
-        let drawmonoid_alloc = alloc;
-        alloc += n_drawobj_padded * DRAWMONOID_SIZE;
-        let anno_alloc = alloc;
-        alloc += n_drawobj * ANNOTATED_SIZE;
-        let clip_alloc = alloc;
-        let n_clip = self.n_clip as usize;
-        const CLIP_SIZE: usize = 4;
-        alloc += n_clip * CLIP_SIZE;
-        let clip_bic_alloc = alloc;
-        const CLIP_BIC_SIZE: usize = 8;
-        // This can round down, as we only reduce the prefix
-        alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
-        let clip_stack_alloc = alloc;
-        const CLIP_EL_SIZE: usize = 20;
-        alloc += n_clip * CLIP_EL_SIZE;
-        let clip_bbox_alloc = alloc;
-        const CLIP_BBOX_SIZE: usize = 16;
-        alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
-        let draw_bbox_alloc = alloc;
-        alloc += n_drawobj * DRAW_BBOX_SIZE;
-        let drawinfo_alloc = alloc;
-        // TODO: not optimized; it can be accumulated during encoding or summed from drawtags
-        const MAX_DRAWINFO_SIZE: usize = 44;
-        alloc += n_drawobj * MAX_DRAWINFO_SIZE;
-
-        let config = Config {
-            n_elements: n_drawobj as u32,
-            n_pathseg: self.n_pathseg,
-            pathseg_alloc: pathseg_alloc as u32,
-            anno_alloc: anno_alloc as u32,
-            trans_alloc: trans_alloc as u32,
-            path_bbox_alloc: path_bbox_alloc as u32,
-            drawmonoid_alloc: drawmonoid_alloc as u32,
-            clip_alloc: clip_alloc as u32,
-            clip_bic_alloc: clip_bic_alloc as u32,
-            clip_stack_alloc: clip_stack_alloc as u32,
-            clip_bbox_alloc: clip_bbox_alloc as u32,
-            draw_bbox_alloc: draw_bbox_alloc as u32,
-            drawinfo_alloc: drawinfo_alloc as u32,
-            n_trans: n_trans as u32,
             n_path: self.n_path,
+            n_pathseg: self.n_pathseg,
             n_clip: self.n_clip,
-            trans_offset: trans_offset as u32,
-            linewidth_offset: linewidth_offset as u32,
-            pathtag_offset: pathtag_offset as u32,
-            pathseg_offset: pathseg_offset as u32,
-            drawtag_offset: drawtag_offset as u32,
-            drawdata_offset: drawdata_offset as u32,
-            ..Default::default()
-        };
-        (config, alloc)
+        }
     }
 
     pub fn write_scene(&self, buf: &mut BufWrite) {
@@ -148,34 +80,6 @@ impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
         buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize));
         buf.extend_slice(&self.pathseg_stream);
     }
-
-    /// The number of draw objects in the draw object stream.
-    pub(crate) fn n_drawobj(&self) -> usize {
-        self.drawtag_stream.len()
-    }
-
-    /// The number of paths.
-    pub(crate) fn n_path(&self) -> u32 {
-        self.n_path
-    }
-
-    /// The number of path segments.
-    pub(crate) fn n_pathseg(&self) -> u32 {
-        self.n_pathseg
-    }
-
-    pub(crate) fn n_transform(&self) -> usize {
-        self.transform_stream.len()
-    }
-
-    /// The number of tags in the path stream.
-    pub(crate) fn n_pathtag(&self) -> usize {
-        self.tag_stream.len()
-    }
-
-    pub(crate) fn n_clip(&self) -> u32 {
-        self.n_clip
-    }
 }
 
 /// A scene fragment encoding a glyph.
@@ -191,15 +95,6 @@ pub struct GlyphEncoder {
     n_pathseg: u32,
 }
 
-const TRANSFORM_SIZE: usize = 24;
-const LINEWIDTH_SIZE: usize = 4;
-const PATHSEG_SIZE: usize = 52;
-const PATH_BBOX_SIZE: usize = 24;
-const DRAWMONOID_SIZE: usize = 16;
-const DRAW_BBOX_SIZE: usize = 16;
-const DRAWTAG_SIZE: usize = 4;
-const ANNOTATED_SIZE: usize = 40;
-
 // Tags for draw objects. See shader/drawtag.h for the authoritative source.
 const DRAWTAG_FILLCOLOR: u32 = 0x44;
 const DRAWTAG_FILLLINGRADIENT: u32 = 0x114;
@@ -343,88 +238,6 @@ impl Encoder {
         self.n_clip += 1;
     }
 
-    /// Return a config for the element processing pipeline.
-    ///
-    /// This does not include further pipeline processing. Also returns the
-    /// beginning of free memory.
-    pub fn stage_config(&self) -> (Config, usize) {
-        // Layout of scene buffer
-        let drawtag_offset = 0;
-        let n_drawobj = self.n_drawobj();
-        let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
-        let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
-        let trans_offset = drawdata_offset + self.drawdata_stream.len();
-        let n_trans = self.transform_stream.len();
-        let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
-        let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
-        let n_linewidth = self.linewidth_stream.len();
-        let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
-        let n_pathtag = self.tag_stream.len();
-        let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
-        let pathseg_offset = pathtag_offset + n_pathtag_padded;
-
-        // Layout of memory
-        let mut alloc = 0;
-        let trans_alloc = alloc;
-        alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
-        let pathseg_alloc = alloc;
-        alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
-        let path_bbox_alloc = alloc;
-        let n_path = self.n_path as usize;
-        alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
-        let drawmonoid_alloc = alloc;
-        alloc += n_drawobj_padded * DRAWMONOID_SIZE;
-        let anno_alloc = alloc;
-        alloc += n_drawobj * ANNOTATED_SIZE;
-        let clip_alloc = alloc;
-        let n_clip = self.n_clip as usize;
-        const CLIP_SIZE: usize = 4;
-        alloc += n_clip * CLIP_SIZE;
-        let clip_bic_alloc = alloc;
-        const CLIP_BIC_SIZE: usize = 8;
-        // This can round down, as we only reduce the prefix
-        alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
-        let clip_stack_alloc = alloc;
-        const CLIP_EL_SIZE: usize = 20;
-        alloc += n_clip * CLIP_EL_SIZE;
-        let clip_bbox_alloc = alloc;
-        const CLIP_BBOX_SIZE: usize = 16;
-        alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
-        let draw_bbox_alloc = alloc;
-        alloc += n_drawobj * DRAW_BBOX_SIZE;
-        let drawinfo_alloc = alloc;
-        // TODO: not optimized; it can be accumulated during encoding or summed from drawtags
-        const MAX_DRAWINFO_SIZE: usize = 44;
-        alloc += n_drawobj * MAX_DRAWINFO_SIZE;
-
-        let config = Config {
-            n_elements: n_drawobj as u32,
-            n_pathseg: self.n_pathseg,
-            pathseg_alloc: pathseg_alloc as u32,
-            anno_alloc: anno_alloc as u32,
-            trans_alloc: trans_alloc as u32,
-            path_bbox_alloc: path_bbox_alloc as u32,
-            drawmonoid_alloc: drawmonoid_alloc as u32,
-            clip_alloc: clip_alloc as u32,
-            clip_bic_alloc: clip_bic_alloc as u32,
-            clip_stack_alloc: clip_stack_alloc as u32,
-            clip_bbox_alloc: clip_bbox_alloc as u32,
-            draw_bbox_alloc: draw_bbox_alloc as u32,
-            drawinfo_alloc: drawinfo_alloc as u32,
-            n_trans: n_trans as u32,
-            n_path: self.n_path,
-            n_clip: self.n_clip,
-            trans_offset: trans_offset as u32,
-            linewidth_offset: linewidth_offset as u32,
-            pathtag_offset: pathtag_offset as u32,
-            pathseg_offset: pathseg_offset as u32,
-            drawtag_offset: drawtag_offset as u32,
-            drawdata_offset: drawdata_offset as u32,
-            ..Default::default()
-        };
-        (config, alloc)
-    }
-
     pub fn write_scene(&self, buf: &mut BufWrite) {
         buf.extend_slice(&self.drawtag_stream);
         let n_drawobj = self.drawtag_stream.len();
@@ -440,32 +253,19 @@ impl Encoder {
         buf.extend_slice(&self.pathseg_stream);
     }
 
-    /// The number of draw objects in the draw object stream.
-    pub(crate) fn n_drawobj(&self) -> usize {
-        self.drawtag_stream.len()
-    }
+    pub(crate) fn stats(&self) -> SceneStats {
+        SceneStats {
+            n_drawobj: self.drawtag_stream.len(),
+            drawdata_len: self.drawdata_stream.len(),
+            n_transform: self.transform_stream.len(),
+            linewidth_len: std::mem::size_of_val(&*self.linewidth_stream),
+            n_pathtag: self.tag_stream.len(),
+            pathseg_len: self.pathseg_stream.len(),
 
-    /// The number of paths.
-    pub(crate) fn n_path(&self) -> u32 {
-        self.n_path
-    }
-
-    /// The number of path segments.
-    pub(crate) fn n_pathseg(&self) -> u32 {
-        self.n_pathseg
-    }
-
-    pub(crate) fn n_transform(&self) -> usize {
-        self.transform_stream.len()
-    }
-
-    /// The number of tags in the path stream.
-    pub(crate) fn n_pathtag(&self) -> usize {
-        self.tag_stream.len()
-    }
-
-    pub(crate) fn n_clip(&self) -> u32 {
-        self.n_clip
+            n_path: self.n_path,
+            n_pathseg: self.n_pathseg,
+            n_clip: self.n_clip,
+        }
     }
 
     pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) {
@@ -478,11 +278,6 @@ impl Encoder {
     }
 }
 
-fn align_up(x: usize, align: usize) -> usize {
-    debug_assert!(align.is_power_of_two());
-    (x + align - 1) & !(align - 1)
-}
-
 fn padding(x: usize, align: usize) -> usize {
     x.wrapping_neg() & (align - 1)
 }
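The two near-identical stage_config implementations are replaced by SceneStats, with the layout math centralized (in SceneStats::config in lib.rs, whose body is not fully shown in this diff). The essence of the removed code is a bump layout: each region starts at the running total and advances by count * element size, and the final total is the "beginning of free memory". A compressed sketch of that arithmetic; the helper names are illustrative:

```rust
fn align_up(x: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two());
    (x + align - 1) & !(align - 1)
}

/// Bump-layout of regions: returns each region's start offset plus the total.
fn layout_regions(regions: &[(usize, usize)]) -> (Vec<usize>, usize) {
    let mut alloc = 0;
    let mut offsets = Vec::with_capacity(regions.len());
    for &(count, elem_size) in regions {
        offsets.push(alloc);
        alloc += count * elem_size;
    }
    (offsets, alloc)
}

fn main() {
    // e.g. transforms (24 bytes), path segments (52 bytes), path bboxes (24),
    // with counts padded via align_up to their partition sizes beforehand.
    let n_trans = align_up(60, 64);
    let (offsets, total) = layout_regions(&[(n_trans, 24), (128, 52), (16, 24)]);
    println!("offsets = {offsets:?}, free memory starts at {total}");
}
```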
Second, there should be a staging buffer for discrete cards. scene_bufs: Vec, memory_buf_host: Vec, memory_buf_dev: Buffer, + memory_buf_readback: Buffer, // Staging buffers config_bufs: Vec, // Device config buf config_buf: Buffer, + blend_buf: Buffer, + // New element pipeline element_code: ElementCode, element_stage: ElementStage, @@ -111,6 +146,8 @@ pub struct Renderer { k4_pipeline: Pipeline, k4_ds: DescriptorSet, + scene_stats: SceneStats, + // TODO: the following stats are now redundant and can be removed. n_transform: usize, n_drawobj: usize, n_paths: usize, @@ -142,7 +179,13 @@ impl RenderConfig { impl Renderer { /// The number of query pool entries needed to run the renderer. - pub const QUERY_POOL_SIZE: u32 = 12; + pub const QUERY_POOL_SIZE: u32 = Self::COARSE_QUERY_POOL_SIZE + Self::FINE_QUERY_POOL_SIZE; + + /// The number of query pool entries needed to run the coarse pipeline. + pub const COARSE_QUERY_POOL_SIZE: u32 = 10; + + /// The number of query pool entries needed to run the fine pipeline. + pub const FINE_QUERY_POOL_SIZE: u32 = 2; pub unsafe fn new( session: &Session, @@ -166,12 +209,18 @@ impl Renderer { let width = width + (width.wrapping_neg() & (TILE_W - 1)); let height = height + (height.wrapping_neg() & (TILE_W - 1)); let dev = BufferUsage::STORAGE | BufferUsage::COPY_DST; - let host_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC; + let usage_mem_dev = BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::COPY_SRC; + let usage_blend = BufferUsage::STORAGE; + let usage_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC; + let usage_readback = BufferUsage::MAP_READ | BufferUsage::COPY_DST; - // This may be inadequate for very complex scenes (paris etc) // TODO: separate staging buffer (if needed) let scene_bufs = (0..n_bufs) - .map(|_| session.create_buffer(8 * 1024 * 1024, host_upload).unwrap()) + .map(|_| { + session + .create_buffer(8 * 1024 * 1024, usage_upload) + .unwrap() + }) .collect::>(); let image_format = match config.format { @@ -185,15 +234,22 @@ impl Renderer { let config_bufs = (0..n_bufs) .map(|_| { session - .create_buffer(CONFIG_BUFFER_SIZE, host_upload) + .create_buffer(CONFIG_BUFFER_SIZE, usage_upload) .unwrap() }) .collect(); let memory_buf_host = (0..n_bufs) - .map(|_| session.create_buffer(2 * 4, host_upload).unwrap()) + .map(|_| { + session + .create_buffer(std::mem::size_of::() as u64, usage_upload) + .unwrap() + }) .collect(); - let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?; + let memory_buf_dev = session.create_buffer(16 * 1024 * 1024, usage_mem_dev)?; + let memory_buf_readback = + session.create_buffer(std::mem::size_of::() as u64, usage_readback)?; + let blend_buf = session.create_buffer(16 * 1024 * 1024, usage_blend)?; let element_code = ElementCode::new(session); let element_stage = ElementStage::new(session, &element_code); @@ -282,7 +338,7 @@ impl Renderer { let gradient_bufs = (0..n_bufs) .map(|_| { session - .create_buffer(GRADIENT_BUF_SIZE as u64, host_upload) + .create_buffer(GRADIENT_BUF_SIZE as u64, usage_upload) .unwrap() }) .collect(); @@ -297,6 +353,7 @@ impl Renderer { &[ BindType::Buffer, BindType::BufReadOnly, + BindType::Buffer, BindType::Image, BindType::ImageRead, BindType::ImageRead, @@ -304,19 +361,22 @@ impl Renderer { )?; let k4_ds = session .descriptor_set_builder() - .add_buffers(&[&memory_buf_dev, &config_buf]) + .add_buffers(&[&memory_buf_dev, &config_buf, &blend_buf]) .add_images(&[&image_dev]) .add_textures(&[&bg_image, &gradients]) .build(&session, &k4_pipeline)?; + 
let scene_stats = Default::default(); Ok(Renderer { width, height, scene_bufs, memory_buf_host, memory_buf_dev, + memory_buf_readback, config_buf, config_bufs, + blend_buf, image_dev, element_code, element_stage, @@ -336,6 +396,7 @@ impl Renderer { coarse_ds, k4_pipeline, k4_ds, + scene_stats, n_transform: 0, n_drawobj: 0, n_paths: 0, @@ -358,43 +419,14 @@ impl Renderer { render_ctx: &mut PietGpuRenderContext, buf_ix: usize, ) -> Result<(), Error> { - let (mut config, mut alloc) = render_ctx.stage_config(); - let n_drawobj = render_ctx.n_drawobj(); - // TODO: be more consistent in size types - let n_path = render_ctx.n_path() as usize; - self.n_paths = n_path; - self.n_transform = render_ctx.n_transform(); - self.n_drawobj = render_ctx.n_drawobj(); - self.n_pathseg = render_ctx.n_pathseg() as usize; - self.n_pathtag = render_ctx.n_pathtag(); - self.n_clip = render_ctx.n_clip(); + self.scene_stats = render_ctx.stats(); - // These constants depend on encoding and may need to be updated. - // Perhaps we can plumb these from piet-gpu-derive? - const PATH_SIZE: usize = 12; - const BIN_SIZE: usize = 8; - let width_in_tiles = self.width / TILE_W; - let height_in_tiles = self.height / TILE_H; - let tile_base = alloc; - alloc += ((n_path + 3) & !3) * PATH_SIZE; - let bin_base = alloc; - alloc += ((n_drawobj + 255) & !255) * BIN_SIZE; - let ptcl_base = alloc; - alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC; - - config.width_in_tiles = width_in_tiles as u32; - config.height_in_tiles = height_in_tiles as u32; - config.tile_alloc = tile_base as u32; - config.bin_alloc = bin_base as u32; - config.ptcl_alloc = ptcl_base as u32; unsafe { - // TODO: reallocate scene buffer if size is inadequate + self.upload_config(buf_ix)?; { let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?; render_ctx.write_scene(&mut mapped_scene); } - self.config_bufs[buf_ix].write(&[config])?; - self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?; // Upload gradient data. let ramp_data = render_ctx.get_ramp_data(); @@ -414,43 +446,14 @@ impl Renderer { scene: &EncodedSceneRef, buf_ix: usize, ) -> Result<(), Error> { - let (mut config, mut alloc) = scene.stage_config(); - let n_drawobj = scene.n_drawobj(); - // TODO: be more consistent in size types - let n_path = scene.n_path() as usize; - self.n_paths = n_path; - self.n_transform = scene.n_transform(); - self.n_drawobj = scene.n_drawobj(); - self.n_pathseg = scene.n_pathseg() as usize; - self.n_pathtag = scene.n_pathtag(); - self.n_clip = scene.n_clip(); + self.scene_stats = scene.stats(); - // These constants depend on encoding and may need to be updated. - // Perhaps we can plumb these from piet-gpu-derive? 
- const PATH_SIZE: usize = 12; - const BIN_SIZE: usize = 8; - let width_in_tiles = self.width / TILE_W; - let height_in_tiles = self.height / TILE_H; - let tile_base = alloc; - alloc += ((n_path + 3) & !3) * PATH_SIZE; - let bin_base = alloc; - alloc += ((n_drawobj + 255) & !255) * BIN_SIZE; - let ptcl_base = alloc; - alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC; - - config.width_in_tiles = width_in_tiles as u32; - config.height_in_tiles = height_in_tiles as u32; - config.tile_alloc = tile_base as u32; - config.bin_alloc = bin_base as u32; - config.ptcl_alloc = ptcl_base as u32; unsafe { - // TODO: reallocate scene buffer if size is inadequate + self.upload_config(buf_ix)?; { let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?; scene.write_scene(&mut mapped_scene); } - self.config_bufs[buf_ix].write(&[config])?; - self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?; // Upload gradient data. if !scene.ramp_data.is_empty() { @@ -464,7 +467,41 @@ impl Renderer { Ok(()) } - pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool, buf_ix: usize) { + // Note: the configuration has to be re-uploaded when the memory buffer is resized + pub(crate) unsafe fn upload_config(&mut self, buf_ix: usize) -> Result<(), Error> { + let stats = &self.scene_stats; + let n_path = stats.n_path as usize; + self.n_paths = n_path; + self.n_transform = stats.n_transform; + self.n_drawobj = stats.n_drawobj; + self.n_pathseg = stats.n_pathseg as usize; + self.n_pathtag = stats.n_pathtag; + self.n_clip = stats.n_clip; + let (mut config, alloc) = stats.config(self.width, self.height); + config.mem_size = self.memory_buf_size() as u32; + self.config_bufs[buf_ix].write(&[config])?; + let mem_header = MemoryHeader { + mem_offset: alloc as u32, + mem_error: 0, + blend_offset: 0, + }; + // Note: we could skip doing this on realloc, but probably not worth the bother + self.memory_buf_host[buf_ix].write(&[mem_header])?; + Ok(()) + } + + /// Get the size of memory needed for the allocations known in advance. + pub(crate) fn memory_size(&self, stats: &SceneStats) -> usize { + stats.config(self.width, self.height).1 + } + + /// Record the coarse part of a render pipeline. + pub unsafe fn record_coarse( + &self, + cmd_buf: &mut CmdBuf, + query_pool: &QueryPool, + buf_ix: usize, + ) { cmd_buf.copy_buffer(&self.config_bufs[buf_ix], &self.config_buf); cmd_buf.copy_buffer(&self.memory_buf_host[buf_ix], &self.memory_buf_dev); cmd_buf.memory_barrier(); @@ -558,9 +595,21 @@ impl Renderer { pass.end(); cmd_buf.end_debug_label(); cmd_buf.memory_barrier(); + } + + pub unsafe fn record_fine( + &self, + cmd_buf: &mut CmdBuf, + query_pool: &QueryPool, + query_start: u32, + ) { + cmd_buf.reset_query_pool(&query_pool); cmd_buf.begin_debug_label("Fine raster"); - let mut pass = - cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11)); + let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer( + &query_pool, + query_start, + query_start + 1, + )); pass.dispatch( &self.k4_pipeline, &self.k4_ds, @@ -577,6 +626,19 @@ impl Renderer { cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); } + pub unsafe fn record_readback(&self, cmd_buf: &mut CmdBuf) { + cmd_buf.copy_buffer(&self.memory_buf_dev, &self.memory_buf_readback); + cmd_buf.memory_barrier(); + } +
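upload_config above writes two things per frame slot: the Config (with mem_size filled in) and the initial MemoryHeader, which record_readback later copies back to the host. The header's declaration is not shown in this hunk; the following is a minimal sketch consistent with the field names used above and with the crate's other Pod structs (the actual definition lives elsewhere in the diff):

use bytemuck::{Pod, Zeroable};

// Host-side mirror of the GPU memory header (sketch; fields as used above).
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, Zeroable, Pod)]
pub struct MemoryHeader {
    pub mem_offset: u32,   // start of free dynamic memory / high-water mark
    pub mem_error: u32,    // nonzero if a stage ran out of memory
    pub blend_offset: u32, // blend stack usage, reported back by the GPU
}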
+ /// Record a render pipeline. + /// + /// This *assumes* the buffers are adequately sized. + pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool, buf_ix: usize) { + self.record_coarse(cmd_buf, query_pool, buf_ix); + self.record_fine(cmd_buf, query_pool, 10); + } + pub fn make_image( session: &Session, width: usize, @@ -636,4 +698,210 @@ impl Renderer { .unwrap() } } + + pub(crate) unsafe fn realloc_scene_if_needed( + &mut self, + session: &Session, + new_size: u64, + buf_ix: usize, + ) -> Result<(), Error> { + if new_size <= self.scene_bufs[buf_ix].size() { + return Ok(()); + } + const ALIGN: u64 = 0x10000; + let new_size = (new_size + ALIGN - 1) & ALIGN.wrapping_neg(); + println!( "reallocating scene buf[{}] {} -> {}", buf_ix, self.scene_bufs[buf_ix].size(), new_size ); + let usage_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC; + let scene_buf = session.create_buffer(new_size, usage_upload)?; + self.element_bindings[buf_ix].rebind_scene(session, &scene_buf); + session.update_buffer_descriptor(&mut self.tile_ds[buf_ix], 2, &scene_buf); + session.update_buffer_descriptor(&mut self.coarse_ds[buf_ix], 2, &scene_buf); + self.scene_bufs[buf_ix] = scene_buf; + Ok(()) + } + + /// Get the size of the memory buffer. + /// + /// This is the usable size (not including the header). + pub(crate) fn memory_buf_size(&self) -> u64 { + self.memory_buf_dev.size() - std::mem::size_of::<MemoryHeader>() as u64 + } + + pub(crate) unsafe fn realloc_memory( + &mut self, + session: &Session, + new_size: u64, + ) -> Result<(), Error> { + println!( "reallocating memory buf {} -> {}", self.memory_buf_dev.size(), new_size ); + let usage_mem_dev = BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::COPY_SRC; + let memory_buf_dev = session.create_buffer(new_size, usage_mem_dev)?; + for element_binding in &mut self.element_bindings { + element_binding.rebind_memory(session, &memory_buf_dev); + } + self.clip_binding.rebind_memory(session, &memory_buf_dev); + for tile_ds in &mut self.tile_ds { + session.update_buffer_descriptor(tile_ds, 0, &memory_buf_dev); + } + session.update_buffer_descriptor(&mut self.path_ds, 0, &memory_buf_dev); + session.update_buffer_descriptor(&mut self.backdrop_ds, 0, &memory_buf_dev); + session.update_buffer_descriptor(&mut self.bin_ds, 0, &memory_buf_dev); + for coarse_ds in &mut self.coarse_ds { + session.update_buffer_descriptor(coarse_ds, 0, &memory_buf_dev); + } + session.update_buffer_descriptor(&mut self.k4_ds, 0, &memory_buf_dev); + self.memory_buf_dev = memory_buf_dev; + Ok(()) + } + + pub(crate) fn blend_size(&self) -> u64 { + self.blend_buf.size() + } + + pub(crate) unsafe fn realloc_blend( + &mut self, + session: &Session, + new_size: u64, + ) -> Result<(), Error> { + println!( "reallocating blend buf {} -> {}", self.blend_size(), new_size ); + let usage_blend = BufferUsage::STORAGE; + let blend_buf = session.create_buffer(new_size, usage_blend)?; + session.update_buffer_descriptor(&mut self.k4_ds, 2, &blend_buf); + self.blend_buf = blend_buf; + Ok(()) + } +} + +const TRANSFORM_SIZE: usize = 24; +const PATHSEG_SIZE: usize = 52; +const PATH_BBOX_SIZE: usize = 24; +const DRAWMONOID_SIZE: usize = 16; +const DRAW_BBOX_SIZE: usize = 16; +const DRAWTAG_SIZE: usize = 4; +const ANNOTATED_SIZE: usize = 40; + +impl SceneStats { + pub(crate) fn scene_size(&self) -> usize { + align_up(self.n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE + self.drawdata_len + align_up(self.n_transform, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE + self.linewidth_len + align_up(self.n_pathtag, PATHSEG_PART_SIZE as usize) +
self.pathseg_len + } + + /// Return a config for a scene with these stats. + /// + /// Also returns the beginning of free (dynamic) memory. + fn config(&self, width: usize, height: usize) -> (Config, usize) { + // Layout of scene buffer + let drawtag_offset = 0; + let n_drawobj = self.n_drawobj; + let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize); + let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE; + let trans_offset = drawdata_offset + self.drawdata_len; + let n_trans = self.n_transform; + let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize); + let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE; + let pathtag_offset = linewidth_offset + self.linewidth_len; + let n_pathtag = self.n_pathtag; + let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize); + let pathseg_offset = pathtag_offset + n_pathtag_padded; + + // Layout of memory + let mut alloc = 0; + let trans_alloc = alloc; + alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE; + let pathseg_alloc = alloc; + alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE; + let path_bbox_alloc = alloc; + let n_path = self.n_path as usize; + alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE; + let drawmonoid_alloc = alloc; + alloc += n_drawobj_padded * DRAWMONOID_SIZE; + let anno_alloc = alloc; + alloc += n_drawobj * ANNOTATED_SIZE; + let clip_alloc = alloc; + let n_clip = self.n_clip as usize; + const CLIP_SIZE: usize = 4; + alloc += n_clip * CLIP_SIZE; + let clip_bic_alloc = alloc; + const CLIP_BIC_SIZE: usize = 8; + // This can round down, as we only reduce the prefix + alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE; + let clip_stack_alloc = alloc; + const CLIP_EL_SIZE: usize = 20; + alloc += n_clip * CLIP_EL_SIZE; + let clip_bbox_alloc = alloc; + const CLIP_BBOX_SIZE: usize = 16; + alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE; + let draw_bbox_alloc = alloc; + alloc += n_drawobj * DRAW_BBOX_SIZE; + let drawinfo_alloc = alloc; + // TODO: not optimized; it can be accumulated during encoding or summed from drawtags + const MAX_DRAWINFO_SIZE: usize = 44; + alloc += n_drawobj * MAX_DRAWINFO_SIZE; + + // These constants depend on encoding and may need to be updated. 
+ const PATH_SIZE: usize = 12; + const BIN_SIZE: usize = 8; + let width_in_tiles = width / TILE_W; + let height_in_tiles = height / TILE_H; + let tile_base = alloc; + alloc += ((n_path + 3) & !3) * PATH_SIZE; + let bin_base = alloc; + alloc += ((n_drawobj + 255) & !255) * BIN_SIZE; + let ptcl_base = alloc; + alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC; + + let config = Config { + mem_size: 0, // to be filled in later + n_elements: n_drawobj as u32, + n_pathseg: self.n_pathseg, + pathseg_alloc: pathseg_alloc as u32, + anno_alloc: anno_alloc as u32, + trans_alloc: trans_alloc as u32, + path_bbox_alloc: path_bbox_alloc as u32, + drawmonoid_alloc: drawmonoid_alloc as u32, + clip_alloc: clip_alloc as u32, + clip_bic_alloc: clip_bic_alloc as u32, + clip_stack_alloc: clip_stack_alloc as u32, + clip_bbox_alloc: clip_bbox_alloc as u32, + draw_bbox_alloc: draw_bbox_alloc as u32, + drawinfo_alloc: drawinfo_alloc as u32, + n_trans: n_trans as u32, + n_path: self.n_path, + n_clip: self.n_clip, + trans_offset: trans_offset as u32, + linewidth_offset: linewidth_offset as u32, + pathtag_offset: pathtag_offset as u32, + pathseg_offset: pathseg_offset as u32, + drawtag_offset: drawtag_offset as u32, + drawdata_offset: drawdata_offset as u32, + width_in_tiles: width_in_tiles as u32, + height_in_tiles: height_in_tiles as u32, + tile_alloc: tile_base as u32, + bin_alloc: bin_base as u32, + ptcl_alloc: ptcl_base as u32, + }; + + (config, alloc) + } +} + +fn align_up(x: usize, align: usize) -> usize { + debug_assert!(align.is_power_of_two()); + (x + align - 1) & !(align - 1) }
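The align_up helper just added is what pads every per-element array in scene_size and config to the partition size of the stage that consumes it. A quick worked example (a partition size of 256 is assumed here for illustration; the real values are the *_PART_SIZE constants from the stage modules):

fn align_up(x: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two());
    (x + align - 1) & !(align - 1)
}

fn main() {
    // Padding a draw-object count to an assumed partition size of 256:
    assert_eq!(align_up(0, 256), 0);
    assert_eq!(align_up(1, 256), 256);
    assert_eq!(align_up(256, 256), 256);
    assert_eq!(align_up(257, 256), 512);
}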
diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index 5d4ffd3..caef303 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -4,7 +4,7 @@ const DO_SRGB_CONVERSION: bool = false; use std::borrow::Cow; use crate::encoder::GlyphEncoder; -use crate::stages::{Config, Transform}; +use crate::stages::Transform; use piet::kurbo::{Affine, PathEl, Point, Rect, Shape}; use piet::{ Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext, @@ -18,7 +18,7 @@ use piet_gpu_types::scene::Element; use crate::gradient::{Colrv1RadialGradient, LinearGradient, RadialGradient, RampCache}; use crate::text::Font; pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder}; -use crate::Blend; +use crate::{Blend, SceneStats}; pub struct PietGpuImage; @@ -95,44 +95,15 @@ impl PietGpuRenderContext { } } - pub fn stage_config(&self) -> (Config, usize) { - self.new_encoder.stage_config() - } - - /// Number of draw objects. - /// - /// This is for the new element processing pipeline. It's not necessarily the - /// same as the number of paths (as in the old pipeline), but it might take a - /// while to sort that out. - pub fn n_drawobj(&self) -> usize { - self.new_encoder.n_drawobj() - } - - /// Number of paths. - pub fn n_path(&self) -> u32 { - self.new_encoder.n_path() - } - - pub fn n_pathseg(&self) -> u32 { - self.new_encoder.n_pathseg() - } - - pub fn n_pathtag(&self) -> usize { - self.new_encoder.n_pathtag() - } - - pub fn n_transform(&self) -> usize { - self.new_encoder.n_transform() - } - - pub fn n_clip(&self) -> u32 { - self.new_encoder.n_clip() + pub(crate) fn stats(&self) -> SceneStats { + self.new_encoder.stats() } pub fn write_scene(&self, buf: &mut BufWrite) { self.new_encoder.write_scene(buf); } + // TODO: delete pub fn get_scene_buf(&mut self) -> &[u8] { const ALIGN: usize = 128; let padded_size = (self.elements.len() + (ALIGN - 1)) & ALIGN.wrapping_neg(); @@ -194,7 +165,6 @@ impl RenderContext for PietGpuRenderContext { let rad = self.ramp_cache.add_radial_gradient(&rad); Ok(PietGpuBrush::RadGradient(rad)) } - _ => todo!("don't do radial gradients yet"), } } diff --git a/piet-gpu/src/render_driver.rs b/piet-gpu/src/render_driver.rs new file mode 100644 index 0000000..98dff0c --- /dev/null +++ b/piet-gpu/src/render_driver.rs @@ -0,0 +1,332 @@ +// Copyright 2022 The piet-gpu authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +use bytemuck::Pod; +use piet_gpu_hal::{CmdBuf, Error, Image, QueryPool, Semaphore, Session, SubmittedCmdBuf}; + +use crate::{EncodedSceneRef, MemoryHeader, PietGpuRenderContext, Renderer, SceneStats}; + +/// Additional logic for sequencing rendering operations, specifically +/// for handling failure and reallocation. +/// +/// It may be that this shouldn't be a separate object from Renderer. +pub struct RenderDriver { + frames: Vec<RenderFrame>, + renderer: Renderer, + buf_ix: usize, + /// The index of a pending fine rasterization submission. + pending: Option<usize>, +} + +pub struct TargetState<'a> { + pub cmd_buf: &'a mut CmdBuf, + pub image: &'a Image, +} + +#[derive(Default, Debug)] +pub struct TimingStats { + coarse: Vec<f64>, + fine: Vec<f64>, +} + +struct RenderFrame { + cmd_buf: CmdBufState, + coarse_query_pool: QueryPool, + fine_query_pool: QueryPool, + timing_stats: TimingStats, +} + +enum CmdBufState { + Start, + Submitted(SubmittedCmdBuf), + Ready(CmdBuf), +}
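CmdBufState is a small state machine: Start before any allocation, Ready when a command buffer is available for recording, Submitted while one is in flight; the impl further down drives the transitions with std::mem::take. A minimal sketch of that take-and-replace pattern, with the piet-gpu-hal types stubbed out as placeholders:

struct CmdBuf;          // stand-in for piet_gpu_hal::CmdBuf
struct SubmittedCmdBuf; // stand-in for piet_gpu_hal::SubmittedCmdBuf

enum State {
    Start,
    Ready(CmdBuf),
    Submitted(SubmittedCmdBuf),
}

impl Default for State {
    // std::mem::take needs a Default value to leave behind, hence this impl.
    fn default() -> Self {
        State::Start
    }
}

impl State {
    fn submit(&mut self) {
        // Moving the CmdBuf out of &mut self requires swapping Start in first.
        if let State::Ready(_cmd_buf) = std::mem::take(self) {
            // In the real code, session.run_cmd_buf(_cmd_buf, ...) happens here.
            *self = State::Submitted(SubmittedCmdBuf);
        }
    }
}

fn main() {
    let mut state = State::Ready(CmdBuf);
    state.submit();
    assert!(matches!(state, State::Submitted(_)));
}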
+impl RenderDriver { + /// Create a new render driver. + /// + /// Should probably be fallible. + /// + /// We can get n from the renderer as well. + pub fn new(session: &Session, n: usize, renderer: Renderer) -> RenderDriver { + let frames = (0..n) + .map(|_| { + // Maybe should allocate here so it doesn't happen on first frame? + let cmd_buf = CmdBufState::default(); + let coarse_query_pool = session.create_query_pool(Renderer::COARSE_QUERY_POOL_SIZE)?; + let fine_query_pool = session.create_query_pool(Renderer::FINE_QUERY_POOL_SIZE)?; + Ok(RenderFrame { + cmd_buf, + coarse_query_pool, + fine_query_pool, + timing_stats: TimingStats::default(), + }) + }) + .collect::<Result<Vec<_>, Error>>() + .unwrap(); + RenderDriver { frames, renderer, buf_ix: 0, pending: None, } + } + + pub fn upload_render_ctx( + &mut self, + session: &Session, + render_ctx: &mut PietGpuRenderContext, + ) -> Result<(), Error> { + let stats = render_ctx.stats(); + self.ensure_scene_buffers(session, &stats)?; + self.renderer.upload_render_ctx(render_ctx, self.buf_ix) + } + + pub fn upload_scene<T: Copy + Pod>( + &mut self, + session: &Session, + scene: &EncodedSceneRef<T>, + ) -> Result<(), Error> { + let stats = scene.stats(); + self.ensure_scene_buffers(session, &stats)?; + self.renderer.upload_scene(scene, self.buf_ix) + } + + fn ensure_scene_buffers(&mut self, session: &Session, stats: &SceneStats) -> Result<(), Error> { + let scene_size = stats.scene_size(); + unsafe { + self.renderer + .realloc_scene_if_needed(session, scene_size as u64, self.buf_ix)?; + } + let memory_size = self.renderer.memory_size(&stats); + // TODO: better estimate of additional memory needed + // Note: if we were to cover the worst-case binning output, we could make the + // binning stage infallible and cut checking logic. It also may not be a bad + // estimate for the rest. + let estimated_needed = memory_size as u64 + (1 << 20); + if estimated_needed > self.renderer.memory_buf_size() { + if let Some(pending) = self.pending.take() { + // There might be a fine rasterization task that binds the memory buffer + // still in flight. + self.frames[pending].cmd_buf.wait(); + } + unsafe { + self.renderer.realloc_memory(session, estimated_needed)?; + } + } + Ok(()) + } + + /// Run one try of the coarse rendering pipeline. + pub(crate) fn try_run_coarse(&mut self, session: &Session) -> Result<MemoryHeader, Error> { + let frame = &mut self.frames[self.buf_ix]; + let cmd_buf = frame.cmd_buf.cmd_buf(session)?; + unsafe { + cmd_buf.begin(); + // TODO: probably want to return query results as well + self.renderer + .record_coarse(cmd_buf, &frame.coarse_query_pool, self.buf_ix); + self.renderer.record_readback(cmd_buf); + let cmd_buf = frame.cmd_buf.cmd_buf(session)?; + cmd_buf.finish_timestamps(&frame.coarse_query_pool); + cmd_buf.host_barrier(); + cmd_buf.finish(); + frame.cmd_buf.submit(session, &[], &[])?; + frame.cmd_buf.wait(); + frame.timing_stats.coarse = session.fetch_query_pool(&frame.coarse_query_pool)?; + let mut result = Vec::new(); + // TODO: consider read method for single POD value + self.renderer.memory_buf_readback.read(&mut result)?; + Ok(result[0]) + } + }
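try_run_coarse submits, waits, and returns the header the GPU wrote back, which lets run_coarse (next) loop until the coarse pipeline fits in memory. The control flow reduces to the skeleton below (placeholder closures stand in for the session and renderer calls):

use std::cell::Cell;

// Skeleton of the grow-and-retry loop in run_coarse (sketch only).
fn run_until_fit(
    mut try_once: impl FnMut() -> (u32, u32), // returns (mem_error, mem_offset)
    mut grow: impl FnMut(u64),                // reallocates the memory buffer
) {
    loop {
        let (mem_error, mem_offset) = try_once();
        if mem_error == 0 {
            return; // coarse pipeline completed within the current buffer
        }
        // Grow past the observed high-water mark and re-run the whole pipeline.
        grow(mem_offset as u64 + 4096);
    }
}

fn main() {
    let size = Cell::new(8u64); // pretend memory buffer size
    run_until_fit(
        || if size.get() < 16 { (1, 16) } else { (0, 16) },
        |new_size| size.set(new_size),
    );
    assert_eq!(size.get(), 4112); // grew once: 16 + 4096
}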
+ /// Run the coarse render pipeline, ensuring enough memory for intermediate buffers. + pub fn run_coarse(&mut self, session: &Session) -> Result<(), Error> { + loop { + let mem_header = self.try_run_coarse(session)?; + //println!("{:?}", mem_header); + if mem_header.mem_error == 0 { + let blend_needed = mem_header.blend_offset as u64; + if blend_needed > self.renderer.blend_size() { + unsafe { + self.renderer.realloc_blend(session, blend_needed)?; + } + } + return Ok(()); + } + // Not enough memory, reallocate and retry. + // TODO: be smarter (multiplier for early stages) + let mem_size = mem_header.mem_offset + 4096; + // Safety rationalization: no command buffers containing the buffer are + // in flight. + unsafe { + self.renderer.realloc_memory(session, mem_size.into())?; + self.renderer.upload_config(self.buf_ix)?; + } + } + } + + /// Record the fine rasterizer, leaving the command buffer open. + pub fn record_fine(&mut self, session: &Session) -> Result<TargetState, Error> { + let frame = &mut self.frames[self.buf_ix]; + let cmd_buf = frame.cmd_buf.cmd_buf(session)?; + unsafe { + cmd_buf.begin(); + self.renderer + .record_fine(cmd_buf, &frame.fine_query_pool, 0); + } + let image = &self.renderer.image_dev; + Ok(TargetState { cmd_buf, image }) + } + + /// Submit the current command buffer. + pub fn submit( + &mut self, + session: &Session, + wait_semaphores: &[&Semaphore], + signal_semaphores: &[&Semaphore], + ) -> Result<(), Error> { + let frame = &mut self.frames[self.buf_ix]; + let cmd_buf = frame.cmd_buf.cmd_buf(session)?; + unsafe { + cmd_buf.finish_timestamps(&frame.fine_query_pool); + cmd_buf.host_barrier(); + cmd_buf.finish(); + frame + .cmd_buf + .submit(session, wait_semaphores, signal_semaphores)? + } + self.pending = Some(self.buf_ix); + Ok(()) + } + + unsafe fn wait_frame(&mut self, session: &Session, buf_ix: usize) { + let frame = &mut self.frames[buf_ix]; + frame.cmd_buf.wait(); + if let Ok(stats) = session.fetch_query_pool(&frame.fine_query_pool) { + frame.timing_stats.fine = stats; + } + if self.pending == Some(buf_ix) { + self.pending = None; + } + } + + pub unsafe fn wait(&mut self, session: &Session) { + self.wait_frame(session, self.buf_ix); + } + + /// Move to the next buffer. + pub fn next_buffer(&mut self) { + self.buf_ix = (self.buf_ix + 1) % self.frames.len() + } + + pub unsafe fn get_timing_stats(&mut self, session: &Session, buf_ix: usize) -> &TimingStats { + self.wait_frame(session, buf_ix); + &self.frames[buf_ix].timing_stats + } + + pub fn wait_all(&mut self, session: &Session) { + for buf_ix in 0..self.frames.len() { + unsafe { + self.wait_frame(session, buf_ix); + } + } + } +} + +impl Default for CmdBufState { + fn default() -> Self { + CmdBufState::Start + } +} + +impl CmdBufState { + /// Get a command buffer suitable for recording. + /// + /// If the command buffer is submitted, wait.
+ fn cmd_buf(&mut self, session: &Session) -> Result<&mut CmdBuf, Error> { + if let CmdBufState::Ready(cmd_buf) = self { + return Ok(cmd_buf); + } + if let CmdBufState::Submitted(submitted) = std::mem::take(self) { + if let Ok(Some(cmd_buf)) = submitted.wait() { + *self = CmdBufState::Ready(cmd_buf); + } + } + if matches!(self, CmdBufState::Start) { + *self = CmdBufState::Ready(session.cmd_buf()?); + } + if let CmdBufState::Ready(cmd_buf) = self { + Ok(cmd_buf) + } else { + unreachable!() + } + } + + unsafe fn submit( + &mut self, + session: &Session, + wait_semaphores: &[&Semaphore], + signal_semaphores: &[&Semaphore], + ) -> Result<(), Error> { + if let CmdBufState::Ready(cmd_buf) = std::mem::take(self) { + let submitted = session.run_cmd_buf(cmd_buf, wait_semaphores, signal_semaphores)?; + *self = CmdBufState::Submitted(submitted); + Ok(()) + } else { + Err("Tried to submit CmdBufState not in ready state".into()) + } + } + + fn wait(&mut self) { + if matches!(self, CmdBufState::Submitted(_)) { + if let CmdBufState::Submitted(submitted) = std::mem::take(self) { + if let Ok(Some(cmd_buf)) = submitted.wait() { + *self = CmdBufState::Ready(cmd_buf); + } + } + } + } +} + +impl TimingStats { + pub fn print_summary(&self) { + let ts = &self.coarse; + println!("Element time: {:.3}ms", ts[0] * 1e3); + println!("Clip + bin + tile time: {:.3}ms", (ts[2] - ts[1]) * 1e3); + println!("Coarse path time: {:.3}ms", (ts[4] - ts[2]) * 1e3); + println!("Backdrop time: {:.3}ms", (ts[6] - ts[5]) * 1e3); + println!("Coarse raster kernel time: {:.3}ms", (ts[8] - ts[7]) * 1e3); + println!("Fine kernel time: {:.3}ms", self.fine[0] * 1e3); + } + + pub fn short_summary(&self) -> String { + let ts = &self.coarse; + let el = ts[0] * 1e3; + let cl = (ts[2] - ts[1]) * 1e3; + let cp = (ts[4] - ts[3]) * 1e3; + let bd = (ts[6] - ts[5]) * 1e3; + let cr = (ts[8] - ts[7]) * 1e3; + let fr = self.fine[0] * 1e3; + let total = el + cl + cp + bd + cr + fr; + format!( + "{:.3}ms :: el:{:.3}ms|cl:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|cr:{:.3}ms|fr:{:.3}ms", + total, el, cl, cp, bd, cr, fr + ) + } +} diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs index 5442ba3..fd85776 100644 --- a/piet-gpu/src/stages.rs +++ b/piet-gpu/src/stages.rs @@ -37,6 +37,7 @@ pub use transform::{ #[repr(C)] #[derive(Clone, Copy, Default, Debug, Zeroable, Pod)] pub struct Config { + pub mem_size: u32, pub n_elements: u32, // paths pub n_pathseg: u32, pub width_in_tiles: u32, @@ -167,3 +168,17 @@ impl ElementStage { .record(pass, &code.draw_code, &binding.draw_binding, n_drawobj); } } + +impl ElementBinding { + pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) { + self.transform_binding.rebind_memory(session, memory); + self.path_binding.rebind_memory(session, memory); + self.draw_binding.rebind_memory(session, memory); + } + + pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) { + self.transform_binding.rebind_scene(session, scene); + self.path_binding.rebind_scene(session, scene); + self.draw_binding.rebind_scene(session, scene); + } +} diff --git a/piet-gpu/src/stages/clip.rs b/piet-gpu/src/stages/clip.rs index b7b77eb..bb9998f 100644 --- a/piet-gpu/src/stages/clip.rs +++ b/piet-gpu/src/stages/clip.rs @@ -93,4 +93,9 @@ impl ClipBinding { pass.memory_barrier(); } } + + pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) { + session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory); + session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory); + } } diff --git 
a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs index f0ee2b6..8e55f95 100644 --- a/piet-gpu/src/stages/draw.rs +++ b/piet-gpu/src/stages/draw.rs @@ -163,3 +163,15 @@ impl DrawStage { ); } } + +impl DrawBinding { + pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) { + session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory); + session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory); + } + + pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) { + session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene); + session.update_buffer_descriptor(&mut self.leaf_ds, 2, scene); + } +} diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs index be33041..312358e 100644 --- a/piet-gpu/src/stages/path.rs +++ b/piet-gpu/src/stages/path.rs @@ -200,6 +200,19 @@ impl PathStage { } } +impl PathBinding { + pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) { + session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory); + session.update_buffer_descriptor(&mut self.clear_ds, 0, memory); + session.update_buffer_descriptor(&mut self.path_ds, 0, memory); + } + + pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) { + session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene); + session.update_buffer_descriptor(&mut self.path_ds, 2, scene); + } +} + pub struct PathEncoder<'a> { tag_stream: &'a mut Vec<u8>, // If we're never going to use the i16 encoding, it might be diff --git a/piet-gpu/src/stages/transform.rs b/piet-gpu/src/stages/transform.rs index 8de7cee..43b68df 100644 --- a/piet-gpu/src/stages/transform.rs +++ b/piet-gpu/src/stages/transform.rs @@ -166,6 +166,18 @@ impl TransformStage { } } +impl TransformBinding { + pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) { + session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory); + session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory); + } + + pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) { + session.update_buffer_descriptor(&mut self.reduce_ds, 2, scene); + session.update_buffer_descriptor(&mut self.leaf_ds, 2, scene); + } +} + impl Transform { pub const IDENTITY: Transform = Transform { mat: [1.0, 0.0, 0.0, 1.0],
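One convention ties all of these rebind impls together: in every descriptor set that touches them, binding 0 is the memory buffer and binding 2 is the scene buffer, so reallocation only has to walk the descriptor sets and refresh those two slots. A self-contained toy analogue of that convention (placeholder types; the real calls are session.update_buffer_descriptor as above):

// Toy stand-in for a descriptor set: buffer ids indexed by binding slot.
struct DescriptorSet {
    bindings: Vec<u32>,
}

const MEMORY_SLOT: usize = 0; // matches update_buffer_descriptor(.., 0, memory)
const SCENE_SLOT: usize = 2;  // matches update_buffer_descriptor(.., 2, scene)

fn rebind_memory(ds: &mut DescriptorSet, new_memory_buf: u32) {
    ds.bindings[MEMORY_SLOT] = new_memory_buf;
}

fn rebind_scene(ds: &mut DescriptorSet, new_scene_buf: u32) {
    ds.bindings[SCENE_SLOT] = new_scene_buf;
}

fn main() {
    let mut ds = DescriptorSet { bindings: vec![0; 3] };
    rebind_memory(&mut ds, 7); // after a realloc_memory
    rebind_scene(&mut ds, 9);  // after a realloc_scene_if_needed
    assert_eq!(ds.bindings, vec![7, 0, 9]);
}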