merge from dev branch - dev

This commit is contained in:
Commit by GitHub Action 2022-07-14 14:28:25 +00:00
commit d529d3b0e8
22 changed files with 994 additions and 634 deletions

View file

@ -20,7 +20,7 @@ jobs:
git switch main git switch main
git config user.name "Commit by GitHub Action" git config user.name "Commit by GitHub Action"
git config user.email "nobody@example.com" git config user.email "nobody@example.com"
git merge dev -m "merge from dev branch" git merge dev -m "merge from dev branch - ${{ github.ref_name }}"
sed -i '' '/shader\/gen/d' .gitignore sed -i '' '/shader\/gen/d' .gitignore
git add .gitignore git add .gitignore
git rm -r --ignore-unmatch piet-gpu/shader/gen git rm -r --ignore-unmatch piet-gpu/shader/gen

View file

@ -20,7 +20,7 @@ use piet_gpu_hal::{
use piet::kurbo::Point; use piet::kurbo::Point;
use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder}; use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
use piet_gpu::{test_scenes, PietGpuRenderContext, Renderer}; use piet_gpu::{test_scenes, PietGpuRenderContext, RenderDriver, Renderer};
#[cfg_attr(target_os = "android", ndk_glue::main(backtrace = "on"))] #[cfg_attr(target_os = "android", ndk_glue::main(backtrace = "on"))]
fn main() { fn main() {
@ -34,12 +34,9 @@ struct MyHandle {
// State required to render and present the contents // State required to render and present the contents
struct GfxState { struct GfxState {
session: Session, session: Session,
renderer: Renderer, render_driver: RenderDriver,
swapchain: Swapchain, swapchain: Swapchain,
current_frame: usize, current_frame: usize,
submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES],
cmd_bufs: [Option<CmdBuf>; NUM_FRAMES],
query_pools: Vec<QueryPool>,
present_semaphores: Vec<Semaphore>, present_semaphores: Vec<Semaphore>,
} }
@ -110,22 +107,15 @@ impl GfxState {
let present_semaphores = (0..NUM_FRAMES) let present_semaphores = (0..NUM_FRAMES)
.map(|_| session.create_semaphore()) .map(|_| session.create_semaphore())
.collect::<Result<Vec<_>, Error>>()?; .collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(Renderer::QUERY_POOL_SIZE))
.collect::<Result<Vec<_>, Error>>()?;
let submitted = Default::default();
let cmd_bufs = Default::default();
let renderer = Renderer::new(&session, width, height, NUM_FRAMES)?; let renderer = Renderer::new(&session, width, height, NUM_FRAMES)?;
let render_driver = RenderDriver::new(&session, NUM_FRAMES, renderer);
Ok(GfxState { Ok(GfxState {
session, session,
renderer, render_driver,
swapchain, swapchain,
current_frame, current_frame,
submitted,
cmd_bufs,
query_pools,
present_semaphores, present_semaphores,
}) })
} }
@ -137,51 +127,47 @@ impl GfxState {
let frame_idx = self.current_frame % NUM_FRAMES; let frame_idx = self.current_frame % NUM_FRAMES;
let mut info_string = String::new(); let mut info_string = String::new();
if let Some(submitted) = self.submitted[frame_idx].take() { if self.current_frame >= NUM_FRAMES {
self.cmd_bufs[frame_idx] = submitted.wait().unwrap(); let stats = self
let ts = self .render_driver
.session .get_timing_stats(&self.session, frame_idx);
.fetch_query_pool(&self.query_pools[frame_idx]) info_string = stats.short_summary();
.unwrap(); println!("{}", info_string);
info_string = format!("{:.1}ms", ts.last().unwrap() * 1e3);
println!("render time: {:?}", ts);
} }
let mut ctx = PietGpuRenderContext::new(); let mut ctx = PietGpuRenderContext::new();
test_scenes::render_anim_frame(&mut ctx, self.current_frame); test_scenes::render_anim_frame(&mut ctx, self.current_frame);
//test_scenes::render_tiger(&mut ctx); //test_scenes::render_tiger(&mut ctx);
render_info_string(&mut ctx, &info_string); render_info_string(&mut ctx, &info_string);
if let Err(e) = self.renderer.upload_render_ctx(&mut ctx, frame_idx) { if let Err(e) = self
.render_driver
.upload_render_ctx(&self.session, &mut ctx)
{
println!("error in uploading: {}", e); println!("error in uploading: {}", e);
} }
let (image_idx, acquisition_semaphore) = self.swapchain.next().unwrap(); let (image_idx, acquisition_semaphore) = self.swapchain.next().unwrap();
let swap_image = self.swapchain.image(image_idx); let swap_image = self.swapchain.image(image_idx);
let query_pool = &self.query_pools[frame_idx]; self.render_driver.run_coarse(&self.session).unwrap();
let mut cmd_buf = self.cmd_bufs[frame_idx] let target = self.render_driver.record_fine(&self.session).unwrap();
.take() let cmd_buf = target.cmd_buf;
.unwrap_or_else(|| self.session.cmd_buf().unwrap());
cmd_buf.begin();
self.renderer.record(&mut cmd_buf, &query_pool, frame_idx);
// Image -> Swapchain // Image -> Swapchain
cmd_buf.image_barrier(&swap_image, ImageLayout::Undefined, ImageLayout::BlitDst); cmd_buf.image_barrier(&swap_image, ImageLayout::Undefined, ImageLayout::BlitDst);
cmd_buf.blit_image(&self.renderer.image_dev, &swap_image); cmd_buf.blit_image(target.image, &swap_image);
cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present); cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
cmd_buf.finish();
self.submitted[frame_idx] = Some( self.render_driver
self.session .submit(
.run_cmd_buf( &self.session,
cmd_buf,
&[&acquisition_semaphore], &[&acquisition_semaphore],
&[&self.present_semaphores[frame_idx]], &[&self.present_semaphores[frame_idx]],
) )
.unwrap(), .unwrap();
);
self.swapchain self.swapchain
.present(image_idx, &[&self.present_semaphores[frame_idx]]) .present(image_idx, &[&self.present_semaphores[frame_idx]])
.unwrap(); .unwrap();
self.render_driver.next_buffer();
self.current_frame += 1; self.current_frame += 1;
} }
} }

View file

@ -6,7 +6,7 @@ use clap::{App, Arg};
use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session}; use piet_gpu_hal::{BufferUsage, Error, Instance, InstanceFlags, Session};
use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer}; use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, RenderDriver, Renderer};
const WIDTH: usize = 2048; const WIDTH: usize = 2048;
const HEIGHT: usize = 1536; const HEIGHT: usize = 1536;
@ -231,9 +231,6 @@ fn main() -> Result<(), Error> {
let device = instance.device(None)?; let device = instance.device(None)?;
let session = Session::new(device); let session = Session::new(device);
let mut cmd_buf = session.cmd_buf()?;
let query_pool = session.create_query_pool(Renderer::QUERY_POOL_SIZE)?;
let mut ctx = PietGpuRenderContext::new(); let mut ctx = PietGpuRenderContext::new();
if let Some(input) = matches.value_of("INPUT") { if let Some(input) = matches.value_of("INPUT") {
let mut scale = matches let mut scale = matches
@ -253,40 +250,22 @@ fn main() -> Result<(), Error> {
test_scenes::render_blend_grid(&mut ctx); test_scenes::render_blend_grid(&mut ctx);
} }
let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, 1)?; let renderer = Renderer::new(&session, WIDTH, HEIGHT, 1)?;
renderer.upload_render_ctx(&mut ctx, 0)?; let mut render_driver = RenderDriver::new(&session, 1, renderer);
let start = std::time::Instant::now();
render_driver.upload_render_ctx(&session, &mut ctx)?;
let image_usage = BufferUsage::MAP_READ | BufferUsage::COPY_DST; let image_usage = BufferUsage::MAP_READ | BufferUsage::COPY_DST;
let image_buf = session.create_buffer((WIDTH * HEIGHT * 4) as u64, image_usage)?; let image_buf = session.create_buffer((WIDTH * HEIGHT * 4) as u64, image_usage)?;
cmd_buf.begin(); render_driver.run_coarse(&session)?;
renderer.record(&mut cmd_buf, &query_pool, 0); let target = render_driver.record_fine(&session)?;
cmd_buf.copy_image_to_buffer(&renderer.image_dev, &image_buf); target
cmd_buf.finish_timestamps(&query_pool); .cmd_buf
cmd_buf.host_barrier(); .copy_image_to_buffer(target.image, &image_buf);
cmd_buf.finish(); render_driver.submit(&session, &[], &[])?;
let start = std::time::Instant::now(); render_driver.wait(&session);
let submitted = session.run_cmd_buf(cmd_buf, &[], &[])?;
submitted.wait()?;
println!("elapsed = {:?}", start.elapsed()); println!("elapsed = {:?}", start.elapsed());
let ts = session.fetch_query_pool(&query_pool).unwrap(); render_driver.get_timing_stats(&session, 0).print_summary();
if !ts.is_empty() {
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
println!(
"Tile allocation kernel time: {:.3}ms",
(ts[1] - ts[0]) * 1e3
);
println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
println!("Backdrop kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
println!("Binning kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3);
println!("Coarse raster kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
println!("Render kernel time: {:.3}ms", (ts[6] - ts[5]) * 1e3);
}
/*
let mut data: Vec<u32> = Default::default();
renderer.memory_buf_dev.read(&mut data).unwrap();
piet_gpu::dump_k1_data(&data[2..]);
*/
let mut img_data: Vec<u8> = Default::default(); let mut img_data: Vec<u8> = Default::default();
// Note: because png can use a `&[u8]` slice, we could avoid an extra copy // Note: because png can use a `&[u8]` slice, we could avoid an extra copy

View file

@ -1,8 +1,8 @@
use piet::kurbo::Point; use piet::kurbo::Point;
use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder}; use piet::{RenderContext, Text, TextAttribute, TextLayoutBuilder};
use piet_gpu_hal::{CmdBuf, Error, ImageLayout, Instance, Session, SubmittedCmdBuf}; use piet_gpu_hal::{Error, ImageLayout, Instance, Session};
use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, Renderer}; use piet_gpu::{test_scenes, PicoSvg, PietGpuRenderContext, RenderDriver, Renderer};
use clap::{App, Arg}; use clap::{App, Arg};
@ -69,13 +69,9 @@ fn main() -> Result<(), Error> {
let present_semaphores = (0..NUM_FRAMES) let present_semaphores = (0..NUM_FRAMES)
.map(|_| session.create_semaphore()) .map(|_| session.create_semaphore())
.collect::<Result<Vec<_>, Error>>()?; .collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES)
.map(|_| session.create_query_pool(Renderer::QUERY_POOL_SIZE))
.collect::<Result<Vec<_>, Error>>()?;
let mut cmd_bufs: [Option<CmdBuf>; NUM_FRAMES] = Default::default();
let mut submitted: [Option<SubmittedCmdBuf>; NUM_FRAMES] = Default::default();
let mut renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?; let renderer = Renderer::new(&session, WIDTH, HEIGHT, NUM_FRAMES)?;
let mut render_driver = RenderDriver::new(&session, NUM_FRAMES, renderer);
let mut mode = 0usize; let mut mode = 0usize;
event_loop.run(move |event, _, control_flow| { event_loop.run(move |event, _, control_flow| {
@ -106,26 +102,13 @@ fn main() -> Result<(), Error> {
Event::RedrawRequested(window_id) if window_id == window.id() => { Event::RedrawRequested(window_id) if window_id == window.id() => {
let frame_idx = current_frame % NUM_FRAMES; let frame_idx = current_frame % NUM_FRAMES;
if let Some(submitted) = submitted[frame_idx].take() { if current_frame >= NUM_FRAMES {
cmd_bufs[frame_idx] = submitted.wait().unwrap(); let stats = render_driver.get_timing_stats(&session, frame_idx);
let ts = session.fetch_query_pool(&query_pools[frame_idx]).unwrap(); info_string = stats.short_summary();
if !ts.is_empty() {
info_string = format!(
"{:.3}ms :: e:{:.3}ms|alloc:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|bin:{:.3}ms|cr:{:.3}ms|r:{:.3}ms",
ts[10] * 1e3,
ts[0] * 1e3,
(ts[1] - ts[0]) * 1e3,
(ts[2] - ts[1]) * 1e3,
(ts[4] - ts[3]) * 1e3,
(ts[6] - ts[5]) * 1e3,
(ts[8] - ts[7]) * 1e3,
(ts[10] - ts[9]) * 1e3,
);
}
} }
let mut ctx = PietGpuRenderContext::new(); let mut ctx = PietGpuRenderContext::new();
let test_blend = true; let test_blend = false;
if let Some(svg) = &svg { if let Some(svg) = &svg {
test_scenes::render_svg(&mut ctx, svg); test_scenes::render_svg(&mut ctx, svg);
} else if test_blend { } else if test_blend {
@ -168,16 +151,15 @@ fn main() -> Result<(), Error> {
test_scenes::render_anim_frame(&mut ctx, current_frame); test_scenes::render_anim_frame(&mut ctx, current_frame);
} }
render_info_string(&mut ctx, &info_string); render_info_string(&mut ctx, &info_string);
if let Err(e) = renderer.upload_render_ctx(&mut ctx, frame_idx) { if let Err(e) = render_driver.upload_render_ctx(&session, &mut ctx) {
println!("error in uploading: {}", e); println!("error in uploading: {}", e);
} }
let (image_idx, acquisition_semaphore) = swapchain.next().unwrap(); let (image_idx, acquisition_semaphore) = swapchain.next().unwrap();
let swap_image = swapchain.image(image_idx); let swap_image = swapchain.image(image_idx);
let query_pool = &query_pools[frame_idx]; render_driver.run_coarse(&session).unwrap();
let mut cmd_buf = cmd_bufs[frame_idx].take().unwrap_or_else(|| session.cmd_buf().unwrap()); let target = render_driver.record_fine(&session).unwrap();
cmd_buf.begin(); let cmd_buf = target.cmd_buf;
renderer.record(&mut cmd_buf, &query_pool, frame_idx);
// Image -> Swapchain // Image -> Swapchain
cmd_buf.image_barrier( cmd_buf.image_barrier(
@ -185,32 +167,25 @@ fn main() -> Result<(), Error> {
ImageLayout::Undefined, ImageLayout::Undefined,
ImageLayout::BlitDst, ImageLayout::BlitDst,
); );
cmd_buf.blit_image(&renderer.image_dev, &swap_image); cmd_buf.blit_image(target.image, &swap_image);
cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present); cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
cmd_buf.finish(); render_driver
.submit(
submitted[frame_idx] = Some(session &session,
.run_cmd_buf(
cmd_buf,
&[&acquisition_semaphore], &[&acquisition_semaphore],
&[&present_semaphores[frame_idx]], &[&present_semaphores[frame_idx]],
) )
.unwrap()); .unwrap();
swapchain swapchain
.present(image_idx, &[&present_semaphores[frame_idx]]) .present(image_idx, &[&present_semaphores[frame_idx]])
.unwrap(); .unwrap();
render_driver.next_buffer();
current_frame += 1; current_frame += 1;
} }
Event::LoopDestroyed => { Event::LoopDestroyed => {
for cmd_buf in &mut submitted { render_driver.wait_all(&session);
// Wait for command list submission, otherwise dropping of renderer may
// cause validation errors (and possibly crashes).
if let Some(cmd_buf) = cmd_buf.take() {
cmd_buf.wait().unwrap();
}
}
} }
_ => (), _ => (),
} }

View file

@ -45,12 +45,15 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
shared uint sh_row_width[BACKDROP_WG]; shared uint sh_row_width[BACKDROP_WG];
void main() { void main() {
if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
return;
}
uint th_ix = gl_LocalInvocationIndex; uint th_ix = gl_LocalInvocationIndex;
uint element_ix = gl_GlobalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x;
// Work assignment: 1 thread : 1 path element // Work assignment: 1 thread : 1 path element
uint row_count = 0; uint row_count = 0;
bool mem_ok = mem_error == NO_ERROR;
if (gl_LocalInvocationID.y == 0) { if (gl_LocalInvocationID.y == 0) {
if (element_ix < conf.n_elements) { if (element_ix < conf.n_elements) {
// Possible TODO: it's not necessary to process backdrops of stroked paths. // Possible TODO: it's not necessary to process backdrops of stroked paths.
@ -68,7 +71,7 @@ void main() {
row_count = 0; row_count = 0;
} }
Alloc path_alloc = new_alloc( Alloc path_alloc = new_alloc(
path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
sh_row_alloc[th_ix] = path_alloc; sh_row_alloc[th_ix] = path_alloc;
} }
sh_row_count[th_ix] = row_count; sh_row_count[th_ix] = row_count;
@ -98,7 +101,7 @@ void main() {
} }
} }
uint width = sh_row_width[el_ix]; uint width = sh_row_width[el_ix];
if (width > 0 && mem_ok) { if (width > 0) {
// Process one row sequentially // Process one row sequentially
// Read backdrop value per tile and prefix sum it // Read backdrop value per tile and prefix sum it
Alloc tiles_alloc = sh_row_alloc[el_ix]; Alloc tiles_alloc = sh_row_alloc[el_ix];

View file

@ -32,8 +32,7 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf {
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps) // Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
shared uint bitmaps[N_SLICE][N_TILE]; shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE]; shared uint count[N_SLICE][N_TILE];
shared Alloc sh_chunk_alloc[N_TILE]; shared uint sh_chunk_offset[N_TILE];
shared bool sh_alloc_failed;
DrawMonoid load_draw_monoid(uint element_ix) { DrawMonoid load_draw_monoid(uint element_ix) {
uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix; uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix;
@ -84,10 +83,6 @@ void main() {
for (uint i = 0; i < N_SLICE; i++) { for (uint i = 0; i < N_SLICE; i++) {
bitmaps[i][gl_LocalInvocationID.x] = 0; bitmaps[i][gl_LocalInvocationID.x] = 0;
} }
if (gl_LocalInvocationID.x == 0) {
sh_alloc_failed = false;
}
barrier();
// Read inputs and determine coverage of bins // Read inputs and determine coverage of bins
uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x; uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
@ -148,26 +143,18 @@ void main() {
count[i][gl_LocalInvocationID.x] = element_count; count[i][gl_LocalInvocationID.x] = element_count;
} }
// element_count is number of elements covering bin for this invocation. // element_count is number of elements covering bin for this invocation.
Alloc chunk_alloc = new_alloc(0, 0, true); uint chunk_offset = 0;
if (element_count != 0) { if (element_count != 0) {
// TODO: aggregate atomic adds (subgroup is probably fastest) chunk_offset = malloc_stage(element_count * BinInstance_size, conf.mem_size, STAGE_BINNING);
MallocResult chunk = malloc(element_count * BinInstance_size); sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
chunk_alloc = chunk.alloc;
sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
if (chunk.failed) {
sh_alloc_failed = true;
}
} }
// Note: it might be more efficient for reading to do this in the // Note: it might be more efficient for reading to do this in the
// other order (each bin is a contiguous sequence of partitions) // other order (each bin is a contiguous sequence of partitions)
uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
write_mem(conf.bin_alloc, out_ix, element_count); write_mem(conf.bin_alloc, out_ix, element_count);
write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset); write_mem(conf.bin_alloc, out_ix + 1, chunk_offset);
barrier(); barrier();
if (sh_alloc_failed || mem_error != NO_ERROR) {
return;
}
// Use similar strategy as Laine & Karras paper; loop over bbox of bins // Use similar strategy as Laine & Karras paper; loop over bbox of bins
// touched by this element // touched by this element
@ -181,9 +168,10 @@ void main() {
if (my_slice > 0) { if (my_slice > 0) {
idx += count[my_slice - 1][bin_ix]; idx += count[my_slice - 1][bin_ix];
} }
Alloc out_alloc = sh_chunk_alloc[bin_ix]; uint chunk_offset = sh_chunk_offset[bin_ix];
uint out_offset = out_alloc.offset + idx * BinInstance_size; if (chunk_offset != MALLOC_FAILED) {
BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix)); memory[(chunk_offset >> 2) + idx] = element_ix;
}
} }
x++; x++;
if (x == x1) { if (x == x1) {

View file

@ -72,49 +72,62 @@ void write_tile_alloc(uint el_ix, Alloc a) {
Alloc read_tile_alloc(uint el_ix, bool mem_ok) { Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
// All memory. // All memory.
return new_alloc(0, memory.length() * 4, mem_ok); return new_alloc(0, conf.mem_size, mem_ok);
} }
#endif #endif
// The maximum number of commands per annotated element. // The maximum number of commands per annotated element.
#define ANNO_COMMANDS 2 #define ANNO_COMMANDS 2
// Perhaps cmd_alloc should be a global? This is a style question. // All writes to the output must be gated by mem_ok.
bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) { bool mem_ok = true;
// Perhaps cmd allocations should be a global? This is a style question.
void alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
if (cmd_ref.offset < cmd_limit) { if (cmd_ref.offset < cmd_limit) {
return true; return;
} }
MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC); uint new_cmd = malloc_stage(PTCL_INITIAL_ALLOC, conf.mem_size, STAGE_COARSE);
if (new_cmd.failed) { if (new_cmd == MALLOC_FAILED) {
return false; mem_ok = false;
} }
CmdJump jump = CmdJump(new_cmd.alloc.offset); if (mem_ok) {
CmdJump jump = CmdJump(new_cmd);
Cmd_Jump_write(cmd_alloc, cmd_ref, jump); Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
cmd_alloc = new_cmd.alloc; }
cmd_ref = CmdRef(cmd_alloc.offset); cmd_alloc = new_alloc(new_cmd, PTCL_INITIAL_ALLOC, true);
cmd_ref = CmdRef(new_cmd);
// Reserve space for the maximum number of commands and a potential jump. // Reserve space for the maximum number of commands and a potential jump.
cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size; cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
return true;
} }
void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) { void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) {
if (linewidth < 0.0) { if (linewidth < 0.0) {
if (tile.tile.offset != 0) { if (tile.tile.offset != 0) {
CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop); CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
if (mem_ok) {
Cmd_Fill_write(alloc, cmd_ref, cmd_fill); Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
}
cmd_ref.offset += 4 + CmdFill_size; cmd_ref.offset += 4 + CmdFill_size;
} else { } else {
if (mem_ok) {
Cmd_Solid_write(alloc, cmd_ref); Cmd_Solid_write(alloc, cmd_ref);
}
cmd_ref.offset += 4; cmd_ref.offset += 4;
} }
} else { } else {
CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth); CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
if (mem_ok) {
Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke); Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
}
cmd_ref.offset += 4 + CmdStroke_size; cmd_ref.offset += 4 + CmdStroke_size;
} }
} }
void main() { void main() {
if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
return;
}
// Could use either linear or 2d layouts for both dispatch and // Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract. // invocations within the workgroup. We'll use variables to abstract.
uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X; uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
@ -161,7 +174,6 @@ void main() {
uint drawtag_start = conf.drawtag_offset >> 2; uint drawtag_start = conf.drawtag_offset >> 2;
uint drawdata_start = conf.drawdata_offset >> 2; uint drawdata_start = conf.drawdata_offset >> 2;
uint drawinfo_start = conf.drawinfo_alloc.offset >> 2; uint drawinfo_start = conf.drawinfo_alloc.offset >> 2;
bool mem_ok = mem_error == NO_ERROR;
while (true) { while (true) {
for (uint i = 0; i < N_SLICE; i++) { for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0; sh_bitmaps[i][th_ix] = 0;
@ -176,7 +188,7 @@ void main() {
uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2; uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
count = read_mem(conf.bin_alloc, in_ix); count = read_mem(conf.bin_alloc, in_ix);
uint offset = read_mem(conf.bin_alloc, in_ix + 1); uint offset = read_mem(conf.bin_alloc, in_ix + 1);
sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, mem_ok); sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, true);
} }
// prefix sum of counts // prefix sum of counts
for (uint i = 0; i < LG_N_PART_READ; i++) { for (uint i = 0; i < LG_N_PART_READ; i++) {
@ -200,7 +212,7 @@ void main() {
} }
// use binary search to find element to read // use binary search to find element to read
uint ix = rd_ix + th_ix; uint ix = rd_ix + th_ix;
if (ix >= wr_ix && ix < ready_ix && mem_ok) { if (ix >= wr_ix && ix < ready_ix) {
uint part_ix = 0; uint part_ix = 0;
for (uint i = 0; i < LG_N_PART_READ; i++) { for (uint i = 0; i < LG_N_PART_READ; i++) {
uint probe = part_ix + (uint(N_PART_READ / 2) >> i); uint probe = part_ix + (uint(N_PART_READ / 2) >> i);
@ -257,7 +269,7 @@ void main() {
uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size; uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
sh_tile_base[th_ix] = base; sh_tile_base[th_ix] = base;
Alloc path_alloc = new_alloc(path.tiles.offset, Alloc path_alloc = new_alloc(path.tiles.offset,
(path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
write_tile_alloc(th_ix, path_alloc); write_tile_alloc(th_ix, path_alloc);
break; break;
default: default:
@ -293,8 +305,7 @@ void main() {
uint x = sh_tile_x0[el_ix] + seq_ix % width; uint x = sh_tile_x0[el_ix] + seq_ix % width;
uint y = sh_tile_y0[el_ix] + seq_ix / width; uint y = sh_tile_y0[el_ix] + seq_ix / width;
bool include_tile = false; bool include_tile = false;
if (mem_ok) { Tile tile = Tile_read(read_tile_alloc(el_ix, true),
Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok),
TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size)); TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
bool is_clip = (tag & 1) != 0; bool is_clip = (tag & 1) != 0;
// Always include the tile if it contains a path segment. // Always include the tile if it contains a path segment.
@ -313,7 +324,6 @@ void main() {
} }
include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
|| is_blend; || is_blend;
}
if (include_tile) { if (include_tile) {
uint el_slice = el_ix / 32; uint el_slice = el_ix / 32;
uint el_mask = 1u << (el_ix & 31); uint el_mask = 1u << (el_ix & 31);
@ -327,7 +337,7 @@ void main() {
// through the draw objects. // through the draw objects.
uint slice_ix = 0; uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix]; uint bitmap = sh_bitmaps[0][th_ix];
while (mem_ok) { while (true) {
if (bitmap == 0) { if (bitmap == 0) {
slice_ix++; slice_ix++;
if (slice_ix == N_SLICE) { if (slice_ix == N_SLICE) {
@ -347,7 +357,7 @@ void main() {
uint drawtag = scene[drawtag_start + element_ix]; uint drawtag = scene[drawtag_start + element_ix];
if (clip_zero_depth == 0) { if (clip_zero_depth == 0) {
Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), Tile tile = Tile_read(read_tile_alloc(element_ref_ix, true),
TileRef(sh_tile_base[element_ref_ix] + TileRef(sh_tile_base[element_ref_ix] +
(sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
uint drawmonoid_base = drawmonoid_start + 4 * element_ix; uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
@ -358,18 +368,16 @@ void main() {
switch (drawtag) { switch (drawtag) {
case Drawtag_FillColor: case Drawtag_FillColor:
float linewidth = uintBitsToFloat(memory[di]); float linewidth = uintBitsToFloat(memory[di]);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
break;
}
write_fill(cmd_alloc, cmd_ref, tile, linewidth); write_fill(cmd_alloc, cmd_ref, tile, linewidth);
uint rgba = scene[dd]; uint rgba = scene[dd];
if (mem_ok) {
Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba)); Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
}
cmd_ref.offset += 4 + CmdColor_size; cmd_ref.offset += 4 + CmdColor_size;
break; break;
case Drawtag_FillLinGradient: case Drawtag_FillLinGradient:
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
break;
}
linewidth = uintBitsToFloat(memory[di]); linewidth = uintBitsToFloat(memory[di]);
write_fill(cmd_alloc, cmd_ref, tile, linewidth); write_fill(cmd_alloc, cmd_ref, tile, linewidth);
CmdLinGrad cmd_lin; CmdLinGrad cmd_lin;
@ -377,13 +385,13 @@ void main() {
cmd_lin.line_x = uintBitsToFloat(memory[di + 1]); cmd_lin.line_x = uintBitsToFloat(memory[di + 1]);
cmd_lin.line_y = uintBitsToFloat(memory[di + 2]); cmd_lin.line_y = uintBitsToFloat(memory[di + 2]);
cmd_lin.line_c = uintBitsToFloat(memory[di + 3]); cmd_lin.line_c = uintBitsToFloat(memory[di + 3]);
if (mem_ok) {
Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin); Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
}
cmd_ref.offset += 4 + CmdLinGrad_size; cmd_ref.offset += 4 + CmdLinGrad_size;
break; break;
case Drawtag_FillRadGradient: case Drawtag_FillRadGradient:
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
break;
}
linewidth = uintBitsToFloat(memory[di]); linewidth = uintBitsToFloat(memory[di]);
write_fill(cmd_alloc, cmd_ref, tile, linewidth); write_fill(cmd_alloc, cmd_ref, tile, linewidth);
CmdRadGrad cmd_rad; CmdRadGrad cmd_rad;
@ -396,29 +404,31 @@ void main() {
cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8])); cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8]));
cmd_rad.ra = uintBitsToFloat(memory[di + 9]); cmd_rad.ra = uintBitsToFloat(memory[di + 9]);
cmd_rad.roff = uintBitsToFloat(memory[di + 10]); cmd_rad.roff = uintBitsToFloat(memory[di + 10]);
if (mem_ok) {
Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad); Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
}
cmd_ref.offset += 4 + CmdRadGrad_size; cmd_ref.offset += 4 + CmdRadGrad_size;
break; break;
case Drawtag_FillImage: case Drawtag_FillImage:
alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
linewidth = uintBitsToFloat(memory[di]); linewidth = uintBitsToFloat(memory[di]);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
write_fill(cmd_alloc, cmd_ref, tile, linewidth); write_fill(cmd_alloc, cmd_ref, tile, linewidth);
uint index = scene[dd]; uint index = scene[dd];
uint raw1 = scene[dd + 1]; uint raw1 = scene[dd + 1];
ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16); ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
if (mem_ok) {
Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset)); Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
}
cmd_ref.offset += 4 + CmdImage_size; cmd_ref.offset += 4 + CmdImage_size;
break; break;
case Drawtag_BeginClip: case Drawtag_BeginClip:
if (tile.tile.offset == 0 && tile.backdrop == 0) { if (tile.tile.offset == 0 && tile.backdrop == 0) {
clip_zero_depth = clip_depth + 1; clip_zero_depth = clip_depth + 1;
} else { } else {
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
break; if (mem_ok) {
}
Cmd_BeginClip_write(cmd_alloc, cmd_ref); Cmd_BeginClip_write(cmd_alloc, cmd_ref);
}
cmd_ref.offset += 4; cmd_ref.offset += 4;
render_blend_depth++; render_blend_depth++;
max_blend_depth = max(max_blend_depth, render_blend_depth); max_blend_depth = max(max_blend_depth, render_blend_depth);
@ -427,12 +437,11 @@ void main() {
break; break;
case Drawtag_EndClip: case Drawtag_EndClip:
clip_depth--; clip_depth--;
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}
write_fill(cmd_alloc, cmd_ref, tile, -1.0); write_fill(cmd_alloc, cmd_ref, tile, -1.0);
uint blend = scene[dd]; uint blend = scene[dd];
if (mem_ok) {
Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend)); Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
}
cmd_ref.offset += 4 + CmdEndClip_size; cmd_ref.offset += 4 + CmdEndClip_size;
render_blend_depth--; render_blend_depth--;
break; break;
@ -459,11 +468,13 @@ void main() {
break; break;
} }
if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) { if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
if (mem_ok) {
Cmd_End_write(cmd_alloc, cmd_ref); Cmd_End_write(cmd_alloc, cmd_ref);
}
if (max_blend_depth > BLEND_STACK_SPLIT) { if (max_blend_depth > BLEND_STACK_SPLIT) {
uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4; uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
MallocResult scratch = malloc(scratch_size); uint scratch = atomicAdd(blend_offset, scratch_size);
alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc); write_mem(scratch_alloc, scratch_alloc.offset >> 2, scratch);
} }
} }
} }

BIN
piet-gpu/shader/image.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 337 KiB

View file

@ -14,6 +14,7 @@
// higher quality antialiasing among other things). // higher quality antialiasing among other things).
#define DO_SRGB_CONVERSION 0 #define DO_SRGB_CONVERSION 0
// TODO: the binding of the main buffer can be readonly
#include "mem.h" #include "mem.h"
#include "setup.h" #include "setup.h"
@ -24,19 +25,23 @@
#define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y) #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK_Y)
layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in; layout(local_size_x = CHUNK_DX, local_size_y = CHUNK_DY) in;
layout(set = 0, binding = 1) restrict readonly buffer ConfigBuf { layout(binding = 1) restrict readonly buffer ConfigBuf {
Config conf; Config conf;
}; };
layout(binding = 2) buffer BlendBuf {
uint blend_mem[];
};
#ifdef GRAY #ifdef GRAY
layout(r8, set = 0, binding = 2) uniform restrict writeonly image2D image; layout(r8, binding = 3) uniform restrict writeonly image2D image;
#else #else
layout(rgba8, set = 0, binding = 2) uniform restrict writeonly image2D image; layout(rgba8, binding = 3) uniform restrict writeonly image2D image;
#endif #endif
layout(rgba8, set = 0, binding = 3) uniform restrict readonly image2D image_atlas; layout(rgba8, binding = 4) uniform restrict readonly image2D image_atlas;
layout(rgba8, set = 0, binding = 4) uniform restrict readonly image2D gradients; layout(rgba8, binding = 5) uniform restrict readonly image2D gradients;
#include "ptcl.h" #include "ptcl.h"
#include "tile.h" #include "tile.h"
@ -114,8 +119,9 @@ void main() {
mediump float area[CHUNK]; mediump float area[CHUNK];
uint clip_depth = 0; uint clip_depth = 0;
bool mem_ok = mem_error == NO_ERROR; // Previously we would early-out if there was a memory failure, so we wouldn't try to read corrupt
while (mem_ok) { // tiles. But now we assume this is checked CPU-side before launching fine rasterization.
while (true) {
uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag; uint tag = Cmd_tag(cmd_alloc, cmd_ref).tag;
if (tag == Cmd_End) { if (tag == Cmd_End) {
break; break;
@ -129,7 +135,7 @@ void main() {
df[k] = 1e9; df[k] = 1e9;
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref); TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do { do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref); TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
vec2 line_vec = seg.vector; vec2 line_vec = seg.vector;
for (uint k = 0; k < CHUNK; k++) { for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin; vec2 dpos = xy + vec2(0.5, 0.5) - seg.origin;
@ -151,7 +157,7 @@ void main() {
tile_seg_ref = TileSegRef(fill.tile_ref); tile_seg_ref = TileSegRef(fill.tile_ref);
// Calculate coverage based on backdrop + coverage of each line segment // Calculate coverage based on backdrop + coverage of each line segment
do { do {
TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, mem_ok), tile_seg_ref); TileSeg seg = TileSeg_read(new_alloc(tile_seg_ref.offset, TileSeg_size, true), tile_seg_ref);
for (uint k = 0; k < CHUNK; k++) { for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = xy + vec2(chunk_offset(k)); vec2 my_xy = xy + vec2(chunk_offset(k));
vec2 start = seg.origin - my_xy; vec2 start = seg.origin - my_xy;
@ -248,7 +254,7 @@ void main() {
uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX + uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX +
CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y); CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y);
for (uint k = 0; k < CHUNK; k++) { for (uint k = 0; k < CHUNK; k++) {
memory[base_ix + k] = packsRGB(vec4(rgba[k])); blend_mem[base_ix + k] = packsRGB(vec4(rgba[k]));
rgba[k] = vec4(0.0); rgba[k] = vec4(0.0);
} }
} }
@ -268,7 +274,7 @@ void main() {
if (clip_depth < BLEND_STACK_SPLIT) { if (clip_depth < BLEND_STACK_SPLIT) {
bg_rgba = blend_stack[clip_depth][k]; bg_rgba = blend_stack[clip_depth][k];
} else { } else {
bg_rgba = memory[base_ix + k]; bg_rgba = blend_mem[base_ix + k];
} }
mediump vec4 bg = unpacksRGB(bg_rgba); mediump vec4 bg = unpacksRGB(bg_rgba);
mediump vec4 fg = rgba[k] * area[k]; mediump vec4 fg = rgba[k] * area[k];

View file

@ -3,27 +3,23 @@
layout(set = 0, binding = 0) buffer Memory { layout(set = 0, binding = 0) buffer Memory {
// offset into memory of the next allocation, initialized by the user. // offset into memory of the next allocation, initialized by the user.
uint mem_offset; uint mem_offset;
// mem_error tracks the status of memory accesses, initialized to NO_ERROR // mem_error is a bitmask of stages that have failed allocation.
// by the user. ERR_MALLOC_FAILED is reported for insufficient memory.
// If MEM_DEBUG is defined the following errors are reported:
// - ERR_OUT_OF_BOUNDS is reported for out of bounds writes.
// - ERR_UNALIGNED_ACCESS for memory access not aligned to 32-bit words.
uint mem_error; uint mem_error;
// offset into blend memory of allocations for blend stack.
uint blend_offset;
uint[] memory; uint[] memory;
}; };
// Uncomment this line to add the size field to Alloc and enable memory checks. // Uncomment this line to add the size field to Alloc and enable memory checks.
// Note that the Config struct in setup.h grows size fields as well. // Note that the Config struct in setup.h grows size fields as well.
//#define MEM_DEBUG
#define NO_ERROR 0 // This setting is not working and the mechanism will be removed.
#define ERR_MALLOC_FAILED 1 //#define MEM_DEBUG
#define ERR_OUT_OF_BOUNDS 2
#define ERR_UNALIGNED_ACCESS 3
#ifdef MEM_DEBUG #ifdef MEM_DEBUG
#define Alloc_size 16 #define Alloc_size 16
#else #else
// TODO: this seems wrong
#define Alloc_size 8 #define Alloc_size 8
#endif #endif
@ -37,12 +33,6 @@ struct Alloc {
#endif #endif
}; };
struct MallocResult {
Alloc alloc;
// failed is true if the allocation overflowed memory.
bool failed;
};
// new_alloc synthesizes an Alloc from an offset and size. // new_alloc synthesizes an Alloc from an offset and size.
Alloc new_alloc(uint offset, uint size, bool mem_ok) { Alloc new_alloc(uint offset, uint size, bool mem_ok) {
Alloc a; Alloc a;
@ -57,24 +47,32 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok) {
return a; return a;
} }
// malloc allocates size bytes of memory. #define STAGE_BINNING (1u << 0)
MallocResult malloc(uint size) { #define STAGE_TILE_ALLOC (1u << 1)
MallocResult r; #define STAGE_PATH_COARSE (1u << 2)
#define STAGE_COARSE (1u << 3)
// Allocations in main memory will never be 0, and this might be slightly
// faster to test against than some other value.
#define MALLOC_FAILED 0
// Returns true when none of the stages named in the dep_stage bitmask have
// recorded a failure in mem_error.
bool check_deps(uint dep_stage) {
    // TODO: this should be an atomic relaxed load, but that involves
    // bringing in "memory scope semantics"
    // OR-ing with 0 leaves mem_error unchanged and yields its current value.
    uint failed_stages = atomicOr(mem_error, 0);
    return (failed_stages & dep_stage) == 0;
}
// Allocate size bytes of memory, offset in bytes.
// Note: with a bit of rearrangement of header files, we could make the
// mem_size argument go away (it comes from the config binding).
uint malloc_stage(uint size, uint mem_size, uint stage) {
uint offset = atomicAdd(mem_offset, size); uint offset = atomicAdd(mem_offset, size);
r.failed = offset + size > memory.length() * 4; if (offset + size > mem_size) {
r.alloc = new_alloc(offset, size, !r.failed); atomicOr(mem_error, stage);
if (r.failed) { offset = MALLOC_FAILED;
atomicMax(mem_error, ERR_MALLOC_FAILED);
return r;
} }
#ifdef MEM_DEBUG return offset;
if ((size & 3) != 0) {
r.failed = true;
atomicMax(mem_error, ERR_UNALIGNED_ACCESS);
return r;
}
#endif
return r;
} }
// touch_mem checks whether access to the memory word at offset is valid. // touch_mem checks whether access to the memory word at offset is valid.

View file

@ -87,7 +87,13 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
return SubdivResult(val, a0, a2); return SubdivResult(val, a0, a2);
} }
// All writes to the output must be gated by mem_ok.
bool mem_ok = true;
void main() { void main() {
if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
return;
}
uint element_ix = gl_GlobalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x;
PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size); PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);
@ -95,24 +101,10 @@ void main() {
if (element_ix < conf.n_pathseg) { if (element_ix < conf.n_pathseg) {
tag = PathSeg_tag(conf.pathseg_alloc, ref); tag = PathSeg_tag(conf.pathseg_alloc, ref);
} }
bool mem_ok = mem_error == NO_ERROR;
switch (tag.tag) { switch (tag.tag) {
case PathSeg_Cubic: case PathSeg_Cubic:
PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref); PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);
// Affine transform is now applied in pathseg
/*
uint trans_ix = cubic.trans_ix;
if (trans_ix > 0) {
TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (trans_ix - 1) * TransformSeg_size);
TransformSeg trans = TransformSeg_read(conf.trans_alloc, trans_ref);
cubic.p0 = trans.mat.xy * cubic.p0.x + trans.mat.zw * cubic.p0.y + trans.translate;
cubic.p1 = trans.mat.xy * cubic.p1.x + trans.mat.zw * cubic.p1.y + trans.translate;
cubic.p2 = trans.mat.xy * cubic.p2.x + trans.mat.zw * cubic.p2.y + trans.translate;
cubic.p3 = trans.mat.xy * cubic.p3.x + trans.mat.zw * cubic.p3.y + trans.translate;
}
*/
vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3; vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
float err = err_v.x * err_v.x + err_v.y * err_v.y; float err = err_v.x * err_v.x + err_v.y * err_v.y;
// The number of quadratics. // The number of quadratics.
@ -140,7 +132,7 @@ void main() {
uint path_ix = cubic.path_ix; uint path_ix = cubic.path_ix;
Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size)); Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
Alloc path_alloc = Alloc path_alloc =
new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok); new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
ivec4 bbox = ivec4(path.bbox); ivec4 bbox = ivec4(path.bbox);
vec2 p0 = cubic.p0; vec2 p0 = cubic.p0;
qp0 = cubic.p0; qp0 = cubic.p0;
@ -199,11 +191,12 @@ void main() {
// TODO: can be tighter, use c to bound width // TODO: can be tighter, use c to bound width
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
// Consider using subgroups to aggregate atomic add. // Consider using subgroups to aggregate atomic add.
MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size); uint malloc_size = n_tile_alloc * TileSeg_size;
if (tile_alloc.failed || !mem_ok) { uint tile_offset = malloc_stage(malloc_size, conf.mem_size, STAGE_PATH_COARSE);
return; if (tile_offset == MALLOC_FAILED) {
mem_ok = false;
} }
uint tile_offset = tile_alloc.alloc.offset; Alloc tile_alloc = new_alloc(tile_offset, malloc_size, true);
TileSeg tile_seg; TileSeg tile_seg;
@ -221,10 +214,8 @@ void main() {
int backdrop = p1.y < p0.y ? 1 : -1; int backdrop = p1.y < p0.y ? 1 : -1;
TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop)); TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
uint tile_el = tile_ref.offset >> 2; uint tile_el = tile_ref.offset >> 2;
if (touch_mem(path_alloc, tile_el + 1)) {
atomicAdd(memory[tile_el + 1], backdrop); atomicAdd(memory[tile_el + 1], backdrop);
} }
}
// next_xray is the xray for the next scanline; the line segment intersects // next_xray is the xray for the next scanline; the line segment intersects
// all tiles between xray and next_xray. // all tiles between xray and next_xray.
@ -247,9 +238,7 @@ void main() {
TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x)); TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
uint tile_el = tile_ref.offset >> 2; uint tile_el = tile_ref.offset >> 2;
uint old = 0; uint old = 0;
if (touch_mem(path_alloc, tile_el)) {
old = atomicExchange(memory[tile_el], tile_offset); old = atomicExchange(memory[tile_el], tile_offset);
}
tile_seg.origin = p0; tile_seg.origin = p0;
tile_seg.vector = p1 - p0; tile_seg.vector = p1 - p0;
float y_edge = 0.0; float y_edge = 0.0;
@ -276,7 +265,9 @@ void main() {
} }
tile_seg.y_edge = y_edge; tile_seg.y_edge = y_edge;
tile_seg.next.offset = old; tile_seg.next.offset = old;
TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg); if (mem_ok) {
TileSeg_write(tile_alloc, TileSegRef(tile_offset), tile_seg);
}
tile_offset += TileSeg_size; tile_offset += TileSeg_size;
} }
xc += b; xc += b;

View file

@ -31,8 +31,9 @@
// to memory for the overflow. // to memory for the overflow.
#define BLEND_STACK_SPLIT 4 #define BLEND_STACK_SPLIT 4
#ifdef ERR_MALLOC_FAILED #ifdef MALLOC_FAILED
struct Config { struct Config {
uint mem_size; // in bytes
uint n_elements; // paths uint n_elements; // paths
uint n_pathseg; uint n_pathseg;
uint width_in_tiles; uint width_in_tiles;

View file

@ -29,7 +29,7 @@ layout(binding = 2) readonly buffer SceneBuf {
#define SY (1.0 / float(TILE_HEIGHT_PX)) #define SY (1.0 / float(TILE_HEIGHT_PX))
shared uint sh_tile_count[TILE_ALLOC_WG]; shared uint sh_tile_count[TILE_ALLOC_WG];
shared MallocResult sh_tile_alloc; shared uint sh_tile_offset;
vec4 load_draw_bbox(uint draw_ix) { vec4 load_draw_bbox(uint draw_ix) {
uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix; uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
@ -42,6 +42,9 @@ vec4 load_draw_bbox(uint draw_ix) {
} }
void main() { void main() {
if (!check_deps(STAGE_BINNING)) {
return;
}
uint th_ix = gl_LocalInvocationID.x; uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x;
// At the moment, element_ix == path_ix. The clip-intersected bounding boxes // At the moment, element_ix == path_ix. The clip-intersected bounding boxes
@ -86,27 +89,24 @@ void main() {
sh_tile_count[th_ix] = total_tile_count; sh_tile_count[th_ix] = total_tile_count;
} }
if (th_ix == TILE_ALLOC_WG - 1) { if (th_ix == TILE_ALLOC_WG - 1) {
sh_tile_alloc = malloc(total_tile_count * Tile_size); sh_tile_offset = malloc_stage(total_tile_count * Tile_size, conf.mem_size, STAGE_TILE_ALLOC);
} }
barrier(); barrier();
MallocResult alloc_start = sh_tile_alloc; uint offset_start = sh_tile_offset;
if (alloc_start.failed || mem_error != NO_ERROR) { if (offset_start == MALLOC_FAILED) {
return; return;
} }
if (element_ix < conf.n_elements) { if (element_ix < conf.n_elements) {
uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0; uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count); path.tiles = TileRef(offset_start + Tile_size * tile_subix);
path.tiles = TileRef(tiles_alloc.offset);
Path_write(conf.tile_alloc, path_ref, path); Path_write(conf.tile_alloc, path_ref, path);
} }
// Zero out allocated tiles efficiently // Zero out allocated tiles efficiently
uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4); uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
uint start_ix = alloc_start.alloc.offset >> 2; uint start_ix = offset_start >> 2;
for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) { for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
// Note: this interleaving is faster than using Tile_write memory[start_ix + i] = 0;
// by a significant amount.
write_mem(alloc_start.alloc, start_ix + i, 0);
} }
} }

View file

@ -16,13 +16,12 @@
//! Low-level scene encoding. //! Low-level scene encoding.
use crate::Blend; use crate::{Blend, SceneStats, DRAWTAG_SIZE, TRANSFORM_SIZE};
use bytemuck::{Pod, Zeroable}; use bytemuck::{Pod, Zeroable};
use piet_gpu_hal::BufWrite; use piet_gpu_hal::BufWrite;
use crate::stages::{ use crate::stages::{
self, Config, PathEncoder, Transform, CLIP_PART_SIZE, DRAW_PART_SIZE, PATHSEG_PART_SIZE, self, PathEncoder, Transform, DRAW_PART_SIZE, PATHSEG_PART_SIZE, TRANSFORM_PART_SIZE,
TRANSFORM_PART_SIZE,
}; };
pub struct Encoder { pub struct Encoder {
@ -52,86 +51,19 @@ pub struct EncodedSceneRef<'a, T: Copy + Pod> {
} }
impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> { impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
/// Return a config for the element processing pipeline. pub(crate) fn stats(&self) -> SceneStats {
/// SceneStats {
/// This does not include further pipeline processing. Also returns the n_drawobj: self.drawtag_stream.len(),
/// beginning of free memory. drawdata_len: self.drawdata_stream.len(),
pub fn stage_config(&self) -> (Config, usize) { n_transform: self.transform_stream.len(),
// Layout of scene buffer linewidth_len: std::mem::size_of_val(self.linewidth_stream),
let drawtag_offset = 0; pathseg_len: self.pathseg_stream.len(),
let n_drawobj = self.n_drawobj(); n_pathtag: self.tag_stream.len(),
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_stream.len();
let n_trans = self.transform_stream.len();
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let n_linewidth = self.linewidth_stream.len();
let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
let n_pathtag = self.tag_stream.len();
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
let config = Config {
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path, n_path: self.n_path,
n_pathseg: self.n_pathseg,
n_clip: self.n_clip, n_clip: self.n_clip,
trans_offset: trans_offset as u32, }
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
..Default::default()
};
(config, alloc)
} }
pub fn write_scene(&self, buf: &mut BufWrite) { pub fn write_scene(&self, buf: &mut BufWrite) {
@ -148,34 +80,6 @@ impl<'a, T: Copy + Pod> EncodedSceneRef<'a, T> {
buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize)); buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize));
buf.extend_slice(&self.pathseg_stream); buf.extend_slice(&self.pathseg_stream);
} }
/// The number of draw objects in the draw object stream.
pub(crate) fn n_drawobj(&self) -> usize {
self.drawtag_stream.len()
}
/// The number of paths.
pub(crate) fn n_path(&self) -> u32 {
self.n_path
}
/// The number of path segments.
pub(crate) fn n_pathseg(&self) -> u32 {
self.n_pathseg
}
pub(crate) fn n_transform(&self) -> usize {
self.transform_stream.len()
}
/// The number of tags in the path stream.
pub(crate) fn n_pathtag(&self) -> usize {
self.tag_stream.len()
}
pub(crate) fn n_clip(&self) -> u32 {
self.n_clip
}
} }
/// A scene fragment encoding a glyph. /// A scene fragment encoding a glyph.
@ -191,15 +95,6 @@ pub struct GlyphEncoder {
n_pathseg: u32, n_pathseg: u32,
} }
const TRANSFORM_SIZE: usize = 24;
const LINEWIDTH_SIZE: usize = 4;
const PATHSEG_SIZE: usize = 52;
const PATH_BBOX_SIZE: usize = 24;
const DRAWMONOID_SIZE: usize = 16;
const DRAW_BBOX_SIZE: usize = 16;
const DRAWTAG_SIZE: usize = 4;
const ANNOTATED_SIZE: usize = 40;
// Tags for draw objects. See shader/drawtag.h for the authoritative source. // Tags for draw objects. See shader/drawtag.h for the authoritative source.
const DRAWTAG_FILLCOLOR: u32 = 0x44; const DRAWTAG_FILLCOLOR: u32 = 0x44;
const DRAWTAG_FILLLINGRADIENT: u32 = 0x114; const DRAWTAG_FILLLINGRADIENT: u32 = 0x114;
@ -343,88 +238,6 @@ impl Encoder {
self.n_clip += 1; self.n_clip += 1;
} }
/// Return a config for the element processing pipeline.
///
/// This does not include further pipeline processing. Also returns the
/// beginning of free memory.
pub fn stage_config(&self) -> (Config, usize) {
// Layout of scene buffer
let drawtag_offset = 0;
let n_drawobj = self.n_drawobj();
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_stream.len();
let n_trans = self.transform_stream.len();
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let n_linewidth = self.linewidth_stream.len();
let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE;
let n_pathtag = self.tag_stream.len();
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
let config = Config {
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path,
n_clip: self.n_clip,
trans_offset: trans_offset as u32,
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
..Default::default()
};
(config, alloc)
}
pub fn write_scene(&self, buf: &mut BufWrite) { pub fn write_scene(&self, buf: &mut BufWrite) {
buf.extend_slice(&self.drawtag_stream); buf.extend_slice(&self.drawtag_stream);
let n_drawobj = self.drawtag_stream.len(); let n_drawobj = self.drawtag_stream.len();
@ -440,32 +253,19 @@ impl Encoder {
buf.extend_slice(&self.pathseg_stream); buf.extend_slice(&self.pathseg_stream);
} }
/// The number of draw objects in the draw object stream. pub(crate) fn stats(&self) -> SceneStats {
pub(crate) fn n_drawobj(&self) -> usize { SceneStats {
self.drawtag_stream.len() n_drawobj: self.drawtag_stream.len(),
} drawdata_len: self.drawdata_stream.len(),
n_transform: self.transform_stream.len(),
linewidth_len: std::mem::size_of_val(&*self.linewidth_stream),
n_pathtag: self.tag_stream.len(),
pathseg_len: self.pathseg_stream.len(),
/// The number of paths. n_path: self.n_path,
pub(crate) fn n_path(&self) -> u32 { n_pathseg: self.n_pathseg,
self.n_path n_clip: self.n_clip,
} }
/// The number of path segments.
pub(crate) fn n_pathseg(&self) -> u32 {
self.n_pathseg
}
pub(crate) fn n_transform(&self) -> usize {
self.transform_stream.len()
}
/// The number of tags in the path stream.
pub(crate) fn n_pathtag(&self) -> usize {
self.tag_stream.len()
}
pub(crate) fn n_clip(&self) -> u32 {
self.n_clip
} }
pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) { pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) {
@ -478,11 +278,6 @@ impl Encoder {
} }
} }
fn align_up(x: usize, align: usize) -> usize {
debug_assert!(align.is_power_of_two());
(x + align - 1) & !(align - 1)
}
fn padding(x: usize, align: usize) -> usize { fn padding(x: usize, align: usize) -> usize {
x.wrapping_neg() & (align - 1) x.wrapping_neg() & (align - 1)
} }

View file

@ -4,17 +4,19 @@ pub mod glyph_render;
mod gradient; mod gradient;
mod pico_svg; mod pico_svg;
mod render_ctx; mod render_ctx;
mod render_driver;
pub mod stages; pub mod stages;
pub mod test_scenes; pub mod test_scenes;
mod text; mod text;
use bytemuck::Pod; use bytemuck::{Pod, Zeroable};
use std::convert::TryInto; use std::convert::TryInto;
pub use blend::{Blend, BlendMode, CompositionMode}; pub use blend::{Blend, BlendMode, CompositionMode};
pub use encoder::EncodedSceneRef; pub use encoder::EncodedSceneRef;
pub use gradient::Colrv1RadialGradient; pub use gradient::Colrv1RadialGradient;
pub use render_ctx::PietGpuRenderContext; pub use render_ctx::PietGpuRenderContext;
pub use render_driver::RenderDriver;
use piet::kurbo::Vec2; use piet::kurbo::Vec2;
use piet::{ImageFormat, RenderContext}; use piet::{ImageFormat, RenderContext};
@ -25,9 +27,12 @@ use piet_gpu_hal::{
}; };
pub use pico_svg::PicoSvg; pub use pico_svg::PicoSvg;
use stages::{ClipBinding, ElementBinding, ElementCode}; use stages::{
ClipBinding, ElementBinding, ElementCode, DRAW_PART_SIZE, PATHSEG_PART_SIZE,
TRANSFORM_PART_SIZE,
};
use crate::stages::{ClipCode, Config, ElementStage}; use crate::stages::{ClipCode, Config, ElementStage, CLIP_PART_SIZE};
const TILE_W: usize = 16; const TILE_W: usize = 16;
const TILE_H: usize = 16; const TILE_H: usize = 16;
@ -64,6 +69,31 @@ pub enum PixelFormat {
Rgba8, Rgba8,
} }
/// Host-side layout of the header words at the start of the memory buffer.
///
/// The field names and order match the leading `uint` fields of the shader's
/// `Memory` buffer declaration (`mem_offset`, `mem_error`, `blend_offset`).
/// `#[repr(C)]` fixes the field order; the `Pod`/`Zeroable` derives come from
/// `bytemuck`. The per-frame host upload buffers and the readback buffer are
/// sized with `size_of::<MemoryHeader>()`.
#[repr(C)]
#[derive(Clone, Copy, Debug, Zeroable, Pod)]
pub(crate) struct MemoryHeader {
/// Offset into memory of the next allocation.
mem_offset: u32,
/// Bitmask of pipeline stages that have failed allocation.
mem_error: u32,
/// Offset into blend memory of allocations for the blend stack.
blend_offset: u32,
}
/// The sizes of various objects in the encoded scene, needed for memory layout.
///
/// Filled in by the encoders' `stats()` methods from the lengths of their
/// streams and their running object counters.
#[derive(Default)]
pub(crate) struct SceneStats {
// Slices of scene encoding, in order
/// Length of the draw tag stream, i.e. the number of draw objects.
pub n_drawobj: usize,
/// Length of the draw data stream (`drawdata_stream.len()`).
pub drawdata_len: usize,
/// Length of the transform stream, i.e. the number of transforms.
pub n_transform: usize,
/// Size of the linewidth stream in bytes (computed with `size_of_val`).
pub linewidth_len: usize,
/// Length of the path segment stream (`pathseg_stream.len()`).
pub pathseg_len: usize,
/// Length of the path tag stream, i.e. the number of path tags.
pub n_pathtag: usize,

// Additional stats needed for memory layout & dispatch
/// Number of paths, as counted by the encoder.
pub n_path: u32,
/// Number of path segments, as counted by the encoder.
pub n_pathseg: u32,
/// Number of clip operations, as counted by the encoder.
pub n_clip: u32,
}
pub struct Renderer { pub struct Renderer {
// These sizes are aligned to tile boundaries, though at some point // These sizes are aligned to tile boundaries, though at some point
// we'll want to have a good strategy for dealing with odd sizes. // we'll want to have a good strategy for dealing with odd sizes.
@ -72,18 +102,23 @@ pub struct Renderer {
pub image_dev: Image, // resulting image pub image_dev: Image, // resulting image
// The reference is held by the pipelines. We will be changing // TODO: two changes needed here. First, if we're fencing on the coarse
// this to make the scene upload dynamic. // pipeline, then we only need one copy (this changes if we also bind the
// scene buffer in fine rasterization, which might be a good idea to reduce
// copying). Second, there should be a staging buffer for discrete cards.
scene_bufs: Vec<Buffer>, scene_bufs: Vec<Buffer>,
memory_buf_host: Vec<Buffer>, memory_buf_host: Vec<Buffer>,
memory_buf_dev: Buffer, memory_buf_dev: Buffer,
memory_buf_readback: Buffer,
// Staging buffers // Staging buffers
config_bufs: Vec<Buffer>, config_bufs: Vec<Buffer>,
// Device config buf // Device config buf
config_buf: Buffer, config_buf: Buffer,
blend_buf: Buffer,
// New element pipeline // New element pipeline
element_code: ElementCode, element_code: ElementCode,
element_stage: ElementStage, element_stage: ElementStage,
@ -111,6 +146,8 @@ pub struct Renderer {
k4_pipeline: Pipeline, k4_pipeline: Pipeline,
k4_ds: DescriptorSet, k4_ds: DescriptorSet,
scene_stats: SceneStats,
// TODO: the following stats are now redundant and can be removed.
n_transform: usize, n_transform: usize,
n_drawobj: usize, n_drawobj: usize,
n_paths: usize, n_paths: usize,
@ -142,7 +179,13 @@ impl RenderConfig {
impl Renderer { impl Renderer {
/// The number of query pool entries needed to run the renderer. /// The number of query pool entries needed to run the renderer.
pub const QUERY_POOL_SIZE: u32 = 12; pub const QUERY_POOL_SIZE: u32 = Self::COARSE_QUERY_POOL_SIZE + Self::FINE_QUERY_POOL_SIZE;
/// The number of query pool entries needed to run the coarse pipeline.
pub const COARSE_QUERY_POOL_SIZE: u32 = 10;
/// The number of query pool entries needed to run the fine pipeline.
pub const FINE_QUERY_POOL_SIZE: u32 = 2;
pub unsafe fn new( pub unsafe fn new(
session: &Session, session: &Session,
@ -166,12 +209,18 @@ impl Renderer {
let width = width + (width.wrapping_neg() & (TILE_W - 1)); let width = width + (width.wrapping_neg() & (TILE_W - 1));
let height = height + (height.wrapping_neg() & (TILE_W - 1)); let height = height + (height.wrapping_neg() & (TILE_W - 1));
let dev = BufferUsage::STORAGE | BufferUsage::COPY_DST; let dev = BufferUsage::STORAGE | BufferUsage::COPY_DST;
let host_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC; let usage_mem_dev = BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::COPY_SRC;
let usage_blend = BufferUsage::STORAGE;
let usage_upload = BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC;
let usage_readback = BufferUsage::MAP_READ | BufferUsage::COPY_DST;
// This may be inadequate for very complex scenes (paris etc)
// TODO: separate staging buffer (if needed) // TODO: separate staging buffer (if needed)
let scene_bufs = (0..n_bufs) let scene_bufs = (0..n_bufs)
.map(|_| session.create_buffer(8 * 1024 * 1024, host_upload).unwrap()) .map(|_| {
session
.create_buffer(8 * 1024 * 1024, usage_upload)
.unwrap()
})
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let image_format = match config.format { let image_format = match config.format {
@ -185,15 +234,22 @@ impl Renderer {
let config_bufs = (0..n_bufs) let config_bufs = (0..n_bufs)
.map(|_| { .map(|_| {
session session
.create_buffer(CONFIG_BUFFER_SIZE, host_upload) .create_buffer(CONFIG_BUFFER_SIZE, usage_upload)
.unwrap() .unwrap()
}) })
.collect(); .collect();
let memory_buf_host = (0..n_bufs) let memory_buf_host = (0..n_bufs)
.map(|_| session.create_buffer(2 * 4, host_upload).unwrap()) .map(|_| {
session
.create_buffer(std::mem::size_of::<MemoryHeader>() as u64, usage_upload)
.unwrap()
})
.collect(); .collect();
let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?; let memory_buf_dev = session.create_buffer(16 * 1024 * 1024, usage_mem_dev)?;
let memory_buf_readback =
session.create_buffer(std::mem::size_of::<MemoryHeader>() as u64, usage_readback)?;
let blend_buf = session.create_buffer(16 * 1024 * 1024, usage_blend)?;
let element_code = ElementCode::new(session); let element_code = ElementCode::new(session);
let element_stage = ElementStage::new(session, &element_code); let element_stage = ElementStage::new(session, &element_code);
@ -282,7 +338,7 @@ impl Renderer {
let gradient_bufs = (0..n_bufs) let gradient_bufs = (0..n_bufs)
.map(|_| { .map(|_| {
session session
.create_buffer(GRADIENT_BUF_SIZE as u64, host_upload) .create_buffer(GRADIENT_BUF_SIZE as u64, usage_upload)
.unwrap() .unwrap()
}) })
.collect(); .collect();
@ -297,6 +353,7 @@ impl Renderer {
&[ &[
BindType::Buffer, BindType::Buffer,
BindType::BufReadOnly, BindType::BufReadOnly,
BindType::Buffer,
BindType::Image, BindType::Image,
BindType::ImageRead, BindType::ImageRead,
BindType::ImageRead, BindType::ImageRead,
@ -304,19 +361,22 @@ impl Renderer {
)?; )?;
let k4_ds = session let k4_ds = session
.descriptor_set_builder() .descriptor_set_builder()
.add_buffers(&[&memory_buf_dev, &config_buf]) .add_buffers(&[&memory_buf_dev, &config_buf, &blend_buf])
.add_images(&[&image_dev]) .add_images(&[&image_dev])
.add_textures(&[&bg_image, &gradients]) .add_textures(&[&bg_image, &gradients])
.build(&session, &k4_pipeline)?; .build(&session, &k4_pipeline)?;
let scene_stats = Default::default();
Ok(Renderer { Ok(Renderer {
width, width,
height, height,
scene_bufs, scene_bufs,
memory_buf_host, memory_buf_host,
memory_buf_dev, memory_buf_dev,
memory_buf_readback,
config_buf, config_buf,
config_bufs, config_bufs,
blend_buf,
image_dev, image_dev,
element_code, element_code,
element_stage, element_stage,
@ -336,6 +396,7 @@ impl Renderer {
coarse_ds, coarse_ds,
k4_pipeline, k4_pipeline,
k4_ds, k4_ds,
scene_stats,
n_transform: 0, n_transform: 0,
n_drawobj: 0, n_drawobj: 0,
n_paths: 0, n_paths: 0,
@ -358,43 +419,14 @@ impl Renderer {
render_ctx: &mut PietGpuRenderContext, render_ctx: &mut PietGpuRenderContext,
buf_ix: usize, buf_ix: usize,
) -> Result<(), Error> { ) -> Result<(), Error> {
let (mut config, mut alloc) = render_ctx.stage_config(); self.scene_stats = render_ctx.stats();
let n_drawobj = render_ctx.n_drawobj();
// TODO: be more consistent in size types
let n_path = render_ctx.n_path() as usize;
self.n_paths = n_path;
self.n_transform = render_ctx.n_transform();
self.n_drawobj = render_ctx.n_drawobj();
self.n_pathseg = render_ctx.n_pathseg() as usize;
self.n_pathtag = render_ctx.n_pathtag();
self.n_clip = render_ctx.n_clip();
// These constants depend on encoding and may need to be updated.
// Perhaps we can plumb these from piet-gpu-derive?
const PATH_SIZE: usize = 12;
const BIN_SIZE: usize = 8;
let width_in_tiles = self.width / TILE_W;
let height_in_tiles = self.height / TILE_H;
let tile_base = alloc;
alloc += ((n_path + 3) & !3) * PATH_SIZE;
let bin_base = alloc;
alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
let ptcl_base = alloc;
alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
config.width_in_tiles = width_in_tiles as u32;
config.height_in_tiles = height_in_tiles as u32;
config.tile_alloc = tile_base as u32;
config.bin_alloc = bin_base as u32;
config.ptcl_alloc = ptcl_base as u32;
unsafe { unsafe {
// TODO: reallocate scene buffer if size is inadequate self.upload_config(buf_ix)?;
{ {
let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?; let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
render_ctx.write_scene(&mut mapped_scene); render_ctx.write_scene(&mut mapped_scene);
} }
self.config_bufs[buf_ix].write(&[config])?;
self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
// Upload gradient data. // Upload gradient data.
let ramp_data = render_ctx.get_ramp_data(); let ramp_data = render_ctx.get_ramp_data();
@ -414,43 +446,14 @@ impl Renderer {
scene: &EncodedSceneRef<T>, scene: &EncodedSceneRef<T>,
buf_ix: usize, buf_ix: usize,
) -> Result<(), Error> { ) -> Result<(), Error> {
let (mut config, mut alloc) = scene.stage_config(); self.scene_stats = scene.stats();
let n_drawobj = scene.n_drawobj();
// TODO: be more consistent in size types
let n_path = scene.n_path() as usize;
self.n_paths = n_path;
self.n_transform = scene.n_transform();
self.n_drawobj = scene.n_drawobj();
self.n_pathseg = scene.n_pathseg() as usize;
self.n_pathtag = scene.n_pathtag();
self.n_clip = scene.n_clip();
// These constants depend on encoding and may need to be updated.
// Perhaps we can plumb these from piet-gpu-derive?
const PATH_SIZE: usize = 12;
const BIN_SIZE: usize = 8;
let width_in_tiles = self.width / TILE_W;
let height_in_tiles = self.height / TILE_H;
let tile_base = alloc;
alloc += ((n_path + 3) & !3) * PATH_SIZE;
let bin_base = alloc;
alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
let ptcl_base = alloc;
alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
config.width_in_tiles = width_in_tiles as u32;
config.height_in_tiles = height_in_tiles as u32;
config.tile_alloc = tile_base as u32;
config.bin_alloc = bin_base as u32;
config.ptcl_alloc = ptcl_base as u32;
unsafe { unsafe {
// TODO: reallocate scene buffer if size is inadequate self.upload_config(buf_ix)?;
{ {
let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?; let mut mapped_scene = self.scene_bufs[buf_ix].map_write(..)?;
scene.write_scene(&mut mapped_scene); scene.write_scene(&mut mapped_scene);
} }
self.config_bufs[buf_ix].write(&[config])?;
self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
// Upload gradient data. // Upload gradient data.
if !scene.ramp_data.is_empty() { if !scene.ramp_data.is_empty() {
@ -464,7 +467,41 @@ impl Renderer {
Ok(()) Ok(())
} }
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool, buf_ix: usize) { // Note: configuration has to be re-uploaded when memory buffer is resized
pub(crate) unsafe fn upload_config(&mut self, buf_ix: usize) -> Result<(), Error> {
let stats = &self.scene_stats;
let n_path = stats.n_path as usize;
self.n_paths = n_path;
self.n_transform = stats.n_transform;
self.n_drawobj = stats.n_drawobj;
self.n_pathseg = stats.n_pathseg as usize;
self.n_pathtag = stats.n_pathtag;
self.n_clip = stats.n_clip;
let (mut config, alloc) = stats.config(self.width, self.height);
config.mem_size = self.memory_buf_size() as u32;
self.config_bufs[buf_ix].write(&[config])?;
let mem_header = MemoryHeader {
mem_offset: alloc as u32,
mem_error: 0,
blend_offset: 0,
};
// Note: we could skip doing this on realloc, but probably not worth the bother
self.memory_buf_host[buf_ix].write(&[mem_header])?;
Ok(())
}
/// Get the size of memory for the allocations known in advance.
pub(crate) fn memory_size(&self, stats: &SceneStats) -> usize {
stats.config(self.width, self.height).1
}
/// Record the coarse part of a render pipeline.
pub unsafe fn record_coarse(
&self,
cmd_buf: &mut CmdBuf,
query_pool: &QueryPool,
buf_ix: usize,
) {
cmd_buf.copy_buffer(&self.config_bufs[buf_ix], &self.config_buf); cmd_buf.copy_buffer(&self.config_bufs[buf_ix], &self.config_buf);
cmd_buf.copy_buffer(&self.memory_buf_host[buf_ix], &self.memory_buf_dev); cmd_buf.copy_buffer(&self.memory_buf_host[buf_ix], &self.memory_buf_dev);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
@ -558,9 +595,21 @@ impl Renderer {
pass.end(); pass.end();
cmd_buf.end_debug_label(); cmd_buf.end_debug_label();
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
}
pub unsafe fn record_fine(
&self,
cmd_buf: &mut CmdBuf,
query_pool: &QueryPool,
query_start: u32,
) {
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.begin_debug_label("Fine raster"); cmd_buf.begin_debug_label("Fine raster");
let mut pass = let mut pass = cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(
cmd_buf.begin_compute_pass(&ComputePassDescriptor::timer(&query_pool, 10, 11)); &query_pool,
query_start,
query_start + 1,
));
pass.dispatch( pass.dispatch(
&self.k4_pipeline, &self.k4_pipeline,
&self.k4_ds, &self.k4_ds,
@ -577,6 +626,19 @@ impl Renderer {
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
} }
pub unsafe fn record_readback(&self, cmd_buf: &mut CmdBuf) {
cmd_buf.copy_buffer(&self.memory_buf_dev, &self.memory_buf_readback);
cmd_buf.memory_barrier();
}
/// Record a render pipeline.
///
/// This *assumes* the buffers are adequately sized.
pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, query_pool: &QueryPool, buf_ix: usize) {
self.record_coarse(cmd_buf, query_pool, buf_ix);
self.record_fine(cmd_buf, query_pool, 10);
}
pub fn make_image( pub fn make_image(
session: &Session, session: &Session,
width: usize, width: usize,
@ -636,4 +698,210 @@ impl Renderer {
.unwrap() .unwrap()
} }
} }
/// Grow the scene upload buffer for `buf_ix` when it cannot hold `new_size` bytes.
///
/// Does nothing if the existing buffer is already large enough. Otherwise a
/// new buffer (size rounded up to a multiple of 64 KiB) is created and every
/// descriptor set of this frame that binds the scene buffer is pointed at it.
pub(crate) unsafe fn realloc_scene_if_needed(
    &mut self,
    session: &Session,
    new_size: u64,
    buf_ix: usize,
) -> Result<(), Error> {
    const ALIGN: u64 = 0x10000;
    let current_size = self.scene_bufs[buf_ix].size();
    if current_size >= new_size {
        return Ok(());
    }
    // Round the request up to the next multiple of ALIGN.
    let padded_size = (new_size + ALIGN - 1) & !(ALIGN - 1);
    println!(
        "reallocating scene buf[{}] {} -> {}",
        buf_ix, current_size, padded_size
    );
    let replacement = session.create_buffer(
        padded_size,
        BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC,
    )?;
    // The scene buffer sits at binding index 2 of the tile and coarse sets.
    self.element_bindings[buf_ix].rebind_scene(session, &replacement);
    session.update_buffer_descriptor(&mut self.tile_ds[buf_ix], 2, &replacement);
    session.update_buffer_descriptor(&mut self.coarse_ds[buf_ix], 2, &replacement);
    self.scene_bufs[buf_ix] = replacement;
    Ok(())
}
/// Usable size of the device memory buffer, in bytes.
///
/// The `MemoryHeader` stored at the start of the buffer is excluded.
pub(crate) fn memory_buf_size(&self) -> u64 {
    let header_size = std::mem::size_of::<MemoryHeader>() as u64;
    let total_size = self.memory_buf_dev.size();
    total_size - header_size
}
/// Replace the device memory buffer with a fresh one of `new_size` bytes.
///
/// Every descriptor set that binds the memory buffer (binding index 0) is
/// updated to reference the new buffer before the old one is released.
/// The contents of the old buffer are not copied.
pub(crate) unsafe fn realloc_memory(
    &mut self,
    session: &Session,
    new_size: u64,
) -> Result<(), Error> {
    println!(
        "reallocating memory buf {} -> {}",
        self.memory_buf_dev.size(),
        new_size
    );
    let replacement = session.create_buffer(
        new_size,
        BufferUsage::STORAGE | BufferUsage::COPY_DST | BufferUsage::COPY_SRC,
    )?;
    for binding in self.element_bindings.iter_mut() {
        binding.rebind_memory(session, &replacement);
    }
    self.clip_binding.rebind_memory(session, &replacement);
    for ds in self.tile_ds.iter_mut() {
        session.update_buffer_descriptor(ds, 0, &replacement);
    }
    session.update_buffer_descriptor(&mut self.path_ds, 0, &replacement);
    session.update_buffer_descriptor(&mut self.backdrop_ds, 0, &replacement);
    session.update_buffer_descriptor(&mut self.bin_ds, 0, &replacement);
    for ds in self.coarse_ds.iter_mut() {
        session.update_buffer_descriptor(ds, 0, &replacement);
    }
    session.update_buffer_descriptor(&mut self.k4_ds, 0, &replacement);
    self.memory_buf_dev = replacement;
    Ok(())
}
/// Current size of the blend buffer, in bytes.
pub(crate) fn blend_size(&self) -> u64 {
    let blend = &self.blend_buf;
    blend.size()
}
/// Replace the blend buffer with a fresh one of `new_size` bytes.
///
/// The fine rasterization descriptor set (`k4_ds`, binding index 2) is
/// updated to reference the new buffer.
pub(crate) unsafe fn realloc_blend(
    &mut self,
    session: &Session,
    new_size: u64,
) -> Result<(), Error> {
    let old_size = self.blend_buf.size();
    println!("reallocating blend buf {} -> {}", old_size, new_size);
    let replacement = session.create_buffer(new_size, BufferUsage::STORAGE)?;
    session.update_buffer_descriptor(&mut self.k4_ds, 2, &replacement);
    self.blend_buf = replacement;
    Ok(())
}
}
const TRANSFORM_SIZE: usize = 24;
const PATHSEG_SIZE: usize = 52;
const PATH_BBOX_SIZE: usize = 24;
const DRAWMONOID_SIZE: usize = 16;
const DRAW_BBOX_SIZE: usize = 16;
const DRAWTAG_SIZE: usize = 4;
const ANNOTATED_SIZE: usize = 40;
impl SceneStats {
pub(crate) fn scene_size(&self) -> usize {
align_up(self.n_drawobj, DRAW_PART_SIZE as usize) * DRAWTAG_SIZE
+ self.drawdata_len
+ align_up(self.n_transform, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE
+ self.linewidth_len
+ align_up(self.n_pathtag, PATHSEG_PART_SIZE as usize)
+ self.pathseg_len
}
/// Return a config for a scene with these stats.
///
/// Also returns the beginning of free (dynamic) memory.
fn config(&self, width: usize, height: usize) -> (Config, usize) {
// Layout of scene buffer
let drawtag_offset = 0;
let n_drawobj = self.n_drawobj;
let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize);
let drawdata_offset = drawtag_offset + n_drawobj_padded * DRAWTAG_SIZE;
let trans_offset = drawdata_offset + self.drawdata_len;
let n_trans = self.n_transform;
let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize);
let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE;
let pathtag_offset = linewidth_offset + self.linewidth_len;
let n_pathtag = self.n_pathtag;
let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize);
let pathseg_offset = pathtag_offset + n_pathtag_padded;
// Layout of memory
let mut alloc = 0;
let trans_alloc = alloc;
alloc += trans_alloc + n_trans_padded * TRANSFORM_SIZE;
let pathseg_alloc = alloc;
alloc += pathseg_alloc + self.n_pathseg as usize * PATHSEG_SIZE;
let path_bbox_alloc = alloc;
let n_path = self.n_path as usize;
alloc += path_bbox_alloc + n_path * PATH_BBOX_SIZE;
let drawmonoid_alloc = alloc;
alloc += n_drawobj_padded * DRAWMONOID_SIZE;
let anno_alloc = alloc;
alloc += n_drawobj * ANNOTATED_SIZE;
let clip_alloc = alloc;
let n_clip = self.n_clip as usize;
const CLIP_SIZE: usize = 4;
alloc += n_clip * CLIP_SIZE;
let clip_bic_alloc = alloc;
const CLIP_BIC_SIZE: usize = 8;
// This can round down, as we only reduce the prefix
alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE;
let clip_stack_alloc = alloc;
const CLIP_EL_SIZE: usize = 20;
alloc += n_clip * CLIP_EL_SIZE;
let clip_bbox_alloc = alloc;
const CLIP_BBOX_SIZE: usize = 16;
alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE;
let draw_bbox_alloc = alloc;
alloc += n_drawobj * DRAW_BBOX_SIZE;
let drawinfo_alloc = alloc;
// TODO: not optimized; it can be accumulated during encoding or summed from drawtags
const MAX_DRAWINFO_SIZE: usize = 44;
alloc += n_drawobj * MAX_DRAWINFO_SIZE;
// These constants depend on encoding and may need to be updated.
const PATH_SIZE: usize = 12;
const BIN_SIZE: usize = 8;
let width_in_tiles = width / TILE_W;
let height_in_tiles = height / TILE_H;
let tile_base = alloc;
alloc += ((n_path + 3) & !3) * PATH_SIZE;
let bin_base = alloc;
alloc += ((n_drawobj + 255) & !255) * BIN_SIZE;
let ptcl_base = alloc;
alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC;
let config = Config {
mem_size: 0, // to be filled in later
n_elements: n_drawobj as u32,
n_pathseg: self.n_pathseg,
pathseg_alloc: pathseg_alloc as u32,
anno_alloc: anno_alloc as u32,
trans_alloc: trans_alloc as u32,
path_bbox_alloc: path_bbox_alloc as u32,
drawmonoid_alloc: drawmonoid_alloc as u32,
clip_alloc: clip_alloc as u32,
clip_bic_alloc: clip_bic_alloc as u32,
clip_stack_alloc: clip_stack_alloc as u32,
clip_bbox_alloc: clip_bbox_alloc as u32,
draw_bbox_alloc: draw_bbox_alloc as u32,
drawinfo_alloc: drawinfo_alloc as u32,
n_trans: n_trans as u32,
n_path: self.n_path,
n_clip: self.n_clip,
trans_offset: trans_offset as u32,
linewidth_offset: linewidth_offset as u32,
pathtag_offset: pathtag_offset as u32,
pathseg_offset: pathseg_offset as u32,
drawtag_offset: drawtag_offset as u32,
drawdata_offset: drawdata_offset as u32,
width_in_tiles: width_in_tiles as u32,
height_in_tiles: height_in_tiles as u32,
tile_alloc: tile_base as u32,
bin_alloc: bin_base as u32,
ptcl_alloc: ptcl_base as u32,
};
(config, alloc)
}
}
/// Round `x` up to the nearest multiple of `align`.
///
/// `align` must be a power of two; this is only checked in debug builds.
fn align_up(x: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two());
    let mask = align - 1;
    // Adding `mask` (rather than `align` and then subtracting 1) avoids an
    // intermediate overflow when the rounded result is still representable.
    (x + mask) & !mask
}

View file

@ -4,7 +4,7 @@ const DO_SRGB_CONVERSION: bool = false;
use std::borrow::Cow; use std::borrow::Cow;
use crate::encoder::GlyphEncoder; use crate::encoder::GlyphEncoder;
use crate::stages::{Config, Transform}; use crate::stages::Transform;
use piet::kurbo::{Affine, PathEl, Point, Rect, Shape}; use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
use piet::{ use piet::{
Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext, Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext,
@ -18,7 +18,7 @@ use piet_gpu_types::scene::Element;
use crate::gradient::{Colrv1RadialGradient, LinearGradient, RadialGradient, RampCache}; use crate::gradient::{Colrv1RadialGradient, LinearGradient, RadialGradient, RampCache};
use crate::text::Font; use crate::text::Font;
pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder}; pub use crate::text::{PietGpuText, PietGpuTextLayout, PietGpuTextLayoutBuilder};
use crate::Blend; use crate::{Blend, SceneStats};
pub struct PietGpuImage; pub struct PietGpuImage;
@ -95,44 +95,15 @@ impl PietGpuRenderContext {
} }
} }
pub fn stage_config(&self) -> (Config, usize) { pub(crate) fn stats(&self) -> SceneStats {
self.new_encoder.stage_config() self.new_encoder.stats()
}
/// Number of draw objects.
///
/// This is for the new element processing pipeline. It's not necessarily the
/// same as the number of paths (as in the old pipeline), but it might take a
/// while to sort that out.
pub fn n_drawobj(&self) -> usize {
self.new_encoder.n_drawobj()
}
/// Number of paths.
pub fn n_path(&self) -> u32 {
self.new_encoder.n_path()
}
pub fn n_pathseg(&self) -> u32 {
self.new_encoder.n_pathseg()
}
pub fn n_pathtag(&self) -> usize {
self.new_encoder.n_pathtag()
}
pub fn n_transform(&self) -> usize {
self.new_encoder.n_transform()
}
pub fn n_clip(&self) -> u32 {
self.new_encoder.n_clip()
} }
pub fn write_scene(&self, buf: &mut BufWrite) { pub fn write_scene(&self, buf: &mut BufWrite) {
self.new_encoder.write_scene(buf); self.new_encoder.write_scene(buf);
} }
// TODO: delete
pub fn get_scene_buf(&mut self) -> &[u8] { pub fn get_scene_buf(&mut self) -> &[u8] {
const ALIGN: usize = 128; const ALIGN: usize = 128;
let padded_size = (self.elements.len() + (ALIGN - 1)) & ALIGN.wrapping_neg(); let padded_size = (self.elements.len() + (ALIGN - 1)) & ALIGN.wrapping_neg();
@ -194,7 +165,6 @@ impl RenderContext for PietGpuRenderContext {
let rad = self.ramp_cache.add_radial_gradient(&rad); let rad = self.ramp_cache.add_radial_gradient(&rad);
Ok(PietGpuBrush::RadGradient(rad)) Ok(PietGpuBrush::RadGradient(rad))
} }
_ => todo!("don't do radial gradients yet"),
} }
} }

View file

@ -0,0 +1,332 @@
// Copyright 2022 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
use bytemuck::Pod;
use piet_gpu_hal::{CmdBuf, Error, Image, QueryPool, Semaphore, Session, SubmittedCmdBuf};
use crate::{EncodedSceneRef, MemoryHeader, PietGpuRenderContext, Renderer, SceneStats};
/// Additional logic for sequencing rendering operations, specifically
/// for handling failure and reallocation.
///
/// The driver owns a `Renderer` plus one `RenderFrame` per buffer index and
/// operates on the frame selected by `buf_ix`.
///
/// It may be this shouldn't be a separate object from Renderer.
pub struct RenderDriver {
/// Per-buffer frame state (command buffer, query pools, timing stats),
/// indexed by buffer index.
frames: Vec<RenderFrame>,
/// The renderer whose pipelines are recorded and whose buffers are resized.
renderer: Renderer,
/// Index of the current buffer/frame; advanced by `next_buffer`.
buf_ix: usize,
/// The index of a pending fine rasterization submission.
///
/// Set when a frame is submitted and cleared once that frame has been
/// waited on.
pending: Option<usize>,
}
/// Result of `RenderDriver::record_fine`: the still-open command buffer of
/// the current frame together with the renderer's output image.
pub struct TargetState<'a> {
/// Command buffer that has been begun and contains the fine rasterization
/// commands; it has not been finished or submitted yet.
pub cmd_buf: &'a mut CmdBuf,
/// The renderer's device image (`image_dev`).
pub image: &'a Image,
}
/// Query pool results collected for one frame.
///
/// Both vectors are empty until the corresponding query pool has been fetched.
#[derive(Default, Debug)]
pub struct TimingStats {
/// Results fetched from the frame's coarse query pool.
/// NOTE(review): presumably timestamps in seconds (they are scaled by 1e3
/// and printed as ms) — confirm against `Session::fetch_query_pool`.
coarse: Vec<f64>,
/// Results fetched from the frame's fine query pool.
fine: Vec<f64>,
}
/// State kept by `RenderDriver` for each buffer index.
struct RenderFrame {
/// Command buffer for this frame and its lifecycle state.
cmd_buf: CmdBufState,
/// Query pool used when recording the coarse pipeline.
coarse_query_pool: QueryPool,
/// Query pool used when recording fine rasterization.
fine_query_pool: QueryPool,
/// Most recently fetched query results for this frame.
timing_stats: TimingStats,
}
/// Lifecycle of a frame's command buffer.
enum CmdBufState {
/// No command buffer is held; one is requested from the session on demand.
/// This is also the `Default` state left behind by `std::mem::take`.
Start,
/// The command buffer has been submitted and has not been waited on yet.
Submitted(SubmittedCmdBuf),
/// A command buffer is available for recording.
Ready(CmdBuf),
}
impl RenderDriver {
    /// Create a new render driver that owns `renderer` and cycles through
    /// `n` frames.
    ///
    /// Each frame gets its own coarse and fine query pool. Command buffers
    /// are not allocated here; they are requested on first use.
    ///
    /// Should probably be fallible.
    ///
    /// We can get n from the renderer as well.
    ///
    /// # Panics
    ///
    /// Panics if creating any of the query pools fails.
    pub fn new(session: &Session, n: usize, renderer: Renderer) -> RenderDriver {
        let frames = (0..n)
            .map(|_| {
                // Maybe should allocate here so it doesn't happen on first frame?
                let cmd_buf = CmdBufState::default();
                let coarse_query_pool =
                    session.create_query_pool(Renderer::COARSE_QUERY_POOL_SIZE)?;
                let fine_query_pool = session.create_query_pool(Renderer::FINE_QUERY_POOL_SIZE)?;
                Ok(RenderFrame {
                    cmd_buf,
                    coarse_query_pool,
                    fine_query_pool,
                    timing_stats: TimingStats::default(),
                })
            })
            .collect::<Result<_, Error>>()
            .unwrap();
        RenderDriver {
            frames,
            renderer,
            buf_ix: 0,
            pending: None,
        }
    }

    /// Upload the scene held by `render_ctx` into the current buffer,
    /// growing the scene and memory buffers first if its stats require it.
    pub fn upload_render_ctx(
        &mut self,
        session: &Session,
        render_ctx: &mut PietGpuRenderContext,
    ) -> Result<(), Error> {
        let stats = render_ctx.stats();
        self.ensure_scene_buffers(session, &stats)?;
        self.renderer.upload_render_ctx(render_ctx, self.buf_ix)
    }

    /// Upload an already encoded scene into the current buffer, growing the
    /// scene and memory buffers first if its stats require it.
    pub fn upload_scene<T: Copy + Pod>(
        &mut self,
        session: &Session,
        scene: &EncodedSceneRef<T>,
    ) -> Result<(), Error> {
        let stats = scene.stats();
        self.ensure_scene_buffers(session, &stats)?;
        self.renderer.upload_scene(scene, self.buf_ix)
    }

    /// Block until the pending fine rasterization submission (if any) has
    /// completed, and forget it.
    ///
    /// Must be called before replacing a buffer that the fine pass binds.
    fn wait_pending(&mut self) {
        if let Some(pending) = self.pending.take() {
            self.frames[pending].cmd_buf.wait();
        }
    }

    /// Make sure the scene buffer of the current frame and the shared memory
    /// buffer are large enough for a scene with the given stats.
    fn ensure_scene_buffers(&mut self, session: &Session, stats: &SceneStats) -> Result<(), Error> {
        let scene_size = stats.scene_size();
        unsafe {
            self.renderer
                .realloc_scene_if_needed(session, scene_size as u64, self.buf_ix)?;
        }
        let memory_size = self.renderer.memory_size(stats);
        // TODO: better estimate of additional memory needed
        // Note: if we were to cover the worst-case binning output, we could make the
        // binning stage infallible and cut checking logic. It also may not be a bad
        // estimate for the rest.
        let estimated_needed = memory_size as u64 + (1 << 20);
        if estimated_needed > self.renderer.memory_buf_size() {
            // There might be a fine rasterization task that binds the memory buffer
            // still in flight.
            self.wait_pending();
            unsafe {
                self.renderer.realloc_memory(session, estimated_needed)?;
            }
        }
        Ok(())
    }

    /// Run one try of the coarse rendering pipeline.
    ///
    /// Records the coarse pipeline followed by a readback of the memory
    /// header, submits it without semaphores and blocks until it completes.
    /// The coarse query results are stored in the frame's timing stats and
    /// the header read back from the device is returned.
    pub(crate) fn try_run_coarse(&mut self, session: &Session) -> Result<MemoryHeader, Error> {
        let frame = &mut self.frames[self.buf_ix];
        let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
        unsafe {
            cmd_buf.begin();
            // TODO: probably want to return query results as well
            self.renderer
                .record_coarse(cmd_buf, &frame.coarse_query_pool, self.buf_ix);
            self.renderer.record_readback(cmd_buf);
            // The state is already `Ready`, so this hands back the same buffer.
            let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
            cmd_buf.finish_timestamps(&frame.coarse_query_pool);
            cmd_buf.host_barrier();
            cmd_buf.finish();
            frame.cmd_buf.submit(session, &[], &[])?;
            frame.cmd_buf.wait();
            frame.timing_stats.coarse = session.fetch_query_pool(&frame.coarse_query_pool)?;
            let mut result = Vec::new();
            // TODO: consider read method for single POD value
            self.renderer.memory_buf_readback.read(&mut result)?;
            Ok(result[0])
        }
    }

    /// Run the coarse render pipeline, ensuring enough memory for intermediate buffers.
    ///
    /// The coarse pipeline is retried with a larger memory buffer for as long
    /// as the device reports a memory error. On success the blend buffer is
    /// grown if the reported blend offset exceeds its current size.
    pub fn run_coarse(&mut self, session: &Session) -> Result<(), Error> {
        loop {
            let mem_header = self.try_run_coarse(session)?;
            if mem_header.mem_error == 0 {
                let blend_needed = mem_header.blend_offset as u64;
                if blend_needed > self.renderer.blend_size() {
                    // A previously submitted fine pass may still bind the
                    // blend buffer; let it finish before replacing it.
                    self.wait_pending();
                    unsafe {
                        self.renderer.realloc_blend(session, blend_needed)?;
                    }
                }
                return Ok(());
            }
            // Not enough memory, reallocate and retry.
            // TODO: be smarter (multiplier for early stages)
            // Widen before adding so the sum cannot overflow `u32`.
            let mem_size = u64::from(mem_header.mem_offset) + 4096;
            // Same hazard as in `ensure_scene_buffers`: a fine pass submitted
            // for another frame may still bind the memory buffer.
            self.wait_pending();
            // Safety rationalization: no command buffers containing the buffer are
            // in flight.
            unsafe {
                self.renderer.realloc_memory(session, mem_size)?;
                self.renderer.upload_config(self.buf_ix)?;
            }
        }
    }

    /// Record the fine rasterizer, leaving the command buffer open.
    ///
    /// The returned `TargetState` lets the caller append further commands
    /// before calling `submit`.
    pub fn record_fine(&mut self, session: &Session) -> Result<TargetState, Error> {
        let frame = &mut self.frames[self.buf_ix];
        let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
        unsafe {
            cmd_buf.begin();
            self.renderer
                .record_fine(cmd_buf, &frame.fine_query_pool, 0);
        }
        let image = &self.renderer.image_dev;
        Ok(TargetState { cmd_buf, image })
    }

    /// Submit the current command buffer.
    ///
    /// Finishes the fine timestamps and the command buffer, submits it with
    /// the given semaphores and marks the current frame as pending.
    pub fn submit(
        &mut self,
        session: &Session,
        wait_semaphores: &[&Semaphore],
        signal_semaphores: &[&Semaphore],
    ) -> Result<(), Error> {
        let frame = &mut self.frames[self.buf_ix];
        let cmd_buf = frame.cmd_buf.cmd_buf(session)?;
        unsafe {
            cmd_buf.finish_timestamps(&frame.fine_query_pool);
            cmd_buf.host_barrier();
            cmd_buf.finish();
            frame
                .cmd_buf
                .submit(session, wait_semaphores, signal_semaphores)?;
        }
        self.pending = Some(self.buf_ix);
        Ok(())
    }

    /// Wait for frame `buf_ix`, refresh its fine timing stats if the query
    /// pool can be fetched, and clear `pending` if it referred to this frame.
    unsafe fn wait_frame(&mut self, session: &Session, buf_ix: usize) {
        let frame = &mut self.frames[buf_ix];
        frame.cmd_buf.wait();
        if let Ok(stats) = session.fetch_query_pool(&frame.fine_query_pool) {
            frame.timing_stats.fine = stats;
        }
        if self.pending == Some(buf_ix) {
            self.pending = None;
        }
    }

    /// Wait for the current frame's submission to complete.
    ///
    /// # Safety
    ///
    /// NOTE(review): the caller contract is not stated in this file;
    /// presumably `session` must be the session the frame was submitted
    /// on — confirm.
    pub unsafe fn wait(&mut self, session: &Session) {
        self.wait_frame(session, self.buf_ix);
    }

    /// Move to the next buffer.
    pub fn next_buffer(&mut self) {
        self.buf_ix = (self.buf_ix + 1) % self.frames.len()
    }

    /// Wait for frame `buf_ix` and return its timing stats.
    ///
    /// # Safety
    ///
    /// NOTE(review): same unstated contract as `wait` — confirm.
    pub unsafe fn get_timing_stats(&mut self, session: &Session, buf_ix: usize) -> &TimingStats {
        self.wait_frame(session, buf_ix);
        &self.frames[buf_ix].timing_stats
    }

    /// Wait for every frame's submission to complete.
    pub fn wait_all(&mut self, session: &Session) {
        for buf_ix in 0..self.frames.len() {
            unsafe {
                self.wait_frame(session, buf_ix);
            }
        }
    }
}
impl Default for CmdBufState {
fn default() -> Self {
CmdBufState::Start
}
}
impl CmdBufState {
    /// Get a command buffer suitable for recording.
    ///
    /// A buffer that is still submitted is waited on first; if no buffer is
    /// available afterwards, a new one is requested from `session`.
    fn cmd_buf(&mut self, session: &Session) -> Result<&mut CmdBuf, Error> {
        // Fast path: a buffer is already available.
        if let CmdBufState::Ready(ready) = self {
            return Ok(ready);
        }
        // Taking the state leaves `Start` behind; it is only upgraded to
        // `Ready` again if the wait hands the buffer back.
        if let CmdBufState::Submitted(in_flight) = std::mem::take(self) {
            if let Ok(Some(recycled)) = in_flight.wait() {
                *self = CmdBufState::Ready(recycled);
            }
        }
        if matches!(self, CmdBufState::Start) {
            *self = CmdBufState::Ready(session.cmd_buf()?);
        }
        match self {
            CmdBufState::Ready(ready) => Ok(ready),
            _ => unreachable!(),
        }
    }

    /// Submit the held command buffer, moving to the `Submitted` state.
    ///
    /// Returns an error if no command buffer is ready. Note that the state
    /// has been reset to `Start` by then.
    unsafe fn submit(
        &mut self,
        session: &Session,
        wait_semaphores: &[&Semaphore],
        signal_semaphores: &[&Semaphore],
    ) -> Result<(), Error> {
        match std::mem::take(self) {
            CmdBufState::Ready(ready) => {
                let in_flight = session.run_cmd_buf(ready, wait_semaphores, signal_semaphores)?;
                *self = CmdBufState::Submitted(in_flight);
                Ok(())
            }
            _ => Err("Tried to submit CmdBufState not in ready state".into()),
        }
    }

    /// Block until a submitted command buffer has completed.
    ///
    /// Other states are left untouched. If the wait does not hand a command
    /// buffer back, the state ends up as `Start`.
    fn wait(&mut self) {
        // Guard first so that a `Ready` buffer is never taken and dropped.
        if !matches!(self, CmdBufState::Submitted(_)) {
            return;
        }
        if let CmdBufState::Submitted(in_flight) = std::mem::take(self) {
            if let Ok(Some(recycled)) = in_flight.wait() {
                *self = CmdBufState::Ready(recycled);
            }
        }
    }
}
impl TimingStats {
    /// Per-stage times in milliseconds, in the order element, clip + bin +
    /// tile, coarse path, backdrop, coarse raster, fine.
    ///
    /// Every stage after the first is the difference of an adjacent pair of
    /// coarse entries, so both summaries report the same numbers.
    ///
    /// # Panics
    ///
    /// Panics if fewer than 9 coarse entries or no fine entry are present.
    fn stage_times_ms(&self) -> [f64; 6] {
        let ts = &self.coarse;
        [
            ts[0] * 1e3,
            (ts[2] - ts[1]) * 1e3,
            (ts[4] - ts[3]) * 1e3,
            (ts[6] - ts[5]) * 1e3,
            (ts[8] - ts[7]) * 1e3,
            self.fine[0] * 1e3,
        ]
    }

    /// Print one line per pipeline stage to stdout.
    ///
    /// # Panics
    ///
    /// Panics if the coarse or fine stats have not been populated.
    pub fn print_summary(&self) {
        let [el, cl, cp, bd, cr, fr] = self.stage_times_ms();
        println!("Element time: {:.3}ms", el);
        println!("Clip + bin + tile time: {:.3}ms", cl);
        println!("Coarse path time: {:.3}ms", cp);
        println!("Backdrop time: {:.3}ms", bd);
        println!("Coarse raster kernel time: {:.3}ms", cr);
        println!("Fine kernel time: {:.3}ms", fr);
    }

    /// Format the total and the per-stage times as a single line.
    ///
    /// # Panics
    ///
    /// Panics if the coarse or fine stats have not been populated.
    pub fn short_summary(&self) -> String {
        let [el, cl, cp, bd, cr, fr] = self.stage_times_ms();
        let total = el + cl + cp + bd + cr + fr;
        format!(
            "{:.3}ms :: el:{:.3}ms|cl:{:.3}ms|cp:{:.3}ms|bd:{:.3}ms|cr:{:.3}ms|fr:{:.3}ms",
            total, el, cl, cp, bd, cr, fr
        )
    }
}

View file

@ -37,6 +37,7 @@ pub use transform::{
#[repr(C)] #[repr(C)]
#[derive(Clone, Copy, Default, Debug, Zeroable, Pod)] #[derive(Clone, Copy, Default, Debug, Zeroable, Pod)]
pub struct Config { pub struct Config {
pub mem_size: u32,
pub n_elements: u32, // paths pub n_elements: u32, // paths
pub n_pathseg: u32, pub n_pathseg: u32,
pub width_in_tiles: u32, pub width_in_tiles: u32,
@ -167,3 +168,17 @@ impl ElementStage {
.record(pass, &code.draw_code, &binding.draw_binding, n_drawobj); .record(pass, &code.draw_code, &binding.draw_binding, n_drawobj);
} }
} }
impl ElementBinding {
    /// Point the transform, path and draw bindings at a new memory buffer.
    pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
        let Self {
            transform_binding,
            path_binding,
            draw_binding,
            ..
        } = self;
        transform_binding.rebind_memory(session, memory);
        path_binding.rebind_memory(session, memory);
        draw_binding.rebind_memory(session, memory);
    }

    /// Point the transform, path and draw bindings at a new scene buffer.
    pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
        let Self {
            transform_binding,
            path_binding,
            draw_binding,
            ..
        } = self;
        transform_binding.rebind_scene(session, scene);
        path_binding.rebind_scene(session, scene);
        draw_binding.rebind_scene(session, scene);
    }
}

View file

@ -93,4 +93,9 @@ impl ClipBinding {
pass.memory_barrier(); pass.memory_barrier();
} }
} }
pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
session.update_buffer_descriptor(&mut self.reduce_ds, 0, memory);
session.update_buffer_descriptor(&mut self.leaf_ds, 0, memory);
}
} }

View file

@ -163,3 +163,15 @@ impl DrawStage {
); );
} }
} }
impl DrawBinding {
    /// Binding index of the memory buffer in the draw descriptor sets.
    const MEMORY_SLOT: u32 = 0;
    /// Binding index of the scene buffer in the draw descriptor sets.
    const SCENE_SLOT: u32 = 2;

    /// Point the reduce and leaf descriptor sets at a new memory buffer.
    pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
        session.update_buffer_descriptor(&mut self.reduce_ds, Self::MEMORY_SLOT, memory);
        session.update_buffer_descriptor(&mut self.leaf_ds, Self::MEMORY_SLOT, memory);
    }

    /// Point the reduce and leaf descriptor sets at a new scene buffer.
    pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
        session.update_buffer_descriptor(&mut self.reduce_ds, Self::SCENE_SLOT, scene);
        session.update_buffer_descriptor(&mut self.leaf_ds, Self::SCENE_SLOT, scene);
    }
}

View file

@ -200,6 +200,19 @@ impl PathStage {
} }
} }
impl PathBinding {
    /// Point the reduce, clear and path descriptor sets (binding index 0) at
    /// a new memory buffer.
    pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
        let Self {
            reduce_ds,
            clear_ds,
            path_ds,
            ..
        } = self;
        session.update_buffer_descriptor(reduce_ds, 0, memory);
        session.update_buffer_descriptor(clear_ds, 0, memory);
        session.update_buffer_descriptor(path_ds, 0, memory);
    }

    /// Point the reduce and path descriptor sets (binding index 2) at a new
    /// scene buffer. The clear descriptor set is not touched.
    pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
        let Self {
            reduce_ds, path_ds, ..
        } = self;
        session.update_buffer_descriptor(reduce_ds, 2, scene);
        session.update_buffer_descriptor(path_ds, 2, scene);
    }
}
pub struct PathEncoder<'a> { pub struct PathEncoder<'a> {
tag_stream: &'a mut Vec<u8>, tag_stream: &'a mut Vec<u8>,
// If we're never going to use the i16 encoding, it might be // If we're never going to use the i16 encoding, it might be

View file

@ -166,6 +166,18 @@ impl TransformStage {
} }
} }
impl TransformBinding {
    /// Point the reduce and leaf descriptor sets (binding index 0) at a new
    /// memory buffer.
    pub unsafe fn rebind_memory(&mut self, session: &Session, memory: &Buffer) {
        let Self {
            reduce_ds, leaf_ds, ..
        } = self;
        session.update_buffer_descriptor(reduce_ds, 0, memory);
        session.update_buffer_descriptor(leaf_ds, 0, memory);
    }

    /// Point the reduce and leaf descriptor sets (binding index 2) at a new
    /// scene buffer.
    pub unsafe fn rebind_scene(&mut self, session: &Session, scene: &Buffer) {
        let Self {
            reduce_ds, leaf_ds, ..
        } = self;
        session.update_buffer_descriptor(reduce_ds, 2, scene);
        session.update_buffer_descriptor(leaf_ds, 2, scene);
    }
}
impl Transform { impl Transform {
pub const IDENTITY: Transform = Transform { pub const IDENTITY: Transform = Transform {
mat: [1.0, 0.0, 0.0, 1.0], mat: [1.0, 0.0, 0.0, 1.0],