diff --git a/piet-gpu-types/src/ptcl.rs b/piet-gpu-types/src/ptcl.rs index bdf342b..98c4d44 100644 --- a/piet-gpu-types/src/ptcl.rs +++ b/piet-gpu-types/src/ptcl.rs @@ -13,8 +13,9 @@ piet_gpu! { end: [f32; 2], } struct CmdStroke { - // Consider a specialization to one segment. - seg_ref: Ref, + // This is really a Ref, but we don't have cross-module + // references. + tile_ref: u32, half_width: f32, rgba_color: u32, } diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index 04a20ba..0714f00 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -171,7 +171,7 @@ fn main() -> Result<(), Error> { let fence = device.create_fence(false)?; let mut cmd_buf = device.create_cmd_buf()?; - let query_pool = device.create_query_pool(5)?; + let query_pool = device.create_query_pool(7)?; let mut ctx = PietGpuRenderContext::new(); if let Some(input) = matches.value_of("INPUT") { @@ -204,14 +204,16 @@ fn main() -> Result<(), Error> { println!("Element kernel time: {:.3}ms", ts[0] * 1e3); println!("Tile allocation kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3); println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3); - /* - println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3); - */ + println!("Binning kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3); + println!("Coarse raster kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3); + println!("Render kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3); + /* let mut data: Vec = Default::default(); - device.read_buffer(&renderer.tile_buf, &mut data).unwrap(); + device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap(); piet_gpu::dump_k1_data(&data); //trace_ptcl(&data); + */ let mut img_data: Vec = Default::default(); // Note: because png can use a `&[u8]` slice, we could avoid an extra copy diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index 3656f77..28efd16 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -15,17 +15,22 @@ layout(set = 0, binding = 1) buffer BinsBuf { uint[] bins; }; -layout(set = 0, binding = 2) buffer AllocBuf { +layout(set = 0, binding = 2) buffer TileBuf { + uint[] tile; +}; + +layout(set = 0, binding = 3) buffer AllocBuf { uint n_elements; uint alloc; }; -layout(set = 0, binding = 3) buffer PtclBuf { +layout(set = 0, binding = 4) buffer PtclBuf { uint[] ptcl; }; #include "annotated.h" #include "bins.h" +#include "tile.h" #include "ptcl.h" #define LG_N_PART_READ 8 @@ -197,37 +202,11 @@ void main() { tag = Annotated_tag(ref); } - // Setup for coverage algorithm. - float a, b, c; // Bounding box of element in pixel coordinates. float xmin, xmax, ymin, ymax; uint my_slice = th_ix / 32; uint my_mask = 1 << (th_ix & 31); switch (tag) { - case Annotated_FillLine: - case Annotated_StrokeLine: - AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref); - xmin = min(line.p0.x, line.p1.x) - line.stroke.x; - xmax = max(line.p0.x, line.p1.x) + line.stroke.x; - ymin = min(line.p0.y, line.p1.y) - line.stroke.y; - ymax = max(line.p0.y, line.p1.y) + line.stroke.y; - float dx = line.p1.x - line.p0.x; - float dy = line.p1.y - line.p0.y; - if (tag == Annotated_FillLine) { - // Set bit for backdrop sign calculation, 1 is +1, 0 is -1. - if (dy < 0) { - atomicOr(sh_bd_sign[my_slice], my_mask); - } else { - atomicAnd(sh_bd_sign[my_slice], ~my_mask); - } - } - atomicOr(sh_is_segment[my_slice], my_mask); - // Set up for per-scanline coverage formula, below. - float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy; - c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX; - b = invslope; // Note: assumes square tiles, otherwise scale. - a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX; - break; case Annotated_Fill: case Annotated_Stroke: // Note: we take advantage of the fact that fills and strokes @@ -237,10 +216,6 @@ void main() { xmax = fill.bbox.z; ymin = fill.bbox.y; ymax = fill.bbox.w; - // Just let the clamping to xmin and xmax determine the bounds. - a = 0.0; - b = 0.0; - c = 1e9; break; default: ymin = 0; @@ -254,37 +229,23 @@ void main() { // Compute bounding box in tiles and clip to this bin. int x0 = int(floor((xmin - xy0.x) * SX)); int x1 = int(ceil((xmax - xy0.x) * SX)); - int xr = int(ceil((right_edge - xy0.x) * SX)); int y0 = int(floor((ymin - xy0.y) * SY)); int y1 = int(ceil((ymax - xy0.y) * SY)); x0 = clamp(x0, 0, N_TILE_X); x1 = clamp(x1, x0, N_TILE_X); - xr = clamp(xr, 0, N_TILE_X); y0 = clamp(y0, 0, N_TILE_Y); y1 = clamp(y1, y0, N_TILE_Y); - float t = a + b * float(y0); for (uint y = y0; y < y1; y++) { - uint xx0 = clamp(int(floor(t - c)), x0, x1); - uint xx1 = clamp(int(ceil(t + c)), x0, x1); - for (uint x = xx0; x < xx1; x++) { + for (uint x = x0; x < x1; x++) { atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask); } - if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) { - // Assign backdrop to all tiles to the right of the ray crossing the - // top edge of this tile, up to the right edge of the fill bbox. - float xray = t - 0.5 * b; - xx0 = max(int(ceil(xray)), 0); - for (uint x = xx0; x < xr; x++) { - atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask); - } - } - t += b; } barrier(); // We've computed coverage and other info for each element in the input, now for // the output stage. We'll do segments first using a more parallel algorithm. + /* uint seg_count = 0; for (uint i = 0; i < N_SLICE; i++) { seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]); @@ -372,45 +333,29 @@ void main() { Segment seg = Segment(line.p0, line.p1, y_edge); Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg); } + */ // Output non-segment elements for this tile. The thread does a sequential walk // through the non-segment elements, and for segments, count and backdrop are // aggregated using bit counting. uint slice_ix = 0; uint bitmap = sh_bitmaps[0][th_ix]; - uint bd_bitmap = sh_backdrop[0][th_ix]; - uint bd_sign = sh_bd_sign[0]; - uint is_segment = sh_is_segment[0]; - uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1]; - seg_count = 0; while (true) { - uint nonseg_bitmap = bitmap & ~is_segment; - if (nonseg_bitmap == 0) { - backdrop += count_backdrop(bd_bitmap, bd_sign); - seg_count += bitCount(bitmap & is_segment); + if (bitmap == 0) { slice_ix++; if (slice_ix == N_SLICE) { break; } bitmap = sh_bitmaps[slice_ix][th_ix]; - bd_bitmap = sh_backdrop[slice_ix][th_ix]; - bd_sign = sh_bd_sign[slice_ix]; - is_segment = sh_is_segment[slice_ix]; - nonseg_bitmap = bitmap & ~is_segment; - if (nonseg_bitmap == 0) { + if (bitmap == 0) { continue; } } - uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap); + uint element_ref_ix = slice_ix * 32 + findLSB(bitmap); uint element_ix = sh_elements[element_ref_ix]; - // Bits up to and including the lsb - uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap; - backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign); - seg_count += bitCount(bitmap & bd_mask & is_segment); - // Clear bits that have been consumed. - bd_bitmap &= ~bd_mask; - bitmap &= ~bd_mask; + // Clear LSB + bitmap &= bitmap - 1; // At this point, we read the element again from global memory. // If that turns out to be expensive, maybe we can pack it into @@ -419,6 +364,7 @@ void main() { tag = Annotated_tag(ref); switch (tag) { + /* case Annotated_Fill: if (last_chunk_n > 0 || seg_count > 0) { SegChunkRef chunk_ref = SegChunkRef(0); @@ -460,63 +406,34 @@ void main() { seg_count = 0; backdrop = 0; break; + */ case Annotated_Stroke: - // TODO: reduce divergence & code duplication? Much of the - // fill and stroke processing is in common. - if (last_chunk_n > 0 || seg_count > 0) { - SegChunkRef chunk_ref = SegChunkRef(0); - if (seg_count > 0) { - chunk_ref = alloc_seg_chunk(); - SegChunk chunk; - chunk.n = seg_count; - chunk.next = SegChunkRef(0); - uint seg_offset = seg_alloc + seg_start * Segment_size; - chunk.segs = SegmentRef(seg_offset); - SegChunk_write(chunk_ref, chunk); + // Because the only elements we're processing right now are + // paths, we can just use the element index as the path index. + // In future, when we're doing a bunch of stuff, the path index + // should probably be stored in the annotated element. + uint path_ix = element_ix; + Path path = Path_read(PathRef(path_ix * Path_size)); + // It may be we have a strong guarantee this will always be `true`, but + // I prefer not to take chances. + if (tile_x >= path.bbox.x && tile_x < path.bbox.z && tile_y >= path.bbox.y && tile_y < path.bbox.w) { + uint stride = path.bbox.z - path.bbox.x; + uint tile_subix = (tile_y - path.bbox.y) * stride + tile_x - path.bbox.x; + Tile tile = Tile_read(Tile_index(path.tiles, tile_subix)); + if (tile.tile.offset != 0) { + AnnoStroke stroke = Annotated_Stroke_read(ref); + CmdStroke cmd_stroke; + cmd_stroke.tile_ref = tile.tile.offset; + cmd_stroke.half_width = 0.5 * stroke.linewidth; + cmd_stroke.rgba_color = stroke.rgba_color; + alloc_cmd(cmd_ref, cmd_limit); + Cmd_Stroke_write(cmd_ref, cmd_stroke); + cmd_ref.offset += Cmd_size; } - if (last_chunk_n > 0) { - SegChunk chunk; - chunk.n = last_chunk_n; - chunk.next = chunk_ref; - chunk.segs = last_chunk_segs; - SegChunk_write(last_chunk_ref, chunk); - } else { - first_seg_chunk = chunk_ref; - } - - AnnoStroke stroke = Annotated_Stroke_read(ref); - CmdStroke cmd_stroke; - cmd_stroke.seg_ref = first_seg_chunk; - cmd_stroke.half_width = 0.5 * stroke.linewidth; - cmd_stroke.rgba_color = stroke.rgba_color; - alloc_cmd(cmd_ref, cmd_limit); - Cmd_Stroke_write(cmd_ref, cmd_stroke); - cmd_ref.offset += Cmd_size; - last_chunk_n = 0; } - seg_start += seg_count; - seg_count = 0; - break; - default: - // This shouldn't happen, but just in case. - seg_start++; break; } } - if (seg_count > 0) { - SegChunkRef chunk_ref = alloc_seg_chunk(); - if (last_chunk_n > 0) { - SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs)); - } else { - first_seg_chunk = chunk_ref; - } - // TODO: free two registers by writing count and segments ref now, - // as opposed to deferring SegChunk write until all fields are known. - last_chunk_ref = chunk_ref; - last_chunk_n = seg_count; - uint seg_offset = seg_alloc + seg_start * Segment_size; - last_chunk_segs = SegmentRef(seg_offset); - } barrier(); rd_ix += N_TILE; diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index 4b7e1c4..6b2afaf 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index 2c068aa..0ecda68 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -17,9 +17,14 @@ layout(set = 0, binding = 0) buffer PtclBuf { uint[] ptcl; }; -layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image; +layout(set = 0, binding = 1) buffer TileBuf { + uint[] tile; +}; + +layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image; #include "ptcl.h" +#include "tile.h" #include "setup.h" @@ -57,28 +62,25 @@ void main() { CmdStroke stroke = Cmd_Stroke_read(cmd_ref); float df[CHUNK]; for (uint k = 0; k < CHUNK; k++) df[k] = 1e9; - SegChunkRef seg_chunk_ref = stroke.seg_ref; + TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref); do { - SegChunk seg_chunk = SegChunk_read(seg_chunk_ref); - SegmentRef segs = seg_chunk.segs; - for (int i = 0; i < seg_chunk.n; i++) { - Segment seg = Segment_read(Segment_index(segs, i)); - vec2 line_vec = seg.end - seg.start; - for (uint k = 0; k < CHUNK; k++) { - vec2 dpos = xy + vec2(0.5, 0.5) - seg.start; - dpos.y += float(k * CHUNK_DY); - float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); - df[k] = min(df[k], length(line_vec * t - dpos)); - } + TileSeg seg = TileSeg_read(tile_seg_ref); + vec2 line_vec = seg.end - seg.start; + for (uint k = 0; k < CHUNK; k++) { + vec2 dpos = xy + vec2(0.5, 0.5) - seg.start; + dpos.y += float(k * CHUNK_DY); + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df[k] = min(df[k], length(line_vec * t - dpos)); } - seg_chunk_ref = seg_chunk.next; - } while (seg_chunk_ref.offset != 0); + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0); fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx; for (uint k = 0; k < CHUNK; k++) { float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0); rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a); } break; + /* case Cmd_Fill: CmdFill fill = Cmd_Fill_read(cmd_ref); // Probably better to store as float, but conversion is no doubt cheap. @@ -117,6 +119,7 @@ void main() { rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a); } break; + */ case Cmd_Solid: CmdSolid solid = Cmd_Solid_read(cmd_ref); fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx; diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index 5215e2f..cb27407 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp index ff79925..5a4b78c 100644 --- a/piet-gpu/shader/path_coarse.comp +++ b/piet-gpu/shader/path_coarse.comp @@ -78,7 +78,7 @@ void main() { int stride = bbox.z - bbox.x; int base = (y0 - bbox.y) * stride - bbox.x; // TODO: can be tighter, use c to bound width - uint n_tile_alloc = uint(stride * (bbox.w - bbox.y)); + uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); // Consider using subgroups to aggregate atomic add. uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size); TileSeg tile_seg; diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv index ed212d7..cf0d4b9 100644 Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h index dd1f9a8..d337598 100644 --- a/piet-gpu/shader/ptcl.h +++ b/piet-gpu/shader/ptcl.h @@ -68,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) { } struct CmdStroke { - SegChunkRef seg_ref; + uint tile_ref; float half_width; uint rgba_color; }; @@ -220,7 +220,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) { uint raw1 = ptcl[ix + 1]; uint raw2 = ptcl[ix + 2]; CmdStroke s; - s.seg_ref = SegChunkRef(raw0); + s.tile_ref = raw0; s.half_width = uintBitsToFloat(raw1); s.rgba_color = raw2; return s; @@ -228,7 +228,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) { void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = s.seg_ref.offset; + ptcl[ix + 0] = s.tile_ref; ptcl[ix + 1] = floatBitsToUint(s.half_width); ptcl[ix + 2] = s.rgba_color; } diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 19e9b43..b568827 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -183,9 +183,9 @@ impl Renderer { device.write_buffer(&scene_buf, &scene)?; let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?; - let anno_buf = device.create_buffer(64 * 1024 * 1024, host)?; - let pathseg_buf = device.create_buffer(64 * 1024 * 1024, host)?; - let tile_buf = device.create_buffer(64 * 1024 * 1024, host)?; + let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?; + let pathseg_buf = device.create_buffer(64 * 1024 * 1024, dev)?; + let tile_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?; let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?; let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?; @@ -228,10 +228,10 @@ impl Renderer { let bin_alloc_buf_dev = device.create_buffer(12, dev)?; // TODO: constants - let bin_alloc_start = ((n_elements + 255) & !255) * 8; + let bin_alloc_start = ((n_paths + 255) & !255) * 8; device.write_buffer( &bin_alloc_buf_host, - &[n_elements as u32, 0, bin_alloc_start as u32], + &[n_paths as u32, 0, bin_alloc_start as u32], )?; let bin_code = include_bytes!("../shader/binning.spv"); let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?; @@ -250,16 +250,20 @@ impl Renderer { &[n_elements as u32, coarse_alloc_start as u32], )?; let coarse_code = include_bytes!("../shader/coarse.spv"); - let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 4, 0)?; + let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 5, 0)?; let coarse_ds = device.create_descriptor_set( &coarse_pipeline, - &[&anno_buf, &bin_buf, &coarse_alloc_buf_dev, &ptcl_buf], + &[&anno_buf, &bin_buf, &tile_buf, &coarse_alloc_buf_dev, &ptcl_buf], &[], )?; let k4_code = include_bytes!("../shader/kernel4.spv"); - let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?; - let k4_ds = device.create_descriptor_set(&k4_pipeline, &[&ptcl_buf], &[&image_dev])?; + let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 2, 1)?; + let k4_ds = device.create_descriptor_set( + &k4_pipeline, + &[&ptcl_buf, &tile_buf], + &[&image_dev] + )?; Ok(Renderer { scene_buf, @@ -328,32 +332,31 @@ impl Renderer { &self.path_ds, (((self.n_pathseg + 31) / 32) as u32, 1, 1), ); - /* + cmd_buf.write_timestamp(&query_pool, 3); + // Note: this barrier is not needed as an actual dependency between + // pipeline stages, but I am keeping it in so that timer queries are + // easier to interpret. + cmd_buf.memory_barrier(); cmd_buf.dispatch( &self.bin_pipeline, &self.bin_ds, - (((self.n_elements + 255) / 256) as u32, 1, 1), + (((self.n_paths + 255) / 256) as u32, 1, 1), ); - */ - cmd_buf.write_timestamp(&query_pool, 3); + cmd_buf.write_timestamp(&query_pool, 4); cmd_buf.memory_barrier(); - /* cmd_buf.dispatch( &self.coarse_pipeline, &self.coarse_ds, (WIDTH as u32 / 256, HEIGHT as u32 / 256, 1), ); - */ - cmd_buf.write_timestamp(&query_pool, 4); + cmd_buf.write_timestamp(&query_pool, 5); cmd_buf.memory_barrier(); - /* cmd_buf.dispatch( &self.k4_pipeline, &self.k4_ds, ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), ); - cmd_buf.write_timestamp(&query_pool, 5); - */ + cmd_buf.write_timestamp(&query_pool, 6); cmd_buf.memory_barrier(); cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc); } diff --git a/piet-gpu/src/pico_svg.rs b/piet-gpu/src/pico_svg.rs index 140c42d..0aac61a 100644 --- a/piet-gpu/src/pico_svg.rs +++ b/piet-gpu/src/pico_svg.rs @@ -49,8 +49,8 @@ impl PicoSvg { for item in &self.items { match item { Item::Fill(fill_item) => { - rc.fill(&fill_item.path, &fill_item.color); - //rc.stroke(&fill_item.path, &fill_item.color, 1.0); + //rc.fill(&fill_item.path, &fill_item.color); + rc.stroke(&fill_item.path, &fill_item.color, 1.0); } Item::Stroke(stroke_item) => { rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);