mirror of
synced 2025-01-09 20:31:29 +11:00
Continue building out pipeline
Plumbs the new tiling scheme through to k4 (kernel4). This works (stroke only) but still has some performance issues.
This commit is contained in:
@ -13,8 +13,9 @@ piet_gpu! {
end: [f32; 2],
struct CmdStroke {
// Consider a specialization to one segment.
seg_ref: Ref<SegChunk>,
// This is really a Ref<Tile>, but we don't have cross-module
// references.
tile_ref: u32,
half_width: f32,
rgba_color: u32,
@ -171,7 +171,7 @@ fn main() -> Result<(), Error> {
let fence = device.create_fence(false)?;
let mut cmd_buf = device.create_cmd_buf()?;
let query_pool = device.create_query_pool(5)?;
let query_pool = device.create_query_pool(7)?;
let mut ctx = PietGpuRenderContext::new();
if let Some(input) = matches.value_of("INPUT") {
@ -204,14 +204,16 @@ fn main() -> Result<(), Error> {
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
println!("Tile allocation kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
println!("Binning kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
println!("Coarse raster kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3);
println!("Render kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
let mut data: Vec<u32> = Default::default();
device.read_buffer(&renderer.tile_buf, &mut data).unwrap();
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
let mut img_data: Vec<u8> = Default::default();
// Note: because png can use a `&[u8]` slice, we could avoid an extra copy
@ -15,17 +15,22 @@ layout(set = 0, binding = 1) buffer BinsBuf {
uint[] bins;
layout(set = 0, binding = 2) buffer AllocBuf {
layout(set = 0, binding = 2) buffer TileBuf {
uint[] tile;
layout(set = 0, binding = 3) buffer AllocBuf {
uint n_elements;
uint alloc;
layout(set = 0, binding = 3) buffer PtclBuf {
layout(set = 0, binding = 4) buffer PtclBuf {
uint[] ptcl;
#include "annotated.h"
#include "bins.h"
#include "tile.h"
#include "ptcl.h"
#define LG_N_PART_READ 8
@ -197,37 +202,11 @@ void main() {
tag = Annotated_tag(ref);
// Setup for coverage algorithm.
float a, b, c;
// Bounding box of element in pixel coordinates.
float xmin, xmax, ymin, ymax;
uint my_slice = th_ix / 32;
uint my_mask = 1 << (th_ix & 31);
switch (tag) {
case Annotated_FillLine:
case Annotated_StrokeLine:
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
float dx = line.p1.x - line.p0.x;
float dy = line.p1.y - line.p0.y;
if (tag == Annotated_FillLine) {
// Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
if (dy < 0) {
atomicOr(sh_bd_sign[my_slice], my_mask);
} else {
atomicAnd(sh_bd_sign[my_slice], ~my_mask);
atomicOr(sh_is_segment[my_slice], my_mask);
// Set up for per-scanline coverage formula, below.
float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
b = invslope; // Note: assumes square tiles, otherwise scale.
a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
@ -237,10 +216,6 @@ void main() {
xmax = fill.bbox.z;
ymin = fill.bbox.y;
ymax = fill.bbox.w;
// Just let the clamping to xmin and xmax determine the bounds.
a = 0.0;
b = 0.0;
c = 1e9;
ymin = 0;
@ -254,37 +229,23 @@ void main() {
// Compute bounding box in tiles and clip to this bin.
int x0 = int(floor((xmin - xy0.x) * SX));
int x1 = int(ceil((xmax - xy0.x) * SX));
int xr = int(ceil((right_edge - xy0.x) * SX));
int y0 = int(floor((ymin - xy0.y) * SY));
int y1 = int(ceil((ymax - xy0.y) * SY));
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
xr = clamp(xr, 0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
float t = a + b * float(y0);
for (uint y = y0; y < y1; y++) {
uint xx0 = clamp(int(floor(t - c)), x0, x1);
uint xx1 = clamp(int(ceil(t + c)), x0, x1);
for (uint x = xx0; x < xx1; x++) {
for (uint x = x0; x < x1; x++) {
atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
// Assign backdrop to all tiles to the right of the ray crossing the
// top edge of this tile, up to the right edge of the fill bbox.
float xray = t - 0.5 * b;
xx0 = max(int(ceil(xray)), 0);
for (uint x = xx0; x < xr; x++) {
atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
t += b;
// We've computed coverage and other info for each element in the input, now for
// the output stage. We'll do segments first using a more parallel algorithm.
uint seg_count = 0;
for (uint i = 0; i < N_SLICE; i++) {
seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
@ -372,45 +333,29 @@ void main() {
Segment seg = Segment(line.p0, line.p1, y_edge);
Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
// Output non-segment elements for this tile. The thread does a sequential walk
// through the non-segment elements, and for segments, count and backdrop are
// aggregated using bit counting.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
uint bd_bitmap = sh_backdrop[0][th_ix];
uint bd_sign = sh_bd_sign[0];
uint is_segment = sh_is_segment[0];
uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1];
seg_count = 0;
while (true) {
uint nonseg_bitmap = bitmap & ~is_segment;
if (nonseg_bitmap == 0) {
backdrop += count_backdrop(bd_bitmap, bd_sign);
seg_count += bitCount(bitmap & is_segment);
if (bitmap == 0) {
if (slice_ix == N_SLICE) {
bitmap = sh_bitmaps[slice_ix][th_ix];
bd_bitmap = sh_backdrop[slice_ix][th_ix];
bd_sign = sh_bd_sign[slice_ix];
is_segment = sh_is_segment[slice_ix];
nonseg_bitmap = bitmap & ~is_segment;
if (nonseg_bitmap == 0) {
if (bitmap == 0) {
uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap);
uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
uint element_ix = sh_elements[element_ref_ix];
// Bits up to and including the lsb
uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap;
backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
seg_count += bitCount(bitmap & bd_mask & is_segment);
// Clear bits that have been consumed.
bd_bitmap &= ~bd_mask;
bitmap &= ~bd_mask;
// Clear LSB
bitmap &= bitmap - 1;
// At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into
@ -419,6 +364,7 @@ void main() {
tag = Annotated_tag(ref);
switch (tag) {
case Annotated_Fill:
if (last_chunk_n > 0 || seg_count > 0) {
SegChunkRef chunk_ref = SegChunkRef(0);
@ -460,63 +406,34 @@ void main() {
seg_count = 0;
backdrop = 0;
case Annotated_Stroke:
// TODO: reduce divergence & code duplication? Much of the
// fill and stroke processing is in common.
if (last_chunk_n > 0 || seg_count > 0) {
SegChunkRef chunk_ref = SegChunkRef(0);
if (seg_count > 0) {
chunk_ref = alloc_seg_chunk();
SegChunk chunk;
chunk.n = seg_count;
chunk.next = SegChunkRef(0);
uint seg_offset = seg_alloc + seg_start * Segment_size;
chunk.segs = SegmentRef(seg_offset);
SegChunk_write(chunk_ref, chunk);
// Because the only elements we're processing right now are
// paths, we can just use the element index as the path index.
// In future, when we're doing a bunch of stuff, the path index
// should probably be stored in the annotated element.
uint path_ix = element_ix;
Path path = Path_read(PathRef(path_ix * Path_size));
// It may be we have a strong guarantee this will always be `true`, but
// I prefer not to take chances.
if (tile_x >= path.bbox.x && tile_x < path.bbox.z && tile_y >= path.bbox.y && tile_y < path.bbox.w) {
uint stride = path.bbox.z - path.bbox.x;
uint tile_subix = (tile_y - path.bbox.y) * stride + tile_x - path.bbox.x;
Tile tile = Tile_read(Tile_index(path.tiles, tile_subix));
if (tile.tile.offset != 0) {
AnnoStroke stroke = Annotated_Stroke_read(ref);
CmdStroke cmd_stroke;
cmd_stroke.tile_ref = tile.tile.offset;
cmd_stroke.half_width = 0.5 * stroke.linewidth;
cmd_stroke.rgba_color = stroke.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size;
if (last_chunk_n > 0) {
SegChunk chunk;
chunk.n = last_chunk_n;
chunk.next = chunk_ref;
chunk.segs = last_chunk_segs;
SegChunk_write(last_chunk_ref, chunk);
} else {
first_seg_chunk = chunk_ref;
AnnoStroke stroke = Annotated_Stroke_read(ref);
CmdStroke cmd_stroke;
cmd_stroke.seg_ref = first_seg_chunk;
cmd_stroke.half_width = 0.5 * stroke.linewidth;
cmd_stroke.rgba_color = stroke.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size;
last_chunk_n = 0;
seg_start += seg_count;
seg_count = 0;
// This shouldn't happen, but just in case.
if (seg_count > 0) {
SegChunkRef chunk_ref = alloc_seg_chunk();
if (last_chunk_n > 0) {
SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
} else {
first_seg_chunk = chunk_ref;
// TODO: free two registers by writing count and segments ref now,
// as opposed to deferring SegChunk write until all fields are known.
last_chunk_ref = chunk_ref;
last_chunk_n = seg_count;
uint seg_offset = seg_alloc + seg_start * Segment_size;
last_chunk_segs = SegmentRef(seg_offset);
rd_ix += N_TILE;
Binary file not shown.
@ -17,9 +17,14 @@ layout(set = 0, binding = 0) buffer PtclBuf {
uint[] ptcl;
layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image;
layout(set = 0, binding = 1) buffer TileBuf {
uint[] tile;
layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
#include "ptcl.h"
#include "tile.h"
#include "setup.h"
@ -57,28 +62,25 @@ void main() {
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df[CHUNK];
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
SegChunkRef seg_chunk_ref = stroke.seg_ref;
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
do {
SegChunk seg_chunk = SegChunk_read(seg_chunk_ref);
SegmentRef segs = seg_chunk.segs;
for (int i = 0; i < seg_chunk.n; i++) {
Segment seg = Segment_read(Segment_index(segs, i));
vec2 line_vec = seg.end - seg.start;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
dpos.y += float(k * CHUNK_DY);
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
TileSeg seg = TileSeg_read(tile_seg_ref);
vec2 line_vec = seg.end - seg.start;
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
dpos.y += float(k * CHUNK_DY);
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
seg_chunk_ref = seg_chunk.next;
} while (seg_chunk_ref.offset != 0);
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0);
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
for (uint k = 0; k < CHUNK; k++) {
float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
case Cmd_Fill:
CmdFill fill = Cmd_Fill_read(cmd_ref);
// Probably better to store as float, but conversion is no doubt cheap.
@ -117,6 +119,7 @@ void main() {
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
case Cmd_Solid:
CmdSolid solid = Cmd_Solid_read(cmd_ref);
fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;
Binary file not shown.
@ -78,7 +78,7 @@ void main() {
int stride = bbox.z - bbox.x;
int base = (y0 - bbox.y) * stride - bbox.x;
// TODO: can be tighter, use c to bound width
uint n_tile_alloc = uint(stride * (bbox.w - bbox.y));
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
// Consider using subgroups to aggregate atomic add.
uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
TileSeg tile_seg;
Binary file not shown.
@ -68,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
struct CmdStroke {
SegChunkRef seg_ref;
uint tile_ref;
float half_width;
uint rgba_color;
@ -220,7 +220,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
CmdStroke s;
s.seg_ref = SegChunkRef(raw0);
s.tile_ref = raw0;
s.half_width = uintBitsToFloat(raw1);
s.rgba_color = raw2;
return s;
@ -228,7 +228,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.seg_ref.offset;
ptcl[ix + 0] = s.tile_ref;
ptcl[ix + 1] = floatBitsToUint(s.half_width);
ptcl[ix + 2] = s.rgba_color;
@ -183,9 +183,9 @@ impl<D: Device> Renderer<D> {
device.write_buffer(&scene_buf, &scene)?;
let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
let anno_buf = device.create_buffer(64 * 1024 * 1024, host)?;
let pathseg_buf = device.create_buffer(64 * 1024 * 1024, host)?;
let tile_buf = device.create_buffer(64 * 1024 * 1024, host)?;
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let pathseg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let tile_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
@ -228,10 +228,10 @@ impl<D: Device> Renderer<D> {
let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
// TODO: constants
let bin_alloc_start = ((n_elements + 255) & !255) * 8;
let bin_alloc_start = ((n_paths + 255) & !255) * 8;
&[n_elements as u32, 0, bin_alloc_start as u32],
&[n_paths as u32, 0, bin_alloc_start as u32],
let bin_code = include_bytes!("../shader/binning.spv");
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
@ -250,16 +250,20 @@ impl<D: Device> Renderer<D> {
&[n_elements as u32, coarse_alloc_start as u32],
let coarse_code = include_bytes!("../shader/coarse.spv");
let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 4, 0)?;
let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 5, 0)?;
let coarse_ds = device.create_descriptor_set(
&[&anno_buf, &bin_buf, &coarse_alloc_buf_dev, &ptcl_buf],
&[&anno_buf, &bin_buf, &tile_buf, &coarse_alloc_buf_dev, &ptcl_buf],
let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?;
let k4_ds = device.create_descriptor_set(&k4_pipeline, &[&ptcl_buf], &[&image_dev])?;
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 2, 1)?;
let k4_ds = device.create_descriptor_set(
&[&ptcl_buf, &tile_buf],
Ok(Renderer {
@ -328,32 +332,31 @@ impl<D: Device> Renderer<D> {
(((self.n_pathseg + 31) / 32) as u32, 1, 1),
cmd_buf.write_timestamp(&query_pool, 3);
// Note: this barrier is not needed as an actual dependency between
// pipeline stages, but I am keeping it in so that timer queries are
// easier to interpret.
(((self.n_elements + 255) / 256) as u32, 1, 1),
(((self.n_paths + 255) / 256) as u32, 1, 1),
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.write_timestamp(&query_pool, 4);
(WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.write_timestamp(&query_pool, 5);
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.write_timestamp(&query_pool, 6);
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
@ -49,8 +49,8 @@ impl PicoSvg {
for item in &self.items {
match item {
Item::Fill(fill_item) => {
rc.fill(&fill_item.path, &fill_item.color);
//rc.stroke(&fill_item.path, &fill_item.color, 1.0);
//rc.fill(&fill_item.path, &fill_item.color);
rc.stroke(&fill_item.path, &fill_item.color, 1.0);
Item::Stroke(stroke_item) => {
rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);
Reference in a new issue