mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
Continue building out pipeline
Plumbs the new tiling scheme to k4. This works (stroke only) but still has some performance issues.
This commit is contained in:
parent
294f6fd1db
commit
70a9c17e23
|
@ -13,8 +13,9 @@ piet_gpu! {
|
||||||
end: [f32; 2],
|
end: [f32; 2],
|
||||||
}
|
}
|
||||||
struct CmdStroke {
|
struct CmdStroke {
|
||||||
// Consider a specialization to one segment.
|
// This is really a Ref<Tile>, but we don't have cross-module
|
||||||
seg_ref: Ref<SegChunk>,
|
// references.
|
||||||
|
tile_ref: u32,
|
||||||
half_width: f32,
|
half_width: f32,
|
||||||
rgba_color: u32,
|
rgba_color: u32,
|
||||||
}
|
}
|
||||||
|
|
|
@ -171,7 +171,7 @@ fn main() -> Result<(), Error> {
|
||||||
|
|
||||||
let fence = device.create_fence(false)?;
|
let fence = device.create_fence(false)?;
|
||||||
let mut cmd_buf = device.create_cmd_buf()?;
|
let mut cmd_buf = device.create_cmd_buf()?;
|
||||||
let query_pool = device.create_query_pool(5)?;
|
let query_pool = device.create_query_pool(7)?;
|
||||||
|
|
||||||
let mut ctx = PietGpuRenderContext::new();
|
let mut ctx = PietGpuRenderContext::new();
|
||||||
if let Some(input) = matches.value_of("INPUT") {
|
if let Some(input) = matches.value_of("INPUT") {
|
||||||
|
@ -204,14 +204,16 @@ fn main() -> Result<(), Error> {
|
||||||
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
|
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
|
||||||
println!("Tile allocation kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
|
println!("Tile allocation kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
|
||||||
println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
|
println!("Coarse path kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
|
||||||
/*
|
println!("Binning kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
|
||||||
println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
|
println!("Coarse raster kernel time: {:.3}ms", (ts[4] - ts[3]) * 1e3);
|
||||||
*/
|
println!("Render kernel time: {:.3}ms", (ts[5] - ts[4]) * 1e3);
|
||||||
|
|
||||||
|
/*
|
||||||
let mut data: Vec<u32> = Default::default();
|
let mut data: Vec<u32> = Default::default();
|
||||||
device.read_buffer(&renderer.tile_buf, &mut data).unwrap();
|
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
|
||||||
piet_gpu::dump_k1_data(&data);
|
piet_gpu::dump_k1_data(&data);
|
||||||
//trace_ptcl(&data);
|
//trace_ptcl(&data);
|
||||||
|
*/
|
||||||
|
|
||||||
let mut img_data: Vec<u8> = Default::default();
|
let mut img_data: Vec<u8> = Default::default();
|
||||||
// Note: because png can use a `&[u8]` slice, we could avoid an extra copy
|
// Note: because png can use a `&[u8]` slice, we could avoid an extra copy
|
||||||
|
|
|
@ -15,17 +15,22 @@ layout(set = 0, binding = 1) buffer BinsBuf {
|
||||||
uint[] bins;
|
uint[] bins;
|
||||||
};
|
};
|
||||||
|
|
||||||
layout(set = 0, binding = 2) buffer AllocBuf {
|
layout(set = 0, binding = 2) buffer TileBuf {
|
||||||
|
uint[] tile;
|
||||||
|
};
|
||||||
|
|
||||||
|
layout(set = 0, binding = 3) buffer AllocBuf {
|
||||||
uint n_elements;
|
uint n_elements;
|
||||||
uint alloc;
|
uint alloc;
|
||||||
};
|
};
|
||||||
|
|
||||||
layout(set = 0, binding = 3) buffer PtclBuf {
|
layout(set = 0, binding = 4) buffer PtclBuf {
|
||||||
uint[] ptcl;
|
uint[] ptcl;
|
||||||
};
|
};
|
||||||
|
|
||||||
#include "annotated.h"
|
#include "annotated.h"
|
||||||
#include "bins.h"
|
#include "bins.h"
|
||||||
|
#include "tile.h"
|
||||||
#include "ptcl.h"
|
#include "ptcl.h"
|
||||||
|
|
||||||
#define LG_N_PART_READ 8
|
#define LG_N_PART_READ 8
|
||||||
|
@ -197,37 +202,11 @@ void main() {
|
||||||
tag = Annotated_tag(ref);
|
tag = Annotated_tag(ref);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Setup for coverage algorithm.
|
|
||||||
float a, b, c;
|
|
||||||
// Bounding box of element in pixel coordinates.
|
// Bounding box of element in pixel coordinates.
|
||||||
float xmin, xmax, ymin, ymax;
|
float xmin, xmax, ymin, ymax;
|
||||||
uint my_slice = th_ix / 32;
|
uint my_slice = th_ix / 32;
|
||||||
uint my_mask = 1 << (th_ix & 31);
|
uint my_mask = 1 << (th_ix & 31);
|
||||||
switch (tag) {
|
switch (tag) {
|
||||||
case Annotated_FillLine:
|
|
||||||
case Annotated_StrokeLine:
|
|
||||||
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
|
|
||||||
xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
|
|
||||||
xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
|
|
||||||
ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
|
|
||||||
ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
|
|
||||||
float dx = line.p1.x - line.p0.x;
|
|
||||||
float dy = line.p1.y - line.p0.y;
|
|
||||||
if (tag == Annotated_FillLine) {
|
|
||||||
// Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
|
|
||||||
if (dy < 0) {
|
|
||||||
atomicOr(sh_bd_sign[my_slice], my_mask);
|
|
||||||
} else {
|
|
||||||
atomicAnd(sh_bd_sign[my_slice], ~my_mask);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
atomicOr(sh_is_segment[my_slice], my_mask);
|
|
||||||
// Set up for per-scanline coverage formula, below.
|
|
||||||
float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
|
|
||||||
c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
|
|
||||||
b = invslope; // Note: assumes square tiles, otherwise scale.
|
|
||||||
a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
|
|
||||||
break;
|
|
||||||
case Annotated_Fill:
|
case Annotated_Fill:
|
||||||
case Annotated_Stroke:
|
case Annotated_Stroke:
|
||||||
// Note: we take advantage of the fact that fills and strokes
|
// Note: we take advantage of the fact that fills and strokes
|
||||||
|
@ -237,10 +216,6 @@ void main() {
|
||||||
xmax = fill.bbox.z;
|
xmax = fill.bbox.z;
|
||||||
ymin = fill.bbox.y;
|
ymin = fill.bbox.y;
|
||||||
ymax = fill.bbox.w;
|
ymax = fill.bbox.w;
|
||||||
// Just let the clamping to xmin and xmax determine the bounds.
|
|
||||||
a = 0.0;
|
|
||||||
b = 0.0;
|
|
||||||
c = 1e9;
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
ymin = 0;
|
ymin = 0;
|
||||||
|
@ -254,37 +229,23 @@ void main() {
|
||||||
// Compute bounding box in tiles and clip to this bin.
|
// Compute bounding box in tiles and clip to this bin.
|
||||||
int x0 = int(floor((xmin - xy0.x) * SX));
|
int x0 = int(floor((xmin - xy0.x) * SX));
|
||||||
int x1 = int(ceil((xmax - xy0.x) * SX));
|
int x1 = int(ceil((xmax - xy0.x) * SX));
|
||||||
int xr = int(ceil((right_edge - xy0.x) * SX));
|
|
||||||
int y0 = int(floor((ymin - xy0.y) * SY));
|
int y0 = int(floor((ymin - xy0.y) * SY));
|
||||||
int y1 = int(ceil((ymax - xy0.y) * SY));
|
int y1 = int(ceil((ymax - xy0.y) * SY));
|
||||||
x0 = clamp(x0, 0, N_TILE_X);
|
x0 = clamp(x0, 0, N_TILE_X);
|
||||||
x1 = clamp(x1, x0, N_TILE_X);
|
x1 = clamp(x1, x0, N_TILE_X);
|
||||||
xr = clamp(xr, 0, N_TILE_X);
|
|
||||||
y0 = clamp(y0, 0, N_TILE_Y);
|
y0 = clamp(y0, 0, N_TILE_Y);
|
||||||
y1 = clamp(y1, y0, N_TILE_Y);
|
y1 = clamp(y1, y0, N_TILE_Y);
|
||||||
float t = a + b * float(y0);
|
|
||||||
for (uint y = y0; y < y1; y++) {
|
for (uint y = y0; y < y1; y++) {
|
||||||
uint xx0 = clamp(int(floor(t - c)), x0, x1);
|
for (uint x = x0; x < x1; x++) {
|
||||||
uint xx1 = clamp(int(ceil(t + c)), x0, x1);
|
|
||||||
for (uint x = xx0; x < xx1; x++) {
|
|
||||||
atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
|
atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
|
||||||
}
|
}
|
||||||
if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
|
|
||||||
// Assign backdrop to all tiles to the right of the ray crossing the
|
|
||||||
// top edge of this tile, up to the right edge of the fill bbox.
|
|
||||||
float xray = t - 0.5 * b;
|
|
||||||
xx0 = max(int(ceil(xray)), 0);
|
|
||||||
for (uint x = xx0; x < xr; x++) {
|
|
||||||
atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
t += b;
|
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
|
|
||||||
// We've computed coverage and other info for each element in the input, now for
|
// We've computed coverage and other info for each element in the input, now for
|
||||||
// the output stage. We'll do segments first using a more parallel algorithm.
|
// the output stage. We'll do segments first using a more parallel algorithm.
|
||||||
|
|
||||||
|
/*
|
||||||
uint seg_count = 0;
|
uint seg_count = 0;
|
||||||
for (uint i = 0; i < N_SLICE; i++) {
|
for (uint i = 0; i < N_SLICE; i++) {
|
||||||
seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
|
seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
|
||||||
|
@ -372,45 +333,29 @@ void main() {
|
||||||
Segment seg = Segment(line.p0, line.p1, y_edge);
|
Segment seg = Segment(line.p0, line.p1, y_edge);
|
||||||
Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
|
Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
// Output non-segment elements for this tile. The thread does a sequential walk
|
// Output non-segment elements for this tile. The thread does a sequential walk
|
||||||
// through the non-segment elements, and for segments, count and backdrop are
|
// through the non-segment elements, and for segments, count and backdrop are
|
||||||
// aggregated using bit counting.
|
// aggregated using bit counting.
|
||||||
uint slice_ix = 0;
|
uint slice_ix = 0;
|
||||||
uint bitmap = sh_bitmaps[0][th_ix];
|
uint bitmap = sh_bitmaps[0][th_ix];
|
||||||
uint bd_bitmap = sh_backdrop[0][th_ix];
|
|
||||||
uint bd_sign = sh_bd_sign[0];
|
|
||||||
uint is_segment = sh_is_segment[0];
|
|
||||||
uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1];
|
|
||||||
seg_count = 0;
|
|
||||||
while (true) {
|
while (true) {
|
||||||
uint nonseg_bitmap = bitmap & ~is_segment;
|
if (bitmap == 0) {
|
||||||
if (nonseg_bitmap == 0) {
|
|
||||||
backdrop += count_backdrop(bd_bitmap, bd_sign);
|
|
||||||
seg_count += bitCount(bitmap & is_segment);
|
|
||||||
slice_ix++;
|
slice_ix++;
|
||||||
if (slice_ix == N_SLICE) {
|
if (slice_ix == N_SLICE) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
bitmap = sh_bitmaps[slice_ix][th_ix];
|
bitmap = sh_bitmaps[slice_ix][th_ix];
|
||||||
bd_bitmap = sh_backdrop[slice_ix][th_ix];
|
if (bitmap == 0) {
|
||||||
bd_sign = sh_bd_sign[slice_ix];
|
|
||||||
is_segment = sh_is_segment[slice_ix];
|
|
||||||
nonseg_bitmap = bitmap & ~is_segment;
|
|
||||||
if (nonseg_bitmap == 0) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap);
|
uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
|
||||||
uint element_ix = sh_elements[element_ref_ix];
|
uint element_ix = sh_elements[element_ref_ix];
|
||||||
|
|
||||||
// Bits up to and including the lsb
|
// Clear LSB
|
||||||
uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap;
|
bitmap &= bitmap - 1;
|
||||||
backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
|
|
||||||
seg_count += bitCount(bitmap & bd_mask & is_segment);
|
|
||||||
// Clear bits that have been consumed.
|
|
||||||
bd_bitmap &= ~bd_mask;
|
|
||||||
bitmap &= ~bd_mask;
|
|
||||||
|
|
||||||
// At this point, we read the element again from global memory.
|
// At this point, we read the element again from global memory.
|
||||||
// If that turns out to be expensive, maybe we can pack it into
|
// If that turns out to be expensive, maybe we can pack it into
|
||||||
|
@ -419,6 +364,7 @@ void main() {
|
||||||
tag = Annotated_tag(ref);
|
tag = Annotated_tag(ref);
|
||||||
|
|
||||||
switch (tag) {
|
switch (tag) {
|
||||||
|
/*
|
||||||
case Annotated_Fill:
|
case Annotated_Fill:
|
||||||
if (last_chunk_n > 0 || seg_count > 0) {
|
if (last_chunk_n > 0 || seg_count > 0) {
|
||||||
SegChunkRef chunk_ref = SegChunkRef(0);
|
SegChunkRef chunk_ref = SegChunkRef(0);
|
||||||
|
@ -460,63 +406,34 @@ void main() {
|
||||||
seg_count = 0;
|
seg_count = 0;
|
||||||
backdrop = 0;
|
backdrop = 0;
|
||||||
break;
|
break;
|
||||||
|
*/
|
||||||
case Annotated_Stroke:
|
case Annotated_Stroke:
|
||||||
// TODO: reduce divergence & code duplication? Much of the
|
// Because the only elements we're processing right now are
|
||||||
// fill and stroke processing is in common.
|
// paths, we can just use the element index as the path index.
|
||||||
if (last_chunk_n > 0 || seg_count > 0) {
|
// In future, when we're doing a bunch of stuff, the path index
|
||||||
SegChunkRef chunk_ref = SegChunkRef(0);
|
// should probably be stored in the annotated element.
|
||||||
if (seg_count > 0) {
|
uint path_ix = element_ix;
|
||||||
chunk_ref = alloc_seg_chunk();
|
Path path = Path_read(PathRef(path_ix * Path_size));
|
||||||
SegChunk chunk;
|
// It may be we have a strong guarantee this will always be `true`, but
|
||||||
chunk.n = seg_count;
|
// I prefer not to take chances.
|
||||||
chunk.next = SegChunkRef(0);
|
if (tile_x >= path.bbox.x && tile_x < path.bbox.z && tile_y >= path.bbox.y && tile_y < path.bbox.w) {
|
||||||
uint seg_offset = seg_alloc + seg_start * Segment_size;
|
uint stride = path.bbox.z - path.bbox.x;
|
||||||
chunk.segs = SegmentRef(seg_offset);
|
uint tile_subix = (tile_y - path.bbox.y) * stride + tile_x - path.bbox.x;
|
||||||
SegChunk_write(chunk_ref, chunk);
|
Tile tile = Tile_read(Tile_index(path.tiles, tile_subix));
|
||||||
|
if (tile.tile.offset != 0) {
|
||||||
|
AnnoStroke stroke = Annotated_Stroke_read(ref);
|
||||||
|
CmdStroke cmd_stroke;
|
||||||
|
cmd_stroke.tile_ref = tile.tile.offset;
|
||||||
|
cmd_stroke.half_width = 0.5 * stroke.linewidth;
|
||||||
|
cmd_stroke.rgba_color = stroke.rgba_color;
|
||||||
|
alloc_cmd(cmd_ref, cmd_limit);
|
||||||
|
Cmd_Stroke_write(cmd_ref, cmd_stroke);
|
||||||
|
cmd_ref.offset += Cmd_size;
|
||||||
}
|
}
|
||||||
if (last_chunk_n > 0) {
|
|
||||||
SegChunk chunk;
|
|
||||||
chunk.n = last_chunk_n;
|
|
||||||
chunk.next = chunk_ref;
|
|
||||||
chunk.segs = last_chunk_segs;
|
|
||||||
SegChunk_write(last_chunk_ref, chunk);
|
|
||||||
} else {
|
|
||||||
first_seg_chunk = chunk_ref;
|
|
||||||
}
|
|
||||||
|
|
||||||
AnnoStroke stroke = Annotated_Stroke_read(ref);
|
|
||||||
CmdStroke cmd_stroke;
|
|
||||||
cmd_stroke.seg_ref = first_seg_chunk;
|
|
||||||
cmd_stroke.half_width = 0.5 * stroke.linewidth;
|
|
||||||
cmd_stroke.rgba_color = stroke.rgba_color;
|
|
||||||
alloc_cmd(cmd_ref, cmd_limit);
|
|
||||||
Cmd_Stroke_write(cmd_ref, cmd_stroke);
|
|
||||||
cmd_ref.offset += Cmd_size;
|
|
||||||
last_chunk_n = 0;
|
|
||||||
}
|
}
|
||||||
seg_start += seg_count;
|
|
||||||
seg_count = 0;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
// This shouldn't happen, but just in case.
|
|
||||||
seg_start++;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (seg_count > 0) {
|
|
||||||
SegChunkRef chunk_ref = alloc_seg_chunk();
|
|
||||||
if (last_chunk_n > 0) {
|
|
||||||
SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
|
|
||||||
} else {
|
|
||||||
first_seg_chunk = chunk_ref;
|
|
||||||
}
|
|
||||||
// TODO: free two registers by writing count and segments ref now,
|
|
||||||
// as opposed to deferring SegChunk write until all fields are known.
|
|
||||||
last_chunk_ref = chunk_ref;
|
|
||||||
last_chunk_n = seg_count;
|
|
||||||
uint seg_offset = seg_alloc + seg_start * Segment_size;
|
|
||||||
last_chunk_segs = SegmentRef(seg_offset);
|
|
||||||
}
|
|
||||||
barrier();
|
barrier();
|
||||||
|
|
||||||
rd_ix += N_TILE;
|
rd_ix += N_TILE;
|
||||||
|
|
Binary file not shown.
|
@ -17,9 +17,14 @@ layout(set = 0, binding = 0) buffer PtclBuf {
|
||||||
uint[] ptcl;
|
uint[] ptcl;
|
||||||
};
|
};
|
||||||
|
|
||||||
layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image;
|
layout(set = 0, binding = 1) buffer TileBuf {
|
||||||
|
uint[] tile;
|
||||||
|
};
|
||||||
|
|
||||||
|
layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
|
||||||
|
|
||||||
#include "ptcl.h"
|
#include "ptcl.h"
|
||||||
|
#include "tile.h"
|
||||||
|
|
||||||
#include "setup.h"
|
#include "setup.h"
|
||||||
|
|
||||||
|
@ -57,28 +62,25 @@ void main() {
|
||||||
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
|
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
|
||||||
float df[CHUNK];
|
float df[CHUNK];
|
||||||
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
|
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
|
||||||
SegChunkRef seg_chunk_ref = stroke.seg_ref;
|
TileSegRef tile_seg_ref = TileSegRef(stroke.tile_ref);
|
||||||
do {
|
do {
|
||||||
SegChunk seg_chunk = SegChunk_read(seg_chunk_ref);
|
TileSeg seg = TileSeg_read(tile_seg_ref);
|
||||||
SegmentRef segs = seg_chunk.segs;
|
vec2 line_vec = seg.end - seg.start;
|
||||||
for (int i = 0; i < seg_chunk.n; i++) {
|
for (uint k = 0; k < CHUNK; k++) {
|
||||||
Segment seg = Segment_read(Segment_index(segs, i));
|
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
|
||||||
vec2 line_vec = seg.end - seg.start;
|
dpos.y += float(k * CHUNK_DY);
|
||||||
for (uint k = 0; k < CHUNK; k++) {
|
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
|
||||||
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
|
df[k] = min(df[k], length(line_vec * t - dpos));
|
||||||
dpos.y += float(k * CHUNK_DY);
|
|
||||||
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
|
|
||||||
df[k] = min(df[k], length(line_vec * t - dpos));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
seg_chunk_ref = seg_chunk.next;
|
tile_seg_ref = seg.next;
|
||||||
} while (seg_chunk_ref.offset != 0);
|
} while (tile_seg_ref.offset != 0);
|
||||||
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
|
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
|
||||||
for (uint k = 0; k < CHUNK; k++) {
|
for (uint k = 0; k < CHUNK; k++) {
|
||||||
float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
|
float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
|
||||||
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
|
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
/*
|
||||||
case Cmd_Fill:
|
case Cmd_Fill:
|
||||||
CmdFill fill = Cmd_Fill_read(cmd_ref);
|
CmdFill fill = Cmd_Fill_read(cmd_ref);
|
||||||
// Probably better to store as float, but conversion is no doubt cheap.
|
// Probably better to store as float, but conversion is no doubt cheap.
|
||||||
|
@ -117,6 +119,7 @@ void main() {
|
||||||
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
|
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
*/
|
||||||
case Cmd_Solid:
|
case Cmd_Solid:
|
||||||
CmdSolid solid = Cmd_Solid_read(cmd_ref);
|
CmdSolid solid = Cmd_Solid_read(cmd_ref);
|
||||||
fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;
|
fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;
|
||||||
|
|
Binary file not shown.
|
@ -78,7 +78,7 @@ void main() {
|
||||||
int stride = bbox.z - bbox.x;
|
int stride = bbox.z - bbox.x;
|
||||||
int base = (y0 - bbox.y) * stride - bbox.x;
|
int base = (y0 - bbox.y) * stride - bbox.x;
|
||||||
// TODO: can be tighter, use c to bound width
|
// TODO: can be tighter, use c to bound width
|
||||||
uint n_tile_alloc = uint(stride * (bbox.w - bbox.y));
|
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
|
||||||
// Consider using subgroups to aggregate atomic add.
|
// Consider using subgroups to aggregate atomic add.
|
||||||
uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
|
uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
|
||||||
TileSeg tile_seg;
|
TileSeg tile_seg;
|
||||||
|
|
Binary file not shown.
|
@ -68,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct CmdStroke {
|
struct CmdStroke {
|
||||||
SegChunkRef seg_ref;
|
uint tile_ref;
|
||||||
float half_width;
|
float half_width;
|
||||||
uint rgba_color;
|
uint rgba_color;
|
||||||
};
|
};
|
||||||
|
@ -220,7 +220,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
|
||||||
uint raw1 = ptcl[ix + 1];
|
uint raw1 = ptcl[ix + 1];
|
||||||
uint raw2 = ptcl[ix + 2];
|
uint raw2 = ptcl[ix + 2];
|
||||||
CmdStroke s;
|
CmdStroke s;
|
||||||
s.seg_ref = SegChunkRef(raw0);
|
s.tile_ref = raw0;
|
||||||
s.half_width = uintBitsToFloat(raw1);
|
s.half_width = uintBitsToFloat(raw1);
|
||||||
s.rgba_color = raw2;
|
s.rgba_color = raw2;
|
||||||
return s;
|
return s;
|
||||||
|
@ -228,7 +228,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
|
||||||
|
|
||||||
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
|
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
|
||||||
uint ix = ref.offset >> 2;
|
uint ix = ref.offset >> 2;
|
||||||
ptcl[ix + 0] = s.seg_ref.offset;
|
ptcl[ix + 0] = s.tile_ref;
|
||||||
ptcl[ix + 1] = floatBitsToUint(s.half_width);
|
ptcl[ix + 1] = floatBitsToUint(s.half_width);
|
||||||
ptcl[ix + 2] = s.rgba_color;
|
ptcl[ix + 2] = s.rgba_color;
|
||||||
}
|
}
|
||||||
|
|
|
@ -183,9 +183,9 @@ impl<D: Device> Renderer<D> {
|
||||||
device.write_buffer(&scene_buf, &scene)?;
|
device.write_buffer(&scene_buf, &scene)?;
|
||||||
|
|
||||||
let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
|
let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
|
||||||
let anno_buf = device.create_buffer(64 * 1024 * 1024, host)?;
|
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||||
let pathseg_buf = device.create_buffer(64 * 1024 * 1024, host)?;
|
let pathseg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||||
let tile_buf = device.create_buffer(64 * 1024 * 1024, host)?;
|
let tile_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||||
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||||
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
|
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
|
||||||
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
|
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
|
||||||
|
@ -228,10 +228,10 @@ impl<D: Device> Renderer<D> {
|
||||||
let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
|
let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
|
||||||
|
|
||||||
// TODO: constants
|
// TODO: constants
|
||||||
let bin_alloc_start = ((n_elements + 255) & !255) * 8;
|
let bin_alloc_start = ((n_paths + 255) & !255) * 8;
|
||||||
device.write_buffer(
|
device.write_buffer(
|
||||||
&bin_alloc_buf_host,
|
&bin_alloc_buf_host,
|
||||||
&[n_elements as u32, 0, bin_alloc_start as u32],
|
&[n_paths as u32, 0, bin_alloc_start as u32],
|
||||||
)?;
|
)?;
|
||||||
let bin_code = include_bytes!("../shader/binning.spv");
|
let bin_code = include_bytes!("../shader/binning.spv");
|
||||||
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
|
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
|
||||||
|
@ -250,16 +250,20 @@ impl<D: Device> Renderer<D> {
|
||||||
&[n_elements as u32, coarse_alloc_start as u32],
|
&[n_elements as u32, coarse_alloc_start as u32],
|
||||||
)?;
|
)?;
|
||||||
let coarse_code = include_bytes!("../shader/coarse.spv");
|
let coarse_code = include_bytes!("../shader/coarse.spv");
|
||||||
let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 4, 0)?;
|
let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 5, 0)?;
|
||||||
let coarse_ds = device.create_descriptor_set(
|
let coarse_ds = device.create_descriptor_set(
|
||||||
&coarse_pipeline,
|
&coarse_pipeline,
|
||||||
&[&anno_buf, &bin_buf, &coarse_alloc_buf_dev, &ptcl_buf],
|
&[&anno_buf, &bin_buf, &tile_buf, &coarse_alloc_buf_dev, &ptcl_buf],
|
||||||
&[],
|
&[],
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let k4_code = include_bytes!("../shader/kernel4.spv");
|
let k4_code = include_bytes!("../shader/kernel4.spv");
|
||||||
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?;
|
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 2, 1)?;
|
||||||
let k4_ds = device.create_descriptor_set(&k4_pipeline, &[&ptcl_buf], &[&image_dev])?;
|
let k4_ds = device.create_descriptor_set(
|
||||||
|
&k4_pipeline,
|
||||||
|
&[&ptcl_buf, &tile_buf],
|
||||||
|
&[&image_dev]
|
||||||
|
)?;
|
||||||
|
|
||||||
Ok(Renderer {
|
Ok(Renderer {
|
||||||
scene_buf,
|
scene_buf,
|
||||||
|
@ -328,32 +332,31 @@ impl<D: Device> Renderer<D> {
|
||||||
&self.path_ds,
|
&self.path_ds,
|
||||||
(((self.n_pathseg + 31) / 32) as u32, 1, 1),
|
(((self.n_pathseg + 31) / 32) as u32, 1, 1),
|
||||||
);
|
);
|
||||||
/*
|
cmd_buf.write_timestamp(&query_pool, 3);
|
||||||
|
// Note: this barrier is not needed as an actual dependency between
|
||||||
|
// pipeline stages, but I am keeping it in so that timer queries are
|
||||||
|
// easier to interpret.
|
||||||
|
cmd_buf.memory_barrier();
|
||||||
cmd_buf.dispatch(
|
cmd_buf.dispatch(
|
||||||
&self.bin_pipeline,
|
&self.bin_pipeline,
|
||||||
&self.bin_ds,
|
&self.bin_ds,
|
||||||
(((self.n_elements + 255) / 256) as u32, 1, 1),
|
(((self.n_paths + 255) / 256) as u32, 1, 1),
|
||||||
);
|
);
|
||||||
*/
|
cmd_buf.write_timestamp(&query_pool, 4);
|
||||||
cmd_buf.write_timestamp(&query_pool, 3);
|
|
||||||
cmd_buf.memory_barrier();
|
cmd_buf.memory_barrier();
|
||||||
/*
|
|
||||||
cmd_buf.dispatch(
|
cmd_buf.dispatch(
|
||||||
&self.coarse_pipeline,
|
&self.coarse_pipeline,
|
||||||
&self.coarse_ds,
|
&self.coarse_ds,
|
||||||
(WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
|
(WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
|
||||||
);
|
);
|
||||||
*/
|
cmd_buf.write_timestamp(&query_pool, 5);
|
||||||
cmd_buf.write_timestamp(&query_pool, 4);
|
|
||||||
cmd_buf.memory_barrier();
|
cmd_buf.memory_barrier();
|
||||||
/*
|
|
||||||
cmd_buf.dispatch(
|
cmd_buf.dispatch(
|
||||||
&self.k4_pipeline,
|
&self.k4_pipeline,
|
||||||
&self.k4_ds,
|
&self.k4_ds,
|
||||||
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
|
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
|
||||||
);
|
);
|
||||||
cmd_buf.write_timestamp(&query_pool, 5);
|
cmd_buf.write_timestamp(&query_pool, 6);
|
||||||
*/
|
|
||||||
cmd_buf.memory_barrier();
|
cmd_buf.memory_barrier();
|
||||||
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
|
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,8 +49,8 @@ impl PicoSvg {
|
||||||
for item in &self.items {
|
for item in &self.items {
|
||||||
match item {
|
match item {
|
||||||
Item::Fill(fill_item) => {
|
Item::Fill(fill_item) => {
|
||||||
rc.fill(&fill_item.path, &fill_item.color);
|
//rc.fill(&fill_item.path, &fill_item.color);
|
||||||
//rc.stroke(&fill_item.path, &fill_item.color, 1.0);
|
rc.stroke(&fill_item.path, &fill_item.color, 1.0);
|
||||||
}
|
}
|
||||||
Item::Stroke(stroke_item) => {
|
Item::Stroke(stroke_item) => {
|
||||||
rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);
|
rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);
|
||||||
|
|
Loading…
Reference in a new issue