Implement stroked polylines

This version seems to work but the allocation of segments has low
utilization. Probably best to allocate in chunks rather than try to
make them contiguous.
This commit is contained in:
Raph Levien 2020-04-28 11:02:19 -07:00
parent 55e35dd879
commit cb06b1bc3d
20 changed files with 502 additions and 44 deletions

View file

@ -1,5 +1,6 @@
pub mod encoder; pub mod encoder;
pub mod ptcl; pub mod ptcl;
pub mod scene; pub mod scene;
pub mod segment;
pub mod test; pub mod test;
pub mod tilegroup; pub mod tilegroup;

View file

@ -6,6 +6,7 @@ fn main() {
match mod_name.as_str() { match mod_name.as_str() {
"scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()), "scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
"tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()), "tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
"segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
"ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()), "ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
"test" => print!("{}", piet_gpu_types::test::gen_gpu_test()), "test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
_ => println!("Oops, unknown module name"), _ => println!("Oops, unknown module name"),

View file

@ -13,8 +13,10 @@ piet_gpu! {
end: [f32; 2], end: [f32; 2],
} }
struct CmdStroke { struct CmdStroke {
// In existing code, this is f16. Should we have support? n_segs: u32,
halfWidth: f32, // Should be Ref<Segment> if we had cross-module references.
seg_ref: u32,
half_width: f32,
rgba_color: u32, rgba_color: u32,
} }
struct CmdFill { struct CmdFill {

View file

@ -0,0 +1,27 @@
use piet_gpu_derive::piet_gpu;
// Structures representing segments for stroke/fill items.
// GPU-side layout of the per-tile stroke segment lists. These declarations
// are consumed by the `piet_gpu!` macro; the generated shader header declares
// matching structs plus index/read/write helpers over a `segment` buffer.
piet_gpu! {
// NOTE(review): presumably `gpu_write` is what requests the `*_write`
// helpers in the generated header -- confirm against piet-gpu-derive.
#[gpu_write]
mod segment {
// One header per tile: `n` is the number of item headers for the tile and
// `items` references the first of them.
struct TileHeader {
n: u32,
items: Ref<ItemHeader>,
}
// Note: this is only suitable for strokes, fills require backdrop.
// One header per stroke item in a tile: `n` is the number of segments
// recorded for the item and `segments` references the first of them.
struct ItemHeader {
n: u32,
segments: Ref<Segment>,
}
// TODO: strongly consider using f16. If so, these would be
// relative to the tile. We're doing f32 for now to minimize
// divergence from piet-metal originals.
// A single line segment, stored as its two end points (x, y).
struct Segment {
start: [f32; 2],
end: [f32; 2],
}
}
}

View file

@ -1,5 +1,18 @@
use piet_gpu_derive::piet_gpu; use piet_gpu_derive::piet_gpu;
// Structures representing tilegroup instances (output of kernel 1).
// There are three outputs: the main instances, the stroke instances,
// and the fill instances. All three are conceptually a list of
// instances, but the encoding is slightly different. The first is
// encoded with Instance, Jump, and End. The other two are encoded
// as a linked list of Chunk.
// The motivation for the difference is that the first requires fewer
// registers to track state, but the second contains information that
// is useful up front for doing dynamic allocation in kernel 2, as
// well as increasing read parallelism; the "jump" approach really is
// geared to sequential reading.
piet_gpu! { piet_gpu! {
#[gpu_write] #[gpu_write]
mod tilegroup { mod tilegroup {
@ -11,7 +24,11 @@ piet_gpu! {
offset: [f32; 2], offset: [f32; 2],
} }
struct Jump { struct Jump {
new_ref: u32, new_ref: Ref<TileGroup>,
}
struct Chunk {
chunk_n: u32,
next: Ref<Chunk>,
} }
enum TileGroup { enum TileGroup {
Instance(Instance), Instance(Instance),

View file

@ -11,6 +11,8 @@ build image.spv: glsl image.comp | scene.h
build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h
build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h
build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h ptcl.h setup.h build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h ptcl.h setup.h
build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h

View file

@ -7,8 +7,7 @@
// subgroups (or possibly both) to parallelize the reading of the input and // subgroups (or possibly both) to parallelize the reading of the input and
// the computation of tilegroup intersection. // the computation of tilegroup intersection.
// //
// In addition, there are some features currently missing. One is the use of // In addition, there are some features currently missing, such as support
// a bump allocator to extend the current fixed allocation. Another is support
// for clipping. // for clipping.
#version 450 #version 450
@ -46,8 +45,17 @@ void main() {
StackElement stack[MAX_STACK]; StackElement stack[MAX_STACK];
uint stack_ix = 0; uint stack_ix = 0;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x; uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC); TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size; uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
// State for stroke references.
TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START);
ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4);
InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size);
uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_ALLOC - Instance_size;
uint stroke_chunk_n = 0;
uint stroke_n = 0;
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX); vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
PietItemRef root = PietItemRef(0); PietItemRef root = PietItemRef(0);
SimpleGroup group = PietItem_Group_read(root); SimpleGroup group = PietItem_Group_read(root);
@ -60,9 +68,11 @@ void main() {
bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX)) bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
&& max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX)); && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
bool is_group = false; bool is_group = false;
uint tag;
if (hit) { if (hit) {
PietItemRef item_ref = PietItem_index(group.items, tos.index); PietItemRef item_ref = PietItem_index(group.items, tos.index);
is_group = PietItem_tag(item_ref) == PietItem_Group; tag = PietItem_tag(item_ref);
is_group = tag == PietItem_Group;
} }
if (hit && !is_group) { if (hit && !is_group) {
PietItemRef item_ref = PietItem_index(group.items, tos.index); PietItemRef item_ref = PietItem_index(group.items, tos.index);
@ -70,13 +80,27 @@ void main() {
if (tg_ref.offset > tg_limit) { if (tg_ref.offset > tg_limit) {
// Allocation exceeded; do atomic bump alloc. // Allocation exceeded; do atomic bump alloc.
uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC); uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC);
Jump jump = Jump(new_tg); Jump jump = Jump(TileGroupRef(new_tg));
TileGroup_Jump_write(tg_ref, jump); TileGroup_Jump_write(tg_ref, jump);
tg_ref = TileGroupRef(new_tg); tg_ref = TileGroupRef(new_tg);
tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size; tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
} }
TileGroup_Instance_write(tg_ref, ins); TileGroup_Instance_write(tg_ref, ins);
tg_ref.offset += TileGroup_size; tg_ref.offset += TileGroup_size;
if (tag == PietItem_Poly) {
if (stroke_ref.offset > stroke_limit) {
uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC);
Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke)));
stroke_chunk_start = ChunkRef(new_stroke);
stroke_ref = InstanceRef(new_stroke + Chunk_size);
stroke_n += stroke_chunk_n;
stroke_chunk_n = 0;
stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size;
}
Instance_write(stroke_ref, ins);
stroke_chunk_n++;
stroke_ref.offset += Instance_size;
}
} }
if (is_group) { if (is_group) {
PietItemRef item_ref = PietItem_index(group.items, tos.index); PietItemRef item_ref = PietItem_index(group.items, tos.index);
@ -99,4 +123,10 @@ void main() {
} }
} }
TileGroup_End_write(tg_ref); TileGroup_End_write(tg_ref);
stroke_n += stroke_chunk_n;
if (stroke_n > 0) {
Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0)));
}
tilegroup[stroke_start.offset >> 2] = stroke_n;
} }

Binary file not shown.

View file

@ -0,0 +1,127 @@
// This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke
// (polyline) items in the scene and generates a list of segments for each, for
// each tile.
#version 450
#extension GL_GOOGLE_include_directive : enable
// Workgroup is 32 invocations wide; main() maps each invocation to one tile
// via gl_GlobalInvocationID.
layout(local_size_x = 32) in;
// Binding 0: the encoded scene, only read by this kernel.
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
// Binding 1: tilegroup data; main() reads the stroke instance count, the
// chunk headers and the instances from it. It is not declared readonly because
// the included tilegroup.h also defines write helpers that store into
// `tilegroup`.
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
// Binding 2: output of this kernel. main() writes tile headers, item headers
// and segments here through the helpers in segment.h.
layout(set = 0, binding = 2) buffer SegmentBuf {
uint[] segment;
};
// Binding 3: bump-allocation cursor. main() advances it with atomicAdd and
// uses the returned value as a byte offset into `segment`.
layout(set = 0, binding = 3) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "segment.h"
#include "setup.h"
// Kernel 2 (strokes) entry point. Each invocation handles one tile: it walks
// the chunked list of stroke instances stored for the tile's tilegroup and,
// for every polyline item, writes the segments that may touch this tile into
// the `segment` buffer.
//
// Output layout in `segment`:
//   - a TileHeader at tile_ix * TileHeader_size holding the item count and a
//     reference to the tile's ItemHeader array;
//   - one ItemHeader per stroke instance, in instance order (its n may be 0);
//   - the Segment runs the ItemHeaders point to, bump-allocated from `alloc`.
void main() {
    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START);
    // The first word of the stroke region is the total stroke instance count.
    uint stroke_n = tilegroup[stroke_start.offset >> 2];

    TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size);
    if (stroke_n > 0) {
        // The first chunk header follows the count word; the chunk's instances
        // follow the chunk header.
        ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4);
        Chunk chunk = Chunk_read(chunk_ref);
        InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
        // One ItemHeader per stroke instance, allocated contiguously.
        ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size));
        TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header));
        SegmentRef seg_ref = SegmentRef(0);
        uint seg_limit = 0;
        // Iterate through items; stroke_n holds count remaining.
        while (true) {
            if (chunk.chunk_n == 0) {
                // Current chunk is exhausted; follow the link to the next one.
                chunk_ref = chunk.next;
                chunk = Chunk_read(chunk_ref);
                stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
            }
            Instance ins = Instance_read(stroke_ref);
            PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref));

            // Process the stroke polyline item.
            // n_points is unsigned: without this guard an empty polyline would
            // wrap to 0xFFFFFFFF segments, producing a bogus reservation and an
            // effectively unbounded out-of-range loop below.
            uint max_n_segs = poly.n_points > 0u ? poly.n_points - 1u : 0u;
            uint reserve = max_n_segs * Segment_size;
            if (seg_ref.offset + reserve > seg_limit) {
                // This is a heuristic to balance atomic bandwidth and utilization.
                // The output always gets a contiguous allocation. We might use
                // all, some, or none of the capacity.
                uint capacity_bytes = stroke_n > 1 ? reserve * 2 + 128 : reserve;
                seg_ref.offset = atomicAdd(alloc, capacity_bytes);
                seg_limit = seg_ref.offset + capacity_bytes;
            }

            // Tile boundaries padded by half-width. These depend only on the
            // item and the tile, so compute them once per item rather than
            // once per segment.
            float half_width = 0.5 * poly.width;
            float xmin = xy0.x - half_width;
            float ymin = xy0.y - half_width;
            float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width;
            float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width;

            uint n_segs = 0;
            // Only touch the point array when there is at least one segment.
            vec2 start = vec2(0.0);
            if (max_n_segs > 0u) {
                start = Point_read(poly.points).xy;
            }
            for (uint j = 0; j < max_n_segs; j++) {
                poly.points.offset += Point_size;
                vec2 end = Point_read(poly.points).xy;

                // Process one segment.

                // This logic just tests for collision. What we probably want to do
                // is a clipping algorithm like Liang-Barsky, and then store coords
                // relative to the tile in f16. See also:
                // https://tavianator.com/fast-branchless-raybounding-box-intersections/

                // Also note that when we go to the fancy version, we want to compute
                // the (horizontal projection of) the bounding box of the intersection
                // once per tilegroup, so we can assign work to individual tiles.

                // Implicit line equation a*x + b*y + c = 0 through start and end.
                float a = end.y - start.y;
                float b = start.x - end.x;
                float c = -(a * start.x + b * start.y);
                // Which side of the line each padded tile corner lies on.
                float s00 = sign(b * ymin + a * xmin + c);
                float s01 = sign(b * ymin + a * xmax + c);
                float s10 = sign(b * ymax + a * xmin + c);
                float s11 = sign(b * ymax + a * xmax + c);
                // If bounding boxes intersect and not all four corners are on the same side, hit.
                // Also note: this is designed to be false on NAN input.
                if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax)
                    && max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax)
                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
                {
                    Segment seg = Segment(start, end);
                    Segment_write(Segment_index(seg_ref, n_segs), seg);
                    n_segs++;
                }

                start = end;
            }
            // Record the item even when n_segs is 0 so that item headers stay
            // in one-to-one correspondence with stroke instances.
            ItemHeader_write(item_header, ItemHeader(n_segs, seg_ref));
            if (--stroke_n == 0) {
                break;
            }
            // Advance past the segments actually written; unused reserve is
            // reused by the next item if it fits.
            seg_ref.offset += n_segs * Segment_size;
            stroke_ref.offset += Instance_size;
            chunk.chunk_n--;
            item_header.offset += ItemHeader_size;
        }
    } else {
        // As an optimization, we could just write 0 for the size.
        TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0)));
    }
}

Binary file not shown.

View file

@ -16,16 +16,22 @@ layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup; uint[] tilegroup;
}; };
layout(set = 0, binding = 2) buffer PtclBuf { // Used readonly
layout(set = 0, binding = 2) buffer SegmentBuf {
uint[] segment;
};
layout(set = 0, binding = 3) buffer PtclBuf {
uint[] ptcl; uint[] ptcl;
}; };
layout(set = 0, binding = 3) buffer AllocBuf { layout(set = 0, binding = 4) buffer AllocBuf {
uint alloc; uint alloc;
}; };
#include "scene.h" #include "scene.h"
#include "tilegroup.h" #include "tilegroup.h"
#include "segment.h"
#include "ptcl.h" #include "ptcl.h"
#include "setup.h" #include "setup.h"
@ -45,17 +51,19 @@ void main() {
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+ (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES); + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX); vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_INITIAL_ALLOC); TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC); CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size));
while (true) { while (true) {
uint tg_tag = TileGroup_tag(tg_ref); uint tg_tag = TileGroup_tag(tg_ref);
if (tg_tag == TileGroup_End) { if (tg_tag == TileGroup_End) {
break; break;
} }
if (tg_tag == TileGroup_Jump) { if (tg_tag == TileGroup_Jump) {
tg_ref = TileGroupRef(TileGroup_Jump_read(tg_ref).new_ref); tg_ref = TileGroup_Jump_read(tg_ref).new_ref;
continue; continue;
} }
// Assume tg_tag is `Instance`, though there will be more cases. // Assume tg_tag is `Instance`, though there will be more cases.
@ -76,6 +84,22 @@ void main() {
cmd_ref.offset += Cmd_size; cmd_ref.offset += Cmd_size;
} }
break; break;
case PietItem_Poly:
ItemHeader stroke_item = ItemHeader_read(stroke_th.items);
stroke_th.items.offset += ItemHeader_size;
if (stroke_item.n > 0) {
PietStrokePolyLine poly = PietItem_Poly_read(item_ref);
CmdStroke cmd = CmdStroke(
stroke_item.n,
stroke_item.segments.offset,
0.5 * poly.width,
poly.rgba_color
);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd);
cmd_ref.offset += Cmd_size;
}
break;
} }
tg_ref.offset += TileGroup_size; tg_ref.offset += TileGroup_size;
} }

Binary file not shown.

View file

@ -14,11 +14,17 @@ layout(set = 0, binding = 0) buffer PtclBuf {
uint[] ptcl; uint[] ptcl;
}; };
layout(set = 0, binding = 1) buffer ImageBuf { // Used readonly
layout(set = 0, binding = 1) buffer SegmentBuf {
uint[] segment;
};
layout(set = 0, binding = 2) buffer ImageBuf {
uint[] image; uint[] image;
}; };
#include "ptcl.h" #include "ptcl.h"
#include "segment.h"
#include "setup.h" #include "setup.h"
@ -41,10 +47,24 @@ void main() {
CmdCircle circle = Cmd_Circle_read(cmd_ref); CmdCircle circle = Cmd_Circle_read(cmd_ref);
float r = length(xy + vec2(0.5, 0.5) - circle.center.xy); float r = length(xy + vec2(0.5, 0.5) - circle.center.xy);
float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0); float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color); vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx;
// TODO: sRGB // TODO: sRGB
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a); rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
break; break;
case Cmd_Stroke:
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df = 1e9;
for (int i = 0; i < stroke.n_segs; i++) {
Segment seg = Segment_read(Segment_index(SegmentRef(stroke.seg_ref), i));
vec2 line_vec = seg.end - seg.start;
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df = min(df, length(line_vec * t - dpos));
}
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
alpha = clamp(stroke.half_width + 0.5 - df, 0.0, 1.0);
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
break;
case Cmd_Jump: case Cmd_Jump:
cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref); cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
continue; continue;

Binary file not shown.

View file

@ -60,11 +60,13 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
} }
struct CmdStroke { struct CmdStroke {
float halfWidth; uint n_segs;
uint seg_ref;
float half_width;
uint rgba_color; uint rgba_color;
}; };
#define CmdStroke_size 8 #define CmdStroke_size 16
CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) { CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
return CmdStrokeRef(ref.offset + index * CmdStroke_size); return CmdStrokeRef(ref.offset + index * CmdStroke_size);
@ -184,16 +186,22 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
uint ix = ref.offset >> 2; uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0]; uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1]; uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
uint raw3 = ptcl[ix + 3];
CmdStroke s; CmdStroke s;
s.halfWidth = uintBitsToFloat(raw0); s.n_segs = raw0;
s.rgba_color = raw1; s.seg_ref = raw1;
s.half_width = uintBitsToFloat(raw2);
s.rgba_color = raw3;
return s; return s;
} }
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) { void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
uint ix = ref.offset >> 2; uint ix = ref.offset >> 2;
ptcl[ix + 0] = floatBitsToUint(s.halfWidth); ptcl[ix + 0] = s.n_segs;
ptcl[ix + 1] = s.rgba_color; ptcl[ix + 1] = s.seg_ref;
ptcl[ix + 2] = floatBitsToUint(s.half_width);
ptcl[ix + 3] = s.rgba_color;
} }
CmdFill CmdFill_read(CmdFillRef ref) { CmdFill CmdFill_read(CmdFillRef ref) {

99
piet-gpu/shader/segment.h Normal file
View file

@ -0,0 +1,99 @@
// Code auto-generated by piet-gpu-derive
// Typed references into the `segment` buffer. `offset` is a byte offset; the
// read/write helpers below convert it to a word index with `>> 2`.

// Reference to a TileHeader.
struct TileHeaderRef {
uint offset;
};
// Reference to an ItemHeader.
struct ItemHeaderRef {
uint offset;
};
// Reference to a Segment.
struct SegmentRef {
uint offset;
};
// Per-tile header: `n` item headers starting at `items`.
struct TileHeader {
    uint n;
    ItemHeaderRef items;
};

// Encoded size of a TileHeader in bytes (two 32-bit words).
#define TileHeader_size 8

// Returns the reference `index` TileHeader elements past `ref`.
TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) {
    uint byte_offset = ref.offset + index * TileHeader_size;
    return TileHeaderRef(byte_offset);
}
// Per-item header: `n` segments starting at `segments`.
struct ItemHeader {
    uint n;
    SegmentRef segments;
};

// Encoded size of an ItemHeader in bytes (two 32-bit words).
#define ItemHeader_size 8

// Returns the reference `index` ItemHeader elements past `ref`.
ItemHeaderRef ItemHeader_index(ItemHeaderRef ref, uint index) {
    uint byte_offset = ref.offset + index * ItemHeader_size;
    return ItemHeaderRef(byte_offset);
}
// A line segment given by its two end points.
struct Segment {
    vec2 start;
    vec2 end;
};

// Encoded size of a Segment in bytes (four 32-bit floats).
#define Segment_size 16

// Returns the reference `index` Segment elements past `ref`.
SegmentRef Segment_index(SegmentRef ref, uint index) {
    uint byte_offset = ref.offset + index * Segment_size;
    return SegmentRef(byte_offset);
}
// Decodes the TileHeader stored at `ref` in the `segment` buffer.
TileHeader TileHeader_read(TileHeaderRef ref) {
    // Byte offset -> index into the uint array.
    uint base = ref.offset >> 2;
    uint count = segment[base];
    ItemHeaderRef first_item = ItemHeaderRef(segment[base + 1]);
    return TileHeader(count, first_item);
}
// Encodes `s` into the `segment` buffer at `ref`.
void TileHeader_write(TileHeaderRef ref, TileHeader s) {
    // Byte offset -> index into the uint array.
    uint base = ref.offset >> 2;
    segment[base] = s.n;
    segment[base + 1] = s.items.offset;
}
// Decodes the ItemHeader stored at `ref` in the `segment` buffer.
ItemHeader ItemHeader_read(ItemHeaderRef ref) {
    // Byte offset -> index into the uint array.
    uint base = ref.offset >> 2;
    uint count = segment[base];
    SegmentRef first_segment = SegmentRef(segment[base + 1]);
    return ItemHeader(count, first_segment);
}
// Encodes `s` into the `segment` buffer at `ref`.
void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) {
    // Byte offset -> index into the uint array.
    uint base = ref.offset >> 2;
    segment[base] = s.n;
    segment[base + 1] = s.segments.offset;
}
// Decodes the Segment stored at `ref` in the `segment` buffer. The four words
// are reinterpreted bit-for-bit as floats: start.x, start.y, end.x, end.y.
Segment Segment_read(SegmentRef ref) {
    // Byte offset -> index into the uint array.
    uint base = ref.offset >> 2;
    vec2 p_start = uintBitsToFloat(uvec2(segment[base], segment[base + 1]));
    vec2 p_end = uintBitsToFloat(uvec2(segment[base + 2], segment[base + 3]));
    return Segment(p_start, p_end);
}
// Encodes `s` into the `segment` buffer at `ref`, storing the raw float bits
// in the order start.x, start.y, end.x, end.y.
void Segment_write(SegmentRef ref, Segment s) {
    // Byte offset -> index into the uint array.
    uint base = ref.offset >> 2;
    uvec2 start_bits = floatBitsToUint(s.start);
    uvec2 end_bits = floatBitsToUint(s.end);
    segment[base] = start_bits.x;
    segment[base + 1] = start_bits.y;
    segment[base + 2] = end_bits.x;
    segment[base + 3] = end_bits.y;
}

View file

@ -15,6 +15,15 @@
#define TILEGROUP_INITIAL_ALLOC 1024 #define TILEGROUP_INITIAL_ALLOC 1024
// Quick note on layout of tilegroups (k1 output): in the base,
// there is a region of size TILEGROUP_STRIDE for each tilegroup.
// At offset 0 are the main instances, encoded with Jump. At offset
// TILEGROUP_STROKE_START are the stroke instances: a u32 total count
// followed by a linked list of Chunk.
#define TILEGROUP_STRIDE 2048
#define TILEGROUP_STROKE_START 1024
#define TILEGROUP_STROKE_ALLOC 1024
// TODO: compute all these // TODO: compute all these
#define WIDTH_IN_TILES 128 #define WIDTH_IN_TILES 128

View file

@ -8,6 +8,10 @@ struct JumpRef {
uint offset; uint offset;
}; };
// Reference to a Chunk header in the `tilegroup` buffer; `offset` is a byte
// offset (Chunk_read/Chunk_write convert it to a word index with `>> 2`).
struct ChunkRef {
uint offset;
};
struct TileGroupRef { struct TileGroupRef {
uint offset; uint offset;
}; };
@ -24,7 +28,7 @@ InstanceRef Instance_index(InstanceRef ref, uint index) {
} }
struct Jump { struct Jump {
uint new_ref; TileGroupRef new_ref;
}; };
#define Jump_size 4 #define Jump_size 4
@ -33,6 +37,17 @@ JumpRef Jump_index(JumpRef ref, uint index) {
return JumpRef(ref.offset + index * Jump_size); return JumpRef(ref.offset + index * Jump_size);
} }
// Header of one chunk in a linked list: `chunk_n` is the chunk's element
// count and `next` references the following chunk header.
struct Chunk {
    uint chunk_n;
    ChunkRef next;
};

// Encoded size of a Chunk header in bytes (two 32-bit words).
#define Chunk_size 8

// Returns the reference `index` Chunk elements past `ref`.
ChunkRef Chunk_index(ChunkRef ref, uint index) {
    uint byte_offset = ref.offset + index * Chunk_size;
    return ChunkRef(byte_offset);
}
#define TileGroup_Instance 0 #define TileGroup_Instance 0
#define TileGroup_Jump 1 #define TileGroup_Jump 1
#define TileGroup_End 2 #define TileGroup_End 2
@ -64,13 +79,29 @@ Jump Jump_read(JumpRef ref) {
uint ix = ref.offset >> 2; uint ix = ref.offset >> 2;
uint raw0 = tilegroup[ix + 0]; uint raw0 = tilegroup[ix + 0];
Jump s; Jump s;
s.new_ref = raw0; s.new_ref = TileGroupRef(raw0);
return s; return s;
} }
void Jump_write(JumpRef ref, Jump s) { void Jump_write(JumpRef ref, Jump s) {
uint ix = ref.offset >> 2; uint ix = ref.offset >> 2;
tilegroup[ix + 0] = s.new_ref; tilegroup[ix + 0] = s.new_ref.offset;
}
// Decodes the Chunk header stored at `ref` in the `tilegroup` buffer.
Chunk Chunk_read(ChunkRef ref) {
    // Byte offset -> index into the uint array.
    uint base = ref.offset >> 2;
    uint count = tilegroup[base];
    ChunkRef link = ChunkRef(tilegroup[base + 1]);
    return Chunk(count, link);
}
// Encodes the Chunk header `s` into the `tilegroup` buffer at `ref`.
void Chunk_write(ChunkRef ref, Chunk s) {
    // Byte offset -> index into the uint array.
    uint base = ref.offset >> 2;
    tilegroup[base] = s.chunk_n;
    tilegroup[base + 1] = s.next.offset;
}
uint TileGroup_tag(TileGroupRef ref) { uint TileGroup_tag(TileGroupRef ref) {

View file

@ -4,7 +4,7 @@ use std::path::Path;
use rand::{Rng, RngCore}; use rand::{Rng, RngCore};
use piet::kurbo::{Circle, Point}; use piet::kurbo::{BezPath, Circle, Line, Point, Vec2};
use piet::{Color, RenderContext}; use piet::{Color, RenderContext};
use piet_gpu_hal::vulkan::VkInstance; use piet_gpu_hal::vulkan::VkInstance;
@ -22,13 +22,15 @@ const TILE_H: usize = 16;
const WIDTH_IN_TILEGROUPS: usize = 4; const WIDTH_IN_TILEGROUPS: usize = 4;
const HEIGHT_IN_TILEGROUPS: usize = 96; const HEIGHT_IN_TILEGROUPS: usize = 96;
const TILEGROUP_INITIAL_ALLOC: usize = 1024; const TILEGROUP_STRIDE: usize = 2048;
const WIDTH_IN_TILES: usize = 124; const WIDTH_IN_TILES: usize = 128;
const HEIGHT_IN_TILES: usize = 96; const HEIGHT_IN_TILES: usize = 96;
const PTCL_INITIAL_ALLOC: usize = 1024; const PTCL_INITIAL_ALLOC: usize = 1024;
const N_CIRCLES: usize = 10_000; const K2_PER_TILE_SIZE: usize = 8;
const N_CIRCLES: usize = 1;
fn render_scene(rc: &mut impl RenderContext) { fn render_scene(rc: &mut impl RenderContext) {
let mut rng = rand::thread_rng(); let mut rng = rand::thread_rng();
@ -42,6 +44,29 @@ fn render_scene(rc: &mut impl RenderContext) {
let circle = Circle::new(center, radius); let circle = Circle::new(center, radius);
rc.fill(circle, &color); rc.fill(circle, &color);
} }
rc.stroke(
Line::new((100.0, 100.0), (200.0, 150.0)),
&Color::WHITE,
5.0,
);
render_cardioid(rc);
}
fn render_cardioid(rc: &mut impl RenderContext) {
let n = 100;
let dth = std::f64::consts::PI * 2.0 / (n as f64);
let center = Point::new(1024.0, 768.0);
let r = 750.0;
let mut path = BezPath::new();
for i in 1..n {
let p0 = center + Vec2::from_angle(i as f64 * dth) * r;
let p1 = center + Vec2::from_angle(((i * 2) % n) as f64 * dth) * r;
rc.fill(&Circle::new(p0, 8.0), &Color::WHITE);
path.move_to(p0);
path.line_to(p1);
//rc.stroke(Line::new(p0, p1), &Color::BLACK, 2.0);
}
rc.stroke(&path, &Color::BLACK, 2.0);
} }
#[allow(unused)] #[allow(unused)]
@ -80,7 +105,8 @@ fn main() {
.unwrap(); .unwrap();
device.write_buffer(&scene_buf, &scene).unwrap(); device.write_buffer(&scene_buf, &scene).unwrap();
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev).unwrap(); let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev).unwrap();
let ptcl_buf = device.create_buffer(12 * 1024 * 4096, dev).unwrap(); let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev).unwrap();
let segment_buf = device.create_buffer(64 * 1024 * 1024, dev).unwrap();
let image_buf = device let image_buf = device
.create_buffer((WIDTH * HEIGHT * 4) as u64, host) .create_buffer((WIDTH * HEIGHT * 4) as u64, host)
.unwrap(); .unwrap();
@ -90,7 +116,7 @@ fn main() {
let k1_alloc_buf_host = device.create_buffer(4, host).unwrap(); let k1_alloc_buf_host = device.create_buffer(4, host).unwrap();
let k1_alloc_buf_dev = device.create_buffer(4, dev).unwrap(); let k1_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_INITIAL_ALLOC; let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
device device
.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32]) .write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])
.unwrap(); .unwrap();
@ -103,6 +129,21 @@ fn main() {
) )
.unwrap(); .unwrap();
let k2s_alloc_buf_host = device.create_buffer(4, host).unwrap();
let k2s_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device
.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])
.unwrap();
let k2s_code = include_bytes!("../shader/kernel2s.spv");
let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4).unwrap();
let k2s_ds = device
.create_descriptor_set(
&k2s_pipeline,
&[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
)
.unwrap();
let k3_alloc_buf_host = device.create_buffer(4, host).unwrap(); let k3_alloc_buf_host = device.create_buffer(4, host).unwrap();
let k3_alloc_buf_dev = device.create_buffer(4, dev).unwrap(); let k3_alloc_buf_dev = device.create_buffer(4, dev).unwrap();
let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC; let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
@ -110,24 +151,32 @@ fn main() {
.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32]) .write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])
.unwrap(); .unwrap();
let k3_code = include_bytes!("../shader/kernel3.spv"); let k3_code = include_bytes!("../shader/kernel3.spv");
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 4).unwrap(); let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 5).unwrap();
let k3_ds = device let k3_ds = device
.create_descriptor_set( .create_descriptor_set(
&k3_pipeline, &k3_pipeline,
&[&scene_dev, &tilegroup_buf, &ptcl_buf, &k3_alloc_buf_dev], &[
&scene_dev,
&tilegroup_buf,
&segment_buf,
&ptcl_buf,
&k3_alloc_buf_dev,
],
) )
.unwrap(); .unwrap();
let k4_code = include_bytes!("../shader/kernel4.spv"); let k4_code = include_bytes!("../shader/kernel4.spv");
let pipeline = device.create_simple_compute_pipeline(k4_code, 2).unwrap(); let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3).unwrap();
let descriptor_set = device let k4_ds = device
.create_descriptor_set(&pipeline, &[&ptcl_buf, &image_dev]) .create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &image_dev])
.unwrap(); .unwrap();
let query_pool = device.create_query_pool(4).unwrap();
let query_pool = device.create_query_pool(5).unwrap();
let mut cmd_buf = device.create_cmd_buf().unwrap(); let mut cmd_buf = device.create_cmd_buf().unwrap();
cmd_buf.begin(); cmd_buf.begin();
cmd_buf.copy_buffer(&scene_buf, &scene_dev); cmd_buf.copy_buffer(&scene_buf, &scene_dev);
cmd_buf.copy_buffer(&k1_alloc_buf_host, &k1_alloc_buf_dev); cmd_buf.copy_buffer(&k1_alloc_buf_host, &k1_alloc_buf_dev);
cmd_buf.copy_buffer(&k2s_alloc_buf_host, &k2s_alloc_buf_dev);
cmd_buf.copy_buffer(&k3_alloc_buf_host, &k3_alloc_buf_dev); cmd_buf.copy_buffer(&k3_alloc_buf_host, &k3_alloc_buf_dev);
cmd_buf.clear_buffer(&tilegroup_buf); cmd_buf.clear_buffer(&tilegroup_buf);
cmd_buf.clear_buffer(&ptcl_buf); cmd_buf.clear_buffer(&ptcl_buf);
@ -141,36 +190,47 @@ fn main() {
cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.dispatch( cmd_buf.dispatch(
&k3_pipeline, &k2s_pipeline,
&k3_ds, &k2s_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1), ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
); );
cmd_buf.write_timestamp(&query_pool, 2); cmd_buf.write_timestamp(&query_pool, 2);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.dispatch( cmd_buf.dispatch(
&pipeline, &k3_pipeline,
&descriptor_set, &k3_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), ((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
); );
cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier(); cmd_buf.memory_barrier();
cmd_buf.dispatch(
&k4_pipeline,
&k4_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.memory_barrier();
cmd_buf.copy_buffer(&image_dev, &image_buf); cmd_buf.copy_buffer(&image_dev, &image_buf);
cmd_buf.finish(); cmd_buf.finish();
device.run_cmd_buf(&cmd_buf).unwrap(); device.run_cmd_buf(&cmd_buf).unwrap();
let timestamps = device.reap_query_pool(query_pool).unwrap(); let timestamps = device.reap_query_pool(query_pool).unwrap();
println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3); println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
println!( println!(
"Kernel 3 time: {:.3}ms", "Kernel 2 time: {:.3}ms",
(timestamps[1] - timestamps[0]) * 1e3 (timestamps[1] - timestamps[0]) * 1e3
); );
println!( println!(
"Render time: {:.3}ms", "Kernel 3 time: {:.3}ms",
(timestamps[2] - timestamps[1]) * 1e3 (timestamps[2] - timestamps[1]) * 1e3
); );
println!(
"Render time: {:.3}ms",
(timestamps[3] - timestamps[2]) * 1e3
);
/* /*
let mut k1_data: Vec<u32> = Default::default(); let mut k1_data: Vec<u32> = Default::default();
device.read_buffer(&ptcl_buf, &mut k1_data).unwrap(); device.read_buffer(&segment_buf, &mut k1_data).unwrap();
dump_k1_data(&k1_data); dump_k1_data(&k1_data);
*/ */

View file

@ -259,7 +259,7 @@ fn flatten_shape(
} }
_ => (), _ => (),
} }
println!("{:?}", el); //println!("{:?}", el);
}); });
let n_points = points.len() as u32; let n_points = points.len() as u32;
let points_ref = points.encode(encoder).transmute(); let points_ref = points.encode(encoder).transmute();