Checkpoint parallel output

Parallel segment output seems to be working for strokes.
This commit is contained in:
Raph Levien 2020-05-25 09:08:21 -07:00
parent 24b3def0a1
commit 8eaf49a04d
9 changed files with 130 additions and 52 deletions

View file

@ -13,8 +13,8 @@ piet_gpu! {
end: [f32; 2], end: [f32; 2],
} }
struct CmdStroke { struct CmdStroke {
// Should be Ref<SegChunk> // Consider a specialization to one segment.
seg_ref: u32, seg_ref: Ref<SegChunk>,
half_width: f32, half_width: f32,
rgba_color: u32, rgba_color: u32,
} }
@ -63,7 +63,8 @@ piet_gpu! {
struct SegChunk { struct SegChunk {
n: u32, n: u32,
next: Ref<SegChunk>, next: Ref<SegChunk>,
// Segments follow (could represent this as a variable sized array). // Actually a reference to a variable-sized slice.
segs: Ref<Segment>,
} }
} }
} }

View file

@ -75,6 +75,58 @@ fn trace_merge(buf: &[u32]) {
} }
} }
/// Interpret the output of the coarse raster stage, for diagnostic purposes.
///
/// `buf` is the per-tile command list buffer viewed as 32-bit words. Every
/// offset stored in the buffer is a byte offset, which is why each one is
/// divided by 4 before being used as an index.
///
/// The command list of every tile in the 128 x 96 grid is walked and printed
/// to stdout. For a stroke command (tag 4) every chunk of its segment chunk
/// list is dumped, following the `next` links until a null (0) reference is
/// reached. A stroke whose segment reference is null prints no chunks.
///
/// # Panics
///
/// Panics if an offset read from the buffer points outside of `buf`.
#[allow(unused)]
fn trace_ptcl(buf: &[u32]) {
    const WIDTH_IN_TILES: usize = 128;
    const HEIGHT_IN_TILES: usize = 96;
    // Byte distance between the starts of two consecutive tiles' command lists.
    const TILE_ALLOC_BYTES: usize = 1024;
    // Byte distance between two consecutive commands within a list.
    const CMD_STRIDE_BYTES: usize = 20;
    const TAG_END: u32 = 0;
    const TAG_STROKE: u32 = 4;
    // The word after this tag holds the byte offset at which the list continues.
    const TAG_JUMP: u32 = 8;

    for y in 0..HEIGHT_IN_TILES {
        for x in 0..WIDTH_IN_TILES {
            let tile_ix = y * WIDTH_IN_TILES + x;
            println!("tile {} @({}, {})", tile_ix, x, y);
            let mut tile_offset = tile_ix * TILE_ALLOC_BYTES;
            loop {
                let cmd_ix = tile_offset / 4;
                let tag = buf[cmd_ix];
                match tag {
                    TAG_END => break,
                    TAG_STROKE => {
                        // Payload words: segment chunk ref, width (as f32 bits), color.
                        let line_width = f32::from_bits(buf[cmd_ix + 2]);
                        let rgba_color = buf[cmd_ix + 3];
                        println!(" {:x}: stroke {:x} {}", tile_offset, rgba_color, line_width);

                        // Walk the whole chunk list, not just its head, so that
                        // segments stored in later chunks show up in the trace.
                        let mut seg_chunk = buf[cmd_ix + 1] as usize;
                        while seg_chunk != 0 {
                            let chunk_ix = seg_chunk / 4;
                            // Chunk header words: count, next chunk ref, segments ref.
                            let n = buf[chunk_ix] as usize;
                            let segs = buf[chunk_ix + 2] as usize;
                            println!(" chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
                            let seg_base = segs / 4;
                            for i in 0..n {
                                // Each segment is four consecutive f32 values.
                                let first = seg_base + i * 4;
                                let words = &buf[first..first + 4];
                                let x0 = f32::from_bits(words[0]);
                                let y0 = f32::from_bits(words[1]);
                                let x1 = f32::from_bits(words[2]);
                                let y1 = f32::from_bits(words[3]);
                                println!(" ({:.3}, {:.3}) - ({:.3}, {:.3})", x0, y0, x1, y1);
                            }
                            seg_chunk = buf[chunk_ix + 1] as usize;
                        }
                    }
                    _ => println!("{:x}: {}", tile_offset, tag),
                }
                tile_offset = if tag == TAG_JUMP {
                    buf[cmd_ix + 1] as usize
                } else {
                    tile_offset + CMD_STRIDE_BYTES
                };
            }
        }
    }
}
fn main() -> Result<(), Error> { fn main() -> Result<(), Error> {
let (instance, _) = VkInstance::new(None)?; let (instance, _) = VkInstance::new(None)?;
unsafe { unsafe {
@ -109,7 +161,7 @@ fn main() -> Result<(), Error> {
let mut data: Vec<u32> = Default::default(); let mut data: Vec<u32> = Default::default();
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap(); device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
piet_gpu::dump_k1_data(&data); piet_gpu::dump_k1_data(&data);
//trace_merge(&data); //trace_ptcl(&data);
*/ */
let mut img_data: Vec<u8> = Default::default(); let mut img_data: Vec<u8> = Default::default();

View file

@ -51,6 +51,7 @@ shared uint sh_is_segment[N_SLICE];
// Count of total number of segments in each tile, then // Count of total number of segments in each tile, then
// inclusive prefix sum of same. // inclusive prefix sum of same.
shared uint sh_seg_count[N_TILE]; shared uint sh_seg_count[N_TILE];
shared uint sh_orig_seg_count[N_TILE];
shared uint sh_seg_alloc; shared uint sh_seg_alloc;
// scale factors useful for converting coordinates to tiles // scale factors useful for converting coordinates to tiles
@ -68,6 +69,11 @@ void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
} }
} }
// Reserve room for one SegChunk header and return a reference to it.
//
// The shared `alloc` counter is advanced by SegChunk_size with an atomic add;
// the value it held before the add is the byte offset of the new chunk.
//
// TODO: aggregate rather than doing an atomic every time
SegChunkRef alloc_seg_chunk() {
    uint chunk_offset = atomicAdd(alloc, SegChunk_size);
    return SegChunkRef(chunk_offset);
}
// Accumulate delta to backdrop. // Accumulate delta to backdrop.
// //
// Each bit for which bd_bitmap is 1 and bd_sign is 1 counts as +1, and each // Each bit for which bd_bitmap is 1 and bd_sign is 1 counts as +1, and each
@ -91,10 +97,10 @@ void main() {
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
// Allocation and management of segment output // Allocation and management of segment output
SegChunkRef seg_chunk_ref = SegChunkRef(0);
SegChunkRef first_seg_chunk = SegChunkRef(0); SegChunkRef first_seg_chunk = SegChunkRef(0);
uint seg_limit = 0; SegChunkRef last_chunk_ref = SegChunkRef(0);
uint chunk_n_segs = 0; uint last_chunk_n = 0;
SegmentRef last_chunk_segs = SegmentRef(0);
uint wr_ix = 0; uint wr_ix = 0;
uint rd_ix = 0; uint rd_ix = 0;
@ -274,12 +280,10 @@ void main() {
uint seg_count = 0; uint seg_count = 0;
for (uint i = 0; i < N_SLICE; i++) { for (uint i = 0; i < N_SLICE; i++) {
// Count each segment as 1 and each non-segment element as 1. A finer seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
// approach would be to count bytes accurately (non-segment elements that
// are not strokes and fills wouldn't count).
seg_count += bitCount(sh_bitmaps[i][th_ix]);
} }
sh_seg_count[th_ix] = seg_count; sh_seg_count[th_ix] = seg_count;
sh_orig_seg_count[th_ix] = seg_count;
// Prefix sum of sh_seg_count // Prefix sum of sh_seg_count
for (uint i = 0; i < LG_N_TILE; i++) { for (uint i = 0; i < LG_N_TILE; i++) {
barrier(); barrier();
@ -290,14 +294,13 @@ void main() {
sh_seg_count[th_ix] = seg_count; sh_seg_count[th_ix] = seg_count;
} }
if (th_ix == N_TILE - 1) { if (th_ix == N_TILE - 1) {
sh_seg_alloc = atomicAdd(alloc, seg_count * Segment_size + SegChunk_size); sh_seg_alloc = atomicAdd(alloc, seg_count * Segment_size);
} }
barrier(); barrier();
uint total_seg_count = sh_seg_count[N_TILE - 1]; uint total_seg_count = sh_seg_count[N_TILE - 1];
uint seg_alloc = sh_seg_alloc; uint seg_alloc = sh_seg_alloc;
// Output buffer is allocated as segments for each tile laid end-to-end, // Output buffer is allocated as segments for each tile laid end-to-end.
// but with gaps for non-segment elements (to fit the linked list headers).
for (uint ix = th_ix; ix < total_seg_count; ix += N_TILE) { for (uint ix = th_ix; ix < total_seg_count; ix += N_TILE) {
// Find the work item; this thread is now not bound to an element or tile. // Find the work item; this thread is now not bound to an element or tile.
@ -322,8 +325,9 @@ void main() {
// tile, accelerated by bit counting. Binary search might help, maybe not. // tile, accelerated by bit counting. Binary search might help, maybe not.
uint slice_ix = 0; uint slice_ix = 0;
uint seq_bits; uint seq_bits;
while (true) { while (true) {
seq_bits = sh_bitmaps[slice_ix][tile_ix]; seq_bits = sh_bitmaps[slice_ix][tile_ix] & sh_is_segment[slice_ix];
uint this_count = bitCount(seq_bits); uint this_count = bitCount(seq_bits);
if (this_count > seq_ix) { if (this_count > seq_ix) {
break; break;
@ -339,15 +343,13 @@ void main() {
bit_ix = probe; bit_ix = probe;
} }
} }
if ((sh_is_segment[slice_ix] & (1 << bit_ix)) != 0) { uint out_offset = seg_alloc + Segment_size * ix + SegChunk_size;
uint out_offset = seg_alloc + Segment_size * ix + SegChunk_size; uint rd_el_ix = (rd_ix + slice_ix * 32 + bit_ix) % N_RINGBUF;
uint rd_el_ix = (rd_ix + slice_ix * 32 + bit_ix) % N_RINGBUF; uint element_ix = sh_elements[rd_el_ix];
uint element_ix = sh_elements[rd_el_ix]; ref = AnnotatedRef(element_ix * Annotated_size);
ref = AnnotatedRef(element_ix * Annotated_size); AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref); Segment seg = Segment(line.p0, line.p1);
Segment seg = Segment(line.p0, line.p1); Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
Segment_write(SegmentRef(seg_alloc + Segment_size * ix + SegChunk_size), seg);
}
} }
// Output non-segment elements for this tile. The thread does a sequential walk // Output non-segment elements for this tile. The thread does a sequential walk
@ -397,6 +399,7 @@ void main() {
switch (tag) { switch (tag) {
case Annotated_Fill: case Annotated_Fill:
/*
if (seg_count > 0) { if (seg_count > 0) {
AnnoFill fill = Annotated_Fill_read(ref); AnnoFill fill = Annotated_Fill_read(ref);
SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0))); SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
@ -415,32 +418,45 @@ void main() {
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color)); Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
cmd_ref.offset += Cmd_size; cmd_ref.offset += Cmd_size;
} }
*/
backdrop = 0; backdrop = 0;
seg_count = 0; seg_count = 0;
break; break;
case Annotated_Stroke: case Annotated_Stroke:
if (chunk_n_segs > 0 || seg_count > 0) { if (last_chunk_n > 0 || seg_count > 0) {
uint chunk_offset = seg_count > 0 ? seg_alloc + seg_start * Segment_size : 0; // TODO: noncontiguous case
SegChunkRef chunk_start = SegChunkRef(chunk_offset);
if (chunk_n_segs > 0) { SegChunkRef chunk_ref = SegChunkRef(0);
SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, chunk_start));
} else {
first_seg_chunk = chunk_start;
}
if (seg_count > 0) { if (seg_count > 0) {
SegChunk_write(chunk_start, SegChunk(seg_count, SegChunkRef(0))); chunk_ref = alloc_seg_chunk();
SegChunk chunk;
chunk.n = seg_count;
chunk.next = SegChunkRef(0);
uint seg_offset = seg_alloc + seg_start * Segment_size;
chunk.segs = SegmentRef(seg_offset);
SegChunk_write(chunk_ref, chunk);
} }
if (last_chunk_n > 0) {
SegChunk chunk;
chunk.n = last_chunk_n;
chunk.next = chunk_ref;
chunk.segs = last_chunk_segs;
SegChunk_write(last_chunk_ref, chunk);
} else {
first_seg_chunk = chunk_ref;
}
AnnoStroke stroke = Annotated_Stroke_read(ref); AnnoStroke stroke = Annotated_Stroke_read(ref);
CmdStroke cmd_stroke; CmdStroke cmd_stroke;
cmd_stroke.seg_ref = first_seg_chunk.offset; cmd_stroke.seg_ref = first_seg_chunk;
cmd_stroke.half_width = 0.5 * stroke.linewidth; cmd_stroke.half_width = 0.5 * stroke.linewidth;
cmd_stroke.rgba_color = stroke.rgba_color; cmd_stroke.rgba_color = stroke.rgba_color;
alloc_cmd(cmd_ref, cmd_limit); alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke); Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size; cmd_ref.offset += Cmd_size;
chunk_n_segs = 0; last_chunk_n = 0;
} }
seg_start += seg_count + 1; seg_start += seg_count;
seg_count = 0; seg_count = 0;
break; break;
default: default:
@ -450,14 +466,18 @@ void main() {
} }
} }
if (seg_count > 0) { if (seg_count > 0) {
SegChunkRef chunk_start = SegChunkRef(seg_alloc + seg_start * Segment_size); SegChunkRef chunk_ref = alloc_seg_chunk();
if (chunk_n_segs > 0) { if (last_chunk_n > 0) {
SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, chunk_start)); SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
} else { } else {
first_seg_chunk = chunk_start; first_seg_chunk = chunk_ref;
} }
seg_chunk_ref = chunk_start; // TODO: free two registers by writing count and segments ref now,
chunk_n_segs = seg_count; // as opposed to deferring SegChunk write until all fields are known.
last_chunk_ref = chunk_ref;
last_chunk_n = seg_count;
uint seg_offset = seg_alloc + seg_start * Segment_size;
last_chunk_segs = SegmentRef(seg_offset);
} }
barrier(); barrier();

Binary file not shown.

View file

@ -47,11 +47,12 @@ void main() {
case Cmd_Stroke: case Cmd_Stroke:
CmdStroke stroke = Cmd_Stroke_read(cmd_ref); CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df = 1e9; float df = 1e9;
SegChunkRef seg_chunk_ref = SegChunkRef(stroke.seg_ref); SegChunkRef seg_chunk_ref = stroke.seg_ref;
do { do {
SegChunk seg_chunk = SegChunk_read(seg_chunk_ref); SegChunk seg_chunk = SegChunk_read(seg_chunk_ref);
SegmentRef segs = seg_chunk.segs;
for (int i = 0; i < seg_chunk.n; i++) { for (int i = 0; i < seg_chunk.n; i++) {
Segment seg = Segment_read(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * i)); Segment seg = Segment_read(Segment_index(segs, i));
vec2 line_vec = seg.end - seg.start; vec2 line_vec = seg.end - seg.start;
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start; vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);

Binary file not shown.

View file

@ -68,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
} }
struct CmdStroke { struct CmdStroke {
uint seg_ref; SegChunkRef seg_ref;
float half_width; float half_width;
uint rgba_color; uint rgba_color;
}; };
@ -163,9 +163,10 @@ SegmentRef Segment_index(SegmentRef ref, uint index) {
struct SegChunk { struct SegChunk {
uint n; uint n;
SegChunkRef next; SegChunkRef next;
SegmentRef segs;
}; };
#define SegChunk_size 8 #define SegChunk_size 12
SegChunkRef SegChunk_index(SegChunkRef ref, uint index) { SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
return SegChunkRef(ref.offset + index * SegChunk_size); return SegChunkRef(ref.offset + index * SegChunk_size);
@ -218,7 +219,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
uint raw1 = ptcl[ix + 1]; uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2]; uint raw2 = ptcl[ix + 2];
CmdStroke s; CmdStroke s;
s.seg_ref = raw0; s.seg_ref = SegChunkRef(raw0);
s.half_width = uintBitsToFloat(raw1); s.half_width = uintBitsToFloat(raw1);
s.rgba_color = raw2; s.rgba_color = raw2;
return s; return s;
@ -226,7 +227,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) { void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
uint ix = ref.offset >> 2; uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.seg_ref; ptcl[ix + 0] = s.seg_ref.offset;
ptcl[ix + 1] = floatBitsToUint(s.half_width); ptcl[ix + 1] = floatBitsToUint(s.half_width);
ptcl[ix + 2] = s.rgba_color; ptcl[ix + 2] = s.rgba_color;
} }
@ -416,9 +417,11 @@ SegChunk SegChunk_read(SegChunkRef ref) {
uint ix = ref.offset >> 2; uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0]; uint raw0 = ptcl[ix + 0];
uint raw1 = ptcl[ix + 1]; uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
SegChunk s; SegChunk s;
s.n = raw0; s.n = raw0;
s.next = SegChunkRef(raw1); s.next = SegChunkRef(raw1);
s.segs = SegmentRef(raw2);
return s; return s;
} }
@ -426,5 +429,6 @@ void SegChunk_write(SegChunkRef ref, SegChunk s) {
uint ix = ref.offset >> 2; uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.n; ptcl[ix + 0] = s.n;
ptcl[ix + 1] = s.next.offset; ptcl[ix + 1] = s.next.offset;
ptcl[ix + 2] = s.segs.offset;
} }

View file

@ -46,8 +46,8 @@ pub fn render_scene(rc: &mut impl RenderContext) {
let circle = Circle::new(center, radius); let circle = Circle::new(center, radius);
rc.fill(circle, &color); rc.fill(circle, &color);
} }
let mut path = BezPath::new();
/* /*
let mut path = BezPath::new();
path.move_to((100.0, 1150.0)); path.move_to((100.0, 1150.0));
path.line_to((200.0, 1200.0)); path.line_to((200.0, 1200.0));
path.line_to((150.0, 1250.0)); path.line_to((150.0, 1250.0));
@ -59,8 +59,8 @@ pub fn render_scene(rc: &mut impl RenderContext) {
&Color::WHITE, &Color::WHITE,
5.0, 5.0,
); );
render_cardioid(rc); //render_cardioid(rc);
//render_tiger(rc); render_tiger(rc);
} }
#[allow(unused)] #[allow(unused)]

View file

@ -58,7 +58,7 @@ impl PicoSvg {
} }
pub fn render(&self, rc: &mut impl RenderContext) { pub fn render(&self, rc: &mut impl RenderContext) {
for item in self.items.iter().take(30) { for item in &self.items {
match item { match item {
Item::Fill(fill_item) => { Item::Fill(fill_item) => {
//rc.fill(&fill_item.path, &fill_item.color); //rc.fill(&fill_item.path, &fill_item.color);