Fix prefix sum

First, add decoupled lookback.

Second, fix a problem with the monoid, which was overly aggressive in
resetting the bbox.
This commit is contained in:
Raph Levien 2020-05-15 20:09:39 -07:00
parent 868b0320a4
commit 93044b469b
4 changed files with 84 additions and 15 deletions

View file

@ -70,7 +70,7 @@ fn main() -> Result<(), Error> {
piet_gpu::dump_k1_data(&data);
let mut data: Vec<u32> = Default::default();
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
device.read_buffer(&renderer.anno_buf, &mut data).unwrap();
piet_gpu::dump_k1_data(&data);
*/

View file

@ -18,10 +18,10 @@ layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
// This will be used for inter-workgroup aggregates. In the
// meantime, for development, it has been used to store the
// scan of the state objects.
layout(set = 0, binding = 1) buffer StateBuf {
// It would be better to use the Vulkan memory model than
// "volatile", but we are aiming for compatibility here rather
// than doing things right.
layout(set = 0, binding = 1) volatile buffer StateBuf {
uint[] state;
};
@ -34,8 +34,28 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
#include "state.h"
#include "annotated.h"
#define StateBuf_stride (4 + 2 * State_size)
// Reference to the slot holding the published aggregate State of partition
// `partition_ix`. Slots begin at offset 8 and repeat every StateBuf_stride;
// the offset appears to be in bytes, since state_flag_index divides the same
// stride by 4 to index the uint array.
StateRef state_aggregate_ref(uint partition_ix) {
    uint aggregate_offset = partition_ix * StateBuf_stride + 8;
    return StateRef(aggregate_offset);
}
// Reference to the slot holding the published prefix State of partition
// `partition_ix`. It sits State_size past offset 8 within the partition's
// StateBuf_stride-sized record.
StateRef state_prefix_ref(uint partition_ix) {
    uint record_base = partition_ix * StateBuf_stride + 8;
    return StateRef(record_base + State_size);
}
// Index into the `state` uint array of the status flag for partition
// `partition_ix`. Index 0 is skipped; each partition then advances by
// StateBuf_stride / 4 array elements.
uint state_flag_index(uint partition_ix) {
    uint words_per_partition = StateBuf_stride / 4;
    return partition_ix * words_per_partition + 1;
}
// These correspond to the X, A, and P flag states, respectively, in the
// decoupled look-back prefix sum paper.
#define FLAG_NOT_READY 0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY 2
#define FLAG_SET_LINEWIDTH 1
#define FLAG_RESET_BBOX 2
#define FLAG_SET_BBOX 2
#define FLAG_RESET_BBOX 4
// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate)
@ -47,7 +67,9 @@ State combine_state(State a, State b) {
c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
c.bbox = a.bbox;
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) {
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
(a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
{
c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
}
@ -59,7 +81,7 @@ State combine_state(State a, State b) {
c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
c.flags = a.flags | b.flags;
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
return c;
}
@ -122,10 +144,18 @@ shared vec4 sh_bbox[WG_SIZE];
shared float sh_width[WG_SIZE];
shared uint sh_flags[WG_SIZE];
shared uint sh_tile_ix;
shared State sh_prefix;
void main() {
State th_state[N_ROWS];
// this becomes an atomic counter
uint tile_ix = gl_WorkGroupID.x;
// Determine partition to process by atomic counter (described in Section
// 4.4 of prefix sum paper).
if (gl_LocalInvocationID.x == 0) {
sh_tile_ix = atomicAdd(state[0], 1);
}
barrier();
uint tile_ix = sh_tile_ix;
uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
ElementRef ref = ElementRef(ix * Element_size);
@ -162,16 +192,54 @@ void main() {
sh_flags[gl_LocalInvocationID.x] = agg.flags;
}
// TODO: if last invocation in wg, publish agg.
barrier();
State exclusive;
exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
exclusive.translate = vec2(0.0, 0.0);
exclusive.linewidth = 1.0; //TODO should be 0.0
exclusive.flags = 0;
// TODO: do decoupled look-back
// Publish aggregate for this partition
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
// Note: with memory model, we'd want to generate the atomic store version of this.
State_write(state_aggregate_ref(tile_ix), agg);
uint flag = FLAG_AGGREGATE_READY;
memoryBarrierBuffer();
if (tile_ix == 0) {
State_write(state_prefix_ref(tile_ix), agg);
flag = FLAG_PREFIX_READY;
}
state[state_flag_index(tile_ix)] = flag;
if (tile_ix != 0) {
// step 4 of paper: decoupled lookback
uint look_back_ix = tile_ix - 1;
while (true) {
flag = state[state_flag_index(look_back_ix)];
if (flag == FLAG_PREFIX_READY) {
State their_prefix = State_read(state_prefix_ref(look_back_ix));
exclusive = combine_state(their_prefix, exclusive);
break;
} else if (flag == FLAG_AGGREGATE_READY) {
State their_agg = State_read(state_aggregate_ref(look_back_ix));
exclusive = combine_state(their_agg, exclusive);
look_back_ix--;
}
// else spin
}
// step 5 of paper: compute inclusive prefix
State inclusive_prefix = combine_state(exclusive, agg);
sh_prefix = exclusive;
State_write(state_prefix_ref(tile_ix), inclusive_prefix);
memoryBarrierBuffer();
flag = FLAG_PREFIX_READY;
state[state_flag_index(tile_ix)] = flag;
}
}
barrier();
if (tile_ix != 0) {
exclusive = sh_prefix;
}
State row = exclusive;
if (gl_LocalInvocationID.x > 0) {

Binary file not shown.

View file

@ -158,7 +158,7 @@ impl<D: Device> Renderer<D> {
.unwrap();
device.write_buffer(&scene_buf, &scene)?;
let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
@ -245,6 +245,7 @@ impl<D: Device> Renderer<D> {
cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
cmd_buf.clear_buffer(&self.state_buf);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(
&self.image_dev,