mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
Fix prefix sum
First, add decoupled look-back. Second, fix a problem with the monoid, which was overly aggressive in resetting the bbox.
This commit is contained in:
parent
868b0320a4
commit
93044b469b
|
@ -70,7 +70,7 @@ fn main() -> Result<(), Error> {
|
|||
piet_gpu::dump_k1_data(&data);
|
||||
|
||||
let mut data: Vec<u32> = Default::default();
|
||||
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
|
||||
device.read_buffer(&renderer.anno_buf, &mut data).unwrap();
|
||||
piet_gpu::dump_k1_data(&data);
|
||||
*/
|
||||
|
||||
|
|
|
@ -18,10 +18,10 @@ layout(set = 0, binding = 0) readonly buffer SceneBuf {
|
|||
uint[] scene;
|
||||
};
|
||||
|
||||
// This will be used for inter-workgroup aggregates. In the
|
||||
// meantime, for development, it has been used to store the
|
||||
// scan of the state objects.
|
||||
layout(set = 0, binding = 1) buffer StateBuf {
|
||||
// It would be better to use the Vulkan memory model than
|
||||
// "volatile" but shooting for compatibility here rather
|
||||
// than doing things right.
|
||||
layout(set = 0, binding = 1) volatile buffer StateBuf {
|
||||
uint[] state;
|
||||
};
|
||||
|
||||
|
@ -34,8 +34,28 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
|
|||
#include "state.h"
|
||||
#include "annotated.h"
|
||||
|
||||
#define StateBuf_stride (4 + 2 * State_size)
|
||||
|
||||
// Returns a reference to the slot where partition `partition_ix` publishes
// its aggregate State: offset 8, plus one StateBuf_stride per partition.
// NOTE(review): the offset appears to be in bytes (state_flag_index divides
// the same stride by 4 to index the uint array) -- confirm against state.h.
StateRef state_aggregate_ref(uint partition_ix) {
|
||||
return StateRef(8 + partition_ix * StateBuf_stride);
|
||||
}
|
||||
|
||||
// Returns a reference to the slot where partition `partition_ix` publishes
// its inclusive prefix State. The slot sits State_size past offset
// 8 + partition_ix * StateBuf_stride, i.e. directly after the slot that
// state_aggregate_ref addresses for the same partition.
// NOTE(review): offset units presumed to be bytes -- confirm against state.h.
StateRef state_prefix_ref(uint partition_ix) {
|
||||
return StateRef(8 + partition_ix * StateBuf_stride + State_size);
|
||||
}
|
||||
|
||||
// Returns the index into the `state` uint array of the status flag for
// partition `partition_ix` (one of FLAG_NOT_READY, FLAG_AGGREGATE_READY,
// FLAG_PREFIX_READY). Index 0 is skipped: state[0] is used by main() as the
// atomic partition counter. The stride is divided by 4 because this is an
// element index into uint[] rather than a StateRef offset.
uint state_flag_index(uint partition_ix) {
|
||||
return 1 + partition_ix * (StateBuf_stride / 4);
|
||||
}
|
||||
|
||||
// These correspond to X, A, P respectively in the prefix sum paper.
|
||||
#define FLAG_NOT_READY 0
|
||||
#define FLAG_AGGREGATE_READY 1
|
||||
#define FLAG_PREFIX_READY 2
|
||||
|
||||
#define FLAG_SET_LINEWIDTH 1
|
||||
#define FLAG_RESET_BBOX 2
|
||||
#define FLAG_SET_BBOX 2
|
||||
#define FLAG_RESET_BBOX 4
|
||||
|
||||
// This is almost like a monoid (the interaction between transformation and
|
||||
// bounding boxes is approximate)
|
||||
|
@ -47,7 +67,9 @@ State combine_state(State a, State b) {
|
|||
c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
|
||||
if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
|
||||
c.bbox = a.bbox;
|
||||
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) {
|
||||
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
|
||||
(a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
|
||||
{
|
||||
c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
|
||||
c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
|
||||
}
|
||||
|
@ -59,7 +81,7 @@ State combine_state(State a, State b) {
|
|||
c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
|
||||
c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
|
||||
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
|
||||
c.flags = a.flags | b.flags;
|
||||
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
|
||||
return c;
|
||||
}
|
||||
|
||||
|
@ -122,10 +144,18 @@ shared vec4 sh_bbox[WG_SIZE];
|
|||
shared float sh_width[WG_SIZE];
|
||||
shared uint sh_flags[WG_SIZE];
|
||||
|
||||
shared uint sh_tile_ix;
|
||||
shared State sh_prefix;
|
||||
|
||||
void main() {
|
||||
State th_state[N_ROWS];
|
||||
// this becomes an atomic counter
|
||||
uint tile_ix = gl_WorkGroupID.x;
|
||||
// Determine partition to process by atomic counter (described in Section
|
||||
// 4.4 of prefix sum paper).
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sh_tile_ix = atomicAdd(state[0], 1);
|
||||
}
|
||||
barrier();
|
||||
uint tile_ix = sh_tile_ix;
|
||||
|
||||
uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
|
||||
ElementRef ref = ElementRef(ix * Element_size);
|
||||
|
@ -162,16 +192,54 @@ void main() {
|
|||
sh_flags[gl_LocalInvocationID.x] = agg.flags;
|
||||
}
|
||||
|
||||
// TODO: if last invocation in wg, publish agg.
|
||||
|
||||
barrier();
|
||||
State exclusive;
|
||||
exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
|
||||
exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
|
||||
exclusive.translate = vec2(0.0, 0.0);
|
||||
exclusive.linewidth = 1.0; //TODO should be 0.0
|
||||
exclusive.flags = 0;
|
||||
// TODO: do decoupled look-back
|
||||
|
||||
// Publish aggregate for this partition
|
||||
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||
// Note: with memory model, we'd want to generate the atomic store version of this.
|
||||
State_write(state_aggregate_ref(tile_ix), agg);
|
||||
uint flag = FLAG_AGGREGATE_READY;
|
||||
memoryBarrierBuffer();
|
||||
if (tile_ix == 0) {
|
||||
State_write(state_prefix_ref(tile_ix), agg);
|
||||
flag = FLAG_PREFIX_READY;
|
||||
}
|
||||
state[state_flag_index(tile_ix)] = flag;
|
||||
if (tile_ix != 0) {
|
||||
// step 4 of paper: decoupled lookback
|
||||
uint look_back_ix = tile_ix - 1;
|
||||
while (true) {
|
||||
flag = state[state_flag_index(look_back_ix)];
|
||||
if (flag == FLAG_PREFIX_READY) {
|
||||
State their_prefix = State_read(state_prefix_ref(look_back_ix));
|
||||
exclusive = combine_state(their_prefix, exclusive);
|
||||
break;
|
||||
} else if (flag == FLAG_AGGREGATE_READY) {
|
||||
State their_agg = State_read(state_aggregate_ref(look_back_ix));
|
||||
exclusive = combine_state(their_agg, exclusive);
|
||||
look_back_ix--;
|
||||
}
|
||||
// else spin
|
||||
}
|
||||
|
||||
// step 5 of paper: compute inclusive prefix
|
||||
State inclusive_prefix = combine_state(exclusive, agg);
|
||||
sh_prefix = exclusive;
|
||||
State_write(state_prefix_ref(tile_ix), inclusive_prefix);
|
||||
memoryBarrierBuffer();
|
||||
flag = FLAG_PREFIX_READY;
|
||||
state[state_flag_index(tile_ix)] = flag;
|
||||
}
|
||||
}
|
||||
barrier();
|
||||
if (tile_ix != 0) {
|
||||
exclusive = sh_prefix;
|
||||
}
|
||||
|
||||
State row = exclusive;
|
||||
if (gl_LocalInvocationID.x > 0) {
|
||||
|
|
Binary file not shown.
|
@ -158,7 +158,7 @@ impl<D: Device> Renderer<D> {
|
|||
.unwrap();
|
||||
device.write_buffer(&scene_buf, &scene)?;
|
||||
|
||||
let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||
let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
|
||||
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
|
||||
|
@ -245,6 +245,7 @@ impl<D: Device> Renderer<D> {
|
|||
cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
|
||||
cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
|
||||
cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
|
||||
cmd_buf.clear_buffer(&self.state_buf);
|
||||
cmd_buf.memory_barrier();
|
||||
cmd_buf.image_barrier(
|
||||
&self.image_dev,
|
||||
|
|
Loading…
Reference in a new issue