mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-25 18:56:35 +11:00
Fix prefix sum
First, add decoupled look-back. Second, fix a problem with the monoid, which was overly aggressive in resetting the bbox.
This commit is contained in:
parent
868b0320a4
commit
93044b469b
4 changed files with 84 additions and 15 deletions
|
@ -70,7 +70,7 @@ fn main() -> Result<(), Error> {
|
||||||
piet_gpu::dump_k1_data(&data);
|
piet_gpu::dump_k1_data(&data);
|
||||||
|
|
||||||
let mut data: Vec<u32> = Default::default();
|
let mut data: Vec<u32> = Default::default();
|
||||||
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
|
device.read_buffer(&renderer.anno_buf, &mut data).unwrap();
|
||||||
piet_gpu::dump_k1_data(&data);
|
piet_gpu::dump_k1_data(&data);
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
|
@ -18,10 +18,10 @@ layout(set = 0, binding = 0) readonly buffer SceneBuf {
|
||||||
uint[] scene;
|
uint[] scene;
|
||||||
};
|
};
|
||||||
|
|
||||||
// This will be used for inter-workgroup aggregates. In the
|
// It would be better to use the Vulkan memory model than
|
||||||
// meantime, for development, it has been used to store the
|
// "volatile" but shooting for compatibility here rather
|
||||||
// scan of the state objects.
|
// than doing things right.
|
||||||
layout(set = 0, binding = 1) buffer StateBuf {
|
layout(set = 0, binding = 1) volatile buffer StateBuf {
|
||||||
uint[] state;
|
uint[] state;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -34,8 +34,28 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
|
||||||
#include "state.h"
|
#include "state.h"
|
||||||
#include "annotated.h"
|
#include "annotated.h"
|
||||||
|
|
||||||
|
#define StateBuf_stride (4 + 2 * State_size)
|
||||||
|
|
||||||
|
// Returns a reference to the aggregate State slot of partition `partition_ix`
// in the state buffer: offset 8 plus one StateBuf_stride per partition.
// NOTE(review): the offset is presumably in bytes (state_flag_index divides
// the same stride by 4 to index the uint array) - confirm against state.h.
StateRef state_aggregate_ref(uint partition_ix) {
|
||||||
|
return StateRef(8 + partition_ix * StateBuf_stride);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns a reference to the prefix State slot of partition `partition_ix`.
// It sits State_size past offset 8 + partition_ix * StateBuf_stride, i.e.
// directly after the slot addressed by state_aggregate_ref for that partition.
StateRef state_prefix_ref(uint partition_ix) {
|
||||||
|
return StateRef(8 + partition_ix * StateBuf_stride + State_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the index into the `state` uint array of the status flag word for
// partition `partition_ix`: 1 + partition_ix * (StateBuf_stride / 4).
// Index 0 is not used for flags; main() uses state[0] as the atomic counter
// that hands out partition indices.
uint state_flag_index(uint partition_ix) {
|
||||||
|
return 1 + partition_ix * (StateBuf_stride / 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
// These correspond to X, A, P respectively in the prefix sum paper.
|
||||||
|
#define FLAG_NOT_READY 0
|
||||||
|
#define FLAG_AGGREGATE_READY 1
|
||||||
|
#define FLAG_PREFIX_READY 2
|
||||||
|
|
||||||
#define FLAG_SET_LINEWIDTH 1
|
#define FLAG_SET_LINEWIDTH 1
|
||||||
#define FLAG_RESET_BBOX 2
|
#define FLAG_SET_BBOX 2
|
||||||
|
#define FLAG_RESET_BBOX 4
|
||||||
|
|
||||||
// This is almost like a monoid (the interaction between transformation and
|
// This is almost like a monoid (the interaction between transformation and
|
||||||
// bounding boxes is approximate)
|
// bounding boxes is approximate)
|
||||||
|
@ -47,7 +67,9 @@ State combine_state(State a, State b) {
|
||||||
c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
|
c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
|
||||||
if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
|
if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
|
||||||
c.bbox = a.bbox;
|
c.bbox = a.bbox;
|
||||||
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) {
|
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
|
||||||
|
(a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
|
||||||
|
{
|
||||||
c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
|
c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
|
||||||
c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
|
c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
|
||||||
}
|
}
|
||||||
|
@ -59,7 +81,7 @@ State combine_state(State a, State b) {
|
||||||
c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
|
c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
|
||||||
c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
|
c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
|
||||||
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
|
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
|
||||||
c.flags = a.flags | b.flags;
|
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -122,10 +144,18 @@ shared vec4 sh_bbox[WG_SIZE];
|
||||||
shared float sh_width[WG_SIZE];
|
shared float sh_width[WG_SIZE];
|
||||||
shared uint sh_flags[WG_SIZE];
|
shared uint sh_flags[WG_SIZE];
|
||||||
|
|
||||||
|
shared uint sh_tile_ix;
|
||||||
|
shared State sh_prefix;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
State th_state[N_ROWS];
|
State th_state[N_ROWS];
|
||||||
// this becomes an atomic counter
|
// Determine partition to process by atomic counter (described in Section
|
||||||
uint tile_ix = gl_WorkGroupID.x;
|
// 4.4 of prefix sum paper).
|
||||||
|
if (gl_LocalInvocationID.x == 0) {
|
||||||
|
sh_tile_ix = atomicAdd(state[0], 1);
|
||||||
|
}
|
||||||
|
barrier();
|
||||||
|
uint tile_ix = sh_tile_ix;
|
||||||
|
|
||||||
uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
|
uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
|
||||||
ElementRef ref = ElementRef(ix * Element_size);
|
ElementRef ref = ElementRef(ix * Element_size);
|
||||||
|
@ -162,16 +192,54 @@ void main() {
|
||||||
sh_flags[gl_LocalInvocationID.x] = agg.flags;
|
sh_flags[gl_LocalInvocationID.x] = agg.flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: if last invocation in wg, publish agg.
|
|
||||||
|
|
||||||
barrier();
|
|
||||||
State exclusive;
|
State exclusive;
|
||||||
exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
|
exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
|
||||||
exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
|
exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
|
||||||
exclusive.translate = vec2(0.0, 0.0);
|
exclusive.translate = vec2(0.0, 0.0);
|
||||||
exclusive.linewidth = 1.0; //TODO should be 0.0
|
exclusive.linewidth = 1.0; //TODO should be 0.0
|
||||||
exclusive.flags = 0;
|
exclusive.flags = 0;
|
||||||
// TODO: do decoupled look-back
|
|
||||||
|
// Publish aggregate for this partition
|
||||||
|
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||||
|
// Note: with memory model, we'd want to generate the atomic store version of this.
|
||||||
|
State_write(state_aggregate_ref(tile_ix), agg);
|
||||||
|
uint flag = FLAG_AGGREGATE_READY;
|
||||||
|
memoryBarrierBuffer();
|
||||||
|
if (tile_ix == 0) {
|
||||||
|
State_write(state_prefix_ref(tile_ix), agg);
|
||||||
|
flag = FLAG_PREFIX_READY;
|
||||||
|
}
|
||||||
|
state[state_flag_index(tile_ix)] = flag;
|
||||||
|
if (tile_ix != 0) {
|
||||||
|
// step 4 of paper: decoupled lookback
|
||||||
|
uint look_back_ix = tile_ix - 1;
|
||||||
|
while (true) {
|
||||||
|
flag = state[state_flag_index(look_back_ix)];
|
||||||
|
if (flag == FLAG_PREFIX_READY) {
|
||||||
|
State their_prefix = State_read(state_prefix_ref(look_back_ix));
|
||||||
|
exclusive = combine_state(their_prefix, exclusive);
|
||||||
|
break;
|
||||||
|
} else if (flag == FLAG_AGGREGATE_READY) {
|
||||||
|
State their_agg = State_read(state_aggregate_ref(look_back_ix));
|
||||||
|
exclusive = combine_state(their_agg, exclusive);
|
||||||
|
look_back_ix--;
|
||||||
|
}
|
||||||
|
// else spin
|
||||||
|
}
|
||||||
|
|
||||||
|
// step 5 of paper: compute inclusive prefix
|
||||||
|
State inclusive_prefix = combine_state(exclusive, agg);
|
||||||
|
sh_prefix = exclusive;
|
||||||
|
State_write(state_prefix_ref(tile_ix), inclusive_prefix);
|
||||||
|
memoryBarrierBuffer();
|
||||||
|
flag = FLAG_PREFIX_READY;
|
||||||
|
state[state_flag_index(tile_ix)] = flag;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
barrier();
|
||||||
|
if (tile_ix != 0) {
|
||||||
|
exclusive = sh_prefix;
|
||||||
|
}
|
||||||
|
|
||||||
State row = exclusive;
|
State row = exclusive;
|
||||||
if (gl_LocalInvocationID.x > 0) {
|
if (gl_LocalInvocationID.x > 0) {
|
||||||
|
|
Binary file not shown.
|
@ -158,7 +158,7 @@ impl<D: Device> Renderer<D> {
|
||||||
.unwrap();
|
.unwrap();
|
||||||
device.write_buffer(&scene_buf, &scene)?;
|
device.write_buffer(&scene_buf, &scene)?;
|
||||||
|
|
||||||
let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
|
||||||
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||||
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||||
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
|
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
|
||||||
|
@ -245,6 +245,7 @@ impl<D: Device> Renderer<D> {
|
||||||
cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
|
cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
|
||||||
cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
|
cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
|
||||||
cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
|
cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
|
||||||
|
cmd_buf.clear_buffer(&self.state_buf);
|
||||||
cmd_buf.memory_barrier();
|
cmd_buf.memory_barrier();
|
||||||
cmd_buf.image_barrier(
|
cmd_buf.image_barrier(
|
||||||
&self.image_dev,
|
&self.image_dev,
|
||||||
|
|
Loading…
Add table
Reference in a new issue