mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
Merge pull request #27 from bzm3r/vestige-cleanup
Clean up bits of right edge tracking logic left over from sort-middle.
This commit is contained in:
commit
72e2dfab3d
|
@ -15,24 +15,18 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
|
||||||
uint[] annotated;
|
uint[] annotated;
|
||||||
};
|
};
|
||||||
|
|
||||||
// This is for scanning forward for right_edge data.
|
layout(set = 0, binding = 1) buffer AllocBuf {
|
||||||
layout(set = 0, binding = 1) buffer StateBuf {
|
|
||||||
uint[] state;
|
|
||||||
};
|
|
||||||
|
|
||||||
layout(set = 0, binding = 2) buffer AllocBuf {
|
|
||||||
uint n_elements; // paths
|
uint n_elements; // paths
|
||||||
// Will be incremented atomically to claim tiles
|
// Will be incremented atomically to claim tiles
|
||||||
uint tile_ix;
|
uint tile_ix;
|
||||||
uint alloc;
|
uint alloc;
|
||||||
};
|
};
|
||||||
|
|
||||||
layout(set = 0, binding = 3) buffer BinsBuf {
|
layout(set = 0, binding = 2) buffer BinsBuf {
|
||||||
uint[] bins;
|
uint[] bins;
|
||||||
};
|
};
|
||||||
|
|
||||||
#include "annotated.h"
|
#include "annotated.h"
|
||||||
#include "state.h"
|
|
||||||
#include "bins.h"
|
#include "bins.h"
|
||||||
|
|
||||||
// scale factors useful for converting coordinates to bins
|
// scale factors useful for converting coordinates to bins
|
||||||
|
@ -52,12 +46,6 @@ shared uint sh_chunk_start[N_TILE];
|
||||||
|
|
||||||
shared float sh_right_edge[N_TILE];
|
shared float sh_right_edge[N_TILE];
|
||||||
|
|
||||||
#define StateBuf_stride (8 + 2 * State_size)
|
|
||||||
|
|
||||||
uint state_right_edge_index(uint partition_ix) {
|
|
||||||
return 2 + partition_ix * (StateBuf_stride / 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
uint chunk_n = 0;
|
uint chunk_n = 0;
|
||||||
uint my_n_elements = n_elements;
|
uint my_n_elements = n_elements;
|
||||||
|
@ -103,37 +91,6 @@ void main() {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
// If the last element in this partition is a fill edge, then we need to do a
|
|
||||||
// look-forward to find the right edge of its corresponding fill. That data is
|
|
||||||
// recorded in aggregates computed in the element processing pass.
|
|
||||||
if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) {
|
|
||||||
uint aggregate_ix = (my_partition + 1) * ELEMENT_BINNING_RATIO;
|
|
||||||
// This is sequential but the expectation is that the amount of
|
|
||||||
// look-forward is small (performance may degrade in the case
|
|
||||||
// of massively complex paths).
|
|
||||||
do {
|
|
||||||
my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
|
|
||||||
aggregate_ix++;
|
|
||||||
} while (isinf(my_right_edge));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now propagate right_edge backward, from fill to segment.
|
|
||||||
for (uint i = 0; i < LG_N_TILE; i++) {
|
|
||||||
// Note: we could try to cut down on write bandwidth here if the value hasn't
|
|
||||||
// changed, but not sure it's worth the complexity to track.
|
|
||||||
sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
|
|
||||||
barrier();
|
|
||||||
if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
|
|
||||||
my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
|
|
||||||
}
|
|
||||||
barrier();
|
|
||||||
}
|
|
||||||
if (crosses_edge) {
|
|
||||||
x1 = int(ceil(my_right_edge * SX));
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
// At this point, we run an iterator over the coverage area,
|
// At this point, we run an iterator over the coverage area,
|
||||||
// trying to keep divergence low.
|
// trying to keep divergence low.
|
||||||
// Right now, it's just a bbox, but we'll get finer with
|
// Right now, it's just a bbox, but we'll get finer with
|
||||||
|
|
Binary file not shown.
|
@ -167,9 +167,7 @@ shared uint sh_flags[WG_SIZE];
|
||||||
shared uint sh_path_count[WG_SIZE];
|
shared uint sh_path_count[WG_SIZE];
|
||||||
shared uint sh_pathseg_count[WG_SIZE];
|
shared uint sh_pathseg_count[WG_SIZE];
|
||||||
|
|
||||||
shared uint sh_min_fill;
|
shared uint sh_part_ix;
|
||||||
|
|
||||||
shared uint sh_tile_ix;
|
|
||||||
shared State sh_prefix;
|
shared State sh_prefix;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
|
@ -177,28 +175,21 @@ void main() {
|
||||||
// Determine partition to process by atomic counter (described in Section
|
// Determine partition to process by atomic counter (described in Section
|
||||||
// 4.4 of prefix sum paper).
|
// 4.4 of prefix sum paper).
|
||||||
if (gl_LocalInvocationID.x == 0) {
|
if (gl_LocalInvocationID.x == 0) {
|
||||||
sh_tile_ix = atomicAdd(state[0], 1);
|
sh_part_ix = atomicAdd(state[0], 1);
|
||||||
sh_min_fill = ~0;
|
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
uint tile_ix = sh_tile_ix;
|
uint part_ix = sh_part_ix;
|
||||||
|
|
||||||
uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
|
uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
|
||||||
ElementRef ref = ElementRef(ix * Element_size);
|
ElementRef ref = ElementRef(ix * Element_size);
|
||||||
|
|
||||||
bool is_fill;
|
bool is_fill;
|
||||||
uint my_min_fill = ~0;
|
|
||||||
th_state[0] = map_element(ref, is_fill);
|
th_state[0] = map_element(ref, is_fill);
|
||||||
if (is_fill) my_min_fill = ix;
|
|
||||||
for (uint i = 1; i < N_ROWS; i++) {
|
for (uint i = 1; i < N_ROWS; i++) {
|
||||||
// discussion question: would it be faster to load using more coherent patterns
|
// discussion question: would it be faster to load using more coherent patterns
|
||||||
// into thread memory? This is kinda strided.
|
// into thread memory? This is kinda strided.
|
||||||
th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
|
th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
|
||||||
if (is_fill && my_min_fill == ~0) {
|
|
||||||
my_min_fill = ix + i;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
atomicMin(sh_min_fill, my_min_fill);
|
|
||||||
State agg = th_state[N_ROWS - 1];
|
State agg = th_state[N_ROWS - 1];
|
||||||
sh_mat[gl_LocalInvocationID.x] = agg.mat;
|
sh_mat[gl_LocalInvocationID.x] = agg.mat;
|
||||||
sh_translate[gl_LocalInvocationID.x] = agg.translate;
|
sh_translate[gl_LocalInvocationID.x] = agg.translate;
|
||||||
|
@ -243,17 +234,17 @@ void main() {
|
||||||
// Publish aggregate for this partition
|
// Publish aggregate for this partition
|
||||||
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
|
||||||
// Note: with memory model, we'd want to generate the atomic store version of this.
|
// Note: with memory model, we'd want to generate the atomic store version of this.
|
||||||
State_write(state_aggregate_ref(tile_ix), agg);
|
State_write(state_aggregate_ref(part_ix), agg);
|
||||||
uint flag = FLAG_AGGREGATE_READY;
|
uint flag = FLAG_AGGREGATE_READY;
|
||||||
memoryBarrierBuffer();
|
memoryBarrierBuffer();
|
||||||
if (tile_ix == 0) {
|
if (part_ix == 0) {
|
||||||
State_write(state_prefix_ref(tile_ix), agg);
|
State_write(state_prefix_ref(part_ix), agg);
|
||||||
flag = FLAG_PREFIX_READY;
|
flag = FLAG_PREFIX_READY;
|
||||||
}
|
}
|
||||||
state[state_flag_index(tile_ix)] = flag;
|
state[state_flag_index(part_ix)] = flag;
|
||||||
if (tile_ix != 0) {
|
if (part_ix != 0) {
|
||||||
// step 4 of paper: decoupled lookback
|
// step 4 of paper: decoupled lookback
|
||||||
uint look_back_ix = tile_ix - 1;
|
uint look_back_ix = part_ix - 1;
|
||||||
while (true) {
|
while (true) {
|
||||||
flag = state[state_flag_index(look_back_ix)];
|
flag = state[state_flag_index(look_back_ix)];
|
||||||
if (flag == FLAG_PREFIX_READY) {
|
if (flag == FLAG_PREFIX_READY) {
|
||||||
|
@ -271,15 +262,14 @@ void main() {
|
||||||
// step 5 of paper: compute inclusive prefix
|
// step 5 of paper: compute inclusive prefix
|
||||||
State inclusive_prefix = combine_state(exclusive, agg);
|
State inclusive_prefix = combine_state(exclusive, agg);
|
||||||
sh_prefix = exclusive;
|
sh_prefix = exclusive;
|
||||||
State_write(state_prefix_ref(tile_ix), inclusive_prefix);
|
State_write(state_prefix_ref(part_ix), inclusive_prefix);
|
||||||
memoryBarrierBuffer();
|
memoryBarrierBuffer();
|
||||||
flag = FLAG_PREFIX_READY;
|
flag = FLAG_PREFIX_READY;
|
||||||
state[state_flag_index(tile_ix)] = flag;
|
state[state_flag_index(part_ix)] = flag;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
my_min_fill = sh_min_fill;
|
if (part_ix != 0) {
|
||||||
if (tile_ix != 0) {
|
|
||||||
exclusive = sh_prefix;
|
exclusive = sh_prefix;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -296,14 +286,9 @@ void main() {
|
||||||
other.pathseg_count = sh_pathseg_count[ix];
|
other.pathseg_count = sh_pathseg_count[ix];
|
||||||
row = combine_state(row, other);
|
row = combine_state(row, other);
|
||||||
}
|
}
|
||||||
if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
|
|
||||||
state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
|
|
||||||
}
|
|
||||||
for (uint i = 0; i < N_ROWS; i++) {
|
for (uint i = 0; i < N_ROWS; i++) {
|
||||||
State st = combine_state(row, th_state[i]);
|
State st = combine_state(row, th_state[i]);
|
||||||
if (my_min_fill == ix + i) {
|
|
||||||
state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
|
|
||||||
}
|
|
||||||
// We write the state now for development purposes, but the
|
// We write the state now for development purposes, but the
|
||||||
// actual goal is to write transformed and annotated elements.
|
// actual goal is to write transformed and annotated elements.
|
||||||
//State_write(StateRef((ix + i) * State_size), st);
|
//State_write(StateRef((ix + i) * State_size), st);
|
||||||
|
|
Binary file not shown.
|
@ -237,10 +237,10 @@ impl<D: Device> Renderer<D> {
|
||||||
&[n_paths as u32, 0, bin_alloc_start as u32],
|
&[n_paths as u32, 0, bin_alloc_start as u32],
|
||||||
)?;
|
)?;
|
||||||
let bin_code = include_bytes!("../shader/binning.spv");
|
let bin_code = include_bytes!("../shader/binning.spv");
|
||||||
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
|
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
|
||||||
let bin_ds = device.create_descriptor_set(
|
let bin_ds = device.create_descriptor_set(
|
||||||
&bin_pipeline,
|
&bin_pipeline,
|
||||||
&[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
|
&[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
|
||||||
&[],
|
&[],
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue