Clean up bits of right edge tracking logic left over from sort-middle.

bhmerchant@gmail.com 2020-08-09 16:13:50 -07:00
parent 724c4899f2
commit d836d21d12
5 changed files with 18 additions and 76 deletions


@@ -15,24 +15,18 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
     uint[] annotated;
 };
 
-// This is for scanning forward for right_edge data.
-layout(set = 0, binding = 1) buffer StateBuf {
-    uint[] state;
-};
-
-layout(set = 0, binding = 2) buffer AllocBuf {
+layout(set = 0, binding = 1) buffer AllocBuf {
     uint n_elements; // paths
     // Will be incremented atomically to claim tiles
     uint tile_ix;
     uint alloc;
 };
 
-layout(set = 0, binding = 3) buffer BinsBuf {
+layout(set = 0, binding = 2) buffer BinsBuf {
     uint[] bins;
 };
 
 #include "annotated.h"
-#include "state.h"
 #include "bins.h"
 
 // scale factors useful for converting coordinates to bins
@@ -52,12 +46,6 @@ shared uint sh_chunk_start[N_TILE];
 shared float sh_right_edge[N_TILE];
 
-#define StateBuf_stride (8 + 2 * State_size)
-
-uint state_right_edge_index(uint partition_ix) {
-    return 2 + partition_ix * (StateBuf_stride / 4);
-}
-
 void main() {
     uint chunk_n = 0;
     uint my_n_elements = n_elements;
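
The deleted helper addressed a per-partition word in the element pipeline's state buffer. The layout implied by these indices (inferred from this diff only; the authoritative definitions live in state.h):

    // Inferred StateBuf layout, in 32-bit words (k = partition index):
    //   state[0]                             partition counter (atomicAdd target)
    //   state[1 + k * StateBuf_stride / 4]   per-partition flag (FLAG_*)
    //   state[2 + k * StateBuf_stride / 4]   right edge, as float bits (dropped here)
    //   following words                      aggregate State, then prefix State,
    //   giving StateBuf_stride = 8 + 2 * State_size bytes per partition.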
@@ -103,37 +91,6 @@ void main() {
             break;
         }
 
-/*
-        // If the last element in this partition is a fill edge, then we need to do a
-        // look-forward to find the right edge of its corresponding fill. That data is
-        // recorded in aggregates computed in the element processing pass.
-        if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) {
-            uint aggregate_ix = (my_partition + 1) * ELEMENT_BINNING_RATIO;
-            // This is sequential but the expectation is that the amount of
-            // look-forward is small (performance may degrade in the case
-            // of massively complex paths).
-            do {
-                my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
-                aggregate_ix++;
-            } while (isinf(my_right_edge));
-        }
-
-        // Now propagate right_edge backward, from fill to segment.
-        for (uint i = 0; i < LG_N_TILE; i++) {
-            // Note: we could try to cut down on write bandwidth here if the value hasn't
-            // changed, but not sure it's worth the complexity to track.
-            sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
-            barrier();
-            if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
-                my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
-            }
-            barrier();
-        }
-
-        if (crosses_edge) {
-            x1 = int(ceil(my_right_edge * SX));
-        }
-*/
         // At this point, we run an iterator over the coverage area,
         // trying to keep divergence low.
         // Right now, it's just a bbox, but we'll get finer with
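
For reference, the deleted (already commented-out) block combined a sequential look-forward into the element pass's aggregates with a log-step backward propagation: in round i, a thread whose right_edge is still infinity pulls a candidate from the lane 2^i to its right, so the nearest finite value reaches every segment after LG_N_TILE rounds. A minimal standalone sketch of that propagation pattern (not part of this commit; names mirror the deleted code):

    shared float sh_right_edge[N_TILE];

    // After LG_N_TILE rounds, every thread that started with +inf holds
    // the nearest finite right_edge from a thread to its right.
    float propagate_right_edge(float my_right_edge) {
        for (uint i = 0; i < LG_N_TILE; i++) {
            sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
            barrier();
            uint src = gl_LocalInvocationID.x + (1u << i);
            if (src < N_TILE && isinf(my_right_edge)) {
                my_right_edge = sh_right_edge[src];
            }
            barrier();
        }
        return my_right_edge;
    }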

Binary file not shown.


@@ -167,9 +167,7 @@ shared uint sh_flags[WG_SIZE];
 shared uint sh_path_count[WG_SIZE];
 shared uint sh_pathseg_count[WG_SIZE];
 
-shared uint sh_min_fill;
-shared uint sh_tile_ix;
-
+shared uint sh_part_ix;
 shared State sh_prefix;
 
 void main() {
@@ -177,28 +175,21 @@ void main() {
     // Determine partition to process by atomic counter (described in Section
     // 4.4 of prefix sum paper).
     if (gl_LocalInvocationID.x == 0) {
-        sh_tile_ix = atomicAdd(state[0], 1);
-        sh_min_fill = ~0;
+        sh_part_ix = atomicAdd(state[0], 1);
     }
     barrier();
-    uint tile_ix = sh_tile_ix;
-    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
+    uint part_ix = sh_part_ix;
+    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
     bool is_fill;
-    uint my_min_fill = ~0;
     th_state[0] = map_element(ref, is_fill);
-    if (is_fill) my_min_fill = ix;
     for (uint i = 1; i < N_ROWS; i++) {
         // discussion question: would it be faster to load using more coherent patterns
         // into thread memory? This is kinda strided.
         th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
-        if (is_fill && my_min_fill == ~0) {
-            my_min_fill = ix + i;
-        }
     }
-    atomicMin(sh_min_fill, my_min_fill);
 
     State agg = th_state[N_ROWS - 1];
     sh_mat[gl_LocalInvocationID.x] = agg.mat;
     sh_translate[gl_LocalInvocationID.x] = agg.translate;
@@ -243,17 +234,17 @@ void main() {
     // Publish aggregate for this partition
     if (gl_LocalInvocationID.x == WG_SIZE - 1) {
         // Note: with memory model, we'd want to generate the atomic store version of this.
-        State_write(state_aggregate_ref(tile_ix), agg);
+        State_write(state_aggregate_ref(part_ix), agg);
         uint flag = FLAG_AGGREGATE_READY;
         memoryBarrierBuffer();
-        if (tile_ix == 0) {
-            State_write(state_prefix_ref(tile_ix), agg);
+        if (part_ix == 0) {
+            State_write(state_prefix_ref(part_ix), agg);
             flag = FLAG_PREFIX_READY;
         }
-        state[state_flag_index(tile_ix)] = flag;
-        if (tile_ix != 0) {
+        state[state_flag_index(part_ix)] = flag;
+        if (part_ix != 0) {
             // step 4 of paper: decoupled lookback
-            uint look_back_ix = tile_ix - 1;
+            uint look_back_ix = part_ix - 1;
             while (true) {
                 flag = state[state_flag_index(look_back_ix)];
                 if (flag == FLAG_PREFIX_READY) {
@@ -271,15 +262,14 @@ void main() {
             // step 5 of paper: compute inclusive prefix
             State inclusive_prefix = combine_state(exclusive, agg);
             sh_prefix = exclusive;
-            State_write(state_prefix_ref(tile_ix), inclusive_prefix);
+            State_write(state_prefix_ref(part_ix), inclusive_prefix);
             memoryBarrierBuffer();
             flag = FLAG_PREFIX_READY;
-            state[state_flag_index(tile_ix)] = flag;
+            state[state_flag_index(part_ix)] = flag;
         }
     }
 
     barrier();
-    my_min_fill = sh_min_fill;
-    if (tile_ix != 0) {
+    if (part_ix != 0) {
         exclusive = sh_prefix;
     }
@@ -296,14 +286,9 @@ void main() {
         other.pathseg_count = sh_pathseg_count[ix];
         row = combine_state(row, other);
     }
-    if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
-        state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
-    }
     for (uint i = 0; i < N_ROWS; i++) {
         State st = combine_state(row, th_state[i]);
-        if (my_min_fill == ix + i) {
-            state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
-        }
+
         // We write the state now for development purposes, but the
         // actual goal is to write transformed and annotated elements.
         //State_write(StateRef((ix + i) * State_size), st);
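
The renames above run through the shader's decoupled look-back spine (steps 4 and 5 of the single-pass prefix sum paper cited in the comments): each partition publishes its aggregate, then walks backward over its predecessors, folding in aggregates until it reaches a published inclusive prefix. The protocol reduced to a plain uint sum (a sketch only; flag_ix, aggregate_ix, and prefix_ix are hypothetical helpers, and the real shader combines full State records):

    uint decoupled_lookback(uint part_ix) {
        uint exclusive = 0;
        uint look_back_ix = part_ix - 1;
        while (true) {
            uint flag = state[flag_ix(look_back_ix)];
            if (flag == FLAG_PREFIX_READY) {
                // The predecessor published an inclusive prefix: done.
                exclusive += state[prefix_ix(look_back_ix)];
                break;
            } else if (flag == FLAG_AGGREGATE_READY) {
                // Only that partition's own sum is ready: fold it in
                // and keep walking toward partition 0.
                exclusive += state[aggregate_ix(look_back_ix)];
                look_back_ix--;
            }
            // Otherwise the predecessor hasn't published yet: spin.
        }
        return exclusive;
    }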

Binary file not shown.


@@ -237,10 +237,10 @@ impl<D: Device> Renderer<D> {
             &[n_paths as u32, 0, bin_alloc_start as u32],
         )?;
         let bin_code = include_bytes!("../shader/binning.spv");
-        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
+        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
         let bin_ds = device.create_descriptor_set(
             &bin_pipeline,
-            &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
+            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
             &[],
         )?;
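
The pipeline's buffer count drops from 4 to 3 to match the binning shader once StateBuf is gone, and the buffers passed to create_descriptor_set pair up with the shader's binding indices in declaration order. Read off this diff:

    layout(set = 0, binding = 0) buffer AnnotatedBuf { ... };  // anno_buf
    layout(set = 0, binding = 1) buffer AllocBuf { ... };      // bin_alloc_buf_dev
    layout(set = 0, binding = 2) buffer BinsBuf { ... };       // bin_buf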