diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index 0ddeb7e..1413927 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -15,24 +15,18 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
     uint[] annotated;
 };
 
-// This is for scanning forward for right_edge data.
-layout(set = 0, binding = 1) buffer StateBuf {
-    uint[] state;
-};
-
-layout(set = 0, binding = 2) buffer AllocBuf {
+layout(set = 0, binding = 1) buffer AllocBuf {
     uint n_elements; // paths
     // Will be incremented atomically to claim tiles
     uint tile_ix;
     uint alloc;
 };
 
-layout(set = 0, binding = 3) buffer BinsBuf {
+layout(set = 0, binding = 2) buffer BinsBuf {
     uint[] bins;
 };
 
 #include "annotated.h"
-#include "state.h"
 #include "bins.h"
 
 // scale factors useful for converting coordinates to bins
@@ -52,12 +46,6 @@ shared uint sh_chunk_start[N_TILE];
 
 shared float sh_right_edge[N_TILE];
 
-#define StateBuf_stride (8 + 2 * State_size)
-
-uint state_right_edge_index(uint partition_ix) {
-    return 2 + partition_ix * (StateBuf_stride / 4);
-}
-
 void main() {
     uint chunk_n = 0;
     uint my_n_elements = n_elements;
@@ -103,37 +91,6 @@ void main() {
             break;
         }
 
-        /*
-        // If the last element in this partition is a fill edge, then we need to do a
-        // look-forward to find the right edge of its corresponding fill. That data is
-        // recorded in aggregates computed in the element processing pass.
-        if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) {
-            uint aggregate_ix = (my_partition + 1) * ELEMENT_BINNING_RATIO;
-            // This is sequential but the expectation is that the amount of
-            // look-forward is small (performance may degrade in the case
-            // of massively complex paths).
-            do {
-                my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
-                aggregate_ix++;
-            } while (isinf(my_right_edge));
-        }
-
-        // Now propagate right_edge backward, from fill to segment.
-        for (uint i = 0; i < LG_N_TILE; i++) {
-            // Note: we could try to cut down on write bandwidth here if the value hasn't
-            // changed, but not sure it's worth the complexity to track.
-            sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
-            barrier();
-            if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
-                my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
-            }
-            barrier();
-        }
-        if (crosses_edge) {
-            x1 = int(ceil(my_right_edge * SX));
-        }
-        */
-
         // At this point, we run an iterator over the coverage area,
         // trying to keep divergence low.
         // Right now, it's just a bbox, but we'll get finer with
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 0d30dfc..1b31cd1 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 905f875..1886907 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -167,9 +167,7 @@ shared uint sh_flags[WG_SIZE];
 shared uint sh_path_count[WG_SIZE];
 shared uint sh_pathseg_count[WG_SIZE];
 
-shared uint sh_min_fill;
-
-shared uint sh_tile_ix;
+shared uint sh_part_ix;
 shared State sh_prefix;
 
 void main() {
@@ -177,28 +175,21 @@ void main() {
     // Determine partition to process by atomic counter (described in Section
     // 4.4 of prefix sum paper).
     if (gl_LocalInvocationID.x == 0) {
-        sh_tile_ix = atomicAdd(state[0], 1);
-        sh_min_fill = ~0;
+        sh_part_ix = atomicAdd(state[0], 1);
     }
     barrier();
-    uint tile_ix = sh_tile_ix;
+    uint part_ix = sh_part_ix;
 
-    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
+    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
     bool is_fill;
-    uint my_min_fill = ~0;
     th_state[0] = map_element(ref, is_fill);
-    if (is_fill) my_min_fill = ix;
     for (uint i = 1; i < N_ROWS; i++) {
         // discussion question: would it be faster to load using more coherent patterns
         // into thread memory? This is kinda strided.
         th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
-        if (is_fill && my_min_fill == ~0) {
-            my_min_fill = ix + i;
-        }
     }
-    atomicMin(sh_min_fill, my_min_fill);
     State agg = th_state[N_ROWS - 1];
     sh_mat[gl_LocalInvocationID.x] = agg.mat;
     sh_translate[gl_LocalInvocationID.x] = agg.translate;
@@ -243,17 +234,17 @@ void main() {
     // Publish aggregate for this partition
     if (gl_LocalInvocationID.x == WG_SIZE - 1) {
         // Note: with memory model, we'd want to generate the atomic store version of this.
-        State_write(state_aggregate_ref(tile_ix), agg);
+        State_write(state_aggregate_ref(part_ix), agg);
         uint flag = FLAG_AGGREGATE_READY;
         memoryBarrierBuffer();
-        if (tile_ix == 0) {
-            State_write(state_prefix_ref(tile_ix), agg);
+        if (part_ix == 0) {
+            State_write(state_prefix_ref(part_ix), agg);
             flag = FLAG_PREFIX_READY;
         }
-        state[state_flag_index(tile_ix)] = flag;
-        if (tile_ix != 0) {
+        state[state_flag_index(part_ix)] = flag;
+        if (part_ix != 0) {
             // step 4 of paper: decoupled lookback
-            uint look_back_ix = tile_ix - 1;
+            uint look_back_ix = part_ix - 1;
             while (true) {
                 flag = state[state_flag_index(look_back_ix)];
                 if (flag == FLAG_PREFIX_READY) {
@@ -271,15 +262,14 @@ void main() {
             // step 5 of paper: compute inclusive prefix
             State inclusive_prefix = combine_state(exclusive, agg);
             sh_prefix = exclusive;
-            State_write(state_prefix_ref(tile_ix), inclusive_prefix);
+            State_write(state_prefix_ref(part_ix), inclusive_prefix);
             memoryBarrierBuffer();
             flag = FLAG_PREFIX_READY;
-            state[state_flag_index(tile_ix)] = flag;
+            state[state_flag_index(part_ix)] = flag;
         }
     }
     barrier();
-    my_min_fill = sh_min_fill;
-    if (tile_ix != 0) {
+    if (part_ix != 0) {
         exclusive = sh_prefix;
     }
 
@@ -296,14 +286,9 @@ void main() {
         other.pathseg_count = sh_pathseg_count[ix];
         row = combine_state(row, other);
     }
-    if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
-        state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
-    }
     for (uint i = 0; i < N_ROWS; i++) {
         State st = combine_state(row, th_state[i]);
-        if (my_min_fill == ix + i) {
-            state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
-        }
+
        // We write the state now for development purposes, but the
         // actual goal is to write transformed and annotated elements.
         //State_write(StateRef((ix + i) * State_size), st);
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 552956c..5bd0650 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 7c0a1fc..324df71 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -237,10 +237,10 @@ impl Renderer {
             &[n_paths as u32, 0, bin_alloc_start as u32],
         )?;
         let bin_code = include_bytes!("../shader/binning.spv");
-        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
+        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
         let bin_ds = device.create_descriptor_set(
             &bin_pipeline,
-            &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
+            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
             &[],
         )?;
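
Aside (not part of the patch): the part_ix rename in elements.comp touches the decoupled look-back of the single-pass prefix sum paper ("step 4" and "step 5" in the shader comments). The sketch below is a minimal GLSL illustration of that protocol, not the shader's actual code: the FLAG_* values, the three-slot-per-partition state layout, the flag/aggregate/prefix index helpers, and the plain commutative uint payload standing in for the State struct are all assumptions made for the sketch. As the shader's own note says, without the Vulkan memory model the flag stores would ideally be atomic.

// Sketch only: assumes a coherent/volatile SSBO and a toy uint payload.
layout(set = 0, binding = 0) volatile coherent buffer StateBuf {
    uint[] state;
};

#define FLAG_NOT_READY       0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY    2

// Hypothetical layout: three uint slots (flag, aggregate, prefix) per partition.
uint flag_index(uint ix)      { return ix * 3; }
uint aggregate_index(uint ix) { return ix * 3 + 1; }
uint prefix_index(uint ix)    { return ix * 3 + 2; }

// Run by one thread per partition once the local aggregate `agg` is known;
// returns the exclusive prefix of all partitions before part_ix.
uint decoupled_look_back(uint part_ix, uint agg) {
    if (part_ix == 0) {
        // First partition: its inclusive prefix is just its aggregate.
        state[prefix_index(0)] = agg;
        memoryBarrierBuffer();
        state[flag_index(0)] = FLAG_PREFIX_READY;
        return 0;
    }
    // Publish our aggregate first so successors can make partial progress.
    state[aggregate_index(part_ix)] = agg;
    memoryBarrierBuffer();
    state[flag_index(part_ix)] = FLAG_AGGREGATE_READY;
    // Step 4: walk left, folding in aggregates, until a finished prefix is found.
    uint exclusive = 0;
    uint look_back_ix = part_ix - 1;
    while (true) {
        uint flag = state[flag_index(look_back_ix)];
        if (flag == FLAG_PREFIX_READY) {
            // Predecessor's inclusive prefix is final: done looking back.
            exclusive += state[prefix_index(look_back_ix)];
            break;
        } else if (flag == FLAG_AGGREGATE_READY) {
            // Only its local aggregate is ready; fold it in and keep walking.
            exclusive += state[aggregate_index(look_back_ix)];
            look_back_ix--;
        }
        // FLAG_NOT_READY: spin; the predecessor will publish eventually.
    }
    // Step 5: publish our inclusive prefix so later partitions can stop here.
    state[prefix_index(part_ix)] = exclusive + agg;
    memoryBarrierBuffer();
    state[flag_index(part_ix)] = FLAG_PREFIX_READY;
    return exclusive;
}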