Add a right_edge field to State and use it in the binning pass.

Summary of the hunks below:
  * State (Rust type, GLSL struct, State_read/State_write) gains a float
    right_edge stored at word 11; flags moves to word 12 and State_size
    changes from 48 to 52.
  * elements.comp carries right_edge through combine_state and the
    sh_right_edge shared array, and renames TILE_SIZE to PARTITION_SIZE.
  * binning.comp declares StateBuf at binding 1 (AllocBuf and BinsBuf move
    to bindings 2 and 3), looks forward through later partition aggregates
    when the last element is a line, then propagates right_edge backward
    through sh_right_edge.
  * lib.rs adds state_buf to the binning descriptor set and changes the
    second argument of create_simple_compute_pipeline from 3 to 4.

NOTE(review): in combine_state the last fallback arm of the new ternary
reads c.right_edge itself; no hunk in this patch assigns it beforehand, so
confirm which value was intended there.
NOTE(review): the do/while look-forward in binning.comp places no upper
bound on aggregate_ix; confirm a finite right_edge is always reachable.
NOTE(review): line structure was restored from a whitespace-collapsed copy;
hunk line counts match the @@ headers, but leading indentation of context
lines is reconstructed and should be checked against the target files.

diff --git a/piet-gpu-types/src/state.rs b/piet-gpu-types/src/state.rs
index 35076f0..b93e9f3 100644
--- a/piet-gpu-types/src/state.rs
+++ b/piet-gpu-types/src/state.rs
@@ -8,6 +8,7 @@ piet_gpu! {
             translate: [f32; 2],
             bbox: [f32; 4],
             linewidth: f32,
+            right_edge: f32,
             flags: u32,
         }
     }
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index c3067e7..cba0217 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -11,24 +11,33 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
     uint[] annotated;
 };
 
-layout(set = 0, binding = 1) buffer AllocBuf {
+// This is for scanning forward for right_edge data.
+layout(set = 0, binding = 1) buffer StateBuf {
+    uint[] state;
+};
+
+layout(set = 0, binding = 2) buffer AllocBuf {
     uint n_elements;
     // Will be incremented atomically to claim tiles
     uint tile_ix;
     uint alloc;
 };
 
-layout(set = 0, binding = 2) buffer BinsBuf {
+layout(set = 0, binding = 3) buffer BinsBuf {
     uint[] bins;
 };
 
 #include "annotated.h"
+#include "state.h"
 #include "bins.h"
 
 // scale factors useful for converting coordinates to bins
 #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
 #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
 
+// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
+#define INFINITY (1.0 / 0.0)
+
 // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
@@ -37,6 +46,14 @@ shared uint sh_chunk_start[N_TILE];
 shared uint sh_chunk_end[N_TILE];
 shared uint sh_chunk_jump[N_TILE];
 
+shared float sh_right_edge[N_TILE];
+
+#define StateBuf_stride (4 + 2 * State_size)
+
+StateRef state_aggregate_ref(uint partition_ix) {
+    return StateRef(8 + partition_ix * StateBuf_stride);
+}
+
 void main() {
     BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
     uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC;
@@ -65,6 +82,7 @@ void main() {
             tag = Annotated_tag(ref);
         }
         int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
+        float my_right_edge = INFINITY;
         switch (tag) {
         case Annotated_Line:
             AnnoLineSeg line = Annotated_Line_read(ref);
@@ -82,8 +100,37 @@ void main() {
             y0 = int(floor(fill.bbox.y * SY));
             x1 = int(ceil(fill.bbox.z * SX));
             y1 = int(ceil(fill.bbox.w * SY));
+            my_right_edge = x1;
             break;
         }
+
+        // If the last element in this partition is a fill edge, then we need to do a
+        // look-forward to find the right edge of its corresponding fill. That data is
+        // recorded in aggregates computed in the element processing pass.
+        if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_Line) {
+            uint aggregate_ix = (my_tile + 1) * ELEMENT_BINNING_RATIO;
+            // This is sequential but the expectation is that the amount of
+            // look-forward is small (performance may degrade in the case
+            // of massively complex paths).
+            do {
+                StateRef agg_ref = state_aggregate_ref(aggregate_ix);
+                my_right_edge = State_read(agg_ref).right_edge;
+                aggregate_ix++;
+            } while (isinf(my_right_edge));
+        }
+
+        // Now propagate right_edge backward, from fill to segment.
+        for (uint i = 0; i < LG_N_TILE; i++) {
+            // Note: we could try to cut down on write bandwidth here if the value hasn't
+            // changed, but not sure it's worth the complexity to track.
+            sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
+            barrier();
+            if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
+                my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
+            }
+            barrier();
+        }
+
         // At this point, we run an iterator over the coverage area,
         // trying to keep divergence low.
         // Right now, it's just a bbox, but we'll get finer with
@@ -141,6 +188,7 @@ void main() {
             chunk_n = element_count - chunk_n;
         } else {
             chunk_end = ~0;
+            chunk_new_start = ~0;
             chunk_n = element_count;
         }
         sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset;
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index 76148c2..a5379e6 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 8f87b87..15ad80d 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -10,7 +10,7 @@
 #define N_ROWS 4
 #define WG_SIZE 32
 #define LG_WG_SIZE 5
-#define TILE_SIZE (WG_SIZE * N_ROWS)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
 
 layout(local_size_x = WG_SIZE, local_size_y = 1) in;
 
@@ -81,6 +81,7 @@ State combine_state(State a, State b) {
     c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
     c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
     c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
+    c.right_edge = (a.flags & FLAG_SET_BBOX) != 0 ? a.right_edge : (a.flags & FLAG_RESET_BBOX) != 0 ? a.bbox.z : c.right_edge;
     c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
     c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
     return c;
@@ -143,6 +144,7 @@ shared vec4 sh_mat[WG_SIZE];
 shared vec2 sh_translate[WG_SIZE];
 shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
+shared float sh_right_edge[WG_SIZE];
 shared uint sh_flags[WG_SIZE];
 
 shared uint sh_tile_ix;
@@ -158,7 +160,7 @@ void main() {
     barrier();
     uint tile_ix = sh_tile_ix;
 
-    uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
+    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
     th_state[0] = map_element(ref);
@@ -172,6 +174,7 @@ void main() {
     sh_translate[gl_LocalInvocationID.x] = agg.translate;
     sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
     sh_width[gl_LocalInvocationID.x] = agg.linewidth;
+    sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
     sh_flags[gl_LocalInvocationID.x] = agg.flags;
     for (uint i = 0; i < LG_WG_SIZE; i++) {
         barrier();
@@ -190,6 +193,7 @@ void main() {
         sh_translate[gl_LocalInvocationID.x] = agg.translate;
         sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
         sh_width[gl_LocalInvocationID.x] = agg.linewidth;
+        sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
         sh_flags[gl_LocalInvocationID.x] = agg.flags;
     }
 
@@ -198,6 +202,7 @@ void main() {
     exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
     exclusive.translate = vec2(0.0, 0.0);
     exclusive.linewidth = 1.0; //TODO should be 0.0
+    exclusive.right_edge = 0.0;
     exclusive.flags = 0;
 
     // Publish aggregate for this partition
@@ -250,6 +255,7 @@ void main() {
         other.translate = sh_translate[ix];
         other.bbox = sh_bbox[ix];
         other.linewidth = sh_width[ix];
+        other.right_edge = sh_right_edge[ix];
         other.flags = sh_flags[ix];
         row = combine_state(row, other);
     }
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index 7828aa4..ff0ae2d 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index 5d8fb9b..6b00661 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -51,9 +51,14 @@
 #define N_TILE_X 16
 #define N_TILE_Y 16
 #define N_TILE (N_TILE_X * N_TILE_Y)
+#define LG_N_TILE 8
 #define N_SLICE (N_TILE / 32)
 // Number of workgroups for binning kernel
 #define N_WG 16
 
+// This is the ratio of the number of elements in a binning workgroup
+// over the number of elements in a partition workgroup.
+#define ELEMENT_BINNING_RATIO 4
+
 #define BIN_INITIAL_ALLOC 64
 #define BIN_ALLOC 256
diff --git a/piet-gpu/shader/state.h b/piet-gpu/shader/state.h
index 2547b93..bc6192f 100644
--- a/piet-gpu/shader/state.h
+++ b/piet-gpu/shader/state.h
@@ -9,10 +9,11 @@ struct State {
     vec2 translate;
     vec4 bbox;
     float linewidth;
+    float right_edge;
     uint flags;
 };
 
-#define State_size 48
+#define State_size 52
 
 StateRef State_index(StateRef ref, uint index) {
     return StateRef(ref.offset + index * State_size);
@@ -32,12 +33,14 @@ State State_read(StateRef ref) {
     uint raw9 = state[ix + 9];
     uint raw10 = state[ix + 10];
     uint raw11 = state[ix + 11];
+    uint raw12 = state[ix + 12];
     State s;
     s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
     s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
     s.linewidth = uintBitsToFloat(raw10);
-    s.flags = raw11;
+    s.right_edge = uintBitsToFloat(raw11);
+    s.flags = raw12;
     return s;
 }
 
@@ -54,6 +57,7 @@ void State_write(StateRef ref, State s) {
     state[ix + 8] = floatBitsToUint(s.bbox.z);
     state[ix + 9] = floatBitsToUint(s.bbox.w);
     state[ix + 10] = floatBitsToUint(s.linewidth);
-    state[ix + 11] = s.flags;
+    state[ix + 11] = floatBitsToUint(s.right_edge);
+    state[ix + 12] = s.flags;
 }
 
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 2dca39d..70b02f5 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -185,10 +185,10 @@ impl Renderer {
             ])
             ?;
         let bin_code = include_bytes!("../shader/binning.spv");
-        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
+        let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
         let bin_ds = device.create_descriptor_set(
             &bin_pipeline,
-            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
+            &[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
             &[],
         )?;
 