diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs
index 672b42d..f37f0cd 100644
--- a/piet-gpu/bin/cli.rs
+++ b/piet-gpu/bin/cli.rs
@@ -70,7 +70,7 @@ fn main() -> Result<(), Error> {
     piet_gpu::dump_k1_data(&data);
 
     let mut data: Vec<u32> = Default::default();
-    device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
+    device.read_buffer(&renderer.anno_buf, &mut data).unwrap();
     piet_gpu::dump_k1_data(&data);
     */
 
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 94084c8..76d56b6 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -18,10 +18,10 @@
 layout(set = 0, binding = 0) readonly buffer SceneBuf {
     uint[] scene;
 };
 
-// This will be used for inter-workgroup aggregates. In the
-// meantime, for development, it has been used to store the
-// scan of the state objects.
-layout(set = 0, binding = 1) buffer StateBuf {
+// It would be better to use the Vulkan memory model than
+// "volatile" but shooting for compatibility here rather
+// than doing things right.
+layout(set = 0, binding = 1) volatile buffer StateBuf {
     uint[] state;
 };
@@ -34,8 +34,28 @@ layout(set = 0, binding = 2) buffer AnnotatedBuf {
 #include "state.h"
 #include "annotated.h"
 
+#define StateBuf_stride (4 + 2 * State_size)
+
+StateRef state_aggregate_ref(uint partition_ix) {
+    return StateRef(8 + partition_ix * StateBuf_stride);
+}
+
+StateRef state_prefix_ref(uint partition_ix) {
+    return StateRef(8 + partition_ix * StateBuf_stride + State_size);
+}
+
+uint state_flag_index(uint partition_ix) {
+    return 1 + partition_ix * (StateBuf_stride / 4);
+}
+
+// These correspond to X, A, P respectively in the prefix sum paper.
+#define FLAG_NOT_READY 0
+#define FLAG_AGGREGATE_READY 1
+#define FLAG_PREFIX_READY 2
+
 #define FLAG_SET_LINEWIDTH 1
-#define FLAG_RESET_BBOX 2
+#define FLAG_SET_BBOX 2
+#define FLAG_RESET_BBOX 4
 
 // This is almost like a monoid (the interaction between transformation and
 // bounding boxes is approximate)
@@ -47,7 +67,9 @@ State combine_state(State a, State b) {
     c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
     if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
         c.bbox = a.bbox;
-    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) {
+    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
+        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
+    {
         c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
         c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
     }
@@ -59,7 +81,7 @@ State combine_state(State a, State b) {
     c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
     c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
     c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
-    c.flags = a.flags | b.flags;
+    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
     return c;
 }
 
@@ -122,10 +144,18 @@ shared vec4 sh_bbox[WG_SIZE];
 shared float sh_width[WG_SIZE];
 shared uint sh_flags[WG_SIZE];
 
+shared uint sh_tile_ix;
+shared State sh_prefix;
+
 void main() {
     State th_state[N_ROWS];
-    // this becomes an atomic counter
-    uint tile_ix = gl_WorkGroupID.x;
+    // Determine partition to process by atomic counter (described in Section
+    // 4.4 of prefix sum paper).
+    if (gl_LocalInvocationID.x == 0) {
+        sh_tile_ix = atomicAdd(state[0], 1);
+    }
+    barrier();
+    uint tile_ix = sh_tile_ix;
     uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
     ElementRef ref = ElementRef(ix * Element_size);
 
@@ -162,16 +192,54 @@ void main() {
         sh_flags[gl_LocalInvocationID.x] = agg.flags;
     }
 
-    // TODO: if last invocation in wg, publish agg.
-
-    barrier();
     State exclusive;
     exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
     exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
     exclusive.translate = vec2(0.0, 0.0);
     exclusive.linewidth = 1.0; //TODO should be 0.0
     exclusive.flags = 0;
-    // TODO: do decoupled look-back
+
+    // Publish aggregate for this partition
+    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+        // Note: with memory model, we'd want to generate the atomic store version of this.
+        State_write(state_aggregate_ref(tile_ix), agg);
+        uint flag = FLAG_AGGREGATE_READY;
+        memoryBarrierBuffer();
+        if (tile_ix == 0) {
+            State_write(state_prefix_ref(tile_ix), agg);
+            flag = FLAG_PREFIX_READY;
+        }
+        state[state_flag_index(tile_ix)] = flag;
+        if (tile_ix != 0) {
+            // step 4 of paper: decoupled lookback
+            uint look_back_ix = tile_ix - 1;
+            while (true) {
+                flag = state[state_flag_index(look_back_ix)];
+                if (flag == FLAG_PREFIX_READY) {
+                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
+                    exclusive = combine_state(their_prefix, exclusive);
+                    break;
+                } else if (flag == FLAG_AGGREGATE_READY) {
+                    State their_agg = State_read(state_aggregate_ref(look_back_ix));
+                    exclusive = combine_state(their_agg, exclusive);
+                    look_back_ix--;
+                }
+                // else spin
+            }
+
+            // step 5 of paper: compute inclusive prefix
+            State inclusive_prefix = combine_state(exclusive, agg);
+            sh_prefix = exclusive;
+            State_write(state_prefix_ref(tile_ix), inclusive_prefix);
+            memoryBarrierBuffer();
+            flag = FLAG_PREFIX_READY;
+            state[state_flag_index(tile_ix)] = flag;
+        }
+    }
+    barrier();
+    if (tile_ix != 0) {
+        exclusive = sh_prefix;
+    }
 
     State row = exclusive;
     if (gl_LocalInvocationID.x > 0) {
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index f858129..a19b5f8 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 0ac8299..bf7a7c7 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -158,7 +158,7 @@ impl Renderer {
             .unwrap();
         device.write_buffer(&scene_buf, &scene)?;
 
-        let state_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
+        let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
         let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
         let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
         let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
@@ -245,6 +245,7 @@ impl Renderer {
         cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
         cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
         cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
+        cmd_buf.clear_buffer(&self.state_buf);
        cmd_buf.memory_barrier();
         cmd_buf.image_barrier(
             &self.image_dev,
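
A note on the StateBuf layout introduced above: word 0 of the buffer is the atomic partition counter, and each partition then gets one flag word followed by two State records, the published aggregate and the published inclusive prefix. Taking the partition index from an atomicAdd rather than gl_WorkGroupID.x is what makes the spin loop safe: partition numbers are handed out in the order workgroups actually start executing, so a partition only ever waits on partitions that are already in flight. The sketch below restates the layout as host-side arithmetic; it is illustrative rather than part of the patch, and State_size = 48 bytes is an assumption about the packed State record in the generated state.h.

    // Hypothetical host-side mirror of the shader-side defines; not in this commit.
    const STATE_SIZE: u64 = 48; // assumed size of a packed State record
    const STATE_BUF_STRIDE: u64 = 4 + 2 * STATE_SIZE; // flag word + aggregate + prefix

    // Byte offsets corresponding to state_flag_index, state_aggregate_ref,
    // and state_prefix_ref (word 0 is the atomic partition counter).
    fn flag_offset(partition_ix: u64) -> u64 {
        4 + partition_ix * STATE_BUF_STRIDE
    }

    fn aggregate_offset(partition_ix: u64) -> u64 {
        8 + partition_ix * STATE_BUF_STRIDE
    }

    fn prefix_offset(partition_ix: u64) -> u64 {
        8 + partition_ix * STATE_BUF_STRIDE + STATE_SIZE
    }

    // Minimum StateBuf size for n partitions; the 1 MiB allocation in lib.rs
    // covers roughly 10,000 partitions at this stride.
    fn state_buf_size(n_partitions: u64) -> u64 {
        4 + n_partitions * STATE_BUF_STRIDE
    }

This arithmetic also explains the new clear_buffer call in lib.rs: zero-filling the buffer before the pipeline runs resets the partition counter to zero and every flag word to FLAG_NOT_READY in one stroke, which is exactly the initial condition the look-back protocol depends on.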