diff --git a/piet-gpu/bin/cli.rs b/piet-gpu/bin/cli.rs index fe8c4ac..5d7c09e 100644 --- a/piet-gpu/bin/cli.rs +++ b/piet-gpu/bin/cli.rs @@ -184,8 +184,8 @@ fn main() -> Result<(), Error> { /* let mut data: Vec = Default::default(); device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap(); - //piet_gpu::dump_k1_data(&data); - trace_ptcl(&data); + piet_gpu::dump_k1_data(&data); + //trace_ptcl(&data); */ let mut img_data: Vec = Default::default(); diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index 713a654..d35c2d9 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -43,10 +43,7 @@ layout(set = 0, binding = 3) buffer BinsBuf { // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts. shared uint bitmaps[N_SLICE][N_TILE]; shared uint count[N_SLICE][N_TILE]; -shared uint sh_my_tile; shared uint sh_chunk_start[N_TILE]; -shared uint sh_chunk_end[N_TILE]; -shared uint sh_chunk_jump[N_TILE]; shared float sh_right_edge[N_TILE]; @@ -57,179 +54,140 @@ uint state_right_edge_index(uint partition_ix) { } void main() { - BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC); - uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC; uint chunk_n = 0; uint my_n_elements = n_elements; - while (true) { - if (gl_LocalInvocationID.x == 0) { - sh_my_tile = atomicAdd(tile_ix, 1); + uint my_partition = gl_WorkGroupID.x; + + for (uint i = 0; i < N_SLICE; i++) { + bitmaps[i][gl_LocalInvocationID.x] = 0; + } + barrier(); + + // Read inputs and determine coverage of bins + uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x; + AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); + uint tag = Annotated_Nop; + if (element_ix < my_n_elements) { + tag = Annotated_tag(ref); + } + int x0 = 0, y0 = 0, x1 = 0, y1 = 0; + float my_right_edge = INFINITY; + bool crosses_edge = false; + switch (tag) { + case Annotated_FillLine: + case Annotated_StrokeLine: + AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref); + x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX)); + y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY)); + x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX)); + y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY)); + crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY); + break; + case Annotated_Fill: + case Annotated_Stroke: + // Note: we take advantage of the fact that fills and strokes + // have compatible layout. + AnnoFill fill = Annotated_Fill_read(ref); + x0 = int(floor(fill.bbox.x * SX)); + y0 = int(floor(fill.bbox.y * SY)); + x1 = int(ceil(fill.bbox.z * SX)); + y1 = int(ceil(fill.bbox.w * SY)); + // It probably makes more sense to track x1, to avoid having to redo + // the rounding to tile coords. + my_right_edge = fill.bbox.z; + break; + } + + // If the last element in this partition is a fill edge, then we need to do a + // look-forward to find the right edge of its corresponding fill. That data is + // recorded in aggregates computed in the element processing pass. + if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) { + uint aggregate_ix = (my_partition + 1) * ELEMENT_BINNING_RATIO; + // This is sequential but the expectation is that the amount of + // look-forward is small (performance may degrade in the case + // of massively complex paths). 
+ do { + my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]); + aggregate_ix++; + } while (isinf(my_right_edge)); + } + + // Now propagate right_edge backward, from fill to segment. + for (uint i = 0; i < LG_N_TILE; i++) { + // Note: we could try to cut down on write bandwidth here if the value hasn't + // changed, but not sure it's worth the complexity to track. + sh_right_edge[gl_LocalInvocationID.x] = my_right_edge; + barrier(); + if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) { + my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)]; } barrier(); - uint my_tile = sh_my_tile; - if (my_tile * N_TILE >= my_n_elements) { - break; - } + } + if (crosses_edge) { + x1 = int(ceil(my_right_edge * SX)); + } - for (uint i = 0; i < N_SLICE; i++) { - bitmaps[i][gl_LocalInvocationID.x] = 0; - } - barrier(); - - // Read inputs and determine coverage of bins - uint element_ix = my_tile * N_TILE + gl_LocalInvocationID.x; - AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); - uint tag = Annotated_Nop; - if (element_ix < my_n_elements) { - tag = Annotated_tag(ref); - } - int x0 = 0, y0 = 0, x1 = 0, y1 = 0; - float my_right_edge = INFINITY; - bool crosses_edge = false; - switch (tag) { - case Annotated_FillLine: - case Annotated_StrokeLine: - AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref); - x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX)); - y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY)); - x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX)); - y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY)); - crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY); - break; - case Annotated_Fill: - case Annotated_Stroke: - // Note: we take advantage of the fact that fills and strokes - // have compatible layout. - AnnoFill fill = Annotated_Fill_read(ref); - x0 = int(floor(fill.bbox.x * SX)); - y0 = int(floor(fill.bbox.y * SY)); - x1 = int(ceil(fill.bbox.z * SX)); - y1 = int(ceil(fill.bbox.w * SY)); - // It probably makes more sense to track x1, to avoid having to redo - // the rounding to tile coords. - my_right_edge = fill.bbox.z; - break; - } - - // If the last element in this partition is a fill edge, then we need to do a - // look-forward to find the right edge of its corresponding fill. That data is - // recorded in aggregates computed in the element processing pass. - if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) { - uint aggregate_ix = (my_tile + 1) * ELEMENT_BINNING_RATIO; - // This is sequential but the expectation is that the amount of - // look-forward is small (performance may degrade in the case - // of massively complex paths). - do { - my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]); - aggregate_ix++; - } while (isinf(my_right_edge)); - } - - // Now propagate right_edge backward, from fill to segment. - for (uint i = 0; i < LG_N_TILE; i++) { - // Note: we could try to cut down on write bandwidth here if the value hasn't - // changed, but not sure it's worth the complexity to track. - sh_right_edge[gl_LocalInvocationID.x] = my_right_edge; - barrier(); - if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) { - my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)]; - } - barrier(); - } - if (crosses_edge) { - x1 = int(ceil(my_right_edge * SX)); - } - - // At this point, we run an iterator over the coverage area, - // trying to keep divergence low. 
- // Right now, it's just a bbox, but we'll get finer with - // segments. - x0 = clamp(x0, 0, N_TILE_X); - x1 = clamp(x1, x0, N_TILE_X); - y0 = clamp(y0, 0, N_TILE_Y); - y1 = clamp(y1, y0, N_TILE_Y); - if (x0 == x1) y1 = y0; - int x = x0, y = y0; - uint my_slice = gl_LocalInvocationID.x / 32; - uint my_mask = 1 << (gl_LocalInvocationID.x & 31); - while (y < y1) { - atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask); - x++; - if (x == x1) { - x = x0; - y++; - } - } - - barrier(); - // Allocate output segments. - uint element_count = 0; - for (uint i = 0; i < N_SLICE; i++) { - element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); - count[i][gl_LocalInvocationID.x] = element_count; - } - // element_count is number of elements covering bin for this invocation. - if (element_count != 0) { - uint chunk_end; - uint chunk_new_start; - // Refactor to reduce code duplication? - if (chunk_n > 0) { - uint next_chunk = chunk_ref.offset + BinChunk_size + chunk_n * BinInstance_size; - if (next_chunk + BinChunk_size + min(24, element_count * BinInstance_size) > wr_limit) { - uint alloc_amount = max(BIN_ALLOC, BinChunk_size + element_count * BinInstance_size); - // could try to reduce fragmentation if BIN_ALLOC is only a bit above needed - next_chunk = atomicAdd(alloc, alloc_amount); - wr_limit = next_chunk + alloc_amount; - } - BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(next_chunk))); - chunk_ref = BinChunkRef(next_chunk); - } - BinInstanceRef instance_ref = BinInstanceRef(chunk_ref.offset + BinChunk_size); - if (instance_ref.offset + element_count * BinInstance_size > wr_limit) { - chunk_end = wr_limit; - chunk_n = (wr_limit - instance_ref.offset) / BinInstance_size; - uint alloc_amount = max(BIN_ALLOC, BinChunk_size + (element_count - chunk_n) * BinInstance_size); - chunk_new_start = atomicAdd(alloc, alloc_amount); - wr_limit = chunk_new_start + alloc_amount; - BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(chunk_new_start))); - chunk_ref = BinChunkRef(chunk_new_start); - chunk_new_start += BinChunk_size; - chunk_n = element_count - chunk_n; - } else { - chunk_end = ~0; - chunk_new_start = ~0; - chunk_n = element_count; - } - sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset; - sh_chunk_end[gl_LocalInvocationID.x] = chunk_end; - sh_chunk_jump[gl_LocalInvocationID.x] = chunk_new_start - chunk_end; - } - - barrier(); - // Use similar strategy as Laine & Karras paper; loop over bbox of bins - // touched by this element - x = x0; - y = y0; - while (y < y1) { - uint bin_ix = y * N_TILE_X + x; - uint out_mask = bitmaps[my_slice][bin_ix]; - if ((out_mask & my_mask) != 0) { - uint idx = bitCount(out_mask & (my_mask - 1)); - if (my_slice > 0) { - idx += count[my_slice - 1][bin_ix]; - } - uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size; - if (out_offset >= sh_chunk_end[bin_ix]) { - out_offset += sh_chunk_jump[bin_ix]; - } - BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge)); - } - x++; - if (x == x1) { - x = x0; - y++; - } + // At this point, we run an iterator over the coverage area, + // trying to keep divergence low. + // Right now, it's just a bbox, but we'll get finer with + // segments. 
+ x0 = clamp(x0, 0, N_TILE_X); + x1 = clamp(x1, x0, N_TILE_X); + y0 = clamp(y0, 0, N_TILE_Y); + y1 = clamp(y1, y0, N_TILE_Y); + if (x0 == x1) y1 = y0; + int x = x0, y = y0; + uint my_slice = gl_LocalInvocationID.x / 32; + uint my_mask = 1 << (gl_LocalInvocationID.x & 31); + while (y < y1) { + atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask); + x++; + if (x == x1) { + x = x0; + y++; + } + } + + barrier(); + // Allocate output segments. + uint element_count = 0; + for (uint i = 0; i < N_SLICE; i++) { + element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]); + count[i][gl_LocalInvocationID.x] = element_count; + } + // element_count is number of elements covering bin for this invocation. + uint chunk_start = 0; + if (element_count != 0) { + // TODO: aggregate atomic adds (subgroup is probably fastest) + chunk_start = atomicAdd(alloc, element_count * BinInstance_size); + sh_chunk_start[gl_LocalInvocationID.x] = chunk_start; + } + // Note: it might be more efficient for reading to do this in the + // other order (each bin is a contiguous sequence of partitions) + uint out_ix = (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; + bins[out_ix] = element_count; + bins[out_ix + 1] = chunk_start; + + barrier(); + // Use similar strategy as Laine & Karras paper; loop over bbox of bins + // touched by this element + x = x0; + y = y0; + while (y < y1) { + uint bin_ix = y * N_TILE_X + x; + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0) { + uint idx = bitCount(out_mask & (my_mask - 1)); + if (my_slice > 0) { + idx += count[my_slice - 1][bin_ix]; + } + uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size; + BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge)); + } + x++; + if (x == x1) { + x = x0; + y++; } } - BinChunk_write(chunk_ref, BinChunk(chunk_n, BinChunkRef(0))); } diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index e932e4d..6ea0877 100644 Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index 1f73318..3656f77 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -16,6 +16,7 @@ layout(set = 0, binding = 1) buffer BinsBuf { }; layout(set = 0, binding = 2) buffer AllocBuf { + uint n_elements; uint alloc; }; @@ -27,19 +28,15 @@ layout(set = 0, binding = 3) buffer PtclBuf { #include "bins.h" #include "ptcl.h" -#define N_RINGBUF 512 +#define LG_N_PART_READ 8 +#define N_PART_READ (1 << LG_N_PART_READ) -shared uint sh_elements[N_RINGBUF]; -shared float sh_right_edge[N_RINGBUF]; -shared uint sh_chunk[N_WG]; -shared uint sh_chunk_next[N_WG]; -shared uint sh_chunk_n[N_WG]; -shared uint sh_min_buf; -// Some of these are kept in shared memory to ease register -// pressure, but it could go either way. -shared uint sh_first_el[N_WG]; -shared uint sh_selected_n; -shared uint sh_elements_ref; +shared uint sh_elements[N_TILE]; +shared float sh_right_edge[N_TILE]; + +// Number of elements in the partition; prefix sum. +shared uint sh_part_count[N_PART_READ]; +shared uint sh_part_elements[N_PART_READ]; shared uint sh_bitmaps[N_SLICE][N_TILE]; shared uint sh_backdrop[N_SLICE][N_TILE]; @@ -96,14 +93,16 @@ void main() { // Could use either linear or 2d layouts for both dispatch and // invocations within the workgroup. We'll use variables to abstract. 
uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x; + uint partition_ix = 0; + uint n_partitions = (n_elements + N_TILE - 1) / N_TILE; // Top left coordinates of this bin. vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y); uint th_ix = gl_LocalInvocationID.x; uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X; uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X; - uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x; - CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC); + uint this_tile_ix = tile_y * WIDTH_IN_TILES + tile_x; + CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC); uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; // Allocation and management of segment output @@ -113,18 +112,14 @@ void main() { SegmentRef last_chunk_segs = SegmentRef(0); alloc_chunk_remaining = 0; - uint wr_ix = 0; + // I'm sure we can figure out how to do this with at least one fewer register... + // Items up to rd_ix have been read from sh_elements uint rd_ix = 0; - uint first_el; - if (th_ix < N_WG) { - uint start_chunk = (bin_ix * N_WG + th_ix) * BIN_INITIAL_ALLOC; - sh_chunk[th_ix] = start_chunk; - BinChunk chunk = BinChunk_read(BinChunkRef(start_chunk)); - sh_chunk_n[th_ix] = chunk.n; - sh_chunk_next[th_ix] = chunk.next.offset; - sh_first_el[th_ix] = chunk.n > 0 ? - BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0; - } + // Items up to wr_ix have been written into sh_elements + uint wr_ix = 0; + // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements + uint part_start_ix = 0; + uint ready_ix = 0; if (th_ix < N_SLICE) { sh_bd_sign[th_ix] = 0; } @@ -138,56 +133,56 @@ void main() { sh_is_segment[th_ix] = 0; } - while (wr_ix - rd_ix <= N_TILE) { - // Choose segment with least element. - uint my_min; - if (th_ix < N_WG) { - if (th_ix == 0) { - sh_selected_n = 0; - sh_min_buf = ~0; + // parallel read of input partitions + do { + if (ready_ix == wr_ix && partition_ix < n_partitions) { + part_start_ix = ready_ix; + uint count = 0; + if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) { + uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2; + count = bins[in_ix]; + sh_part_elements[th_ix] = bins[in_ix + 1]; } + // prefix sum of counts + for (uint i = 0; i < LG_N_PART_READ; i++) { + if (th_ix < N_PART_READ) { + sh_part_count[th_ix] = count; + } + barrier(); + if (th_ix < N_PART_READ) { + if (th_ix >= (1 << i)) { + count += sh_part_count[th_ix - (1 << i)]; + } + } + barrier(); + } + if (th_ix < N_PART_READ) { + sh_part_count[th_ix] = part_start_ix + count; + } + barrier(); + ready_ix = sh_part_count[N_PART_READ - 1]; + partition_ix += N_PART_READ; } - barrier(); - // Tempting to do this with subgroups, but atomic should be good enough. 
- if (th_ix < N_WG) { - my_min = sh_first_el[th_ix]; - atomicMin(sh_min_buf, my_min); - } - barrier(); - if (th_ix < N_WG) { - if (my_min == sh_min_buf && my_min != ~0) { - sh_elements_ref = sh_chunk[th_ix] + BinChunk_size; - uint selected_n = sh_chunk_n[th_ix]; - sh_selected_n = selected_n; - uint next_chunk = sh_chunk_next[th_ix]; - if (next_chunk == 0) { - sh_first_el[th_ix] = ~0; - } else { - sh_chunk[th_ix] = next_chunk; - BinChunk chunk = BinChunk_read(BinChunkRef(next_chunk)); - sh_chunk_n[th_ix] = chunk.n; - sh_chunk_next[th_ix] = chunk.next.offset; - sh_first_el[th_ix] = BinInstance_read( - BinInstanceRef(next_chunk + BinChunk_size)).element_ix; + // use binary search to find element to read + uint ix = rd_ix + th_ix; + if (ix >= wr_ix && ix < ready_ix) { + uint part_ix = 0; + for (uint i = 0; i < LG_N_PART_READ; i++) { + uint probe = part_ix + ((N_PART_READ / 2) >> i); + if (ix >= sh_part_count[probe - 1]) { + part_ix = probe; } } + ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix; + BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]); + BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix)); + sh_elements[th_ix] = inst.element_ix; + sh_right_edge[th_ix] = inst.right_edge; } barrier(); - uint chunk_n = sh_selected_n; - if (chunk_n == 0) { - // All chunks consumed - break; - } - BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref); - if (th_ix < chunk_n) { - BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, th_ix)); - uint wr_el_ix = (wr_ix + th_ix) % N_RINGBUF; - sh_elements[wr_el_ix] = inst.element_ix; - sh_right_edge[wr_el_ix] = inst.right_edge; - } - wr_ix += chunk_n; - } - barrier(); + + wr_ix = min(rd_ix + N_TILE, ready_ix); + } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions)); // We've done the merge and filled the buffer. @@ -196,9 +191,8 @@ void main() { AnnotatedRef ref; float right_edge = 0.0; if (th_ix + rd_ix < wr_ix) { - uint rd_el_ix = (rd_ix + th_ix) % N_RINGBUF; - uint element_ix = sh_elements[rd_el_ix]; - right_edge = sh_right_edge[rd_el_ix]; + uint element_ix = sh_elements[th_ix]; + right_edge = sh_right_edge[th_ix]; ref = AnnotatedRef(element_ix * Annotated_size); tag = Annotated_tag(ref); } @@ -356,7 +350,7 @@ void main() { } } uint out_offset = seg_alloc + Segment_size * ix + SegChunk_size; - uint rd_el_ix = (rd_ix + slice_ix * 32 + bit_ix) % N_RINGBUF; + uint rd_el_ix = slice_ix * 32 + bit_ix; uint element_ix = sh_elements[rd_el_ix]; ref = AnnotatedRef(element_ix * Annotated_size); AnnoFillLineSeg line = Annotated_FillLine_read(ref); @@ -408,7 +402,7 @@ void main() { } } uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap); - uint element_ix = sh_elements[(rd_ix + element_ref_ix) % N_RINGBUF]; + uint element_ix = sh_elements[element_ref_ix]; // Bits up to and including the lsb uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap; @@ -526,9 +520,7 @@ void main() { barrier(); rd_ix += N_TILE; - // The second disjunct is there as a strange workaround on Nvidia. If it is - // removed, then the kernel fails with ERROR_DEVICE_LOST. 
- if (rd_ix >= wr_ix || bin_ix == ~0) break; + if (rd_ix >= ready_ix && partition_ix >= n_partitions) break; } Cmd_End_write(cmd_ref); } diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index d951b24..5a43f4a 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 65bbe5c..4652c5a 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -176,12 +176,12 @@ impl Renderer { let bin_alloc_buf_dev = device.create_buffer(12, dev)?; // TODO: constants - let bin_alloc_start = 256 * 64 * N_WG; + let bin_alloc_start = ((n_elements + 255) & !255) * 8; device .write_buffer(&bin_alloc_buf_host, &[ n_elements as u32, 0, - bin_alloc_start, + bin_alloc_start as u32, ]) ?; let bin_code = include_bytes!("../shader/binning.spv"); @@ -192,12 +192,13 @@ impl Renderer { &[], )?; - let coarse_alloc_buf_host = device.create_buffer(4, host)?; - let coarse_alloc_buf_dev = device.create_buffer(4, dev)?; + let coarse_alloc_buf_host = device.create_buffer(8, host)?; + let coarse_alloc_buf_dev = device.create_buffer(8, dev)?; let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC; device .write_buffer(&coarse_alloc_buf_host, &[ + n_elements as u32, coarse_alloc_start as u32, ]) ?; @@ -264,7 +265,7 @@ impl Renderer { cmd_buf.dispatch( &self.bin_pipeline, &self.bin_ds, - (N_WG, 1, 1), + (((self.n_elements + 255) / 256) as u32, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 2); cmd_buf.memory_barrier();