// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The binning stage of the pipeline.
//
// Each workgroup processes N_TILE paths.
// Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask
// based on the path bounding box to bin the paths.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

layout(local_size_x = N_TILE, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "bins.h"
#include "drawtag.h"

// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))

// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
#define INFINITY (1.0 / 0.0)

// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared uint sh_chunk_offset[N_TILE];

// Read the DrawMonoid for `element_ix`: four consecutive words starting at
// the draw monoid allocation, in the order path_ix, clip_ix, scene_offset,
// info_offset.
DrawMonoid load_draw_monoid(uint element_ix) {
    uint word_ix = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix;
    return DrawMonoid(
        memory[word_ix],
        memory[word_ix + 1],
        memory[word_ix + 2],
        memory[word_ix + 3]
    );
}

// Load bounding box computed by clip processing.
// The four words are reinterpreted bitwise as floats (x0, y0, x1, y1).
vec4 load_clip_bbox(uint clip_ix) {
    uint word_ix = (conf.clip_bbox_alloc.offset >> 2) + 4 * clip_ix;
    uvec4 raw = uvec4(
        memory[word_ix],
        memory[word_ix + 1],
        memory[word_ix + 2],
        memory[word_ix + 3]
    );
    return uintBitsToFloat(raw);
}

// Intersection of two boxes stored as (x0, y0, x1, y1): component-wise max of
// the first corners and min of the second corners. The result may have
// negative size when the inputs do not overlap.
vec4 bbox_intersect(vec4 a, vec4 b) {
    vec2 corner_min = max(a.xy, b.xy);
    vec2 corner_max = min(a.zw, b.zw);
    return vec4(corner_min, corner_max);
}

// Load path's bbox from bbox (as written by pathseg).
vec4 load_path_bbox(uint path_ix) {
    // Records are spaced 6 words apart; only the first four words are read here.
    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
    // Each coordinate is read as an unsigned integer and shifted down by 32768.
    float bbox_l = float(memory[base]) - 32768.0;
    float bbox_t = float(memory[base + 1]) - 32768.0;
    float bbox_r = float(memory[base + 2]) - 32768.0;
    float bbox_b = float(memory[base + 3]) - 32768.0;
    vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
    return bbox;
}

// Write `bbox` for draw object `draw_ix` as four float bit patterns
// (x, y, z, w) into the draw bbox allocation.
void store_draw_bbox(uint draw_ix, vec4 bbox) {
    uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
    memory[base] = floatBitsToUint(bbox.x);
    memory[base + 1] = floatBitsToUint(bbox.y);
    memory[base + 2] = floatBitsToUint(bbox.z);
    memory[base + 3] = floatBitsToUint(bbox.w);
}

// Entry point. Each invocation plays two roles:
//   * as an element: it loads one draw object, computes its clipped bbox and
//     marks the bins that bbox touches in the shared bitmaps;
//   * as a bin: invocation i owns column i of `bitmaps`/`count` and allocates
//     the output chunk for bin i.
void main() {
    uint my_partition = gl_WorkGroupID.x;

    // Clear this invocation's column of the shared bitmaps.
    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }
    // The atomicOr loop below writes to columns owned by *other* invocations.
    // Every column must be cleared before any bit is set, otherwise a late
    // clear can erase bits that another invocation has already written.
    barrier();

    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    if (element_ix < conf.n_elements) {
        DrawMonoid draw_monoid = load_draw_monoid(element_ix);
        uint path_ix = draw_monoid.path_ix;
        // Default clip is effectively unbounded; clip_ix == 0 means "no clip".
        vec4 clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9);
        uint clip_ix = draw_monoid.clip_ix;
        if (clip_ix > 0) {
            clip_bbox = load_clip_bbox(clip_ix - 1);
        }
        // For clip elements, clip_bbox is the bbox of the clip path, intersected
        // with enclosing clips.
        // For other elements, it is the bbox of the enclosing clips.

        vec4 path_bbox = load_path_bbox(path_ix);
        vec4 bbox = bbox_intersect(path_bbox, clip_bbox);
        // Avoid negative-size bbox (is this necessary)?
        bbox.zw = max(bbox.xy, bbox.zw);
        // Store clip-intersected bbox for tile_alloc.
        store_draw_bbox(element_ix, bbox);
        // Convert to bin coordinates, rounding outwards.
        x0 = int(floor(bbox.x * SX));
        y0 = int(floor(bbox.y * SY));
        x1 = int(ceil(bbox.z * SX));
        y1 = int(ceil(bbox.w * SY));
    }

    // At this point, we run an iterator over the coverage area,
    // trying to keep divergence low.
    // Right now, it's just a bbox, but we'll get finer with
    // segments.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1) / N_TILE_Y;
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
    y1 = clamp(y1, y0, int(height_in_bins));
    // An empty x range means no coverage; collapse the y range too so the
    // loops below do not execute (they only test y < y1).
    if (x0 == x1) y1 = y0;
    int x = x0, y = y0;
    // Each element owns one bit: word `my_slice`, bit position `local id & 31`.
    uint my_slice = gl_LocalInvocationID.x / 32;
    uint my_mask = 1u << (gl_LocalInvocationID.x & 31);
    while (y < y1) {
        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }

    barrier();
    // Allocate output segments.
    // count[i][bin] becomes the inclusive running total of set bits over
    // slices 0..i for the bin owned by this invocation.
    uint element_count = 0;
    for (uint i = 0; i < N_SLICE; i++) {
        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        count[i][gl_LocalInvocationID.x] = element_count;
    }
    // element_count is number of elements covering bin for this invocation.
    uint chunk_offset = 0;
    if (element_count != 0) {
        chunk_offset = malloc_stage(element_count * BinInstance_size, conf.mem_size, STAGE_BINNING);
        // Only written for non-empty bins; empty bins are never looked up
        // below because no element has a bit set for them.
        sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
    write_mem(conf.bin_alloc, out_ix, element_count);
    write_mem(conf.bin_alloc, out_ix + 1, chunk_offset);

    barrier();

    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
    // touched by this element
    x = x0;
    y = y0;
    while (y < y1) {
        uint bin_ix = y * width_in_bins + x;
        uint out_mask = bitmaps[my_slice][bin_ix];
        if ((out_mask & my_mask) != 0) {
            // Slot within the bin's chunk: set bits below ours in this slice,
            // plus the running total of all earlier slices.
            uint idx = bitCount(out_mask & (my_mask - 1));
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
            uint chunk_offset = sh_chunk_offset[bin_ix];
            // Skip the write when the bin's allocation failed.
            if (chunk_offset != MALLOC_FAILED) {
                memory[(chunk_offset >> 2) + idx] = element_ix;
            }
        }
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
}