// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The second dispatch of clip stack processing.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

// Workgroup size is a power of two; each workgroup handles one partition
// of PARTITION_SIZE clip inputs (one input per invocation).
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE WG_SIZE

layout(local_size_x = WG_SIZE) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "annotated.h"

// Some of this is cut'n'paste duplication with the reduce pass, and
// arguably should be moved to a common .h file.

// One element of the clip stack, as read back by load_clip_el().
struct ClipEl {
    // index of parent node
    uint parent_ix;
    // bounding box
    vec4 bbox;
};

// The bicyclic monoid.
// main() encodes a push (BeginClip) input as Bic(0, 1) and any other input
// as Bic(1, 0), so after combining a run of inputs `a` counts the pops left
// unmatched by the run and `b` counts the pushes left unmatched by it.
struct Bic {
    uint a;
    uint b;
};

// Combine two adjacent runs (x first, then y). min(x.b, y.a) entries of x.b
// cancel against the same number of entries of y.a; what remains is summed.
Bic bic_combine(Bic x, Bic y) {
    uint m = min(x.b, y.a);
    return Bic(x.a + y.a - m, x.b + y.b - m);
}

// Load a path's bbox from the bbox buffer (as written by pathseg).
// Entries are 6 words apart; only the first four words are read here. Each
// word is converted to float and offset by -32768.0.
// Returns (left, top, right, bottom).
vec4 load_path_bbox(uint path_ix) {
    uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix;
    float bbox_l = float(memory[base]) - 32768.0;
    float bbox_t = float(memory[base + 1]) - 32768.0;
    float bbox_r = float(memory[base + 2]) - 32768.0;
    float bbox_b = float(memory[base + 3]) - 32768.0;
    vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
    return bbox;
}

// Intersection of two boxes stored as (x0, y0, x1, y1): component-wise max
// of the xy corners and component-wise min of the zw corners.
vec4 bbox_intersect(vec4 a, vec4 b) {
    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
}

// Scratch for Bic values. main() first uses indices [0, WG_SIZE) for a scan,
// then reuses the array as a binary tree of reductions: WG_SIZE leaves at
// index 0, followed by successively halved levels (WG_SIZE * 2 - 2 in total).
shared Bic sh_bic[WG_SIZE * 2 - 2];
// Parent indices and scanned bboxes of the stack elements materialized from
// earlier partitions.
shared uint sh_stack[PARTITION_SIZE];
shared vec4 sh_stack_bbox[PARTITION_SIZE];
// Per-invocation predecessor link and bbox for the current partition.
shared uint sh_link[PARTITION_SIZE];
shared vec4 sh_bbox[PARTITION_SIZE];

// This is adapted directly from the stack monoid impl.
// Return value is reference within partition if >= 0,
// otherwise reference to stack.
uint search_link(inout Bic bic) {
    // Walks the reduction tree in sh_bic, starting at this invocation's
    // leaf. `bic` is the running reduction (main() passes Bic(0, 0)) and is
    // updated in place with every subtree that gets absorbed.
    uint ix = gl_LocalInvocationID.x;
    uint j = 0;
    // Upward phase: at each level j, try to absorb the left sibling subtree
    // as long as the combined reduction still has b == 0.
    while (j < LG_WG_SIZE) {
        // Start of level j in sh_bic (level 0 starts at index 0).
        uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
        if (((ix >> j) & 1) != 0) {
            Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
            if (test.b > 0) {
                break;
            }
            bic = test;
            ix -= 1u << j;
        }
        j++;
    }
    // Downward phase: refine the position inside the subtree that made
    // b become nonzero.
    if (ix > 0) {
        while (j > 0) {
            j--;
            uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
            Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
            if (test.b == 0) {
                bic = test;
                ix -= 1u << j;
            }
        }
    }
    // ix is the smallest value such that reduce(ix..th).b == 0
    if (ix > 0) {
        return ix - 1;
    } else {
        // Not found in this partition: encode as a negative number (when
        // read as int) that depends on bic.a.
        return ~0u - bic.a;
    }
}

// Load the Bic stored for partition `ix` (2 words per entry).
Bic load_bic(uint ix) {
    uint base = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
    return Bic(memory[base], memory[base + 1]);
}

// Load clip stack element `ix` (5 words per entry: the parent index followed
// by the four bbox floats stored as raw bits).
ClipEl load_clip_el(uint ix) {
    uint base = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
    uint parent_ix = memory[base];
    float x0 = uintBitsToFloat(memory[base + 1]);
    float y0 = uintBitsToFloat(memory[base + 2]);
    float x1 = uintBitsToFloat(memory[base + 3]);
    float y1 = uintBitsToFloat(memory[base + 4]);
    vec4 bbox = vec4(x0, y0, x1, y1);
    return ClipEl(parent_ix, bbox);
}

// Load clip input `ix`. Out-of-range indices yield 0x80000000, which main()
// treats as a non-push input because it is negative when read as int.
uint load_path_ix(uint ix) {
    // This is one approach to a partial final block. Another would be
    // to do a memset to the padding in the command queue.
    if (ix < conf.n_clip) {
        return memory[(conf.clip_alloc.offset >> 2) + ix];
    } else {
        // EndClip tags don't implicate further loads.
        return 0x80000000;
    }
}

// Store the bbox for clip input `ix` (4 words per entry, floats as raw bits).
void store_clip_bbox(uint ix, vec4 bbox) {
    uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix;
    memory[base] = floatBitsToUint(bbox.x);
    memory[base + 1] = floatBitsToUint(bbox.y);
    memory[base + 2] = floatBitsToUint(bbox.z);
    memory[base + 3] = floatBitsToUint(bbox.w);
}

// A box so large that bbox_intersect() with it returns the other operand
// unchanged (for any box within +/-1e9). Used wherever there is no clip.
const vec4 UNBOUNDED_BBOX = vec4(-1e9, -1e9, 1e9, 1e9);

void main() {
    // materialize stack up to the start of this partition. This
    // is based on the pure stack monoid, but with two additions.

    // First, (this only matters if the stack goes deeper than the
    // partition size, which might be unlikely in practice), the
    // topmost stack element from each partition is picked, then an
    // exclusive scan of those. Also note that if this is skipped,
    // a scan is not needed in the reduce stage.

    // Second, after the stream compaction, do a scan of the retrieved
    // bbox values.
    uint th = gl_LocalInvocationID.x;
    Bic bic = Bic(0, 0);
    // Only partitions before this workgroup's own contribute to the stack.
    if (th < gl_WorkGroupID.x) {
        bic = load_bic(th);
    }
    sh_bic[th] = bic;
    // Scan so that sh_bic[th] becomes the reduction of entries th..WG_SIZE-1.
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (th + (1u << i) < WG_SIZE) {
            Bic other = sh_bic[th + (1u << i)];
            bic = bic_combine(bic, other);
        }
        barrier();
        sh_bic[th] = bic;
    }
    barrier();
    uint stack_size = sh_bic[0].b;

    // TODO: do bbox scan here (to unlock greater stack depth)

    // binary search in stack
    uint sp = PARTITION_SIZE - 1 - th;
    uint ix = 0;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        uint probe = ix + (uint(PARTITION_SIZE / 2) >> i);
        if (sp < sh_bic[probe].b) {
            ix = probe;
        }
    }
    // ix is largest value such that sp < sh_bic[ix].b (if any)
    uint b = sh_bic[ix].b;
    vec4 bbox = UNBOUNDED_BBOX;
    if (sp < b) {
        // maybe store the index here for future use?
        ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);
        sh_stack[th] = el.parent_ix;
        bbox = el.bbox;
        // other element values here?
    }

    // forward scan of bbox values of prefix stack
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        sh_stack_bbox[th] = bbox;
        barrier();
        if (th >= (1u << i)) {
            bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox);
        }
        barrier();
    }
    sh_stack_bbox[th] = bbox;

    // Read input and compute bicyclic semigroup binary tree
    uint inp = load_path_ix(gl_GlobalInvocationID.x);
    // Inputs with the top bit clear are pushes; the rest are pops/padding.
    bool is_push = int(inp) >= 0;
    bic = Bic(1 - uint(is_push), uint(is_push));
    sh_bic[th] = bic;
    if (is_push) {
        bbox = load_path_bbox(inp);
    } else {
        bbox = UNBOUNDED_BBOX;
    }
    // Build the upper levels of the tree; each level combines pairs from
    // the level below it.
    uint inbase = 0;
    for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
        uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
        barrier();
        if (th < (1u << (LG_WG_SIZE - 1 - i))) {
            sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);
        }
        inbase = outbase;
    }
    barrier();
    // Search for predecessor node
    bic = Bic(0, 0);
    uint link = search_link(bic);
    // we use N_SEQ > 1 convention here:
    // link >= 0 is index within partition
    // link < 0 is reference to stack

    // We want grandparent bbox for pop nodes, so follow those links.
    sh_link[th] = link;
    barrier();
    uint grandparent;
    if (int(link) >= 0) {
        grandparent = sh_link[link];
    } else {
        grandparent = link - 1;
    }

    // Resolve parent
    uint parent;
    if (int(link) >= 0) {
        parent = gl_WorkGroupID.x * PARTITION_SIZE + link;
    } else if (int(link + stack_size) >= 0) {
        parent = sh_stack[PARTITION_SIZE + link];
    } else {
        // Reference falls below the materialized stack: no parent.
        parent = ~0u;
    }

    // bbox scan along parent links
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        // sh_link was already stored for first iteration
        if (i != 0) {
            sh_link[th] = link;
        }
        sh_bbox[th] = bbox;
        barrier();
        if (int(link) >= 0) {
            bbox = bbox_intersect(sh_bbox[link], bbox);
            link = sh_link[link];
        }
        barrier();
    }
    if (int(link + stack_size) >= 0) {
        bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);
    }
    // At this point, bbox is the reduction of bounding boxes along the tree.
    sh_bbox[th] = bbox;
    barrier();

    uint path_ix = inp;
    if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) {
        // Is this load expensive? If so, it's loaded earlier for in-partition
        // and is in the ClipEl for cross-partition.
        // If not, can probably get rid of it in the stack intermediate buf.
        path_ix = load_path_ix(parent);
        uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 2 * ~inp;
        // Fix up drawmonoid so path_ix at EndClip matches BeginClip
        memory[drawmonoid_out_base] = path_ix;

        if (int(grandparent) >= 0) {
            bbox = sh_bbox[grandparent];
        } else if (int(grandparent + stack_size) >= 0) {
            bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];
        } else {
            bbox = UNBOUNDED_BBOX;
        }
    }

    store_clip_bbox(gl_GlobalInvocationID.x, bbox);
}