// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Processing of the path stream, after the tag scan.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"
#include "pathtag.h"

#define N_SEQ 4
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_SEQ)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

#include "tile.h"
#include "pathseg.h"
#include "scene.h"

layout(binding = 3) readonly buffer ParentBuf {
    TagMonoid[] parent;
};

// Element of the bounding-box scan: an (x0, y0, x1, y1) box plus flag bits.
struct Monoid {
    vec4 bbox;
    uint flags;
};

#define FLAG_RESET_BBOX 1
#define FLAG_SET_BBOX 2

// Associative combine for the bbox scan; `a` precedes `b` in sequence order.
//
// The result starts from b's box. Provided `a` does not carry RESET_BBOX:
//   - if b's box is empty (no positive extent on either axis), a's box is
//     carried through unchanged;
//   - otherwise, if `b` does not carry SET_BBOX and a's box is non-empty,
//     the result is the union of both boxes.
// Flags: b's flags, plus a's SET_BBOX bit, plus a's RESET_BBOX bit shifted
// up by one so that it lands on the SET_BBOX bit.
Monoid combine_monoid(Monoid a, Monoid b) {
    bool a_resets = (a.flags & FLAG_RESET_BBOX) != 0;
    bool b_sets = (b.flags & FLAG_SET_BBOX) != 0;
    bool b_is_empty = b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y;
    bool a_has_area = a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y;

    vec4 merged = b.bbox;
    if (!a_resets) {
        // TODO: I think this should be gated on b & SET_BBOX == false also.
        if (b_is_empty) {
            merged = a.bbox;
        } else if (!b_sets && a_has_area) {
            merged = vec4(min(a.bbox.xy, b.bbox.xy), max(a.bbox.zw, b.bbox.zw));
        }
    }

    uint carried_set = a.flags & FLAG_SET_BBOX;
    uint promoted_reset = (a.flags & FLAG_RESET_BBOX) << 1;
    return Monoid(merged, carried_set | b.flags | promoted_reset);
}

// Identity element used to seed the scan: all-zero box, no flags.
Monoid monoid_identity() {
    return Monoid(vec4(0.0), 0u);
}

// Workgroup scratch for the two scans. The two arrays are never live at the
// same time; a very smart shader compiler could overlap them, but many
// probably won't.
shared TagMonoid sh_tag[WG_SIZE];
shared Monoid sh_scratch[WG_SIZE];

// Read a point stored as two consecutive 32-bit floats at scene[ix], scene[ix + 1].
vec2 read_f32_point(uint ix) {
    return vec2(uintBitsToFloat(scene[ix]), uintBitsToFloat(scene[ix + 1]));
}

// Read a point packed into a single word at scene[ix] as two signed 16-bit
// integers: x in the low half, y in the high half.
vec2 read_i16_point(uint ix) {
    uint packed = scene[ix];
    // Shift the low half up, then arithmetic-shift back down to sign-extend.
    int lo = int(packed << 16) >> 16;
    int hi = int(packed) >> 16;
    return vec2(float(lo), float(hi));
}

// Note: these are 16 bit, which is adequate, but we could use 32 bits.
// Round down, add the 32768 bias, and saturate to the 16-bit range [0, 65535].
// Both ends are clamped so the stored value always fits in 16 bits, matching
// the 65535 ceiling that round_up applies.
uint round_down(float x) {
    return uint(clamp(floor(x) + 32768.0, 0.0, 65535.0));
}

// Round up, add the 32768 bias, and saturate to the 16-bit range [0, 65535].
// The lower clamp matters: converting a negative float to uint is undefined
// in GLSL, which a sufficiently negative coordinate would otherwise trigger.
uint round_up(float x) {
    return uint(clamp(ceil(x) + 32768.0, 0.0, 65535.0));
}

// Entry point. Each invocation handles N_SEQ consecutive tag bytes (one tag
// word) of the path stream:
//   1. scan the tag monoid across the workgroup to find where this
//      invocation's data starts in the scene buffer;
//   2. decode each path segment, transform it, raise it to a cubic, and
//      write it out, recording a per-element bbox monoid;
//   3. scan the bbox monoid across the workgroup;
//   4. write per-path bounding boxes (plus linewidth and transform index),
//      using atomic min/max where the box may be completed by another
//      invocation or workgroup.
void main() {
    Monoid local[N_SEQ];
    float linewidth[N_SEQ];
    uint save_trans_ix[N_SEQ];

    uint ix = gl_GlobalInvocationID.x * N_SEQ;

    // One 32-bit tag word holds the four tag bytes for this invocation.
    uint tag_word = scene[(conf.pathtag_offset >> 2) + (ix >> 2)];

    // Scan the tag monoid
    TagMonoid local_tm = reduce_tag(tag_word);
    sh_tag[gl_LocalInvocationID.x] = local_tm;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i)) {
            TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)];
            local_tm = combine_tag_monoid(other, local_tm);
        }
        // All reads of this round must finish before anyone overwrites.
        barrier();
        sh_tag[gl_LocalInvocationID.x] = local_tm;
    }
    barrier();
    // sh_tag is now the partition-wide inclusive scan of the tag monoid.
    TagMonoid tm = tag_monoid_identity();
    if (gl_WorkGroupID.x > 0) {
        tm = parent[gl_WorkGroupID.x - 1];
    }
    if (gl_LocalInvocationID.x > 0) {
        tm = combine_tag_monoid(tm, sh_tag[gl_LocalInvocationID.x - 1]);
    }
    // tm is now the full exclusive scan of the tag monoid.

    // Indices to scene buffer in u32 units.
    uint ps_ix = (conf.pathseg_offset >> 2) + tm.pathseg_offset;
    uint lw_ix = (conf.linewidth_offset >> 2) + tm.linewidth_ix;
    uint save_path_ix = tm.path_ix;
    uint trans_ix = tm.trans_ix;
    TransformRef trans_ref = TransformRef(conf.trans_offset + trans_ix * Transform_size);
    PathSegRef ps_ref = PathSegRef(conf.pathseg_alloc.offset + tm.pathseg_ix * PathSeg_size);
    for (uint i = 0; i < N_SEQ; i++) {
        // Snapshot the linewidth and transform index in effect at element i;
        // they are written alongside the bbox in the final phase.
        linewidth[i] = uintBitsToFloat(scene[lw_ix]);
        save_trans_ix[i] = trans_ix;
        // if N_SEQ > 4, need to load tag_word from local if N_SEQ % 4 == 0
        uint tag_byte = tag_word >> (i * 8);
        uint seg_type = tag_byte & 3;
        if (seg_type != 0) {
            // 1 = line, 2 = quad, 3 = cubic
            // Unpack path segment from input
            vec2 p0;
            vec2 p1;
            vec2 p2;
            vec2 p3;
            if ((tag_byte & 8) != 0) {
                // 32 bit encoding
                p0 = read_f32_point(ps_ix);
                p1 = read_f32_point(ps_ix + 2);
                if (seg_type >= 2) {
                    p2 = read_f32_point(ps_ix + 4);
                    if (seg_type == 3) {
                        p3 = read_f32_point(ps_ix + 6);
                    }
                }
            } else {
                // 16 bit encoding
                p0 = read_i16_point(ps_ix);
                p1 = read_i16_point(ps_ix + 1);
                if (seg_type >= 2) {
                    p2 = read_i16_point(ps_ix + 2);
                    if (seg_type == 3) {
                        p3 = read_i16_point(ps_ix + 3);
                    }
                }
            }
            Transform transform = Transform_read(trans_ref);
            p0 = transform.mat.xy * p0.x + transform.mat.zw * p0.y + transform.translate;
            p1 = transform.mat.xy * p1.x + transform.mat.zw * p1.y + transform.translate;
            vec4 bbox = vec4(min(p0, p1), max(p0, p1));
            // Degree-raise and compute bbox
            if (seg_type >= 2) {
                p2 = transform.mat.xy * p2.x + transform.mat.zw * p2.y + transform.translate;
                bbox.xy = min(bbox.xy, p2);
                bbox.zw = max(bbox.zw, p2);
                if (seg_type == 3) {
                    p3 = transform.mat.xy * p3.x + transform.mat.zw * p3.y + transform.translate;
                    bbox.xy = min(bbox.xy, p3);
                    bbox.zw = max(bbox.zw, p3);
                } else {
                    // Quad -> cubic. Order matters: p2 must be computed from
                    // the original p1 before p1 is overwritten.
                    p3 = p2;
                    p2 = mix(p1, p2, 1.0 / 3.0);
                    p1 = mix(p1, p0, 1.0 / 3.0);
                }
            } else {
                // Line -> cubic: control points at 1/3 and 2/3 along the line.
                p3 = p1;
                p2 = mix(p3, p0, 1.0 / 3.0);
                p1 = mix(p0, p3, 1.0 / 3.0);
            }
            vec2 stroke = vec2(0.0, 0.0);
            if (linewidth[i] >= 0.0) {
                // Non-negative linewidth means stroke: grow the bbox by the
                // half-width as transformed along each axis.
                // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
                stroke = 0.5 * linewidth[i] * vec2(length(transform.mat.xz), length(transform.mat.yw));
                bbox += vec4(-stroke, stroke);
            }
            local[i].bbox = bbox;
            local[i].flags = 0;

            // Write path segment to output
            PathCubic cubic;
            cubic.p0 = p0;
            cubic.p1 = p1;
            cubic.p2 = p2;
            cubic.p3 = p3;
            cubic.path_ix = tm.path_ix;
            // Not needed, TODO remove from struct
            cubic.trans_ix = gl_GlobalInvocationID.x * N_SEQ + i;
            cubic.stroke = stroke;
            uint fill_mode = uint(linewidth[i] >= 0.0);
            PathSeg_Cubic_write(conf.pathseg_alloc, ps_ref, fill_mode, cubic);
            ps_ref.offset += PathSeg_size;
            // Advance past the points this segment consumed: tag bit 2 adds
            // one point, and the 32-bit encoding (tag bit 3) takes two words
            // per point instead of one.
            uint n_points = (tag_byte & 3) + ((tag_byte >> 2) & 1);
            uint n_words = n_points + (n_points & (((tag_byte >> 3) & 1) * 15));
            ps_ix += n_words;
        } else {
            // Not a segment: an empty bbox plus whatever state changes the
            // tag byte signals (path marker, transform, linewidth).
            local[i].bbox = vec4(0.0, 0.0, 0.0, 0.0);
            // These shifts need to be kept in sync with setup.h
            uint is_path = (tag_byte >> 4) & 1;
            // Relies on the fact that RESET_BBOX == 1
            local[i].flags = is_path;
            tm.path_ix += is_path;
            trans_ix += (tag_byte >> 5) & 1;
            trans_ref.offset += ((tag_byte >> 5) & 1) * Transform_size;
            lw_ix += (tag_byte >> 6) & 1;
        }
    }

    // Partition-wide monoid scan for bbox monoid
    Monoid agg = local[0];
    for (uint i = 1; i < N_SEQ; i++) {
        // Note: this could be fused with the map above, but probably
        // a thin performance gain not worth the complexity.
        agg = combine_monoid(agg, local[i]);
        local[i] = agg;
    }
    // local is N_SEQ sub-partition inclusive scan of bbox monoid.
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i)) {
            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
            agg = combine_monoid(other, agg);
        }
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }
    // sh_scratch is the partition-wide inclusive scan of the bbox monoid,
    // sampled at the end of the N_SEQ sub-partition.
    barrier();

    // Each path owns six words in the bbox area: x0, y0, x1, y1 (biased
    // integers), linewidth bits, and transform index.
    uint path_ix = save_path_ix;
    uint bbox_out_ix = (conf.path_bbox_alloc.offset >> 2) + path_ix * 6;
    // Write bboxes to paths; do atomic min/max if partial
    Monoid row = monoid_identity();
    if (gl_LocalInvocationID.x > 0) {
        row = sh_scratch[gl_LocalInvocationID.x - 1];
    }
    for (uint i = 0; i < N_SEQ; i++) {
        Monoid m = combine_monoid(row, local[i]);
        // m is partition-wide inclusive scan of bbox monoid.
        bool do_atomic = false;
        if (i == N_SEQ - 1 && gl_LocalInvocationID.x == WG_SIZE - 1) {
            // last element
            do_atomic = true;
        }
        if ((m.flags & FLAG_RESET_BBOX) != 0) {
            // Path marker: record the linewidth and transform in effect.
            memory[bbox_out_ix + 4] = floatBitsToUint(linewidth[i]);
            memory[bbox_out_ix + 5] = save_trans_ix[i];
            if ((m.flags & FLAG_SET_BBOX) == 0) {
                // No SET_BBOX in the scanned prefix, so the box here may be
                // partial and has to be merged atomically.
                do_atomic = true;
            } else {
                // The scanned box is complete for this path; store directly.
                memory[bbox_out_ix] = round_down(m.bbox.x);
                memory[bbox_out_ix + 1] = round_down(m.bbox.y);
                memory[bbox_out_ix + 2] = round_up(m.bbox.z);
                memory[bbox_out_ix + 3] = round_up(m.bbox.w);
                bbox_out_ix += 6;
                do_atomic = false;
            }
        }
        if (do_atomic) {
            // Skip empty boxes so they cannot disturb the min/max.
            if (m.bbox.z > m.bbox.x || m.bbox.w > m.bbox.y) {
                // atomic min/max
                atomicMin(memory[bbox_out_ix], round_down(m.bbox.x));
                atomicMin(memory[bbox_out_ix + 1], round_down(m.bbox.y));
                atomicMax(memory[bbox_out_ix + 2], round_up(m.bbox.z));
                atomicMax(memory[bbox_out_ix + 3], round_up(m.bbox.w));
            }
            bbox_out_ix += 6;
        }
    }
}