// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The element processing stage, first in the pipeline.
//
// This stage is primarily about applying transforms and computing bounding
// boxes. It is organized as a scan over the input elements, producing
// annotated output elements.

#version 450
#extension GL_GOOGLE_include_directive : enable

#define N_ROWS 4
#define WG_SIZE 32
#define LG_WG_SIZE 5
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
};

// It would be better to use the Vulkan memory model than
// "volatile" but shooting for compatibility here rather
// than doing things right.
layout(set = 0, binding = 1) volatile buffer StateBuf {
    uint part_counter;
    uint[] state;
};

// The annotated results are stored here.
layout(set = 0, binding = 2) buffer AnnotatedBuf {
    uint[] annotated;
};

// Path segments are stored here.
layout(set = 0, binding = 3) buffer PathSegBuf {
    uint[] pathseg;
};

#include "scene.h"
#include "state.h"
#include "annotated.h"
#include "pathseg.h"

#define StateBuf_stride (4 + 2 * State_size)

StateRef state_aggregate_ref(uint partition_ix) {
    return StateRef(4 + partition_ix * StateBuf_stride);
}

StateRef state_prefix_ref(uint partition_ix) {
    return StateRef(4 + partition_ix * StateBuf_stride + State_size);
}

uint state_flag_index(uint partition_ix) {
    return partition_ix * (StateBuf_stride / 4);
}

// These correspond to X, A, P respectively in the prefix sum paper.
#define FLAG_NOT_READY 0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY 2

#define FLAG_SET_LINEWIDTH 1
#define FLAG_SET_BBOX 2
#define FLAG_RESET_BBOX 4

// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate).
State combine_state(State a, State b) {
    State c;
    c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z)
             + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z)
             + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z)
             + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z)
             + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
        c.bbox = a.bbox;
    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0
        && (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
    {
        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
    }
    // It would be more concise to cast to matrix types; ah well.
    c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
    c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
    c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
    c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
    c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
    c.path_count = a.path_count + b.path_count;
    c.pathseg_count = a.pathseg_count + b.pathseg_count;
    return c;
}
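// Worked example (illustrative): writing a state's transform as the pair
// (M, t) of a 2x2 matrix (stored column-major in a vec4) and a translation,
// combine_state composes affine maps in the usual way:
//   (Ma, ta) o (Mb, tb) = (Ma * Mb, Ma * tb + ta).
// For instance, combining a uniform scale (2*I, (0, 0)) on the left with a
// translation (I, (3, 4)) on the right yields (2*I, (6, 8)): the child's
// translation is scaled because it is applied in the parent's frame.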
State map_element(ElementRef ref) {
    // TODO: it would *probably* be more efficient to make the memory read patterns less
    // divergent, though it would be more wasted memory.
    uint tag = Element_tag(ref);
    State c;
    c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    c.mat = vec4(1.0, 0.0, 0.0, 1.0);
    c.translate = vec2(0.0, 0.0);
    c.linewidth = 1.0; // TODO should be 0.0
    c.flags = 0;
    c.path_count = 0;
    c.pathseg_count = 0;
    switch (tag) {
    case Element_FillLine:
    case Element_StrokeLine:
        LineSeg line = Element_FillLine_read(ref);
        c.bbox.xy = min(line.p0, line.p1);
        c.bbox.zw = max(line.p0, line.p1);
        c.pathseg_count = 1;
        break;
    case Element_FillQuad:
    case Element_StrokeQuad:
        QuadSeg quad = Element_FillQuad_read(ref);
        c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
        c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
        c.pathseg_count = 1;
        break;
    case Element_FillCubic:
    case Element_StrokeCubic:
        CubicSeg cubic = Element_FillCubic_read(ref);
        c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
        c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
        c.pathseg_count = 1;
        break;
    case Element_Fill:
    case Element_Stroke:
    case Element_BeginClip:
        c.flags = FLAG_RESET_BBOX;
        c.path_count = 1;
        break;
    case Element_EndClip:
        c.path_count = 1;
        break;
    case Element_SetLineWidth:
        SetLineWidth lw = Element_SetLineWidth_read(ref);
        c.linewidth = lw.width;
        c.flags = FLAG_SET_LINEWIDTH;
        break;
    case Element_Transform:
        Transform t = Element_Transform_read(ref);
        c.mat = t.mat;
        c.translate = t.translate;
        break;
    }
    return c;
}

// Get the bounding box of a circle transformed by the matrix into an ellipse.
vec2 get_linewidth(State st) {
    // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
    return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw));
}

// We should be able to use an array of structs but the NV shader compiler
// doesn't seem to like it :/
//shared State sh_state[WG_SIZE];
shared vec4 sh_mat[WG_SIZE];
shared vec2 sh_translate[WG_SIZE];
shared vec4 sh_bbox[WG_SIZE];
shared float sh_width[WG_SIZE];
shared uint sh_flags[WG_SIZE];
shared uint sh_path_count[WG_SIZE];
shared uint sh_pathseg_count[WG_SIZE];

shared uint sh_part_ix;
shared State sh_prefix;
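// A note on the scan strategy in main() below (the "prefix sum paper" cited
// in the comments is presumably Merrill & Garland, "Single-pass Parallel
// Prefix Scan with Decoupled Look-back"): each workgroup claims a partition
// of PARTITION_SIZE elements via an atomic counter, reduces it locally,
// publishes its aggregate, and then resolves its exclusive prefix by looking
// back at earlier partitions' published results, so the whole scan completes
// in a single dispatch.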
void main() {
    State th_state[N_ROWS];

    // Determine partition to process by atomic counter (described in Section
    // 4.4 of prefix sum paper).
    if (gl_LocalInvocationID.x == 0) {
        sh_part_ix = atomicAdd(part_counter, 1);
    }
    barrier();
    uint part_ix = sh_part_ix;

    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
    ElementRef ref = ElementRef(ix * Element_size);

    th_state[0] = map_element(ref);
    for (uint i = 1; i < N_ROWS; i++) {
        // discussion question: would it be faster to load using more coherent patterns
        // into thread memory? This is kinda strided.
        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
    }

    State agg = th_state[N_ROWS - 1];
    sh_mat[gl_LocalInvocationID.x] = agg.mat;
    sh_translate[gl_LocalInvocationID.x] = agg.translate;
    sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
    sh_width[gl_LocalInvocationID.x] = agg.linewidth;
    sh_flags[gl_LocalInvocationID.x] = agg.flags;
    sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
    sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1 << i)) {
            State other;
            uint ix = gl_LocalInvocationID.x - (1 << i);
            other.mat = sh_mat[ix];
            other.translate = sh_translate[ix];
            other.bbox = sh_bbox[ix];
            other.linewidth = sh_width[ix];
            other.flags = sh_flags[ix];
            other.path_count = sh_path_count[ix];
            other.pathseg_count = sh_pathseg_count[ix];
            agg = combine_state(other, agg);
        }
        barrier();
        sh_mat[gl_LocalInvocationID.x] = agg.mat;
        sh_translate[gl_LocalInvocationID.x] = agg.translate;
        sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
        sh_width[gl_LocalInvocationID.x] = agg.linewidth;
        sh_flags[gl_LocalInvocationID.x] = agg.flags;
        sh_path_count[gl_LocalInvocationID.x] = agg.path_count;
        sh_pathseg_count[gl_LocalInvocationID.x] = agg.pathseg_count;
    }

    State exclusive;
    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
    exclusive.translate = vec2(0.0, 0.0);
    exclusive.linewidth = 1.0; // TODO should be 0.0
    exclusive.flags = 0;
    exclusive.path_count = 0;
    exclusive.pathseg_count = 0;

    // Publish aggregate for this partition
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        // Note: with memory model, we'd want to generate the atomic store version of this.
        State_write(state_aggregate_ref(part_ix), agg);
        uint flag = FLAG_AGGREGATE_READY;
        memoryBarrierBuffer();
        if (part_ix == 0) {
            State_write(state_prefix_ref(part_ix), agg);
            flag = FLAG_PREFIX_READY;
        }
        state[state_flag_index(part_ix)] = flag;
        if (part_ix != 0) {
            // step 4 of paper: decoupled lookback
            uint look_back_ix = part_ix - 1;

            State their_agg;
            uint their_ix = 0;
            while (true) {
                flag = state[state_flag_index(look_back_ix)];
                if (flag == FLAG_PREFIX_READY) {
                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
                    exclusive = combine_state(their_prefix, exclusive);
                    break;
                } else if (flag == FLAG_AGGREGATE_READY) {
                    their_agg = State_read(state_aggregate_ref(look_back_ix));
                    exclusive = combine_state(their_agg, exclusive);
                    look_back_ix--;
                    their_ix = 0;
                    continue;
                }
                // else spin

                // Unfortunately there's no guarantee of forward progress of other
                // workgroups, so compute a bit of the aggregate before trying again.
                // In the worst case, spinning stops when the aggregate is complete.
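                // In other words: rather than spinning idly on a predecessor
                // that may be starved, this thread recomputes the predecessor
                // partition's aggregate itself, one element per iteration,
                // re-checking the flag each time around the loop. Termination
                // is guaranteed: either the flag becomes ready or the local
                // recomputation eventually covers the whole partition.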
                ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
                State s = map_element(ref);
                if (their_ix == 0) {
                    their_agg = s;
                } else {
                    their_agg = combine_state(their_agg, s);
                }
                their_ix++;
                if (their_ix == PARTITION_SIZE) {
                    exclusive = combine_state(their_agg, exclusive);
                    if (look_back_ix == 0) {
                        break;
                    }
                    look_back_ix--;
                    their_ix = 0;
                }
            }

            // step 5 of paper: compute inclusive prefix
            State inclusive_prefix = combine_state(exclusive, agg);
            sh_prefix = exclusive;
            State_write(state_prefix_ref(part_ix), inclusive_prefix);
            memoryBarrierBuffer();
            flag = FLAG_PREFIX_READY;
            state[state_flag_index(part_ix)] = flag;
        }
    }
    barrier();
    if (part_ix != 0) {
        exclusive = sh_prefix;
    }

    State row = exclusive;
    if (gl_LocalInvocationID.x > 0) {
        uint ix = gl_LocalInvocationID.x - 1;
        State other;
        other.mat = sh_mat[ix];
        other.translate = sh_translate[ix];
        other.bbox = sh_bbox[ix];
        other.linewidth = sh_width[ix];
        other.flags = sh_flags[ix];
        other.path_count = sh_path_count[ix];
        other.pathseg_count = sh_pathseg_count[ix];
        row = combine_state(row, other);
    }
    for (uint i = 0; i < N_ROWS; i++) {
        State st = combine_state(row, th_state[i]);

        // Here we read again from the original scene. There may be
        // gains to be had from stashing in shared memory or possibly
        // registers (though register pressure is an issue).
        ElementRef this_ref = Element_index(ref, i);
        uint tag = Element_tag(this_ref);
        switch (tag) {
        case Element_FillLine:
        case Element_StrokeLine:
            LineSeg line = Element_StrokeLine_read(this_ref);
            vec2 p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
            vec2 p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
            PathStrokeCubic path_cubic;
            path_cubic.p0 = p0;
            path_cubic.p1 = mix(p0, p1, 1.0 / 3.0);
            path_cubic.p2 = mix(p1, p0, 1.0 / 3.0);
            path_cubic.p3 = p1;
            path_cubic.path_ix = st.path_count;
            if (tag == Element_StrokeLine) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
            PathSegRef path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
            uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic;
            pathseg[path_out_ref.offset >> 2] = out_tag;
            PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
            break;
        case Element_FillQuad:
        case Element_StrokeQuad:
            QuadSeg quad = Element_StrokeQuad_read(this_ref);
            p0 = st.mat.xy * quad.p0.x + st.mat.zw * quad.p0.y + st.translate;
            p1 = st.mat.xy * quad.p1.x + st.mat.zw * quad.p1.y + st.translate;
            vec2 p2 = st.mat.xy * quad.p2.x + st.mat.zw * quad.p2.y + st.translate;
            path_cubic.p0 = p0;
            path_cubic.p1 = mix(p1, p0, 1.0 / 3.0);
            path_cubic.p2 = mix(p1, p2, 1.0 / 3.0);
            path_cubic.p3 = p2;
            path_cubic.path_ix = st.path_count;
            if (tag == Element_StrokeQuad) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
            path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
            out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic;
            pathseg[path_out_ref.offset >> 2] = out_tag;
            PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
            break;
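        // Note: the line and quadratic cases above are raised to cubics by
        // exact degree elevation (control points at the 1/3 and 2/3 lerps),
        // so downstream stages deal with a single segment type.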
        case Element_FillCubic:
        case Element_StrokeCubic:
            CubicSeg cubic = Element_StrokeCubic_read(this_ref);
            path_cubic.p0 = st.mat.xy * cubic.p0.x + st.mat.zw * cubic.p0.y + st.translate;
            path_cubic.p1 = st.mat.xy * cubic.p1.x + st.mat.zw * cubic.p1.y + st.translate;
            path_cubic.p2 = st.mat.xy * cubic.p2.x + st.mat.zw * cubic.p2.y + st.translate;
            path_cubic.p3 = st.mat.xy * cubic.p3.x + st.mat.zw * cubic.p3.y + st.translate;
            path_cubic.path_ix = st.path_count;
            if (tag == Element_StrokeCubic) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
            path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
            out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic;
            pathseg[path_out_ref.offset >> 2] = out_tag;
            PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
            break;
        case Element_Stroke:
            Stroke stroke = Element_Stroke_read(this_ref);
            AnnoStroke anno_stroke;
            anno_stroke.rgba_color = stroke.rgba_color;
            vec2 lw = get_linewidth(st);
            anno_stroke.bbox = st.bbox + vec4(-lw, lw);
            anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
            AnnotatedRef out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
            Annotated_Stroke_write(out_ref, anno_stroke);
            break;
        case Element_Fill:
            Fill fill = Element_Fill_read(this_ref);
            AnnoFill anno_fill;
            anno_fill.rgba_color = fill.rgba_color;
            anno_fill.bbox = st.bbox;
            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
            Annotated_Fill_write(out_ref, anno_fill);
            break;
        case Element_BeginClip:
            Clip begin_clip = Element_BeginClip_read(this_ref);
            // This is the absolute bbox; it has been transformed during encoding.
            AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox);
            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
            Annotated_BeginClip_write(out_ref, anno_begin_clip);
            break;
        case Element_EndClip:
            Clip end_clip = Element_EndClip_read(this_ref);
            // This bbox is expected to be the same as the begin one.
            AnnoClip anno_end_clip = AnnoClip(end_clip.bbox);
            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
            Annotated_EndClip_write(out_ref, anno_end_clip);
            break;
        }
    }
}
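// Host-side expectations (an assumption based on the protocol above, not
// spelled out in this file): before each dispatch, part_counter and the
// per-partition flag words in StateBuf must be zeroed so every partition
// starts in FLAG_NOT_READY, and the number of workgroups dispatched should
// be ceil(n_elements / PARTITION_SIZE).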