diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp index d544417..e4140cd 100644 --- a/piet-gpu/shader/backdrop.comp +++ b/piet-gpu/shader/backdrop.comp @@ -87,8 +87,8 @@ void main() { // Prefix sum of sh_row_count for (uint i = 0; i < LG_BACKDROP_WG; i++) { barrier(); - if (gl_LocalInvocationID.y == 0 && th_ix >= (1 << i)) { - row_count += sh_row_count[th_ix - (1 << i)]; + if (gl_LocalInvocationID.y == 0 && th_ix >= (1u << i)) { + row_count += sh_row_count[th_ix - (1u << i)]; } barrier(); if (gl_LocalInvocationID.y == 0) { @@ -102,7 +102,7 @@ void main() { // Binary search to find element uint el_ix = 0; for (uint i = 0; i < LG_BACKDROP_WG; i++) { - uint probe = el_ix + ((BACKDROP_WG / 2) >> i); + uint probe = el_ix + (uint(BACKDROP_WG / 2) >> i); if (row >= sh_row_count[probe - 1]) { el_ix = probe; } diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 497915c..6ed2140 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -14,7 +14,7 @@ rule glsl command = $glslang_validator $flags -V -o $out $in rule hlsl - command = $spirv_cross --hlsl $in --output $out + command = $spirv_cross --hlsl --shader-model 60 $in --output $out rule dxil command = $dxc -T cs_6_0 $in -Fo $out @@ -22,23 +22,41 @@ rule dxil rule msl command = $spirv_cross --msl $in --output $out $msl_flags +build gen/binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h +build gen/binning.hlsl: hlsl gen/binning.spv +build gen/binning.dxil: dxil gen/binning.hlsl +build gen/binning.msl: msl gen/binning.spv -build elements.spv: glsl elements.comp | scene.h state.h annotated.h +build gen/tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h +build gen/tile_alloc.hlsl: hlsl gen/tile_alloc.spv +build gen/tile_alloc.dxil: dxil gen/tile_alloc.hlsl +build gen/tile_alloc.msl: msl gen/tile_alloc.spv -build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h +build gen/path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h +build gen/path_coarse.hlsl: hlsl gen/path_coarse.spv +build gen/path_coarse.dxil: dxil gen/path_coarse.hlsl +build gen/path_coarse.msl: msl gen/path_coarse.spv -build tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h +build gen/backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h +build gen/backdrop.hlsl: hlsl gen/backdrop.spv +build gen/backdrop.dxil: dxil gen/backdrop.hlsl +build gen/backdrop.msl: msl gen/backdrop.spv -build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h - -build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h - -build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h +build gen/backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h flags = -DBACKDROP_DIST_FACTOR=4 +build gen/backdrop_lg.hlsl: hlsl gen/backdrop_lg.spv +build gen/backdrop_lg.dxil: dxil gen/backdrop_lg.hlsl +build gen/backdrop_lg.msl: msl gen/backdrop_lg.spv -build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h +build gen/coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h +build gen/coarse.hlsl: hlsl gen/coarse.spv +build gen/coarse.dxil: dxil gen/coarse.hlsl +build gen/coarse.msl: msl gen/coarse.spv -build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h +build gen/kernel4.spv: glsl kernel4.comp | ptcl.h setup.h +build gen/kernel4.hlsl: hlsl gen/kernel4.spv +build gen/kernel4.dxil: dxil gen/kernel4.hlsl +build gen/kernel4.msl: msl gen/kernel4.spv # New element pipeline follows diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index b541893..31a64e4 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -172,8 +172,8 @@ void main() { } barrier(); if (th_ix < N_PART_READ) { - if (th_ix >= (1 << i)) { - count += sh_part_count[th_ix - (1 << i)]; + if (th_ix >= (1u << i)) { + count += sh_part_count[th_ix - (1u << i)]; } } barrier(); @@ -190,7 +190,7 @@ void main() { if (ix >= wr_ix && ix < ready_ix && mem_ok) { uint part_ix = 0; for (uint i = 0; i < LG_N_PART_READ; i++) { - uint probe = part_ix + ((N_PART_READ / 2) >> i); + uint probe = part_ix + (uint(N_PART_READ / 2) >> i); if (ix >= sh_part_count[probe - 1]) { part_ix = probe; } @@ -257,8 +257,8 @@ void main() { sh_tile_count[th_ix] = tile_count; for (uint i = 0; i < LG_N_TILE; i++) { barrier(); - if (th_ix >= (1 << i)) { - tile_count += sh_tile_count[th_ix - (1 << i)]; + if (th_ix >= (1u << i)) { + tile_count += sh_tile_count[th_ix - (1u << i)]; } barrier(); sh_tile_count[th_ix] = tile_count; @@ -269,7 +269,7 @@ void main() { // Binary search to find element uint el_ix = 0; for (uint i = 0; i < LG_N_TILE; i++) { - uint probe = el_ix + ((N_TILE / 2) >> i); + uint probe = el_ix + (uint(N_TILE / 2) >> i); if (ix >= sh_tile_count[probe - 1]) { el_ix = probe; } @@ -292,7 +292,7 @@ void main() { } if (include_tile) { uint el_slice = el_ix / 32; - uint el_mask = 1 << (el_ix & 31); + uint el_mask = 1u << (el_ix & 31); atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask); } } @@ -372,7 +372,7 @@ void main() { if (tile.tile.offset == 0 && tile.backdrop == 0) { clip_zero_depth = clip_depth + 1; } else if (tile.tile.offset == 0 && clip_depth < 32) { - clip_one_mask |= (1 << clip_depth); + clip_one_mask |= (1u << clip_depth); } else { AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref); if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { @@ -382,14 +382,14 @@ void main() { Cmd_BeginClip_write(cmd_alloc, cmd_ref); cmd_ref.offset += 4; if (clip_depth < 32) { - clip_one_mask &= ~(1 << clip_depth); + clip_one_mask &= ~(1u << clip_depth); } } clip_depth++; break; case Annotated_EndClip: clip_depth--; - if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) { + if (clip_depth >= 32 || (clip_one_mask & (1u << clip_depth)) == 0) { if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { break; } diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp deleted file mode 100644 index 6f33544..0000000 --- a/piet-gpu/shader/elements.comp +++ /dev/null @@ -1,467 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -// The element processing stage, first in the pipeline. -// -// This stage is primarily about applying transforms and computing bounding -// boxes. It is organized as a scan over the input elements, producing -// annotated output elements. - -#version 450 -#extension GL_GOOGLE_include_directive : enable - -#include "mem.h" -#include "setup.h" - -#define N_ROWS 4 -#define WG_SIZE 32 -#define LG_WG_SIZE 5 -#define PARTITION_SIZE (WG_SIZE * N_ROWS) - -layout(local_size_x = WG_SIZE, local_size_y = 1) in; - -layout(set = 0, binding = 1) readonly buffer ConfigBuf { - Config conf; -}; - -layout(set = 0, binding = 2) readonly buffer SceneBuf { - uint[] scene; -}; - -// It would be better to use the Vulkan memory model than -// "volatile" but shooting for compatibility here rather -// than doing things right. -layout(set = 0, binding = 3) volatile buffer StateBuf { - uint part_counter; - uint[] state; -}; - -#include "scene.h" -#include "state.h" -#include "annotated.h" -#include "pathseg.h" -#include "tile.h" - -#define StateBuf_stride (4 + 2 * State_size) - -StateRef state_aggregate_ref(uint partition_ix) { - return StateRef(4 + partition_ix * StateBuf_stride); -} - -StateRef state_prefix_ref(uint partition_ix) { - return StateRef(4 + partition_ix * StateBuf_stride + State_size); -} - -uint state_flag_index(uint partition_ix) { - return partition_ix * (StateBuf_stride / 4); -} - -// These correspond to X, A, P respectively in the prefix sum paper. -#define FLAG_NOT_READY 0 -#define FLAG_AGGREGATE_READY 1 -#define FLAG_PREFIX_READY 2 - -#define FLAG_SET_LINEWIDTH 1 -#define FLAG_SET_BBOX 2 -#define FLAG_RESET_BBOX 4 -#define FLAG_SET_FILL_MODE 8 -// Fill modes take up the next bit. Non-zero fill is 0, stroke is 1. -#define LG_FILL_MODE 4 -#define FILL_MODE_BITS 1 -#define FILL_MODE_MASK (FILL_MODE_BITS << LG_FILL_MODE) - -// This is almost like a monoid (the interaction between transformation and -// bounding boxes is approximate) -State combine_state(State a, State b) { - State c; - c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x; - c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y; - c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x; - c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y; - if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) { - c.bbox = a.bbox; - } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 && - (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) - { - c.bbox.xy = min(a.bbox.xy, c.bbox.xy); - c.bbox.zw = max(a.bbox.zw, c.bbox.zw); - } - // It would be more concise to cast to matrix types; ah well. - c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y; - c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y; - c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w; - c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w; - c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x; - c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y; - c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth; - c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX | FLAG_SET_FILL_MODE)) | b.flags; - c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1; - uint fill_mode = (b.flags & FLAG_SET_FILL_MODE) == 0 ? a.flags : b.flags; - fill_mode &= FILL_MODE_MASK; - c.flags = (c.flags & ~FILL_MODE_MASK) | fill_mode; - c.path_count = a.path_count + b.path_count; - c.pathseg_count = a.pathseg_count + b.pathseg_count; - c.trans_count = a.trans_count + b.trans_count; - return c; -} - -State map_element(ElementRef ref) { - // TODO: it would *probably* be more efficient to make the memory read patterns less - // divergent, though it would be more wasted memory. - uint tag = Element_tag(ref).tag; - State c; - c.bbox = vec4(0.0, 0.0, 0.0, 0.0); - c.mat = vec4(1.0, 0.0, 0.0, 1.0); - c.translate = vec2(0.0, 0.0); - c.linewidth = 1.0; // TODO should be 0.0 - c.flags = 0; - c.path_count = 0; - c.pathseg_count = 0; - c.trans_count = 0; - switch (tag) { - case Element_Line: - LineSeg line = Element_Line_read(ref); - c.bbox.xy = min(line.p0, line.p1); - c.bbox.zw = max(line.p0, line.p1); - c.pathseg_count = 1; - break; - case Element_Quad: - QuadSeg quad = Element_Quad_read(ref); - c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2); - c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2); - c.pathseg_count = 1; - break; - case Element_Cubic: - CubicSeg cubic = Element_Cubic_read(ref); - c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3)); - c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3)); - c.pathseg_count = 1; - break; - case Element_FillColor: - case Element_FillLinGradient: - case Element_FillImage: - case Element_BeginClip: - c.flags = FLAG_RESET_BBOX; - c.path_count = 1; - break; - case Element_EndClip: - c.path_count = 1; - break; - case Element_SetLineWidth: - SetLineWidth lw = Element_SetLineWidth_read(ref); - c.linewidth = lw.width; - c.flags = FLAG_SET_LINEWIDTH; - break; - case Element_Transform: - Transform t = Element_Transform_read(ref); - c.mat = t.mat; - c.translate = t.translate; - c.trans_count = 1; - break; - case Element_SetFillMode: - SetFillMode fm = Element_SetFillMode_read(ref); - c.flags = FLAG_SET_FILL_MODE | (fm.fill_mode << LG_FILL_MODE); - break; - } - return c; -} - -// Get the bounding box of a circle transformed by the matrix into an ellipse. -vec2 get_linewidth(State st) { - // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm - return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw)); -} - -shared State sh_state[WG_SIZE]; - -shared uint sh_part_ix; -shared State sh_prefix; -shared uint sh_flag; - -void main() { - State th_state[N_ROWS]; - // Determine partition to process by atomic counter (described in Section - // 4.4 of prefix sum paper). - if (gl_LocalInvocationID.x == 0) { - sh_part_ix = atomicAdd(part_counter, 1); - } - barrier(); - uint part_ix = sh_part_ix; - - uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS; - ElementRef ref = ElementRef(ix * Element_size); - - th_state[0] = map_element(ref); - for (uint i = 1; i < N_ROWS; i++) { - // discussion question: would it be faster to load using more coherent patterns - // into thread memory? This is kinda strided. - th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i))); - } - State agg = th_state[N_ROWS - 1]; - sh_state[gl_LocalInvocationID.x] = agg; - for (uint i = 0; i < LG_WG_SIZE; i++) { - barrier(); - if (gl_LocalInvocationID.x >= (1 << i)) { - State other = sh_state[gl_LocalInvocationID.x - (1 << i)]; - agg = combine_state(other, agg); - } - barrier(); - sh_state[gl_LocalInvocationID.x] = agg; - } - - State exclusive; - exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0); - exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0); - exclusive.translate = vec2(0.0, 0.0); - exclusive.linewidth = 1.0; //TODO should be 0.0 - exclusive.flags = 0; - exclusive.path_count = 0; - exclusive.pathseg_count = 0; - exclusive.trans_count = 0; - - // Publish aggregate for this partition - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - State_write(state_aggregate_ref(part_ix), agg); - if (part_ix == 0) { - State_write(state_prefix_ref(part_ix), agg); - } - } - // Write flag with release semantics; this is done portably with a barrier. - memoryBarrierBuffer(); - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - uint flag = FLAG_AGGREGATE_READY; - if (part_ix == 0) { - flag = FLAG_PREFIX_READY; - } - state[state_flag_index(part_ix)] = flag; - } - if (part_ix != 0) { - // step 4 of paper: decoupled lookback - uint look_back_ix = part_ix - 1; - - State their_agg; - uint their_ix = 0; - while (true) { - // Read flag with acquire semantics. - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - sh_flag = state[state_flag_index(look_back_ix)]; - } - // The flag load is done only in the last thread. However, because the - // translation of memoryBarrierBuffer to Metal requires uniform control - // flow, we broadcast it to all threads. - memoryBarrierBuffer(); - barrier(); - uint flag = sh_flag; - barrier(); - - if (flag == FLAG_PREFIX_READY) { - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - State their_prefix = State_read(state_prefix_ref(look_back_ix)); - exclusive = combine_state(their_prefix, exclusive); - } - break; - } else if (flag == FLAG_AGGREGATE_READY) { - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - their_agg = State_read(state_aggregate_ref(look_back_ix)); - exclusive = combine_state(their_agg, exclusive); - } - look_back_ix--; - their_ix = 0; - continue; - } - // else spin - - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - // Unfortunately there's no guarantee of forward progress of other - // workgroups, so compute a bit of the aggregate before trying again. - // In the worst case, spinning stops when the aggregate is complete. - ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size); - State s = map_element(ref); - if (their_ix == 0) { - their_agg = s; - } else { - their_agg = combine_state(their_agg, s); - } - their_ix++; - if (their_ix == PARTITION_SIZE) { - exclusive = combine_state(their_agg, exclusive); - if (look_back_ix == 0) { - sh_flag = FLAG_PREFIX_READY; - } else { - look_back_ix--; - their_ix = 0; - } - } - } - barrier(); - flag = sh_flag; - barrier(); - if (flag == FLAG_PREFIX_READY) { - break; - } - } - // step 5 of paper: compute inclusive prefix - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - State inclusive_prefix = combine_state(exclusive, agg); - sh_prefix = exclusive; - State_write(state_prefix_ref(part_ix), inclusive_prefix); - } - memoryBarrierBuffer(); - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - state[state_flag_index(part_ix)] = FLAG_PREFIX_READY; - } - } - barrier(); - if (part_ix != 0) { - exclusive = sh_prefix; - } - - State row = exclusive; - if (gl_LocalInvocationID.x > 0) { - State other = sh_state[gl_LocalInvocationID.x - 1]; - row = combine_state(row, other); - } - for (uint i = 0; i < N_ROWS; i++) { - State st = combine_state(row, th_state[i]); - - // Here we read again from the original scene. There may be - // gains to be had from stashing in shared memory or possibly - // registers (though register pressure is an issue). - ElementRef this_ref = Element_index(ref, i); - ElementTag tag = Element_tag(this_ref); - uint fill_mode = fill_mode_from_flags(st.flags >> LG_FILL_MODE); - bool is_stroke = fill_mode == MODE_STROKE; - switch (tag.tag) { - case Element_Line: - LineSeg line = Element_Line_read(this_ref); - PathCubic path_cubic; - path_cubic.p0 = line.p0; - path_cubic.p1 = mix(line.p0, line.p1, 1.0 / 3.0); - path_cubic.p2 = mix(line.p1, line.p0, 1.0 / 3.0); - path_cubic.p3 = line.p1; - path_cubic.path_ix = st.path_count; - path_cubic.trans_ix = st.trans_count; - if (is_stroke) { - path_cubic.stroke = get_linewidth(st); - } else { - path_cubic.stroke = vec2(0.0); - } - PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); - PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic); - break; - case Element_Quad: - QuadSeg quad = Element_Quad_read(this_ref); - path_cubic.p0 = quad.p0; - path_cubic.p1 = mix(quad.p1, quad.p0, 1.0 / 3.0); - path_cubic.p2 = mix(quad.p1, quad.p2, 1.0 / 3.0); - path_cubic.p3 = quad.p2; - path_cubic.path_ix = st.path_count; - path_cubic.trans_ix = st.trans_count; - if (is_stroke) { - path_cubic.stroke = get_linewidth(st); - } else { - path_cubic.stroke = vec2(0.0); - } - path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); - PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic); - break; - case Element_Cubic: - CubicSeg cubic = Element_Cubic_read(this_ref); - path_cubic.p0 = cubic.p0; - path_cubic.p1 = cubic.p1; - path_cubic.p2 = cubic.p2; - path_cubic.p3 = cubic.p3; - path_cubic.path_ix = st.path_count; - path_cubic.trans_ix = st.trans_count; - if (is_stroke) { - path_cubic.stroke = get_linewidth(st); - } else { - path_cubic.stroke = vec2(0.0); - } - path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); - PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic); - break; - case Element_FillColor: - FillColor fill = Element_FillColor_read(this_ref); - AnnoColor anno_fill; - anno_fill.rgba_color = fill.rgba_color; - if (is_stroke) { - vec2 lw = get_linewidth(st); - anno_fill.bbox = st.bbox + vec4(-lw, lw); - anno_fill.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); - } else { - anno_fill.bbox = st.bbox; - anno_fill.linewidth = 0.0; - } - AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); - Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill); - break; - case Element_FillLinGradient: - FillLinGradient lin = Element_FillLinGradient_read(this_ref); - AnnoLinGradient anno_lin; - anno_lin.index = lin.index; - vec2 p0 = st.mat.xy * lin.p0.x + st.mat.zw * lin.p0.y + st.translate; - vec2 p1 = st.mat.xy * lin.p1.x + st.mat.zw * lin.p1.y + st.translate; - vec2 dxy = p1 - p0; - float scale = 1.0 / (dxy.x * dxy.x + dxy.y * dxy.y); - float line_x = dxy.x * scale; - float line_y = dxy.y * scale; - anno_lin.line_x = line_x; - anno_lin.line_y = line_y; - anno_lin.line_c = -(p0.x * line_x + p0.y * line_y); - // TODO: consider consolidating bbox calculation - if (is_stroke) { - vec2 lw = get_linewidth(st); - anno_lin.bbox = st.bbox + vec4(-lw, lw); - anno_lin.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); - } else { - anno_lin.bbox = st.bbox; - anno_lin.linewidth = 0.0; - } - out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); - Annotated_LinGradient_write(conf.anno_alloc, out_ref, fill_mode, anno_lin); - break; - case Element_FillImage: - FillImage fill_img = Element_FillImage_read(this_ref); - AnnoImage anno_img; - anno_img.index = fill_img.index; - anno_img.offset = fill_img.offset; - if (is_stroke) { - vec2 lw = get_linewidth(st); - anno_img.bbox = st.bbox + vec4(-lw, lw); - anno_img.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); - } else { - anno_img.bbox = st.bbox; - anno_img.linewidth = 0.0; - } - out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); - Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img); - break; - case Element_BeginClip: - Clip begin_clip = Element_BeginClip_read(this_ref); - AnnoBeginClip anno_begin_clip; - // This is the absolute bbox, it's been transformed during encoding. - anno_begin_clip.bbox = begin_clip.bbox; - if (is_stroke) { - vec2 lw = get_linewidth(st); - anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); - } else { - anno_begin_clip.linewidth = 0.0; - } - out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); - Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip); - break; - case Element_EndClip: - Clip end_clip = Element_EndClip_read(this_ref); - // This bbox is expected to be the same as the begin one. - AnnoEndClip anno_end_clip = AnnoEndClip(end_clip.bbox); - out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); - Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip); - break; - case Element_Transform: - TransformSeg transform = TransformSeg(st.mat, st.translate); - TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (st.trans_count - 1) * TransformSeg_size); - TransformSeg_write(conf.trans_alloc, trans_ref, transform); - break; - } - } -} diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv deleted file mode 100644 index f906dac..0000000 Binary files a/piet-gpu/shader/elements.spv and /dev/null differ diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil new file mode 100644 index 0000000..4ebcb1c Binary files /dev/null and b/piet-gpu/shader/gen/backdrop.dxil differ diff --git a/piet-gpu/shader/gen/backdrop.hlsl b/piet-gpu/shader/gen/backdrop.hlsl new file mode 100644 index 0000000..65b969d --- /dev/null +++ b/piet-gpu/shader/gen/backdrop.hlsl @@ -0,0 +1,283 @@ +struct Alloc +{ + uint offset; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _79 : register(u0, space0); +ByteAddressBuffer _186 : register(t1, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +static uint gl_LocalInvocationIndex; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; + uint gl_LocalInvocationIndex : SV_GroupIndex; +}; + +groupshared uint sh_row_width[256]; +groupshared Alloc sh_row_alloc[256]; +groupshared uint sh_row_count[256]; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _79.Load(offset * 4 + 8); + return v; +} + +AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _121; +} + +uint fill_mode_from_flags(uint flags) +{ + return flags & 1u; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _165 = { raw2 }; + s.tiles = _165; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _79.Store(offset * 4 + 8, val); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) }; + AnnotatedRef ref = _194; + uint row_count = 0u; + bool mem_ok = _79.Load(4) == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _186.Load(0)) + { + Alloc _217; + _217.offset = _186.Load(32); + Alloc param; + param.offset = _217.offset; + AnnotatedRef param_1 = ref; + AnnotatedTag tag = Annotated_tag(param, param_1); + switch (tag.tag) + { + case 3u: + case 2u: + case 4u: + case 1u: + { + uint param_2 = tag.flags; + if (fill_mode_from_flags(param_2) != 0u) + { + break; + } + PathRef _243 = { _186.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _243; + Alloc _247; + _247.offset = _186.Load(16); + Alloc param_3; + param_3.offset = _247.offset; + PathRef param_4 = path_ref; + Path path = Path_read(param_3, param_4); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _272 = row_count == 1u; + bool _278; + if (_272) + { + _278 = path.bbox.y > 0u; + } + else + { + _278 = _272; + } + if (_278) + { + row_count = 0u; + } + uint param_5 = path.tiles.offset; + uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_7 = mem_ok; + Alloc path_alloc = new_alloc(param_5, param_6, param_7); + sh_row_alloc[th_ix] = path_alloc; + break; + } + } + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + bool _325 = gl_LocalInvocationID.y == 0u; + bool _332; + if (_325) + { + _332 = th_ix >= (1u << i); + } + else + { + _332 = _325; + } + if (_332) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + GroupMemoryBarrierWithGroupSync(); + uint total_rows = sh_row_count[255]; + uint _411; + for (uint row = th_ix; row < total_rows; row += 256u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _411 = sh_row_count[el_ix - 1u]; + } + else + { + _411 = 0u; + } + uint seq_ix = row - _411; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + uint sum = read_mem(param_8, param_9); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + sum += read_mem(param_10, param_11); + Alloc param_12 = tiles_alloc; + uint param_13 = tile_el_ix; + uint param_14 = sum; + write_mem(param_12, param_13, param_14); + } + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex; + comp_main(); +} diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl new file mode 100644 index 0000000..7640ed0 --- /dev/null +++ b/piet-gpu/shader/gen/backdrop.msl @@ -0,0 +1,284 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_79.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_79); + return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +uint fill_mode_from_flags(thread const uint& flags) +{ + return flags & 1u; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_79); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_79); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_79); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_79.memory[offset] = val; +} + +kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_row_width[256]; + threadgroup Alloc sh_row_alloc[256]; + threadgroup uint sh_row_count[256]; + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) }; + uint row_count = 0u; + bool mem_ok = v_79.mem_error == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _186.conf.n_elements) + { + Alloc param; + param.offset = _186.conf.anno_alloc.offset; + AnnotatedRef param_1 = ref; + AnnotatedTag tag = Annotated_tag(param, param_1, v_79); + switch (tag.tag) + { + case 3u: + case 2u: + case 4u: + case 1u: + { + uint param_2 = tag.flags; + if (fill_mode_from_flags(param_2) != 0u) + { + break; + } + PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) }; + Alloc param_3; + param_3.offset = _186.conf.tile_alloc.offset; + PathRef param_4 = path_ref; + Path path = Path_read(param_3, param_4, v_79); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _272 = row_count == 1u; + bool _278; + if (_272) + { + _278 = path.bbox.y > 0u; + } + else + { + _278 = _272; + } + if (_278) + { + row_count = 0u; + } + uint param_5 = path.tiles.offset; + uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_7 = mem_ok; + Alloc path_alloc = new_alloc(param_5, param_6, param_7); + sh_row_alloc[th_ix] = path_alloc; + break; + } + } + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _325 = gl_LocalInvocationID.y == 0u; + bool _332; + if (_325) + { + _332 = th_ix >= (1u << i); + } + else + { + _332 = _325; + } + if (_332) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_rows = sh_row_count[255]; + uint _411; + for (uint row = th_ix; row < total_rows; row += 256u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _411 = sh_row_count[el_ix - 1u]; + } + else + { + _411 = 0u; + } + uint seq_ix = row - _411; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + uint sum = read_mem(param_8, param_9, v_79); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + sum += read_mem(param_10, param_11, v_79); + Alloc param_12 = tiles_alloc; + uint param_13 = tile_el_ix; + uint param_14 = sum; + write_mem(param_12, param_13, param_14, v_79); + } + } + } +} + diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv similarity index 71% rename from piet-gpu/shader/backdrop.spv rename to piet-gpu/shader/gen/backdrop.spv index a1ed332..f3a7824 100644 Binary files a/piet-gpu/shader/backdrop.spv and b/piet-gpu/shader/gen/backdrop.spv differ diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil new file mode 100644 index 0000000..e6b2f1a Binary files /dev/null and b/piet-gpu/shader/gen/backdrop_lg.dxil differ diff --git a/piet-gpu/shader/gen/backdrop_lg.hlsl b/piet-gpu/shader/gen/backdrop_lg.hlsl new file mode 100644 index 0000000..57bb6d3 --- /dev/null +++ b/piet-gpu/shader/gen/backdrop_lg.hlsl @@ -0,0 +1,283 @@ +struct Alloc +{ + uint offset; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 4u, 1u); + +RWByteAddressBuffer _79 : register(u0, space0); +ByteAddressBuffer _186 : register(t1, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +static uint gl_LocalInvocationIndex; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; + uint gl_LocalInvocationIndex : SV_GroupIndex; +}; + +groupshared uint sh_row_width[256]; +groupshared Alloc sh_row_alloc[256]; +groupshared uint sh_row_count[256]; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _79.Load(offset * 4 + 8); + return v; +} + +AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _121; +} + +uint fill_mode_from_flags(uint flags) +{ + return flags & 1u; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _165 = { raw2 }; + s.tiles = _165; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _79.Store(offset * 4 + 8, val); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) }; + AnnotatedRef ref = _194; + uint row_count = 0u; + bool mem_ok = _79.Load(4) == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _186.Load(0)) + { + Alloc _217; + _217.offset = _186.Load(32); + Alloc param; + param.offset = _217.offset; + AnnotatedRef param_1 = ref; + AnnotatedTag tag = Annotated_tag(param, param_1); + switch (tag.tag) + { + case 3u: + case 2u: + case 4u: + case 1u: + { + uint param_2 = tag.flags; + if (fill_mode_from_flags(param_2) != 0u) + { + break; + } + PathRef _243 = { _186.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _243; + Alloc _247; + _247.offset = _186.Load(16); + Alloc param_3; + param_3.offset = _247.offset; + PathRef param_4 = path_ref; + Path path = Path_read(param_3, param_4); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _272 = row_count == 1u; + bool _278; + if (_272) + { + _278 = path.bbox.y > 0u; + } + else + { + _278 = _272; + } + if (_278) + { + row_count = 0u; + } + uint param_5 = path.tiles.offset; + uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_7 = mem_ok; + Alloc path_alloc = new_alloc(param_5, param_6, param_7); + sh_row_alloc[th_ix] = path_alloc; + break; + } + } + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + bool _325 = gl_LocalInvocationID.y == 0u; + bool _332; + if (_325) + { + _332 = th_ix >= (1u << i); + } + else + { + _332 = _325; + } + if (_332) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + GroupMemoryBarrierWithGroupSync(); + uint total_rows = sh_row_count[255]; + uint _411; + for (uint row = th_ix; row < total_rows; row += 1024u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _411 = sh_row_count[el_ix - 1u]; + } + else + { + _411 = 0u; + } + uint seq_ix = row - _411; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + uint sum = read_mem(param_8, param_9); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + sum += read_mem(param_10, param_11); + Alloc param_12 = tiles_alloc; + uint param_13 = tile_el_ix; + uint param_14 = sum; + write_mem(param_12, param_13, param_14); + } + } + } +} + +[numthreads(256, 4, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex; + comp_main(); +} diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl new file mode 100644 index 0000000..1c68980 --- /dev/null +++ b/piet-gpu/shader/gen/backdrop_lg.msl @@ -0,0 +1,284 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 4u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_79.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_79); + return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +uint fill_mode_from_flags(thread const uint& flags) +{ + return flags & 1u; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_79); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_79); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_79); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_79.memory[offset] = val; +} + +kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_row_width[256]; + threadgroup Alloc sh_row_alloc[256]; + threadgroup uint sh_row_count[256]; + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) }; + uint row_count = 0u; + bool mem_ok = v_79.mem_error == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _186.conf.n_elements) + { + Alloc param; + param.offset = _186.conf.anno_alloc.offset; + AnnotatedRef param_1 = ref; + AnnotatedTag tag = Annotated_tag(param, param_1, v_79); + switch (tag.tag) + { + case 3u: + case 2u: + case 4u: + case 1u: + { + uint param_2 = tag.flags; + if (fill_mode_from_flags(param_2) != 0u) + { + break; + } + PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) }; + Alloc param_3; + param_3.offset = _186.conf.tile_alloc.offset; + PathRef param_4 = path_ref; + Path path = Path_read(param_3, param_4, v_79); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _272 = row_count == 1u; + bool _278; + if (_272) + { + _278 = path.bbox.y > 0u; + } + else + { + _278 = _272; + } + if (_278) + { + row_count = 0u; + } + uint param_5 = path.tiles.offset; + uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_7 = mem_ok; + Alloc path_alloc = new_alloc(param_5, param_6, param_7); + sh_row_alloc[th_ix] = path_alloc; + break; + } + } + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _325 = gl_LocalInvocationID.y == 0u; + bool _332; + if (_325) + { + _332 = th_ix >= (1u << i); + } + else + { + _332 = _325; + } + if (_332) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_rows = sh_row_count[255]; + uint _411; + for (uint row = th_ix; row < total_rows; row += 1024u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _411 = sh_row_count[el_ix - 1u]; + } + else + { + _411 = 0u; + } + uint seq_ix = row - _411; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + uint sum = read_mem(param_8, param_9, v_79); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + sum += read_mem(param_10, param_11, v_79); + Alloc param_12 = tiles_alloc; + uint param_13 = tile_el_ix; + uint param_14 = sum; + write_mem(param_12, param_13, param_14, v_79); + } + } + } +} + diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv similarity index 70% rename from piet-gpu/shader/backdrop_lg.spv rename to piet-gpu/shader/gen/backdrop_lg.spv index 457cb02..a77d46d 100644 Binary files a/piet-gpu/shader/backdrop_lg.spv and b/piet-gpu/shader/gen/backdrop_lg.spv differ diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl index 903a185..64b109f 100644 --- a/piet-gpu/shader/gen/bbox_clear.hlsl +++ b/piet-gpu/shader/gen/bbox_clear.hlsl @@ -27,8 +27,8 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); -ByteAddressBuffer _21 : register(t1); -RWByteAddressBuffer _45 : register(u0); +ByteAddressBuffer _21 : register(t1, space0); +RWByteAddressBuffer _45 : register(u0, space0); static uint3 gl_GlobalInvocationID; struct SPIRV_Cross_Input diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil new file mode 100644 index 0000000..50034cc Binary files /dev/null and b/piet-gpu/shader/gen/binning.dxil differ diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl new file mode 100644 index 0000000..2b0901e --- /dev/null +++ b/piet-gpu/shader/gen/binning.hlsl @@ -0,0 +1,352 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoEndClipRef +{ + uint offset; +}; + +struct AnnoEndClip +{ + float4 bbox; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _84 : register(u0, space0); +ByteAddressBuffer _253 : register(t1, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint bitmaps[8][256]; +groupshared bool sh_alloc_failed; +groupshared uint count[8][256]; +groupshared Alloc sh_chunk_alloc[256]; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _84.Load(offset * 4 + 8); + return v; +} + +AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + AnnotatedTag _221 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _221; +} + +AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + AnnoEndClip s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + return s; +} + +AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) +{ + AnnoEndClipRef _228 = { ref.offset + 4u }; + Alloc param = a; + AnnoEndClipRef param_1 = _228; + return AnnoEndClip_read(param, param_1); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +MallocResult malloc(uint size) +{ + uint _90; + _84.InterlockedAdd(0, size, _90); + uint offset = _90; + uint _97; + _84.GetDimensions(_97); + _97 = (_97 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_97) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _119; + _84.InterlockedMax(4, 1u, _119); + return r; + } + return r; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _84.Store(offset * 4 + 8, val); +} + +void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.element_ix; + write_mem(param, param_1, param_2); +} + +void comp_main() +{ + uint my_n_elements = _253.Load(0); + uint my_partition = gl_WorkGroupID.x; + for (uint i = 0u; i < 8u; i++) + { + bitmaps[i][gl_LocalInvocationID.x] = 0u; + } + if (gl_LocalInvocationID.x == 0u) + { + sh_alloc_failed = false; + } + GroupMemoryBarrierWithGroupSync(); + uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; + AnnotatedRef _308 = { _253.Load(32) + (element_ix * 40u) }; + AnnotatedRef ref = _308; + uint tag = 0u; + if (element_ix < my_n_elements) + { + Alloc _318; + _318.offset = _253.Load(32); + Alloc param; + param.offset = _318.offset; + AnnotatedRef param_1 = ref; + tag = Annotated_tag(param, param_1).tag; + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + switch (tag) + { + case 1u: + case 2u: + case 3u: + case 4u: + case 5u: + { + Alloc _336; + _336.offset = _253.Load(32); + Alloc param_2; + param_2.offset = _336.offset; + AnnotatedRef param_3 = ref; + AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3); + x0 = int(floor(clip.bbox.x * 0.00390625f)); + y0 = int(floor(clip.bbox.y * 0.00390625f)); + x1 = int(ceil(clip.bbox.z * 0.00390625f)); + y1 = int(ceil(clip.bbox.w * 0.00390625f)); + break; + } + } + uint width_in_bins = ((_253.Load(8) + 16u) - 1u) / 16u; + uint height_in_bins = ((_253.Load(12) + 16u) - 1u) / 16u; + x0 = clamp(x0, 0, int(width_in_bins)); + x1 = clamp(x1, x0, int(width_in_bins)); + y0 = clamp(y0, 0, int(height_in_bins)); + y1 = clamp(y1, y0, int(height_in_bins)); + if (x0 == x1) + { + y1 = y0; + } + int x = x0; + int y = y0; + uint my_slice = gl_LocalInvocationID.x / 32u; + uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u)); + while (y < y1) + { + uint _438; + InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _438); + x++; + if (x == x1) + { + x = x0; + y++; + } + } + GroupMemoryBarrierWithGroupSync(); + uint element_count = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x]))); + count[i_1][gl_LocalInvocationID.x] = element_count; + } + uint param_4 = 0u; + uint param_5 = 0u; + bool param_6 = true; + Alloc chunk_alloc = new_alloc(param_4, param_5, param_6); + if (element_count != 0u) + { + uint param_7 = element_count * 4u; + MallocResult _488 = malloc(param_7); + MallocResult chunk = _488; + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) + { + sh_alloc_failed = true; + } + } + uint out_ix = (_253.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc _517; + _517.offset = _253.Load(20); + Alloc param_8; + param_8.offset = _517.offset; + uint param_9 = out_ix; + uint param_10 = element_count; + write_mem(param_8, param_9, param_10); + Alloc _529; + _529.offset = _253.Load(20); + Alloc param_11; + param_11.offset = _529.offset; + uint param_12 = out_ix + 1u; + uint param_13 = chunk_alloc.offset; + write_mem(param_11, param_12, param_13); + GroupMemoryBarrierWithGroupSync(); + bool _544; + if (!sh_alloc_failed) + { + _544 = _84.Load(4) != 0u; + } + else + { + _544 = sh_alloc_failed; + } + if (_544) + { + return; + } + x = x0; + y = y0; + while (y < y1) + { + uint bin_ix = (uint(y) * width_in_bins) + uint(x); + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0u) + { + uint idx = uint(int(countbits(out_mask & (my_mask - 1u)))); + if (my_slice > 0u) + { + idx += count[my_slice - 1u][bin_ix]; + } + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + (idx * 4u); + BinInstanceRef _606 = { out_offset }; + BinInstance _608 = { element_ix }; + Alloc param_14 = out_alloc; + BinInstanceRef param_15 = _606; + BinInstance param_16 = _608; + BinInstance_write(param_14, param_15, param_16); + } + x++; + if (x == x1) + { + x = x0; + y++; + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl new file mode 100644 index 0000000..f6e0505 --- /dev/null +++ b/piet-gpu/shader/gen/binning.msl @@ -0,0 +1,350 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoEndClipRef +{ + uint offset; +}; + +struct AnnoEndClip +{ + float4 bbox; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_84, constant uint& v_84BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_84.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_84, v_84BufferSize); + return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_84, constant uint& v_84BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_84, v_84BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_84, v_84BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_84, v_84BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_84, v_84BufferSize); + AnnoEndClip s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + return s; +} + +static inline __attribute__((always_inline)) +AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize) +{ + Alloc param = a; + AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u }; + return AnnoEndClip_read(param, param_1, v_84, v_84BufferSize); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_84, constant uint& v_84BufferSize) +{ + uint _90 = atomic_fetch_add_explicit((device atomic_uint*)&v_84.mem_offset, size, memory_order_relaxed); + uint offset = _90; + MallocResult r; + r.failed = (offset + size) > uint(int((v_84BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _119 = atomic_fetch_max_explicit((device atomic_uint*)&v_84.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_84, constant uint& v_84BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_84.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_84, constant uint& v_84BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.element_ix; + write_mem(param, param_1, param_2, v_84, v_84BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_84 [[buffer(0)]], const device ConfigBuf& _253 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint bitmaps[8][256]; + threadgroup short sh_alloc_failed; + threadgroup uint count[8][256]; + threadgroup Alloc sh_chunk_alloc[256]; + constant uint& v_84BufferSize = spvBufferSizeConstants[0]; + uint my_n_elements = _253.conf.n_elements; + uint my_partition = gl_WorkGroupID.x; + for (uint i = 0u; i < 8u; i++) + { + bitmaps[i][gl_LocalInvocationID.x] = 0u; + } + if (gl_LocalInvocationID.x == 0u) + { + sh_alloc_failed = short(false); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; + AnnotatedRef ref = AnnotatedRef{ _253.conf.anno_alloc.offset + (element_ix * 40u) }; + uint tag = 0u; + if (element_ix < my_n_elements) + { + Alloc param; + param.offset = _253.conf.anno_alloc.offset; + AnnotatedRef param_1 = ref; + tag = Annotated_tag(param, param_1, v_84, v_84BufferSize).tag; + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + switch (tag) + { + case 1u: + case 2u: + case 3u: + case 4u: + case 5u: + { + Alloc param_2; + param_2.offset = _253.conf.anno_alloc.offset; + AnnotatedRef param_3 = ref; + AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_84, v_84BufferSize); + x0 = int(floor(clip.bbox.x * 0.00390625)); + y0 = int(floor(clip.bbox.y * 0.00390625)); + x1 = int(ceil(clip.bbox.z * 0.00390625)); + y1 = int(ceil(clip.bbox.w * 0.00390625)); + break; + } + } + uint width_in_bins = ((_253.conf.width_in_tiles + 16u) - 1u) / 16u; + uint height_in_bins = ((_253.conf.height_in_tiles + 16u) - 1u) / 16u; + x0 = clamp(x0, 0, int(width_in_bins)); + x1 = clamp(x1, x0, int(width_in_bins)); + y0 = clamp(y0, 0, int(height_in_bins)); + y1 = clamp(y1, y0, int(height_in_bins)); + if (x0 == x1) + { + y1 = y0; + } + int x = x0; + int y = y0; + uint my_slice = gl_LocalInvocationID.x / 32u; + uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u)); + while (y < y1) + { + uint _438 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); + x++; + if (x == x1) + { + x = x0; + y++; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint element_count = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x]))); + count[i_1][gl_LocalInvocationID.x] = element_count; + } + uint param_4 = 0u; + uint param_5 = 0u; + bool param_6 = true; + Alloc chunk_alloc = new_alloc(param_4, param_5, param_6); + if (element_count != 0u) + { + uint param_7 = element_count * 4u; + MallocResult _488 = malloc(param_7, v_84, v_84BufferSize); + MallocResult chunk = _488; + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) + { + sh_alloc_failed = short(true); + } + } + uint out_ix = (_253.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc param_8; + param_8.offset = _253.conf.bin_alloc.offset; + uint param_9 = out_ix; + uint param_10 = element_count; + write_mem(param_8, param_9, param_10, v_84, v_84BufferSize); + Alloc param_11; + param_11.offset = _253.conf.bin_alloc.offset; + uint param_12 = out_ix + 1u; + uint param_13 = chunk_alloc.offset; + write_mem(param_11, param_12, param_13, v_84, v_84BufferSize); + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _544; + if (!bool(sh_alloc_failed)) + { + _544 = v_84.mem_error != 0u; + } + else + { + _544 = bool(sh_alloc_failed); + } + if (_544) + { + return; + } + x = x0; + y = y0; + while (y < y1) + { + uint bin_ix = (uint(y) * width_in_bins) + uint(x); + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0u) + { + uint idx = uint(int(popcount(out_mask & (my_mask - 1u)))); + if (my_slice > 0u) + { + idx += count[my_slice - 1u][bin_ix]; + } + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + (idx * 4u); + Alloc param_14 = out_alloc; + BinInstanceRef param_15 = BinInstanceRef{ out_offset }; + BinInstance param_16 = BinInstance{ element_ix }; + BinInstance_write(param_14, param_15, param_16, v_84, v_84BufferSize); + } + x++; + if (x == x1) + { + x = x0; + y++; + } + } +} + diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/gen/binning.spv similarity index 100% rename from piet-gpu/shader/binning.spv rename to piet-gpu/shader/gen/binning.spv diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil new file mode 100644 index 0000000..16d47ce Binary files /dev/null and b/piet-gpu/shader/gen/coarse.dxil differ diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl new file mode 100644 index 0000000..bc96cea --- /dev/null +++ b/piet-gpu/shader/gen/coarse.hlsl @@ -0,0 +1,1386 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoImageRef +{ + uint offset; +}; + +struct AnnoImage +{ + float4 bbox; + float linewidth; + uint index; + int2 offset; +}; + +struct AnnoColorRef +{ + uint offset; +}; + +struct AnnoColor +{ + float4 bbox; + float linewidth; + uint rgba_color; +}; + +struct AnnoLinGradientRef +{ + uint offset; +}; + +struct AnnoLinGradient +{ + float4 bbox; + float linewidth; + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct AnnoBeginClipRef +{ + uint offset; +}; + +struct AnnoBeginClip +{ + float4 bbox; + float linewidth; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct Tile +{ + TileSegRef tile; + int backdrop; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _296 : register(u0, space0); +ByteAddressBuffer _1249 : register(t1, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint sh_bitmaps[8][256]; +groupshared Alloc sh_part_elements[256]; +groupshared uint sh_part_count[256]; +groupshared uint sh_elements[256]; +groupshared uint sh_tile_stride[256]; +groupshared uint sh_tile_width[256]; +groupshared uint sh_tile_x0[256]; +groupshared uint sh_tile_y0[256]; +groupshared uint sh_tile_base[256]; +groupshared uint sh_tile_count[256]; + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _373 = { a.offset + offset }; + return _373; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _296.Load(offset * 4 + 8); + return v; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) +{ + BinInstanceRef _754 = { ref.offset + (index * 4u) }; + return _754; +} + +BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + BinInstance s; + s.element_ix = raw0; + return s; +} + +AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + AnnotatedTag _706 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _706; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _814 = { raw2 }; + s.tiles = _814; + return s; +} + +void write_tile_alloc(uint el_ix, Alloc a) +{ +} + +Alloc read_tile_alloc(uint el_ix, bool mem_ok) +{ + uint _1135; + _296.GetDimensions(_1135); + _1135 = (_1135 - 8) / 4; + uint param = 0u; + uint param_1 = uint(int(_1135) * 4); + bool param_2 = mem_ok; + return new_alloc(param, param_1, param_2); +} + +Tile Tile_read(Alloc a, TileRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + TileSegRef _839 = { raw0 }; + Tile s; + s.tile = _839; + s.backdrop = int(raw1); + return s; +} + +AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + AnnoColor s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.linewidth = asfloat(raw4); + s.rgba_color = raw5; + return s; +} + +AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref) +{ + AnnoColorRef _712 = { ref.offset + 4u }; + Alloc param = a; + AnnoColorRef param_1 = _712; + return AnnoColor_read(param, param_1); +} + +MallocResult malloc(uint size) +{ + uint _302; + _296.InterlockedAdd(0, size, _302); + uint offset = _302; + uint _309; + _296.GetDimensions(_309); + _309 = (_309 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_309) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _331; + _296.InterlockedMax(4, 1u, _331); + return r; + } + return r; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _296.Store(offset * 4 + 8, val); +} + +void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.new_ref; + write_mem(param, param_1, param_2); +} + +void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 10u; + write_mem(param, param_1, param_2); + CmdJumpRef _1128 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdJumpRef param_4 = _1128; + CmdJump param_5 = s; + CmdJump_write(param_3, param_4, param_5); +} + +bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) +{ + if (cmd_ref.offset < cmd_limit) + { + return true; + } + uint param = 1024u; + MallocResult _1156 = malloc(param); + MallocResult new_cmd = _1156; + if (new_cmd.failed) + { + return false; + } + CmdJump _1166 = { new_cmd.alloc.offset }; + CmdJump jump = _1166; + Alloc param_1 = cmd_alloc; + CmdRef param_2 = cmd_ref; + CmdJump param_3 = jump; + Cmd_Jump_write(param_1, param_2, param_3); + cmd_alloc = new_cmd.alloc; + CmdRef _1178 = { cmd_alloc.offset }; + cmd_ref = _1178; + cmd_limit = (cmd_alloc.offset + 1024u) - 60u; + return true; +} + +uint fill_mode_from_flags(uint flags) +{ + return flags & 1u; +} + +void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = uint(s.backdrop); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 1u; + write_mem(param, param_1, param_2); + CmdFillRef _1012 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdFillRef param_4 = _1012; + CmdFill param_5 = s; + CmdFill_write(param_3, param_4, param_5); +} + +void Cmd_Solid_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 3u; + write_mem(param, param_1, param_2); +} + +void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.half_width); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 2u; + write_mem(param, param_1, param_2); + CmdStrokeRef _1030 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdStrokeRef param_4 = _1030; + CmdStroke param_5 = s; + CmdStroke_write(param_3, param_4, param_5); +} + +void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth) +{ + uint param = flags; + if (fill_mode_from_flags(param) == 0u) + { + if (tile.tile.offset != 0u) + { + CmdFill _1202 = { tile.tile.offset, tile.backdrop }; + CmdFill cmd_fill = _1202; + Alloc param_1 = alloc; + CmdRef param_2 = cmd_ref; + CmdFill param_3 = cmd_fill; + Cmd_Fill_write(param_1, param_2, param_3); + cmd_ref.offset += 12u; + } + else + { + Alloc param_4 = alloc; + CmdRef param_5 = cmd_ref; + Cmd_Solid_write(param_4, param_5); + cmd_ref.offset += 4u; + } + } + else + { + CmdStroke _1232 = { tile.tile.offset, 0.5f * linewidth }; + CmdStroke cmd_stroke = _1232; + Alloc param_6 = alloc; + CmdRef param_7 = cmd_ref; + CmdStroke param_8 = cmd_stroke; + Cmd_Stroke_write(param_6, param_7, param_8); + cmd_ref.offset += 12u; + } +} + +void CmdColor_write(Alloc a, CmdColorRef ref, CmdColor s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.rgba_color; + write_mem(param, param_1, param_2); +} + +void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 5u; + write_mem(param, param_1, param_2); + CmdColorRef _1056 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdColorRef param_4 = _1056; + CmdColor param_5 = s; + CmdColor_write(param_3, param_4, param_5); +} + +AnnoLinGradient AnnoLinGradient_read(Alloc a, AnnoLinGradientRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17); + AnnoLinGradient s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.linewidth = asfloat(raw4); + s.index = raw5; + s.line_x = asfloat(raw6); + s.line_y = asfloat(raw7); + s.line_c = asfloat(raw8); + return s; +} + +AnnoLinGradient Annotated_LinGradient_read(Alloc a, AnnotatedRef ref) +{ + AnnoLinGradientRef _722 = { ref.offset + 4u }; + Alloc param = a; + AnnoLinGradientRef param_1 = _722; + return AnnoLinGradient_read(param, param_1); +} + +void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.line_x); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.line_y); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.line_c); + write_mem(param_9, param_10, param_11); +} + +void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 6u; + write_mem(param, param_1, param_2); + CmdLinGradRef _1074 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdLinGradRef param_4 = _1074; + CmdLinGrad param_5 = s; + CmdLinGrad_write(param_3, param_4, param_5); +} + +AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + AnnoImage s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.linewidth = asfloat(raw4); + s.index = raw5; + s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16); + return s; +} + +AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref) +{ + AnnoImageRef _732 = { ref.offset + 4u }; + Alloc param = a; + AnnoImageRef param_1 = _732; + return AnnoImage_read(param, param_1); +} + +void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 7u; + write_mem(param, param_1, param_2); + CmdImageRef _1092 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdImageRef param_4 = _1092; + CmdImage param_5 = s; + CmdImage_write(param_3, param_4, param_5); +} + +AnnoBeginClip AnnoBeginClip_read(Alloc a, AnnoBeginClipRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + AnnoBeginClip s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.linewidth = asfloat(raw4); + return s; +} + +AnnoBeginClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) +{ + AnnoBeginClipRef _742 = { ref.offset + 4u }; + Alloc param = a; + AnnoBeginClipRef param_1 = _742; + return AnnoBeginClip_read(param, param_1); +} + +void Cmd_BeginClip_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 8u; + write_mem(param, param_1, param_2); +} + +void Cmd_EndClip_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 9u; + write_mem(param, param_1, param_2); +} + +void Cmd_End_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 0u; + write_mem(param, param_1, param_2); +} + +void comp_main() +{ + uint width_in_bins = ((_1249.Load(8) + 16u) - 1u) / 16u; + uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; + uint partition_ix = 0u; + uint n_partitions = ((_1249.Load(0) + 256u) - 1u) / 256u; + uint th_ix = gl_LocalInvocationID.x; + uint bin_tile_x = 16u * gl_WorkGroupID.x; + uint bin_tile_y = 16u * gl_WorkGroupID.y; + uint tile_x = gl_LocalInvocationID.x % 16u; + uint tile_y = gl_LocalInvocationID.x / 16u; + uint this_tile_ix = (((bin_tile_y + tile_y) * _1249.Load(8)) + bin_tile_x) + tile_x; + Alloc _1314; + _1314.offset = _1249.Load(24); + Alloc param; + param.offset = _1314.offset; + uint param_1 = this_tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef _1323 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1323; + uint cmd_limit = (cmd_ref.offset + 1024u) - 60u; + uint clip_depth = 0u; + uint clip_zero_depth = 0u; + uint clip_one_mask = 0u; + uint rd_ix = 0u; + uint wr_ix = 0u; + uint part_start_ix = 0u; + uint ready_ix = 0u; + bool mem_ok = _296.Load(4) == 0u; + Alloc param_3; + Alloc param_5; + uint _1529; + uint element_ix; + AnnotatedRef ref; + Alloc param_14; + Alloc param_16; + uint tile_count; + Alloc param_23; + uint _1841; + Alloc param_29; + Tile tile_1; + AnnoColor fill; + Alloc param_35; + Alloc param_52; + CmdLinGrad cmd_lin; + Alloc param_69; + Alloc param_86; + while (true) + { + for (uint i = 0u; i < 8u; i++) + { + sh_bitmaps[i][th_ix] = 0u; + } + bool _1581; + for (;;) + { + if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) + { + part_start_ix = ready_ix; + uint count = 0u; + bool _1379 = th_ix < 256u; + bool _1387; + if (_1379) + { + _1387 = (partition_ix + th_ix) < n_partitions; + } + else + { + _1387 = _1379; + } + if (_1387) + { + uint in_ix = (_1249.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + Alloc _1404; + _1404.offset = _1249.Load(20); + param_3.offset = _1404.offset; + uint param_4 = in_ix; + count = read_mem(param_3, param_4); + Alloc _1415; + _1415.offset = _1249.Load(20); + param_5.offset = _1415.offset; + uint param_6 = in_ix + 1u; + uint offset = read_mem(param_5, param_6); + uint param_7 = offset; + uint param_8 = count * 4u; + bool param_9 = mem_ok; + sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9); + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + if (th_ix < 256u) + { + sh_part_count[th_ix] = count; + } + GroupMemoryBarrierWithGroupSync(); + if (th_ix < 256u) + { + if (th_ix >= (1u << i_1)) + { + count += sh_part_count[th_ix - (1u << i_1)]; + } + } + GroupMemoryBarrierWithGroupSync(); + } + if (th_ix < 256u) + { + sh_part_count[th_ix] = part_start_ix + count; + } + GroupMemoryBarrierWithGroupSync(); + ready_ix = sh_part_count[255]; + partition_ix += 256u; + } + uint ix = rd_ix + th_ix; + if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok) + { + uint part_ix = 0u; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + uint probe = part_ix + (128u >> i_2); + if (ix >= sh_part_count[probe - 1u]) + { + part_ix = probe; + } + } + if (part_ix > 0u) + { + _1529 = sh_part_count[part_ix - 1u]; + } + else + { + _1529 = part_start_ix; + } + ix -= _1529; + Alloc bin_alloc = sh_part_elements[part_ix]; + BinInstanceRef _1548 = { bin_alloc.offset }; + BinInstanceRef inst_ref = _1548; + BinInstanceRef param_10 = inst_ref; + uint param_11 = ix; + Alloc param_12 = bin_alloc; + BinInstanceRef param_13 = BinInstance_index(param_10, param_11); + BinInstance inst = BinInstance_read(param_12, param_13); + sh_elements[th_ix] = inst.element_ix; + } + GroupMemoryBarrierWithGroupSync(); + wr_ix = min((rd_ix + 256u), ready_ix); + bool _1571 = (wr_ix - rd_ix) < 256u; + if (_1571) + { + _1581 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + } + else + { + _1581 = _1571; + } + if (_1581) + { + continue; + } + else + { + break; + } + } + uint tag = 0u; + if ((th_ix + rd_ix) < wr_ix) + { + element_ix = sh_elements[th_ix]; + AnnotatedRef _1602 = { _1249.Load(32) + (element_ix * 40u) }; + ref = _1602; + Alloc _1605; + _1605.offset = _1249.Load(32); + param_14.offset = _1605.offset; + AnnotatedRef param_15 = ref; + tag = Annotated_tag(param_14, param_15).tag; + } + switch (tag) + { + case 1u: + case 3u: + case 2u: + case 4u: + case 5u: + { + uint path_ix = element_ix; + PathRef _1624 = { _1249.Load(16) + (path_ix * 12u) }; + Alloc _1627; + _1627.offset = _1249.Load(16); + param_16.offset = _1627.offset; + PathRef param_17 = _1624; + Path path = Path_read(param_16, param_17); + uint stride = path.bbox.z - path.bbox.x; + sh_tile_stride[th_ix] = stride; + int dx = int(path.bbox.x) - int(bin_tile_x); + int dy = int(path.bbox.y) - int(bin_tile_y); + int x0 = clamp(dx, 0, 16); + int y0 = clamp(dy, 0, 16); + int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16); + int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16); + sh_tile_width[th_ix] = uint(x1 - x0); + sh_tile_x0[th_ix] = uint(x0); + sh_tile_y0[th_ix] = uint(y0); + tile_count = uint(x1 - x0) * uint(y1 - y0); + uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); + sh_tile_base[th_ix] = base; + uint param_18 = path.tiles.offset; + uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_20 = mem_ok; + Alloc path_alloc = new_alloc(param_18, param_19, param_20); + uint param_21 = th_ix; + Alloc param_22 = path_alloc; + write_tile_alloc(param_21, param_22); + break; + } + default: + { + tile_count = 0u; + break; + } + } + sh_tile_count[th_ix] = tile_count; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + GroupMemoryBarrierWithGroupSync(); + if (th_ix >= (1u << i_3)) + { + tile_count += sh_tile_count[th_ix - (1u << i_3)]; + } + GroupMemoryBarrierWithGroupSync(); + sh_tile_count[th_ix] = tile_count; + } + GroupMemoryBarrierWithGroupSync(); + uint total_tile_count = sh_tile_count[255]; + for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u) + { + uint el_ix = 0u; + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + uint probe_1 = el_ix + (128u >> i_4); + if (ix_1 >= sh_tile_count[probe_1 - 1u]) + { + el_ix = probe_1; + } + } + AnnotatedRef _1826 = { _1249.Load(32) + (sh_elements[el_ix] * 40u) }; + AnnotatedRef ref_1 = _1826; + Alloc _1830; + _1830.offset = _1249.Load(32); + param_23.offset = _1830.offset; + AnnotatedRef param_24 = ref_1; + uint tag_1 = Annotated_tag(param_23, param_24).tag; + if (el_ix > 0u) + { + _1841 = sh_tile_count[el_ix - 1u]; + } + else + { + _1841 = 0u; + } + uint seq_ix = ix_1 - _1841; + uint width = sh_tile_width[el_ix]; + uint x = sh_tile_x0[el_ix] + (seq_ix % width); + uint y = sh_tile_y0[el_ix] + (seq_ix / width); + bool include_tile = false; + if ((tag_1 == 4u) || (tag_1 == 5u)) + { + include_tile = true; + } + else + { + if (mem_ok) + { + uint param_25 = el_ix; + bool param_26 = mem_ok; + TileRef _1901 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Alloc param_27 = read_tile_alloc(param_25, param_26); + TileRef param_28 = _1901; + Tile tile = Tile_read(param_27, param_28); + bool _1907 = tile.tile.offset != 0u; + bool _1914; + if (!_1907) + { + _1914 = tile.backdrop != 0; + } + else + { + _1914 = _1907; + } + include_tile = _1914; + } + } + if (include_tile) + { + uint el_slice = el_ix / 32u; + uint el_mask = 1u << (el_ix & 31u); + uint _1934; + InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1934); + } + } + GroupMemoryBarrierWithGroupSync(); + uint slice_ix = 0u; + uint bitmap = sh_bitmaps[0][th_ix]; + while (mem_ok) + { + if (bitmap == 0u) + { + slice_ix++; + if (slice_ix == 8u) + { + break; + } + bitmap = sh_bitmaps[slice_ix][th_ix]; + if (bitmap == 0u) + { + continue; + } + } + uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap))); + uint element_ix_1 = sh_elements[element_ref_ix]; + bitmap &= (bitmap - 1u); + AnnotatedRef _1988 = { _1249.Load(32) + (element_ix_1 * 40u) }; + ref = _1988; + Alloc _1993; + _1993.offset = _1249.Load(32); + param_29.offset = _1993.offset; + AnnotatedRef param_30 = ref; + AnnotatedTag tag_2 = Annotated_tag(param_29, param_30); + if (clip_zero_depth == 0u) + { + switch (tag_2.tag) + { + case 1u: + { + uint param_31 = element_ref_ix; + bool param_32 = mem_ok; + TileRef _2029 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_33 = read_tile_alloc(param_31, param_32); + TileRef param_34 = _2029; + tile_1 = Tile_read(param_33, param_34); + Alloc _2036; + _2036.offset = _1249.Load(32); + param_35.offset = _2036.offset; + AnnotatedRef param_36 = ref; + fill = Annotated_Color_read(param_35, param_36); + Alloc param_37 = cmd_alloc; + CmdRef param_38 = cmd_ref; + uint param_39 = cmd_limit; + bool _2048 = alloc_cmd(param_37, param_38, param_39); + cmd_alloc = param_37; + cmd_ref = param_38; + cmd_limit = param_39; + if (!_2048) + { + break; + } + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + uint param_42 = tag_2.flags; + Tile param_43 = tile_1; + float param_44 = fill.linewidth; + write_fill(param_40, param_41, param_42, param_43, param_44); + cmd_ref = param_41; + CmdColor _2072 = { fill.rgba_color }; + Alloc param_45 = cmd_alloc; + CmdRef param_46 = cmd_ref; + CmdColor param_47 = _2072; + Cmd_Color_write(param_45, param_46, param_47); + cmd_ref.offset += 8u; + break; + } + case 2u: + { + uint param_48 = element_ref_ix; + bool param_49 = mem_ok; + TileRef _2101 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_50 = read_tile_alloc(param_48, param_49); + TileRef param_51 = _2101; + tile_1 = Tile_read(param_50, param_51); + Alloc _2108; + _2108.offset = _1249.Load(32); + param_52.offset = _2108.offset; + AnnotatedRef param_53 = ref; + AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53); + Alloc param_54 = cmd_alloc; + CmdRef param_55 = cmd_ref; + uint param_56 = cmd_limit; + bool _2120 = alloc_cmd(param_54, param_55, param_56); + cmd_alloc = param_54; + cmd_ref = param_55; + cmd_limit = param_56; + if (!_2120) + { + break; + } + Alloc param_57 = cmd_alloc; + CmdRef param_58 = cmd_ref; + uint param_59 = tag_2.flags; + Tile param_60 = tile_1; + float param_61 = fill.linewidth; + write_fill(param_57, param_58, param_59, param_60, param_61); + cmd_ref = param_58; + cmd_lin.index = lin.index; + cmd_lin.line_x = lin.line_x; + cmd_lin.line_y = lin.line_y; + cmd_lin.line_c = lin.line_c; + Alloc param_62 = cmd_alloc; + CmdRef param_63 = cmd_ref; + CmdLinGrad param_64 = cmd_lin; + Cmd_LinGrad_write(param_62, param_63, param_64); + cmd_ref.offset += 20u; + break; + } + case 3u: + { + uint param_65 = element_ref_ix; + bool param_66 = mem_ok; + TileRef _2185 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_67 = read_tile_alloc(param_65, param_66); + TileRef param_68 = _2185; + tile_1 = Tile_read(param_67, param_68); + Alloc _2192; + _2192.offset = _1249.Load(32); + param_69.offset = _2192.offset; + AnnotatedRef param_70 = ref; + AnnoImage fill_img = Annotated_Image_read(param_69, param_70); + Alloc param_71 = cmd_alloc; + CmdRef param_72 = cmd_ref; + uint param_73 = cmd_limit; + bool _2204 = alloc_cmd(param_71, param_72, param_73); + cmd_alloc = param_71; + cmd_ref = param_72; + cmd_limit = param_73; + if (!_2204) + { + break; + } + Alloc param_74 = cmd_alloc; + CmdRef param_75 = cmd_ref; + uint param_76 = tag_2.flags; + Tile param_77 = tile_1; + float param_78 = fill_img.linewidth; + write_fill(param_74, param_75, param_76, param_77, param_78); + cmd_ref = param_75; + CmdImage _2230 = { fill_img.index, fill_img.offset }; + Alloc param_79 = cmd_alloc; + CmdRef param_80 = cmd_ref; + CmdImage param_81 = _2230; + Cmd_Image_write(param_79, param_80, param_81); + cmd_ref.offset += 12u; + break; + } + case 4u: + { + uint param_82 = element_ref_ix; + bool param_83 = mem_ok; + TileRef _2259 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_84 = read_tile_alloc(param_82, param_83); + TileRef param_85 = _2259; + tile_1 = Tile_read(param_84, param_85); + bool _2265 = tile_1.tile.offset == 0u; + bool _2271; + if (_2265) + { + _2271 = tile_1.backdrop == 0; + } + else + { + _2271 = _2265; + } + if (_2271) + { + clip_zero_depth = clip_depth + 1u; + } + else + { + if ((tile_1.tile.offset == 0u) && (clip_depth < 32u)) + { + clip_one_mask |= (1u << clip_depth); + } + else + { + Alloc _2293; + _2293.offset = _1249.Load(32); + param_86.offset = _2293.offset; + AnnotatedRef param_87 = ref; + AnnoBeginClip begin_clip = Annotated_BeginClip_read(param_86, param_87); + Alloc param_88 = cmd_alloc; + CmdRef param_89 = cmd_ref; + uint param_90 = cmd_limit; + bool _2305 = alloc_cmd(param_88, param_89, param_90); + cmd_alloc = param_88; + cmd_ref = param_89; + cmd_limit = param_90; + if (!_2305) + { + break; + } + Alloc param_91 = cmd_alloc; + CmdRef param_92 = cmd_ref; + uint param_93 = tag_2.flags; + Tile param_94 = tile_1; + float param_95 = begin_clip.linewidth; + write_fill(param_91, param_92, param_93, param_94, param_95); + cmd_ref = param_92; + Alloc param_96 = cmd_alloc; + CmdRef param_97 = cmd_ref; + Cmd_BeginClip_write(param_96, param_97); + cmd_ref.offset += 4u; + if (clip_depth < 32u) + { + clip_one_mask &= (~(1u << clip_depth)); + } + } + } + clip_depth++; + break; + } + case 5u: + { + clip_depth--; + bool _2351 = clip_depth >= 32u; + bool _2360; + if (!_2351) + { + _2360 = (clip_one_mask & (1u << clip_depth)) == 0u; + } + else + { + _2360 = _2351; + } + if (_2360) + { + Alloc param_98 = cmd_alloc; + CmdRef param_99 = cmd_ref; + uint param_100 = cmd_limit; + bool _2369 = alloc_cmd(param_98, param_99, param_100); + cmd_alloc = param_98; + cmd_ref = param_99; + cmd_limit = param_100; + if (!_2369) + { + break; + } + Alloc param_101 = cmd_alloc; + CmdRef param_102 = cmd_ref; + Cmd_Solid_write(param_101, param_102); + cmd_ref.offset += 4u; + Alloc param_103 = cmd_alloc; + CmdRef param_104 = cmd_ref; + Cmd_EndClip_write(param_103, param_104); + cmd_ref.offset += 4u; + } + break; + } + } + } + else + { + switch (tag_2.tag) + { + case 4u: + { + clip_depth++; + break; + } + case 5u: + { + if (clip_depth == clip_zero_depth) + { + clip_zero_depth = 0u; + } + clip_depth--; + break; + } + } + } + } + GroupMemoryBarrierWithGroupSync(); + rd_ix += 256u; + if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions)) + { + break; + } + } + bool _2432 = (bin_tile_x + tile_x) < _1249.Load(8); + bool _2441; + if (_2432) + { + _2441 = (bin_tile_y + tile_y) < _1249.Load(12); + } + else + { + _2441 = _2432; + } + if (_2441) + { + Alloc param_105 = cmd_alloc; + CmdRef param_106 = cmd_ref; + Cmd_End_write(param_105, param_106); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl new file mode 100644 index 0000000..096f710 --- /dev/null +++ b/piet-gpu/shader/gen/coarse.msl @@ -0,0 +1,1378 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +// Implementation of the GLSL findLSB() function +template +inline T spvFindLSB(T x) +{ + return select(ctz(x), T(-1), x == T(0)); +} + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoImageRef +{ + uint offset; +}; + +struct AnnoImage +{ + float4 bbox; + float linewidth; + uint index; + int2 offset; +}; + +struct AnnoColorRef +{ + uint offset; +}; + +struct AnnoColor +{ + float4 bbox; + float linewidth; + uint rgba_color; +}; + +struct AnnoLinGradientRef +{ + uint offset; +}; + +struct AnnoLinGradient +{ + float4 bbox; + float linewidth; + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct AnnoBeginClipRef +{ + uint offset; +}; + +struct AnnoBeginClip +{ + float4 bbox; + float linewidth; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct Tile +{ + TileSegRef tile; + int backdrop; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_296.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const uint& index) +{ + return BinInstanceRef{ ref.offset + (index * 4u) }; +} + +static inline __attribute__((always_inline)) +BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + BinInstance s; + s.element_ix = raw0; + return s; +} + +static inline __attribute__((always_inline)) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_296, v_296BufferSize); + return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a) +{ +} + +static inline __attribute__((always_inline)) +Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint param = 0u; + uint param_1 = uint(int((v_296BufferSize - 8) / 4) * 4); + bool param_2 = mem_ok; + return new_alloc(param, param_1, param_2); +} + +static inline __attribute__((always_inline)) +Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Tile s; + s.tile = TileSegRef{ raw0 }; + s.backdrop = int(raw1); + return s; +} + +static inline __attribute__((always_inline)) +AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize); + AnnoColor s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.linewidth = as_type(raw4); + s.rgba_color = raw5; + return s; +} + +static inline __attribute__((always_inline)) +AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + AnnoColorRef param_1 = AnnoColorRef{ ref.offset + 4u }; + return AnnoColor_read(param, param_1, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint _302 = atomic_fetch_add_explicit((device atomic_uint*)&v_296.mem_offset, size, memory_order_relaxed); + uint offset = _302; + MallocResult r; + r.failed = (offset + size) > uint(int((v_296BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _331 = atomic_fetch_max_explicit((device atomic_uint*)&v_296.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_296.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.new_ref; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 10u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u }; + CmdJump param_5 = s; + CmdJump_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_296, constant uint& v_296BufferSize) +{ + if (cmd_ref.offset < cmd_limit) + { + return true; + } + uint param = 1024u; + MallocResult _1156 = malloc(param, v_296, v_296BufferSize); + MallocResult new_cmd = _1156; + if (new_cmd.failed) + { + return false; + } + CmdJump jump = CmdJump{ new_cmd.alloc.offset }; + Alloc param_1 = cmd_alloc; + CmdRef param_2 = cmd_ref; + CmdJump param_3 = jump; + Cmd_Jump_write(param_1, param_2, param_3, v_296, v_296BufferSize); + cmd_alloc = new_cmd.alloc; + cmd_ref = CmdRef{ cmd_alloc.offset }; + cmd_limit = (cmd_alloc.offset + 1024u) - 60u; + return true; +} + +static inline __attribute__((always_inline)) +uint fill_mode_from_flags(thread const uint& flags) +{ + return flags & 1u; +} + +static inline __attribute__((always_inline)) +void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = uint(s.backdrop); + write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 1u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u }; + CmdFill param_5 = s; + CmdFill_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 3u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.half_width); + write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 2u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u }; + CmdStroke param_5 = s; + CmdStroke_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint param = flags; + if (fill_mode_from_flags(param) == 0u) + { + if (tile.tile.offset != 0u) + { + CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop }; + Alloc param_1 = alloc; + CmdRef param_2 = cmd_ref; + CmdFill param_3 = cmd_fill; + Cmd_Fill_write(param_1, param_2, param_3, v_296, v_296BufferSize); + cmd_ref.offset += 12u; + } + else + { + Alloc param_4 = alloc; + CmdRef param_5 = cmd_ref; + Cmd_Solid_write(param_4, param_5, v_296, v_296BufferSize); + cmd_ref.offset += 4u; + } + } + else + { + CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth }; + Alloc param_6 = alloc; + CmdRef param_7 = cmd_ref; + CmdStroke param_8 = cmd_stroke; + Cmd_Stroke_write(param_6, param_7, param_8, v_296, v_296BufferSize); + cmd_ref.offset += 12u; + } +} + +static inline __attribute__((always_inline)) +void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.rgba_color; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 5u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u }; + CmdColor param_5 = s; + CmdColor_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_296, v_296BufferSize); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15, v_296, v_296BufferSize); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17, v_296, v_296BufferSize); + AnnoLinGradient s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.linewidth = as_type(raw4); + s.index = raw5; + s.line_x = as_type(raw6); + s.line_y = as_type(raw7); + s.line_c = as_type(raw8); + return s; +} + +static inline __attribute__((always_inline)) +AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + AnnoLinGradientRef param_1 = AnnoLinGradientRef{ ref.offset + 4u }; + return AnnoLinGradient_read(param, param_1, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.line_x); + write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.line_y); + write_mem(param_6, param_7, param_8, v_296, v_296BufferSize); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.line_c); + write_mem(param_9, param_10, param_11, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 6u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u }; + CmdLinGrad param_5 = s; + CmdLinGrad_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_296, v_296BufferSize); + AnnoImage s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.linewidth = as_type(raw4); + s.index = raw5; + s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16); + return s; +} + +static inline __attribute__((always_inline)) +AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + AnnoImageRef param_1 = AnnoImageRef{ ref.offset + 4u }; + return AnnoImage_read(param, param_1, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); + write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 7u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u }; + CmdImage param_5 = s; + CmdImage_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +AnnoBeginClip AnnoBeginClip_read(thread const Alloc& a, thread const AnnoBeginClipRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + AnnoBeginClip s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.linewidth = as_type(raw4); + return s; +} + +static inline __attribute__((always_inline)) +AnnoBeginClip Annotated_BeginClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + AnnoBeginClipRef param_1 = AnnoBeginClipRef{ ref.offset + 4u }; + return AnnoBeginClip_read(param, param_1, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 8u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 9u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 0u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_296 [[buffer(0)]], const device ConfigBuf& _1249 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_bitmaps[8][256]; + threadgroup Alloc sh_part_elements[256]; + threadgroup uint sh_part_count[256]; + threadgroup uint sh_elements[256]; + threadgroup uint sh_tile_stride[256]; + threadgroup uint sh_tile_width[256]; + threadgroup uint sh_tile_x0[256]; + threadgroup uint sh_tile_y0[256]; + threadgroup uint sh_tile_base[256]; + threadgroup uint sh_tile_count[256]; + constant uint& v_296BufferSize = spvBufferSizeConstants[0]; + uint width_in_bins = ((_1249.conf.width_in_tiles + 16u) - 1u) / 16u; + uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; + uint partition_ix = 0u; + uint n_partitions = ((_1249.conf.n_elements + 256u) - 1u) / 256u; + uint th_ix = gl_LocalInvocationID.x; + uint bin_tile_x = 16u * gl_WorkGroupID.x; + uint bin_tile_y = 16u * gl_WorkGroupID.y; + uint tile_x = gl_LocalInvocationID.x % 16u; + uint tile_y = gl_LocalInvocationID.x / 16u; + uint this_tile_ix = (((bin_tile_y + tile_y) * _1249.conf.width_in_tiles) + bin_tile_x) + tile_x; + Alloc param; + param.offset = _1249.conf.ptcl_alloc.offset; + uint param_1 = this_tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint cmd_limit = (cmd_ref.offset + 1024u) - 60u; + uint clip_depth = 0u; + uint clip_zero_depth = 0u; + uint clip_one_mask = 0u; + uint rd_ix = 0u; + uint wr_ix = 0u; + uint part_start_ix = 0u; + uint ready_ix = 0u; + bool mem_ok = v_296.mem_error == 0u; + Alloc param_3; + Alloc param_5; + uint _1529; + uint element_ix; + AnnotatedRef ref; + Alloc param_14; + Alloc param_16; + uint tile_count; + Alloc param_23; + uint _1841; + Alloc param_29; + Tile tile_1; + AnnoColor fill; + Alloc param_35; + Alloc param_52; + CmdLinGrad cmd_lin; + Alloc param_69; + Alloc param_86; + while (true) + { + for (uint i = 0u; i < 8u; i++) + { + sh_bitmaps[i][th_ix] = 0u; + } + bool _1581; + for (;;) + { + if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) + { + part_start_ix = ready_ix; + uint count = 0u; + bool _1379 = th_ix < 256u; + bool _1387; + if (_1379) + { + _1387 = (partition_ix + th_ix) < n_partitions; + } + else + { + _1387 = _1379; + } + if (_1387) + { + uint in_ix = (_1249.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + param_3.offset = _1249.conf.bin_alloc.offset; + uint param_4 = in_ix; + count = read_mem(param_3, param_4, v_296, v_296BufferSize); + param_5.offset = _1249.conf.bin_alloc.offset; + uint param_6 = in_ix + 1u; + uint offset = read_mem(param_5, param_6, v_296, v_296BufferSize); + uint param_7 = offset; + uint param_8 = count * 4u; + bool param_9 = mem_ok; + sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9); + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + if (th_ix < 256u) + { + sh_part_count[th_ix] = count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix < 256u) + { + if (th_ix >= (1u << i_1)) + { + count += sh_part_count[th_ix - (1u << i_1)]; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + if (th_ix < 256u) + { + sh_part_count[th_ix] = part_start_ix + count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + ready_ix = sh_part_count[255]; + partition_ix += 256u; + } + uint ix = rd_ix + th_ix; + if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok) + { + uint part_ix = 0u; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + uint probe = part_ix + (128u >> i_2); + if (ix >= sh_part_count[probe - 1u]) + { + part_ix = probe; + } + } + if (part_ix > 0u) + { + _1529 = sh_part_count[part_ix - 1u]; + } + else + { + _1529 = part_start_ix; + } + ix -= _1529; + Alloc bin_alloc = sh_part_elements[part_ix]; + BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset }; + BinInstanceRef param_10 = inst_ref; + uint param_11 = ix; + Alloc param_12 = bin_alloc; + BinInstanceRef param_13 = BinInstance_index(param_10, param_11); + BinInstance inst = BinInstance_read(param_12, param_13, v_296, v_296BufferSize); + sh_elements[th_ix] = inst.element_ix; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + wr_ix = min((rd_ix + 256u), ready_ix); + bool _1571 = (wr_ix - rd_ix) < 256u; + if (_1571) + { + _1581 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + } + else + { + _1581 = _1571; + } + if (_1581) + { + continue; + } + else + { + break; + } + } + uint tag = 0u; + if ((th_ix + rd_ix) < wr_ix) + { + element_ix = sh_elements[th_ix]; + ref = AnnotatedRef{ _1249.conf.anno_alloc.offset + (element_ix * 40u) }; + param_14.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_15 = ref; + tag = Annotated_tag(param_14, param_15, v_296, v_296BufferSize).tag; + } + switch (tag) + { + case 1u: + case 3u: + case 2u: + case 4u: + case 5u: + { + uint path_ix = element_ix; + param_16.offset = _1249.conf.tile_alloc.offset; + PathRef param_17 = PathRef{ _1249.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_16, param_17, v_296, v_296BufferSize); + uint stride = path.bbox.z - path.bbox.x; + sh_tile_stride[th_ix] = stride; + int dx = int(path.bbox.x) - int(bin_tile_x); + int dy = int(path.bbox.y) - int(bin_tile_y); + int x0 = clamp(dx, 0, 16); + int y0 = clamp(dy, 0, 16); + int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16); + int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16); + sh_tile_width[th_ix] = uint(x1 - x0); + sh_tile_x0[th_ix] = uint(x0); + sh_tile_y0[th_ix] = uint(y0); + tile_count = uint(x1 - x0) * uint(y1 - y0); + uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); + sh_tile_base[th_ix] = base; + uint param_18 = path.tiles.offset; + uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_20 = mem_ok; + Alloc path_alloc = new_alloc(param_18, param_19, param_20); + uint param_21 = th_ix; + Alloc param_22 = path_alloc; + write_tile_alloc(param_21, param_22); + break; + } + default: + { + tile_count = 0u; + break; + } + } + sh_tile_count[th_ix] = tile_count; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix >= (1u << i_3)) + { + tile_count += sh_tile_count[th_ix - (1u << i_3)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tile_count[th_ix] = tile_count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_tile_count = sh_tile_count[255]; + for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u) + { + uint el_ix = 0u; + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + uint probe_1 = el_ix + (128u >> i_4); + if (ix_1 >= sh_tile_count[probe_1 - 1u]) + { + el_ix = probe_1; + } + } + AnnotatedRef ref_1 = AnnotatedRef{ _1249.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) }; + param_23.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_24 = ref_1; + uint tag_1 = Annotated_tag(param_23, param_24, v_296, v_296BufferSize).tag; + if (el_ix > 0u) + { + _1841 = sh_tile_count[el_ix - 1u]; + } + else + { + _1841 = 0u; + } + uint seq_ix = ix_1 - _1841; + uint width = sh_tile_width[el_ix]; + uint x = sh_tile_x0[el_ix] + (seq_ix % width); + uint y = sh_tile_y0[el_ix] + (seq_ix / width); + bool include_tile = false; + if ((tag_1 == 4u) || (tag_1 == 5u)) + { + include_tile = true; + } + else + { + if (mem_ok) + { + uint param_25 = el_ix; + bool param_26 = mem_ok; + Alloc param_27 = read_tile_alloc(param_25, param_26, v_296, v_296BufferSize); + TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Tile tile = Tile_read(param_27, param_28, v_296, v_296BufferSize); + bool _1907 = tile.tile.offset != 0u; + bool _1914; + if (!_1907) + { + _1914 = tile.backdrop != 0; + } + else + { + _1914 = _1907; + } + include_tile = _1914; + } + } + if (include_tile) + { + uint el_slice = el_ix / 32u; + uint el_mask = 1u << (el_ix & 31u); + uint _1934 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint slice_ix = 0u; + uint bitmap = sh_bitmaps[0][th_ix]; + while (mem_ok) + { + if (bitmap == 0u) + { + slice_ix++; + if (slice_ix == 8u) + { + break; + } + bitmap = sh_bitmaps[slice_ix][th_ix]; + if (bitmap == 0u) + { + continue; + } + } + uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap))); + uint element_ix_1 = sh_elements[element_ref_ix]; + bitmap &= (bitmap - 1u); + ref = AnnotatedRef{ _1249.conf.anno_alloc.offset + (element_ix_1 * 40u) }; + param_29.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_30 = ref; + AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_296, v_296BufferSize); + if (clip_zero_depth == 0u) + { + switch (tag_2.tag) + { + case 1u: + { + uint param_31 = element_ref_ix; + bool param_32 = mem_ok; + Alloc param_33 = read_tile_alloc(param_31, param_32, v_296, v_296BufferSize); + TileRef param_34 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + tile_1 = Tile_read(param_33, param_34, v_296, v_296BufferSize); + param_35.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_36 = ref; + fill = Annotated_Color_read(param_35, param_36, v_296, v_296BufferSize); + Alloc param_37 = cmd_alloc; + CmdRef param_38 = cmd_ref; + uint param_39 = cmd_limit; + bool _2048 = alloc_cmd(param_37, param_38, param_39, v_296, v_296BufferSize); + cmd_alloc = param_37; + cmd_ref = param_38; + cmd_limit = param_39; + if (!_2048) + { + break; + } + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + uint param_42 = tag_2.flags; + Tile param_43 = tile_1; + float param_44 = fill.linewidth; + write_fill(param_40, param_41, param_42, param_43, param_44, v_296, v_296BufferSize); + cmd_ref = param_41; + Alloc param_45 = cmd_alloc; + CmdRef param_46 = cmd_ref; + CmdColor param_47 = CmdColor{ fill.rgba_color }; + Cmd_Color_write(param_45, param_46, param_47, v_296, v_296BufferSize); + cmd_ref.offset += 8u; + break; + } + case 2u: + { + uint param_48 = element_ref_ix; + bool param_49 = mem_ok; + Alloc param_50 = read_tile_alloc(param_48, param_49, v_296, v_296BufferSize); + TileRef param_51 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + tile_1 = Tile_read(param_50, param_51, v_296, v_296BufferSize); + param_52.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_53 = ref; + AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_296, v_296BufferSize); + Alloc param_54 = cmd_alloc; + CmdRef param_55 = cmd_ref; + uint param_56 = cmd_limit; + bool _2120 = alloc_cmd(param_54, param_55, param_56, v_296, v_296BufferSize); + cmd_alloc = param_54; + cmd_ref = param_55; + cmd_limit = param_56; + if (!_2120) + { + break; + } + Alloc param_57 = cmd_alloc; + CmdRef param_58 = cmd_ref; + uint param_59 = tag_2.flags; + Tile param_60 = tile_1; + float param_61 = fill.linewidth; + write_fill(param_57, param_58, param_59, param_60, param_61, v_296, v_296BufferSize); + cmd_ref = param_58; + cmd_lin.index = lin.index; + cmd_lin.line_x = lin.line_x; + cmd_lin.line_y = lin.line_y; + cmd_lin.line_c = lin.line_c; + Alloc param_62 = cmd_alloc; + CmdRef param_63 = cmd_ref; + CmdLinGrad param_64 = cmd_lin; + Cmd_LinGrad_write(param_62, param_63, param_64, v_296, v_296BufferSize); + cmd_ref.offset += 20u; + break; + } + case 3u: + { + uint param_65 = element_ref_ix; + bool param_66 = mem_ok; + Alloc param_67 = read_tile_alloc(param_65, param_66, v_296, v_296BufferSize); + TileRef param_68 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + tile_1 = Tile_read(param_67, param_68, v_296, v_296BufferSize); + param_69.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_70 = ref; + AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_296, v_296BufferSize); + Alloc param_71 = cmd_alloc; + CmdRef param_72 = cmd_ref; + uint param_73 = cmd_limit; + bool _2204 = alloc_cmd(param_71, param_72, param_73, v_296, v_296BufferSize); + cmd_alloc = param_71; + cmd_ref = param_72; + cmd_limit = param_73; + if (!_2204) + { + break; + } + Alloc param_74 = cmd_alloc; + CmdRef param_75 = cmd_ref; + uint param_76 = tag_2.flags; + Tile param_77 = tile_1; + float param_78 = fill_img.linewidth; + write_fill(param_74, param_75, param_76, param_77, param_78, v_296, v_296BufferSize); + cmd_ref = param_75; + Alloc param_79 = cmd_alloc; + CmdRef param_80 = cmd_ref; + CmdImage param_81 = CmdImage{ fill_img.index, fill_img.offset }; + Cmd_Image_write(param_79, param_80, param_81, v_296, v_296BufferSize); + cmd_ref.offset += 12u; + break; + } + case 4u: + { + uint param_82 = element_ref_ix; + bool param_83 = mem_ok; + Alloc param_84 = read_tile_alloc(param_82, param_83, v_296, v_296BufferSize); + TileRef param_85 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + tile_1 = Tile_read(param_84, param_85, v_296, v_296BufferSize); + bool _2265 = tile_1.tile.offset == 0u; + bool _2271; + if (_2265) + { + _2271 = tile_1.backdrop == 0; + } + else + { + _2271 = _2265; + } + if (_2271) + { + clip_zero_depth = clip_depth + 1u; + } + else + { + if ((tile_1.tile.offset == 0u) && (clip_depth < 32u)) + { + clip_one_mask |= (1u << clip_depth); + } + else + { + param_86.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_87 = ref; + AnnoBeginClip begin_clip = Annotated_BeginClip_read(param_86, param_87, v_296, v_296BufferSize); + Alloc param_88 = cmd_alloc; + CmdRef param_89 = cmd_ref; + uint param_90 = cmd_limit; + bool _2305 = alloc_cmd(param_88, param_89, param_90, v_296, v_296BufferSize); + cmd_alloc = param_88; + cmd_ref = param_89; + cmd_limit = param_90; + if (!_2305) + { + break; + } + Alloc param_91 = cmd_alloc; + CmdRef param_92 = cmd_ref; + uint param_93 = tag_2.flags; + Tile param_94 = tile_1; + float param_95 = begin_clip.linewidth; + write_fill(param_91, param_92, param_93, param_94, param_95, v_296, v_296BufferSize); + cmd_ref = param_92; + Alloc param_96 = cmd_alloc; + CmdRef param_97 = cmd_ref; + Cmd_BeginClip_write(param_96, param_97, v_296, v_296BufferSize); + cmd_ref.offset += 4u; + if (clip_depth < 32u) + { + clip_one_mask &= (~(1u << clip_depth)); + } + } + } + clip_depth++; + break; + } + case 5u: + { + clip_depth--; + bool _2351 = clip_depth >= 32u; + bool _2360; + if (!_2351) + { + _2360 = (clip_one_mask & (1u << clip_depth)) == 0u; + } + else + { + _2360 = _2351; + } + if (_2360) + { + Alloc param_98 = cmd_alloc; + CmdRef param_99 = cmd_ref; + uint param_100 = cmd_limit; + bool _2369 = alloc_cmd(param_98, param_99, param_100, v_296, v_296BufferSize); + cmd_alloc = param_98; + cmd_ref = param_99; + cmd_limit = param_100; + if (!_2369) + { + break; + } + Alloc param_101 = cmd_alloc; + CmdRef param_102 = cmd_ref; + Cmd_Solid_write(param_101, param_102, v_296, v_296BufferSize); + cmd_ref.offset += 4u; + Alloc param_103 = cmd_alloc; + CmdRef param_104 = cmd_ref; + Cmd_EndClip_write(param_103, param_104, v_296, v_296BufferSize); + cmd_ref.offset += 4u; + } + break; + } + } + } + else + { + switch (tag_2.tag) + { + case 4u: + { + clip_depth++; + break; + } + case 5u: + { + if (clip_depth == clip_zero_depth) + { + clip_zero_depth = 0u; + } + clip_depth--; + break; + } + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + rd_ix += 256u; + if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions)) + { + break; + } + } + bool _2432 = (bin_tile_x + tile_x) < _1249.conf.width_in_tiles; + bool _2441; + if (_2432) + { + _2441 = (bin_tile_y + tile_y) < _1249.conf.height_in_tiles; + } + else + { + _2441 = _2432; + } + if (_2441) + { + Alloc param_105 = cmd_alloc; + CmdRef param_106 = cmd_ref; + Cmd_End_write(param_105, param_106, v_296, v_296BufferSize); + } +} + diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/gen/coarse.spv similarity index 61% rename from piet-gpu/shader/coarse.spv rename to piet-gpu/shader/gen/coarse.spv index 8d4f7c0..fbe025d 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl index 0ef9538..0dec2cd 100644 --- a/piet-gpu/shader/gen/draw_leaf.hlsl +++ b/piet-gpu/shader/gen/draw_leaf.hlsl @@ -158,10 +158,10 @@ static const DrawMonoid _443 = { 1u, 0u }; static const DrawMonoid _445 = { 1u, 1u }; static const DrawMonoid _447 = { 0u, 1u }; -RWByteAddressBuffer _201 : register(u0); -ByteAddressBuffer _225 : register(t2); -ByteAddressBuffer _1008 : register(t3); -ByteAddressBuffer _1042 : register(t1); +RWByteAddressBuffer _201 : register(u0, space0); +ByteAddressBuffer _225 : register(t2, space0); +ByteAddressBuffer _1008 : register(t3, space0); +ByteAddressBuffer _1042 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl index b28c956..216d923 100644 --- a/piet-gpu/shader/gen/draw_reduce.hlsl +++ b/piet-gpu/shader/gen/draw_reduce.hlsl @@ -49,10 +49,10 @@ static const DrawMonoid _90 = { 1u, 1u }; static const DrawMonoid _92 = { 0u, 1u }; static const DrawMonoid _94 = { 0u, 0u }; -ByteAddressBuffer _46 : register(t2); -RWByteAddressBuffer _203 : register(u3); -RWByteAddressBuffer _217 : register(u0); -ByteAddressBuffer _223 : register(t1); +ByteAddressBuffer _46 : register(t2, space0); +RWByteAddressBuffer _203 : register(u3, space0); +RWByteAddressBuffer _217 : register(u0, space0); +ByteAddressBuffer _223 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl index 7dc68b1..ec75d5c 100644 --- a/piet-gpu/shader/gen/draw_root.hlsl +++ b/piet-gpu/shader/gen/draw_root.hlsl @@ -8,7 +8,7 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); static const DrawMonoid _18 = { 0u, 0u }; -RWByteAddressBuffer _57 : register(u0); +RWByteAddressBuffer _57 : register(u0, space0); static uint3 gl_LocalInvocationID; static uint3 gl_GlobalInvocationID; diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil new file mode 100644 index 0000000..3b3c42e Binary files /dev/null and b/piet-gpu/shader/gen/kernel4.dxil differ diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl new file mode 100644 index 0000000..8b0699a --- /dev/null +++ b/piet-gpu/shader/gen/kernel4.hlsl @@ -0,0 +1,689 @@ +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 _vector; + float y_edge; + TileSegRef next; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); + +RWByteAddressBuffer _202 : register(u0, space0); +ByteAddressBuffer _723 : register(t1, space0); +RWTexture2D image_atlas : register(u3, space0); +RWTexture2D gradients : register(u4, space0); +RWTexture2D image : register(u2, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +uint spvPackUnorm4x8(float4 value) +{ + uint4 Packed = uint4(round(saturate(value) * 255.0)); + return Packed.x | (Packed.y << 8) | (Packed.z << 16) | (Packed.w << 24); +} + +float4 spvUnpackUnorm4x8(uint value) +{ + uint4 Packed = uint4(value & 0xff, (value >> 8) & 0xff, (value >> 16) & 0xff, value >> 24); + return float4(Packed) / 255.0; +} + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _215 = { a.offset + offset }; + return _215; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _202.Load(offset * 4 + 8); + return v; +} + +CmdTag Cmd_tag(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + CmdTag _432 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _432; +} + +CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = asfloat(raw1); + return s; +} + +CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) +{ + CmdStrokeRef _449 = { ref.offset + 4u }; + Alloc param = a; + CmdStrokeRef param_1 = _449; + return CmdStroke_read(param, param_1); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +TileSeg TileSeg_read(Alloc a, TileSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TileSeg s; + s.origin = float2(asfloat(raw0), asfloat(raw1)); + s._vector = float2(asfloat(raw2), asfloat(raw3)); + s.y_edge = asfloat(raw4); + TileSegRef _572 = { raw5 }; + s.next = _572; + return s; +} + +uint2 chunk_offset(uint i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +CmdFill CmdFill_read(Alloc a, CmdFillRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) +{ + CmdFillRef _439 = { ref.offset + 4u }; + Alloc param = a; + CmdFillRef param_1 = _439; + return CmdFill_read(param, param_1); +} + +CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdAlpha s; + s.alpha = asfloat(raw0); + return s; +} + +CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref) +{ + CmdAlphaRef _459 = { ref.offset + 4u }; + Alloc param = a; + CmdAlphaRef param_1 = _459; + return CmdAlpha_read(param, param_1); +} + +CmdColor CmdColor_read(Alloc a, CmdColorRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +CmdColor Cmd_Color_read(Alloc a, CmdRef ref) +{ + CmdColorRef _469 = { ref.offset + 4u }; + Alloc param = a; + CmdColorRef param_1 = _469; + return CmdColor_read(param, param_1); +} + +float3 fromsRGB(float3 srgb) +{ + bool3 cutoff = bool3(srgb.x >= 0.040449999272823333740234375f.xxx.x, srgb.y >= 0.040449999272823333740234375f.xxx.y, srgb.z >= 0.040449999272823333740234375f.xxx.z); + float3 below = srgb / 12.9200000762939453125f.xxx; + float3 above = pow((srgb + 0.054999999701976776123046875f.xxx) / 1.05499994754791259765625f.xxx, 2.400000095367431640625f.xxx); + return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z); +} + +float4 unpacksRGB(uint srgba) +{ + float4 color = spvUnpackUnorm4x8(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + CmdLinGrad s; + s.index = raw0; + s.line_x = asfloat(raw1); + s.line_y = asfloat(raw2); + s.line_c = asfloat(raw3); + return s; +} + +CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref) +{ + CmdLinGradRef _479 = { ref.offset + 4u }; + Alloc param = a; + CmdLinGradRef param_1 = _479; + return CmdLinGrad_read(param, param_1); +} + +CmdImage CmdImage_read(Alloc a, CmdImageRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +CmdImage Cmd_Image_read(Alloc a, CmdRef ref) +{ + CmdImageRef _489 = { ref.offset + 4u }; + Alloc param = a; + CmdImageRef param_1 = _489; + return CmdImage_read(param, param_1); +} + +void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img) +{ + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas[uv]; + float3 param_1 = fg_rgba.xyz; + float3 _695 = fromsRGB(param_1); + fg_rgba.x = _695.x; + fg_rgba.y = _695.y; + fg_rgba.z = _695.z; + rgba[i] = fg_rgba; + } + spvReturnValue = rgba; +} + +float3 tosRGB(float3 rgb) +{ + bool3 cutoff = bool3(rgb.x >= 0.003130800090730190277099609375f.xxx.x, rgb.y >= 0.003130800090730190277099609375f.xxx.y, rgb.z >= 0.003130800090730190277099609375f.xxx.z); + float3 below = 12.9200000762939453125f.xxx * rgb; + float3 above = (1.05499994754791259765625f.xxx * pow(rgb, 0.416660010814666748046875f.xxx)) - 0.054999999701976776123046875f.xxx; + return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z); +} + +uint packsRGB(inout float4 rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return spvPackUnorm4x8(rgba.wzyx); +} + +CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdJump s; + s.new_ref = raw0; + return s; +} + +CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) +{ + CmdJumpRef _499 = { ref.offset + 4u }; + Alloc param = a; + CmdJumpRef param_1 = _499; + return CmdJump_read(param, param_1); +} + +void comp_main() +{ + uint tile_ix = (gl_WorkGroupID.y * _723.Load(8)) + gl_WorkGroupID.x; + Alloc _738; + _738.offset = _723.Load(24); + Alloc param; + param.offset = _738.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef _747 = { cmd_alloc.offset }; + CmdRef cmd_ref = _747; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = 0.0f.xxxx; + } + uint clip_depth = 0u; + bool mem_ok = _202.Load(4) == 0u; + float df[8]; + TileSegRef tile_seg_ref; + float area[8]; + uint blend_stack[128][8]; + float blend_alpha_stack[128][8]; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4).tag; + if (tag == 0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0f; + } + TileSegRef _842 = { stroke.tile_ref }; + tile_seg_ref = _842; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11); + float2 line_vec = seg._vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + 0.5f.xx) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0f, 1.0f); + df[k_1] = min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = clamp((stroke.half_width + 0.5f) - df[k_2], 0.0f, 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + TileSegRef _964 = { fill.tile_ref }; + tile_seg_ref = _964; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1._vector; + float2 window = clamp(float2(start.y, end.y), 0.0f.xx, 1.0f.xx); + if (window.x != window.y) + { + float2 t_1 = (window - start.y.xx) / seg_1._vector.y.xx; + float2 xs = float2(lerp(start.x, end.x, t_1.x), lerp(start.x, end.x, t_1.y)); + float xmin = min(min(xs.x, xs.y), 1.0f) - 9.9999999747524270787835121154785e-07f; + float xmax = max(xs.x, xs.y); + float b = min(xmax, 1.0f); + float c = max(b, 0.0f); + float d = max(xmin, 0.0f); + float a = ((b + (0.5f * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1._vector.x) * clamp((my_xy.y - seg_1.y_edge) + 1.0f, 0.0f, 1.0f)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = min(abs(area[k_5]), 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0f; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0f - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); + float4 fg_rgba = gradients[int2(x, int(lin.index))]; + float3 param_29 = fg_rgba.xyz; + float3 _1298 = fromsRGB(param_29); + fg_rgba.x = _1298.x; + fg_rgba.y = _1298.y; + fg_rgba.z = _1298.z; + rgba[k_9] = fg_rgba; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_30, param_31); + uint2 param_32 = xy_uint; + CmdImage param_33 = fill_img; + float4 _1327[8]; + fillImage(_1327, param_32, param_33); + float4 img[8] = _1327; + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + float4 fg_k_1 = img[k_10] * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 12u; + break; + } + case 8u: + { + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + uint d_2 = min(clip_depth, 127u); + float4 param_34 = float4(rgba[k_11]); + uint _1390 = packsRGB(param_34); + blend_stack[d_2][k_11] = _1390; + blend_alpha_stack[d_2][k_11] = clamp(abs(area[k_11]), 0.0f, 1.0f); + rgba[k_11] = 0.0f.xxxx; + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 9u: + { + clip_depth--; + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + uint d_3 = min(clip_depth, 127u); + uint param_35 = blend_stack[d_3][k_12]; + float4 bg = unpacksRGB(param_35); + float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12]; + rgba[k_12] = (bg * (1.0f - fg_1.w)) + fg_1; + } + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_36 = cmd_alloc; + CmdRef param_37 = cmd_ref; + CmdRef _1469 = { Cmd_Jump_read(param_36, param_37).new_ref }; + cmd_ref = _1469; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_38 = i_1; + float3 param_39 = rgba[i_1].xyz; + image[int2(xy_uint + chunk_offset(param_38))] = float4(tosRGB(param_39), rgba[i_1].w); + } +} + +[numthreads(8, 4, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl new file mode 100644 index 0000000..9318cc8 --- /dev/null +++ b/piet-gpu/shader/gen/kernel4.msl @@ -0,0 +1,728 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 vector; + float y_edge; + TileSegRef next; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u); + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_202) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_202.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_202); + return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_202); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = as_type(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u }; + return CmdStroke_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_202); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_202); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_202); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_202); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_202); + TileSeg s; + s.origin = float2(as_type(raw0), as_type(raw1)); + s.vector = float2(as_type(raw2), as_type(raw3)); + s.y_edge = as_type(raw4); + s.next = TileSegRef{ raw5 }; + return s; +} + +static inline __attribute__((always_inline)) +uint2 chunk_offset(thread const uint& i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +static inline __attribute__((always_inline)) +CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_202); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u }; + return CmdFill_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + CmdAlpha s; + s.alpha = as_type(raw0); + return s; +} + +static inline __attribute__((always_inline)) +CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u }; + return CmdAlpha_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u }; + return CmdColor_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +float3 fromsRGB(thread const float3& srgb) +{ + bool3 cutoff = srgb >= float3(0.040449999272823333740234375); + float3 below = srgb / float3(12.9200000762939453125); + float3 above = pow((srgb + float3(0.054999999701976776123046875)) / float3(1.05499994754791259765625), float3(2.400000095367431640625)); + return select(below, above, cutoff); +} + +static inline __attribute__((always_inline)) +float4 unpacksRGB(thread const uint& srgba) +{ + float4 color = unpack_unorm4x8_to_float(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +static inline __attribute__((always_inline)) +CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_202); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_202); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_202); + CmdLinGrad s; + s.index = raw0; + s.line_x = as_type(raw1); + s.line_y = as_type(raw2); + s.line_c = as_type(raw3); + return s; +} + +static inline __attribute__((always_inline)) +CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u }; + return CmdLinGrad_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_202); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +static inline __attribute__((always_inline)) +CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u }; + return CmdImage_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +spvUnsafeArray fillImage(thread const uint2& xy, thread const CmdImage& cmd_img, thread texture2d image_atlas) +{ + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas.read(uint2(uv)); + float3 param_1 = fg_rgba.xyz; + float3 _695 = fromsRGB(param_1); + fg_rgba.x = _695.x; + fg_rgba.y = _695.y; + fg_rgba.z = _695.z; + rgba[i] = fg_rgba; + } + return rgba; +} + +static inline __attribute__((always_inline)) +float3 tosRGB(thread const float3& rgb) +{ + bool3 cutoff = rgb >= float3(0.003130800090730190277099609375); + float3 below = float3(12.9200000762939453125) * rgb; + float3 above = (float3(1.05499994754791259765625) * pow(rgb, float3(0.416660010814666748046875))) - float3(0.054999999701976776123046875); + return select(below, above, cutoff); +} + +static inline __attribute__((always_inline)) +uint packsRGB(thread float4& rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return pack_float_to_unorm4x8(rgba.wzyx); +} + +static inline __attribute__((always_inline)) +CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + CmdJump s; + s.new_ref = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u }; + return CmdJump_read(param, param_1, v_202); +} + +kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _723 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + uint tile_ix = (gl_WorkGroupID.y * _723.conf.width_in_tiles) + gl_WorkGroupID.x; + Alloc param; + param.offset = _723.conf.ptcl_alloc.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = float4(0.0); + } + uint clip_depth = 0u; + bool mem_ok = v_202.mem_error == 0u; + spvUnsafeArray df; + TileSegRef tile_seg_ref; + spvUnsafeArray area; + spvUnsafeArray, 128> blend_stack; + spvUnsafeArray, 128> blend_alpha_stack; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4, v_202).tag; + if (tag == 0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_202); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0; + } + tile_seg_ref = TileSegRef{ stroke.tile_ref }; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11, v_202); + float2 line_vec = seg.vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + float2(0.5)) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = fast::clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df[k_1] = fast::min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = fast::clamp((stroke.half_width + 0.5) - df[k_2], 0.0, 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14, v_202); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + tile_seg_ref = TileSegRef{ fill.tile_ref }; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19, v_202); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1.vector; + float2 window = fast::clamp(float2(start.y, end.y), float2(0.0), float2(1.0)); + if ((isunordered(window.x, window.y) || window.x != window.y)) + { + float2 t_1 = (window - float2(start.y)) / float2(seg_1.vector.y); + float2 xs = float2(mix(start.x, end.x, t_1.x), mix(start.x, end.x, t_1.y)); + float xmin = fast::min(fast::min(xs.x, xs.y), 1.0) - 9.9999999747524270787835121154785e-07; + float xmax = fast::max(xs.x, xs.y); + float b = fast::min(xmax, 1.0); + float c = fast::max(b, 0.0); + float d = fast::max(xmin, 0.0); + float a = ((b + (0.5 * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1.vector.x) * fast::clamp((my_xy.y - seg_1.y_edge) + 1.0, 0.0, 1.0)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = fast::min(abs(area[k_5]), 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_202); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24, v_202); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0 - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_202); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); + float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); + float3 param_29 = fg_rgba.xyz; + float3 _1298 = fromsRGB(param_29); + fg_rgba.x = _1298.x; + fg_rgba.y = _1298.y; + fg_rgba.z = _1298.z; + rgba[k_9] = fg_rgba; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_202); + uint2 param_32 = xy_uint; + CmdImage param_33 = fill_img; + spvUnsafeArray img; + img = fillImage(param_32, param_33, image_atlas); + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + float4 fg_k_1 = img[k_10] * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 12u; + break; + } + case 8u: + { + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + uint d_2 = min(clip_depth, 127u); + float4 param_34 = float4(rgba[k_11]); + uint _1390 = packsRGB(param_34); + blend_stack[d_2][k_11] = _1390; + blend_alpha_stack[d_2][k_11] = fast::clamp(abs(area[k_11]), 0.0, 1.0); + rgba[k_11] = float4(0.0); + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 9u: + { + clip_depth--; + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + uint d_3 = min(clip_depth, 127u); + uint param_35 = blend_stack[d_3][k_12]; + float4 bg = unpacksRGB(param_35); + float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12]; + rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1; + } + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_36 = cmd_alloc; + CmdRef param_37 = cmd_ref; + cmd_ref = CmdRef{ Cmd_Jump_read(param_36, param_37, v_202).new_ref }; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_38 = i_1; + float3 param_39 = rgba[i_1].xyz; + image.write(float4(tosRGB(param_39), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_38)))); + } +} + diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv similarity index 100% rename from piet-gpu/shader/kernel4.spv rename to piet-gpu/shader/gen/kernel4.spv diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil new file mode 100644 index 0000000..9fd593c Binary files /dev/null and b/piet-gpu/shader/gen/path_coarse.dxil differ diff --git a/piet-gpu/shader/gen/path_coarse.hlsl b/piet-gpu/shader/gen/path_coarse.hlsl new file mode 100644 index 0000000..6025bde --- /dev/null +++ b/piet-gpu/shader/gen/path_coarse.hlsl @@ -0,0 +1,664 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct PathSegTag +{ + uint tag; + uint flags; +}; + +struct TileRef +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 _vector; + float y_edge; + TileSegRef next; +}; + +struct SubdivResult +{ + float val; + float a0; + float a2; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u); + +static const PathSegTag _721 = { 0u, 0u }; + +RWByteAddressBuffer _136 : register(u0, space0); +ByteAddressBuffer _710 : register(t1, space0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _136.Load(offset * 4 + 8); + return v; +} + +PathSegTag PathSeg_tag(Alloc a, PathSegRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + PathSegTag _367 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _367; +} + +PathCubic PathCubic_read(Alloc a, PathCubicRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21); + Alloc param_22 = a; + uint param_23 = ix + 11u; + uint raw11 = read_mem(param_22, param_23); + PathCubic s; + s.p0 = float2(asfloat(raw0), asfloat(raw1)); + s.p1 = float2(asfloat(raw2), asfloat(raw3)); + s.p2 = float2(asfloat(raw4), asfloat(raw5)); + s.p3 = float2(asfloat(raw6), asfloat(raw7)); + s.path_ix = raw8; + s.trans_ix = raw9; + s.stroke = float2(asfloat(raw10), asfloat(raw11)); + return s; +} + +PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref) +{ + PathCubicRef _373 = { ref.offset + 4u }; + Alloc param = a; + PathCubicRef param_1 = _373; + return PathCubic_read(param, param_1); +} + +float2 eval_cubic(float2 p0, float2 p1, float2 p2, float2 p3, float t) +{ + float mt = 1.0f - t; + return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0f)) + (((p2 * (mt * 3.0f)) + (p3 * t)) * t)) * t); +} + +float approx_parabola_integral(float x) +{ + return x * rsqrt(sqrt(0.3300000131130218505859375f + (0.201511204242706298828125f + ((0.25f * x) * x)))); +} + +SubdivResult estimate_subdiv(float2 p0, float2 p1, float2 p2, float sqrt_tol) +{ + float2 d01 = p1 - p0; + float2 d12 = p2 - p1; + float2 dd = d01 - d12; + float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x); + float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross; + float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross; + float scale = abs(_cross / (length(dd) * (x2 - x0))); + float param = x0; + float a0 = approx_parabola_integral(param); + float param_1 = x2; + float a2 = approx_parabola_integral(param_1); + float val = 0.0f; + if (scale < 1000000000.0f) + { + float da = abs(a2 - a0); + float sqrt_scale = sqrt(scale); + if (sign(x0) == sign(x2)) + { + val = da * sqrt_scale; + } + else + { + float xmin = sqrt_tol / sqrt_scale; + float param_2 = xmin; + val = (sqrt_tol * da) / approx_parabola_integral(param_2); + } + } + SubdivResult _695 = { val, a0, a2 }; + return _695; +} + +uint fill_mode_from_flags(uint flags) +{ + return flags & 1u; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _427 = { raw2 }; + s.tiles = _427; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +float approx_parabola_inv_integral(float x) +{ + return x * sqrt(0.61000001430511474609375f + (0.1520999968051910400390625f + ((0.25f * x) * x))); +} + +float2 eval_quad(float2 p0, float2 p1, float2 p2, float t) +{ + float mt = 1.0f - t; + return (p0 * (mt * mt)) + (((p1 * (mt * 2.0f)) + (p2 * t)) * t); +} + +MallocResult malloc(uint size) +{ + uint _142; + _136.InterlockedAdd(0, size, _142); + uint offset = _142; + uint _149; + _136.GetDimensions(_149); + _149 = (_149 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_149) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _171; + _136.InterlockedMax(4, 1u, _171); + return r; + } + return r; +} + +TileRef Tile_index(TileRef ref, uint index) +{ + TileRef _385 = { ref.offset + (index * 8u) }; + return _385; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _136.Store(offset * 4 + 8, val); +} + +void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.origin.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.origin.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s._vector.x); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s._vector.y); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.y_edge); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = s.next.offset; + write_mem(param_15, param_16, param_17); +} + +void comp_main() +{ + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef _718 = { _710.Load(28) + (element_ix * 52u) }; + PathSegRef ref = _718; + PathSegTag tag = _721; + if (element_ix < _710.Load(4)) + { + Alloc _731; + _731.offset = _710.Load(28); + Alloc param; + param.offset = _731.offset; + PathSegRef param_1 = ref; + tag = PathSeg_tag(param, param_1); + } + bool mem_ok = _136.Load(4) == 0u; + switch (tag.tag) + { + case 1u: + { + Alloc _748; + _748.offset = _710.Load(28); + Alloc param_2; + param_2.offset = _748.offset; + PathSegRef param_3 = ref; + PathCubic cubic = PathSeg_Cubic_read(param_2, param_3); + float2 err_v = (((cubic.p2 - cubic.p1) * 3.0f) + cubic.p0) - cubic.p3; + float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); + uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875f, 0.16666667163372039794921875f))), 1u); + n_quads = min(n_quads, 16u); + float val = 0.0f; + float2 qp0 = cubic.p0; + float _step = 1.0f / float(n_quads); + SubdivResult keep_params[16]; + for (uint i = 0u; i < n_quads; i++) + { + float t = float(i + 1u) * _step; + float2 param_4 = cubic.p0; + float2 param_5 = cubic.p1; + float2 param_6 = cubic.p2; + float2 param_7 = cubic.p3; + float param_8 = t; + float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8); + float2 param_9 = cubic.p0; + float2 param_10 = cubic.p1; + float2 param_11 = cubic.p2; + float2 param_12 = cubic.p3; + float param_13 = t - (0.5f * _step); + float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13); + qp1 = (qp1 * 2.0f) - ((qp0 + qp2) * 0.5f); + float2 param_14 = qp0; + float2 param_15 = qp1; + float2 param_16 = qp2; + float param_17 = 0.4743416607379913330078125f; + SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17); + keep_params[i] = params; + val += params.val; + qp0 = qp2; + } + uint n = max(uint(ceil((val * 0.5f) / 0.4743416607379913330078125f)), 1u); + uint param_18 = tag.flags; + bool is_stroke = fill_mode_from_flags(param_18) == 1u; + uint path_ix = cubic.path_ix; + PathRef _904 = { _710.Load(16) + (path_ix * 12u) }; + Alloc _907; + _907.offset = _710.Load(16); + Alloc param_19; + param_19.offset = _907.offset; + PathRef param_20 = _904; + Path path = Path_read(param_19, param_20); + uint param_21 = path.tiles.offset; + uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_23 = mem_ok; + Alloc path_alloc = new_alloc(param_21, param_22, param_23); + int4 bbox = int4(path.bbox); + float2 p0 = cubic.p0; + qp0 = cubic.p0; + float v_step = val / float(n); + int n_out = 1; + float val_sum = 0.0f; + float2 p1; + float _1147; + TileSeg tile_seg; + for (uint i_1 = 0u; i_1 < n_quads; i_1++) + { + float t_1 = float(i_1 + 1u) * _step; + float2 param_24 = cubic.p0; + float2 param_25 = cubic.p1; + float2 param_26 = cubic.p2; + float2 param_27 = cubic.p3; + float param_28 = t_1; + float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28); + float2 param_29 = cubic.p0; + float2 param_30 = cubic.p1; + float2 param_31 = cubic.p2; + float2 param_32 = cubic.p3; + float param_33 = t_1 - (0.5f * _step); + float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33); + qp1_1 = (qp1_1 * 2.0f) - ((qp0 + qp2_1) * 0.5f); + SubdivResult params_1 = keep_params[i_1]; + float param_34 = params_1.a0; + float u0 = approx_parabola_inv_integral(param_34); + float param_35 = params_1.a2; + float u2 = approx_parabola_inv_integral(param_35); + float uscale = 1.0f / (u2 - u0); + float target = float(n_out) * v_step; + for (;;) + { + bool _1040 = uint(n_out) == n; + bool _1050; + if (!_1040) + { + _1050 = target < (val_sum + params_1.val); + } + else + { + _1050 = _1040; + } + if (_1050) + { + if (uint(n_out) == n) + { + p1 = cubic.p3; + } + else + { + float u = (target - val_sum) / params_1.val; + float a = lerp(params_1.a0, params_1.a2, u); + float param_36 = a; + float au = approx_parabola_inv_integral(param_36); + float t_2 = (au - u0) * uscale; + float2 param_37 = qp0; + float2 param_38 = qp1_1; + float2 param_39 = qp2_1; + float param_40 = t_2; + p1 = eval_quad(param_37, param_38, param_39, param_40); + } + float xmin = min(p0.x, p1.x) - cubic.stroke.x; + float xmax = max(p0.x, p1.x) + cubic.stroke.x; + float ymin = min(p0.y, p1.y) - cubic.stroke.y; + float ymax = max(p0.y, p1.y) + cubic.stroke.y; + float dx = p1.x - p0.x; + float dy = p1.y - p0.y; + if (abs(dy) < 9.999999717180685365747194737196e-10f) + { + _1147 = 1000000000.0f; + } + else + { + _1147 = dx / dy; + } + float invslope = _1147; + float c = (cubic.stroke.x + (abs(invslope) * (8.0f + cubic.stroke.y))) * 0.0625f; + float b = invslope; + float a_1 = (p0.x - ((p0.y - 8.0f) * b)) * 0.0625f; + int x0 = int(floor(xmin * 0.0625f)); + int x1 = int(floor(xmax * 0.0625f) + 1.0f); + int y0 = int(floor(ymin * 0.0625f)); + int y1 = int(floor(ymax * 0.0625f) + 1.0f); + x0 = clamp(x0, bbox.x, bbox.z); + y0 = clamp(y0, bbox.y, bbox.w); + x1 = clamp(x1, bbox.x, bbox.z); + y1 = clamp(y1, bbox.y, bbox.w); + float xc = a_1 + (b * float(y0)); + int stride = bbox.z - bbox.x; + int base = ((y0 - bbox.y) * stride) - bbox.x; + uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); + uint param_41 = n_tile_alloc * 24u; + MallocResult _1263 = malloc(param_41); + MallocResult tile_alloc = _1263; + if (tile_alloc.failed || (!mem_ok)) + { + return; + } + uint tile_offset = tile_alloc.alloc.offset; + int xray = int(floor(p0.x * 0.0625f)); + int last_xray = int(floor(p1.x * 0.0625f)); + if (p0.y > p1.y) + { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } + for (int y = y0; y < y1; y++) + { + float tile_y0 = float(y * 16); + int xbackdrop = max((xray + 1), bbox.x); + bool _1319 = !is_stroke; + bool _1329; + if (_1319) + { + _1329 = min(p0.y, p1.y) < tile_y0; + } + else + { + _1329 = _1319; + } + bool _1336; + if (_1329) + { + _1336 = xbackdrop < bbox.z; + } + else + { + _1336 = _1329; + } + if (_1336) + { + int backdrop = (p1.y < p0.y) ? 1 : (-1); + TileRef param_42 = path.tiles; + uint param_43 = uint(base + xbackdrop); + TileRef tile_ref = Tile_index(param_42, param_43); + uint tile_el = tile_ref.offset >> uint(2); + Alloc param_44 = path_alloc; + uint param_45 = tile_el + 1u; + if (touch_mem(param_44, param_45)) + { + uint _1374; + _136.InterlockedAdd((tile_el + 1u) * 4 + 8, uint(backdrop), _1374); + } + } + int next_xray = last_xray; + if (y < (y1 - 1)) + { + float tile_y1 = float((y + 1) * 16); + float x_edge = lerp(p0.x, p1.x, (tile_y1 - p0.y) / dy); + next_xray = int(floor(x_edge * 0.0625f)); + } + int min_xray = min(xray, next_xray); + int max_xray = max(xray, next_xray); + int xx0 = min(int(floor(xc - c)), min_xray); + int xx1 = max(int(ceil(xc + c)), (max_xray + 1)); + xx0 = clamp(xx0, x0, x1); + xx1 = clamp(xx1, x0, x1); + for (int x = xx0; x < xx1; x++) + { + float tile_x0 = float(x * 16); + TileRef _1454 = { path.tiles.offset }; + TileRef param_46 = _1454; + uint param_47 = uint(base + x); + TileRef tile_ref_1 = Tile_index(param_46, param_47); + uint tile_el_1 = tile_ref_1.offset >> uint(2); + uint old = 0u; + Alloc param_48 = path_alloc; + uint param_49 = tile_el_1; + if (touch_mem(param_48, param_49)) + { + uint _1477; + _136.InterlockedExchange(tile_el_1 * 4 + 8, tile_offset, _1477); + old = _1477; + } + tile_seg.origin = p0; + tile_seg._vector = p1 - p0; + float y_edge = 0.0f; + if (!is_stroke) + { + y_edge = lerp(p0.y, p1.y, (tile_x0 - p0.x) / dx); + if (min(p0.x, p1.x) < tile_x0) + { + float2 p = float2(tile_x0, y_edge); + if (p0.x > p1.x) + { + tile_seg._vector = p - p0; + } + else + { + tile_seg.origin = p; + tile_seg._vector = p1 - p; + } + if (tile_seg._vector.x == 0.0f) + { + tile_seg._vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10f; + } + } + if ((x <= min_xray) || (max_xray < x)) + { + y_edge = 1000000000.0f; + } + } + tile_seg.y_edge = y_edge; + tile_seg.next.offset = old; + TileSegRef _1559 = { tile_offset }; + Alloc param_50 = tile_alloc.alloc; + TileSegRef param_51 = _1559; + TileSeg param_52 = tile_seg; + TileSeg_write(param_50, param_51, param_52); + tile_offset += 24u; + } + xc += b; + base += stride; + xray = next_xray; + } + n_out++; + target += v_step; + p0 = p1; + continue; + } + else + { + break; + } + } + val_sum += params_1.val; + qp0 = qp2_1; + } + break; + } + } +} + +[numthreads(32, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl new file mode 100644 index 0000000..d263f31 --- /dev/null +++ b/piet-gpu/shader/gen/path_coarse.msl @@ -0,0 +1,708 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct PathSegTag +{ + uint tag; + uint flags; +}; + +struct TileRef +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 vector; + float y_edge; + TileSegRef next; +}; + +struct SubdivResult +{ + float val; + float a0; + float a2; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(32u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_136.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_136, v_136BufferSize); + return PathSegTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_136, v_136BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_136, v_136BufferSize); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_136, v_136BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_136, v_136BufferSize); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15, v_136, v_136BufferSize); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17, v_136, v_136BufferSize); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19, v_136, v_136BufferSize); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21, v_136, v_136BufferSize); + Alloc param_22 = a; + uint param_23 = ix + 11u; + uint raw11 = read_mem(param_22, param_23, v_136, v_136BufferSize); + PathCubic s; + s.p0 = float2(as_type(raw0), as_type(raw1)); + s.p1 = float2(as_type(raw2), as_type(raw3)); + s.p2 = float2(as_type(raw4), as_type(raw5)); + s.p3 = float2(as_type(raw6), as_type(raw7)); + s.path_ix = raw8; + s.trans_ix = raw9; + s.stroke = float2(as_type(raw10), as_type(raw11)); + return s; +} + +static inline __attribute__((always_inline)) +PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = a; + PathCubicRef param_1 = PathCubicRef{ ref.offset + 4u }; + return PathCubic_read(param, param_1, v_136, v_136BufferSize); +} + +static inline __attribute__((always_inline)) +float2 eval_cubic(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float2& p3, thread const float& t) +{ + float mt = 1.0 - t; + return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0)) + (((p2 * (mt * 3.0)) + (p3 * t)) * t)) * t); +} + +static inline __attribute__((always_inline)) +float approx_parabola_integral(thread const float& x) +{ + return x * rsqrt(sqrt(0.3300000131130218505859375 + (0.201511204242706298828125 + ((0.25 * x) * x)))); +} + +static inline __attribute__((always_inline)) +SubdivResult estimate_subdiv(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& sqrt_tol) +{ + float2 d01 = p1 - p0; + float2 d12 = p2 - p1; + float2 dd = d01 - d12; + float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x); + float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross; + float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross; + float scale = abs(_cross / (length(dd) * (x2 - x0))); + float param = x0; + float a0 = approx_parabola_integral(param); + float param_1 = x2; + float a2 = approx_parabola_integral(param_1); + float val = 0.0; + if (scale < 1000000000.0) + { + float da = abs(a2 - a0); + float sqrt_scale = sqrt(scale); + if (sign(x0) == sign(x2)) + { + val = da * sqrt_scale; + } + else + { + float xmin = sqrt_tol / sqrt_scale; + float param_2 = xmin; + val = (sqrt_tol * da) / approx_parabola_integral(param_2); + } + } + return SubdivResult{ val, a0, a2 }; +} + +static inline __attribute__((always_inline)) +uint fill_mode_from_flags(thread const uint& flags) +{ + return flags & 1u; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +float approx_parabola_inv_integral(thread const float& x) +{ + return x * sqrt(0.61000001430511474609375 + (0.1520999968051910400390625 + ((0.25 * x) * x))); +} + +static inline __attribute__((always_inline)) +float2 eval_quad(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& t) +{ + float mt = 1.0 - t; + return (p0 * (mt * mt)) + (((p1 * (mt * 2.0)) + (p2 * t)) * t); +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint _142 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.mem_offset, size, memory_order_relaxed); + uint offset = _142; + MallocResult r; + r.failed = (offset + size) > uint(int((v_136BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _171 = atomic_fetch_max_explicit((device atomic_uint*)&v_136.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +TileRef Tile_index(thread const TileRef& ref, thread const uint& index) +{ + return TileRef{ ref.offset + (index * 8u) }; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_136.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.origin.x); + write_mem(param, param_1, param_2, v_136, v_136BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.origin.y); + write_mem(param_3, param_4, param_5, v_136, v_136BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.vector.x); + write_mem(param_6, param_7, param_8, v_136, v_136BufferSize); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.vector.y); + write_mem(param_9, param_10, param_11, v_136, v_136BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.y_edge); + write_mem(param_12, param_13, param_14, v_136, v_136BufferSize); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = s.next.offset; + write_mem(param_15, param_16, param_17, v_136, v_136BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_136 [[buffer(0)]], const device ConfigBuf& _710 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + constant uint& v_136BufferSize = spvBufferSizeConstants[0]; + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef ref = PathSegRef{ _710.conf.pathseg_alloc.offset + (element_ix * 52u) }; + PathSegTag tag = PathSegTag{ 0u, 0u }; + if (element_ix < _710.conf.n_pathseg) + { + Alloc param; + param.offset = _710.conf.pathseg_alloc.offset; + PathSegRef param_1 = ref; + tag = PathSeg_tag(param, param_1, v_136, v_136BufferSize); + } + bool mem_ok = v_136.mem_error == 0u; + switch (tag.tag) + { + case 1u: + { + Alloc param_2; + param_2.offset = _710.conf.pathseg_alloc.offset; + PathSegRef param_3 = ref; + PathCubic cubic = PathSeg_Cubic_read(param_2, param_3, v_136, v_136BufferSize); + float2 err_v = (((cubic.p2 - cubic.p1) * 3.0) + cubic.p0) - cubic.p3; + float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); + uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875, 0.16666667163372039794921875))), 1u); + n_quads = min(n_quads, 16u); + float val = 0.0; + float2 qp0 = cubic.p0; + float _step = 1.0 / float(n_quads); + spvUnsafeArray keep_params; + for (uint i = 0u; i < n_quads; i++) + { + float t = float(i + 1u) * _step; + float2 param_4 = cubic.p0; + float2 param_5 = cubic.p1; + float2 param_6 = cubic.p2; + float2 param_7 = cubic.p3; + float param_8 = t; + float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8); + float2 param_9 = cubic.p0; + float2 param_10 = cubic.p1; + float2 param_11 = cubic.p2; + float2 param_12 = cubic.p3; + float param_13 = t - (0.5 * _step); + float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13); + qp1 = (qp1 * 2.0) - ((qp0 + qp2) * 0.5); + float2 param_14 = qp0; + float2 param_15 = qp1; + float2 param_16 = qp2; + float param_17 = 0.4743416607379913330078125; + SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17); + keep_params[i] = params; + val += params.val; + qp0 = qp2; + } + uint n = max(uint(ceil((val * 0.5) / 0.4743416607379913330078125)), 1u); + uint param_18 = tag.flags; + bool is_stroke = fill_mode_from_flags(param_18) == 1u; + uint path_ix = cubic.path_ix; + Alloc param_19; + param_19.offset = _710.conf.tile_alloc.offset; + PathRef param_20 = PathRef{ _710.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_19, param_20, v_136, v_136BufferSize); + uint param_21 = path.tiles.offset; + uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_23 = mem_ok; + Alloc path_alloc = new_alloc(param_21, param_22, param_23); + int4 bbox = int4(path.bbox); + float2 p0 = cubic.p0; + qp0 = cubic.p0; + float v_step = val / float(n); + int n_out = 1; + float val_sum = 0.0; + float2 p1; + float _1147; + TileSeg tile_seg; + for (uint i_1 = 0u; i_1 < n_quads; i_1++) + { + float t_1 = float(i_1 + 1u) * _step; + float2 param_24 = cubic.p0; + float2 param_25 = cubic.p1; + float2 param_26 = cubic.p2; + float2 param_27 = cubic.p3; + float param_28 = t_1; + float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28); + float2 param_29 = cubic.p0; + float2 param_30 = cubic.p1; + float2 param_31 = cubic.p2; + float2 param_32 = cubic.p3; + float param_33 = t_1 - (0.5 * _step); + float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33); + qp1_1 = (qp1_1 * 2.0) - ((qp0 + qp2_1) * 0.5); + SubdivResult params_1 = keep_params[i_1]; + float param_34 = params_1.a0; + float u0 = approx_parabola_inv_integral(param_34); + float param_35 = params_1.a2; + float u2 = approx_parabola_inv_integral(param_35); + float uscale = 1.0 / (u2 - u0); + float target = float(n_out) * v_step; + for (;;) + { + bool _1040 = uint(n_out) == n; + bool _1050; + if (!_1040) + { + _1050 = target < (val_sum + params_1.val); + } + else + { + _1050 = _1040; + } + if (_1050) + { + if (uint(n_out) == n) + { + p1 = cubic.p3; + } + else + { + float u = (target - val_sum) / params_1.val; + float a = mix(params_1.a0, params_1.a2, u); + float param_36 = a; + float au = approx_parabola_inv_integral(param_36); + float t_2 = (au - u0) * uscale; + float2 param_37 = qp0; + float2 param_38 = qp1_1; + float2 param_39 = qp2_1; + float param_40 = t_2; + p1 = eval_quad(param_37, param_38, param_39, param_40); + } + float xmin = fast::min(p0.x, p1.x) - cubic.stroke.x; + float xmax = fast::max(p0.x, p1.x) + cubic.stroke.x; + float ymin = fast::min(p0.y, p1.y) - cubic.stroke.y; + float ymax = fast::max(p0.y, p1.y) + cubic.stroke.y; + float dx = p1.x - p0.x; + float dy = p1.y - p0.y; + if (abs(dy) < 9.999999717180685365747194737196e-10) + { + _1147 = 1000000000.0; + } + else + { + _1147 = dx / dy; + } + float invslope = _1147; + float c = (cubic.stroke.x + (abs(invslope) * (8.0 + cubic.stroke.y))) * 0.0625; + float b = invslope; + float a_1 = (p0.x - ((p0.y - 8.0) * b)) * 0.0625; + int x0 = int(floor(xmin * 0.0625)); + int x1 = int(floor(xmax * 0.0625) + 1.0); + int y0 = int(floor(ymin * 0.0625)); + int y1 = int(floor(ymax * 0.0625) + 1.0); + x0 = clamp(x0, bbox.x, bbox.z); + y0 = clamp(y0, bbox.y, bbox.w); + x1 = clamp(x1, bbox.x, bbox.z); + y1 = clamp(y1, bbox.y, bbox.w); + float xc = a_1 + (b * float(y0)); + int stride = bbox.z - bbox.x; + int base = ((y0 - bbox.y) * stride) - bbox.x; + uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); + uint param_41 = n_tile_alloc * 24u; + MallocResult _1263 = malloc(param_41, v_136, v_136BufferSize); + MallocResult tile_alloc = _1263; + if (tile_alloc.failed || (!mem_ok)) + { + return; + } + uint tile_offset = tile_alloc.alloc.offset; + int xray = int(floor(p0.x * 0.0625)); + int last_xray = int(floor(p1.x * 0.0625)); + if (p0.y > p1.y) + { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } + for (int y = y0; y < y1; y++) + { + float tile_y0 = float(y * 16); + int xbackdrop = max((xray + 1), bbox.x); + bool _1319 = !is_stroke; + bool _1329; + if (_1319) + { + _1329 = fast::min(p0.y, p1.y) < tile_y0; + } + else + { + _1329 = _1319; + } + bool _1336; + if (_1329) + { + _1336 = xbackdrop < bbox.z; + } + else + { + _1336 = _1329; + } + if (_1336) + { + int backdrop = (p1.y < p0.y) ? 1 : (-1); + TileRef param_42 = path.tiles; + uint param_43 = uint(base + xbackdrop); + TileRef tile_ref = Tile_index(param_42, param_43); + uint tile_el = tile_ref.offset >> uint(2); + Alloc param_44 = path_alloc; + uint param_45 = tile_el + 1u; + if (touch_mem(param_44, param_45)) + { + uint _1374 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed); + } + } + int next_xray = last_xray; + if (y < (y1 - 1)) + { + float tile_y1 = float((y + 1) * 16); + float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy); + next_xray = int(floor(x_edge * 0.0625)); + } + int min_xray = min(xray, next_xray); + int max_xray = max(xray, next_xray); + int xx0 = min(int(floor(xc - c)), min_xray); + int xx1 = max(int(ceil(xc + c)), (max_xray + 1)); + xx0 = clamp(xx0, x0, x1); + xx1 = clamp(xx1, x0, x1); + for (int x = xx0; x < xx1; x++) + { + float tile_x0 = float(x * 16); + TileRef param_46 = TileRef{ path.tiles.offset }; + uint param_47 = uint(base + x); + TileRef tile_ref_1 = Tile_index(param_46, param_47); + uint tile_el_1 = tile_ref_1.offset >> uint(2); + uint old = 0u; + Alloc param_48 = path_alloc; + uint param_49 = tile_el_1; + if (touch_mem(param_48, param_49)) + { + uint _1477 = atomic_exchange_explicit((device atomic_uint*)&v_136.memory[tile_el_1], tile_offset, memory_order_relaxed); + old = _1477; + } + tile_seg.origin = p0; + tile_seg.vector = p1 - p0; + float y_edge = 0.0; + if (!is_stroke) + { + y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx); + if (fast::min(p0.x, p1.x) < tile_x0) + { + float2 p = float2(tile_x0, y_edge); + if (p0.x > p1.x) + { + tile_seg.vector = p - p0; + } + else + { + tile_seg.origin = p; + tile_seg.vector = p1 - p; + } + if (tile_seg.vector.x == 0.0) + { + tile_seg.vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10; + } + } + if ((x <= min_xray) || (max_xray < x)) + { + y_edge = 1000000000.0; + } + } + tile_seg.y_edge = y_edge; + tile_seg.next.offset = old; + Alloc param_50 = tile_alloc.alloc; + TileSegRef param_51 = TileSegRef{ tile_offset }; + TileSeg param_52 = tile_seg; + TileSeg_write(param_50, param_51, param_52, v_136, v_136BufferSize); + tile_offset += 24u; + } + xc += b; + base += stride; + xray = next_xray; + } + n_out++; + target += v_step; + p0 = p1; + continue; + } + else + { + break; + } + } + val_sum += params_1.val; + qp0 = qp2_1; + } + break; + } + } +} + diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv similarity index 100% rename from piet-gpu/shader/path_coarse.spv rename to piet-gpu/shader/gen/path_coarse.spv diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl index c7f7df0..a9cee25 100644 --- a/piet-gpu/shader/gen/pathseg.hlsl +++ b/piet-gpu/shader/gen/pathseg.hlsl @@ -77,10 +77,10 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); static const TagMonoid _135 = { 0u, 0u, 0u, 0u, 0u }; static const Monoid _567 = { 0.0f.xxxx, 0u }; -RWByteAddressBuffer _111 : register(u0); -ByteAddressBuffer _574 : register(t2); -ByteAddressBuffer _639 : register(t1); -ByteAddressBuffer _709 : register(t3); +RWByteAddressBuffer _111 : register(u0, space0); +ByteAddressBuffer _574 : register(t2, space0); +ByteAddressBuffer _639 : register(t1, space0); +ByteAddressBuffer _709 : register(t3, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl index dd7c611..291243e 100644 --- a/piet-gpu/shader/gen/pathtag_reduce.hlsl +++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl @@ -36,10 +36,10 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u); -ByteAddressBuffer _139 : register(t1); -ByteAddressBuffer _150 : register(t2); -RWByteAddressBuffer _238 : register(u3); -RWByteAddressBuffer _258 : register(u0); +ByteAddressBuffer _139 : register(t1, space0); +ByteAddressBuffer _150 : register(t2, space0); +RWByteAddressBuffer _238 : register(u3, space0); +RWByteAddressBuffer _258 : register(u0, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/pathtag_root.hlsl b/piet-gpu/shader/gen/pathtag_root.hlsl index 388f99d..f1ec389 100644 --- a/piet-gpu/shader/gen/pathtag_root.hlsl +++ b/piet-gpu/shader/gen/pathtag_root.hlsl @@ -11,7 +11,7 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); static const TagMonoid _18 = { 0u, 0u, 0u, 0u, 0u }; -RWByteAddressBuffer _78 : register(u0); +RWByteAddressBuffer _78 : register(u0, space0); static uint3 gl_LocalInvocationID; static uint3 gl_GlobalInvocationID; diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil new file mode 100644 index 0000000..d69db16 Binary files /dev/null and b/piet-gpu/shader/gen/tile_alloc.dxil differ diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl new file mode 100644 index 0000000..010e714 --- /dev/null +++ b/piet-gpu/shader/gen/tile_alloc.hlsl @@ -0,0 +1,335 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoEndClipRef +{ + uint offset; +}; + +struct AnnoEndClip +{ + float4 bbox; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _92 : register(u0, space0); +ByteAddressBuffer _305 : register(t1, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared uint sh_tile_count[256]; +groupshared MallocResult sh_tile_alloc; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _92.Load(offset * 4 + 8); + return v; +} + +AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + AnnotatedTag _236 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _236; +} + +AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + AnnoEndClip s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + return s; +} + +AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) +{ + AnnoEndClipRef _243 = { ref.offset + 4u }; + Alloc param = a; + AnnoEndClipRef param_1 = _243; + return AnnoEndClip_read(param, param_1); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +MallocResult malloc(uint size) +{ + uint _98; + _92.InterlockedAdd(0, size, _98); + uint offset = _98; + uint _105; + _92.GetDimensions(_105); + _105 = (_105 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_105) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _127; + _92.InterlockedMax(4, 1u, _127); + return r; + } + return r; +} + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _169 = { a.offset + offset }; + return _169; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _92.Store(offset * 4 + 8, val); +} + +void Path_write(Alloc a, PathRef ref, Path s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.bbox.x | (s.bbox.y << uint(16)); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = s.tiles.offset; + write_mem(param_6, param_7, param_8); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationID.x; + uint element_ix = gl_GlobalInvocationID.x; + PathRef _312 = { _305.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _312; + AnnotatedRef _321 = { _305.Load(32) + (element_ix * 40u) }; + AnnotatedRef ref = _321; + uint tag = 0u; + if (element_ix < _305.Load(0)) + { + Alloc _332; + _332.offset = _305.Load(32); + Alloc param; + param.offset = _332.offset; + AnnotatedRef param_1 = ref; + tag = Annotated_tag(param, param_1).tag; + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + switch (tag) + { + case 1u: + case 2u: + case 3u: + case 4u: + case 5u: + { + Alloc _350; + _350.offset = _305.Load(32); + Alloc param_2; + param_2.offset = _350.offset; + AnnotatedRef param_3 = ref; + AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3); + x0 = int(floor(clip.bbox.x * 0.0625f)); + y0 = int(floor(clip.bbox.y * 0.0625f)); + x1 = int(ceil(clip.bbox.z * 0.0625f)); + y1 = int(ceil(clip.bbox.w * 0.0625f)); + break; + } + } + x0 = clamp(x0, 0, int(_305.Load(8))); + y0 = clamp(y0, 0, int(_305.Load(12))); + x1 = clamp(x1, 0, int(_305.Load(8))); + y1 = clamp(y1, 0, int(_305.Load(12))); + Path path; + path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1)); + uint tile_count = uint((x1 - x0) * (y1 - y0)); + if (tag == 5u) + { + tile_count = 0u; + } + sh_tile_count[th_ix] = tile_count; + uint total_tile_count = tile_count; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if (th_ix >= uint(1 << int(i))) + { + total_tile_count += sh_tile_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + sh_tile_count[th_ix] = total_tile_count; + } + if (th_ix == 255u) + { + uint param_4 = total_tile_count * 8u; + MallocResult _477 = malloc(param_4); + sh_tile_alloc = _477; + } + GroupMemoryBarrierWithGroupSync(); + MallocResult alloc_start = sh_tile_alloc; + bool _488; + if (!alloc_start.failed) + { + _488 = _92.Load(4) != 0u; + } + else + { + _488 = alloc_start.failed; + } + if (_488) + { + return; + } + if (element_ix < _305.Load(0)) + { + uint _501; + if (th_ix > 0u) + { + _501 = sh_tile_count[th_ix - 1u]; + } + else + { + _501 = 0u; + } + uint tile_subix = _501; + Alloc param_5 = alloc_start.alloc; + uint param_6 = 8u * tile_subix; + uint param_7 = 8u * tile_count; + Alloc tiles_alloc = slice_mem(param_5, param_6, param_7); + TileRef _523 = { tiles_alloc.offset }; + path.tiles = _523; + Alloc _528; + _528.offset = _305.Load(16); + Alloc param_8; + param_8.offset = _528.offset; + PathRef param_9 = path_ref; + Path param_10 = path; + Path_write(param_8, param_9, param_10); + } + uint total_count = sh_tile_count[255] * 2u; + uint start_ix = alloc_start.alloc.offset >> uint(2); + for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) + { + Alloc param_11 = alloc_start.alloc; + uint param_12 = start_ix + i_1; + uint param_13 = 0u; + write_mem(param_11, param_12, param_13); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl new file mode 100644 index 0000000..3906536 --- /dev/null +++ b/piet-gpu/shader/gen/tile_alloc.msl @@ -0,0 +1,336 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoEndClipRef +{ + uint offset; +}; + +struct AnnoEndClip +{ + float4 bbox; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_92, constant uint& v_92BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_92.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_92, v_92BufferSize); + return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_92, constant uint& v_92BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_92, v_92BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_92, v_92BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_92, v_92BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_92, v_92BufferSize); + AnnoEndClip s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + return s; +} + +static inline __attribute__((always_inline)) +AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize) +{ + Alloc param = a; + AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u }; + return AnnoEndClip_read(param, param_1, v_92, v_92BufferSize); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_92, constant uint& v_92BufferSize) +{ + uint _98 = atomic_fetch_add_explicit((device atomic_uint*)&v_92.mem_offset, size, memory_order_relaxed); + uint offset = _98; + MallocResult r; + r.failed = (offset + size) > uint(int((v_92BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _127 = atomic_fetch_max_explicit((device atomic_uint*)&v_92.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_92, constant uint& v_92BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_92.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_92, constant uint& v_92BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.bbox.x | (s.bbox.y << uint(16)); + write_mem(param, param_1, param_2, v_92, v_92BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); + write_mem(param_3, param_4, param_5, v_92, v_92BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = s.tiles.offset; + write_mem(param_6, param_7, param_8, v_92, v_92BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _305 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + threadgroup uint sh_tile_count[256]; + threadgroup MallocResult sh_tile_alloc; + constant uint& v_92BufferSize = spvBufferSizeConstants[0]; + uint th_ix = gl_LocalInvocationID.x; + uint element_ix = gl_GlobalInvocationID.x; + PathRef path_ref = PathRef{ _305.conf.tile_alloc.offset + (element_ix * 12u) }; + AnnotatedRef ref = AnnotatedRef{ _305.conf.anno_alloc.offset + (element_ix * 40u) }; + uint tag = 0u; + if (element_ix < _305.conf.n_elements) + { + Alloc param; + param.offset = _305.conf.anno_alloc.offset; + AnnotatedRef param_1 = ref; + tag = Annotated_tag(param, param_1, v_92, v_92BufferSize).tag; + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + switch (tag) + { + case 1u: + case 2u: + case 3u: + case 4u: + case 5u: + { + Alloc param_2; + param_2.offset = _305.conf.anno_alloc.offset; + AnnotatedRef param_3 = ref; + AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_92, v_92BufferSize); + x0 = int(floor(clip.bbox.x * 0.0625)); + y0 = int(floor(clip.bbox.y * 0.0625)); + x1 = int(ceil(clip.bbox.z * 0.0625)); + y1 = int(ceil(clip.bbox.w * 0.0625)); + break; + } + } + x0 = clamp(x0, 0, int(_305.conf.width_in_tiles)); + y0 = clamp(y0, 0, int(_305.conf.height_in_tiles)); + x1 = clamp(x1, 0, int(_305.conf.width_in_tiles)); + y1 = clamp(y1, 0, int(_305.conf.height_in_tiles)); + Path path; + path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1)); + uint tile_count = uint((x1 - x0) * (y1 - y0)); + if (tag == 5u) + { + tile_count = 0u; + } + sh_tile_count[th_ix] = tile_count; + uint total_tile_count = tile_count; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix >= uint(1 << int(i))) + { + total_tile_count += sh_tile_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tile_count[th_ix] = total_tile_count; + } + if (th_ix == 255u) + { + uint param_4 = total_tile_count * 8u; + MallocResult _477 = malloc(param_4, v_92, v_92BufferSize); + sh_tile_alloc = _477; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + MallocResult alloc_start = sh_tile_alloc; + bool _488; + if (!alloc_start.failed) + { + _488 = v_92.mem_error != 0u; + } + else + { + _488 = alloc_start.failed; + } + if (_488) + { + return; + } + if (element_ix < _305.conf.n_elements) + { + uint _501; + if (th_ix > 0u) + { + _501 = sh_tile_count[th_ix - 1u]; + } + else + { + _501 = 0u; + } + uint tile_subix = _501; + Alloc param_5 = alloc_start.alloc; + uint param_6 = 8u * tile_subix; + uint param_7 = 8u * tile_count; + Alloc tiles_alloc = slice_mem(param_5, param_6, param_7); + path.tiles = TileRef{ tiles_alloc.offset }; + Alloc param_8; + param_8.offset = _305.conf.tile_alloc.offset; + PathRef param_9 = path_ref; + Path param_10 = path; + Path_write(param_8, param_9, param_10, v_92, v_92BufferSize); + } + uint total_count = sh_tile_count[255] * 2u; + uint start_ix = alloc_start.alloc.offset >> uint(2); + for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) + { + Alloc param_11 = alloc_start.alloc; + uint param_12 = start_ix + i_1; + uint param_13 = 0u; + write_mem(param_11, param_12, param_13, v_92, v_92BufferSize); + } +} + diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv similarity index 79% rename from piet-gpu/shader/tile_alloc.spv rename to piet-gpu/shader/gen/tile_alloc.spv index b443b03..d4a6e31 100644 Binary files a/piet-gpu/shader/tile_alloc.spv and b/piet-gpu/shader/gen/tile_alloc.spv differ diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl index 6fa9267..7744e0f 100644 --- a/piet-gpu/shader/gen/transform_leaf.hlsl +++ b/piet-gpu/shader/gen/transform_leaf.hlsl @@ -51,10 +51,10 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; -RWByteAddressBuffer _71 : register(u0); -ByteAddressBuffer _96 : register(t2); -ByteAddressBuffer _278 : register(t1); -ByteAddressBuffer _377 : register(t3); +RWByteAddressBuffer _71 : register(u0, space0); +ByteAddressBuffer _96 : register(t2, space0); +ByteAddressBuffer _278 : register(t1, space0); +ByteAddressBuffer _377 : register(t3, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl index 60addf3..5ada811 100644 --- a/piet-gpu/shader/gen/transform_reduce.hlsl +++ b/piet-gpu/shader/gen/transform_reduce.hlsl @@ -38,10 +38,10 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); -ByteAddressBuffer _49 : register(t2); -ByteAddressBuffer _161 : register(t1); -RWByteAddressBuffer _251 : register(u3); -RWByteAddressBuffer _267 : register(u0); +ByteAddressBuffer _49 : register(t2, space0); +ByteAddressBuffer _161 : register(t1, space0); +RWByteAddressBuffer _251 : register(u3, space0); +RWByteAddressBuffer _267 : register(u0, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/transform_root.hlsl b/piet-gpu/shader/gen/transform_root.hlsl index 42bbd38..35961b1 100644 --- a/piet-gpu/shader/gen/transform_root.hlsl +++ b/piet-gpu/shader/gen/transform_root.hlsl @@ -8,7 +8,7 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; -RWByteAddressBuffer _89 : register(u0); +RWByteAddressBuffer _89 : register(u0, space0); static uint3 gl_LocalInvocationID; static uint3 gl_GlobalInvocationID; diff --git a/piet-gpu/shader/kernel4_idx.spv b/piet-gpu/shader/kernel4_idx.spv deleted file mode 100644 index 953eae1..0000000 Binary files a/piet-gpu/shader/kernel4_idx.spv and /dev/null differ diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp index 6340683..3761e9e 100644 --- a/piet-gpu/shader/tile_alloc.comp +++ b/piet-gpu/shader/tile_alloc.comp @@ -73,7 +73,7 @@ void main() { for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) { barrier(); if (th_ix >= (1 << i)) { - total_tile_count += sh_tile_count[th_ix - (1 << i)]; + total_tile_count += sh_tile_count[th_ix - (1u << i)]; } barrier(); sh_tile_count[th_ix] = total_tile_count; diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 25627f6..e1bde6a 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -15,7 +15,7 @@ use piet::{ImageFormat, RenderContext}; use piet_gpu_hal::{ BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, ImageLayout, Pipeline, - QueryPool, Session, ShaderCode, + QueryPool, Session, ShaderCode, include_shader, }; use pico_svg::PicoSvg; @@ -161,23 +161,23 @@ impl Renderer { }) .collect(); - let tile_alloc_code = ShaderCode::Spv(include_bytes!("../shader/tile_alloc.spv")); + let tile_alloc_code = include_shader!(session, "../shader/gen/tile_alloc"); let tile_pipeline = session .create_compute_pipeline(tile_alloc_code, &[BindType::Buffer, BindType::Buffer])?; let tile_ds = session .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf])?; - let path_alloc_code = ShaderCode::Spv(include_bytes!("../shader/path_coarse.spv")); + let path_alloc_code = include_shader!(session, "../shader/gen/path_coarse"); let path_pipeline = session .create_compute_pipeline(path_alloc_code, &[BindType::Buffer, BindType::Buffer])?; let path_ds = session .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?; let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 { - ShaderCode::Spv(include_bytes!("../shader/backdrop_lg.spv")) + include_shader!(session, "../shader/gen/backdrop_lg") } else { println!("using small workgroup backdrop kernel"); - ShaderCode::Spv(include_bytes!("../shader/backdrop.spv")) + include_shader!(session, "../shader/gen/backdrop") }; let backdrop_pipeline = session .create_compute_pipeline(backdrop_code, &[BindType::Buffer, BindType::Buffer])?; @@ -185,13 +185,13 @@ impl Renderer { .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?; // TODO: constants - let bin_code = ShaderCode::Spv(include_bytes!("../shader/binning.spv")); + let bin_code = include_shader!(session, "../shader/gen/binning"); let bin_pipeline = session.create_compute_pipeline(bin_code, &[BindType::Buffer, BindType::Buffer])?; let bin_ds = session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?; - let coarse_code = ShaderCode::Spv(include_bytes!("../shader/coarse.spv")); + let coarse_code = include_shader!(session, "../shader/gen/coarse"); let coarse_pipeline = session.create_compute_pipeline(coarse_code, &[BindType::Buffer, BindType::Buffer])?; let coarse_ds = session @@ -210,7 +210,7 @@ impl Renderer { .collect(); let gradients = Self::make_gradient_image(&session); - let k4_code = ShaderCode::Spv(include_bytes!("../shader/kernel4.spv")); + let k4_code = include_shader!(session, "../shader/gen/kernel4"); let k4_pipeline = session.create_compute_pipeline( k4_code, &[