From c503ff28b017e03ef03f493172733370f7d4fcc0 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Fri, 3 Dec 2021 15:49:58 -0800 Subject: [PATCH] Make shaders cross-platform Translate all piet-gpu shaders into DXIL and MSL; move generated files into the shader/gen directory. --- piet-gpu/shader/backdrop.comp | 6 +- piet-gpu/shader/build.ninja | 40 +- piet-gpu/shader/coarse.comp | 20 +- piet-gpu/shader/elements.comp | 467 ------- piet-gpu/shader/elements.spv | Bin 81476 -> 0 bytes piet-gpu/shader/gen/backdrop.dxil | Bin 0 -> 4672 bytes piet-gpu/shader/gen/backdrop.hlsl | 283 +++++ piet-gpu/shader/gen/backdrop.msl | 284 +++++ piet-gpu/shader/{ => gen}/backdrop.spv | Bin 12636 -> 12588 bytes piet-gpu/shader/gen/backdrop_lg.dxil | Bin 0 -> 4676 bytes piet-gpu/shader/gen/backdrop_lg.hlsl | 283 +++++ piet-gpu/shader/gen/backdrop_lg.msl | 284 +++++ piet-gpu/shader/{ => gen}/backdrop_lg.spv | Bin 12668 -> 12620 bytes piet-gpu/shader/gen/bbox_clear.hlsl | 4 +- piet-gpu/shader/gen/binning.dxil | Bin 0 -> 5800 bytes piet-gpu/shader/gen/binning.hlsl | 352 ++++++ piet-gpu/shader/gen/binning.msl | 350 ++++++ piet-gpu/shader/{ => gen}/binning.spv | Bin piet-gpu/shader/gen/coarse.dxil | Bin 0 -> 10984 bytes piet-gpu/shader/gen/coarse.hlsl | 1386 +++++++++++++++++++++ piet-gpu/shader/gen/coarse.msl | 1378 ++++++++++++++++++++ piet-gpu/shader/{ => gen}/coarse.spv | Bin 64200 -> 64040 bytes piet-gpu/shader/gen/draw_leaf.hlsl | 8 +- piet-gpu/shader/gen/draw_reduce.hlsl | 8 +- piet-gpu/shader/gen/draw_root.hlsl | 2 +- piet-gpu/shader/gen/kernel4.dxil | Bin 0 -> 10004 bytes piet-gpu/shader/gen/kernel4.hlsl | 689 ++++++++++ piet-gpu/shader/gen/kernel4.msl | 728 +++++++++++ piet-gpu/shader/{ => gen}/kernel4.spv | Bin piet-gpu/shader/gen/path_coarse.dxil | Bin 0 -> 7064 bytes piet-gpu/shader/gen/path_coarse.hlsl | 664 ++++++++++ piet-gpu/shader/gen/path_coarse.msl | 708 +++++++++++ piet-gpu/shader/{ => gen}/path_coarse.spv | Bin piet-gpu/shader/gen/pathseg.hlsl | 8 +- piet-gpu/shader/gen/pathtag_reduce.hlsl | 8 +- piet-gpu/shader/gen/pathtag_root.hlsl | 2 +- piet-gpu/shader/gen/tile_alloc.dxil | Bin 0 -> 5048 bytes piet-gpu/shader/gen/tile_alloc.hlsl | 335 +++++ piet-gpu/shader/gen/tile_alloc.msl | 336 +++++ piet-gpu/shader/{ => gen}/tile_alloc.spv | Bin 15192 -> 15176 bytes piet-gpu/shader/gen/transform_leaf.hlsl | 8 +- piet-gpu/shader/gen/transform_reduce.hlsl | 8 +- piet-gpu/shader/gen/transform_root.hlsl | 2 +- piet-gpu/shader/kernel4_idx.spv | Bin 34676 -> 0 bytes piet-gpu/shader/tile_alloc.comp | 2 +- piet-gpu/src/lib.rs | 16 +- 46 files changed, 8140 insertions(+), 529 deletions(-) delete mode 100644 piet-gpu/shader/elements.comp delete mode 100644 piet-gpu/shader/elements.spv create mode 100644 piet-gpu/shader/gen/backdrop.dxil create mode 100644 piet-gpu/shader/gen/backdrop.hlsl create mode 100644 piet-gpu/shader/gen/backdrop.msl rename piet-gpu/shader/{ => gen}/backdrop.spv (71%) create mode 100644 piet-gpu/shader/gen/backdrop_lg.dxil create mode 100644 piet-gpu/shader/gen/backdrop_lg.hlsl create mode 100644 piet-gpu/shader/gen/backdrop_lg.msl rename piet-gpu/shader/{ => gen}/backdrop_lg.spv (70%) create mode 100644 piet-gpu/shader/gen/binning.dxil create mode 100644 piet-gpu/shader/gen/binning.hlsl create mode 100644 piet-gpu/shader/gen/binning.msl rename piet-gpu/shader/{ => gen}/binning.spv (100%) create mode 100644 piet-gpu/shader/gen/coarse.dxil create mode 100644 piet-gpu/shader/gen/coarse.hlsl create mode 100644 piet-gpu/shader/gen/coarse.msl rename piet-gpu/shader/{ => gen}/coarse.spv (61%) create mode 100644 piet-gpu/shader/gen/kernel4.dxil create mode 100644 piet-gpu/shader/gen/kernel4.hlsl create mode 100644 piet-gpu/shader/gen/kernel4.msl rename piet-gpu/shader/{ => gen}/kernel4.spv (100%) create mode 100644 piet-gpu/shader/gen/path_coarse.dxil create mode 100644 piet-gpu/shader/gen/path_coarse.hlsl create mode 100644 piet-gpu/shader/gen/path_coarse.msl rename piet-gpu/shader/{ => gen}/path_coarse.spv (100%) create mode 100644 piet-gpu/shader/gen/tile_alloc.dxil create mode 100644 piet-gpu/shader/gen/tile_alloc.hlsl create mode 100644 piet-gpu/shader/gen/tile_alloc.msl rename piet-gpu/shader/{ => gen}/tile_alloc.spv (79%) delete mode 100644 piet-gpu/shader/kernel4_idx.spv diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp index d544417..e4140cd 100644 --- a/piet-gpu/shader/backdrop.comp +++ b/piet-gpu/shader/backdrop.comp @@ -87,8 +87,8 @@ void main() { // Prefix sum of sh_row_count for (uint i = 0; i < LG_BACKDROP_WG; i++) { barrier(); - if (gl_LocalInvocationID.y == 0 && th_ix >= (1 << i)) { - row_count += sh_row_count[th_ix - (1 << i)]; + if (gl_LocalInvocationID.y == 0 && th_ix >= (1u << i)) { + row_count += sh_row_count[th_ix - (1u << i)]; } barrier(); if (gl_LocalInvocationID.y == 0) { @@ -102,7 +102,7 @@ void main() { // Binary search to find element uint el_ix = 0; for (uint i = 0; i < LG_BACKDROP_WG; i++) { - uint probe = el_ix + ((BACKDROP_WG / 2) >> i); + uint probe = el_ix + (uint(BACKDROP_WG / 2) >> i); if (row >= sh_row_count[probe - 1]) { el_ix = probe; } diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 497915c..6ed2140 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -14,7 +14,7 @@ rule glsl command = $glslang_validator $flags -V -o $out $in rule hlsl - command = $spirv_cross --hlsl $in --output $out + command = $spirv_cross --hlsl --shader-model 60 $in --output $out rule dxil command = $dxc -T cs_6_0 $in -Fo $out @@ -22,23 +22,41 @@ rule dxil rule msl command = $spirv_cross --msl $in --output $out $msl_flags +build gen/binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h +build gen/binning.hlsl: hlsl gen/binning.spv +build gen/binning.dxil: dxil gen/binning.hlsl +build gen/binning.msl: msl gen/binning.spv -build elements.spv: glsl elements.comp | scene.h state.h annotated.h +build gen/tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h +build gen/tile_alloc.hlsl: hlsl gen/tile_alloc.spv +build gen/tile_alloc.dxil: dxil gen/tile_alloc.hlsl +build gen/tile_alloc.msl: msl gen/tile_alloc.spv -build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h +build gen/path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h +build gen/path_coarse.hlsl: hlsl gen/path_coarse.spv +build gen/path_coarse.dxil: dxil gen/path_coarse.hlsl +build gen/path_coarse.msl: msl gen/path_coarse.spv -build tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h +build gen/backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h +build gen/backdrop.hlsl: hlsl gen/backdrop.spv +build gen/backdrop.dxil: dxil gen/backdrop.hlsl +build gen/backdrop.msl: msl gen/backdrop.spv -build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h - -build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h - -build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h +build gen/backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h flags = -DBACKDROP_DIST_FACTOR=4 +build gen/backdrop_lg.hlsl: hlsl gen/backdrop_lg.spv +build gen/backdrop_lg.dxil: dxil gen/backdrop_lg.hlsl +build gen/backdrop_lg.msl: msl gen/backdrop_lg.spv -build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h +build gen/coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h +build gen/coarse.hlsl: hlsl gen/coarse.spv +build gen/coarse.dxil: dxil gen/coarse.hlsl +build gen/coarse.msl: msl gen/coarse.spv -build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h +build gen/kernel4.spv: glsl kernel4.comp | ptcl.h setup.h +build gen/kernel4.hlsl: hlsl gen/kernel4.spv +build gen/kernel4.dxil: dxil gen/kernel4.hlsl +build gen/kernel4.msl: msl gen/kernel4.spv # New element pipeline follows diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index b541893..31a64e4 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -172,8 +172,8 @@ void main() { } barrier(); if (th_ix < N_PART_READ) { - if (th_ix >= (1 << i)) { - count += sh_part_count[th_ix - (1 << i)]; + if (th_ix >= (1u << i)) { + count += sh_part_count[th_ix - (1u << i)]; } } barrier(); @@ -190,7 +190,7 @@ void main() { if (ix >= wr_ix && ix < ready_ix && mem_ok) { uint part_ix = 0; for (uint i = 0; i < LG_N_PART_READ; i++) { - uint probe = part_ix + ((N_PART_READ / 2) >> i); + uint probe = part_ix + (uint(N_PART_READ / 2) >> i); if (ix >= sh_part_count[probe - 1]) { part_ix = probe; } @@ -257,8 +257,8 @@ void main() { sh_tile_count[th_ix] = tile_count; for (uint i = 0; i < LG_N_TILE; i++) { barrier(); - if (th_ix >= (1 << i)) { - tile_count += sh_tile_count[th_ix - (1 << i)]; + if (th_ix >= (1u << i)) { + tile_count += sh_tile_count[th_ix - (1u << i)]; } barrier(); sh_tile_count[th_ix] = tile_count; @@ -269,7 +269,7 @@ void main() { // Binary search to find element uint el_ix = 0; for (uint i = 0; i < LG_N_TILE; i++) { - uint probe = el_ix + ((N_TILE / 2) >> i); + uint probe = el_ix + (uint(N_TILE / 2) >> i); if (ix >= sh_tile_count[probe - 1]) { el_ix = probe; } @@ -292,7 +292,7 @@ void main() { } if (include_tile) { uint el_slice = el_ix / 32; - uint el_mask = 1 << (el_ix & 31); + uint el_mask = 1u << (el_ix & 31); atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask); } } @@ -372,7 +372,7 @@ void main() { if (tile.tile.offset == 0 && tile.backdrop == 0) { clip_zero_depth = clip_depth + 1; } else if (tile.tile.offset == 0 && clip_depth < 32) { - clip_one_mask |= (1 << clip_depth); + clip_one_mask |= (1u << clip_depth); } else { AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref); if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { @@ -382,14 +382,14 @@ void main() { Cmd_BeginClip_write(cmd_alloc, cmd_ref); cmd_ref.offset += 4; if (clip_depth < 32) { - clip_one_mask &= ~(1 << clip_depth); + clip_one_mask &= ~(1u << clip_depth); } } clip_depth++; break; case Annotated_EndClip: clip_depth--; - if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) { + if (clip_depth >= 32 || (clip_one_mask & (1u << clip_depth)) == 0) { if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { break; } diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp deleted file mode 100644 index 6f33544..0000000 --- a/piet-gpu/shader/elements.comp +++ /dev/null @@ -1,467 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense - -// The element processing stage, first in the pipeline. -// -// This stage is primarily about applying transforms and computing bounding -// boxes. It is organized as a scan over the input elements, producing -// annotated output elements. - -#version 450 -#extension GL_GOOGLE_include_directive : enable - -#include "mem.h" -#include "setup.h" - -#define N_ROWS 4 -#define WG_SIZE 32 -#define LG_WG_SIZE 5 -#define PARTITION_SIZE (WG_SIZE * N_ROWS) - -layout(local_size_x = WG_SIZE, local_size_y = 1) in; - -layout(set = 0, binding = 1) readonly buffer ConfigBuf { - Config conf; -}; - -layout(set = 0, binding = 2) readonly buffer SceneBuf { - uint[] scene; -}; - -// It would be better to use the Vulkan memory model than -// "volatile" but shooting for compatibility here rather -// than doing things right. -layout(set = 0, binding = 3) volatile buffer StateBuf { - uint part_counter; - uint[] state; -}; - -#include "scene.h" -#include "state.h" -#include "annotated.h" -#include "pathseg.h" -#include "tile.h" - -#define StateBuf_stride (4 + 2 * State_size) - -StateRef state_aggregate_ref(uint partition_ix) { - return StateRef(4 + partition_ix * StateBuf_stride); -} - -StateRef state_prefix_ref(uint partition_ix) { - return StateRef(4 + partition_ix * StateBuf_stride + State_size); -} - -uint state_flag_index(uint partition_ix) { - return partition_ix * (StateBuf_stride / 4); -} - -// These correspond to X, A, P respectively in the prefix sum paper. -#define FLAG_NOT_READY 0 -#define FLAG_AGGREGATE_READY 1 -#define FLAG_PREFIX_READY 2 - -#define FLAG_SET_LINEWIDTH 1 -#define FLAG_SET_BBOX 2 -#define FLAG_RESET_BBOX 4 -#define FLAG_SET_FILL_MODE 8 -// Fill modes take up the next bit. Non-zero fill is 0, stroke is 1. -#define LG_FILL_MODE 4 -#define FILL_MODE_BITS 1 -#define FILL_MODE_MASK (FILL_MODE_BITS << LG_FILL_MODE) - -// This is almost like a monoid (the interaction between transformation and -// bounding boxes is approximate) -State combine_state(State a, State b) { - State c; - c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x; - c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y; - c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x; - c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y; - if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) { - c.bbox = a.bbox; - } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 && - (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y)) - { - c.bbox.xy = min(a.bbox.xy, c.bbox.xy); - c.bbox.zw = max(a.bbox.zw, c.bbox.zw); - } - // It would be more concise to cast to matrix types; ah well. - c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y; - c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y; - c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w; - c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w; - c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x; - c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y; - c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth; - c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX | FLAG_SET_FILL_MODE)) | b.flags; - c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1; - uint fill_mode = (b.flags & FLAG_SET_FILL_MODE) == 0 ? a.flags : b.flags; - fill_mode &= FILL_MODE_MASK; - c.flags = (c.flags & ~FILL_MODE_MASK) | fill_mode; - c.path_count = a.path_count + b.path_count; - c.pathseg_count = a.pathseg_count + b.pathseg_count; - c.trans_count = a.trans_count + b.trans_count; - return c; -} - -State map_element(ElementRef ref) { - // TODO: it would *probably* be more efficient to make the memory read patterns less - // divergent, though it would be more wasted memory. - uint tag = Element_tag(ref).tag; - State c; - c.bbox = vec4(0.0, 0.0, 0.0, 0.0); - c.mat = vec4(1.0, 0.0, 0.0, 1.0); - c.translate = vec2(0.0, 0.0); - c.linewidth = 1.0; // TODO should be 0.0 - c.flags = 0; - c.path_count = 0; - c.pathseg_count = 0; - c.trans_count = 0; - switch (tag) { - case Element_Line: - LineSeg line = Element_Line_read(ref); - c.bbox.xy = min(line.p0, line.p1); - c.bbox.zw = max(line.p0, line.p1); - c.pathseg_count = 1; - break; - case Element_Quad: - QuadSeg quad = Element_Quad_read(ref); - c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2); - c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2); - c.pathseg_count = 1; - break; - case Element_Cubic: - CubicSeg cubic = Element_Cubic_read(ref); - c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3)); - c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3)); - c.pathseg_count = 1; - break; - case Element_FillColor: - case Element_FillLinGradient: - case Element_FillImage: - case Element_BeginClip: - c.flags = FLAG_RESET_BBOX; - c.path_count = 1; - break; - case Element_EndClip: - c.path_count = 1; - break; - case Element_SetLineWidth: - SetLineWidth lw = Element_SetLineWidth_read(ref); - c.linewidth = lw.width; - c.flags = FLAG_SET_LINEWIDTH; - break; - case Element_Transform: - Transform t = Element_Transform_read(ref); - c.mat = t.mat; - c.translate = t.translate; - c.trans_count = 1; - break; - case Element_SetFillMode: - SetFillMode fm = Element_SetFillMode_read(ref); - c.flags = FLAG_SET_FILL_MODE | (fm.fill_mode << LG_FILL_MODE); - break; - } - return c; -} - -// Get the bounding box of a circle transformed by the matrix into an ellipse. -vec2 get_linewidth(State st) { - // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm - return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw)); -} - -shared State sh_state[WG_SIZE]; - -shared uint sh_part_ix; -shared State sh_prefix; -shared uint sh_flag; - -void main() { - State th_state[N_ROWS]; - // Determine partition to process by atomic counter (described in Section - // 4.4 of prefix sum paper). - if (gl_LocalInvocationID.x == 0) { - sh_part_ix = atomicAdd(part_counter, 1); - } - barrier(); - uint part_ix = sh_part_ix; - - uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS; - ElementRef ref = ElementRef(ix * Element_size); - - th_state[0] = map_element(ref); - for (uint i = 1; i < N_ROWS; i++) { - // discussion question: would it be faster to load using more coherent patterns - // into thread memory? This is kinda strided. - th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i))); - } - State agg = th_state[N_ROWS - 1]; - sh_state[gl_LocalInvocationID.x] = agg; - for (uint i = 0; i < LG_WG_SIZE; i++) { - barrier(); - if (gl_LocalInvocationID.x >= (1 << i)) { - State other = sh_state[gl_LocalInvocationID.x - (1 << i)]; - agg = combine_state(other, agg); - } - barrier(); - sh_state[gl_LocalInvocationID.x] = agg; - } - - State exclusive; - exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0); - exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0); - exclusive.translate = vec2(0.0, 0.0); - exclusive.linewidth = 1.0; //TODO should be 0.0 - exclusive.flags = 0; - exclusive.path_count = 0; - exclusive.pathseg_count = 0; - exclusive.trans_count = 0; - - // Publish aggregate for this partition - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - State_write(state_aggregate_ref(part_ix), agg); - if (part_ix == 0) { - State_write(state_prefix_ref(part_ix), agg); - } - } - // Write flag with release semantics; this is done portably with a barrier. - memoryBarrierBuffer(); - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - uint flag = FLAG_AGGREGATE_READY; - if (part_ix == 0) { - flag = FLAG_PREFIX_READY; - } - state[state_flag_index(part_ix)] = flag; - } - if (part_ix != 0) { - // step 4 of paper: decoupled lookback - uint look_back_ix = part_ix - 1; - - State their_agg; - uint their_ix = 0; - while (true) { - // Read flag with acquire semantics. - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - sh_flag = state[state_flag_index(look_back_ix)]; - } - // The flag load is done only in the last thread. However, because the - // translation of memoryBarrierBuffer to Metal requires uniform control - // flow, we broadcast it to all threads. - memoryBarrierBuffer(); - barrier(); - uint flag = sh_flag; - barrier(); - - if (flag == FLAG_PREFIX_READY) { - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - State their_prefix = State_read(state_prefix_ref(look_back_ix)); - exclusive = combine_state(their_prefix, exclusive); - } - break; - } else if (flag == FLAG_AGGREGATE_READY) { - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - their_agg = State_read(state_aggregate_ref(look_back_ix)); - exclusive = combine_state(their_agg, exclusive); - } - look_back_ix--; - their_ix = 0; - continue; - } - // else spin - - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - // Unfortunately there's no guarantee of forward progress of other - // workgroups, so compute a bit of the aggregate before trying again. - // In the worst case, spinning stops when the aggregate is complete. - ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size); - State s = map_element(ref); - if (their_ix == 0) { - their_agg = s; - } else { - their_agg = combine_state(their_agg, s); - } - their_ix++; - if (their_ix == PARTITION_SIZE) { - exclusive = combine_state(their_agg, exclusive); - if (look_back_ix == 0) { - sh_flag = FLAG_PREFIX_READY; - } else { - look_back_ix--; - their_ix = 0; - } - } - } - barrier(); - flag = sh_flag; - barrier(); - if (flag == FLAG_PREFIX_READY) { - break; - } - } - // step 5 of paper: compute inclusive prefix - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - State inclusive_prefix = combine_state(exclusive, agg); - sh_prefix = exclusive; - State_write(state_prefix_ref(part_ix), inclusive_prefix); - } - memoryBarrierBuffer(); - if (gl_LocalInvocationID.x == WG_SIZE - 1) { - state[state_flag_index(part_ix)] = FLAG_PREFIX_READY; - } - } - barrier(); - if (part_ix != 0) { - exclusive = sh_prefix; - } - - State row = exclusive; - if (gl_LocalInvocationID.x > 0) { - State other = sh_state[gl_LocalInvocationID.x - 1]; - row = combine_state(row, other); - } - for (uint i = 0; i < N_ROWS; i++) { - State st = combine_state(row, th_state[i]); - - // Here we read again from the original scene. There may be - // gains to be had from stashing in shared memory or possibly - // registers (though register pressure is an issue). - ElementRef this_ref = Element_index(ref, i); - ElementTag tag = Element_tag(this_ref); - uint fill_mode = fill_mode_from_flags(st.flags >> LG_FILL_MODE); - bool is_stroke = fill_mode == MODE_STROKE; - switch (tag.tag) { - case Element_Line: - LineSeg line = Element_Line_read(this_ref); - PathCubic path_cubic; - path_cubic.p0 = line.p0; - path_cubic.p1 = mix(line.p0, line.p1, 1.0 / 3.0); - path_cubic.p2 = mix(line.p1, line.p0, 1.0 / 3.0); - path_cubic.p3 = line.p1; - path_cubic.path_ix = st.path_count; - path_cubic.trans_ix = st.trans_count; - if (is_stroke) { - path_cubic.stroke = get_linewidth(st); - } else { - path_cubic.stroke = vec2(0.0); - } - PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); - PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic); - break; - case Element_Quad: - QuadSeg quad = Element_Quad_read(this_ref); - path_cubic.p0 = quad.p0; - path_cubic.p1 = mix(quad.p1, quad.p0, 1.0 / 3.0); - path_cubic.p2 = mix(quad.p1, quad.p2, 1.0 / 3.0); - path_cubic.p3 = quad.p2; - path_cubic.path_ix = st.path_count; - path_cubic.trans_ix = st.trans_count; - if (is_stroke) { - path_cubic.stroke = get_linewidth(st); - } else { - path_cubic.stroke = vec2(0.0); - } - path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); - PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic); - break; - case Element_Cubic: - CubicSeg cubic = Element_Cubic_read(this_ref); - path_cubic.p0 = cubic.p0; - path_cubic.p1 = cubic.p1; - path_cubic.p2 = cubic.p2; - path_cubic.p3 = cubic.p3; - path_cubic.path_ix = st.path_count; - path_cubic.trans_ix = st.trans_count; - if (is_stroke) { - path_cubic.stroke = get_linewidth(st); - } else { - path_cubic.stroke = vec2(0.0); - } - path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size); - PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic); - break; - case Element_FillColor: - FillColor fill = Element_FillColor_read(this_ref); - AnnoColor anno_fill; - anno_fill.rgba_color = fill.rgba_color; - if (is_stroke) { - vec2 lw = get_linewidth(st); - anno_fill.bbox = st.bbox + vec4(-lw, lw); - anno_fill.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); - } else { - anno_fill.bbox = st.bbox; - anno_fill.linewidth = 0.0; - } - AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); - Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill); - break; - case Element_FillLinGradient: - FillLinGradient lin = Element_FillLinGradient_read(this_ref); - AnnoLinGradient anno_lin; - anno_lin.index = lin.index; - vec2 p0 = st.mat.xy * lin.p0.x + st.mat.zw * lin.p0.y + st.translate; - vec2 p1 = st.mat.xy * lin.p1.x + st.mat.zw * lin.p1.y + st.translate; - vec2 dxy = p1 - p0; - float scale = 1.0 / (dxy.x * dxy.x + dxy.y * dxy.y); - float line_x = dxy.x * scale; - float line_y = dxy.y * scale; - anno_lin.line_x = line_x; - anno_lin.line_y = line_y; - anno_lin.line_c = -(p0.x * line_x + p0.y * line_y); - // TODO: consider consolidating bbox calculation - if (is_stroke) { - vec2 lw = get_linewidth(st); - anno_lin.bbox = st.bbox + vec4(-lw, lw); - anno_lin.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); - } else { - anno_lin.bbox = st.bbox; - anno_lin.linewidth = 0.0; - } - out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); - Annotated_LinGradient_write(conf.anno_alloc, out_ref, fill_mode, anno_lin); - break; - case Element_FillImage: - FillImage fill_img = Element_FillImage_read(this_ref); - AnnoImage anno_img; - anno_img.index = fill_img.index; - anno_img.offset = fill_img.offset; - if (is_stroke) { - vec2 lw = get_linewidth(st); - anno_img.bbox = st.bbox + vec4(-lw, lw); - anno_img.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); - } else { - anno_img.bbox = st.bbox; - anno_img.linewidth = 0.0; - } - out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); - Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img); - break; - case Element_BeginClip: - Clip begin_clip = Element_BeginClip_read(this_ref); - AnnoBeginClip anno_begin_clip; - // This is the absolute bbox, it's been transformed during encoding. - anno_begin_clip.bbox = begin_clip.bbox; - if (is_stroke) { - vec2 lw = get_linewidth(st); - anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); - } else { - anno_begin_clip.linewidth = 0.0; - } - out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); - Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip); - break; - case Element_EndClip: - Clip end_clip = Element_EndClip_read(this_ref); - // This bbox is expected to be the same as the begin one. - AnnoEndClip anno_end_clip = AnnoEndClip(end_clip.bbox); - out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size); - Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip); - break; - case Element_Transform: - TransformSeg transform = TransformSeg(st.mat, st.translate); - TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (st.trans_count - 1) * TransformSeg_size); - TransformSeg_write(conf.trans_alloc, trans_ref, transform); - break; - } - } -} diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv deleted file mode 100644 index f906dac2bed79e34763736d9e148541190bdd856..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 81476 zcmbrn2fQUk)xF&?_YOJd9F?4N&N&AqDZ?;{fk{kYh>~*#$qFJM8N`5MA_xK^B0)ew z1QY}nM3F4=JCjZpV{!~rR?DxJ9P9o4Np$rG_#|t z&B3`ir@o$p_`hi>?}PNv>NYzL*=Cnrw%Kv3A;Tt27&2+{5o1OS88dp^h#|vAPaH99 z^60}x=r?`gH+tN#F;j+@4!wAk?aS#@v)Pz2#OcD)c};e?7(|cmc9ehDh)JPuFE~Bm)t%J z-gDy6ag#=lpE%aUIBZYdqd&LaW*>N;FW7lj^B6mHGJJ4{J9d+eIcDf&Zb3wwx%nJr zU)=$T_NjeYx=#1Qznf!Evp;x?F{3A#p1H|A`g7}P4uI>Uu8-W~zyrr0Zs|R17^OY~ znr{P_<8vT>-L?le=Fd<%^*eDVF@;Wib&viWdYe)3!WX~69ZkQH1nAt0z`fiB4<6_{ zAf+M(?e(vz(bVbY)!U2#x3`-;htjZfq2@6e!7-3K7yb9RnVRwN+`I70ZK1kznd6|w z2a;}kvow>6dM@Mv3I9v&Zhf|>zWP&(`tI@nOZ~%&`bQP@JKODl31I$5_SK)YIku?p zWOnx9{~^W~`eMx1oLI#u_NV_dlDVAF7iZSy)GAJA_xL{}bjS0Qz8JGLXBIKaOZ~ru z>CEdHeQ{=O&Mo5fU5WoCi2XUIh;d;NWAhQCMvwcKi|k()h%qkci!n=caS>yyal`*> z1nXbaSAW*#@}hom{rsOjF~((mF?yRXf%llqp4GX^yVtpzNB?!+(_9Jf_)&V@$vv_M z%IiJQ>rvf%VD=>U2em<*W4NZWx5wA#@?5*|?M0g~bn-zw156pmv-n`1&W$m+)hCS@ z6%$%F{&d(n`*CY=FT5J9(}w{gl-~F6IbXrAy9dnET;Erx^TeLLV?d?$lR!sX>9CP& za<1MfN3KTDzW@30ptBaQ>x)0A`5M@LrkA{Lq}*)WxbeBOLQ#7=y10KndYhZ!eZH-q znnyX-YR*eN*6Pkn`3#`uy!5xTKywGYubm+S#-RU`Pd*A2xpzjh){8+oZ1n1V<8d2p z4CQof-cH#a`$5gu!5N!xg3Y6TmQ?rX&$+kxHoVUl>=>$flw+gzuVXW~w1 zZtjMgbK&0^4|R`nJkb0V8uHhj(ZP>n=*RN}B$PxXT*KbD~<{U59JOHn9 z`X5HK=m9;h$F#8akL*fZfSvz!hPX49_tAF7_{ZRk@%`Y8@q=xCefxT6guDIG=24FE z!*Ki4=MQ&0)IG{ER;$O@PvMWD4~X$Uj7VR5|7JY;Z0%?`0(N6mRC^-H9Ik?;Z z!Oi1sF6}+3(|h})?$O`(-sVZTdAGiueye$;-@SY@@hf=Y_doQmzx8^qTZ=vGyLO%? ze){zcIQ@DSoPIsm=HTsac&At0c~bZ2@7IFO%W(7U_zvklIsNzDFOfSL^Q}v<0sfnd z-uZZec9QpR!QH%jnis*j59;?4yxWh(n%BYoxfLt_e;C#OBf}!lzrTdc;}zOYzORCl z?;pTrzOTW%`7YGF1@6zc`{eikG?twZMUL&w#QgqDTgmSYaPoT-T;}%|c=CG>+@D|H zmGl2LntfsVFF^Bqo3@hQ-@(c69dMc7yYOy)3p5{t`}1p8|Nr0YilzAPuAdKSBYAxU z?&dY9`A1(KgPTvlyYd{?`7E+~|L10PkN)eYx0z;|!WWy*jA|bFE=sQC_jj>-pYi*$ zaPKjG7v=t=zxS8Uv*75%(fGa@&HGFDSr3sTzCIt)?eT|<&}ZRhI(+-`|1aa$=SM%w z{ijc7T7AgtKf6zQD1CcU5A6*l2j{&PoG~2??#?aW+JcQ)e)p*6(LbJxG_&{n4e5Rs z{J*cD{+QKX^gnjMoM)!Zq2wW+1>DWQr{vS71dRsX+CX_ZzvtEH)Py=EFaeV>nZhSI*{Fh4l+xB$4@ z@9CRG#cr-chK?FFal|MQex0x0eYRf&`-GtrCy$;ydi=N{-F4c_dx|Cc>P^6X^x=J# zI_(W^7XJ_RmhP+POtfFg_kGVlxLNAo)?27q4t?18u?PBy*2#CkCz|xVQ`Vne+8yM7 z`k2!ht7T|wXghApf)DKAnVOZrV~0){GNSw5AsPJ32WH26MSQ#CJ$thnc+`l=o%cPR z&yA`1Kcr-=RjEIT&yT*7SR2fI3?19asi#@D;+w*E8ZmbK#Ljc<;70Djo1Y znVQEWbxLClX*FYv=*D0t?Z!N~%h^f0d_tGAlXmM)g?HEPpyqUNayh%}*XGjGT+rox zaWCoezPMlN@-}Wyb9I-mnRc)5^0m_LZSb^vKRE3^)b;CY_t7rzYxjvR?`!v&F7Ipi z`7ZBk_oXgx+wEyy?eevg*Ppw5o#gd4JbBH|#3iq}!TobHs96vmzr_o`r3d)+G|P2) zU;Zn1d0+l(ba`L?>vnmY|Da}Lc-q+&oP74|`t^;|K3(3n!ef|1zm-n^% zP?z_$`)HT9?G9?5geUGx;KY5U>(|B|)cmQ>Z*cQhFy99c89Huw_eCj-!E0#{4;|&+ zFjK|ns`x?`U#j9ORD6w!Z&2|qD!xO-_ZYz0OW^76e&GJ~%Dw^bY>$0=$#fmfb?^PC z`}Np}iSo`qgFlx-_g+TX*=Nu^9Gl_Y_+wy5})72l!aJ5_w=ihri!dsTeziVvyy(25^e@zE7Oq~c>L zKB3|hD?X{>CszFAik~uo4{A<>XKc?OVC!ivtoRozenrKL_&OC|uj1=he6xygQSmJ+zC*=#s`$mhV72mkxn^%0xif>i%9V)(4#dogwUKQV` z;`>&7XvK$Be0asjRD4{;$5(tx#iv&M@Bw^Ka}+#l^`rr|p5~N_pIY&ADt=zY&#(BU z6~DaVS5*9~6~DIP*H!$Mir-f8+be!&#lKtey9V$<&13Lko#*`C=AG&I4s*colLj?Q z@;hXIN95;%rNI5yEx#9qckA{vn}Emg1sb6`zMBs4?P>OcAIk52V&A*6Pk|5f_ndA0 z!v@#~HCMvNOzpP>hY}0psV?AojkR8Vl8#-pYafea( zMdWr{c601)p2L6AK||>B(g7{%+ki?@tEUcrP$RIoA9(yW)c@KEnV$sF@Ycx32WNx0xGEKZ;!V&IWCC z+xGI{@skhgeBiZ@Yry;Gxu;oc0Pk%!hL1R$Uyx7g{It9?wmS~+>1lSU_^uV-tK$1s ze7}khhx@H=#&Yxk``~6Ge9ZXqhYUGz=&(cjzDw?Hrl8Zc(G&Z$p5{0-#-kI?{vJOd z7JCG|zpcT|`F%0E-$y5h3#xh-Rs52QUp9dEHrKuDaT_@fnny5i3c;L|m4z&n3M;2Ib*a9rLT;M>!@HGuav zZ^ON=I$!R${eEYFPfzoH#XqR{G&7cKbWk%ReB$`2owYazxLnWk4e;%078t+>H4DQ> zckgEB;Z%Kd$&E6(7W6@LKPV9rt&5|9J7YR`AZ6 zYn>e5S<&(gG}{3Go@PyWE?vielghqj#ka2bwgdQ}W=A;dhA+m;!jun*^2*q0Pk&HfOq!V(PQ0)I_>|q@_8BF>1Ur0pM%ko(;EZ)dzuBgfXf^g ztN0QXUuppFZI*!_=s zXzh*3xMvKg*VCL^@$(D54gQ;O<8klzez7gH?Yn4?rMg9*rmR-_d2dqlKClgAZC&q4 z#`XR6l3=WrYl--;ab zaBhp;cU>Oyadz5k`BELV--oaR>d4DFa_cxQog9f{D03JB7?2~}9Ao!yS-Di2g*eS2 z&k#$&J=F5dP-|to@w4yxIUmt%OO0vo)67-PdtUr}E>N4h&>TOt%!#@9JfOBPXX@IX z+Tw-gGl1ICg+}#`wrruw+Jzd)Hey`@cw`kt))qLR>&HKFChJ|K7)iy0O z@9An=cC?bFV6j?bi%$Zz&RJI(z#iJ+SV7( zgOB68IH$*=SRC7~=3gau$69T1d>zl&=O%(OIZ8XvUY@ z-hwSRzH89#sI1u^gwxIp-smb1pZBMd6>KRP$T{{%J}zW0=SC zl!?0n*tlwol85UebyI&OGMxZZM#uAN7mb&Qm&7-&r+t|USR8~nZw?c z*6ZOkhkYp3GOqiA)vRZ%{V5Y`2-sL^+76(!t~m^)ln`^#Yat+psvxqru$v9@#`^es>Ocomwy)^}ZALuni}ukC9o zT|?Tn+3t;$Y4;|u{%T%BH&d4Fx!#O>3#ESA-FI%KG_L!OF|-@|o0N(DEwFXeyhgrF zS;jW*ovn{H<9>%Sajmc2xId&!+#i9BtF|Z^+)Y`=b*ysQ#yxQT^|Ot8Dbt3r?%}kJ z2PxCWLtyKuxz--0OdH1hX=_(E<|CAeX9uP+UU#2{dkuQcdFr!5t7U&! z1#Uaqm!m&3Qm%ov2sqE`UN@VfWsKAvqs=JIL)!)|-@MgUfNzU!i=WUGup93%u=C-* zEkCB^=3#yFJQj`r^~Jje(o^so;k|I{-wHOK`+nMg4Xqr9H(MY5|JLF)T7TOcOlOkc z0$}5LpU{7!mT%Vb{ae00Joyeo^EplbLt6WGEkB~=-bb__)AFTSer?OWuV}x$<=Ho$ zX!!xH{f{jl1y6t9NBe{eurrTz#J;=M{ahsW^OLdnr^L0RcjglA=Nh%dIvqZJ!Ow-e z=3~DI?m7;?0zOm0uZGWD@ay5T6#Q1W&-&JP>>qAQ26`qRVz^2sNk$d|%zS-8iFwB(#D2Ujc4_^#0L zJ#Lfut_W9)uXsgz=UQL6)Z4c1MdR4SXS2t__M7f@`uYTX8nC)^C-*a_nq%=)i>)L7 zHQaHP>o50RU&dPf1^Dy?NZ)@4KMs9iyxstJ_BZUtdlT+htH*y17RKVO|D0g`yIM~( z7u>egZC~y?IyK|X-(u?|-u`IY!jtcDt&d!P->Ibj>MTO@8PmqwsO4Fk&fPwcn{T>l zWA`*akK5-(`Bdy@UvYVA|G&=Cx5UQwv!pTY*O8Q7Pip2Z_w!_VjU5kP6CeFgX>GRc zIN1NSxS$y3zH!+AzHH02AIAm8Q1sbPfzJHEcfO8ixbJ+!y`TBsH{7-0yWeo_zV{6`p6`9bwfpWjT)Xdn!?pYFH~bp7?|w_} zyWf)g?l&CRdDk0mec$(%e2t3xzPGgdzBe4$_^vnHXKUZ}h8y2^z2W-%t~XqN-}Q#; z@4Mb`{e9P4a^Llq+;_bt_Z@G^eVpO) z``)(XzPAl`{Csa)a^Km88{c=f;qIlrvkkYt?`^~NU#H?56x_Yncee5O8PE5%;l}fQ zZMe_rzOM~;ynI(1?(@9wYQxz&I`7)Twfn9u-28mk7H)jswT1hP=)1OX^YL9Rv3+n(>*!hOc{U0b;A`K~S8_I%eCZhO9K3%5PrwT0WB@7ls` z&v$L%KC}9+E!_5e*B0LS3|?@bxqZ(TyZQL8E!@wRzH19NKi{>5YxiASxcT|6E!_5e z*B0(GwC~!&&CmC3;rGCO-xh9uzH2MF@7qf5JGVYgeBZm3-1lxJ_q|&mC%*69!tKBB z-oov#@7}`gukYT%?XU0N!ta6m?k(KU2)=s@x4*u33%>>Kd$(}=>pQn_^YOh~xcT_r zE!=#3?-p)8OBCGrzITh=&k(+M3pXF%xrP4}?mM?|^YL9rpf_uX2!^?kP{-;AdupRIq%>GQUqbvNRf$!EZ)Tm2?{xEz~j;A;B3$5Ym?z#bps zukBe*HJ`o2#`bgj^I$)R`T6{hv})|%psD-(W<1M{iRm-EwhxGrSe7r+hM&7VeXdWA zFM_ROU7zvfvAqmdJBd7O^Y>siKL@1WuYl`*zlx@wdVc_``8oZ#Hr8u!<7u;PpC{GQ z_8-A&iSZ|}n%_z6N?#TvpFhLZt?%=v-1^%73RctRv!{GL+V=Cr-@q#oN8Nt7e%}WB z%<33hN3NgPXI017F?|Q@{8+wATkpv^J+yz&+AY(U55dOxI&HZQJ_4&b9?re|UCvG{ zVu(}sA7JZ-f86pD+Hw3RT+RH_cOTf(cb}`}sXr~)c=pR@Z25Z3u46U`>}Ox=Xw%2% zZuN{^FW9kD&)7`|cI?c>b{ymB(bSFMv$|aDbGk8XQ@ZHYM(*qHX;dUAao zW5?S$uuQ*Z#%8~=zGi`|`CXCqwW*bJ=y<2S*|6E3-}R&qv!kh7-_H|r{hz{b9&o+x z=S5Rb{rSMwvyPun7vS_Wu71Xp>my!}((Bm#76RKpOV`*Ul#6nDXkV0j zwgg!1Mq;|g2o_tmTtLNHT6>J>! z%+YFK=SW-Tadoh|^W^77d2DNf)wKB;QXbpdVCP@kT9opv`*pysd+TV^M_+a4IOAaX zpXO;@Z1uRTho+t!)(0Cm{o4R+|Mas>xjwe(8guL|t?wLe2zFd^U2Ozb%XOtqtz5UR zG28RI%1yw2cJ{ky+mkQXUU%9zZSC22<$f<_eV;QnYwc-I?ssDyJM}kj?OwOWm-{`M zc3`tbYxjFZ?Q*ZP*tcx$ey^xqUfQ>6?S9XwU0&L^ZtZ^Ws9her`@}ZzdXBb5Q_pk3 zc3?HXd-S^40qo&*p>2CkHRnd0x!DQq{n>VR1k2s0zeL@g!T#GL>e_dqRLi>94eYwm zwkxIFwaG9xyMwQ%w2rp4w+GnvlJ}l)HT!Gcp9OoEx37g z3F4`n+rE@)$@_EQtQp1eN~HgD@_b4{jwOL@l8@+IPX-`pSUdT_pmfb01>08Ksp z9tu{=d<_G8*l%qIa;n*HvHciMXN5a)ImIr|oQ`;y`HDihs)8{$+6`N(+ zI2fF99u1Z|&R*Atfc>{()Z;$}Y@VCp;dAF$u+N=dd-~g^ntryKK3XQmIIwNrOgrxR z&)81x__%i_^ao!1lrXkEfJp zZcYR{H|m*-lfan^?TK?TSetQ@i;iyT_>8--{^KwEIkYG5Bgqb?xUE}mr%-MyIj4sT}CO-J^u=@_k8PU zv%bFS&RP0lDbJd8?9#6*!R6Rr1y}RBaO|%Jd$|6zeT7ra@eyb2zY4C${u(s(jQzD> zwT%7Oz=^HxI!-lXi<9Gx;EeqZV0p&=Cb0J5$zXR;pTSr^QUfr=yKP=OaZ-DJb_MmTq)$%N&P0d*49^`)Gm^kjn z_Ii+8X8gYmuE+mQH1&-Ccfe{H|GU5$e{J98R5P|Xd4C^lZu#u|1F)K(+vD>?xH;#! z?nhw#)Kl+nuzLBK`yRNuef}P$+?alT{4rQf+kKSs%$&VX(1n!*wEe4y>=Qb>jOoaMsczV0mmm2Pe*>V0qTk zV_?^kb+n}|^|CG7O#glXF30p!|1O~pS1BJ*fw$vz64gwxM@=}mgAM@fW)$NPBJ!@@?1BT|LH#WGB)!w_9>L| z|K$5SZ2Dv@eh*g5J@Hj=#zNaGoND%0Y)*%_TwQ)d%Pr-HwOqgKBd>w$edKjC_2lwL zuv+$!KZBEtwm)&InTt5<<1gTiJ?a0Ol=9gA3ZAvKy+tX{+W8yUwPPJ^)_;RiJ@x;t z-rC-#lxNQ01?Rc-9k4w2kN3ddyVZ^RKBZdXeh9WLZ68p|&CAa$9aY5Hwh zJ||^-|AAhQ@5gBB8Q*_`)iS=F2VdIp9<1#XPBq6@oPJJ=mVWwvP;NhcJ|6`38BjgX zfIZ+m18TRAYPo*WhNXQwl(uZs@;|NHUVO{v^}%qpT*K3YJ?x9N={VJlE6%vj0CpbD z$$6EVLwqgcI}r#(QRPJ>IjRso&CmrkEA1mK4K)0cVRb5m+dALa$C+h5<^%43@!Y`?Vm9#t-Ra z*Nt_w>0=yq$1;7g)aE}%!$L|{2Sq|)XtJaDC@?drQu`H!Lr@n!D{(V%6H`*t+j6m_xC#Lw&A;VwZz*5 ztd{*}Q?QzAR)62Mt0kw+z~BbCc21uqcBJ(9h`&Tom*d)OLFvA^B_-WywxaBz z+?vwg25m!Wf3~Hx@7q!Gzh-;>!*n-0P`Yl6ld;_eoUz>*EO%@_h5xSLNtEjLV<$>A z`{8$iyMxuzkEQY51FjyQJ;A%RKE{yi6Z>bhwRUqd#%JN`@!6~O@fjmNd!woEN-oyf z7rak<&P;FB_JccaE4MMNqhI2F4s33Dc9B~r_RoXOt!#IHxO#ksv_9n+9Dt^t+=hbd z+zv!bZpO5Zeu+B_Y;O6UR&JfxhlA_fMxd$3XC&Ad`7Tzjk9OC`D6rRt-?e5P9t2m9 z&uFkQa&5@a@Vc%J`wD{L9A{LuGL9kbIA9`a{b(kZPWO+6aOjT(qHcP+@-(u zHfyHqd^dMA_$W@-fn$;N;PrS6y7r967r^FjE*Z09;p(~X9S2r(56S1h6X5E{ThcFM zp9nU-Hs8a}pg&mMu{n%V?la3Nl&5l*dD{L|w9{JKZf)GtTa70A@;(FYWKR2LY`K2M zu-&u3wx{h(N_qNs4%j_X+u4+IZPQTiJg{S}?OaOvdR)Wqb?1ZqH`Q&|nAX+DdDxNC zdE1iGSl&-A0fn1;1 zF9#c=+%v9#tHdq*W=!kom$+X6n_KoXxpiW{8eHf0 zRW$YZT+{lv?>MH8!L?}W$?ZCDo!i&YlAAHDqhI1)4>q^#XL9SregoLt%64yrtHY0h##y9KTupIgDk$bKf*C-&RG#who*+u`c*`8wDb+0W$qI9?RZ z9sFlwPWQ8KDsZ@;segm>yPU3Dv31vA+y9M!7%y4z3=b--C^j{Xwpec~dm6@Slx2 z-5>s-z~TO&{wn96IlX?x)^UG$9c;VK^J|py><@ngyFaL#gZsmuz~+$sL9U8tiKg8cM`@@^y^190!^V;|eHtiXYx4`CJ?hk*3t7m`s8(6K}AKr$mAJ3WR zoxg*PuPyt-J79Ik#_Ls{{oy@unWy79743aaZP_0_Xf@{|d4GuZE@!R{xqilQfA|O3 z_H6efO1U=2;Gf`iTHD8za&7Jp{sER_ZM{z@<=G#m0lPogt}(5vPq{yA4L6qiLofc> zAN&om+;y=&w!z?IDb;Pu-yEx@f7652%DI~Xt{$Hm!T!5Hxi01U*gk1AGl8>~X90V- zmeptGT!quMCbo`id3JEtvcF@NXD!bGb}g%$v%h;*GarASGAG!&*-QKzues3FjqkoN zH`sc|wtOD2TI}<-cISR6{O5z~;}LECR&%s+73TOVWi9h805ZwYj5@msR+ z^Y1+4w-mazliM*_8f;zl)cq9LJZ!`3G<83Xt}TAc6n=ge8oy=HwWW>az}8hiv2AyG zux)FbnZDVF6~Kg{M|>|;=qt8*=kEi-iMtZm@zI_!Tp8>b=N={3&+E!{>l$^fW=>WG z*XwLGH1+tb4ld_Su20%rqqV0^xqn+&wrShhQ`ZJ(ukvro%Ik6+xOQWEFIyLEe&xMv zJv8;~sq2H)$~|>MxVnF%mOXVNu<^CIr#?sgjlt?(gZ^!S-1$9)Ha7*Ac{;!Dshgo| z^V)C^+}t{x9?5$PxPP0FwJz7s7`D3=xYHi(ZV8raGu}4f?5SIW<=XtW3bq3~7v{Sy zr96A;_TZQCv0Y@mj3MqRx9Uj zcer|d_5kn7nLSmmk9POC&wz8CnUnj*XVKJmVUKZ-+Y7Ab>EZQ}>u+x~b8y|2d%-?v z+RmXJW9$nyj(XbO4_vqXIW+aO{dusOC+RW@{cZ1$mbS}1VhFmnw0!{BIO@l@`3wb{ zlQ#45Ip#pH`Pdi7!n}u}si)p>uyxg)lRYWb(x;K&x=*9f)YGSfz-pcz<#>)pvro=f zIi3fjX-l6D0UJj>ZI1!hZI4A$Put_bYM%9Yjz>$|<#gv3fQ{pj^_wUwT%B%uv+-xV8`4&-1#{ItdF|&52IA;ojoP*VEC|9Y5oq0k&S+J`=2ux-n0sR7=dW!F3^UjgUY_p4wJpMBM@=DeTN=L)fPeD=K#?DgjyTnm=x+4pN;pMBM} zUqh*8KHe9u2V1v%_Pqg3-S}R6H-fE~XWyH^YO&wk+VlK+3tS(MXt%bSdtkKN(5&kb z?eD1|tfP-(?mg@du=(nfK79l180UFFu3zThn_&A~KKp(PO+EL!Z?`_i zDA(7W=-T4Us9P8*JO!^6YyL*zxV*)E@obR`=OgpTxZn?D%NU82%XS80R@uuAk38u3OirYc+H7 z0JvUfKS5KE&x7D{-sJkE&4*fh+LY(nH*H$x+4o^^`Ak_pKm8P&c4PbO`v}*T+84mF9JD_T@i;J=~Yo z|HwHDe=F*m7hA`D`Ax8E%6`28mSGUHhLY)y&6z`7LmLz5EqT-T3aye*;@D z`|{ghwb=jO+OxO51J}nR+Pkgheh}?FH0ydqd%xB6%f9>pnlX)G9es``fA{4N!RD(^ z`t%XlG0s|*>!;oC8~y>d&*i@SF`9bz!hg0t#whpYPtdi+k4>!;OTThoo(5f8_O@xk z)>Tj4L16Q+Zn-b_plgd?Z{b(&%Y)IirH$#p)>Y5GJU!U9wPpXE0qpp?FKdrJW2?I_ z>yx-MfgK<18N->uj&b%#xqe=ou3OirYc+E+E4W@~v!SWSXLfKoZ*qOo<{Yg(ZOXGR zr%lW3%X4Bc_dCbgHS3-<7dGw2c3++wY<}gwJP(?B_T_oOYURE>A6$I{+ReT^Kiv4* zvM(fad7r! z|BhdtF<26uec8Y1mutJ3dP{?y3uF5C{qpR~p8~&ykL?=My84v+@^<8JEcfN*uw`HN z-vW@kKOKN=dGLjl>bB*-9iWyqu_9P4bLTa`5?nn#D}((v1u_P5eQcjJnpMErQ&$6f zxTmVG%DD}v>s4$W_tZ7Pt_O4S-y@J`PhAV_o~o|hf4@M@eB58w23xn>U)DiW&o#Yn z>toL4{<0prw)m}I_?7$12I$&yjcy3Gu6pWj1U3)bDEF6*(Y3{IlftjuUp7V8mNqs6 zTUR~%%jRI))|P!_3ouPGl6*f_=v%hB`-?t_yA{~+(Vj8f8tk0rT9@nR8gmYvFXt`( z+k#7fdG?0*TV`+A4qRSySzlgf+hfyiZ1;v8!1dm+Bbs{lhMmA_<=(I}T)o^Ic7YpT zTlR)s!RoFhuT{BoE5shy0+{MdxF(GlJ{q{bLP5`>t_u2hP}Yq8$Jt` zYjX_t0cUU68!XrMFt+``jRAKwtby_1GEc|OH825PTh_p#U^S2A zJrO>RGiyMupD|nmlfhX7lfZIqj@@Bk$IP6jP|DZiJ6wjonF{`x(%9D1Ue56j?erp{eWRcY8;-K6!Tbx$_t_b$`3#znS_4u=Ui<=?F@-%;)jojMH)8az0Oh zYtOix2sWqkUHD08>Y2}z!D{7vo(fk#g){g2)4;~p=6udcuBU_5>4FdFJy>aG7W3 z^DK02na{JqY97h^9QYZWnNPWX#<1P zUe4!^jFo(GO7~t%<9e-JjPJtmjK?K#wd~#6)U3ZqTYoXikGV^%%jv%v2<`k^o%WS2 zuvv#3{oOCsI@o>Kf7{cx{kQu6N$KC)roL-(ZcY#Xebm&Sqs7*rv%qtKjWrLY{|=#T ztxde?TCV@PEw`WQ>%fiY_~|Rp7_0|2zBcR0{hhe|S|9AcLuwsuHxN&K1F$xKOR*89 zhtDPI8*-|dqu8+$tEJ5?!L}K`72JNg&itF}Z8$gQ^z*8|=BB1!Irh%s{G1;1ayoYQ ze?CszT!1odY)f7B2bnYLZ4X!T8gWkKnG^SS@AV!Fa~gLc%0*gi8;f$<#^RJ&<2w^W zpYrve35tNH8+S2tI2a`iXk%W`^rn$ujDq+Ev6 z{FbB4zgM{z^~!a!U&TjM{NRdDD7gQA-PD$+|9iu2x9tBuXzKU1d|29s5|f81HZsImDBmKO}T!Fc^ugOCFb#9 zwb)Mpn`i7Ng4NRINnq=!r_GbWws|b4ZOZjCrq{$NVC`Nn;#@B)Vq1mNVw9%&VZYjYfkQW?cI&kL^Nm8TX5Dwe2{wJ}&|%uJ+Wu z7_7~>j-foZOTlH_%iwC|^>jHrakZ!J6<}?~buQ$weHmQFy%MfgUQbuS6IXlceg&+} zxX!6Owy%QAxYxqfuHno$TnA2E?Wy}Uur}kmHsrD0050R+1XsI}GjVSQC$9F?y#=hz zxUMmIY`24*%Uo+;2dl+?2iQ4@{TpDlT#MfXTSq2W*?U*5vvb(|zUM*6yAo&YrUdwzWAu{5P%Lb5^HZi_<-49m?!E_fxOja~`Vr zqZNOm;?Gq4`HH_(@mDMU=YszY{C3N;Cp`c+Z|6E=`4g~u_M`{FY8fZ{q!#~&!D`u) zehO9_%9*wO2smq7d+PoStj#%hY~->199+hI46e2vXX5?>oVeOk_i?Z`<2uIj*q#KJ zai4;#jp0n(UxE`?d+I(7)@EGiN*>#@;4<#7;A#^&6ZhBP#MPd<&w;fW*EyHR_8V{+ z_XW6GxhMS=p19gm_eHQaspq__6E3&`xm&{o1BUJ7C3RWr|w_D+KlTpA&>3vVAo#u zj(5OnvA+v;4aNQ*SS@?Y`(W#+XK(oc?6tNNr)|phOUw_!uAjvG2&@+SKftbo*gpoV zrOkhWt)recKLOk3v7ELk*Do>InmTJBF{kmvDrfA|f}P9Q2f@|SW)Ijp>S?nVY@68| zakZ!J>R@fgb?oJ_tqCsUt_4>s?_+Dj z6IXlct^?L)T<1_8+j`(K?)q@GYq(~ze{TR!T^H{Js%JoakZNQF4Vr~mo zi+ww={fd2iuv*&O0c;)hw7DbLHnX1P`We$}b*I+ubu7+xyuCI~j~zI@j<=!Qk<;sV zXUbg1yHc;bj`ytiJ_X+&Jf!8Bi{0SHEw9Vn(bRKY?g3UyKWtMiWAhoXTCU5_g4N3F zaxZw+jrP>t8?4PaF?V@v`-023`@z-9>+*B(#MPd3Hr%xgV$F z`FYBW=Lys+$MckmpIPzqDt=MLFR%Dj1-}-2UCT2+C&JCEoS&1>)H6RPgVi!W=B$=+ zJQb{#`8f@&R?g4q@XU|))I9^N&H1tK^4QJ-mvPUAtCiRJIq<~Qp1S9PwHenjl*e{H zxQu%NT&=wSUI=E;fbp~b*}_#Gp=hx9@|&IW!$UbYBv^fzY0%W?WubWSetQOWAfO(26irU zU0n}Wi~R<$a}xWFV71t90z0;`-walZ{T8re6#K1Uwfx@iHn43P*SgwW@3&KaozrU| z*R)*!#J&UUI3@Nsz-qC76KsED{}xy+_xo>yt)t%AM;OaH!M1rU*zuCxoA%Vb53J4cvv2a)?gy7~AAqZs z`@~P+iK{(z9|UVNu45#R?O|{k_or~Ra-Vnvp19gm_h(>j#&xXavHcue#(fN~R_+tO zfG4i@)O{SR&A85)JhrF6j!o9)FTrZDKMl5zu|ES=i~U)!xyJr0uv+ZD2HR%r&w4An$v6G5XxKw zFHx_&23{%nAHjcWdDisHaO0F~`gdsRS<}Aq0&)aC~8K1v{)f}IE7JLVu{%TL3-UVy3zqTch?LDwsei!foSnYkz z_Tk3$6`s!L4yX4#V5qk zuN(`y(Wqzcr-7?o!&vw`o+Y5u!qx4wG3EX~&0KoG{$@bkT;<7iMf9nh9#c4z>tSGX zJ)F{9eP=oo^~!mh4z6FB>-73?Cf6CjYRPqZ{AYx#o2xP9=4vi8gMDACZm#m=>UVcX zae5rVnOu(qo9oe(=IXnaxu{p>IvZTSGS}JB)RXHRV727B8vb*_)y>tIa&tA8xxv2Y zQ8!n4a`k(%V>vyJ;Y_Yy0GsP^l;-O9iwjY&%ymAver2xnqp2s?1;A>_bzS@ygsYpY zG3DlJE(?SGZd2V{<;it3^bhs~boE~R$Cf9So z=6W8bx%&BX1L~Ezt^?Pv%ynHf_2jx9SS`7J7XS6(>gH<94kuT0*%0n$TXl1lC)Wee zFXZ$%pEJ2$05;bzQktvJuv<{C%ykpEer2wkqNyj>&A@8Ob$|RfhpU^bG3Acy0ob+# z52I8!S9zK1#n?Q`TrYx~>m`)QbqDH|xo!j3ugrB@H1*`V9ayc*b$htFxf)ZRTz3Qy zBc{5!%9HCT^vgLtF6GR)UIsSTD=5v?=b}BRSLV74T)#5cUD4E&>uzAR9tbUxvb#2IF8xJnyPJpYGpRo>wC$9F? zoe0)uT-TU9w#nc!?i9G1*G`^)4}&MJ_SBsU)@EGSsywzMz-8Pc;cDe~F-O4@S9|In z4c2B{*StKoFM!Lq$HLXhclyV{6IXlc9uL-LT(2E@Y$t-txF^BY%6IxF!xL9~>Yf7D zW?Zj9d2FYF%eZI2)lTQk_?`()TOTlpi~S+6*H-KggVi!#KLy*SajmO8z~*^1ABcX_M>36 zPTv{tpM$MG7;N9=`lS9Zz+OlC``q$4SpUa3^_S}t|0ls$amN2Cu>MbQ>Mz$P{!fEl z_wj!Qtp6`L^_S}t|6hS!&+-2?SpR1^^_T0Tzt3aOfwlXrCeE`O-R=DUAfafk<;=6% zbuGSz@@t$vx7|RQ=d>4yQ9fh-7H*yLx#dMP^*pz{1Xjx&8AC07dKs*i=a%1r)$-hO z49Wf;Za%i7Zr!Eeub^vleh0%}1*>I$_yc$dr)%4|j**&vY5z5F+5YQrwLIV0{vY9K zUp?*r30+&-|1(%E?Y{v|`^L3>HT}~5o8YqjzrfY<9Ax`%!PCBa+W#xMwzU5@uv*%G z8=UryYx`>YrTxEy%l6-atL1sg_TPo4ef6~e9=f))|2|kP?SBAH`^L3>HT}~5hv2gP zkKk%~uCo1qz-_; z{s*CJOZz=wwY1+0PW#5SeKq~k{$Oy~|LNdrc}}za>EX8THB_$u8PK(*{Tab(X@4eg z+BdH4tLc~aX9kz;&jMG=^PKI^3QzlG|7SzjmiA`{tEK%pz-iyOwy&mN+Mg3#wm%nK z?R4hIIJQ4GJnfh3e;#ygX@6d@TH2ovoc4`t`)c~7{rSOV`wPI;&Mn$s5T5qS^}i6h zwzR)6SS{@@0#5tJwS6`H(*C00vi-&2Y8Q3d@BSWPak%Y!pR(Rk;3d%1)Ao{JwY0qy zIBgr#w$=1Y+e?GXwm$_|yP|E|d)}wvw(UJH^_D?XPg~1^)0Q!8OHIGjSq@ybwLDzy z>Y}X`;I`%cGWAwOQ%_qffzy^TY)ehQ)L9u^wzUde?fRmvRpGYfy*Bk$LsL&%tAo>) zF>FgsztmX+T(-3)TqUazH`C#|4hO4 z->c$#7hL}#1=oLQ#SbjF{-X=7{~;A0Q*iw!6kPv_6`xda{ZA~o{wG)bl$PhYY(2Ph zP(Bx}kEWjIq7A@mc`kBJ)UrM{1gqt_Xd|#%`CPOy-1)7ai#9>mmc3z9u$r-*_szks z7uT3`FBd!Swj*}Vt(U*^-U>W7b@bOydve_xT;{qBT+Mrh>)SniTX=FU?;+cvYfG-% zTaPoj?g&nv?ten!DX)d!PUxVv(LekYk6<_Ji4~z zx<6PgxgG#cu3m3)^YdD>9kJJ$^^)r_u(|55pZ4TB99-r)0TyAb&)6(oSjWND@>|#A!D_yHe}K=$ zgQ;@@+%~jj-cAB%4BbEFj){AxeG{KTY2A#^sbI%PfBm%A_rC9A_xKKH?tR~F@wX`N zYVn;|LZG$L&5dG zz2N%aQSomST>rZYuK)Kd{)2++e^0^nzpvsyZh6-D>2SxiyqBGUrk;D*nP9ct%N%31 ztnIVFYPpx44OT1fW#_=1ulingF1og?$Me8yS!Wl3oqyMpb14@)kG3Or4y~7Ub`jWh zroVpLlk3IcGS^GsYURD`Qh0LBwc}oP8M?OQdO27vxqb$kvVuHS~MmG`nc;pST2%f5rAo;<$`R!g4W11C?fTe-P;&Dw_8>(zS6 z^M_#b)L%dC$@53xGS9o=YURD`UU=H{+Lzmo*S-F^&hH1?pYmSz0Ghh~?hEqd^%Jmp z8PizW>%IB`>>l@WX0N`l#Xq9_af|Pvyr0v(`X`jxs~;q0xmW+9;O<>dRQ$<;yVw1? z;QBvb@!u3&|KAr}|5q#ihl1<>X2JD;tKxqxxc=`KT>lR%{!zj8|D@phPt)5!9`b1m zuK!F0*FHR?EG^ zIakXXe*~ofz@(7J`Q$0y1rdoat)?P1$N!*ub=kh`b%(`>(g+x^4{?b{QfrA^4{?*y0+x{E3jH}eGZ&lUDtB+b4}Zh z*!66^l7HEAJhDg5TffTHZVUjIJ%Yz5!NCu73e1SFdNe z`FSndj@awiddc-~U~|=9Kkdo&ZE%_E-{ETIz2hDD{nRh-9q*#4C(rl5YRU5haPo8? zk(--)h;4}7KdhHL{{c2n{q@tHJU<4PdHxfwR^B^!yzgt%y-035?nCaU;ndA9% zuv+q*7o0rZpXKK1-fSCU_hsuP&jsYf)?YvE$#X$)ndd@qweI)p-8>hDn`gdvHP1!R z)RX6;V727AI5>HFUyz%d_XOJzdq1#V@>~jRp8D&jy?$nR47(6T|pli#zSrM$3b+asDZM)n7mD$#rXRnd>%i zwep_3Ej+oF_uTE!wI$c>!D`8MM{siWdXSr+*MjYcy$-CGTz3JRtN!|FPp-Ry%UpMZ ztCjcM-Qnh1-gEarQ%|0Ig4L4eXTizS>rifPUW2wF_WHA4^4te(p8D&jz24(~h27(q zoY~`^Zt)Y8&$Rd{%4a#<<9pF>m6p7nXKTJ|i*KrL%vf3RBitRY~va?d&d z?s(RF)=+e9SqBGNk2C9FIN14dz8!11*m1TUv14q#tbPfy-P+!_~?? z>tJ|tE%&TL(6uGkF<`aiIu4v%ooBiEImfmmc7CmwTn`1CtN!|FPp%WeWv-LpYUQ3a z8J=9rJ!=ZOw&Z#kSS`684orr5H)n7mD$@OS(nd>odwQ|q; z0zA2vd)Be&+LG&WV726W0yw$4zUAiU+O{3B>)LwB^<=QQ>aU;ndM|hxyT@-hvlqP3 z;^!%U+hVVy7h8NM+}@3 z`IOh`sc7oCPEP}?m7fJphrdX@yhAtxu2#N7I1{d(wQfJohC3FHt$mY=?U(I{?UQvg z=I4PObN%(xp8U@T*ZE(7rk;J{La>_m4*!<#o5cDeTs>>yVsP?u4CUtH`01$UMZc8}k4W^KIEV%Ns2Eq1Q|(BgY2U*mLb{E;$i<7(oU zYvY=VUt4h3#!UtHySiHneh>WCmS=3QfSXUbHokvvv1$S-SU2uPIa&N(HY`+dSpK@*7 zfu^3d@eQzAxi-EDH}|ZKZ^6~dwefAZdiHYr@g2Bh;n>un}>8sz()cY~mdD7-s%9H2) zV1NH(ed}qj*T#F;J^sd-wefa~T^oOIv1{X<7W;YnT~61=`;=K5j}X6H8;@4}=LL6d zJXP?A;7=FaHlAsD#`XcY`IKwpCur(f8xMlj%C+$j+}yJ^9)_!xYvZSIb>}2=@H4n$ z;n>AsY z-3R_m%*?Cv_D;dIzgO}13$Fhs1=oL?!Ts^%(-vI+849j_ri#y8aQ){hxc>80eBOfV zzevILU##Mb7hL~k3amXAzE#2X z-=X08?^N-f3$FiO1=oL{itk%+{f8D@|6vs$-tt^OZ?GO*bLDmXCYpM# zu0b`g3BomRfz@)o{S~ZMey0B$-1Tfb<-5PP(Y59C+~2`!xh~%YdwtlhYh5mOo!gGs zHEz9JhaZ65FZ9<>dvg5{T;}=_T&?^}{||U_&F37yYyTKsTXOv;SS`6a7|GS^Om2Q& zW40sq`m$bf9RxR5{q@tHTzkM}uDx)z@-zKlxVid$SvjuLsdFaR>A`Brbw+S<^%|F( zpVznTh`qM0mt1E7o2&l%X-}@Rg3DZIgR7OF>1T&0*K%CvK-ZRB=LD-I*SW#T)%`_o ze(o){BX(c0UUHoeY_9t2r#-pO4=!_E0Iuf!A-{uN5N@un`EpzrLf4jD7Y3^(*G0j} z)xAw_e(r0wBX&=-UUFRmY_9t2r#-nY2`+P83a;k;F}W@cPp;*-ehOV%a{V+|Ex9fW zPOk2&a`SUfwH>khsr8cU3Se{9Uq9{1bwzNQ>q>C7{5I0x7q1LA*TLX&TvtKYmRwf_ zt0mXf!O7Ja2&TT#k0hTDS8%b3Q}p1igLmw9auSMzUjlGl##Hf}rPq^** zcO~UI{0zFb%|AmYd%&O4|_+r?g&j9RW61{q@scKf}(>xOvP( znP=FUTRbD>EG_o=W7ZZ=OF0{-&$Dw-=5xvz;^%o*|8W%`UvQsqrWD-g<*5a?jl)}> zbu|)hKIQYxC^Ypv-y8&1bFa_5jE0+gKDQnWS1Uia9s*a-oY;@CaL2;2wQq8<{jwdg zeX?%Gd;-`p*Iz&F$^TGro&Q8M^*nD)0;_rd^1IMah&36mp0#ioIC(jSa&vL~^wn<~ z>K_4ip0qiZ^5l6W*v~K4x1RQTZ7hi0V_wRvjrm$U7v=mdo`-S)PS?Xil=*yflx*J(?yFN~-_^AbVeVkKp{m-lT`32Yi(t_)MdBv|Nxc*-)xc=8x{JMhce@nsjzpdi8 z7hL~43$FinD}GnYvp$Z7JErCOJ_b!a>-!5}HP^TE>KLnKtse_k%X&HvtmeL$XN=?F z&X?_!`{fDf+Oh|o2v*CwI~nZ!+pcpd7dwx(BX$n0mvwd;*y}-m{j?|7)4^q~XTa6G z29xWV@Z_3n)93fI(6uGkv%zY~^;~drbsfpg&oyK_V%LxLlIsOvbJbrz?aB2*aGC2D z;cDgc`$h2Ns%~9#y%=3va=iqsmRv6bCs)_7-27a>wj;iR(t64DOJH-=Uq9{1^~>Ng z*DK*_<@5Vh@Z_5Ban1EB=-QI&)nK*cdJQG4esuc1Ofzi zC%8k9!QI^k*Wm6lFieoZ5L^;0*dW2(-Q6unu-W_5hgGZBe~adxI#uUZ-Rj%@`n{k0 zSfAIedBk3`>Qd{CU~4trIQ^;hCUC9wW^B2uO07e%Q)}I?L*ezM)?2`Gsr5E+YW2RM zZGGM|%p>-Gp)R!!16!-{#_6y3@Qd)xKyALJWAq;GeQ0{H_v#rLv$i|Y${*AD>+cuc zjV*TizP4 zczwz70a&ek*7zYfYqSPq25_CZ{*S=+z?kIv7;H`L``st-`l9_5tX@9a&%o}joNwgd+b8yyN`}GC9zQlbA&YJ7q`3hcN*8DYCt$eiKfV<1_Exf+OeFsjCdXB$`*OweW zfYr*UuOq*z4!%QU-FLxtCf%TS8#VZeuLMSxZlCa zQET}FUSD$j305ngHF{IX8m+;Yoc~eq*#l#eYgDi`xp(S%MuXQE?dV|j^3jd~&fZb$ z-jOplF=K+W=DH_(!0SugSm3O=#*GcHFKZqLtX4kSalzf?7!O`w;>HIjN8LLU!0Stn z3BhXR)7Oc>_O%a(H5ik5Ck7{fop%y=eTkbCocy)dlfmmt{>i~=<)fVf++B_-;q@hM zDsXbty)!kuzT}t&tX4j2oEDrlT7xk;|I>l(ficN7J=mJs_njHw^+h`)SiO9-Gl8>r z?3sH<&e+7v49=Qszh;5gm$;tbthw%;Uhw*|=H6hn^3l!;?k>k{@cI(h2b>)B9M3M# zksNb?)yk)@bAr=XYcMAB_5~+@JzsOd>r33+;N-8pm zYT^*?aP?;8M6<)b|voE-I@ za0a}-#GMIFj(Y8#1+OnT&IYTMAJFWjbHL`-m-)^G+q2Ycf8>l!%z5C{R&$;YuP<@` z0;jfmzq$ZkUuwG$tX4kSe}j{w-g_>B*O$2efRm%vG7w&0a$F2nE1&gW0yek4%y%g` z>vulaFK29GE(52wx~@U+`Vx0JIJMPtcm=$^)b?MnTKQ%SIkZhe{WI&ju+Ue_;YY+|klr?%Sj8{qXN?nZEGt82IkUSDdv8LU=5 z+9BZNsOuUEuP<@8fRm%f-3qTSIc@{1mCyQb2b)`8=DP!&^*f*Imoqjo!@#Mnp8q@H z^(F2uu(h>6tKAK+FSXqRRx2Ovz2M}iz55@$zQo-JPL6sm?uXZx91noi%4hu#g3YZj z^9=`^w_Wo?@cN>C7_44C+DE|lE?RphXKZ2~1!rBg-pAneCGK%>)>V7_1iZei>q)R$ z`DmX4Cr8~APs8g=+%w?hsI@!`uP-^C1FMzK`kx1zTVLjT0i52|`Cf$A7wt=6_43ia z3{LOV+B-R8`*FYa^VKV0>#fhQufoaa8TK`>oO9$E_H}G|`{?u0TVP*)zS8$5$GaRp zABoN9_uX%UU5|fu$oF)gKi+|px4t(RwSAv-PW8?c{d-{Rt@`)j555VTg zd>?}Ck@Kn7HYWY~2yDHzKOe)%r$3*7?1*U#YOhtmtczWW@kRzCT@09%jw ztVi3p_`d|V>-h>!KJ|PJmh&@Cet!H7w!GI-=K2;px$KWMeFrC>8ovk2sgLdlaCFwF zZCw06g4=8V2~IvW{tTA$&p%~P{emr@J@p&dmwQUzuN;4JxF^KP_dB@VlRx0(t#2fw zwtK=k)jLo0{)N7Fy`%Vpe2(a~tKNCivr+Mx!`Kmw+VPDJmdm=wz?K`0Bk^*nWlV5- zn>l*G$!CtS!E$49BxW42d2-&y#WqepbB_nk+~(FcF8=Ys_Obr?tqI`d6Eh*$Ue%b1 z;Nzaw}uIQhg(4mNjVf@7sd7G zi*u3B*PM)tpq&=n7wrC>n=yahW)^gu`Yzj;p2Z%7JzKH&N4HF|jbFCd*Wur=*!njr z_ILRAE4Kdqi+vFH(SgO*e{r!d$A4w9^$#w#|Ay$0V(TB;*!FB1>|WUR!G5|{ZU5b@ z-SG7W@5w0dXEX0qYDe z*z$>4v5E0KC1xc!`Fv-vGT8N+Q@#E?PptyZKCyrK?7u3!I$zdioLqFPfur*oD!SF- zlTV#-a;bL>aC=Q_!pSFQttO_fX>B<9tZ5yvYwE+HUVm!q4^C}EsSsMjC;R$z1Fdu;dJ*4XmVZv&3r zx%8)x+k(@_x~H~-SLe&M87CLr_TcF1p4tIE`P3OFmwI;uw|lY^oP1(-ZenUrc7c=6 znsx=dram0%^+&%OxZRW8;pC&=1020`>Cf8s1ZQn^PwoY;&X;R5PAz{w}anc(D5uRp%CiqBe%O>fQyr#JPQI0s%` zdTN|pbmxMjtJlPN@X2RSjgxaNUK8gxx?B_5?Q3GO5`RGxpJy0tKYwPv3mbo)VYJ)x zEe@CXi(2v8?f50&a!p4Sler|KWn)Zd@`fFYaPfa*LB`_sJRR*mus>ov4gPX z6LUG(KIR!t+nCgQMdQ!(5TF#g>S28hqf{L-6JIP`XM;^{0!`2u$+f4WZYB6?3(itIOLt> zIE>4@kAmCtJ_aYBc^?PMc{m4c{CnOf8qRr*%fEa0B-ot#Ph$=C=qa$g>$8X2_VE9Z C&%P-D diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil new file mode 100644 index 0000000000000000000000000000000000000000..4ebcb1cc55f84fabfcb69250b09215f2e5f48a4a GIT binary patch literal 4672 zcmeHKe^3)w9)H>0Y&OY~Y#`VLf^I-K1k?p7px||rAR=Nfs8ww3bpz^YD`^x;MZLO7 zAc+>+P{gA2dcj}AS=*v*6{zhbz)_@V@yE5cM;T%*r5)Q+Ydf}e+It(M)9cOjpPQTM z&GgND_kG{zeZJrK-o9`6EO{kHlWXPkr~X8}6JN6M(x^47GAIE6aNz*RV4n-E2-<3B z8=+l;=7STbp)Jc_#1e2k*%mEZm^L{U`#=0_Y+km4-2!oTGT>+Z53ospiouPDdl3zR zMnl7(iER>BP7FhPz3qm&Nqmwog3Fd0mLJ0b;DHo82NUF#EGiNcQE+Y##EmBZu$~0q z0K}9ICbVV*oRit(>kWO_BAiN}`;?@HOpp>$Q?bWO1?>XF_>|Ikh)UOH6ZB$5`chAU zSIe04J%>S5y76j(%3B}TV>T4q>w?Y?cVRQ~@nU!gac?noBjRyo9MT%*8W&yE4=&(k zH$_+C6&^$lw?r2d0I&$gpfD!F!5f((!X=kTz)l8$6o{qA708)2wOB*kp&I!3pr*yE z4KLbNr>r?tL{(N`yjMVYN>As5Le{fK0=Odtppv_Ra7iaIUVAop{dd>&)9=FeTlD6^ zQNmH0?FU=1hofih=$dN0zS4N_&NB#l&L&)e?qI`WLtSzzG{vfOuj&QyV4=qMl>v!Z6k_vx z-v9^`asd$ZY6RnM4}fW^ln51~2)J@7Ds)TYfUHhr0IaA`qpvULLNz9iyix?HVV^#= zNgJgp^47@{7xrQ%B!NZDi;iMvJ`pNJQS9beXau2R)wB(7bqC1-frjhs4mL%xT_MpIn-(*q{9E{tkO}+Tsdj z+2VI8ECS>AF<=0W->vpZkvI63=xZs&;(IYNCQZ)i}y`Q3q>=7+v3jc+~( zBg56lA@SvOIL&G_4a$lbQ%W<$wU1|V>@%(PcT`hvO3vLn4)8bWQCi2>+rztyJx9CnR#}%)mk|I z2uu{U?YWAB@Sd8(dn7=-$6|Poa|l7#6V$EKO6Ge^*)`pDMnP7+MOM+Yi5vy$Pbt+6 zK{baoPVEZBo@>#qW2GUQGiCst-)EX%ht40^Qden{Z6jpW1#X*7w$+AKZE@QechrDJ zb{1Nmg}J3e!F+xE~xMp57@r=QF#uuB>{bA0N(?D?zRi9uE`Q`(N8dNvcD^B47O`>#sRiX>#* zgf9PDb?q-osB5>22X_B!U4x%azo2W}c1##1z!yjhRI+MZ=o0Cnzky3&|KDm^BEh|6 zph-npMGW44?4yRVy0YKMU?Ukfn;Lm@veHnJIv(t_qr|&B@#m!2JoyL z0p)a?!pQI#6br6*u|z= zmJ_Gt$Q|W$r<%+pSJOhkWgvS1d9>~LyRUQ{s&DK#zeMgxUbAIZ)!NM!8y>bULk`HJ zo_+j&y7~TnLwo$KCf7&l?kV{z65qLzzoIuYf$R4MJNg>WzV?qZyBp6gksq1|W7k?M zH&>en4TO+(|h=Q2kW@2PKJLDaZ&;#k%0=85Tqc6t#0!G;ec0UEW!xnS+OC+Mv z9>C3ZB(l!_#6*%yGAko}SfoTAv)?<8GqjaB$C0}zG}cORdNP@=+H0pB8WaNtwy7|n zIx)B1dA_!gj0+-D-e7oK+ zWoc`+4Q_?1>@?ir;cHuTRlzVSFT=w*`9zTGG(>~fRy{~C)fi|ba;&WGyO|HYGt&N3mp1O&^H=mk+`w9zl$BeVQ zGlaAuN+F3FNM~2DiwDW&V7{RqGqr+Vo0F;wRGLzIOlCvsG3kfE5$W3Da|;OTit+N~ zkr+jME3bRo@dyYgOpC@$oP_QUyPL%?hSh48kEA_vqUf7*}^&$98&0&msrN+`{0?&+5@B`t;w2H z2UDbOoS@!G^o}6ZJy=_pZAy;D40moCjIfxmQDdo2lQf0g@I{1F%9B|-iv*U}V1(8f z(bhFGrA(xzN;1q6;>n}gwKBt{O%dx2_xgR&ue2+Ozv+b_-a4~~->X012;X`onNZ%Y zmPNCAuS`cyh}E(R+2BL6rzb|Yf~h+_c(Sn2_;f3NLQwQyNiO)2Kl|+~3CKy2uTarx z>&1%OHQQ=T^qeAwugo0oO1ySiNsw9CGmLpG+uTq}CUcC-UyPW=viNWu4Rf%L)Et0Q` Y> uint(2); + uint tag_and_flags = read_mem(param, param_1); + AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _121; +} + +uint fill_mode_from_flags(uint flags) +{ + return flags & 1u; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _165 = { raw2 }; + s.tiles = _165; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _79.Store(offset * 4 + 8, val); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) }; + AnnotatedRef ref = _194; + uint row_count = 0u; + bool mem_ok = _79.Load(4) == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _186.Load(0)) + { + Alloc _217; + _217.offset = _186.Load(32); + Alloc param; + param.offset = _217.offset; + AnnotatedRef param_1 = ref; + AnnotatedTag tag = Annotated_tag(param, param_1); + switch (tag.tag) + { + case 3u: + case 2u: + case 4u: + case 1u: + { + uint param_2 = tag.flags; + if (fill_mode_from_flags(param_2) != 0u) + { + break; + } + PathRef _243 = { _186.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _243; + Alloc _247; + _247.offset = _186.Load(16); + Alloc param_3; + param_3.offset = _247.offset; + PathRef param_4 = path_ref; + Path path = Path_read(param_3, param_4); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _272 = row_count == 1u; + bool _278; + if (_272) + { + _278 = path.bbox.y > 0u; + } + else + { + _278 = _272; + } + if (_278) + { + row_count = 0u; + } + uint param_5 = path.tiles.offset; + uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_7 = mem_ok; + Alloc path_alloc = new_alloc(param_5, param_6, param_7); + sh_row_alloc[th_ix] = path_alloc; + break; + } + } + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + bool _325 = gl_LocalInvocationID.y == 0u; + bool _332; + if (_325) + { + _332 = th_ix >= (1u << i); + } + else + { + _332 = _325; + } + if (_332) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + GroupMemoryBarrierWithGroupSync(); + uint total_rows = sh_row_count[255]; + uint _411; + for (uint row = th_ix; row < total_rows; row += 256u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _411 = sh_row_count[el_ix - 1u]; + } + else + { + _411 = 0u; + } + uint seq_ix = row - _411; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + uint sum = read_mem(param_8, param_9); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + sum += read_mem(param_10, param_11); + Alloc param_12 = tiles_alloc; + uint param_13 = tile_el_ix; + uint param_14 = sum; + write_mem(param_12, param_13, param_14); + } + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex; + comp_main(); +} diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl new file mode 100644 index 0000000..7640ed0 --- /dev/null +++ b/piet-gpu/shader/gen/backdrop.msl @@ -0,0 +1,284 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_79.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_79); + return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +uint fill_mode_from_flags(thread const uint& flags) +{ + return flags & 1u; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_79); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_79); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_79); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_79.memory[offset] = val; +} + +kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_row_width[256]; + threadgroup Alloc sh_row_alloc[256]; + threadgroup uint sh_row_count[256]; + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) }; + uint row_count = 0u; + bool mem_ok = v_79.mem_error == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _186.conf.n_elements) + { + Alloc param; + param.offset = _186.conf.anno_alloc.offset; + AnnotatedRef param_1 = ref; + AnnotatedTag tag = Annotated_tag(param, param_1, v_79); + switch (tag.tag) + { + case 3u: + case 2u: + case 4u: + case 1u: + { + uint param_2 = tag.flags; + if (fill_mode_from_flags(param_2) != 0u) + { + break; + } + PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) }; + Alloc param_3; + param_3.offset = _186.conf.tile_alloc.offset; + PathRef param_4 = path_ref; + Path path = Path_read(param_3, param_4, v_79); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _272 = row_count == 1u; + bool _278; + if (_272) + { + _278 = path.bbox.y > 0u; + } + else + { + _278 = _272; + } + if (_278) + { + row_count = 0u; + } + uint param_5 = path.tiles.offset; + uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_7 = mem_ok; + Alloc path_alloc = new_alloc(param_5, param_6, param_7); + sh_row_alloc[th_ix] = path_alloc; + break; + } + } + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _325 = gl_LocalInvocationID.y == 0u; + bool _332; + if (_325) + { + _332 = th_ix >= (1u << i); + } + else + { + _332 = _325; + } + if (_332) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_rows = sh_row_count[255]; + uint _411; + for (uint row = th_ix; row < total_rows; row += 256u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _411 = sh_row_count[el_ix - 1u]; + } + else + { + _411 = 0u; + } + uint seq_ix = row - _411; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + uint sum = read_mem(param_8, param_9, v_79); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + sum += read_mem(param_10, param_11, v_79); + Alloc param_12 = tiles_alloc; + uint param_13 = tile_el_ix; + uint param_14 = sum; + write_mem(param_12, param_13, param_14, v_79); + } + } + } +} + diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv similarity index 71% rename from piet-gpu/shader/backdrop.spv rename to piet-gpu/shader/gen/backdrop.spv index a1ed3320f81fcf8b024ac9e673006b6b4ce80a45..f3a782470d434b7dc5130e9c37d12365f09e8d2d 100644 GIT binary patch delta 3113 zcmZ9O*>6=<6vp@MAVWkY#rV)@+8B%mABZLv=6k8sjs_u zL`sD;I*m-f?;5jqR8}mc)Ysit>}*@w-QSy1W2#RjG#I!cb%RTtZ5_+gsn`;DM-?}u z<=`btyW2}Se-_@~v7m2Jl{cm`ysx9P)Z13gtRurW1eX-l5emhWN% zJ55^FRb4O$UQ%3I?8-OUJ&f0CXr{n$o%M8uA1ylPw z)m8De`6IX$Z993vXTT?8huF;yZUfta`l?*J9qc+{?ddwi5s;bKRhYpjc1NxUzY~6t zVk`4Pcfr-nn?lpZ)c&E#Xm)zJr{#ji0V1=?yDn_X?j}A9+eQt?F&nHl7wp03fc<%J zWA|Wc9$Xx~+zYNf_*}UA$9e99dmi)bO-{}^)#jq$oN{YR^Y}JG&Mb~K53IJCaxOJC zuUu@I&^jMvQD1~;WC2*+%wl-K z2;K$u^E8-`?m@6Q>W)jksF1R1aW{ih8}Ws6zBc8{CU(;vuz7l$>c#hA{){inQjyZ6^35V2d5dFVv&p6_|QFjmN;QGv)#OI;cgB z$HBEz$}$9#;<~Q{tN9wl?RWz2gESU}R)N*yMO_WHxVks1Lq<{`yVCwmHF z19t0a<&HCw^>Rn;Vtp)r8tg>k^v{6R;w~Cft9^V=A zTJn{R`=!J7QF+zwV-SCcaU_nt(~ zCt&jieE*~P4g@zaXxCftYHsAS-37L5V}p2kG*JPYZ)`VSK8e3+{#E-F+=6$FF>`j} z)u&8^>;ZTpNB&an*Q_@m=rz{jxo?io|7eV~qB9$yx{d@$Xq(PfM-=Lxe P7UC_?QQ2_m!IA$1Sv-so delta 3193 zcmZ9O*>6=<6vp@Mz|aH@%3Fh}F(ev%(3qHjV#0$)!~sQARH!XfxN@=37K?};1ROy{ zucD&jfCxfG;DX?+6dVw7#3vv8E1VLdzu)OuFGN`2jZ9hLT;?*87Cnp0z1Knw5FLrT& zy%sH7oG++>mvr=WEG{9Z*mfd1Ov0~(ufkuLMx@>N6Sb3LXO?maQ8isgnO^zoxf0fMC9LPlSkIL) z_*nWWV{p9^*738_=HY>~y;>UA+U(L=b?ms-Sr#i(EHa~1!SzZN+`Li+*B?8LAFnMP zcVP6z%5n{Vn@0pNt)5Hz&FY>_o<6qM2aht9|Gk@n(pPOiX3rNc=49 zQp{X5zom!=zZE{n`c28v(7=~)SQt|8V%=E zOKq(aI!GyKoUs#ZOi^8GuCqF|ZDQ61vZ?P%401QvxRriiFq=7qIMzM@c9`G~g6%cPfqbwZ z0{ay*o^pE8JvxU5xR8yWXRE6ich^kJ~l2VV`ghBqUBpx2181@w+*XD!@oHW%w2 z0cR%SC4Urba}z$ekK_G$YIBca#yqt+8a)AafVc{|@}tmJXhr@;v`9yQN^jXU6aynHB!{u>|$2a1R8S+Fr5B8&X|JqJIG z-&WLA;no>fsTc76RLb1*m@!W)jzL}o$FP@xU-FJG!c17yLX($GU{1RcFOLSVfa~x6 zRk#gf_}9S3;@k5E*q?)$dmY<~IhZ($#&1Gw6c5o`U}NK zDmQi$H;Y`nmXX6}ry9LOp$%eyZD3Dp+y;~o>dOO~jJ-pg| zV23tW!|(I{V}lRCE-<+tFQ3F4?H{QF;5NLA%vrM+Z(Mv3?}?QDL$K28{px*;{{-`A z{?j6_KXw0~L2MGkd=B=s@fG|6Y>jarCizF0&D~5KYrh10kWl3i*qCZMtT|)UXSpng zAtB2kSRT#30*5L`!17S#Yj7J)U1KiRd;?aU7~os5G3!G)WBL7iFW*68fTLh}H2WSL z0~`a(BliQ?jcj@xFQ3H!T@Cd75!{NmpX=fk<3ppDSMYA2k3oL{FTkSzuV7>2PqMkwn>1*HGTsJ@YUT3>NBs|ZRh5qb diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil new file mode 100644 index 0000000000000000000000000000000000000000..e6b2f1aa3f98102c410ab7d255fec28fe675633d GIT binary patch literal 4676 zcmeHKdr%Zd8t zU{{05;u9tD7DbCQnxKga#H4nC5Hx7;aS3{q1qO*)Bq7G6%t^)T0ZDnes{C_xRk^Bk zRZoB4uYcdy-P7Ol8&Z}UjJIE}oNqLjvwHo=@%Wx0pNfbybXg>md?%)lF)E&H^dD_?@cWQz#)jq zY?`4p1K=EC9bchu!)D`j?DU-?2NfnJ14FRavUpW0V%RA&)X=f2M3R{+ja}eMcdInU zG}jRj7;ETFm%A%MTTJ>)Yk80T_I7My8lDLcA?z)vyjwUfp+jo@$_4}z^@1yS_CvuG zx%qXdeo!z$Ismg_3<_fcY@9)pL6%8GBCt&ZKorDcL(?UiRi)Uj&>9DPeCeomtNb&z zm&+VA8FW!0#<_WfqqR&Ln8Uhui-7JZ3FydXz+Y2~4E+at?myG7o$xGnz^pZO^pUo# zL@(HcZ5TWG)us@`{UwIw+s@C*4;lp*nxnO%d`>2Afi^figSJ%0rS4{b3W`_WU{f3u z@)l*%N-5~MS~KyGQ3ME36?mORoHKrnvt$hlZNGO- z^1KfDX7~h(%`8#iAqq7_o?E_g#Mm@qY~%=m(1G85idL*6HgiN#2i`{$#luWa4l@ih z`{0y^fSkN;In47U!w@nEn(2&Q}D0Aq-l94bNqC`+d4p<5Chsw7+kU_}iz=KdnxP>l(mTZ{k&9AiT&Re{P3 zcex~dRx4&i!dS#atF(?Rrh_8SOjKtcPFVu$LAZvo zXofot2}qg7WXlSZ(34TSs^)GfME7rO5b3LBNmW}aBC{f;2O|G@put)aGq+HdJ@-0| z1z_BM4QR*Bns^e^n{)GRATFp3)rOPT({4rKpXAs;VvxUKnaj+PoUtQ=j(@%Ri+Pu@ zp3CvkEvT}^o05s;k~MygsUj~{&BTKsyy;Xt_C=Ywmi5mwugwp@CRb^|XPAhOr-Bo5 zHLe}s$E)_>rg7Em^oF`~QySlEI5XvBa^tB62||A-*|kgd>7zqQRqG#RjjuWkBmEU8 zA@TiKIL#`RyJUqy#;7WYt6qycJG~L2ri%2!!vZo|*%*6n%9*q5DG6eUsg7!Xbl~uo zT~XPY`})!n67-ZZer7zHa&k0mtTA*BY%$Rnzp%czfm?mYS;SB*=C$sj?Qe~R9<8uwva?gy3=AIHe1l*O-_r(8Q89r_q*ji ztRkOPd^sQ=Fv+LVEx|Xn>P~HNlQ|@Z-IjFDp44HS{yv&~&YtW*liCMV#T>C^4Uw?M zSpc8B)8ckoIHEX$D1Io0Xi7qs0ck>ju=%6j>8k$tsOtolZ*wzE|GkR(xwxJYenCHqp2)ov=`oIan|R9oVRW?-7n+ z-QrgYA1wL#qgR`Af80W0mps08?u)OyVfA`si$%|OC(L^F_P38-9(?D6ySi>_Yz8j# zNC*4wygT@p$2YI8nlYAqQ*S@-365*No%F^ykm;Z+!c0T9DOI0x1f~8|@QSg8~Iqf^S$4Y%jojA^$Bw}$Mc}{|i z$zvK7R6Mnk;eBO#ss&KT>P}ui-B44pr{VHEi7j%~rpd*t3kuh6$X|#Yk^~-n?FFXl zh5cQ-y|u=&uVS5}(-wz+@gQw+YkZjQg1e`oZO_FI|9*bQo{RG&H8WuB>inXD64M5_ zp~8iT-yiz%hkpE)ewZ}?jDg}9fGvp@K4%3T!E%5!#-my1(QWq(K2}KMJQL=;!$($A z(!*QPISC#fl^z2k<%L&0gQ(Qx0qBKl6)L4|OC{yNJ?;W1y`_0wi?@@?S zbqY6Gkx_2z>%&QUi47XDMWBqTvn~gOhRG*8bkueR4bCSGS}Kw$-fLxSN)!Wnwlbqf z9y`Npzw7|D$Ng#?Aku}N;1pa75b0mfsZf9vGYv)yI0fM*22_4t+fxjcdVG9=4`YVZ%~2cb*N>BNgJyBQ6oac-}NsRe_G z#gm>WEGt7LQbpNPvx_JsBfwz< z%aORPQeEPN>=rM8hui(Kw{%72>U34oQgbcw(w&iYbTOsD@XgbM2z>L=X-}?jdvd>R zvU?&Q(?v@~f$g#EVs>r^wFu1AS763k&}y;M<-Q_&beqv+jXo~!1GZ7PyVl7gY*~<- zqmBki$Jg@e&yix_liFwZ8+9V4*>66JU-c_hE&_`6)?}V5wT2l&J)pb?k7Vq3%bLhY zFA-r?bEW$5`NR06C%8@$Q60jSik?|MjI(4b*I3Qkqy}V8rDUmjVB3Cp=InYOWs9lI zk0^&JqVtET)5z9tgnk~YYvM;ID@}0c#_j;Ksh_?RZ8wUesCA;zVlhW0sIx?{ygC9@ z3j*qz21c(EsL`T0lZaHjomfifuUP}uSeJV}2Ts>Z$-im&9=vT*3%6H$$mYNGRwONZ zQY~vIx80gxJ0(=hGGwg>Nt_TqwibkJbK#NvtOfgO%TDpq_FIvrU-(lxZ-v>;2z;rW ziCH65+_5PZLpWJab|aI!o5K6MWh9k=jf1z7v4EYc+Y{A`LI0%iYnV5x-G3=Vqod(( zbVXSr*}{G5C9F(t-~ugepM*wZS#_)HN_Es3>I2&4+pAS0F`cn*8a4QevSNYEDg^R< Yf!zG8KxVcH%);4nD&?UFJ~3v$^ZZW literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/backdrop_lg.hlsl b/piet-gpu/shader/gen/backdrop_lg.hlsl new file mode 100644 index 0000000..57bb6d3 --- /dev/null +++ b/piet-gpu/shader/gen/backdrop_lg.hlsl @@ -0,0 +1,283 @@ +struct Alloc +{ + uint offset; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 4u, 1u); + +RWByteAddressBuffer _79 : register(u0, space0); +ByteAddressBuffer _186 : register(t1, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +static uint gl_LocalInvocationIndex; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; + uint gl_LocalInvocationIndex : SV_GroupIndex; +}; + +groupshared uint sh_row_width[256]; +groupshared Alloc sh_row_alloc[256]; +groupshared uint sh_row_count[256]; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _79.Load(offset * 4 + 8); + return v; +} + +AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _121; +} + +uint fill_mode_from_flags(uint flags) +{ + return flags & 1u; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _165 = { raw2 }; + s.tiles = _165; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _79.Store(offset * 4 + 8, val); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) }; + AnnotatedRef ref = _194; + uint row_count = 0u; + bool mem_ok = _79.Load(4) == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _186.Load(0)) + { + Alloc _217; + _217.offset = _186.Load(32); + Alloc param; + param.offset = _217.offset; + AnnotatedRef param_1 = ref; + AnnotatedTag tag = Annotated_tag(param, param_1); + switch (tag.tag) + { + case 3u: + case 2u: + case 4u: + case 1u: + { + uint param_2 = tag.flags; + if (fill_mode_from_flags(param_2) != 0u) + { + break; + } + PathRef _243 = { _186.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _243; + Alloc _247; + _247.offset = _186.Load(16); + Alloc param_3; + param_3.offset = _247.offset; + PathRef param_4 = path_ref; + Path path = Path_read(param_3, param_4); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _272 = row_count == 1u; + bool _278; + if (_272) + { + _278 = path.bbox.y > 0u; + } + else + { + _278 = _272; + } + if (_278) + { + row_count = 0u; + } + uint param_5 = path.tiles.offset; + uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_7 = mem_ok; + Alloc path_alloc = new_alloc(param_5, param_6, param_7); + sh_row_alloc[th_ix] = path_alloc; + break; + } + } + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + bool _325 = gl_LocalInvocationID.y == 0u; + bool _332; + if (_325) + { + _332 = th_ix >= (1u << i); + } + else + { + _332 = _325; + } + if (_332) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + GroupMemoryBarrierWithGroupSync(); + uint total_rows = sh_row_count[255]; + uint _411; + for (uint row = th_ix; row < total_rows; row += 1024u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _411 = sh_row_count[el_ix - 1u]; + } + else + { + _411 = 0u; + } + uint seq_ix = row - _411; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + uint sum = read_mem(param_8, param_9); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + sum += read_mem(param_10, param_11); + Alloc param_12 = tiles_alloc; + uint param_13 = tile_el_ix; + uint param_14 = sum; + write_mem(param_12, param_13, param_14); + } + } + } +} + +[numthreads(256, 4, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex; + comp_main(); +} diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl new file mode 100644 index 0000000..1c68980 --- /dev/null +++ b/piet-gpu/shader/gen/backdrop_lg.msl @@ -0,0 +1,284 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 4u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_79.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_79); + return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +uint fill_mode_from_flags(thread const uint& flags) +{ + return flags & 1u; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_79); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_79); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_79); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_79.memory[offset] = val; +} + +kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_row_width[256]; + threadgroup Alloc sh_row_alloc[256]; + threadgroup uint sh_row_count[256]; + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) }; + uint row_count = 0u; + bool mem_ok = v_79.mem_error == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _186.conf.n_elements) + { + Alloc param; + param.offset = _186.conf.anno_alloc.offset; + AnnotatedRef param_1 = ref; + AnnotatedTag tag = Annotated_tag(param, param_1, v_79); + switch (tag.tag) + { + case 3u: + case 2u: + case 4u: + case 1u: + { + uint param_2 = tag.flags; + if (fill_mode_from_flags(param_2) != 0u) + { + break; + } + PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) }; + Alloc param_3; + param_3.offset = _186.conf.tile_alloc.offset; + PathRef param_4 = path_ref; + Path path = Path_read(param_3, param_4, v_79); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _272 = row_count == 1u; + bool _278; + if (_272) + { + _278 = path.bbox.y > 0u; + } + else + { + _278 = _272; + } + if (_278) + { + row_count = 0u; + } + uint param_5 = path.tiles.offset; + uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_7 = mem_ok; + Alloc path_alloc = new_alloc(param_5, param_6, param_7); + sh_row_alloc[th_ix] = path_alloc; + break; + } + } + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _325 = gl_LocalInvocationID.y == 0u; + bool _332; + if (_325) + { + _332 = th_ix >= (1u << i); + } + else + { + _332 = _325; + } + if (_332) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_rows = sh_row_count[255]; + uint _411; + for (uint row = th_ix; row < total_rows; row += 1024u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _411 = sh_row_count[el_ix - 1u]; + } + else + { + _411 = 0u; + } + uint seq_ix = row - _411; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + uint sum = read_mem(param_8, param_9, v_79); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + sum += read_mem(param_10, param_11, v_79); + Alloc param_12 = tiles_alloc; + uint param_13 = tile_el_ix; + uint param_14 = sum; + write_mem(param_12, param_13, param_14, v_79); + } + } + } +} + diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv similarity index 70% rename from piet-gpu/shader/backdrop_lg.spv rename to piet-gpu/shader/gen/backdrop_lg.spv index 457cb02e46cffa7a0fad71b0630bcc7f250234c7..a77d46d7ec5583df11b98d9a98d15c6385fd6682 100644 GIT binary patch delta 3153 zcmZ9OX>V0k6o&WhAVY!^7(XLa|t&#R3k8$q@(e zI^&FpLWM#_#1R1nEFuX00RM!j(dhGB&T3A%$;x`)cdfnl+Ux9d+J9P(whXJSxol8M zr8G1RPQUFOwr)t)TuQ04y|cNkX-RumM@sdnHkHv};JVZfF1IzcE=y-(i{PzQT$h%C z7cFUTDd+qdcvtJ(&iPedpDOUq*0yp-Q*&Ee`n3eQ+<{;=SfZZE;g{k zq@@e13wpqdnwK;$%s1IJfY+wo@H5TTrF{cpb!q=u{LoqaNa?(R#rt|U4_#AJI@NpS zu->}KEu{a)wxgNDIrtUul?8qrPN+YA5)Msc$nL?Pg8z#@**ogOsio?unSQ#KB)wxU zaz|a{j=IPlcab~p;1^KOaR=8s>H@ziZ5$|6M_)*&jy<>qdn%<74V5t?&YvW6xDnMm z+~CF?ZgBmPvv^ZhEYv~ zoLL-eHdt*FlPc4bxL=#g_k7WIXRM&^Rm&1|mhY#g51mdtzq0OmO6;XQ)>502J~ zUx4{D{$L*0N98uKHG(e$`x)!WNB0oe9CgPfpI=H@wYZx>s*U)x=I-zRfxqbE{?^^!8Q~85wOMf=NIZx@Cr;l zp2lNf*BSEwejU`J#^d1Pl(G!Lq`2-Y!D_w+aXVJQeUQeY&}y)Hyr^ry7FYLXK7se= zDUGeg)I6j(>SRwsY`|_kt=w@&vR>|}U96AAPl26Coc?LBTHHlrYQ@Lrq+|WFK7Tij zldc0Nj6i?Vb6|BF^y4az0?&ihy7No+0$9yEV~+3ai|~W^EqTs*xOwWQQ&yW^0{C-O z1~*`8o>m;2y$p_KuLi$@jm6Z>8A^>;bKPbuV0o;69b7K(H^A*a>*)SXuwr~&-Uj=# zF=KCGA7C~nj>7MNZ7lB4yI{5A9eNM$9WoZ{-Unv}>~hZsKMUi`Ot>`Y5ys z?D}ZnL$Ie+cfEh})T)n}1APQ`I^)^DzsME@1Mzx&3|5=Yg1BIxz|}ojY`zul25a-z zavNA}eIchC?mdZ|Pr>GO`~FAq?FepQ(5^S*)!fKuyAy2J#(MDbXrdQvzOh|+`8fWj z`B&{Ta3kI|#?0A)SD!EzvK!!u9R25DN3ZuQ_XU10=Fj+-mESKPo_&Sls@VQ(u;+{K z+&5rz)V)3OJ($(3E{?U|f^9v<@Eusq`NbI2VhrE=(#9P25kTlKb^si6*bkP+91emT zDe4+y(ZCO2=MfG32v##cW~CMl9QL6_1BVD4&Vvs7CvY@y1T2q@j)L9DqJ4PzIQ~1T zVcpN*2E6rLXWV%^dncxkI>$1I6=<6vp>0EkhGDB5w_*#E@w4L1JP8iir;z5eFPlQL(mAp>na%7K?}+1RM}l z@G2?_4u~Ki0v7~lrQm=SMO1wBuW*V+f4|eS$So(?UEjCX+H0@g?@KsUZzXbMV2y^=Tovz1-5VI1R+Q;2k-x zPfNfHx;oq13%(ZK+cCRmZq6Ih26#_LxxKrkwOsC;mDk&due*I=Ua%oGWNCYm)KaWs z1N%%`G(VqE1utmrYMoy!a$pc2k`BQKnyXKJgJSjR=o$Rj8N9!A_Tb{jt6Lja)|F0G zFC1B|pE8^D|5$cBI!weag)cAoQ0z%Kp*$}C1rALYrJ-pL_7wbI{nc@!rj~N4Q9oTq zl2(~(xl-41rLN`5UCWg__&L;5?%-ObuHmE7=E07%y)t%mQ-f2hl?kJprkkvsG0BLM z2G=TSaN|lET-$#JKVDrw`rwEO%6BcHE!Zj8rD&jJQ^ERobJoM**MYBLOs#aB*OXlk zx{7f*!^qW)Ke!(^VtxLmia9qh^!=IYay+@HJ`LYa@`B$4pTs!AYIbxp*pBo^N4J3W zukdo(;;j%18Mu;(!?4pCmokh+@!JZ2@Y~^i%wNtN>)Zj?XWkWT!kE6lXgZqL{zmib zh^{X6cDFHxs)(bpd%(4-w7_j9 zHg_-F&6#JTa&pefJB@-fD`T6R#cF}o?eA=K9=6QzF+RVDYopc!rIh7G!qG|41@{z6E&!91^sAoy zeul+eOl%{*p{gzR?G(Fccaf*HsYTca89v5)OW8o7UYyayVz5P`iHE@Y#xZO{-ovn% zO^9Re!(f{U{s`DweQd~!y$tNvN`HQ_SwCeE194c(!4}f*j+Pi>MA1jVN*{a$*c{%B z{6eo3Gv-h`?#?Q>$80R-JqFGU#7q7-*y0AfxKCnz+_kYM82a3`I0~%>+dv$JTzMl` zD_45wYKys9dJ3XMvHLY(eQ{Nd>8m|YrH{p*1}k~&^ciqs=#QLd!TN1*Eml4dL;rOU zox6%lw;rs|OJtItzvtjbu-l593fw&XD)j=^N2QEC&(P;?#nH%%;Ar*|@JpWY#S8-` zHB#hd0}Q*}fR#sqSHQJr|0>*q(flT`zWDaM4)(DzW3Mr`GHgtoMd3Ff7K%&sCRktX zCE5b_5*dqmZ-MpO>}IUoqjF&vaj}TSYZ)unQQo>Frp3*6?WegKgSa6}#W_j|JWbJHg-qtb8JGv_Db@!Od7F88c@eR=@ZV z)*UJR2VkYw`qldg`!U1E_$NhNd;0u8g;*q-`3&rC<16?%*c|;{Ci#a9i@TUO=6(To zBcaM+us+qaS!4Rh&vIFgKth&2usn)=2@X|`g5{ygSKwxfI>%Vd`5LS`(ZDxgeddR9 z`ttMlT)u@w1INJfDE1vV8t4bhBlbPmg)Dj;E1$^!U3Jv_0o;VOp7V?=zek~B+Q}JG zdcG$h${!v72-X*0L1X%A&sXW?@uAVm%UBoCMx#H0=P;uF&tQFHPV&d@4eGQ&2i5}f Ks;e*P8~#6mW||5B diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl index 903a185..64b109f 100644 --- a/piet-gpu/shader/gen/bbox_clear.hlsl +++ b/piet-gpu/shader/gen/bbox_clear.hlsl @@ -27,8 +27,8 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); -ByteAddressBuffer _21 : register(t1); -RWByteAddressBuffer _45 : register(u0); +ByteAddressBuffer _21 : register(t1, space0); +RWByteAddressBuffer _45 : register(u0, space0); static uint3 gl_GlobalInvocationID; struct SPIRV_Cross_Input diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil new file mode 100644 index 0000000000000000000000000000000000000000..50034cce738c087ced84608eeb4c0ac98cd99046 GIT binary patch literal 5800 zcmeHLZB!ITx~`t?>FF7Up@G5l@MRAOK}6G_0tPhm0f>lZRCG~E>;XkoAOpBWz}=lM zW(I*|R7{AP>>xymAucgoBd#VhFfjxnqHzsI)(grKl64iiZjRn~l3UGJy!V`Y|J)z< z<7}N%U2nbh)Km3Vzg^EjQl?6M_PR3sRC{;zvEshX{pFu+M*slLvj7mlHWAizShHZw zhxIP3b^?GFSl6YjU|=&-R;*hdJ>BQp7ymk2l|HR+w~!zyb45BwM8YvM^sCg)TXHb~ zx1i6%NKdTk3&#Y8(EZ9TYz0onEIELf#bJCx7K~MA>#1a(>VSvJO~y#$32LP%Cfk;3 zm(uDK+q*y(qZ&??*u6vD8s!>8&5-FY`>^N~d<_hO+gm`*AQ!Gc!E5%kjBzIF1XuCw zN1Tb+V<|Uhc`D4gSKWL0!JVG$w-sGwuGm{tJHB5v&?dFw6agMgBr>=&U>oM0EaZbOg>% z1&%AFU+EtE{y-l1)aPni`m*Ai&D_MaD~J@>eDa3Q^`eGqR_vWFjfKAc52VEW)EF zOn9pD7kQXsssq&T@QE3qB~e1obi`%!WX^F=YJ84jCfS+c_w*Cl&yBEMCInG%7A#Kt zc9f@|Fu8?Z7hp4V>R<-&*y*7tnB-iZ#{}0?GVgSIjWB%qMNG|0V=)KSeT)>@9YD?( zoe-xrGW+a~R5Z={B~M;}>1{F)vK@3CMQAM!=a<1;EQPH5P)}_j>W#VXqy?I0_~1MU-QDTZ~`n=f|H)=IB@`xedoMq^1H~jlt8!@SNP z4>wjEoR_oa&5@M2I3+mexq-5|D7#Wt?~>Ic{!Iq|79^m}5O5v|WQth4m=Ub+@4x#^ z*43`=fzjib(k~|QnLoy~x3|T8JQulQ7GM5sK~3+e^ujdFHF;;rTWXKcc?WluhtA76 z;zMui8p|5)9=Y}X@Wt+t=+Z9r(i(JW&z_pyIza^?C`+~Kbb@jnTDr%o zqph+YZ^>P|WQg%9WW4T-Nyapig`_*^mOQXu9@L=?&Sk0+&YKeY)JxjX#Pg;^Gn&vd zCM{(J6?uZVJZmw0>a9AvRmTcSBLt<7_>lHhy>$y=EuyT&_13a_L)5)7$-P0z-<^`X z44U255IEcr*fa4Q&<1wP1Koz8ZY_S@u3plmUQ$2tJlh#xr;fLw33Z*%!9|}K6Rx2N zt({`GT(DP%mPc4i^;X#xwdBztK1@r#9mIbiCF4%XklpLP#_J(tGISxc2Nc{w9oity zfD!_VXn6a`xSrfkKi^Uk-@D+T!Fl6rSqor@CnKZxsh3{;!>8KV zAD801?S1UxE{IaFef*Cqcb?t*?E}LXT2ia_fB0bk%`>K4&V$nX$dJ=m-h|zRg!C*g zycyU$H2(JIUhoD!b``;Aw-;12TQ0pC{DNbj(boS(6#+fF?3`zdjbKAm4%w^@>Nqzg z&i0;~I<}uYvA>1(U+~Xmdqazxmj~`0KGRDF!0wapAf6O_h=f|pq0@6n#$isvMV~9Y zDV<;tmX!ROPx<>7)s6=G1NTE*+};m!E8re;Ih~D5uvCJty`1 zH&76t*6#u6pVsf)aP0qrekZ^<|4aRD`v0Qem2; z;T3u?{bQExItUO>J-GrU7vU*53l=E3=u_GrOc44BKYUmcg->gHBn>$IoCJN^nm6pSlF&RPiwnC{EO_<9vDngu9B6eipF^* zji^lRaWH4mq3ihP8}OKiF>R-LsxYavO%AnvyRLvb=OvC6XHj~WMJaXzajNaYg%1m; zE?VqlJ2m(=fcs+3Z)kG6rn0v8RBu>qOHXZK8kqs*ZtADX5g&3D)Xcjb{dc>Ek7Fr8 zF^O+3X=6^uv>%T<9dl+bveqKLoO~+(-Uq7-Tk;Fj=s&@^b4wZzUNV)hENO&lnKym5 z|Io{S=;i-}UTSFo&O+Vv#Wrz2?*aUi9bi2`nyq*&O$w`98|{~;SaD6Et;<$?;YX{G z7%82Etd-I!NO7I2p>EYs-SQ$_`ikN)F4ZUx{rt1iAl<4+u2akG8_+q2obWev4n`Bk zJ>35RLUAE5vaLg|*ghnv)n!xi^fZ-U=JfnED=1*HSwItbRfMAS7xPsn8(wS#ek-`y zr5EgC4wGQH+ z%!+oZBCt+8i58)nYMCRnN48q#6UwRC_UA0iaTxu~vn1obP|SLpmQruG~h z^#s{~J0|vAZ>S@kI<;ZHE2>k0h1qx+wCMS%(B{|kDC`6W^J_J1B1JS+NQ~*lxk8&k zw2E@8>bFJIIO;a*MH$L@ET4*o*FJ`N1->lz?m{vwd#;LZP_7bhj6pPw9%p# z<_hfwkwev% zkg0jhCH)*%lp$vFys9^~y3;gEK#tqV?a-&qUgfy1jAFgjSvARAc6EZjUq;WgRymA3 zQSe3HtmuCJCN6myF?bJ!adT%)S5?1FL`FEiSBA6RoiDdd5oi)-=>G_+Awk;gd`O_T zWK9vcW(aOnpXLaDQa%fJuw!6)2Z=KTeHxBn-<-XWpc&sXMQ~$=U~cWl9Kl)@454$E zIb|{g&9WJSKRGyp?yw$6&}Z2(N#N9V>ik*e(pZ$zyRj$~_kdrH<^sO)TGGxYe9Kre zlda=E_x@dv%&Q)LmZt(4W4;<>OP(Jtj>C*pPYkn)$!_H!hYDKwFWQoK)m+STUyM>N z)HGCyLKj~0!5`Rb-{!vkZ6H^6bUC!TOtCoo#-MBzL|AwBq{t>ep!>1ucb*-&y@<=f zjN4JC%pfw}^8vnKoh*KYz)EQ;G-nyxA@utxEX_O}woyTP#Gg^BkzS_B%{95Pv zwWfOV^T_PJvnbuvk5ZkU$Sd%EK&AXpoPe=L2EnYR zl^l&S)$u>VzEbd81#iBbiux&LXYH1+*GSg7K&8iRc^KAe8(P*E>Hw7As8JG!{T0DK z5G9=$%ES__ul<}DjUmu2R>F?hQ71ul~Z zP%1rc)I=TyW_~bX2{4v(Ux&~mR~RF8P4hc_Y_2z4LN|MfZ2oMKvmL1bnX2Z-{9)8*mZtga*owj3cKmEM;F xmrsCSAQcF_O; literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl new file mode 100644 index 0000000..2b0901e --- /dev/null +++ b/piet-gpu/shader/gen/binning.hlsl @@ -0,0 +1,352 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoEndClipRef +{ + uint offset; +}; + +struct AnnoEndClip +{ + float4 bbox; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _84 : register(u0, space0); +ByteAddressBuffer _253 : register(t1, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint bitmaps[8][256]; +groupshared bool sh_alloc_failed; +groupshared uint count[8][256]; +groupshared Alloc sh_chunk_alloc[256]; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _84.Load(offset * 4 + 8); + return v; +} + +AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + AnnotatedTag _221 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _221; +} + +AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + AnnoEndClip s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + return s; +} + +AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) +{ + AnnoEndClipRef _228 = { ref.offset + 4u }; + Alloc param = a; + AnnoEndClipRef param_1 = _228; + return AnnoEndClip_read(param, param_1); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +MallocResult malloc(uint size) +{ + uint _90; + _84.InterlockedAdd(0, size, _90); + uint offset = _90; + uint _97; + _84.GetDimensions(_97); + _97 = (_97 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_97) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _119; + _84.InterlockedMax(4, 1u, _119); + return r; + } + return r; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _84.Store(offset * 4 + 8, val); +} + +void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.element_ix; + write_mem(param, param_1, param_2); +} + +void comp_main() +{ + uint my_n_elements = _253.Load(0); + uint my_partition = gl_WorkGroupID.x; + for (uint i = 0u; i < 8u; i++) + { + bitmaps[i][gl_LocalInvocationID.x] = 0u; + } + if (gl_LocalInvocationID.x == 0u) + { + sh_alloc_failed = false; + } + GroupMemoryBarrierWithGroupSync(); + uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; + AnnotatedRef _308 = { _253.Load(32) + (element_ix * 40u) }; + AnnotatedRef ref = _308; + uint tag = 0u; + if (element_ix < my_n_elements) + { + Alloc _318; + _318.offset = _253.Load(32); + Alloc param; + param.offset = _318.offset; + AnnotatedRef param_1 = ref; + tag = Annotated_tag(param, param_1).tag; + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + switch (tag) + { + case 1u: + case 2u: + case 3u: + case 4u: + case 5u: + { + Alloc _336; + _336.offset = _253.Load(32); + Alloc param_2; + param_2.offset = _336.offset; + AnnotatedRef param_3 = ref; + AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3); + x0 = int(floor(clip.bbox.x * 0.00390625f)); + y0 = int(floor(clip.bbox.y * 0.00390625f)); + x1 = int(ceil(clip.bbox.z * 0.00390625f)); + y1 = int(ceil(clip.bbox.w * 0.00390625f)); + break; + } + } + uint width_in_bins = ((_253.Load(8) + 16u) - 1u) / 16u; + uint height_in_bins = ((_253.Load(12) + 16u) - 1u) / 16u; + x0 = clamp(x0, 0, int(width_in_bins)); + x1 = clamp(x1, x0, int(width_in_bins)); + y0 = clamp(y0, 0, int(height_in_bins)); + y1 = clamp(y1, y0, int(height_in_bins)); + if (x0 == x1) + { + y1 = y0; + } + int x = x0; + int y = y0; + uint my_slice = gl_LocalInvocationID.x / 32u; + uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u)); + while (y < y1) + { + uint _438; + InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _438); + x++; + if (x == x1) + { + x = x0; + y++; + } + } + GroupMemoryBarrierWithGroupSync(); + uint element_count = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x]))); + count[i_1][gl_LocalInvocationID.x] = element_count; + } + uint param_4 = 0u; + uint param_5 = 0u; + bool param_6 = true; + Alloc chunk_alloc = new_alloc(param_4, param_5, param_6); + if (element_count != 0u) + { + uint param_7 = element_count * 4u; + MallocResult _488 = malloc(param_7); + MallocResult chunk = _488; + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) + { + sh_alloc_failed = true; + } + } + uint out_ix = (_253.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc _517; + _517.offset = _253.Load(20); + Alloc param_8; + param_8.offset = _517.offset; + uint param_9 = out_ix; + uint param_10 = element_count; + write_mem(param_8, param_9, param_10); + Alloc _529; + _529.offset = _253.Load(20); + Alloc param_11; + param_11.offset = _529.offset; + uint param_12 = out_ix + 1u; + uint param_13 = chunk_alloc.offset; + write_mem(param_11, param_12, param_13); + GroupMemoryBarrierWithGroupSync(); + bool _544; + if (!sh_alloc_failed) + { + _544 = _84.Load(4) != 0u; + } + else + { + _544 = sh_alloc_failed; + } + if (_544) + { + return; + } + x = x0; + y = y0; + while (y < y1) + { + uint bin_ix = (uint(y) * width_in_bins) + uint(x); + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0u) + { + uint idx = uint(int(countbits(out_mask & (my_mask - 1u)))); + if (my_slice > 0u) + { + idx += count[my_slice - 1u][bin_ix]; + } + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + (idx * 4u); + BinInstanceRef _606 = { out_offset }; + BinInstance _608 = { element_ix }; + Alloc param_14 = out_alloc; + BinInstanceRef param_15 = _606; + BinInstance param_16 = _608; + BinInstance_write(param_14, param_15, param_16); + } + x++; + if (x == x1) + { + x = x0; + y++; + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl new file mode 100644 index 0000000..f6e0505 --- /dev/null +++ b/piet-gpu/shader/gen/binning.msl @@ -0,0 +1,350 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoEndClipRef +{ + uint offset; +}; + +struct AnnoEndClip +{ + float4 bbox; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_84, constant uint& v_84BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_84.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_84, v_84BufferSize); + return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_84, constant uint& v_84BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_84, v_84BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_84, v_84BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_84, v_84BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_84, v_84BufferSize); + AnnoEndClip s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + return s; +} + +static inline __attribute__((always_inline)) +AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize) +{ + Alloc param = a; + AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u }; + return AnnoEndClip_read(param, param_1, v_84, v_84BufferSize); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_84, constant uint& v_84BufferSize) +{ + uint _90 = atomic_fetch_add_explicit((device atomic_uint*)&v_84.mem_offset, size, memory_order_relaxed); + uint offset = _90; + MallocResult r; + r.failed = (offset + size) > uint(int((v_84BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _119 = atomic_fetch_max_explicit((device atomic_uint*)&v_84.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_84, constant uint& v_84BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_84.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_84, constant uint& v_84BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.element_ix; + write_mem(param, param_1, param_2, v_84, v_84BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_84 [[buffer(0)]], const device ConfigBuf& _253 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint bitmaps[8][256]; + threadgroup short sh_alloc_failed; + threadgroup uint count[8][256]; + threadgroup Alloc sh_chunk_alloc[256]; + constant uint& v_84BufferSize = spvBufferSizeConstants[0]; + uint my_n_elements = _253.conf.n_elements; + uint my_partition = gl_WorkGroupID.x; + for (uint i = 0u; i < 8u; i++) + { + bitmaps[i][gl_LocalInvocationID.x] = 0u; + } + if (gl_LocalInvocationID.x == 0u) + { + sh_alloc_failed = short(false); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; + AnnotatedRef ref = AnnotatedRef{ _253.conf.anno_alloc.offset + (element_ix * 40u) }; + uint tag = 0u; + if (element_ix < my_n_elements) + { + Alloc param; + param.offset = _253.conf.anno_alloc.offset; + AnnotatedRef param_1 = ref; + tag = Annotated_tag(param, param_1, v_84, v_84BufferSize).tag; + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + switch (tag) + { + case 1u: + case 2u: + case 3u: + case 4u: + case 5u: + { + Alloc param_2; + param_2.offset = _253.conf.anno_alloc.offset; + AnnotatedRef param_3 = ref; + AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_84, v_84BufferSize); + x0 = int(floor(clip.bbox.x * 0.00390625)); + y0 = int(floor(clip.bbox.y * 0.00390625)); + x1 = int(ceil(clip.bbox.z * 0.00390625)); + y1 = int(ceil(clip.bbox.w * 0.00390625)); + break; + } + } + uint width_in_bins = ((_253.conf.width_in_tiles + 16u) - 1u) / 16u; + uint height_in_bins = ((_253.conf.height_in_tiles + 16u) - 1u) / 16u; + x0 = clamp(x0, 0, int(width_in_bins)); + x1 = clamp(x1, x0, int(width_in_bins)); + y0 = clamp(y0, 0, int(height_in_bins)); + y1 = clamp(y1, y0, int(height_in_bins)); + if (x0 == x1) + { + y1 = y0; + } + int x = x0; + int y = y0; + uint my_slice = gl_LocalInvocationID.x / 32u; + uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u)); + while (y < y1) + { + uint _438 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); + x++; + if (x == x1) + { + x = x0; + y++; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint element_count = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x]))); + count[i_1][gl_LocalInvocationID.x] = element_count; + } + uint param_4 = 0u; + uint param_5 = 0u; + bool param_6 = true; + Alloc chunk_alloc = new_alloc(param_4, param_5, param_6); + if (element_count != 0u) + { + uint param_7 = element_count * 4u; + MallocResult _488 = malloc(param_7, v_84, v_84BufferSize); + MallocResult chunk = _488; + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) + { + sh_alloc_failed = short(true); + } + } + uint out_ix = (_253.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc param_8; + param_8.offset = _253.conf.bin_alloc.offset; + uint param_9 = out_ix; + uint param_10 = element_count; + write_mem(param_8, param_9, param_10, v_84, v_84BufferSize); + Alloc param_11; + param_11.offset = _253.conf.bin_alloc.offset; + uint param_12 = out_ix + 1u; + uint param_13 = chunk_alloc.offset; + write_mem(param_11, param_12, param_13, v_84, v_84BufferSize); + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _544; + if (!bool(sh_alloc_failed)) + { + _544 = v_84.mem_error != 0u; + } + else + { + _544 = bool(sh_alloc_failed); + } + if (_544) + { + return; + } + x = x0; + y = y0; + while (y < y1) + { + uint bin_ix = (uint(y) * width_in_bins) + uint(x); + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0u) + { + uint idx = uint(int(popcount(out_mask & (my_mask - 1u)))); + if (my_slice > 0u) + { + idx += count[my_slice - 1u][bin_ix]; + } + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + (idx * 4u); + Alloc param_14 = out_alloc; + BinInstanceRef param_15 = BinInstanceRef{ out_offset }; + BinInstance param_16 = BinInstance{ element_ix }; + BinInstance_write(param_14, param_15, param_16, v_84, v_84BufferSize); + } + x++; + if (x == x1) + { + x = x0; + y++; + } + } +} + diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/gen/binning.spv similarity index 100% rename from piet-gpu/shader/binning.spv rename to piet-gpu/shader/gen/binning.spv diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil new file mode 100644 index 0000000000000000000000000000000000000000..16d47ceb1c63abdaee8fa52a8b5a604855d036d4 GIT binary patch literal 10984 zcmeHtc~leGx^Gp6N(e~^kYJcJ5kUy3#ULQ4Q~;4lj1UoVDh3e|F$zsPwI%@q+Nh|2 zsBIer6crQ&Z0y!HA*iTeBcRePZAV*e^r(%tauBDxD244DTeL5$75SWirshLLJnszIVk)2f@r$R42Gd?0toU4eO}{>%mk?g6l5uIJnHQbdLU{nv?I#v=m0z&*z5b zljJ}4RJ$F?zqafF>mYqKe|+#B-xIDybShMkO;HnHT65QF3Uly6-}J%_>10r6otCgy zcd<8uU&vx3N?WI*EG=N#HX9GS)4u+lpi_j9G)2vYF355Tz|7ziiz!lKg*L;i^x#2+{iRBR8*udNK zAec}#;?b1MhDjJv1CLy2L8f4aD)|=Pvtdwr1uT#u@-|zDk!Ikx0qEieP_rXPmxPh5L#Ogn%(DZ@ zXWaK5jM|h<-oN%dEUq*UJh*AMC|SffAR0MvUa`S5D%(6I>SqqcnxZ}+fbJkk0$(~s zoRpcRgcjSY$U+x7E<-$iOioQ-=R$|h35BHvd_)ELQaUMOW|zEeU-ZsSidK}zGJl0p zJkfbv*q*eK{2&@a#r~x%jY`o_$I><@@kPa%jeA zy05G+D!6v7rtI1rj(lfP#o~&4{KTL8Dez!=#hHr*w>r;+XCJIOcu)Y1+vel<{dqGI zgoF-UAN;ZJervl69pj^w?z;B}sy6L)2bg5dk;$DqvH!=RC& zs=p&={i0#rhu9A>_7ge#M8lb%o6lG8E<1l~j+s)Fx_)x*@^#rO*JQmpyig{iZIqF7*B8nJg|;1*?B^xy2dG6BYVrFZdr-pm=CnE9 z7drF_9b1#erlH#cFXRPwg-mNAPrr~iJ)a!dG04ruX`9k%e(8nlz^}YeR#GU#X}PYn z++iwU`=z{a8NF~Vzi?ee;f4x@$IC(XOFjFohW#8RC%0ERJgRi)F#Pr>Ib0Vyv?&~~ zCn0xALZ-EcOsg>bw&?szL;Q=#fu*`BK`+j{hjpLt(oD}=MpMj9_CMVcR2~IV~etvw~fBF=% z!aE(cts}b+9;~|b#^;9*j+g$H6+nEkS8LTmx-@MhYHs6=-=;13)@7eUW9w7Fyd&og zcivjI@xYQ#_j^(U?C9T!^RD5R(I&1_#v;V{QyUmZHEVW$>h5{|+RuV)7;(DgZP~k< zzmxas1g$Ecf#03JTg~budT1C|7@ErNGStJ#p%uIO&&`qO9tZ9kCz&M02-!)$+_d2WZN<+LyIQW`fAdBO!DaH}A1SOc8tKj#hApx=}?qygH$L*5Vo=K0^|4d48G z=M5+oC8I^7q4D1-e<%X^LlTic81)bo|Ig$PEJA1g%ltup#ZpcZuA;~_wvg2dB5P3N zh+!5)42c1?|J!^4#OY=+pUFtcwny3oAYag1RtTUzRldX~na?NnaeS;Pp2IHm`0tX0 z-vZZ&C4oDj8hHyiL2sg~O7gKNrp%L5_m;&KSrg6pdxDP|zxYBprRBTr1=g(p{UfhGy z&zP1Qd5yOQ+fcd{gBe^LdxvlZ!LS!sFbhuR3TD6hJdSHM!x<%E@hr)-Fi0t&Z}o;! z*IF(##9d3CgUSNX%AZOS(G!XD!J#B#k;(dUN`QU)4 zR3Pkn&C_6yVi$@=7BXm#qfs2Dv=kOWKiGBYFsi@~iy}%5bl%X$?XXkG8&m?Tk&=_} z45k!10YA#XEXX`)733BSO1ZHFmq0=>(KeRQGH4}12WQXIc3mNd;6RNX@>O~v-Y2K` zCqW`sFMJ0b)CE#>NGN1MmaH12HK?@X9t!!s0@4OPxaW|_r!obe7TN~bvP%c`+J7L` zC2_WGq^np(nz4xbAT0T10EMaSAQ zXpt&u9|x8Sr=?0l)Q!wgOXETwwfeh+@neHUl^HB4n7R0)7~(zB@hrg-R`NcOjUX56 z&w7S79IQ5GDS(|2I#3h!?fmezVB89P*q{`9q(9_=31NKsLS?d=B-tw#rkNEh80y4) z(5$ubb4#;jS0`%qHvXRGV5v|r57Gpd5#bG6nc|tq} zrfT?ExYDyC%e7doJe*+OL87?x@sKGbhTOAXKutk^mGH&Xj59V3RC+wxhon=uP)ZAI zeg*%&5>+yuks=e!6SNxpqm~k=_yID7pZ8SNN*DRFT!m9`YTo*WFbqaj_U!^Aq1N10 z>8#<_Ks;?6EFSm~sZ^Wo% zY1i0*QAcxNYhB?|%AcNuqZv@$w8DXreo?t?ss!fj`mc_okLy0Bjg+FE$V*{>Y|JXd){3ghNl4UkPo9epI}_I zG9Pc#Lx4ZHF0p`ThcVe;C~`CF5qe~*k>Gh|m)JzGwSG^JY2)6ZYmLjfe9t)Xu429e z&~`Y*<(*}1k>@E(rDYx&5~Fxmeh>+@_4Fq;%4N>5Q@W+o9!?$Ju-Eg|4&XmH2vD6v zCD+yvzQY@G=RdEG))0d$hZ~a3#MA;VFM?_CyLJU!jG z$KAyprUS?3IbRY9aofi73#`UJB673zPVsC^3uW(zg8Q^dMj~_&6A7 z&*pSahm{=oI@CT*KXcq$8Zd%G!70{KF{z~~ zpgrH?zJla_(F^FTJ0DA1f-vOs6eq(iFOhh3iMD0H7$)CXO_o`e-T->f^z-v^yT z&R3-ZY*zgd9ZTFU7w^AzLT0AD54CxaZHpBRN_d3?VG`dWhv(UPf+gX(#%jsD!5i~& zjOtXyD`Fa>K)m;OYj?#pLlkf#n`|`tT32L=7ZE<-GddQ0GapR2!>R;qn4nl;TcnkR zGtn?tt; zdeNr9LRg6FslXXU9#8zKC|V_A+|hqASrCS}DzXdh~1bvd%+c2ljWYeTkv8A<9zB z*Uy-(KC9g!Q2<-ZIhF&=W*+h@MlIYqnnGR)*pTfd>+vI0mwGSU43_HsB;o;+4W9kW zZ?bjqO(Ip*`z9)X)62+q9c-W@7vSj>osXsq;W@OycF%n-pCbt*1~XuYve5H7)>f1OVyJ|+mQ&34AR>ix&^ev&i=LSe ztY@}(C(DS;-0EXf%)O$Y3^NLl!0*y|7s)!7LI4zf9ZCb?hDgyR?|K5Ws{&u;%^Zje zYNTYNlBAyxIt(s#=_O%+!nIM#>jx7c9%sSj#ll*=SeNZU@H_2Z!3<9GE!} z;;5!Ns4oLCePVn1W#n^}j-~x97xP|`d0o5iyb1yt0&OG4^efTQ6(Gut)r-;OE@E2W znEm|I0CPAlcsMU;SQp^jzbhUOGSVCv59U6ljASDtRY4=a2RQdy6pxO4U+}w>0PUwz z?WgBIeYz0iIRr)rjyR(X8Bc$D>hK{Y=z}ik133^7@Fjn1U(u!}l7trCFXMW}BUir~ zIkt6VV(v5jxO#jmZlu{f8+Y66Qz7fqA6q}I$jySzsXi(!^AHfI^-Bk~CQPIYh2UhB zpK)x$C&fbL4E_M*FkSN+>&jzbbP-?Z=$TGLz${f5Jy&UU;INgb((3vlJ3wxE-+>| zK4^yUzFBfQk>SXM#tdh5F+T7m$<7XBIA;sSDH)mJ?t|luG}excqRmPNf;N6J-I)C> za&ibbEH5-@jS#1@4|_PF9WNJhM+AJ& z1Y(-kcjnNyJapZ8y|#FL`e-6FVfyyUVb3bSq!v@DNj}}8R*=Mz#2^lQ&|83K}`U_Dc?H;6ybkbpu| z8|E-JHV?EQrbKQ%Q@%H-VmYV4m%|tv7Ex%goJYzNJ)?^>PW36JczF^xTEWi>&x<30 z#4}a^mcpPm{`0%5tnG$f%Z7_~4bxSTeNOe*E)y;OVHKD0p@{XNebk9;=38@)uU_k0tuZEPdpu zN0G_WPzTeXF7`$Sa4%TdU+8wf*v+3mel{A_g}C$kw(XNCNq8MzGI3eA8R2lvBdAj- zJWV&@aKEeO^#PHF%YyQlbJ8%yHE^*bQWaC|)zH@R|NR}|rJgA_3m?Ua>NVkidM z<-Jpy=Lup6TE0kYO3Jt~GHz)PaTuwLGG2E4GGlj-19iX$X8hVS zqdxX$)pRlS@72b2J=8m6cWlYrIWr%}&1{aHIzVZp)DwQVPT*H7@H+|o@Y7TcBfwTo zDndXD=GW={YW09{L(f<(=v`7<%J!p(sL1?5Yw6(J#KK5$EO&1wdG^bY$y6}gGmyDy zr#yMFr+kXJ(K~v>cl1sw6T~Kp!%Z~rsj&^*(o(xpX+)WHav2nBa?x{kOFgJA1&GV? zSeKfp%lX_6tBzKqK2Dt(yyy;^Uix-AaSq zmY(>%x{0(G_y~i$l`(2E+$6{SByMG~Zp*%h`|P3_VWS?Q$F1!98hH(+JlUG_u3%jxS;0k<~ckD8BuhjCQwL=xCF2pGiuq=;&s* zisQZ&$M+vNo-NYWm@N9tp5x0H=^3o_?MKrYhn}r>t8h-w<1yFr;xB?jc0AgDJK-Zj z)zo6;)ucovBjcF}Rg=E$X5@lF8cwR&ClAFQ(;p*zEd5)h(Z@c#e(V=F&Pl>O%KILu z1gA83d8JLJ(3QL2y~5eG@PR4?bhU_QPck%=(!d1 zsy{q3@p~d5Ff?3X~WCKV{$=AF}U`O=KCkA`!br|M4paWRgRBAL>`W; z95^*^amDfh>QZ{f?AHvD-rX4Qb~X=-YDkyXxnFwiereC~XS**b86P5z{yTfprNrY< zlU)~SQrD5=8TY|%Z3MfeUk{hc;L_YAwJx|vKBmh@L*2&$Ei$@ZQ6^KAxC_Sdl&xvZz65;he_@q0@lNwGuZoTkEY zRW-+)Y)(Ac+?oLh#eccIGkUqM;7l#} zv&r0jo29vRIRZ^2}@H3+MMHqERRtT!xX^@&5FO5^<)O+Ein-`Q2ky z7tF^OH7;LtZuz2&`MNb)5Aw89oxSwi;_~IkBOu0*t^Q5JHu+aU*wT%>rNeKR z{&{&>Bjq`ev^%qb{mi~r=aCoK6e4MOxKtwE{M6`9gf`2z@H+0U@4UOY^X|&Bx|T6g zbXkBgWEORfNvFQO-Miqc+Y2)0hk&{1!SX_P*%@W*jLV=}-hsRjG>QZCYc#~V-(u&M zv2!o88<~H0A;GL6D(}%gqd(r4{BC)ocFXO%TVCAVaxtCmaB#O6oF4+2Z z!Pd)nJizfZ8(QCuES%n@nBIH3mfvC31n$IIJ}|)bLmj_`;yn2swUM$KjUukAm#a_0 zcC~k=gCkFiom!F62`;(i|6SLqQIzrSW%{{gAE)-0ggx^?z;%r@F-4*Pm%(p2roi{O zV%x1d+m82vRcHDY#bdy|56LV{5pa5W@^=NvlYBw6)3J{6z-5mXT%{){XVk8hSTr(+ zODx2bCL_GI3ga@2N%MTNNL&1|ueh_X_|RPs@k|^e{y$eNco!jl^fvqkho<0X+71Xf zd6RUGl;noxU3nPu9P=wqk+vM>l05soih~8nA|!_Z3oHomh!B9u;RG1pMu5F63GhS$ zJRC}Z&k|s@4FT4VC%{Vxu>NZTe4GG_{S4sQ0Iaex1K`2&1UPvD0Eg@(zyrGp@Cqvc zR?H#5Z4m_LKmx3pNPt6~3C<@7u&j&#HxXd*0t4qrg7Z283{4@x&j_&YA_4B(M}VEJ z0r){G0j`}xaQ=}1>jMbzmWjkLh6!+O6#@Q|3&8L(0z4okIFB#@Shh^@PydkqACxu( AiU0rr literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl new file mode 100644 index 0000000..bc96cea --- /dev/null +++ b/piet-gpu/shader/gen/coarse.hlsl @@ -0,0 +1,1386 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoImageRef +{ + uint offset; +}; + +struct AnnoImage +{ + float4 bbox; + float linewidth; + uint index; + int2 offset; +}; + +struct AnnoColorRef +{ + uint offset; +}; + +struct AnnoColor +{ + float4 bbox; + float linewidth; + uint rgba_color; +}; + +struct AnnoLinGradientRef +{ + uint offset; +}; + +struct AnnoLinGradient +{ + float4 bbox; + float linewidth; + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct AnnoBeginClipRef +{ + uint offset; +}; + +struct AnnoBeginClip +{ + float4 bbox; + float linewidth; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct Tile +{ + TileSegRef tile; + int backdrop; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _296 : register(u0, space0); +ByteAddressBuffer _1249 : register(t1, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint sh_bitmaps[8][256]; +groupshared Alloc sh_part_elements[256]; +groupshared uint sh_part_count[256]; +groupshared uint sh_elements[256]; +groupshared uint sh_tile_stride[256]; +groupshared uint sh_tile_width[256]; +groupshared uint sh_tile_x0[256]; +groupshared uint sh_tile_y0[256]; +groupshared uint sh_tile_base[256]; +groupshared uint sh_tile_count[256]; + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _373 = { a.offset + offset }; + return _373; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _296.Load(offset * 4 + 8); + return v; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) +{ + BinInstanceRef _754 = { ref.offset + (index * 4u) }; + return _754; +} + +BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + BinInstance s; + s.element_ix = raw0; + return s; +} + +AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + AnnotatedTag _706 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _706; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _814 = { raw2 }; + s.tiles = _814; + return s; +} + +void write_tile_alloc(uint el_ix, Alloc a) +{ +} + +Alloc read_tile_alloc(uint el_ix, bool mem_ok) +{ + uint _1135; + _296.GetDimensions(_1135); + _1135 = (_1135 - 8) / 4; + uint param = 0u; + uint param_1 = uint(int(_1135) * 4); + bool param_2 = mem_ok; + return new_alloc(param, param_1, param_2); +} + +Tile Tile_read(Alloc a, TileRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + TileSegRef _839 = { raw0 }; + Tile s; + s.tile = _839; + s.backdrop = int(raw1); + return s; +} + +AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + AnnoColor s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.linewidth = asfloat(raw4); + s.rgba_color = raw5; + return s; +} + +AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref) +{ + AnnoColorRef _712 = { ref.offset + 4u }; + Alloc param = a; + AnnoColorRef param_1 = _712; + return AnnoColor_read(param, param_1); +} + +MallocResult malloc(uint size) +{ + uint _302; + _296.InterlockedAdd(0, size, _302); + uint offset = _302; + uint _309; + _296.GetDimensions(_309); + _309 = (_309 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_309) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _331; + _296.InterlockedMax(4, 1u, _331); + return r; + } + return r; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _296.Store(offset * 4 + 8, val); +} + +void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.new_ref; + write_mem(param, param_1, param_2); +} + +void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 10u; + write_mem(param, param_1, param_2); + CmdJumpRef _1128 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdJumpRef param_4 = _1128; + CmdJump param_5 = s; + CmdJump_write(param_3, param_4, param_5); +} + +bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) +{ + if (cmd_ref.offset < cmd_limit) + { + return true; + } + uint param = 1024u; + MallocResult _1156 = malloc(param); + MallocResult new_cmd = _1156; + if (new_cmd.failed) + { + return false; + } + CmdJump _1166 = { new_cmd.alloc.offset }; + CmdJump jump = _1166; + Alloc param_1 = cmd_alloc; + CmdRef param_2 = cmd_ref; + CmdJump param_3 = jump; + Cmd_Jump_write(param_1, param_2, param_3); + cmd_alloc = new_cmd.alloc; + CmdRef _1178 = { cmd_alloc.offset }; + cmd_ref = _1178; + cmd_limit = (cmd_alloc.offset + 1024u) - 60u; + return true; +} + +uint fill_mode_from_flags(uint flags) +{ + return flags & 1u; +} + +void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = uint(s.backdrop); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 1u; + write_mem(param, param_1, param_2); + CmdFillRef _1012 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdFillRef param_4 = _1012; + CmdFill param_5 = s; + CmdFill_write(param_3, param_4, param_5); +} + +void Cmd_Solid_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 3u; + write_mem(param, param_1, param_2); +} + +void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.half_width); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 2u; + write_mem(param, param_1, param_2); + CmdStrokeRef _1030 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdStrokeRef param_4 = _1030; + CmdStroke param_5 = s; + CmdStroke_write(param_3, param_4, param_5); +} + +void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth) +{ + uint param = flags; + if (fill_mode_from_flags(param) == 0u) + { + if (tile.tile.offset != 0u) + { + CmdFill _1202 = { tile.tile.offset, tile.backdrop }; + CmdFill cmd_fill = _1202; + Alloc param_1 = alloc; + CmdRef param_2 = cmd_ref; + CmdFill param_3 = cmd_fill; + Cmd_Fill_write(param_1, param_2, param_3); + cmd_ref.offset += 12u; + } + else + { + Alloc param_4 = alloc; + CmdRef param_5 = cmd_ref; + Cmd_Solid_write(param_4, param_5); + cmd_ref.offset += 4u; + } + } + else + { + CmdStroke _1232 = { tile.tile.offset, 0.5f * linewidth }; + CmdStroke cmd_stroke = _1232; + Alloc param_6 = alloc; + CmdRef param_7 = cmd_ref; + CmdStroke param_8 = cmd_stroke; + Cmd_Stroke_write(param_6, param_7, param_8); + cmd_ref.offset += 12u; + } +} + +void CmdColor_write(Alloc a, CmdColorRef ref, CmdColor s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.rgba_color; + write_mem(param, param_1, param_2); +} + +void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 5u; + write_mem(param, param_1, param_2); + CmdColorRef _1056 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdColorRef param_4 = _1056; + CmdColor param_5 = s; + CmdColor_write(param_3, param_4, param_5); +} + +AnnoLinGradient AnnoLinGradient_read(Alloc a, AnnoLinGradientRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17); + AnnoLinGradient s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.linewidth = asfloat(raw4); + s.index = raw5; + s.line_x = asfloat(raw6); + s.line_y = asfloat(raw7); + s.line_c = asfloat(raw8); + return s; +} + +AnnoLinGradient Annotated_LinGradient_read(Alloc a, AnnotatedRef ref) +{ + AnnoLinGradientRef _722 = { ref.offset + 4u }; + Alloc param = a; + AnnoLinGradientRef param_1 = _722; + return AnnoLinGradient_read(param, param_1); +} + +void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.line_x); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.line_y); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.line_c); + write_mem(param_9, param_10, param_11); +} + +void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 6u; + write_mem(param, param_1, param_2); + CmdLinGradRef _1074 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdLinGradRef param_4 = _1074; + CmdLinGrad param_5 = s; + CmdLinGrad_write(param_3, param_4, param_5); +} + +AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + AnnoImage s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.linewidth = asfloat(raw4); + s.index = raw5; + s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16); + return s; +} + +AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref) +{ + AnnoImageRef _732 = { ref.offset + 4u }; + Alloc param = a; + AnnoImageRef param_1 = _732; + return AnnoImage_read(param, param_1); +} + +void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 7u; + write_mem(param, param_1, param_2); + CmdImageRef _1092 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdImageRef param_4 = _1092; + CmdImage param_5 = s; + CmdImage_write(param_3, param_4, param_5); +} + +AnnoBeginClip AnnoBeginClip_read(Alloc a, AnnoBeginClipRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + AnnoBeginClip s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.linewidth = asfloat(raw4); + return s; +} + +AnnoBeginClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) +{ + AnnoBeginClipRef _742 = { ref.offset + 4u }; + Alloc param = a; + AnnoBeginClipRef param_1 = _742; + return AnnoBeginClip_read(param, param_1); +} + +void Cmd_BeginClip_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 8u; + write_mem(param, param_1, param_2); +} + +void Cmd_EndClip_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 9u; + write_mem(param, param_1, param_2); +} + +void Cmd_End_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 0u; + write_mem(param, param_1, param_2); +} + +void comp_main() +{ + uint width_in_bins = ((_1249.Load(8) + 16u) - 1u) / 16u; + uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; + uint partition_ix = 0u; + uint n_partitions = ((_1249.Load(0) + 256u) - 1u) / 256u; + uint th_ix = gl_LocalInvocationID.x; + uint bin_tile_x = 16u * gl_WorkGroupID.x; + uint bin_tile_y = 16u * gl_WorkGroupID.y; + uint tile_x = gl_LocalInvocationID.x % 16u; + uint tile_y = gl_LocalInvocationID.x / 16u; + uint this_tile_ix = (((bin_tile_y + tile_y) * _1249.Load(8)) + bin_tile_x) + tile_x; + Alloc _1314; + _1314.offset = _1249.Load(24); + Alloc param; + param.offset = _1314.offset; + uint param_1 = this_tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef _1323 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1323; + uint cmd_limit = (cmd_ref.offset + 1024u) - 60u; + uint clip_depth = 0u; + uint clip_zero_depth = 0u; + uint clip_one_mask = 0u; + uint rd_ix = 0u; + uint wr_ix = 0u; + uint part_start_ix = 0u; + uint ready_ix = 0u; + bool mem_ok = _296.Load(4) == 0u; + Alloc param_3; + Alloc param_5; + uint _1529; + uint element_ix; + AnnotatedRef ref; + Alloc param_14; + Alloc param_16; + uint tile_count; + Alloc param_23; + uint _1841; + Alloc param_29; + Tile tile_1; + AnnoColor fill; + Alloc param_35; + Alloc param_52; + CmdLinGrad cmd_lin; + Alloc param_69; + Alloc param_86; + while (true) + { + for (uint i = 0u; i < 8u; i++) + { + sh_bitmaps[i][th_ix] = 0u; + } + bool _1581; + for (;;) + { + if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) + { + part_start_ix = ready_ix; + uint count = 0u; + bool _1379 = th_ix < 256u; + bool _1387; + if (_1379) + { + _1387 = (partition_ix + th_ix) < n_partitions; + } + else + { + _1387 = _1379; + } + if (_1387) + { + uint in_ix = (_1249.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + Alloc _1404; + _1404.offset = _1249.Load(20); + param_3.offset = _1404.offset; + uint param_4 = in_ix; + count = read_mem(param_3, param_4); + Alloc _1415; + _1415.offset = _1249.Load(20); + param_5.offset = _1415.offset; + uint param_6 = in_ix + 1u; + uint offset = read_mem(param_5, param_6); + uint param_7 = offset; + uint param_8 = count * 4u; + bool param_9 = mem_ok; + sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9); + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + if (th_ix < 256u) + { + sh_part_count[th_ix] = count; + } + GroupMemoryBarrierWithGroupSync(); + if (th_ix < 256u) + { + if (th_ix >= (1u << i_1)) + { + count += sh_part_count[th_ix - (1u << i_1)]; + } + } + GroupMemoryBarrierWithGroupSync(); + } + if (th_ix < 256u) + { + sh_part_count[th_ix] = part_start_ix + count; + } + GroupMemoryBarrierWithGroupSync(); + ready_ix = sh_part_count[255]; + partition_ix += 256u; + } + uint ix = rd_ix + th_ix; + if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok) + { + uint part_ix = 0u; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + uint probe = part_ix + (128u >> i_2); + if (ix >= sh_part_count[probe - 1u]) + { + part_ix = probe; + } + } + if (part_ix > 0u) + { + _1529 = sh_part_count[part_ix - 1u]; + } + else + { + _1529 = part_start_ix; + } + ix -= _1529; + Alloc bin_alloc = sh_part_elements[part_ix]; + BinInstanceRef _1548 = { bin_alloc.offset }; + BinInstanceRef inst_ref = _1548; + BinInstanceRef param_10 = inst_ref; + uint param_11 = ix; + Alloc param_12 = bin_alloc; + BinInstanceRef param_13 = BinInstance_index(param_10, param_11); + BinInstance inst = BinInstance_read(param_12, param_13); + sh_elements[th_ix] = inst.element_ix; + } + GroupMemoryBarrierWithGroupSync(); + wr_ix = min((rd_ix + 256u), ready_ix); + bool _1571 = (wr_ix - rd_ix) < 256u; + if (_1571) + { + _1581 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + } + else + { + _1581 = _1571; + } + if (_1581) + { + continue; + } + else + { + break; + } + } + uint tag = 0u; + if ((th_ix + rd_ix) < wr_ix) + { + element_ix = sh_elements[th_ix]; + AnnotatedRef _1602 = { _1249.Load(32) + (element_ix * 40u) }; + ref = _1602; + Alloc _1605; + _1605.offset = _1249.Load(32); + param_14.offset = _1605.offset; + AnnotatedRef param_15 = ref; + tag = Annotated_tag(param_14, param_15).tag; + } + switch (tag) + { + case 1u: + case 3u: + case 2u: + case 4u: + case 5u: + { + uint path_ix = element_ix; + PathRef _1624 = { _1249.Load(16) + (path_ix * 12u) }; + Alloc _1627; + _1627.offset = _1249.Load(16); + param_16.offset = _1627.offset; + PathRef param_17 = _1624; + Path path = Path_read(param_16, param_17); + uint stride = path.bbox.z - path.bbox.x; + sh_tile_stride[th_ix] = stride; + int dx = int(path.bbox.x) - int(bin_tile_x); + int dy = int(path.bbox.y) - int(bin_tile_y); + int x0 = clamp(dx, 0, 16); + int y0 = clamp(dy, 0, 16); + int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16); + int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16); + sh_tile_width[th_ix] = uint(x1 - x0); + sh_tile_x0[th_ix] = uint(x0); + sh_tile_y0[th_ix] = uint(y0); + tile_count = uint(x1 - x0) * uint(y1 - y0); + uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); + sh_tile_base[th_ix] = base; + uint param_18 = path.tiles.offset; + uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_20 = mem_ok; + Alloc path_alloc = new_alloc(param_18, param_19, param_20); + uint param_21 = th_ix; + Alloc param_22 = path_alloc; + write_tile_alloc(param_21, param_22); + break; + } + default: + { + tile_count = 0u; + break; + } + } + sh_tile_count[th_ix] = tile_count; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + GroupMemoryBarrierWithGroupSync(); + if (th_ix >= (1u << i_3)) + { + tile_count += sh_tile_count[th_ix - (1u << i_3)]; + } + GroupMemoryBarrierWithGroupSync(); + sh_tile_count[th_ix] = tile_count; + } + GroupMemoryBarrierWithGroupSync(); + uint total_tile_count = sh_tile_count[255]; + for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u) + { + uint el_ix = 0u; + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + uint probe_1 = el_ix + (128u >> i_4); + if (ix_1 >= sh_tile_count[probe_1 - 1u]) + { + el_ix = probe_1; + } + } + AnnotatedRef _1826 = { _1249.Load(32) + (sh_elements[el_ix] * 40u) }; + AnnotatedRef ref_1 = _1826; + Alloc _1830; + _1830.offset = _1249.Load(32); + param_23.offset = _1830.offset; + AnnotatedRef param_24 = ref_1; + uint tag_1 = Annotated_tag(param_23, param_24).tag; + if (el_ix > 0u) + { + _1841 = sh_tile_count[el_ix - 1u]; + } + else + { + _1841 = 0u; + } + uint seq_ix = ix_1 - _1841; + uint width = sh_tile_width[el_ix]; + uint x = sh_tile_x0[el_ix] + (seq_ix % width); + uint y = sh_tile_y0[el_ix] + (seq_ix / width); + bool include_tile = false; + if ((tag_1 == 4u) || (tag_1 == 5u)) + { + include_tile = true; + } + else + { + if (mem_ok) + { + uint param_25 = el_ix; + bool param_26 = mem_ok; + TileRef _1901 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Alloc param_27 = read_tile_alloc(param_25, param_26); + TileRef param_28 = _1901; + Tile tile = Tile_read(param_27, param_28); + bool _1907 = tile.tile.offset != 0u; + bool _1914; + if (!_1907) + { + _1914 = tile.backdrop != 0; + } + else + { + _1914 = _1907; + } + include_tile = _1914; + } + } + if (include_tile) + { + uint el_slice = el_ix / 32u; + uint el_mask = 1u << (el_ix & 31u); + uint _1934; + InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1934); + } + } + GroupMemoryBarrierWithGroupSync(); + uint slice_ix = 0u; + uint bitmap = sh_bitmaps[0][th_ix]; + while (mem_ok) + { + if (bitmap == 0u) + { + slice_ix++; + if (slice_ix == 8u) + { + break; + } + bitmap = sh_bitmaps[slice_ix][th_ix]; + if (bitmap == 0u) + { + continue; + } + } + uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap))); + uint element_ix_1 = sh_elements[element_ref_ix]; + bitmap &= (bitmap - 1u); + AnnotatedRef _1988 = { _1249.Load(32) + (element_ix_1 * 40u) }; + ref = _1988; + Alloc _1993; + _1993.offset = _1249.Load(32); + param_29.offset = _1993.offset; + AnnotatedRef param_30 = ref; + AnnotatedTag tag_2 = Annotated_tag(param_29, param_30); + if (clip_zero_depth == 0u) + { + switch (tag_2.tag) + { + case 1u: + { + uint param_31 = element_ref_ix; + bool param_32 = mem_ok; + TileRef _2029 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_33 = read_tile_alloc(param_31, param_32); + TileRef param_34 = _2029; + tile_1 = Tile_read(param_33, param_34); + Alloc _2036; + _2036.offset = _1249.Load(32); + param_35.offset = _2036.offset; + AnnotatedRef param_36 = ref; + fill = Annotated_Color_read(param_35, param_36); + Alloc param_37 = cmd_alloc; + CmdRef param_38 = cmd_ref; + uint param_39 = cmd_limit; + bool _2048 = alloc_cmd(param_37, param_38, param_39); + cmd_alloc = param_37; + cmd_ref = param_38; + cmd_limit = param_39; + if (!_2048) + { + break; + } + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + uint param_42 = tag_2.flags; + Tile param_43 = tile_1; + float param_44 = fill.linewidth; + write_fill(param_40, param_41, param_42, param_43, param_44); + cmd_ref = param_41; + CmdColor _2072 = { fill.rgba_color }; + Alloc param_45 = cmd_alloc; + CmdRef param_46 = cmd_ref; + CmdColor param_47 = _2072; + Cmd_Color_write(param_45, param_46, param_47); + cmd_ref.offset += 8u; + break; + } + case 2u: + { + uint param_48 = element_ref_ix; + bool param_49 = mem_ok; + TileRef _2101 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_50 = read_tile_alloc(param_48, param_49); + TileRef param_51 = _2101; + tile_1 = Tile_read(param_50, param_51); + Alloc _2108; + _2108.offset = _1249.Load(32); + param_52.offset = _2108.offset; + AnnotatedRef param_53 = ref; + AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53); + Alloc param_54 = cmd_alloc; + CmdRef param_55 = cmd_ref; + uint param_56 = cmd_limit; + bool _2120 = alloc_cmd(param_54, param_55, param_56); + cmd_alloc = param_54; + cmd_ref = param_55; + cmd_limit = param_56; + if (!_2120) + { + break; + } + Alloc param_57 = cmd_alloc; + CmdRef param_58 = cmd_ref; + uint param_59 = tag_2.flags; + Tile param_60 = tile_1; + float param_61 = fill.linewidth; + write_fill(param_57, param_58, param_59, param_60, param_61); + cmd_ref = param_58; + cmd_lin.index = lin.index; + cmd_lin.line_x = lin.line_x; + cmd_lin.line_y = lin.line_y; + cmd_lin.line_c = lin.line_c; + Alloc param_62 = cmd_alloc; + CmdRef param_63 = cmd_ref; + CmdLinGrad param_64 = cmd_lin; + Cmd_LinGrad_write(param_62, param_63, param_64); + cmd_ref.offset += 20u; + break; + } + case 3u: + { + uint param_65 = element_ref_ix; + bool param_66 = mem_ok; + TileRef _2185 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_67 = read_tile_alloc(param_65, param_66); + TileRef param_68 = _2185; + tile_1 = Tile_read(param_67, param_68); + Alloc _2192; + _2192.offset = _1249.Load(32); + param_69.offset = _2192.offset; + AnnotatedRef param_70 = ref; + AnnoImage fill_img = Annotated_Image_read(param_69, param_70); + Alloc param_71 = cmd_alloc; + CmdRef param_72 = cmd_ref; + uint param_73 = cmd_limit; + bool _2204 = alloc_cmd(param_71, param_72, param_73); + cmd_alloc = param_71; + cmd_ref = param_72; + cmd_limit = param_73; + if (!_2204) + { + break; + } + Alloc param_74 = cmd_alloc; + CmdRef param_75 = cmd_ref; + uint param_76 = tag_2.flags; + Tile param_77 = tile_1; + float param_78 = fill_img.linewidth; + write_fill(param_74, param_75, param_76, param_77, param_78); + cmd_ref = param_75; + CmdImage _2230 = { fill_img.index, fill_img.offset }; + Alloc param_79 = cmd_alloc; + CmdRef param_80 = cmd_ref; + CmdImage param_81 = _2230; + Cmd_Image_write(param_79, param_80, param_81); + cmd_ref.offset += 12u; + break; + } + case 4u: + { + uint param_82 = element_ref_ix; + bool param_83 = mem_ok; + TileRef _2259 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_84 = read_tile_alloc(param_82, param_83); + TileRef param_85 = _2259; + tile_1 = Tile_read(param_84, param_85); + bool _2265 = tile_1.tile.offset == 0u; + bool _2271; + if (_2265) + { + _2271 = tile_1.backdrop == 0; + } + else + { + _2271 = _2265; + } + if (_2271) + { + clip_zero_depth = clip_depth + 1u; + } + else + { + if ((tile_1.tile.offset == 0u) && (clip_depth < 32u)) + { + clip_one_mask |= (1u << clip_depth); + } + else + { + Alloc _2293; + _2293.offset = _1249.Load(32); + param_86.offset = _2293.offset; + AnnotatedRef param_87 = ref; + AnnoBeginClip begin_clip = Annotated_BeginClip_read(param_86, param_87); + Alloc param_88 = cmd_alloc; + CmdRef param_89 = cmd_ref; + uint param_90 = cmd_limit; + bool _2305 = alloc_cmd(param_88, param_89, param_90); + cmd_alloc = param_88; + cmd_ref = param_89; + cmd_limit = param_90; + if (!_2305) + { + break; + } + Alloc param_91 = cmd_alloc; + CmdRef param_92 = cmd_ref; + uint param_93 = tag_2.flags; + Tile param_94 = tile_1; + float param_95 = begin_clip.linewidth; + write_fill(param_91, param_92, param_93, param_94, param_95); + cmd_ref = param_92; + Alloc param_96 = cmd_alloc; + CmdRef param_97 = cmd_ref; + Cmd_BeginClip_write(param_96, param_97); + cmd_ref.offset += 4u; + if (clip_depth < 32u) + { + clip_one_mask &= (~(1u << clip_depth)); + } + } + } + clip_depth++; + break; + } + case 5u: + { + clip_depth--; + bool _2351 = clip_depth >= 32u; + bool _2360; + if (!_2351) + { + _2360 = (clip_one_mask & (1u << clip_depth)) == 0u; + } + else + { + _2360 = _2351; + } + if (_2360) + { + Alloc param_98 = cmd_alloc; + CmdRef param_99 = cmd_ref; + uint param_100 = cmd_limit; + bool _2369 = alloc_cmd(param_98, param_99, param_100); + cmd_alloc = param_98; + cmd_ref = param_99; + cmd_limit = param_100; + if (!_2369) + { + break; + } + Alloc param_101 = cmd_alloc; + CmdRef param_102 = cmd_ref; + Cmd_Solid_write(param_101, param_102); + cmd_ref.offset += 4u; + Alloc param_103 = cmd_alloc; + CmdRef param_104 = cmd_ref; + Cmd_EndClip_write(param_103, param_104); + cmd_ref.offset += 4u; + } + break; + } + } + } + else + { + switch (tag_2.tag) + { + case 4u: + { + clip_depth++; + break; + } + case 5u: + { + if (clip_depth == clip_zero_depth) + { + clip_zero_depth = 0u; + } + clip_depth--; + break; + } + } + } + } + GroupMemoryBarrierWithGroupSync(); + rd_ix += 256u; + if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions)) + { + break; + } + } + bool _2432 = (bin_tile_x + tile_x) < _1249.Load(8); + bool _2441; + if (_2432) + { + _2441 = (bin_tile_y + tile_y) < _1249.Load(12); + } + else + { + _2441 = _2432; + } + if (_2441) + { + Alloc param_105 = cmd_alloc; + CmdRef param_106 = cmd_ref; + Cmd_End_write(param_105, param_106); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl new file mode 100644 index 0000000..096f710 --- /dev/null +++ b/piet-gpu/shader/gen/coarse.msl @@ -0,0 +1,1378 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +// Implementation of the GLSL findLSB() function +template +inline T spvFindLSB(T x) +{ + return select(ctz(x), T(-1), x == T(0)); +} + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoImageRef +{ + uint offset; +}; + +struct AnnoImage +{ + float4 bbox; + float linewidth; + uint index; + int2 offset; +}; + +struct AnnoColorRef +{ + uint offset; +}; + +struct AnnoColor +{ + float4 bbox; + float linewidth; + uint rgba_color; +}; + +struct AnnoLinGradientRef +{ + uint offset; +}; + +struct AnnoLinGradient +{ + float4 bbox; + float linewidth; + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct AnnoBeginClipRef +{ + uint offset; +}; + +struct AnnoBeginClip +{ + float4 bbox; + float linewidth; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct Tile +{ + TileSegRef tile; + int backdrop; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_296.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const uint& index) +{ + return BinInstanceRef{ ref.offset + (index * 4u) }; +} + +static inline __attribute__((always_inline)) +BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + BinInstance s; + s.element_ix = raw0; + return s; +} + +static inline __attribute__((always_inline)) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_296, v_296BufferSize); + return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a) +{ +} + +static inline __attribute__((always_inline)) +Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint param = 0u; + uint param_1 = uint(int((v_296BufferSize - 8) / 4) * 4); + bool param_2 = mem_ok; + return new_alloc(param, param_1, param_2); +} + +static inline __attribute__((always_inline)) +Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Tile s; + s.tile = TileSegRef{ raw0 }; + s.backdrop = int(raw1); + return s; +} + +static inline __attribute__((always_inline)) +AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize); + AnnoColor s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.linewidth = as_type(raw4); + s.rgba_color = raw5; + return s; +} + +static inline __attribute__((always_inline)) +AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + AnnoColorRef param_1 = AnnoColorRef{ ref.offset + 4u }; + return AnnoColor_read(param, param_1, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint _302 = atomic_fetch_add_explicit((device atomic_uint*)&v_296.mem_offset, size, memory_order_relaxed); + uint offset = _302; + MallocResult r; + r.failed = (offset + size) > uint(int((v_296BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _331 = atomic_fetch_max_explicit((device atomic_uint*)&v_296.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_296.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.new_ref; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 10u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u }; + CmdJump param_5 = s; + CmdJump_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_296, constant uint& v_296BufferSize) +{ + if (cmd_ref.offset < cmd_limit) + { + return true; + } + uint param = 1024u; + MallocResult _1156 = malloc(param, v_296, v_296BufferSize); + MallocResult new_cmd = _1156; + if (new_cmd.failed) + { + return false; + } + CmdJump jump = CmdJump{ new_cmd.alloc.offset }; + Alloc param_1 = cmd_alloc; + CmdRef param_2 = cmd_ref; + CmdJump param_3 = jump; + Cmd_Jump_write(param_1, param_2, param_3, v_296, v_296BufferSize); + cmd_alloc = new_cmd.alloc; + cmd_ref = CmdRef{ cmd_alloc.offset }; + cmd_limit = (cmd_alloc.offset + 1024u) - 60u; + return true; +} + +static inline __attribute__((always_inline)) +uint fill_mode_from_flags(thread const uint& flags) +{ + return flags & 1u; +} + +static inline __attribute__((always_inline)) +void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = uint(s.backdrop); + write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 1u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u }; + CmdFill param_5 = s; + CmdFill_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 3u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.half_width); + write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 2u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u }; + CmdStroke param_5 = s; + CmdStroke_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint param = flags; + if (fill_mode_from_flags(param) == 0u) + { + if (tile.tile.offset != 0u) + { + CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop }; + Alloc param_1 = alloc; + CmdRef param_2 = cmd_ref; + CmdFill param_3 = cmd_fill; + Cmd_Fill_write(param_1, param_2, param_3, v_296, v_296BufferSize); + cmd_ref.offset += 12u; + } + else + { + Alloc param_4 = alloc; + CmdRef param_5 = cmd_ref; + Cmd_Solid_write(param_4, param_5, v_296, v_296BufferSize); + cmd_ref.offset += 4u; + } + } + else + { + CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth }; + Alloc param_6 = alloc; + CmdRef param_7 = cmd_ref; + CmdStroke param_8 = cmd_stroke; + Cmd_Stroke_write(param_6, param_7, param_8, v_296, v_296BufferSize); + cmd_ref.offset += 12u; + } +} + +static inline __attribute__((always_inline)) +void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.rgba_color; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 5u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u }; + CmdColor param_5 = s; + CmdColor_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_296, v_296BufferSize); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15, v_296, v_296BufferSize); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17, v_296, v_296BufferSize); + AnnoLinGradient s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.linewidth = as_type(raw4); + s.index = raw5; + s.line_x = as_type(raw6); + s.line_y = as_type(raw7); + s.line_c = as_type(raw8); + return s; +} + +static inline __attribute__((always_inline)) +AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + AnnoLinGradientRef param_1 = AnnoLinGradientRef{ ref.offset + 4u }; + return AnnoLinGradient_read(param, param_1, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.line_x); + write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.line_y); + write_mem(param_6, param_7, param_8, v_296, v_296BufferSize); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.line_c); + write_mem(param_9, param_10, param_11, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 6u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u }; + CmdLinGrad param_5 = s; + CmdLinGrad_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_296, v_296BufferSize); + AnnoImage s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.linewidth = as_type(raw4); + s.index = raw5; + s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16); + return s; +} + +static inline __attribute__((always_inline)) +AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + AnnoImageRef param_1 = AnnoImageRef{ ref.offset + 4u }; + return AnnoImage_read(param, param_1, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); + write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 7u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); + Alloc param_3 = a; + CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u }; + CmdImage param_5 = s; + CmdImage_write(param_3, param_4, param_5, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +AnnoBeginClip AnnoBeginClip_read(thread const Alloc& a, thread const AnnoBeginClipRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + AnnoBeginClip s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.linewidth = as_type(raw4); + return s; +} + +static inline __attribute__((always_inline)) +AnnoBeginClip Annotated_BeginClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + AnnoBeginClipRef param_1 = AnnoBeginClipRef{ ref.offset + 4u }; + return AnnoBeginClip_read(param, param_1, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 8u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 9u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 0u; + write_mem(param, param_1, param_2, v_296, v_296BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_296 [[buffer(0)]], const device ConfigBuf& _1249 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_bitmaps[8][256]; + threadgroup Alloc sh_part_elements[256]; + threadgroup uint sh_part_count[256]; + threadgroup uint sh_elements[256]; + threadgroup uint sh_tile_stride[256]; + threadgroup uint sh_tile_width[256]; + threadgroup uint sh_tile_x0[256]; + threadgroup uint sh_tile_y0[256]; + threadgroup uint sh_tile_base[256]; + threadgroup uint sh_tile_count[256]; + constant uint& v_296BufferSize = spvBufferSizeConstants[0]; + uint width_in_bins = ((_1249.conf.width_in_tiles + 16u) - 1u) / 16u; + uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; + uint partition_ix = 0u; + uint n_partitions = ((_1249.conf.n_elements + 256u) - 1u) / 256u; + uint th_ix = gl_LocalInvocationID.x; + uint bin_tile_x = 16u * gl_WorkGroupID.x; + uint bin_tile_y = 16u * gl_WorkGroupID.y; + uint tile_x = gl_LocalInvocationID.x % 16u; + uint tile_y = gl_LocalInvocationID.x / 16u; + uint this_tile_ix = (((bin_tile_y + tile_y) * _1249.conf.width_in_tiles) + bin_tile_x) + tile_x; + Alloc param; + param.offset = _1249.conf.ptcl_alloc.offset; + uint param_1 = this_tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint cmd_limit = (cmd_ref.offset + 1024u) - 60u; + uint clip_depth = 0u; + uint clip_zero_depth = 0u; + uint clip_one_mask = 0u; + uint rd_ix = 0u; + uint wr_ix = 0u; + uint part_start_ix = 0u; + uint ready_ix = 0u; + bool mem_ok = v_296.mem_error == 0u; + Alloc param_3; + Alloc param_5; + uint _1529; + uint element_ix; + AnnotatedRef ref; + Alloc param_14; + Alloc param_16; + uint tile_count; + Alloc param_23; + uint _1841; + Alloc param_29; + Tile tile_1; + AnnoColor fill; + Alloc param_35; + Alloc param_52; + CmdLinGrad cmd_lin; + Alloc param_69; + Alloc param_86; + while (true) + { + for (uint i = 0u; i < 8u; i++) + { + sh_bitmaps[i][th_ix] = 0u; + } + bool _1581; + for (;;) + { + if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) + { + part_start_ix = ready_ix; + uint count = 0u; + bool _1379 = th_ix < 256u; + bool _1387; + if (_1379) + { + _1387 = (partition_ix + th_ix) < n_partitions; + } + else + { + _1387 = _1379; + } + if (_1387) + { + uint in_ix = (_1249.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + param_3.offset = _1249.conf.bin_alloc.offset; + uint param_4 = in_ix; + count = read_mem(param_3, param_4, v_296, v_296BufferSize); + param_5.offset = _1249.conf.bin_alloc.offset; + uint param_6 = in_ix + 1u; + uint offset = read_mem(param_5, param_6, v_296, v_296BufferSize); + uint param_7 = offset; + uint param_8 = count * 4u; + bool param_9 = mem_ok; + sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9); + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + if (th_ix < 256u) + { + sh_part_count[th_ix] = count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix < 256u) + { + if (th_ix >= (1u << i_1)) + { + count += sh_part_count[th_ix - (1u << i_1)]; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + if (th_ix < 256u) + { + sh_part_count[th_ix] = part_start_ix + count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + ready_ix = sh_part_count[255]; + partition_ix += 256u; + } + uint ix = rd_ix + th_ix; + if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok) + { + uint part_ix = 0u; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + uint probe = part_ix + (128u >> i_2); + if (ix >= sh_part_count[probe - 1u]) + { + part_ix = probe; + } + } + if (part_ix > 0u) + { + _1529 = sh_part_count[part_ix - 1u]; + } + else + { + _1529 = part_start_ix; + } + ix -= _1529; + Alloc bin_alloc = sh_part_elements[part_ix]; + BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset }; + BinInstanceRef param_10 = inst_ref; + uint param_11 = ix; + Alloc param_12 = bin_alloc; + BinInstanceRef param_13 = BinInstance_index(param_10, param_11); + BinInstance inst = BinInstance_read(param_12, param_13, v_296, v_296BufferSize); + sh_elements[th_ix] = inst.element_ix; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + wr_ix = min((rd_ix + 256u), ready_ix); + bool _1571 = (wr_ix - rd_ix) < 256u; + if (_1571) + { + _1581 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + } + else + { + _1581 = _1571; + } + if (_1581) + { + continue; + } + else + { + break; + } + } + uint tag = 0u; + if ((th_ix + rd_ix) < wr_ix) + { + element_ix = sh_elements[th_ix]; + ref = AnnotatedRef{ _1249.conf.anno_alloc.offset + (element_ix * 40u) }; + param_14.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_15 = ref; + tag = Annotated_tag(param_14, param_15, v_296, v_296BufferSize).tag; + } + switch (tag) + { + case 1u: + case 3u: + case 2u: + case 4u: + case 5u: + { + uint path_ix = element_ix; + param_16.offset = _1249.conf.tile_alloc.offset; + PathRef param_17 = PathRef{ _1249.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_16, param_17, v_296, v_296BufferSize); + uint stride = path.bbox.z - path.bbox.x; + sh_tile_stride[th_ix] = stride; + int dx = int(path.bbox.x) - int(bin_tile_x); + int dy = int(path.bbox.y) - int(bin_tile_y); + int x0 = clamp(dx, 0, 16); + int y0 = clamp(dy, 0, 16); + int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16); + int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16); + sh_tile_width[th_ix] = uint(x1 - x0); + sh_tile_x0[th_ix] = uint(x0); + sh_tile_y0[th_ix] = uint(y0); + tile_count = uint(x1 - x0) * uint(y1 - y0); + uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); + sh_tile_base[th_ix] = base; + uint param_18 = path.tiles.offset; + uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_20 = mem_ok; + Alloc path_alloc = new_alloc(param_18, param_19, param_20); + uint param_21 = th_ix; + Alloc param_22 = path_alloc; + write_tile_alloc(param_21, param_22); + break; + } + default: + { + tile_count = 0u; + break; + } + } + sh_tile_count[th_ix] = tile_count; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix >= (1u << i_3)) + { + tile_count += sh_tile_count[th_ix - (1u << i_3)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tile_count[th_ix] = tile_count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_tile_count = sh_tile_count[255]; + for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u) + { + uint el_ix = 0u; + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + uint probe_1 = el_ix + (128u >> i_4); + if (ix_1 >= sh_tile_count[probe_1 - 1u]) + { + el_ix = probe_1; + } + } + AnnotatedRef ref_1 = AnnotatedRef{ _1249.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) }; + param_23.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_24 = ref_1; + uint tag_1 = Annotated_tag(param_23, param_24, v_296, v_296BufferSize).tag; + if (el_ix > 0u) + { + _1841 = sh_tile_count[el_ix - 1u]; + } + else + { + _1841 = 0u; + } + uint seq_ix = ix_1 - _1841; + uint width = sh_tile_width[el_ix]; + uint x = sh_tile_x0[el_ix] + (seq_ix % width); + uint y = sh_tile_y0[el_ix] + (seq_ix / width); + bool include_tile = false; + if ((tag_1 == 4u) || (tag_1 == 5u)) + { + include_tile = true; + } + else + { + if (mem_ok) + { + uint param_25 = el_ix; + bool param_26 = mem_ok; + Alloc param_27 = read_tile_alloc(param_25, param_26, v_296, v_296BufferSize); + TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Tile tile = Tile_read(param_27, param_28, v_296, v_296BufferSize); + bool _1907 = tile.tile.offset != 0u; + bool _1914; + if (!_1907) + { + _1914 = tile.backdrop != 0; + } + else + { + _1914 = _1907; + } + include_tile = _1914; + } + } + if (include_tile) + { + uint el_slice = el_ix / 32u; + uint el_mask = 1u << (el_ix & 31u); + uint _1934 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint slice_ix = 0u; + uint bitmap = sh_bitmaps[0][th_ix]; + while (mem_ok) + { + if (bitmap == 0u) + { + slice_ix++; + if (slice_ix == 8u) + { + break; + } + bitmap = sh_bitmaps[slice_ix][th_ix]; + if (bitmap == 0u) + { + continue; + } + } + uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap))); + uint element_ix_1 = sh_elements[element_ref_ix]; + bitmap &= (bitmap - 1u); + ref = AnnotatedRef{ _1249.conf.anno_alloc.offset + (element_ix_1 * 40u) }; + param_29.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_30 = ref; + AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_296, v_296BufferSize); + if (clip_zero_depth == 0u) + { + switch (tag_2.tag) + { + case 1u: + { + uint param_31 = element_ref_ix; + bool param_32 = mem_ok; + Alloc param_33 = read_tile_alloc(param_31, param_32, v_296, v_296BufferSize); + TileRef param_34 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + tile_1 = Tile_read(param_33, param_34, v_296, v_296BufferSize); + param_35.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_36 = ref; + fill = Annotated_Color_read(param_35, param_36, v_296, v_296BufferSize); + Alloc param_37 = cmd_alloc; + CmdRef param_38 = cmd_ref; + uint param_39 = cmd_limit; + bool _2048 = alloc_cmd(param_37, param_38, param_39, v_296, v_296BufferSize); + cmd_alloc = param_37; + cmd_ref = param_38; + cmd_limit = param_39; + if (!_2048) + { + break; + } + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + uint param_42 = tag_2.flags; + Tile param_43 = tile_1; + float param_44 = fill.linewidth; + write_fill(param_40, param_41, param_42, param_43, param_44, v_296, v_296BufferSize); + cmd_ref = param_41; + Alloc param_45 = cmd_alloc; + CmdRef param_46 = cmd_ref; + CmdColor param_47 = CmdColor{ fill.rgba_color }; + Cmd_Color_write(param_45, param_46, param_47, v_296, v_296BufferSize); + cmd_ref.offset += 8u; + break; + } + case 2u: + { + uint param_48 = element_ref_ix; + bool param_49 = mem_ok; + Alloc param_50 = read_tile_alloc(param_48, param_49, v_296, v_296BufferSize); + TileRef param_51 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + tile_1 = Tile_read(param_50, param_51, v_296, v_296BufferSize); + param_52.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_53 = ref; + AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_296, v_296BufferSize); + Alloc param_54 = cmd_alloc; + CmdRef param_55 = cmd_ref; + uint param_56 = cmd_limit; + bool _2120 = alloc_cmd(param_54, param_55, param_56, v_296, v_296BufferSize); + cmd_alloc = param_54; + cmd_ref = param_55; + cmd_limit = param_56; + if (!_2120) + { + break; + } + Alloc param_57 = cmd_alloc; + CmdRef param_58 = cmd_ref; + uint param_59 = tag_2.flags; + Tile param_60 = tile_1; + float param_61 = fill.linewidth; + write_fill(param_57, param_58, param_59, param_60, param_61, v_296, v_296BufferSize); + cmd_ref = param_58; + cmd_lin.index = lin.index; + cmd_lin.line_x = lin.line_x; + cmd_lin.line_y = lin.line_y; + cmd_lin.line_c = lin.line_c; + Alloc param_62 = cmd_alloc; + CmdRef param_63 = cmd_ref; + CmdLinGrad param_64 = cmd_lin; + Cmd_LinGrad_write(param_62, param_63, param_64, v_296, v_296BufferSize); + cmd_ref.offset += 20u; + break; + } + case 3u: + { + uint param_65 = element_ref_ix; + bool param_66 = mem_ok; + Alloc param_67 = read_tile_alloc(param_65, param_66, v_296, v_296BufferSize); + TileRef param_68 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + tile_1 = Tile_read(param_67, param_68, v_296, v_296BufferSize); + param_69.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_70 = ref; + AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_296, v_296BufferSize); + Alloc param_71 = cmd_alloc; + CmdRef param_72 = cmd_ref; + uint param_73 = cmd_limit; + bool _2204 = alloc_cmd(param_71, param_72, param_73, v_296, v_296BufferSize); + cmd_alloc = param_71; + cmd_ref = param_72; + cmd_limit = param_73; + if (!_2204) + { + break; + } + Alloc param_74 = cmd_alloc; + CmdRef param_75 = cmd_ref; + uint param_76 = tag_2.flags; + Tile param_77 = tile_1; + float param_78 = fill_img.linewidth; + write_fill(param_74, param_75, param_76, param_77, param_78, v_296, v_296BufferSize); + cmd_ref = param_75; + Alloc param_79 = cmd_alloc; + CmdRef param_80 = cmd_ref; + CmdImage param_81 = CmdImage{ fill_img.index, fill_img.offset }; + Cmd_Image_write(param_79, param_80, param_81, v_296, v_296BufferSize); + cmd_ref.offset += 12u; + break; + } + case 4u: + { + uint param_82 = element_ref_ix; + bool param_83 = mem_ok; + Alloc param_84 = read_tile_alloc(param_82, param_83, v_296, v_296BufferSize); + TileRef param_85 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + tile_1 = Tile_read(param_84, param_85, v_296, v_296BufferSize); + bool _2265 = tile_1.tile.offset == 0u; + bool _2271; + if (_2265) + { + _2271 = tile_1.backdrop == 0; + } + else + { + _2271 = _2265; + } + if (_2271) + { + clip_zero_depth = clip_depth + 1u; + } + else + { + if ((tile_1.tile.offset == 0u) && (clip_depth < 32u)) + { + clip_one_mask |= (1u << clip_depth); + } + else + { + param_86.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef param_87 = ref; + AnnoBeginClip begin_clip = Annotated_BeginClip_read(param_86, param_87, v_296, v_296BufferSize); + Alloc param_88 = cmd_alloc; + CmdRef param_89 = cmd_ref; + uint param_90 = cmd_limit; + bool _2305 = alloc_cmd(param_88, param_89, param_90, v_296, v_296BufferSize); + cmd_alloc = param_88; + cmd_ref = param_89; + cmd_limit = param_90; + if (!_2305) + { + break; + } + Alloc param_91 = cmd_alloc; + CmdRef param_92 = cmd_ref; + uint param_93 = tag_2.flags; + Tile param_94 = tile_1; + float param_95 = begin_clip.linewidth; + write_fill(param_91, param_92, param_93, param_94, param_95, v_296, v_296BufferSize); + cmd_ref = param_92; + Alloc param_96 = cmd_alloc; + CmdRef param_97 = cmd_ref; + Cmd_BeginClip_write(param_96, param_97, v_296, v_296BufferSize); + cmd_ref.offset += 4u; + if (clip_depth < 32u) + { + clip_one_mask &= (~(1u << clip_depth)); + } + } + } + clip_depth++; + break; + } + case 5u: + { + clip_depth--; + bool _2351 = clip_depth >= 32u; + bool _2360; + if (!_2351) + { + _2360 = (clip_one_mask & (1u << clip_depth)) == 0u; + } + else + { + _2360 = _2351; + } + if (_2360) + { + Alloc param_98 = cmd_alloc; + CmdRef param_99 = cmd_ref; + uint param_100 = cmd_limit; + bool _2369 = alloc_cmd(param_98, param_99, param_100, v_296, v_296BufferSize); + cmd_alloc = param_98; + cmd_ref = param_99; + cmd_limit = param_100; + if (!_2369) + { + break; + } + Alloc param_101 = cmd_alloc; + CmdRef param_102 = cmd_ref; + Cmd_Solid_write(param_101, param_102, v_296, v_296BufferSize); + cmd_ref.offset += 4u; + Alloc param_103 = cmd_alloc; + CmdRef param_104 = cmd_ref; + Cmd_EndClip_write(param_103, param_104, v_296, v_296BufferSize); + cmd_ref.offset += 4u; + } + break; + } + } + } + else + { + switch (tag_2.tag) + { + case 4u: + { + clip_depth++; + break; + } + case 5u: + { + if (clip_depth == clip_zero_depth) + { + clip_zero_depth = 0u; + } + clip_depth--; + break; + } + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + rd_ix += 256u; + if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions)) + { + break; + } + } + bool _2432 = (bin_tile_x + tile_x) < _1249.conf.width_in_tiles; + bool _2441; + if (_2432) + { + _2441 = (bin_tile_y + tile_y) < _1249.conf.height_in_tiles; + } + else + { + _2441 = _2432; + } + if (_2441) + { + Alloc param_105 = cmd_alloc; + CmdRef param_106 = cmd_ref; + Cmd_End_write(param_105, param_106, v_296, v_296BufferSize); + } +} + diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/gen/coarse.spv similarity index 61% rename from piet-gpu/shader/coarse.spv rename to piet-gpu/shader/gen/coarse.spv index 8d4f7c07d368e18d09344ad3fd846eae873874c0..fbe025d39c6a00852ac9e6e2b1528683dc1a5848 100644 GIT binary patch literal 64040 zcmbWA1)v?(*|jI!n-JXHg1fsDEJ$$I5QRVlB3N;Ehf+$hQXEPtPLbkJD5aFPIJ8Ki zIK}OMo^xj1yYGB$`TlR(VXw8`cfb4HGW*P&IY~MunSau%>Qzl%^{vjDtZE(8Rg<7p z)f82qRykncHUpO$KVjHXE3CMT4pUbhEkAvxsV1#@Q+9HW9z1f4hIb*~a!yNq_30FG z*59*(|4l-9KcthE2MioEV2dpV4BTMQ(0%tEG=9QCqlOO}HFC`GLBmFl8$NWx$o+@w z*SGK+IcDgniNi{V-guNb$S0?b{-Z{X9ZK1UEae_!M~oOhd_sjXDfw5`nBfNu8m!+U z6PI0`{|{Mqb!?s0ls$YqtEu4QM;=Tz*6m~*==1Q=gU0S7@6+n%X01S54zjrkwd~{bL?KWmW>uz)MYIgX9u@i^xHHelM89!m%#Gw~Hc*PF%K?eY@VQ_@zB_oekdI?!0!v_!hk72PJbAHM)<^u5Um{U}X!^e*rIrN9~>!=nn7N>C+?GdN5TFf|I zb2eGEGdVWK_f;D-gCTDkEs=7DSZ@Myic_(W&bf_#%?@%@SelB z9zFuvT2Ja8-Ff$}R)%-^f?aoN9z%wVJFOn``Nn{ zqdBkoI)!tHTSWWc2Kw0JK9jkXU1w%cq6AD9;KoKbhyV@ZA`q3)h6JK)n;`& z>ki*@C&4{NeUX-J={UwdPp$u4bsVLGKachkMr5T(e(W z5i9-L8r<$zpK80hjR}J%3?EkXN!_FD(++U+Y58tDcu(zqvZ;GaK(dc=`_sL5Ok3>& z@2Wcp&;NRvCY$!uc#ZBv{p7zhxSjv>)gC?KwwC`7k5bR@WZY5hUc?{N6@SKRc#rtS z!ukJhrM=g%Y`glw$6Z%H_U_S6Ylr>+Zoz)Ie`cIUcEz8e8rvg&*E0D(wK2MA!yBEf zrTxI`jvTY`nDG+^j~Uwjgm68mdvvd*sjL0rh3}x&KA$yF7+8|6Mc~Jh{qts`-Ck!k z5uDEl?;HG_D5|m+`?NIVYWS$(quJjEjl`*UHGOqRPv8GO4n@%9V@&fom@?!2BXDv% z47}yw347t+x+dfv-DA*M9SPr-+n)HyJxZUW;KosREae{E`$t>rg*Jg$?a%6|tK;C- z`|lmAGpu(&`*RHS(w}3&?f&$xP6V6lHpBNc-BvDY9%a5K!Sz!&U%7|5Svn_b<~wBY z(0zuD8@sQ1Z|)*bqqOdS&sh&rC~`i9ddc}za69Kd)fwRRMi1L&!nm>fl=my@9^HBO zuFisY`GU<;&BNTuslMN7>*l}L;87z6)i*7@t68e^(Yj&`YTZ?4MR(ts);A+ve(6e6 zr+-V1;7*p9&da%!>En6eb{{*cAA{R--n+U0Oxz8*ts`oC{M9_V``TGu2(SHG{git& z`>Fmv=V_+uGW5Fs5BIXprZrvNeN?rFBkiI6yO@~i-zDJm?^1C3cR84LbB{yry(=}3 z?*8?zu7Jl+n{%n=(ewUlrs_Jh`2KKb(sZLYExDz&`f@cf)0b;(lgY4;*Mi%9=~La% zZnyqGX-!wFKk6Q3e{O`6W9{3XS2d69e~yKk^EJw6#`-==&HJsER^Ladb=_~Z?xSX_ zZbj>ERXp1MEW)1=bqXG^x1`;T0%Uc+}2H~qT@oc`SlPXF$2x4p^4 z(d|1(^H%fd&atz45KfNzwfZLa@EN@wcc$ubc?!sQwIY_jl&%C9r+jru9Mce`&d0)74UM>8-Y(FWP>gX#2&kwx_9H zOWR{djr{)^N&EIn5&u%Bl~z{XJakE~NF9nBPU!QM>P-sPFe%o%jzQ)w-Q; z^`$p_aLd-WngZPV?yYAizs_vcovcS)z8{5me}6emH66T_z@VX{|MS;L?H#bSZ*_fV z#$7CRsnz}-wa#^#T5Df_y{lPS@_sq1>>Q|lDEIsn-tY(FGKeO{)Dcm8%og8~tv%^Of-_tp_bD<5c z<1GO0{tW1>7J_d!eDv6Ht^1BXEpDvVSl8b@dvB&6|8MET$Bpx=uhwUty7w;lw_@1d zB06%2mjUbRdRQC0@8EHReKwW%Z}9aReA@=!zQG4K_|P7_quLw3f1Sr6;P!K6XLSU4 z+~5P2!PLs*=pOYtt79Ad__qJD@jt0yKc&G>Yx^%3|FatQa~k}-w*T_+zo22iu)!~C z`>zoH%Nq8}8~lp4|BCUyreVLX!Eb2$uN42A8upue@Q&&>czciNsQv=ZIKKcc$NA+R z^*XCp8~kr=|E_UU36jHTYu<{<{W$roo?W@Ru6=l?H#S!Qbw|JE~9NnZGZ=<@|lq!?&~g zt_Sa^dU3mybucM7ai##5ai)fs`@*yhK7HH2YaPtgu+P%qv$g%Z*1?<&``kTvM>Q`z zc`gr5o-2dPJXh;cud`aC!Pjd0cjei?VPCHY@2EC_r`@f-Y_;QKZB!3};$gCE+1cT|VN)7G&)Y@OBd4Sqro-cg-W)IF<* zt+P6(!Ov~*iyHir2EVkyuWImX8vNP@zp24*Ztz=raPA4<8Hc-j*gC6w8~nZof2zTs zZty?$;Cw#-Pg~FTuys~1Huy^o{(6JI+2C(A_`41MeuIC|;GZ`5=MDZvgMZiH+$Q(f zD|$8fqzyhD0@4L(DI&)VR#H~1V4K5v81-{1>0_+kydM1wEc;LA7oiVeO} zgRjxxYc=@V4ZcBx4`}d>8hrBx-?G8CYVaKze5VHAxxx2n@F5L8w82L<_&yCjs=>!M z_=E%n_h*Ta2Q_lY|(f8E2U zv%00hZ)@=T8vOnqym$3D+%|YT=Ot7-@27kCbXLza__IBDpXx<;>opT^Ne2xXIi~$N z*IB)U-uhyMF^f<@;M1ASlK8jc_C%l$t5 zE=+%JMY`Mf3xQUhwv8>N+H3g8J@?|(>X<=oW8&GAO1=z<-W#26MDA+85Bk0nhK`CM z?N6%S&MKy^I(@MX<~Ip3q>ahYCyX0BW_)dkJ~_JYPP_g4w)!{)`Y>)#MvomcmM>cK zH3{wXZrP{APXCP9rA^h+%&dE?rbcVGn_1A-Zetp3UEiM!D(&sDoEE#$ytj=tV=!H9 zV`jQ+ekb!v*Zi(KaRff*tL8D3dAIbtlGkd@>*gO|$G`o#)~EG5#XUz2+F|UteFlsh zJ8|EQ*SAie>RgdZ?2yic_f+!1FZ@xS#L^E)}~@vFCk+EQ!0Rzd6jKC`3h z4b}nc2@a5jL!O!i%JE{xd?Y{M?t^_A;cYivn-|N#_U+kIf`mGW4 zx_G&(erNTjwsB*J46nzkHycv9Pq{uPLt_V?klzE?->GViYOV2{8eWdqv<*H(gU{51 z^GgCa{pcDR^IQte-&dK#O5mQqXW@5?XzexHS*-^y+ux|cw`}qDm1_Q5*F5>})bQWE z!S{gkc{Xww1Lc|?A>XTg^>yofM~XhIHSQhNe&W{k+EE<{K9Fxn?bnaM2i5p6b)NXt zk5g~qKGkX9#LsU;9fxxZy&aF=KonX#Ztv=1wATJnFADp7HQJEDHhg(|rcg>GA_~Q-!WP?A|gZHVPg{Obzx_u74y|yQ<{so@EFYiXRhdFMo zvHD!UdVTle!?JuYG6}pLgZCWZ?)BYK%?@v`nU2a|zqi*{XSINS#dqn8Hu#e8@x%9P z?KqCd>hSJ)>8RF*lTItQ0br)RHO5u`Cs;#wlT-VN-@b>Sj^Lf!(orTuk7~A>Y*s9b1-5BFk_qTU-TcNkdy`y>< zE#vnHxXk~F27f`{qW>>8>@PR?`wjj6!B_6V`F%H>`z~g^vs$f(jrT?kzHWoB-{2cI z_?8X6b%Sr);5#(O^oz>Y5eqMv0-{2QC_~i|LMT1|};MX?z^$mVogWu8McQyF^4gO$*|Ea;B?ZJCj zAH#?EJMcl?^|@DcR-g3n>8!qJ@UI*E+Xnx>!6%!n=QTG)gHP4q(>D0@4L(bQ&(`3p zH~5+jzIKCe+~Auw_|^@+ZG)fE;HMRQ0OPqJ52JjnfvRdFM%Z`39`m=0E=^gj^z(UD zjUlZDa4%t9pHq$N`{Vh*9^v!0X)9))q04si%-qzPZRc4zZJW26ZL9eV8O?mveCCVR zyVhnbG{-={SqqKrxuwlnX!cdVeuc)6wzLHb&F5eJ7AZ8JebtsIG;>ws)~qW(pLf-K zX_K6N-c?(r(0tBSTcgnIr`kG&<}$x?3&2y|i9?qp&M+h1H9wO9lF!CD1K)DEZssMr@x?O{!}0IO>Cq4F*n6tah?nE8 z?pUkK=O%*Vtk1ms*EZy-tMA;@vH$AUnHN4irJ8=moVmvO%*$Gug;L+>#&!*uYc%WX z>$uo{bbS|w8!KAdx3gNZ_LZAEx81F}=HQDTb+wtXEl=5xa~5o_13AOm8Y^G2_T%iu zseL6%HS?RBh`uCE9dpxeuC5(z=4E^4;GeP5#~jw8GzYc0@n4&=jAM*-(A15gzcDtT zOpFb|#!#D^Q5-;7#;_kbjkgh8ALCiqc$-ls-sWKAsX3QhP$r)Bw}h)%UtjBQN16KD zgRQS-{T*tXcI)p*sb+nB)pnyyAEOOHOP5sks)8pw#AC(f7!j+xMfuM{ugy_R*AT#x{;?-~>wJ z>0|v9DUE0SlPKk`HS3>Dsb+oa>3;@gayt`jEH&$&Rok3@+d7+4ySmrSIh2!d8qd1g zjd=lOV*V6t9W`TKSli6Wm_MV`u5QeWC==7V+KqWRWn%sUY#lXiR}{7@Ynz(!uA)pl z>uI;n4U~!ZOR(|OydH0?ZT35-x&8{SpLS#4M48ye&~EJ8DHHn+uyxez|DCmMB}!x7 zMX6ogn7^S+OzUbl=KYk3`2g5DYT6zwY!B5oHRC-@nRwRIZk;D66Yoi|@zh)=zo$$c zZN_<~p*b(=&oy+{fcoDX`jxf*c0+f}^#7ot+i&%68@lkHN={dC-CK-bUfK|W*6o$q<6F%!HT1AVQd-{SbYe&+^z%*A--zRZ2t`zpuQ zQ{DIrQyNd(>NWSCOHKQtHMWlNEtf#!|6P1Z35WhmgUx9=&g8Q!nl9lhLptG$lAmL+ z6+E`xz_y*X&S>!S;T?tj61cJAe-*rU!Ec0{v$@?0HaGWL`EP6P=Rn$j6fN-`%g5o< z*8YF2{d0}YchNg+_f5hl!ME;7>(PH6Tz`v1YV=? zy%X+zsCL)ulW>l;y{^4acOK^EF3fvuad~S0Z)fRSVq<%cX-xb0DkVeK(p=AS?=#hs z_v>)y)X%whz{X=}TJ`@4?*&%3zTEp#HRHcmW9uZ|7ic=f-}_emwL8{xGQs*=zaQ9K z?CYHHjcc1RpQ{c{@@wA^>+tsZv_xt?RKflJ- z(f^8?n}hyxzo*wf{k<0Mo|gVS24{(O+5Z7Q0-pMQKc4!I$Les`QsS)*U!?H&`)U1C z-|v%4?)S(g_dDd0``vKK{k~S7_Kx5p>R0e%;Pb%y;q?&MYs`DO_>Uvm8R1JXi>?cw zLmXfCLC=-Af}K;_@Ng}CfxnIEE4lC2O78o$ zlKXzGVcVgj|z7q@YN4~xj3%9=S#KI4U z`%Wy}a*u-hx#+vE*yXK$$g&{Zoa;^3b*u~Rmpv4 z74CZRomIG{@2tX&=R2!#OW#eE-1k!@_nlP9eJ@q=NgCXDQ?WZgzMBfS^!-%0`S^}1 zTz}tDgdJhrwTW|@2SG|_dQj(rSGZ29WUQgg&WWJRN*JUeNPqcbC>U_!u9tZRrvOB-%*8Y z_Z?NZ`T342+|qYc;rd@*aQ%Hp6}zSHsKOmD-%*7-UcRFWH=ggP!j0!Us&Kyp@EujS z<^2VBd>$;g@q9-W{}bT8qYC%A&v#Ve`ulz=e0#X>r^2=Sek$Djd_NU#>HDd0{e3?b zuD|c6!YzG274G=>ek$DY@%>b|@q9lOZam*lg=_cyRJeBEPla3hekxpl-%o|>@B686 z?Y^H1*Y5kNaP7XI3fJ!YDS3bH+5HaVZ`?oo9O!o=e#fvT_u+ixRewj*b_6%n@;AV0 zJ|iAgYj1+pz1Kgk*4~1v>$es+Oy1wCCGOi`wf^|~%=kX1&ymK6&j+=BOdaz>xb@Ui z?<27F{Ej8{K8C9&=TE@aRW}!(E!4z5S6DaW@EJVg;8-N)=jhrJ^GmRr&p8=~ufXc% zID8FP*U$ERZc$7A-+_G|F7y8$uCBk&HS+lP;y$(6-$~HaGfo{~$6ehR3}2;|7`?&G zvF?MWo*0vYjiGLg*7FlMF(v~y^PU_{{lt2{rU0uM#~40ysU^l#;AV`e(bTh!rU9!Z zhR-)__KobQ;*MV zVEY=M*}?j$I}Scus>OdUuv*%k8?5GcOXl1UJTGT*_WKg+`W?x9wQZ|<%;$%zc{t8q z;|sujUUfa&zFa@C&#ks6_L_B2CjY)nsI%mRulU??KLI#SvuI~=wP3r zgMEGu_Sreu=Vr0Fx=*eO_xoSJv(5aihNgZEK76cHtAo`N!)I-^FZs@64NAZJu>6d% zS&MRQP7m$t)OO3Su>0Nbnqd8`*TVFvKiv4%S+}-X8gqTh4LB2X!`g0{oCko7abunH zMqo9+SF>IDdYs0xU2*De2DWba=3w_J*XkDFEjb5r>Zi>)P)ok%W=zY(*a~dS%=gw{ zHOJZd+SJPanwxwZO5<7D|LrKZ=S=^%t?ia+ZwIh*8@?mhHIO;n39OHLe0B!=c@e$~ z-2FUbxGP*A^(*Uf-3@FkZJ*PRL6jbjv9{eg)y!X<*n5D@&%V1J<>JAVj${0Wz#aSK zFchxl=Z)Wu@_t)_0$;&F8jYXTt9W^CGA*#%AVr2 zu@Bfh?5qCfV4kBXlczqTIMvKk+@Jbhe`CSf2YokC?iu@H(|$i=@4d=?V6}3O9|te{ zIUZj2a{^o+_3XzJ!Rn4_`o2Hh`-qIs0dW1)Q|~~qy6cM#mXv^k$+DAlsh{|H?6 z&-UEs4?)+Keg06enn(80!{7&VW}lbqmz<9P+n((nPAOlNk#ek#1TRQweQjB5>ZyM; z*!t?Ccar~c@PC*8@#xx;{|R61?=hjCU5LTzn3t{>E}0p9fy0ww+5U&*$ck!PDSl zed}qrkLOeVg!42`b2F}7f3KIUcS~&-P-f5kDcEbl(mn2HloxS&Soh-EZkhgG0yakW zhD*U}+2iFGavICn;?%tiT<$ABhpS~@`32a+eN5ZsoNC4qC(c#iat^MBI|q)(7L?a; zUdgGSeU|HEpAygVRod}7^1K$F_}78UJgzXn@Rn|s(Tlpcbcjs18hBQ z8M`~dj-9r|z6-1#{u{94oon`Pu$q33w>CA$JGolQGftMx@z?J9vu~DZ?;fyyNqhH# z)$%#6P0jjQ)2_eNzYko_=WpR^na>Bn8DnktbE=t#IB^~Xmt*`8e5JzvFt{9-N8tLX zXM7(8JHEzptmOKcd)l;Yj!o)6j?MayaVD?dfz^$-GNnAWC%}F-W$d1WtJ!z`wW-{z{0 z`&qhozDD^sP7m#WukDuU*Xv+oq#tj9)v|ZWU*$BGIfzsDEwH(SzYR84?n~YQ|ASLK zbMsHI^|YD)o0Mva@h-R==l9_HWPf-ctk(W4Zttxhz+HdZ66Zs(dgk;au(8!0=T9g- z+{3he%&BG{#fkGTuJb}a${?o2CSyd_o4FSHXYb}wfSyT?!BD# zrU&m&X&r6Jbw;rHSjYFLao;QFX%Pn#dCp3l|=!5&`w+7{qcvn_FA zF9a^zS{QC05_b`}KI*BnD7bv?S`1!3J1!2_NBxTWx?TcoEN%8Xxmf0Ta!IhcIu`ny zr}?XIgSH)~`CB@UzGp9=pO=Q0_X*3u%Q;&Xu8(^9vK-j{r@za?^;38MS&=g1sBHyK zHS-WB&dT7d$t{St3f%Rk9-mdg>gMim0_1rXUIUzGEq^mquH`l1+7o9ju=Vrovkq9z z-xg(k_6J)}o7ekxbYfkwy6yWLgch&b-$||yFWa{*pH(+N*Oq6!4Z&(2x&8*g*W=8y zo?Jg;*zU$)+q2z`DCJ)7{_bcK@Gg|r*OqoR1#9!jT&uZuGX9nslYwB@gLAe8rN`I& zYe4Fob9xl zp8f{3oX>5sX-}N(z}C-vZVy)TH-?$d9pKi}=6w3w^&P?Ld0yWcT()U^=WrKvZ5i8L z!D=3v!`xz&!53yHRJi5m%a``(-yy> zgJ-f_2v2`kA1+&x6ES{y0$|Y zOMlNd8myLf$AH=5y6j`&YUZu~zF_O9+wR_!YH4>IIPH3`lDOm1wWZw&V71sMf;)?L z_lK*c-2=eZQMcXwDAlZQUI&5AWlHYY zKd?X%4Z?xIOB0Gnzpok9oXwB z?OYF5i~kMaJvh_uFTwh#C&rE7y*U%(S75b#e%u7Mo_gZ^8m#Vfu>A&BH}fC!(`KKp zrc_HVw}La5!0I;sqfh4UcJM8n`dMGDPsZd;$RU0eKqTlkfG@BQf7(#8W|>#Aq$9t7LAwrCH5)g8~Y@d(^pnrq-u zG;NO2!<2IIW0b~my!|faci>gn=e0c!mgl?aC%}F;ZGG!$*Y6HWZJD#*gPk*NPlDyz ze1`l3SbuF#QOfMD^|kSLuR7C@VQ0X8=PvkEl%EVfy=i30WaHn8*W?bS%>d{ zt*0$x`%kcAt1YqL1*?a@2X;QP7rYNv)6Y53rsfzYSIhr2P9I<^bN>)-jO6qY*qrn; zzFZ&UJD-k)W%BzNY~PdLCt$UFhonu-`dK5+XEVRF|0y=xFTbn(46bfI|Du$q{V%}w zN89IMxov$Zu1nYB*ZOgKX#cjhTPD_bU}J=T54OFmPj8O(QFo2VzXBWEw#4S)Z<4(( z#Qq*R*GLCgKTC7!P1%PtIZayIEt5lEurZRuWMH*iBl1p8V;NhVx>JD7C45S-vF@+0 ziK)PT|EZq&n;L9AZRYRqr_~Z;8gMyA)57)1I-d@#=K9Z^P7ilZwI$9BVD(&!GlGq+ z?zK2GrHA9JZ6;1N`zTJFS-`fRahE&4@wJpY&epX|-C4noNBTY+SS@Q;o0|1AcaC$O z6XyWunQe9YSKcSgiA{UjnhWe)WK8A;&q5vb_{;-V&u3CUu-e-ACC7Q;#@3ea2Id2+ zI|lx~UG8UCKjwBpaG9fR`CPdWy0$!5E(})lFh}7c{KwzJ+b?6w^)tqH*cSuao;H6c zFVA-ai-Y}c!1~(K&Jtj49@g{s_i7n~rNHI-S{m-S0FBwcLlIJdu zRkyx>u0SpRYkt#*wY2>CJpS8rC)?eMSu&o0w>-UGBOMUg!Ul*(%z8=^-!q*2I zH@_p?0IZLC*5ig?<7hJv|Ga}*Vr&F1^Vk@E0ePs$XA`h`_@>}8kAZOW$Zruh1M8!n zJT?a#N1J&Jpi~oYLFu0dP5dq4uDkrkXe+SVC9J9VY~AqL2Cn9x+s*f)+rq7@&9%5a zW!9p$?KpSfbe)Kk%T~m24e75Z!L6Sl!RW_>F>_qc-=+BkAvGu)52r zeIlotb1e4y_s@A90Jc3pn;aXzKR6Jsp0*AGtC^4MD{(CIJ^jI8+jjlvZyWZ{Yebv% zvc@dSb-F#=w(ZxCz}B%3hf;dj2lYcZ)$D^fb9p$}HInQ2NU+)woaS&CrCR)t2J8O_ zW580Xjs>e7!}(yX9S3#~*Jg~PDAf|&5-9xuQ|E~v^%C5A4|FSrjD0oVx5l7&%&(fGvI3Z9fCGB z>*s#gd(q~-wc~Ooezu?3XT!a3PJhmU+aK4Cw)mY3)@F>eDCM?y9;NwOdM|rE8AqHrmw@ee z#_v+FTJG<(sm1>?uv*&vIatko#dd!IPP^JJ=Tx&@vHebe)D!nguyMn$0+(ZdHQce+ zPh0%10c*?H%VWD1T#nIoaJ7ul4dCRe?Rri%a}}Gb^Cef8XACXV&ac4cp1t-au-Xxv z@%c5_T=>`4TKm$?V5X|tm{VK6*SZC)E%R|J*m&yp@kUCu#JwG?mV2N(z}_>f$LCJ4 zy7h0Pl*e`#*t~Mj^&7C7*J6C`u6?q{+yhtBH{-6Bd;WXD<^6QI_uq$2dt(0uJmV{DWY1+h2|T%5y*eD7b9jw!EKz3|(99vmXbmdE|cncko9z&Dq#; z{fy!L{F7kY)Aj^dp8NUVgT0@(zBYXxrc_UVp8_w(I?na;2e?|+33g$ru1FmUnm)x>aUcfRXs;(TlyvE zm%y$u`|~2DT$}sVE8yHay$qJ;-sx4a_fFQ)mUyp$%QgEqxc5%#@%cMgJ^XcWS?3M7 z_fG2Zc@wN2{uX#jI+eBc54b++@p&8E7oS{j@4)p_PrZMF)ysRQcj4x!E%#3Efz=(S z_bKha_ybC9xp(>y?7fpd+VpvWQax+<6Y#%V!~a6pmNon-Sj{7A<1_fjoLR$i{p_=A z_zQ5>@aJH;xvh=uOYk0)*4JikA5p4X-+SV(!E-QX+P(tIwfSdfz6CE*+r9zIGuGdO zm#A&uf#rVJ>pJp3)3H{JXFcs{uNO9T+p}GH+M5Kdzc$;Ir@aoaakSa4JolHKVB7Rs z=_BUwnp5x1spfSccHj50HR{fuD$y-V{o zFWWMve#w0%ur}-42YGUz8EkvypHr9xP2KC@3fh&Z{xT=%zJL|Ta?zdgmFJ=>J)SI)Qlq`5jr z3u1E*(su!{TpLBTF#pLuxDZ&LeQ*)5`=E8SCElXoa!*?f?mnmhp@E(6y`JwD5V`!a?ZujSzSsb?Qt9;{yOgDb$zQCs%G6~XF`lYjQ! z{)<nHozF*;gr_brq8OB>ehE3To0Uey)Ia;&3$kKaMtzuV0p%R066>LhG2R2 z!HvM~gT}L-_O!P#SlwJ~SDyAZ0qd{LcI9brQ?PNg*{(eM;6Si#x({wn>ES-8z8R;Q z`<}SGF4U9PR^W`omSB1E+6J6)*cvR){A>qq?t|N-shjJzlyYOa5AF!gKDYx|ZY=k~ zox#n0a2GUnW9>vK&px;-*nQA^&C9losb6y64XiC=Do^f%z^;dKAKU}39-qNr|E`9t zKe;~1c?j4T{ylZJ>E1mQt{$IZU}Lzyn}=MV*oT9S;cu`KV+341K6`?VVXnrI>l6E4 zU}N}a)DmMPTs=N}gNOHvf*xeZj^x=2%L3 z_Wb?8@8M%S>uEP1wuovx*qG&>KLJfWd)P#<+BnYC+aH{Iwkg-IT-)9go1c6BLD;hA z9|)Ffqo{twf3oKv43=llKLqTaZyjxkcPO~rzYl}E=c~u(aIkv#5#X}Uk#P5X_4phG zRu4ZKT<-bD!1Ym&&#~aXjA6#>IJkc5+4GMFtCxHJ32<}NmOcMOu)1S<5~ckYpG>JO zd;Te4_k4Y{>Eku6o<09`@W0#h&p_9fJ^xIwnn&jMEcj`hx#r~h*=P6sbHLg2&j!oQ z&A&(TT=3qM*4Jikr&6k0-*x_DaMt;GV7WHe`A@)E=jVgv8S9^dv*%v`mS@ku5bU0B zJnLysdp`rKn~Uwr)80j3{k7SyJndZ!HjXyim1obt1Z#&A8}KK@x4wSbll$FZ zW0d>lJ!tB={_h2=`MccIyAPgvwkg-ITo->5G>Dr z`7qf1(mL7_?-6jh_dN=Czf_OUV_^00$H8Tt-@)B4)#LL7SUvnnaJgUp9>BXB%8Tm6&vmvUbzVYKPwbb$ zYPRp%NF2+&S9t~8%7Iw=+lG74Ym^@5q0g(FuXCD*IOG2}@Uq4D{~fO8ca@Hd->a$F zzA>$DoQ%^O;D0wxZ=!2UU)}<%IZkQ&A8^~wZ%p2Xs~tgKu4J5m)jRyhy4vj5yNVq4 zOWQv=KjJhmabmv*Hui(W@fvy`ZlBcS^8t8yd>*QOK7_03Yu~l0*>~Hu4cp3R>c7DM z?lbjMbZz-e{S2(;kv-sZ_$Qpst!>HmGdDj|zXazq^$W1v_3Ph#{1tdCrS-L0|6@vZ z>-(Ad4LEz>*I>CeKU2R0XYczKEYFzummTLb^?P)AK2v*v{Y*8U^|YtGNwBG#i|xwO zUI$ozZMG{5s zjDvsQmpt<`HMseiIt`k-x%&5k$&KY_>U3bQ18x4@U~*&mnK}d5`N_5JGs%o->iYY4 zhspDqIuqE>RP!}2+cKtp$$e(9HtX95dHOsnIDPi-A(N-x?BMj-zmrVvy7K#=Il!M& zT1Q*PVotDi%6)JyH1+J;bA#3VEko+f15Z8klIiDZw;hbxm zk5kP#7bngF;PP5u5bm|E{%GCSLSXgOSs3i~o%b(`z}56KS8ZyptF&oZ?gj1%^6ZP2 ziN7em=9KYS4Bpbn%lO*V%602|g4AE4uAg^EOTt@PQC}_oOV$4QUE9)dHODg7(=u>% z$96f&jIH{zoNA7(IB}N;I~L(9z{{~-5$;&4XDnB;4yU&4|0{#l-T&=#+Oo`N*(zZ3 za~$+H4|83OGP&xrDyN#cip|YEZw+wPy??)+T$_8|THvgE|E@iG`m_!>d!B#qo;-VA zf3SO=@vNsk?X3$|cRp=bp7z!g*EZX2adL6pua9mV>)Eb6bF=~2HtqKSN)Ok+`i7ip z_FG)89rfh32{_}hF<73w27)sV{vCqyT>qPaU!y(iXv=lHIXKs`aqPnuXzE$tTY}Zf z>whbF>e*Mhe#vtiaPr(5EKi=>fs^O9V0rS~9{e>qSVvp(+yR{SlIMdA8_uv(et z&hXSTPq}`MZ`PS*Io`gzko#`Y`PdD6^4J~hXGghT4nk9RUWZV6d|gdismnXzdr-0* zs=<`Tl&2p)%kNmUsl|W4+CO|8+&|ZxJ#{=>ANBZ50GEHRc_Q3D*PJ^0!}U?mJ?#NtV`+1& z52DOiYderr&9N5S-?hj?t}b7@=9cm`YOY^$`w_k_sPIGJec`zl4u$KZ?mex4zt&;k z+|!oNfhodOd2W>}ks+qGmIUEi4&%T8p1D~?U=~%cv>dEOiaB|9b z0PaD@qiah}CxET5?wF)~%Y4S12sS^*Nq=)Q|C1?`zdk2%s+qsoeEp0$6`Vcm6tG;I zpE0L{vuB+KmS@j81N<~T*3o8uiEsIB)r)^OtXJ#bO0XUOjE;x@*4oPC$1>-4F5P&l z8dJ5Jn$tY}H#_|I6a4#L=VC6clX0F0ZvE_0{owlK`9rQx`Z;fH|BkrgZM-&M$0 z0jv4$AiqCa6|A1$lCK6fzPf$0ud9O{ADs>}VQtTIk2w!O zQ}>$Nn9{?~81;=f)m%qn<2dh|a60bB`JkTXO=~^#Y-!(Ip98_><=Pb+&$zK~Uf9K1 z+vYdB331HA>DsoRv)0%(I~%9*=AbnGRz-c+ML$lDIXP2*t{Pi^?gIO7su*it%3QbG z5--<`<@PlxE5nyeO-(Hj+j-UFT zoNDGQcD=YK$<@>DK49C;n2myG%(TUCG+3MY*)Mr)W58;*AitdMwPDab2Xw zj@_aKUJPu0OHg|KY2S}{`g@(Hy>Vdm@bO^dWdEE1*GJv&XU$_GSlt}O$r}({EC9>e@(&lzplY=D7gMN6^n)>~$LD$FOU^Vv< z*S~Y5mUVF?SS{=CD6pE>n?A0;W5CY6^_@ex*!k1nYu&bv1G^XLqfH;zfO^_H0bI6s zB3#XNnD$Nvr#;tz+*r=P{%P-2uS^!C;Ih4+z}3#LeQfWi;I!u&lpE_pO8wK`&%m~)k2ZZ=!|G}8 zVsP2srEs-NY9HJCIXLaPhULa`{pz3gegU>UeYEN0HK3mMt^}9uT@6>es`jzHYr$#H zYd~%+*T4R0?|QK9>7z{_uOaoc_e*fu-mln)M!Zd3^5xtHpjN*fkdWU0}7?e*<<+#C|teE%tlB&SC8Lg4MQU=k(u; zybo-f#gZxRbnm&p1d$4g*=P9sS>ihw$ zW*y&KuSA@u!N$^NzJH{AhSPi>;G=S^?$#?KPn5r#>~$`uZ#N*TvuUoVD;ASUsO# z&x6&n1|4&?_`e8N%jefiV6`PV^>KfA1>B#~HQ}7d#m0$jHDC0y;2+Q;_32KTSqb4|*P<$BaV?R^WjJ$42hUvF-JiwTp9f&ul+$A)&g{<{*LXw9 zO*q}32U2GLnT;6bel%CX-LLu;T)+7me1U@Nzi`3zU$ns&FS!0o6%U^b z^~QDV^_n#{2by|5FXjZRWsSK8)Z#xkSS_Cy^MKXL&x?8Ct}U+# z*N9x~`p`eu|NLODe|@y+;~G;>dkccg_7;Mxm7f=jz|)>9`~);_kk4LI$2jmwSY^{s!}+YW4d z`e@U~JwiS0?Eo&@+X=4bZxr>hy_fm_L$MD9tHnMH?3#~#I9M(A5n$J7?0bUM^7*?L*fx!8UG4e&9SI)9na^Lj z{)xRecva5C-UqCf&)-pC>*w=Vu21TZ2D`qpe~tm`pS@A8PyEM%T_0(0U$9!*+YfC0 zN9)gTxjv~s4(xoV{&=uj>Q4Y$f91NqT%Xk6AME_4{sCaM)IShx{fV5;wOk+T4`3Y} z1lHdAK9e%{4_jc{hSOt9&fGt2RpZSmx90TzVOz@FKO8~~KbOLfD7c?rM;BbbV;lVV zg6n@$!Sz3-!A~o={%00k|FawX+=ApusOJxc(OxT>nc8{&VoEjwx zPkTQDm+f5yS1Uh%FM+2$*O=T`t}p%5-eq9h(?^>=7gDOHyXX5NFcAp4mKCZ z$2R3++tb(av=2{!{T@>vZT3?w{TzU8IH$)D&h&F=jqT?!PWw56GW~pp80n|a+RxTJ z{d@{;y|SNwKvPdYp9ZU?pBv!+N4R?W`6sZs*f-mhi)~L|`)MEk4EFl~eYDw6we-{P zdiUn^*poB;+^fd+b0nwz+=nv#e2Ez8r|&plsd@VO9NcgnejV72sfGyLC#tEZp;0Go?_vrW0!_Vl%%_Te3{_hpx-Xn%SWk23WQ#VK7S$+VvuD?T)e+XBz zU+L>daO-LF8a1Df!RqEJPOkpDI{uqF9tUw|P7bcIdH;yhTo0vmPR##PVq{J%zbyF7 z@NWvP{kwuQ991t)_4NHS?6y<({c|++?2})B)pD<94r&>jufS>Ddj_dbe_j-M_>0=+&)7~WXK`njgsQYCf^q1?SzvJ9l+nob(=HMu7$8mZb z&6znkrpAX;9?R(*98Z}!n3NcfVfYjUcMhg5xPH?%`1A$Wf2M-#KTCtpR&f32EV%x2 z7rY;M-kN88`oir;ImeTssb`KS2dia{?W0=8d`hrd=6EWwS~7Vvy1>2rJ+VpV_)YIPV;Ih3r z;A-Xl_+0R`=N!n5<@oEL_T~ZGo<7?2aSqkf-h5#Docope!D{yu>wJOQCv&tQT+REH z#90V#UlM0wuv!^sk=iG57KN*|?|UX?PWf&SZVt9#41HWbi&HMa>6*&*CD%VWED5%M z`E#O6fz_;k0bLRRyrN+iSmDB6lJ(k=3;}I=XzTKZeHc}wj!E(=3phTTCO*9 zR?C>K0#?iQwklYyyxvxaI~J}l`y?0JAN{i)*95yB_0gt}W1^n+)&`gDtpispueWvK zY0oi{8_V&~Kkcm#wmp5c>EoEHr@alq=HQw+5#Iq|HOFux%8fZ4*Q`0YKB>P6*qA;u z`Fnv)!D`kYNVyrO^|R*W`dHs}wRvrK&55(-xcsW~IX%wd%$hs5#%EHVSL3rOf6VFn z`w3;%-&VxS`g8s4P;m8~8hq!1>p!&M`VVjL5jD?x+Zt|8<$Bu&O+D*vTd-Qzn>ngw zJhua@WxZ_=Rx8)rj&R4p`Ls`RvHj6M`RoFAT*`cQMN`+`{>t-r0PF_thrcn6r9FA= z4leT=1Xpui+Me$h_ki1;b8Wrl!GqD%T_4_83<0ZY_t|q8IDK)v<>uyC+lJV2wqEkw z6Wouw`s=5?{Ee+^=Q2)@i#W4(F0S!UDKDw<&nPeDbnX0{GHYiqVwP)XOu^OnZSegH z?%FxH;QAlZ;D^>cb2Sov34Y~V?Tx0MwX+XcEo;Xd)iOt;z-n1LqrqzB+8GOX92{5s zBp2Hs{gcl)u;WtZGagM{fBP%XoJ;`s!{3<3(w@8~g3G-2hpT1nxKA9wX?s~a?j;A- zde+WCaJ8(RAA!>s$6Ic0j>gKfX6;;E<6lr-Q{yWs zujO>@Tu+&`b2u@}wR3F2)sJuR6KbBZKLTzW<=7vIrk=HM6j&{5!8X-0=0}6ovKEd3 ztCefvIJo_CoXk-!Hb4E7&xv6BS>|&Rn!5hxFV7gC4DN@&F^#3YImWkQ_qd5OWBltH z|B~|N8vly&7EZ_bHp-0gDa0(t_^g7fpVQ#y7ToLeqJryxNrPWnaQ&|;xc=8P__YPs z|E7ZLe{+N1QuB=Qsc`#Tj`3+|>Y4M?!D^Xv$3QJ}eg;@AbABdRt(^0-;f}NO=9tOF zj+g$)=RC0EUgq;-GNG7 zdanPUfz@*TUkpxPoKLyAT|#LaV&~C%$@4O>dFrp9_T>38KH z1M8XR6=>?o^GdK<^1K?HJY7d}b8`*ZhS>FEz2tcv*gW;uPkZvb9$e;m16<8_2+8x8 zaP#!tU-G;WO+9)33apkqe+^Eau3x#ixpr+s?7FpH^1Kynp8D&jy?L$Qhuz~hoVnKT zuJIj|_tf|<%6mDz)_+TxYyCE2dJW0Fw(c&t`n?T)U%~Z%s^I!R-Qa($dDi~zaC0ix z{vBxQxfbsPtL0iWN43n+U0}6bi@yP@mDl1uaL2)MwNG-f{n0=9{1)uEl=<9`rmnyJ zm1j;K0QbY+n8wncydDIXc|8PIYhR0fTKDV^!)?!NG5M|te*|4y*3YA0wLJ4a23GT# z*FWPk5d1i2V%dgy80&YnEwP?}tL3-JPlDBaHqPH$^Lw~$Xv>=Y12|*oJj)#u=h(i9 zonPx_e4YXKBM1HU)81UOk7M_Em@{kkks3ch`Dl$FqI`_gHTyfttl2*ivs|;!7hL_t z27jsMne%7iwo%UcpV8E_2LA$9%Nn#zwaoQj!D?B9&w3?z7(Z3xCn^8H>A3!pGS~j=#L2i? zzS-b!)jWB>1~*Qb_utUeGmd`;t7T7jT;712i+!_Ax!Csf7u%L&nLhsm-j6!^>!-ch z=f7b0c!o25{!@+Z^RqSfx#Z8B_W!Sx>HoXLEc^d{gMUzP_r6aH?sL!Q1-FeaYM$|W z8*UEec)f$Bp7Hu8Sgjnd_u%%!aWO}^*!=Vto11$;#_L0PKkDeOpZ4VO5xAMh$7t$V z&!2$RE+MAhUtB<}f5FwWKYRvGUXF#_TpR~|^}CRIUxFQPZH|dNd42`vzN~=asa2@LmOXUMDTM--S$8aNC%?<{8hg ziEj?&ynchGo_YNitX9tJ_i+2+xR|3{Y<~KS&CPqHjMpUae$>%lKkdn*1KiA`6HPty z+8eBvc|C(Lec`Id)!O<#aCpMwz*smN?~HPT$}&)I4J{72G)G+)Rz8p1GL@tX9s=bZ~RAZ?-8H z+n)Yn+w#6JeV!5Ck2?D6r@h(dx3GKsoil6s^%~pfH)?z(<(r)L{~wg;|E$C;`#*bw z&rxvK^1KE2`Fj3>+r|Pl&v?xQH-~b(W=2!bc+CPa7fRytO$d z^33Zh;9Id--+J1c^ZE&Pj}JIAuOHUfdHtxycTs-K>Ae1nGV{6yG0S;ftHIYUxbwO} z!F_%kP;lGWsOA~ZRpI7P&g*Ju>Y3No!D{8at_imvj*B_U#pb8K*xa_{c_ZVs4!j?A z^w&>&^5_q4=CQ7J&dlq2V71Ka?ZjFiuAX__5S+Xm3%R*C4*KeMC-pW4JKowH6M6F7 z1biPh>swEIb6!8k?oodBdtgG=W@#i->T*rlTG2qDd%P&ntJAD zGq74YH(S8X#lG35Tx@&#i*0MiVs5sE_oI&f`e|?W`D^SRWuL!GKZ6 zDf_%rgYR7P^m!Y&amqe#i>97FZwFQ@`@AFET~pVPIBgsBKcvBj7ToK1WWgVR?^AHw7*+F( z*RF7LD939zH1&+v?qIcYyavPVhvQ<7aq~(#|Sj_ zT)%sQ)h=Otd?vV`SbM?MbN%iOPF{|M+*}+7ef4{gdZWRPw>HN_o;=5ZAIEHc>uGP! zYai-*bWl34&QWKLorm6F=Wi)h;z8_f4KLe2OYR19U9T(g58CYz4+8n>W z)ZHI!p4x0%?)cfi1HjWzTHkuweFv$pv>#PA|Ce&n&q9XV*My@?f|*l-I_O(9|7=KGZn`Y;O7c+YW{6 zqy9N#b2z1kf1i%F!#LIKi`a45iu!VO$J2fs#knf;G#z!10nbROuHAmCnWuAqELbhy zuRFHK!PU*-1WFHcP(Pki%{Im6aAM8X{eI7JJq2uB`(KX1sc71~1|P)lG_abvIp*?Q zk7t0LQ*DmFT-y`WI}7aiYI82++8)Gq4%k@QoELen&vU`^33u1V70P;KS5Ksf981sSk3z8D^LBOf}8a(L{rbc^)s-Vb9xkcx^6B8t6#)v zT-T6V)tyB{yN+k6v2!+Sjs4BcY&G`X#_XKV_neg3Q!b;uTt9wa zbbY}c%U>7Vc5W%S{;G)cokz#)=WzSyc$vH0 zYed`S;H4?m^Y``s0<7*H=5=r-*u(2U+ZCLTaXK#Itoy6Lj$L!zUyZIU|7P$tV70`* z4xISfuI0R!)A-`VzXAO3;{Os|n`>(z->=+Q`{lZJ&;Avfx_|eE-)-Flww}7<;vA}F z?cWS`t!Q&T<=R}=w}M?e+MHXtHsjt7HkLN$S)S|f4zS;K8qa##b3NPzHfDJ}{RT}v z*TdalwL3Xe?;ddK*`{1S|Bgw=)iHD|)34ux)35u$^7QKgaQfvMl>56WucZgUuTfe@ zTiSmJoc4|5ntB*bJ$XL@R`dQN^&W+%o_WjlEAuvgx$DnefYK8L2BF?}AaR<4B?;HhVua{bCNbS!gU_5T1c C(R@q* literal 64200 zcmbWA1%M^hwY6*Lo?&ozYaD{R%V2}M+b{|ZBQOGiKyY_=4FLiK2*CmgAxQ9GQ6zW> zBv=Se;QzjRYjvNlmxTAfN{X}A+WYLYkJPzUx4L^eCR=dws_IuwSq-c%nWAbPGgOnI zRMk|~fL1wp^X)cYdcwpJORu=nvN}v#b+r8SnXa0=>QC9pIc8}0SPg%N{EKsX>Z{M7 zh_n8l9sF-H%KIUmv^;q8A%nNtYVhV84jI1x{zE2AJb3iTA)~v;jvO+gd;G}Z6T1%_ zso%iDuY2t9(UV4$4*l^cbC6F-8-qrV9ygqF09ndC#*G>^VdTULWpeVbs<9&v8ZuPB z#U?Gc2LB(n+#1+AtEv0=c2?8CCv+b|HrDNA9O(1NF+;}fCm+!2=N7F%+m4(tX*4ag z+imq9+%grKV=-!I_vn!$tlz(y89t^bk#-w1q4l;oWi=;!;Xap`#YfJYk$Y@J#6TZ;WlV)GgRxN^~C-5{w38u{ToD# z^lv@eX!URMYQsAA=G|ilj~_asd*s-OtzMeDHjmyu4yXph&86^f^;6xW?5EoQ>}UU0 zjOM)R>%6x1Iwsrp)!fHg+QGeA|7gP8S?%uyOQtiC7d6Z+d72Ldf{NeUX-J={UwdPp$uV$*YL+=yg+k4j2T(e)> z5G(!K7ToUFfNF=jjfq1ijvP_+N!_FD(@t>nY58tHbZ_l`vZ;GaM6!=^`_sF3OkeE= z@2NWk&;NRvCY$!uc#ZBt{p7zZxSjuu)n0w#wwC|5k5b?8WZY5hS;QaG6MyDvWS{uO z!ukJhrM=g%Y+Zoz)Ie`cJzd*aVjjq4M?XPNw;+89%`;f+q# z(gEQ0y2oxZcEZG=V~4jtAzTmY9=&U6+Uh`f;X9hF>-*owp$M9MjA=fHP-eWp2TpF^ z2X8%e;y(Ddt_itE?-+DeN5Z$~wkJMvkJ9HTxN+1SOSwny{?XR@p-m)K`?Gr5>NvRd z{(Hyj4C@`x{v1QS^ygS`yFdM_6T#-X-N?O7x0Q>UN15+QaQ)QHSMFhMmd=Tq`3@U8 ze7_On$L+7)pS#G@D6RY7bJl|tikwfOUUEJa+|GGGbq09-F(bB{IDXuI<^77fM{nN! ztFz!ezF_lI^DuXEs_%E&y7})jbo8hp^-W9vYPRYEw4NA4T6dLM(Y<%3_033+U%JxN z>EBYLxRWKO^Kvd_`gk6=-N%mVd~kct`&So%iMtWEbwq8CznVvHUpuQy;I&_?pK_08 zKh^)|Jk3&Fg(!2t^}ulSA%Ic_c-LsfdB4@t>ia0Qp8KuVebgM) z9caC6eEUfBCf*o}+Ti4ls{ijIw*Nn+jf~4};Ec=d;LPz|?L2xOG3=L`M>!^U!^yYh zr%gXKk8(`Z9FyL)4yMW)4%)MZEreh zO#2SfywyBC^!E>SOgE4JebE1~ z=@84dep|HlyPmc>s^5d#{hhUX32a}sYkiRXUs`V0bhXr5daLc{i?&}V+J3R8?dhsl z)AqR0-Tyx$Y2RKh;=clJkK@eMTXoF#-SW47PGxp_2ePr>y-~FDW=}gE)nCBvF`T`6 zw{ElE9{w+b_#bwb_B_5*>S_SE{XJakE~NF9nBPU!QG4&7sPFe%o%oL&-MXD` z^`$?2Xv;RRnhMwi%{`1#K?H#bSZ}og< z#$7CRsnz}-wa#_AT5Df_{j2F~KDv8Mx6d>E`Bri+w08eTvAg}Z+s0guy@Rto_oqwM z10`yOTc5l-tJ&eb&+L3x3U>%`C&&KPobb`b_jJzfJZMAf zcng7hKLa|eMc`YE95Zfw>%L<^iyNyo*7bMK-ka&i|6BUV@#FpKtM!?u?!62Ctr)hq zn2sFcWx@Ko9@YWxKXm+1pH1b18hrf*-@d_jZ1ABCKD-a_sP=^)Sm$vlxcyw&Ssei$ zKlGqwF}3nIx=+2%>evQ9zU{wU{7-7wPigSe+WyPO|Ez}noCZIy?Y~0&FKXB?Y4FS1 z{wv1+s)qgQ2EV55zf%0KZ`f~a@SEEHE64xVhW)layra4k-rgfRsy~1;&M$z=asG3k zdY#oP4gOl&zh|7^Y}nsw@VDFkJ>&fMhW)(;|Df&PGtM71>>oGyr)~e9asHxV|Ej^i zZu|F)b3ZPOa=mx-;T_cgc*boGaK>#Oa5-*Wed={q3pDsbZU3HeTeM+cyup`f`}d67 z(hd8v4ZeKazh~T5YS>q4@YUM>J>#}!!@hQduiN(T8MpNt_6-|+aNECUeQw&YZ{FZr zw*6Phdfv8S-@d_jZ2PYo|6LpQ-TUy4Y6v{@cqlmYcm%kd$D{kyW1ng8U383H29+p{&<5w*WkZv@Ru6=Rh@ zd9DCXo~wY%JXh~iud`aS!Pjp4_vAUKVPC%w@2EC}r`>J9X?LeSex21W4ZdrG4{z|1 z4L+(5=lgbe+8W!()>-Y};0HALAq{?LgCEw1cT|VN)7G(lY@OBd4Sqr&-cg-W)IF<@ zt+P6(!Ov~*%NqQO2EVewuWRt@8~la_zqP?{Yw+9qaPA4<8HZo=v2|9zZ1DRU{HX?i zroo@>!})#yp0=LvW9zJ5Z19&F{EY_xOM}1F;D2xM_Zs~D2LHIhKW*^O8vN@9=Qg>| zUeT|?CvWg68hpwIpT5CoZ19;HeD(&Pv%%+T@cA2j!3JNb!N1esOE&mY4ZcEyuhifx zH~5+jzIKDJ)8HF6_}~WLxWTt<@U0tsn+D&h!FOr!T^oF_1|Qbo!yCN2!S`$M(G5PK z!6!EOqy|3{?)RHKKewNq8upXn`}&0_htGJYH|%FN_}LA9RUh8J`Z3&Rb)UEs^QV1$ zI;-0o{LTiyufgx{!~0i{!EJ-bb6!HV^M0m}PiOU9ga57%A5gsrZ@p&XE$NV9-DBII zbDh;o=&dhCNZ#*}J``zT6+c z&xPsFtw?YCej(7R)3&jtRQrtV-g_Tjt&SbiHYT1;spQL$=>5^@M&zFM2cYjiaro#M z(*ES??W|(zsWTAUP<|#MhO{vS`o!@=$4;mX(WgZB-D$7?z*ZlpLLb2m%9wFu$MHpL zz9yla{w@2|*y*1Ud$eg^obA*H=NmeXT5n)kM`W(;Pi zZOlxM&F^Gh?wQ~9CXK?!eAPULGw+suSMqADdENXK?D)4o*9NqHPI2$iLv|WBe!s!v z$4%OQlMSphp!z3%+)GaEp4dIE?cLTtMt7am7MCOboz++9ym2-xd|>NmABns9xZy)b zZ!-2k3IiEkJF9^In@4MnP)V(`)cD@WxC7C8efW9A@Rof*HNAYF?g{Nct>#;?=0b0N z7dIbTd4IfcgD=wHi`6mm-QAKk@BQwsf3=*NU%-wSxxZ_{v0FacAtT3+s}1TaL?6dH zx-mm1>{pN7is<7 z)Q3-Aodzd|R(E=Lp>{53_3`E2w878q!#k>r;O)K*sICPkZf}1&s;Bj7t*_4N<#zqn z2zp(-(o>)7LfiOp!$#I)HINOd+^1ZhQ=_p1Pt4B)*x%`EjcTp&oB>{r*GvsQYlF|; zhx0uvoPP8Sjd?DI=FeA|!)oBZKhM%pt%cTJqn*`8;IjSA8hpDJZ(phAzeCNF|LzU{ zp$$G9&gWV82nNbE-7Vjzef9O~9n~205v_6O=XS)c>$Ri$9{9U_Lu$Xi4?eiYN2>G0 zr+%V(3lFHy1SfueM$~b*pwQd#`d610T03t4>MFF>{!uRq`+OtXu%Q#2>Q*1_D(dy> z{i~m&yC>%p!McyOwBGNUpJ?zW8~o`8f3^=FQ2h~}{*~+YMfCRCp1k@TJdvNg8{Hn} zxV6UWEB)&AJ%A6(^1aC9@OF&OYC3T5`tGQ@;O#ZjQ7sN`uP=U9Sij=C^ko`+1^9%K z2eft^$72w@cV2iFhm%e#w=KX-duxn$0Ou#xosag|v;Ux%d>FiJe?)^%Z172ac>n4c zIAhTIvFWsNLR)L?xUIDVZ~t6%J})|}3(?veV>{nFTXot$H^w;C{q0}fTj=d^@2H+c z%lQ2UT;~5=gTJP4(f`*Q_BR{+;|Bkv!9RnS_pqNg>|Zx{KW>`J{5l%Ee}hlg;4?J% zObtG3gU{aJT@AiKgD=$JOEmaW4Zc=`uib}tRO`aI?_v$IR{Pl4D;s>12H&i~w`lO4 z8hn=q->t#-Z1BArd?eg^mgGCWVV~IG2R8WO4Sr;UpV{DN_2K=ibKtI}A>Cv4&b4!H zAD_Q@5S9Dfi_VM9Hc#6g~*5LgceDVgLp}}Wr@L3vs_6DD`!RKr6`5S!01|Qtu zn>6@#4ZcHz@7~~hHu!lBenG(pGoB0cFv`aosH!$*gnbw6v0%IC(UjFnKc83Cn38HR z_Y&6iIn}tnKVAUr5k7yLwqoWPx@;%U%uTJ?cAk~fwt1`Bwwlk7(acxPXTE6tYi;I2 za}4yGz0lasTiV=(W?%K|Dl~?)r7cuwKL6^sSfTmstF~mJnX4MNW zA~??a%+G&qL!P?&&PyHpuWp_B;WJXI>1WJYYpl=wtfko~^^I<9*MPZ3v#!35i|t3( zcTu>pqP2ZHtEFmRxw&)O-KuL2zW7mBn-$v%lwF*&VRIeG8P?WV`I5DZvmdASl_}NC zZ(bt$k~DS9O}n|ecC?w7?U{qWW2KKdtW9YSYV+d14rLj~80(^`8$*9%Y)F|H8-b0X zHZP+%n6ivvKXMvxW4J!Xv##;BpiI0i!NyZ_F1MmgJnL@_SF^sp*583L^>+kYU(NbE z)i&+c-z_zz zJnNrCDR-?||71!v>swF%GboeWnP6k7S^uos=KR~%*_7JVy>8B-oQ%_W*41vzizpNG zVz718jCo0IGbdwSN~vAln3qu|rggO&^J>b({2|ynYTB+TY}eK{HRD}JnRwRIZk?Mb z6YplQ@zlH?Z>eqeJEyt+1g@WUW8X@d*v8Oq?7Juv`)6S5sM-I!Yun0{#=M78ySg!d zPMMh2)o#rDDHHPnuyxe5Jy_Tts%>h<`!!|aSx>ulo}f&;C&9*3bDjK#GIg{W=edUF zyr}=Fp}PjuUvKEw*81BG-7(Yu{f2J8)&JGd&EFh5`n7X3Z`YsNKs5JpH9t$NKMmSI zus-Rh<31C*eqImqnQQKR&rgk6;N=+TYaRWTz~A*dFW6%q#xwV2?!(?!Ikuka#$S}u zc-q#ex%XUZ+83{}b&PMhBpUzk;Y&$4^j`*SPBU;OpXJbW310=$316K29D{A(vF!o2 z?X-19gI@seDC}3jjTQgv;Qb4J3*4N|?GCWHx!1~nRdYWF(*7f8iSJlG2A{t6f426| zHPW9%jprVp_P4IJ@SWlI(f0PNZG&rn<4-~}U+q7r?e6=je=piM-?Z%}W_#}A`ghg* zsG2WP^CN1$an0Rp_20bawww4nGI*}Z@F8&5YWPtOF6Vb>+2Dp1#`u8}TCECN^fgb@+eZL=1eaB;MxN9l#)`Krr`1}2|{;BWx$tCxD z@N}xa59cD^GjJaS?SD{3Q5%@GiWb0DFyjFBkuVNp@!VlFXv(!sigj*L~1) zWv*c7)HXa^OP`@I46QyG&v(L>_Q}$BdL{SWUCDiCS90IgmE7PZ_dQ(6eFsZeYpZQKL+|qYyCHEa#$$f_w?s)hPE!@&~XeIX@TDbncLkqX`9a_nKPgZi@lZBhF z@5#a~eNR?$--(sn_hKb~rNMnSR@%QRxcU07EOwsgB%qPO9X-mnymMrAqF*sc`3~tKjOspNidld`A_ozwfBR zEqzB-@>Lq#cUGm{cUC3$y;Zp5d4K;QIT{Dt7bromKb=aNk*lTl&tb z9f$yloEqzB7uD|c7!u9tZRk)?^sKV`^ z@2J9k*6|%xxbeCQZoCBwel6U0RPnd;9aXsF<2$NwjN!s&L2a$buWs_f)Z;0QWsrxX)d_rwZ5KcU0j!!hJ^-uHAQ3;pXQ% zs&LCI3$DNKsAAXOcU0k)zM~3vynII$?s)l*D%^O!qY5|PeFgVBfcp!spYN#R@A&wR zD%^O!qY6I(?mMb*pZk1A6|TSUr^0uH`;Zo{y`$jf=liMHEqy-~uD|c6!u9w4RJf(@ zr@|c{-%o`*KE9s{H=ggO!j0$qsc`MSp9AkC-%o|>@B686{e3?buHE-j;o5ya z6|UX)Q{mcuKP4Z;J-gpwyvF^r&w+k7;&%*daUae{UiCVfwj;QqmcI#B^BM7|TKfxF z-FyAxYV9q!x_)bO!{q(FTH?M9RvU!B&y4SJ`W$JD_`F~1$J8-DfLl*J^*#h!&+k}L z?;mjWiXH9 z&n;@n|7)<%!)5;8z}5BlxkeuUe%z-v`#Tw$dd8^(?6|8NgW;>x5~Dx3Io1Qv)DvTJ zurbt)(RzLYC&m=uX5Lexsh?QS*HmCN;~2wdF15s%2HcD>Et-1P(R5(7#PB&yE%P!1 z*xbTr1nZM|nhC6rdggd$aOSvN>$9M1%NWcGw!V7e&jxlp6MuHFe(Ld=18iU8GbdO- zb;rSHOSSmV16E7B^Mcj*YIe^{vU%@28glt6AS? zVR`B=1y(ETFAZ1E7%l@=GrwGK%fi(!Lo=?=)M{d%r@f}cK1&Dt93AX4bg<9Q!9F_& z``j!xSNF-);C}z>cea_o)zQ?i$A^!lY7MYjV)(4B_Br2qtV!v2AC{jmHfvL^!|9=Y z-P&&X1$MvtT??$A^;(!d4T2lrI_uRoOJi<8xglp_ZdBVXlk;G(F>a}I-WaUr_iDB) zU!T)hwkuBEEx^_d-xBOT{2+=lNAb`4|>cLD389-m#oeqMy{26sQt814?& zNB!D*T=xJQOWUXPV+f^(W2|jYPBrrvC-z=o^Rw@+N4au(%5`=IXz$~|L$Y})TOZ@|WvpE+jAeIL14RQZ7D+Qh#H)j?V+HTHDU0l;?BveDHMmSl@cu?c)WM z7jmA)X>P`q>+kiF^=_%{BFgNU7lXYPEZyTSrM!&O!@8H(cFXkl3a~MaL4XJ^7Ng|tzbWk z)DI;WubH2M)o;SbHE|o*IO^_ccTjpbM%r%YRI_by+P)K9wtW}8Z2M<$+g4B8cZ1b) zFLV#sIO-YOpMxD+ZE5QlVD<2O!Olmn>0g4?^m9J6sW~6X-BO-$v~13gcGstUwM={W zf$dw``xRI%pYz((te-XS`b_=%!R35E09VU=KLpO0YkQDW%{;`3^J{Q9<`2VHF6@th z%W-)Wu8(@g_c5^JYb?h~uAjN5P0Qxkr2Z4wtp7M?@_G`iZoE|}<+1$+>}OZT?kTvM zeb--`n(Yr!uglbb25m*o@Mpn}LE=3J*GJuWPgAPJ|F>YZ@ZW(~;>`O0Jy;+0`1}D} z*7+mceq~Sk6IdVh+%r87HkLNycx}A^UX?RGFM{<`_geciW#(PmOPp$sf!O}MOzD0q zex=5a)vL9irF-k^ly7i)Xn(V|Tc%%s0UIOzcnhqSy;c4ir?JdIoVss=%_aPAU}HT{ zwfCC8gWus)&-MN;SUq#~9@seQ_UEsZYKi$ixE%Kn;QD00_zK=31~1UxPjDgSLNhcCf*lhd6P*0UIY{=!bw>Ifnhv z97AoXHyK#nzJG;HZmza#8@3hSPVma;@s*dp)`@R_Y{m;80C&!l-{f#L{fwtgE&fx0 z?T`D>1(Z|5Poq@V$L|290;^l!ccOCr{a#@juv&Q!J}q3`=l%omnGURGecz$Vjje4) zu$ngCpURWl%wY4?=DSq6_kPx!1$-c-b+jed*}&#w9pAUg#lCB`j`jW7pgF*YQtG2k zAK&Myr~SFW?v3f|+;Dx$zRm+z&wf5H*q`OFt}*5MIDgrTESsMPX>&e&SHzxm-UU~4 ze~-`naQDxtnE_z60RLebSBr4!uiduYm$ljUj$qfBW#TM|O`q_E;I^H$voKsA_3Uwr zfYtL^yBOHR>qgt6oNBfuPVB|OWn15Y+lRzm0MRK^pVO9tm(Q9@!}U?WroOJ1 z0UJx3{Z1~HdH!4$Y_5)l{^n`^>f531z-j)Lj-&78%jfOo;pKh93h;8yR)p)Lp1!OE zw*Tqx%5eSE-G5f4%s6UWg;UKu#EG*yIBRk%;;jL9y{X4%O|ZJT`!fXcJUg!g&a;_6 zV^pr?b>Z3*XAs!>c~)8uK|P-+@kS2v)a!e`Z0RXT!nZvVGg~*>+=e zZFx4_1gz$f>u*!|Mx1#zl)Uv z_*-U7wgS5zoU?5yJ-*~$4^rQT)9Y|+O1^P#m3%$XD(PaYWXf8l^QymXUPYN_JY#Az zM%vsS>^$hZ9i=?;xg$8^>CcRo^SKi??TNE9*!r2zUBGJoOk(D9SGe`GIiLQl`)*+M z%;6s3vQ6VVhkK%H%h(P9t9fJ&_k!=vnK_i}XWxcW+Lm*66m^DykE7IIn?5^Gs+)_S zKO?|u#`C!_eI1FWEq!{oA7)rIYI|-b2y;n)x z1JSjm-Gjhtv40obS+sjFTrKS$0=AC2?M|drv%Yy93O1Lix$BZ22B!Je@8@W9-u(Gt z>wXWeJ#`NU8!vT_0H?0DvhMfc+SA{o!0B(fE{;ammi`_CR!e`61()M|99%8^JsxZw z_2hK|IC*I=^Ewi)J$ao3PG0_;cG^7|U0d=x1*{hPso-)RPJ^o@uhYTSQBPiHfRmT@ zGOrWi+7tgQaMrc+bT-`Fwad=|J3ndfT(Cas=1ftY$A8pwEu0TNlhfGFuUwzBc_FxL z^CGxyB;LhfwZyvwd;zERj3?J8@h$_G@h*qEE)wqwuv+3>3BHumdd8FMlRp0dT#muj z@Nx`(2-inFZCnFZPmF89j&=BTU~|ei{s^p(dSYA;R?j|t1K7Cg=6MyRTGrH$!LBKv zg`DGz$4zM3()P_@ucx$g3s^1wKLHQtOuM&&^-)iZpMuA5CdO@GwS0cu4z`|p;@kmN z_qo`91FJjvkNIh{Pd8GkC6}LpGnc^XZvLZB=I$QwU7Y$^U#?Hamv^@=$Yx5cMIk5iPo~4xgIq&_^Z@~kYC+leQ-fLS*?}IE8|4-m@O*{|x zTF9Ds0sKc!{cK0BkN8E(++)22HnyeT^}I~^3a5u{zgpWZ)Anm%W90t*b+B5#>yiJN z(^$q9|EFiXH?WoccoS~y2kX!Bzkr=b^;~Cffz|UJ%wNIAQTJZ-9ZCfvhBaa%eLQz+qQbzeh;jkb^1QoIO-Yu55SJSwzTyjSUvn7VCO4)#YbQ@{hT9h zYR*A&xBO4z_)lzQUp|H#BRPEnHYfdzFW1NT&bMP@nfyKlI~K|BGq9TDku{`d{j8zZ z+F{JyV?2E?k@mm9*Y?ZrcE5zH8}D;UdD{OM*#2ny3M{v+uPM#d()H?%Z$EE-IkZm( z)^3?t9bjXGcfxHi>$g8#A9dG^{2T1XHkQ~t{26Dj8?is%oNH)c?PqCDQ&LXF>0wS& z*LKU~Fb&uk$zfWsTCO4a6r9E~2XX374>p(Z8NkNM_oXv}{k~K^>vkrvdggOxuyNGw zk3XNSmYB1E%Q2f3u20teY+$wWyS~}s&at+{nFFkzYjsYrvDLj+=ce><+_lZcsb-(W zi8Bw__A?K1$2Pu}a>w1emZ>{0*zrl<=L4%{Eo)P=e&*6~&vWJc;5_54P5;XKiUqJ~ zPg@Iuotv!tg}`%CM?F3ZgVpmHwg^~lJ^Yg6qHtqt%XbNjfz=&@E=swdd0ou)65ujN z+wwVdNpx*_4qXbY=3$P)rTLFPZ*RYhE!WQ&J78ZHY+#6t_Rz-4cm$>*L`f3iL(ya`e}b%uzgjxzQ4Di7XS6Y=9~4h zKKM*>Q;*LEV0G&cqLjzB5xA^B7=8iu)#I}!7n8b_4sTJRuA6>T;{PY+&uELj@yCtQBNM*gN>ujJhq@z6YoIj z@0%w6j&Rpqeuil$u-X-@src;N@Yw~f=I;gPd(>Uw*45@(+?_IOQQK~udvLl=#K~nF zVz`F%*QSr{dXKp$*n3Rl?FCLe^&y;U))6Py;b6zu_`@jWi8m6Qd)5(Px!0)oo1?(~ zn>p6emUw%E%Q@Qzeiw759-nTodicKJvd(_+ZtAGVXEaznd<@um%6>N%tdDwp#(`Z^ zndkk%`l+Yh0bq4MBjYz7ZjRdACy%7R6Ts?@(?m-9FP=oH&2c#izXQRiQR<^jpP`iM zd6qZ`tmfWW?)%?G({=-UsPlO+Sk1lRx_XWefvY>mhf;dDf2n_uQ_VRRd;R-+Wru-n z&(9{u#_t=x4_8lHhlAD3$Muysmib=)2(WFte)P8u`{y;H&3aj5mgPF#9d6t9>qxM5 z?8DKN9`-@~C{8u|AkJJK3wDj}O-v`1x?P{9J=JHS6bom}aZy zvw`Ds0lMuc_C;{-tJ9y0;r7S1q%D4zfVCOpLQ1*qT}o-bmfq`LPI(2VhxRLLyQQ&Q zLsx;_n_M^I%Q&rP9C6zG0ob;(UtbMYGk@3LHDC|pX!{|jnsLO5b1m3@XZ)@MtL1)A zn_B#T1XfGC*MrsEXKeRIaN5;&1E-qpitTs$qn@}ofsGq}Gq@c4Ti}kpe%j*q6R@_7 zy*##C!Q~kJ6t0#rx*eQcwcW<4X0Bp$b-v{4@{FNn+PMpC?%8{P239+cGd_2N&4qv6 zt#ysw17@nKO*pmXI{i6VTjt{zVB@LV$2%$268D#2wcHck2lgIXJwCqzt6TqGN_lMe zgUu`VU=M)R%I}*VtbMZQJOo$MH{-6BXMkUW%lqr{v*2ND+SC3cU~|gYJqlJU@9!Uj zTTff=?;i)N+x}YgSDyR(r@&?Vw&nf()9BiA-~9|&%_H~s&%%GhY0k!$>t_t_?|&<% zJ#Ei{<+;EA9oYMO>ub~JNlNwf_xIrCS;x74{s33Yn*0;k!|PVtA368t_pMxm;^g@} z*x2RRz5rMEI((JV!*`78a@X67lzw;i5~c4N|4hlyR4-GKR`m*{ZRwYs-vGPD?9c0z za&7Kce*x!S>P@gb_fl_xy_d3%w#54@xLmVu!@ZYMkIy?`_3*!e%Q}CDdoQIPpLfCP z;qQT`rc+s4@5A*`kIx6-f%xQl`w*_5dg}cHtX|$reFQg0ZMm2FCs^Ha`k2!Gi$9^% zmV2pB!QM;hqfMXJDAluuzX1QcHT)&Iwyfc=z-k^@8~=iT&Y3kV*Uvt?hQATlYxrxh z+}zeN2$Q-Oh4r&O*7qKHGVolCnKpknS+33B)#(H;R@?kNWqHPW0C>sT=I<=a z{jQjznw)Qu;a-Romd*$y%%!{V(b#M)B%m-G>IL!~vICX*LX=_1n#>w9+m)owtQ*IpB zhj}jyev8uj`e{#2i-3($?uCn@spq<046NogoO+AHQ_nVA9Ic#h_epbgj+Vsc9;ELQ zV7WGmYH9wHeQ+tTJp14>VD~}mXiL0h!R4N|9Nc|SJwD5W)x%c+mvvTzyAP_zXC<(D z_{!jNA6x~lk9vGo1rKBlGhVB~^;6G2xH?$9+y~cyo1?bugKL7-9VdU+-~Nl&rqq^w za2>Gwpg!94ahpj^?YH%-GrPMqs(Q z4aT-Hc%-o?wdu1irMmUq2R8+0U2g)GYjYpm9GrE%8CagN-V&UBa0{?J``}h!_d(-X zPkY+i8mw+EwkuD2+ko}gX1nsVw=LK>+H6;zeQ-OlZMqNcK;81XL9~_3J zZmhj1<=F>^gWU(s*Su`YnEEC65nyc@Q+aY91$I4@`{3Si_4w=q9?6;YC)X!AcY}@L z-^H^{_wIe+>hakRYz+5z^N{Nk`)IH+{28vq7z0<2&seZA%+(ljePSO6Hio|wml*rQ z)#Gyj*cj$&47on~do4@=XAO)8%e8qeOaf;OOa#ld`8PEW0vp$u2U5zj=YJRcEM9(8?)T=zlWxtJ?v1h+QFQucNjSJY*VgZxwgG0Hb3|LBd}%9KO8LAMo}Hb zf3oKv36^KiKN{?wZyjxkcMQ1PzmJ8x=c~u(IIw#7@!+z~32^s(_4u3!Ru4Z3T<-ZN z!}U>*&ne)6jA6#>RJeZX+4D~WtCxHJ>2Pz@mOcLru)1SEku6o<09O@W0#h&qvpmJ^uo*nn&jMLio9yx#r~h*=P6si^cVxe-T)2ZvNfMOThb5 zT3?&FokOW^eb@PA;H>jY!E$Y`^DDqv=a+-!8SAUS+4HXi%d_YI0PLP`JnLysdsldEUyaK_;V zusnI)1kO187%b2H+yZXy`9DEZH`kjf<;HT)|0y_o{;goSvE1`-2RlFIz3&}xbz|K| zDbJpNC)hpTe9gk){?uP{oBivfyYr=Uz_#6 z0IOTy&)80I_Co&_mt32lu>-)_3;nxX@{Gwqa6V)G8(s2z#!dnDGuC*mzvo4n_ND}@ zn~Uwr)815I{k7SyJnc;lwjFJ@E6-=_G+^6w9ZW~*;W|*CmQ&3+5tnwceZ6MuPp%`xM(0=%V>r}4F^mFxHw#v%1rs_W++)5`FcR@7IE|0=bA zehzL`xSC^|>uWW*x?{ZtrH5m!zB;FxV=YeHHNlQW_*(FC?AMl)lX}K_9k6=Fc3rS> z)bn{U2(0esg=3JmE%Vv89@u;x7yZr4+&7?1?)t3Hsb=nCb9GPL2%PI;L$F+%d*a66 zTo;4E^7L_2aQ4Ja!1C;gn}OXEjb}aWX>W6|y7Ozh^0c=FSbuG{D^Gh{f{mlicIBC~ zt-!Wvzqg_EaL&}X=2Wxa;&LsiC$H_n8Heq_^5nH6IODJbSf2f1C-AGZXB}<1&UXgq zI!_;VK~v9q-xaJ@?hm`cQ_sH2^-G?6fRpF$V0rQ!0#2TLg5}9`FYuS-X&r6Jb0|3N zCC_1K>dA9BSgp)+1U&W3Q?8%mn{{Scj<@eR*soo%`$CF0^4Txv;*NSt;kue%-P(_admv17~_+WPrp#DtJBm}yu@>9kwaG)SE?=kSmhv@gu3vIH7GD=s_;K)o z@LUVW!}U@3p4Y$sbpkl|yyY|eiRjvLZ+{Zl`s()K6w35L+sT}2<}6MQr-J=myYSQC zQx`d%4%bILIh_GcPWjHjdz3TLwI!#sz}8oHOwzt(K4Z=Xo1f#Pzqy(Jxs=IYpL00X z%wKH2e#V>+&YpE1Sgy^_m(n*Mf~?2{Cl+XFqhWJIL`;Se)gy?xITG4k?WIw z&R^TVCayTS%|HzQEeekrIg{H=U~`y-GXKWJLew({x&JQDvISRPr@;pm+Dn(pC#T)L%3QQ%JeCKmg|7fMPOgm=;rgg=K>NmC32gh` zOMCC*edo$>b-&w?uL@T4T|@4VRs*Z&XVF&&8(-bN+1E9|j*s_N<}24P_BFw=n=`kb z)mmukxp!C_td>5l3$~BObFAgDtp~Qh+8lSe*fBSb_dl+~4ZyBN>+7f8oLplYg3T#w zNUnc!-l(>>?icHv2cxNb&22*I;b)Bc#++)dBe8Lu_f0t+cjLTY&+}%ro_WsCV!1vy z$7WuxU9s_uYu~kRS=hx{+m7j+oF21ry0-1->@{}H&cSKCxhRdlO;O)<(Z%U8H)rb4 zQ)BDTTVVgq7h}y&nd^3Y;^msL+_C27?tJV3H=c8)uRL?Q6WI9LtS5K>v0pob?VEM9 zWzFmY*5-WdM(N>vsPD?DW{zU_D6v}F+!Jh@;X~l|D|`1|aOZ9hPJPWyO}}!_Ydv$? zIL88<8M_5*>{u+sY5R*%X1s%9HyDux)5Fjy&Tx672X{ zM_a~k6j+<%w-2R<Deqh_pn2m;K%(TUC3|O1_*)Mr)W5H^+ zA$8PZge+O)SOHz9MX+MB?`g@(Hz42i6@Cjh!WdEE9*GJv& zbIoHCSlt}O$s9ITGqvpV708jqrhrjZ~D0YjsZLO)^`r&V&_kPuXWox z4!kg>KHBtg4XCHR6ToGAC&JZShiUI*aN2VX$c^Rv>!0>c1>2rJ+VpV^si(cu!DV}A z!qv{GeeB=a;I!u&k{iqQqkq~v7i@d_Xw%0vsGj!D2bb+#2v@tH_OZQ-!D-JmC^yz6 zl=`Q=OTo6Mk2ZZ=!|G}8a&Xz+m2kByY9HJC0XXfshULa`{pz3geh9WbeYEN0HK3mM zt_7Fv{RpmhUF~CgH-OWg*MQtuu7Ca0-jBhyr;j#$yoS`%-p$~$y`R9;o<^7!Q0rPU2yIlXoegN!zCH8}0HS5nz z`4Fe|m*cd)T%XkcHQ4db{_-$b|2;VMm+KS%N5PJt@0Ps&9|x;F#%T`T*FOPP&+kM( z30Bi5aef0fPU<`bR!g0y!D`m=J@?AQeFkhSZRYzd<#U|o`v|A`%Js7yKhu5-*6!z> zIG=a^Z1Vb?9_w=E^KMX$*QQ*L)6csNDD(OD2V%HShrdv8_wzp&T)$Tu{I!DX|7OAU zf2+aYF1Y@GFS!2iHTVYw*Z-q}>;G|se_C+;zbLr=Up4sG1=qhHH}d*rJUR;QzfCou z=2>5Vr2cyNyPmTa{sdOf=hyRKwX8wMTrK`Dg4Oc*^%7WZDNcRdA6^CzqI69-Cvvg# zp?~&=SHbQN`e@U~IaN=4uY=3>-hituS^L=DU%-Rv_MB6>v7Ar+)81dfwx^FaeOwdj zY406y+1}saYG>3ww)ZZ0P~DzuLT)VAgZ^pneX#B6qfH;zlzQ6x5L~wR54hR|wU6!n z6FjJH&ow1Cmg`CXwD$?v_Vm%Fk84sr?R^F=+xr}@c17)DdtZVF)$O?^<;HS7>Yw)h z1-3nXwCVFHrFz=?23)qsC0(hNpI?)q4XWF7P0Nktde%Sfb%JeAA8q=0O{k~6$-%Cn z?8yV+YOzlNeyy-i30I4KDzI~u`cuQzVxI==n8rRWTrHm`(}8W%xYpI~_2=iw^kA<^ zuRn8_>z~*&fE~ZYo)NBQ{h85c0$YE%y1rbW)Snsbc;tGY1+M=dwZB{+{oR9St?lm5 z;_T0Zv2Dibu`y@%=S^z75#^?w?$4W3X8)Ok80CI6Pr==mu-uK(H%zHY(wU%%k`Z`k023$FjB1=oM` z2H&#a`fpos{kL!M9Sg4it_9bB_k#Ox^$n?c*3X=9=h^j|H8vNTdOk1a2CHR_xdznY zKQCAyCSJ+D!@vAjO@PkUQ{ZBHL<`go12r@gJfWqaGe)wZsE zY;QYo+VdKh8_Vlk|FpLQ*!J|%rjL7sdfM9wT(-9hT+N?p)W`OA1E;-R!E$5mPN{#| z+XHNS`e@U~Jw`q4?FF8PGoN8Y!D_J&1A7g{J{+tT`v|aWKK7Adwb)02U8Aw@4OYwN z?>=DLG_G~E=kvE4JcKi!zjFN(dtdNsoQb_3SS_EwqrukC=dWC!)E@(OeP#a~3)Vk- zqgeXKv2b#O3Pd+Ym5%G^I}g>5@dkF7a#|FBJsx1`*b)BA_*DRcjD zC^7t83O}OYetsQYaQ%*L@Z$@v|49Yc|C9zlt>F5fS#bT&Zt!ypuKxuE*Z-mhzog*$ zUtVziuPpcvz*pBi>*p}I<5YhBejiOepTCEL)w0GML$%EDkzloa{vHKZD?fjafjc+O zv12S3JHGno`acfr^{T?K#JCV>!S2r@eE)wx^FaeOx2zY41F6+1~kZwes`#LU`J9 zjmVAV`p`e^T@1E8eYEN08dFbumx9aoE`zI;pTAeY)1GTgZYz4r0F!tQPx^z>Zn$*Mrq!zX5C?W4{rsmi_z3VB0jVb+vmv z`22Je*lQ&Fw_N|kz8P$P68jdgTK2P_fUTeXTdt4w-Lr13?e1sd>}NY+^KU_T?97?{ zY?m7ENVzMg``PZ4*}rZlhS#~=>;Gp3SHGv`x!&)98>hV9??hA2^?nyv&FkH~{rtWg zZZ6KJZOFyO*FW?BbFlNTk2ZbGQ9Wb$3$R+=Tm2HOb}wgq?gQ(SoPGsX%lCzj*Zpwo zYI8j08Bd=*_u}-}gEQl~XN?`tA)JoqP|A$oqr}K~`W@lpHP84x1h-x}e!oUj&;Iu? zSk3Fl_kC;O{|HX}(&*3$; zpCdT!=P1hb^EqOqpFV5*d!M&!o_@X#w_e%LH_+76&o{wp>E{;s{{^m|e*P6~ zF80kf7&hls->R?U^|f0V*;oB98Wp1#`a|rr~N#LGW~d$82Xg` zcn?k89DQf;KG?ec9FqJ4xSIV+Uq6IfPn*}M`TPT{Zm#0w>c8XTzv1I?FlXlEkQ$r! z_c+b360BCkKpQ_-_!L$X}Z~6wG zvEcg8QgHoeYw$S=uK(Nx*MHuEcY)`xdB$fT+z_HE0qh*>qfH;jP(AI<1TNc~8Ln2|kIxEEdyb*pSdO3m zX>WG0?dhXUALl?l?ac`;+nWonR^E@#15bO-f!tV*zy4`&KCtcSqfH;@P(AG}0JhJ$ zUs({Wc5kuH7pi?SM+?K%ykAM2Md0=&aTW!um2noUeG+GJxLW(ZXL9D0?*`%KU>nBJ z$Mv%W<&vDPsa#)j{gcB|VEdQfD_t6_X8jY0y$sm;xqp@GV|}l|Wox_Fr#RQ=N!U)~ z^f;L_*XJoUHukBUUZ1B^=6YO#80Gc3O2NH;Rxh}IYc}}W1=oL2!S!Fi;4?878`eD6 z+lp}WDzCSd(9|;rD}&W?y_vIG#&lJ%TCTU%z-s07wg%j>aDCY)x!C^bpY^yF*!8H7 zHhmlu^|ZGRxNL7-xLSF=tp`tgj)~k@j)(qfZv(LH>7z{_$5cJ-Z3H$4*UX9d4hE|^ zh8t6E!s)nX&B^si{Y}Bf^qI+@AJ`16X8p}6x8SsX)|^})>$|SDtnIEjan>A{Uv&Ye z$2pu?bLZChOv>|Wd^Y9zoUXqMDYO2zAzs#>>u0BetMAg_yB1vk;RV-!WP^{YdDh#u zaC0ix+jeN`S#R5e)w15qQ7z-S16VEVZAY+Lx!!h$I}XmLeUgjqkN(MLH?ZSU=CeDR zy8iZ8p1&Ee2e=D=V;W0)^4b$z<~0Pa=DM^!-!JY3w>{_DdMkj3qN%$+yssDrR@3gY z=Lm56;&{u=&9Sx(vEyvL` zen6SEvkx)LwKKNh>iakN0R?yM98z%o4{h+nYM#03hF^hSIam9lsb}r%2Ug44F-Nt` z(P*$**3KBPTDf+{!5s(3)jr9^_DBEZGal@?l=)0RQ`g`A$}=Yu!Cm+p(^%S**CcS6 z*MV@gtR44>gE(z3YsbChyS1LRb1+;jYv+65^u_U(o10^88)C=Vddc(qVDr>pKkd!6 zb0c<->o~J^epKThQeI!VwP*?*n+Dc-{2?IJY#*TQjd`{g*9qg-r$`X`?g!S=Jv=Oi?B{moyVF+LgG zg}*V4rM)@EcVPFpl`~`f(;DAQd0UNtLU}ul^%rg6n^4!S%nb!Edj5#`sjYeJ;oNG&J?h`RQP_%(-KrmN`EI ztd==H6RcLw`Pp#C*?Du!m|>tz~-sHe%h1g z55Q%fSHsnO_HbOx^M`QrbPlX%p4XtMC(moaYRU6Q;NpKkdzH{XXm-Kj+M~{)-y_jPl+Z-$VIJPOtS}QRZ5|lbBvZa<8pl6kPq64Srw2 z^?$11`ajd)&(=I^|1P*Wm23ZJXzIBZ?*^;oS~N$s%+WnywOoro2dkCW;=ORk!Ev=u zafTNzx|bGP96Yv;crZ1X-{4cg3G)ff~&Q!#R08*_Fuzo&ucOH zt^|J=U0c@ABVe^W^F9hz^O@J*@!1^w7-wSHhItt4@!FPHPr%jkv&m0_)qFP2-?jM- z+%~jj%{~pz7&^~#$HY0dZ(`@yx*4D6z+L2^zkb@AYxXhh9>3bN+iY^{l}^fYq`FZBs3C{YS7`*5IGO zYME2-zh8jcFUQFob-~8nn<5$34_#4w$+M8qi40ey-aAu63 zs_~PQPjfo1&r;^ve}gy~SIfUN_**qk-mk)qQ|A2|ntI0Zb+B6YbjRgQxVhLj+mwrK zPk*s(IhN`3U*TQU(O*C9%|8DDyT@~!>GN-EY@dHuW1mZY&uRbvNSXfsotS0+-)r#q z3+~?calw7=`Ly7+@mb9?UT?$Ap&YMw(9|$PNWb{$g`;56F0Z z0PmuX{`zT89v_06dHe%SJ?r@+u-X;G^!tm8i1kmndiIA;z{$(8keiF+ps#+HQ15fF z^~~!)aPo32IhQjw_)Im=m`npVPB}Nz zqN!(YrUR>$b29_nTWbSj@PVc>KU)uz-r}q z%>jRddX9@Z%EjiVzu4UTS<#Hw-0&{y=&zslRHS4fz`5>uO?O(Ts>=f z0q`4jUXF#_TpR~|^}7c9!eGZ+n`0tRo{NC}-opCU)83rdcd>iC&6#<9r^e3f-)ihL z_uo04*Y_whuiqhNIj>7L_)-OTURNl%&-5!5+%{IOdB$^5xH**bx)_>z=5=wfS~;&v z!0m_QVvcgL`ROkc>z$3&iaT@`!>HtSnYdvjht!tU`tXXf>T8auBa*7zREe{edl z|D?>ku1U;tUe|8$bqemhZdh=i-v$@lHa4z##&b2eIh6CdI+}Xsbq%muIj?KM?T6!H zj&ia2=`S|7t$E(ac&!WXqK^LhX-^)5z|B0?)6SWBT_3ELdA*BR8^F~wuN#4rmt!F} z7so+g{qCmTCSb=~n`0tRo|}U2!)AT!X>ZQ!r`SEp&z_Ir`Rw@w>|B0EnYrAOIOSY! z-Qe5QJY%vM+&JajY>uX$x!D4&R?f{xTo!zc_RKeqCd)-*0OC5M_n0{hy37{ojk2W&ejY z`0#>z{dO1p0r-9ew~f&?&v@+)H-~b(_CQn5c-PaY$|%{)e-sptCL8?1H(swEIb6y8f*Q0~dd3BCDYwSGq2RoOOQMA6 z3hrPGSI?z6W9LjYtvEbI3RPzn0_Rmg@gD*yXF45L^IxA7`kEZU=SIG|mtNA+s z`L1R>T-|Z8O`n0qwx`YU8%W&)!RD#Ww&jkW{W}Oe9i{cHr`>nJ>Ujrjj2Wo!F%4y| zjcIG_wJ}|d?f3Lx$7DuI$7C7GBZ*gz$wk95vnCILn?reR zd=E|CaTq|IL&4^jzu$HkTp#sM8Joi?J^Xt*+P=@JW?#gP%Qn=Pt2>_d<0#J6n5P-2 zdklDHN_Fk_Tg^P3`(wds`F`E8Jr1sJ4ku80n1lN9oNBfyHir{yuI~4Hj_WC4E=XyK??3`+I{N>u7px#+v$5)$kA=maGwsXM7(&oI#bA6r* zc3(A~^|U({&hdHRB%{e`aJY6@JgVitNG_Gq%E$boIiP!I3*gR&T%ylwrja|pH)z~?ky~h5`%p5iL z-Nu}p&iCAu*;B5fy<9(jU-aXGJC;8!xb56taQ*Krxc>JQT>twE?)Mgt6SA&5IC;m;~e;5B|bZxG!&G~-imfA1ZwR`qY z(A53g8-BNSE7*GKj*D}smbHHy*tMd~`IKvOUEcwA?Pzmu<=TvU7uZ*?oc>bV|%0am-4GxhETr=D%f_498`Io32bQN_ z4}jAz*Pz^=oAO$E5d12ob+o1ZhrnsyIIgK*qp2tFhrw#zf27_c@YFMJxqfBd<}Y{s z*$>xT*23f9f43H%K-Xq2E0D`?z+S`0GVk)I;7@X@yB7RD>}jy|w3&<7n_9-~S#ZY8 zYfqjr`z<(Q=5;97X58O{jb%))NqN@7AHaS;YCP*{&)EDCoUt*MWBMmF^^EEBV6}2B TyZ}!<+m!29j-g|j`>Ov3!H0$T diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl index 0ef9538..0dec2cd 100644 --- a/piet-gpu/shader/gen/draw_leaf.hlsl +++ b/piet-gpu/shader/gen/draw_leaf.hlsl @@ -158,10 +158,10 @@ static const DrawMonoid _443 = { 1u, 0u }; static const DrawMonoid _445 = { 1u, 1u }; static const DrawMonoid _447 = { 0u, 1u }; -RWByteAddressBuffer _201 : register(u0); -ByteAddressBuffer _225 : register(t2); -ByteAddressBuffer _1008 : register(t3); -ByteAddressBuffer _1042 : register(t1); +RWByteAddressBuffer _201 : register(u0, space0); +ByteAddressBuffer _225 : register(t2, space0); +ByteAddressBuffer _1008 : register(t3, space0); +ByteAddressBuffer _1042 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl index b28c956..216d923 100644 --- a/piet-gpu/shader/gen/draw_reduce.hlsl +++ b/piet-gpu/shader/gen/draw_reduce.hlsl @@ -49,10 +49,10 @@ static const DrawMonoid _90 = { 1u, 1u }; static const DrawMonoid _92 = { 0u, 1u }; static const DrawMonoid _94 = { 0u, 0u }; -ByteAddressBuffer _46 : register(t2); -RWByteAddressBuffer _203 : register(u3); -RWByteAddressBuffer _217 : register(u0); -ByteAddressBuffer _223 : register(t1); +ByteAddressBuffer _46 : register(t2, space0); +RWByteAddressBuffer _203 : register(u3, space0); +RWByteAddressBuffer _217 : register(u0, space0); +ByteAddressBuffer _223 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl index 7dc68b1..ec75d5c 100644 --- a/piet-gpu/shader/gen/draw_root.hlsl +++ b/piet-gpu/shader/gen/draw_root.hlsl @@ -8,7 +8,7 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); static const DrawMonoid _18 = { 0u, 0u }; -RWByteAddressBuffer _57 : register(u0); +RWByteAddressBuffer _57 : register(u0, space0); static uint3 gl_LocalInvocationID; static uint3 gl_GlobalInvocationID; diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil new file mode 100644 index 0000000000000000000000000000000000000000..3b3c42e8c69d2a7f2bbfca14d8254af7ed99cfb1 GIT binary patch literal 10004 zcmeHrdsq|K+V3QpOag?EaB~7d0th0aP9k#gf(c+iK^v4JS}O?wq!uUw-qGd)31XC} zhf~IX4y0&c_QpqbWo7h`P~*7)ubhmsH6JrP4VQ z(VaO8d;vLOuUkSpOW++#$5UMeIn^n8Z(gFf`Xs^;Bn+f5_3V&#i6~vE>sLRkLHS}V z9T)@J$6sfIW}^wo&F-3ENK$yDud$p_NRsr(LV$zo_EJd`cxiO|vUl(hBpJ*Jf~#G4 zxmBoFNK+-ad{^11_x4QF>D(&Y(^w5=y3hr29i*EK3=6^HQzUdb!I)hmV`yY}U7Z;h~gj@40&)%^nrfjE2SrQDkDT`(bEd*d9$?%mb zG%`Sz6hyQW49dG@sEBVcb2K}Rhc!2j8MXR1`}LVQBYPf-I{-@JTBOh@FDez5rYXlO zuRk&F=+guRPx|0QjjCEg=rceE#QiHY$0f+23i@9R-|XHa}t}J#5ySr0{XDO z{sU}-khVoiEAPVwS+ot~NGTI=7)K6*M+O?e6xo=7kAs7KCh-$zegT(h=AtZn>;oEM zY^IL~7aZUtT;ba&B?cBM-o7R6K97)KY{J8&pG+nTAq5Ua&}|eo*-3Co{KS>6a1Jzc zan{?UU~G=&z2)6ahIz*Zr(j|O&S9tNbuMAaooJ%pA}MOXdyX?|46&|_JXa%O*hH;4 zXDAGCjWU!k+I^OdM7VhqiZ%`syapLM4A~%?L40b!-H(!{7n8k`X5n(TvN@ENsrwG6 zZY`r6Tz8oyYIK`(c9L@Sz)n? zi1OD^g2L!elArz{&d*m7vHqS3E2?Eux-_vbjn!rdc@;_tnJH?$=4){a~~xoC0)B^T(pB8h3OZ_MTs1c$5IW4i9<*|g$Cl6ygx zvV)0kAyfB#xFuw&{IDl4uX}jaK+oXg=L4NRgQw8!Q1I@Rxvl4TF_&Vl5LW<;%Tq`k z_}-MV6Z>wcx1^SxIDFW%Su{Z~_dv^~*)dmupeg2w_V`E#@iCLG;I{X3^Sw#)suN=? zExH5+ZSy8tS;IQUAmZ^Oi}_l?oXJR-*k%s6)yT~!sCJmzzn3zvJ8_+SJ+q(iC?-6<8)gnmm@`@J0gr>Y)}Vkk*~ENe$DFI` zIem$77btVDs^?Zx=JXDGZ;;Zq7SLh}w58yx(3a@6B~sdkaN34ZDqy=&q0Odi*Wuby zP=FegkuQgtFKx_UOw1PqC8xWQJJ87OwO=2}xIIDKc4a`147;ySjO$K}Yp`EiEwOcp zvDK70b(R@G=o{6Xdz3jw3!^=VR#8IP60Y5#&~ol1GDmIL0FU{-4f~bF95FHb^&US- zJYExOWj87Ckq`=MlLbHmCfhn5f<%COG~#~Xr@7k|QNqfxjv{|@)T(PgkCN^p?7@&r zMXvWs#%_H_%SdOX{}oZ!KDrp1VtF>Fc199Pu!H&-ZZh;11bE!a{k@~;J)!+w@qnZ@ zcC@?buxJ)!<+W~cMp7M<80{!}sQix^<32C;_H_@#^icXA<$z(e+N#)-I4{2RVdgLc zOvZl=beM3d2tg@4cc3UZQbfyAyvIqz<@Mn$QO~}*(E96~NvDcdu7-VAGI6{Q*MYwK zk9{~6%scz=PVnqswGYF+)&H>%U;XdghZBPF+%R67uz(zlH|^z9g>WnLuhp9!u?cSf zGQAn54(QF^|8;L3@*^2aQG<{~sXjt7lwgW$9Pke8#WHaJf9=I#^u+TboZ%`j^T*nG zVf1XV^gs1s{s)!VS z-$g6wg$5q|Ty*5V3#@xW5i7TdcYNws(X3k-;)Rmlc2$4|U6n9_uKG$QGd2#*Rd`W+ z^9w(cl~+7|@aV4X@et@{!p^?70etbvFU&3bm5e_#K0l+7xKOD@^j@h-f_<%NWZXjB zqZzq+tiULfP-Y1!%nypo{jum2N|cqjAo4eee4?Yjj6M)`P7rftibzrgx*=$V(|^2K zv`82Q+FFa7{5;shc;f3pCmif+>+Tsi^tijndJ0Y9fL?g{+~rHNFSncnJuXwrxP7vB z*OKWT&!^qoRb0G?2RG@^7yrNS{rBv9G9H3_3qE+g(6uZ#@Zl){@etLEc;^U%_NSh% zOONvwPsvD7dZ&$f)UCSXz33HI_mME&dr6CrOeos_v2g6~fptmlbuuLG8J{d6=jzx) zXX3O{LaZ2^{n008T$G{;dc2F>H|zQLk8pF%;qSlQ#U5&Iz5f^&^{ThM+{JF)7Wv>Y zZvOJqXMfwp9v=Set8Z~-m5z>)(Ei5%+Z+-}$i|Ee_myafXhq;L+Qj;3T^0 zj{tz!_T*gabT~QscNrm;KK>zMs~qLgRiiQRdpZPjK~z%3{x(k5$2RcslcPs?bgW^D z!Tbwtn~~;td^EvNdXnW@;dqQkG>bA_Pxd(du?^R0e z0~8Eya9hi8Z61?5TMtA`2V2U6luwz1P{dlh2*bDt`ydGnfC$EYQb2v2f;6WIyh0Sv z)@KbO*h?dn(myVx*e=C3E(Hv3aJy{*cL8gTNj*EVjHlj|1!U&(JYGOSrQ>U%+1Fwp zB=Klako#ccg!(uGY2Ij?fELxP%?2yOensrdv7;Fq#+PFspkQ!=Yqd>$0R;KmEb}7E zr0NP25L9UJ7=wbgjIU^1kbRJ3unS`T+7B#cBF#5|6~zu|%fX6he;Dj58W*&Ed`07e zz#s&pgJ3aMp>u~s@$Hs^1cVWT29dDy3^l9QlYSm8APyNYQvul@@flWT*k}@E+$tYZ zuc!BUv`g55gd~tRg z_7koXjaG?{uDi8zTKQZl)QGz#VN!CGw z>%2xU3zmd

eTaJ{9E28ux16L`-^^GWAW;B8ERpIhSFI)@C9*kKxfg)n;5##5~LL zoyIQ|qOS?$JlZ4dC}A?{GfbW}vUR)n1$*i?C2NCHiG>tczRtScC9-SLLMm zut~4@B7Loksv?bUn>k*cb{1I|7;iQPN)u0*@DPj!g019*rU11VNqm$d4Ja~6TZ0PR ziH1CBy);V6Q1oj|(I#nqW4h{RihSCT%V=L9ZmHw}?d>AHZk=@j9o zQifbLs>8X^)cq6BvGpC*W>_w{Kgs-+D+>Z-icB-^OMS%|U$;EII3#7K z!yrTQ7}XcnL50_gJrhBw@EFLSRs3Gpr!&WOq-`G-+k73$570KDB`r4_CqI?X1Cb=e zkwa9u)wg}kxQ@I3pw~*LJCq-;9iczZIjqjI(l)_BBam{pPJ*C28EdN>k`)sijCIvp z&@C-D`y+o%oIlR^0ac#GL7f_Zmvld5$f>bpV4mtP)OueF)BU`M|A?>>@Jh=vP5zAVtu9rbEQp_NIJ*&saK1BGs)> zU9XuGpBes&-+#Q;1>cs|Hy}4t0{vshvRGzk^ek`L7v#*$%$Jj5*>0JrGi|zSj^gc( z=stgZva-ez_M)kqJe#{655CwtxxFTf)!n5L#fPDx>uDz>L5#L$# ziXWu+DC#r{6}OF~J5&mhO1Nj>n^-P)ODeeo)x++|y~^FJCoYLch)Nd`eLB%=YG*!` zoBd5rE!S_}&r5SGU3y@JOJ}<(A^ssZr)kw|lOHJ_G<6n3@rrZ>)&=?mP!|jaP#3&G zT@bzx>%tZ?nI<3K?wAELe(-cGL^k8ko)@lzqQW#I`dD1vZh~3wyLH_Gsl(NYC(b8ek*mrc3UyfZyX@Xj<313& z0V)!&oah|iTt)u9bcWes!Qg$pcLUtsg6y7Wj9NL<*FP7CsVwdoDMSXX%Mcs z>#w1_yFe@YLVi>b;Q4xCZ+kEGB@?7H0p$IWn_nzsxyw+4L=tngVJ;;^~3j-c`1jK%+<~He3txgDEY zz?nwQH>wWY5#(UEiRZ2-b`#I(Cv*sB3}*{Q_zAfK^7iiTHL<~l1O5KH@#lG?nh@8m zocP2var~(xx(c0$GiFr@Mp}?On|yaqcNclTvUWjaNT6F526D_EN)5XodOOG?w;r+_ zq=ET}5q(crQD_7lN@k959AUM7Fy9ejC)W=-$&U5U^BaosrnOgO`p^EV5q*~La!y%e z9yG>wgX-$`1L}=i$cgb`_(+RcU%P`8bS#fKIb=?kK)df?K<+`y^}Z$Ndi!(NCI_|Q zNw}IWXUP~}9h?;J>pjwvps)2&SDfJu$dk~(kl<0AZSJb3gM>>;Vwj91k~`eC&TT_L z7%d?X%^nYi#$WZAq+WpqyVWe;PK($S@HtW;9d~;4FV~N@xRkE}POoVW!1+R$DxYuL z)K#c@%vnrV%w9_qkNZM>?8(BzNCoso-8V`$1qI=39ZasUhT+HVR3fbw>r~yJ&DIsF zB14XJ{*T;?u0!8PR0n=8`q)VR{j$GqwmIBwI~MO74!l%olZ!$E{qfN(p0l1XeO1SV z<&Nws2>Q(>rAzhIp4*PpoYoX?rc!p{QE)Faknn8T?BBd9emgNA{6Ie-4)M?vE^6YC zkYQn0{=j%rg1(wRiH1K({4tk7_+dniqpvOcpNHo|P zsrxlj9jKDZhbmA1391#U6P+53f$V?RVRze*_U>fl1h9KEqP?Z!_6Xf)*PTZn3p=E{ zMa$h*Q`eRIwC=fIlALAcz8`Rn;=__u|0JY>{ixl7lNQBQ&M%HaiwDHu_n$0CcLx^a zCqN78`-Z<^CyulvYJde9*SqRCs@f@0XyCH7^!Vs|dXNo4AR7{$7Obitb(-yD{cldP zqy7!}14;)h$hqF75$tt(^@{f*rv<;9^p?CDwdkKMSXFwRQ6JZ=pMDfr(C0yD6tG~z z)jI6Iu;3}|8nmFT9$2vdXVnTafj8p`B*q)TXnkBeE0%hLn-hMdGLOlhw8SFN;(n7= z_6!Twi7NS^p0Z|#{xA8-)>PET>GsT-cZ1tGbkvZ?l%wz55oq}}lXv{bd%e5&j%G=^ zyIe)_nT(MZmA;nEuFzH699ar{%jSbmT{H^1{1qL#Jl2fhxWXIeX9ZckLPH>G!?5-! zsLVj$B?*j_DB*ZQ7-pZ&eh`?wa!{WFaSQl>bH+ECqF^2NQBDbcU@Rum40 zj#-COogRJmY1&#xPEgM*agw2DBx{YLBLi|v?Da0JbZ@Qu02=7u;}Rd;;8k(TA3J@> zS^OM*@4oA)a}LrPajMcmK3H_B+Ce^Adx{ZaT^KLEu&+W_GI)A5a>2{Vb!7o@}^)*BzZSD;hk=r765)L)X1Q_rKx zRZFwXks=QG)Q-_djs?tjfo>-?wFX zmU8t6oQkS7`1-2!WPE*Py7ZsV z3k_U0uqeK>cYj6U!Kq_e#PVVOr)LM`qW8^R$G_ZLOWjC4=CJe_m}9qAntg4SgH&bz#YyhXw#`Zd1#i~3M9SB&0^Z=>@404`9<;3d zcDa9KS=Q7yrRqH9hx@ZX5?1o<(aLMXA9C;Y{AEZ%mS;)3yU@h=XdJ9Y|IRbf#|wWH z0gKyu+R3guSqd*O5go$?-CeAN_|BE0ib$|=R)q!nM~yWEutxa7%38Kdg)RkT8~K=w z@-zf`XjlV1GFt}RL!XW#?@|5L=90wW94T=_isiBDQakWoPwp>ELolD}hTY>|>Yehb zneUNy?g~S}_X=Oaze@JAt>Z>`2|F0NqgUFP$&7V z{|;^c)2xd8s|PB26*I;f*xAnbrQRxubnF&FZ=3$NWJI!?&GqI`@+_;s1pbD5 z54&Rr`$6!^5CB#eLU6+_2yTVopcn`q6$7y5i5)x5rTQq z5Zn#H>Te)eco=eC3&FBt2(~VUoUcN#DF%Z7u!D!}Y8-{&W(e-z4#5rYLe4C=2Y_=9 a1TT$**4Rxg0;c-{g3tRllYr;OkN*jNa=i2a literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl new file mode 100644 index 0000000..8b0699a --- /dev/null +++ b/piet-gpu/shader/gen/kernel4.hlsl @@ -0,0 +1,689 @@ +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 _vector; + float y_edge; + TileSegRef next; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); + +RWByteAddressBuffer _202 : register(u0, space0); +ByteAddressBuffer _723 : register(t1, space0); +RWTexture2D image_atlas : register(u3, space0); +RWTexture2D gradients : register(u4, space0); +RWTexture2D image : register(u2, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +uint spvPackUnorm4x8(float4 value) +{ + uint4 Packed = uint4(round(saturate(value) * 255.0)); + return Packed.x | (Packed.y << 8) | (Packed.z << 16) | (Packed.w << 24); +} + +float4 spvUnpackUnorm4x8(uint value) +{ + uint4 Packed = uint4(value & 0xff, (value >> 8) & 0xff, (value >> 16) & 0xff, value >> 24); + return float4(Packed) / 255.0; +} + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _215 = { a.offset + offset }; + return _215; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _202.Load(offset * 4 + 8); + return v; +} + +CmdTag Cmd_tag(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + CmdTag _432 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _432; +} + +CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = asfloat(raw1); + return s; +} + +CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) +{ + CmdStrokeRef _449 = { ref.offset + 4u }; + Alloc param = a; + CmdStrokeRef param_1 = _449; + return CmdStroke_read(param, param_1); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +TileSeg TileSeg_read(Alloc a, TileSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TileSeg s; + s.origin = float2(asfloat(raw0), asfloat(raw1)); + s._vector = float2(asfloat(raw2), asfloat(raw3)); + s.y_edge = asfloat(raw4); + TileSegRef _572 = { raw5 }; + s.next = _572; + return s; +} + +uint2 chunk_offset(uint i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +CmdFill CmdFill_read(Alloc a, CmdFillRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) +{ + CmdFillRef _439 = { ref.offset + 4u }; + Alloc param = a; + CmdFillRef param_1 = _439; + return CmdFill_read(param, param_1); +} + +CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdAlpha s; + s.alpha = asfloat(raw0); + return s; +} + +CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref) +{ + CmdAlphaRef _459 = { ref.offset + 4u }; + Alloc param = a; + CmdAlphaRef param_1 = _459; + return CmdAlpha_read(param, param_1); +} + +CmdColor CmdColor_read(Alloc a, CmdColorRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +CmdColor Cmd_Color_read(Alloc a, CmdRef ref) +{ + CmdColorRef _469 = { ref.offset + 4u }; + Alloc param = a; + CmdColorRef param_1 = _469; + return CmdColor_read(param, param_1); +} + +float3 fromsRGB(float3 srgb) +{ + bool3 cutoff = bool3(srgb.x >= 0.040449999272823333740234375f.xxx.x, srgb.y >= 0.040449999272823333740234375f.xxx.y, srgb.z >= 0.040449999272823333740234375f.xxx.z); + float3 below = srgb / 12.9200000762939453125f.xxx; + float3 above = pow((srgb + 0.054999999701976776123046875f.xxx) / 1.05499994754791259765625f.xxx, 2.400000095367431640625f.xxx); + return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z); +} + +float4 unpacksRGB(uint srgba) +{ + float4 color = spvUnpackUnorm4x8(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + CmdLinGrad s; + s.index = raw0; + s.line_x = asfloat(raw1); + s.line_y = asfloat(raw2); + s.line_c = asfloat(raw3); + return s; +} + +CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref) +{ + CmdLinGradRef _479 = { ref.offset + 4u }; + Alloc param = a; + CmdLinGradRef param_1 = _479; + return CmdLinGrad_read(param, param_1); +} + +CmdImage CmdImage_read(Alloc a, CmdImageRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +CmdImage Cmd_Image_read(Alloc a, CmdRef ref) +{ + CmdImageRef _489 = { ref.offset + 4u }; + Alloc param = a; + CmdImageRef param_1 = _489; + return CmdImage_read(param, param_1); +} + +void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img) +{ + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas[uv]; + float3 param_1 = fg_rgba.xyz; + float3 _695 = fromsRGB(param_1); + fg_rgba.x = _695.x; + fg_rgba.y = _695.y; + fg_rgba.z = _695.z; + rgba[i] = fg_rgba; + } + spvReturnValue = rgba; +} + +float3 tosRGB(float3 rgb) +{ + bool3 cutoff = bool3(rgb.x >= 0.003130800090730190277099609375f.xxx.x, rgb.y >= 0.003130800090730190277099609375f.xxx.y, rgb.z >= 0.003130800090730190277099609375f.xxx.z); + float3 below = 12.9200000762939453125f.xxx * rgb; + float3 above = (1.05499994754791259765625f.xxx * pow(rgb, 0.416660010814666748046875f.xxx)) - 0.054999999701976776123046875f.xxx; + return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z); +} + +uint packsRGB(inout float4 rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return spvPackUnorm4x8(rgba.wzyx); +} + +CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdJump s; + s.new_ref = raw0; + return s; +} + +CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) +{ + CmdJumpRef _499 = { ref.offset + 4u }; + Alloc param = a; + CmdJumpRef param_1 = _499; + return CmdJump_read(param, param_1); +} + +void comp_main() +{ + uint tile_ix = (gl_WorkGroupID.y * _723.Load(8)) + gl_WorkGroupID.x; + Alloc _738; + _738.offset = _723.Load(24); + Alloc param; + param.offset = _738.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef _747 = { cmd_alloc.offset }; + CmdRef cmd_ref = _747; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = 0.0f.xxxx; + } + uint clip_depth = 0u; + bool mem_ok = _202.Load(4) == 0u; + float df[8]; + TileSegRef tile_seg_ref; + float area[8]; + uint blend_stack[128][8]; + float blend_alpha_stack[128][8]; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4).tag; + if (tag == 0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0f; + } + TileSegRef _842 = { stroke.tile_ref }; + tile_seg_ref = _842; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11); + float2 line_vec = seg._vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + 0.5f.xx) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0f, 1.0f); + df[k_1] = min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = clamp((stroke.half_width + 0.5f) - df[k_2], 0.0f, 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + TileSegRef _964 = { fill.tile_ref }; + tile_seg_ref = _964; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1._vector; + float2 window = clamp(float2(start.y, end.y), 0.0f.xx, 1.0f.xx); + if (window.x != window.y) + { + float2 t_1 = (window - start.y.xx) / seg_1._vector.y.xx; + float2 xs = float2(lerp(start.x, end.x, t_1.x), lerp(start.x, end.x, t_1.y)); + float xmin = min(min(xs.x, xs.y), 1.0f) - 9.9999999747524270787835121154785e-07f; + float xmax = max(xs.x, xs.y); + float b = min(xmax, 1.0f); + float c = max(b, 0.0f); + float d = max(xmin, 0.0f); + float a = ((b + (0.5f * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1._vector.x) * clamp((my_xy.y - seg_1.y_edge) + 1.0f, 0.0f, 1.0f)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = min(abs(area[k_5]), 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0f; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0f - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); + float4 fg_rgba = gradients[int2(x, int(lin.index))]; + float3 param_29 = fg_rgba.xyz; + float3 _1298 = fromsRGB(param_29); + fg_rgba.x = _1298.x; + fg_rgba.y = _1298.y; + fg_rgba.z = _1298.z; + rgba[k_9] = fg_rgba; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_30, param_31); + uint2 param_32 = xy_uint; + CmdImage param_33 = fill_img; + float4 _1327[8]; + fillImage(_1327, param_32, param_33); + float4 img[8] = _1327; + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + float4 fg_k_1 = img[k_10] * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 12u; + break; + } + case 8u: + { + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + uint d_2 = min(clip_depth, 127u); + float4 param_34 = float4(rgba[k_11]); + uint _1390 = packsRGB(param_34); + blend_stack[d_2][k_11] = _1390; + blend_alpha_stack[d_2][k_11] = clamp(abs(area[k_11]), 0.0f, 1.0f); + rgba[k_11] = 0.0f.xxxx; + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 9u: + { + clip_depth--; + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + uint d_3 = min(clip_depth, 127u); + uint param_35 = blend_stack[d_3][k_12]; + float4 bg = unpacksRGB(param_35); + float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12]; + rgba[k_12] = (bg * (1.0f - fg_1.w)) + fg_1; + } + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_36 = cmd_alloc; + CmdRef param_37 = cmd_ref; + CmdRef _1469 = { Cmd_Jump_read(param_36, param_37).new_ref }; + cmd_ref = _1469; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_38 = i_1; + float3 param_39 = rgba[i_1].xyz; + image[int2(xy_uint + chunk_offset(param_38))] = float4(tosRGB(param_39), rgba[i_1].w); + } +} + +[numthreads(8, 4, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl new file mode 100644 index 0000000..9318cc8 --- /dev/null +++ b/piet-gpu/shader/gen/kernel4.msl @@ -0,0 +1,728 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 vector; + float y_edge; + TileSegRef next; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u); + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_202) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_202.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_202); + return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_202); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = as_type(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u }; + return CmdStroke_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_202); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_202); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_202); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_202); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_202); + TileSeg s; + s.origin = float2(as_type(raw0), as_type(raw1)); + s.vector = float2(as_type(raw2), as_type(raw3)); + s.y_edge = as_type(raw4); + s.next = TileSegRef{ raw5 }; + return s; +} + +static inline __attribute__((always_inline)) +uint2 chunk_offset(thread const uint& i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +static inline __attribute__((always_inline)) +CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_202); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u }; + return CmdFill_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + CmdAlpha s; + s.alpha = as_type(raw0); + return s; +} + +static inline __attribute__((always_inline)) +CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u }; + return CmdAlpha_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u }; + return CmdColor_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +float3 fromsRGB(thread const float3& srgb) +{ + bool3 cutoff = srgb >= float3(0.040449999272823333740234375); + float3 below = srgb / float3(12.9200000762939453125); + float3 above = pow((srgb + float3(0.054999999701976776123046875)) / float3(1.05499994754791259765625), float3(2.400000095367431640625)); + return select(below, above, cutoff); +} + +static inline __attribute__((always_inline)) +float4 unpacksRGB(thread const uint& srgba) +{ + float4 color = unpack_unorm4x8_to_float(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +static inline __attribute__((always_inline)) +CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_202); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_202); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_202); + CmdLinGrad s; + s.index = raw0; + s.line_x = as_type(raw1); + s.line_y = as_type(raw2); + s.line_c = as_type(raw3); + return s; +} + +static inline __attribute__((always_inline)) +CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u }; + return CmdLinGrad_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_202); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +static inline __attribute__((always_inline)) +CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u }; + return CmdImage_read(param, param_1, v_202); +} + +static inline __attribute__((always_inline)) +spvUnsafeArray fillImage(thread const uint2& xy, thread const CmdImage& cmd_img, thread texture2d image_atlas) +{ + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas.read(uint2(uv)); + float3 param_1 = fg_rgba.xyz; + float3 _695 = fromsRGB(param_1); + fg_rgba.x = _695.x; + fg_rgba.y = _695.y; + fg_rgba.z = _695.z; + rgba[i] = fg_rgba; + } + return rgba; +} + +static inline __attribute__((always_inline)) +float3 tosRGB(thread const float3& rgb) +{ + bool3 cutoff = rgb >= float3(0.003130800090730190277099609375); + float3 below = float3(12.9200000762939453125) * rgb; + float3 above = (float3(1.05499994754791259765625) * pow(rgb, float3(0.416660010814666748046875))) - float3(0.054999999701976776123046875); + return select(below, above, cutoff); +} + +static inline __attribute__((always_inline)) +uint packsRGB(thread float4& rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return pack_float_to_unorm4x8(rgba.wzyx); +} + +static inline __attribute__((always_inline)) +CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_202) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_202); + CmdJump s; + s.new_ref = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202) +{ + Alloc param = a; + CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u }; + return CmdJump_read(param, param_1, v_202); +} + +kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _723 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + uint tile_ix = (gl_WorkGroupID.y * _723.conf.width_in_tiles) + gl_WorkGroupID.x; + Alloc param; + param.offset = _723.conf.ptcl_alloc.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = float4(0.0); + } + uint clip_depth = 0u; + bool mem_ok = v_202.mem_error == 0u; + spvUnsafeArray df; + TileSegRef tile_seg_ref; + spvUnsafeArray area; + spvUnsafeArray, 128> blend_stack; + spvUnsafeArray, 128> blend_alpha_stack; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4, v_202).tag; + if (tag == 0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_202); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0; + } + tile_seg_ref = TileSegRef{ stroke.tile_ref }; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11, v_202); + float2 line_vec = seg.vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + float2(0.5)) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = fast::clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df[k_1] = fast::min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = fast::clamp((stroke.half_width + 0.5) - df[k_2], 0.0, 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14, v_202); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + tile_seg_ref = TileSegRef{ fill.tile_ref }; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19, v_202); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1.vector; + float2 window = fast::clamp(float2(start.y, end.y), float2(0.0), float2(1.0)); + if ((isunordered(window.x, window.y) || window.x != window.y)) + { + float2 t_1 = (window - float2(start.y)) / float2(seg_1.vector.y); + float2 xs = float2(mix(start.x, end.x, t_1.x), mix(start.x, end.x, t_1.y)); + float xmin = fast::min(fast::min(xs.x, xs.y), 1.0) - 9.9999999747524270787835121154785e-07; + float xmax = fast::max(xs.x, xs.y); + float b = fast::min(xmax, 1.0); + float c = fast::max(b, 0.0); + float d = fast::max(xmin, 0.0); + float a = ((b + (0.5 * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1.vector.x) * fast::clamp((my_xy.y - seg_1.y_edge) + 1.0, 0.0, 1.0)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = fast::min(abs(area[k_5]), 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_202); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24, v_202); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0 - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_202); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); + float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); + float3 param_29 = fg_rgba.xyz; + float3 _1298 = fromsRGB(param_29); + fg_rgba.x = _1298.x; + fg_rgba.y = _1298.y; + fg_rgba.z = _1298.z; + rgba[k_9] = fg_rgba; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_202); + uint2 param_32 = xy_uint; + CmdImage param_33 = fill_img; + spvUnsafeArray img; + img = fillImage(param_32, param_33, image_atlas); + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + float4 fg_k_1 = img[k_10] * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 12u; + break; + } + case 8u: + { + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + uint d_2 = min(clip_depth, 127u); + float4 param_34 = float4(rgba[k_11]); + uint _1390 = packsRGB(param_34); + blend_stack[d_2][k_11] = _1390; + blend_alpha_stack[d_2][k_11] = fast::clamp(abs(area[k_11]), 0.0, 1.0); + rgba[k_11] = float4(0.0); + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 9u: + { + clip_depth--; + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + uint d_3 = min(clip_depth, 127u); + uint param_35 = blend_stack[d_3][k_12]; + float4 bg = unpacksRGB(param_35); + float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12]; + rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1; + } + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_36 = cmd_alloc; + CmdRef param_37 = cmd_ref; + cmd_ref = CmdRef{ Cmd_Jump_read(param_36, param_37, v_202).new_ref }; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_38 = i_1; + float3 param_39 = rgba[i_1].xyz; + image.write(float4(tosRGB(param_39), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_38)))); + } +} + diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv similarity index 100% rename from piet-gpu/shader/kernel4.spv rename to piet-gpu/shader/gen/kernel4.spv diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil new file mode 100644 index 0000000000000000000000000000000000000000..9fd593ca43dc26f919b022903612dc81fad0890b GIT binary patch literal 7064 zcmeHLeOMD$x}T8AWI`r!AfO2#nFvNlYRFCTAuS`bRi9bAXhvDF#*j3tr)ab zpjCr51eyy7&Vja~pom1lIQ$f?SR@UPDf-kujMXbvl68Pjh9~?3{SiEjzvB_0PuW7X z2+;VT!Jtu3n9m*`j@dfV9V20WSf669C{>gmLm)^4F`(Z7fc&zeVhRWcGCSZaRo+`7 z41(N%$2JpVYx01MX-e9o2*5=Mo<6sUX2GU2P(dR6N`;ZXFhbeHR(9g)F*zuINo0D3 ztj+Hf@29$T{=-i zTB(i@Vx54BfF2#_@yr?}QH%oXAgGampe#Vsr`cGB&>6it)?pRu#G10|#JKNG( zjN5&voCQS=qFy)VC-D-SmXD-YLYIsfI(?@cfm_$U3bhU%?jN2pBuZWYw`fJ`V9Xl+ zVJ+Wkav!i>KH_O`KIUp|1WAMv;R7dnX2v>M({HfxC@8*1ULp%;~7n8WY3?Bt6j;BZw+?+F9a7AHb@(#TwYn zzw|mDc>-cyIu5fVWvv$arDL`eaWtY$KJy^W{obqI`)=a zdfU(T9$2zt3w{5lcABD=P&nHe9gEVpSjs7BROjCT-84`MTP8Hy99IqU~=SW1?} z(BdQ3)BB4dQh^&eUaQG-lH>cGmY-_Rf3~dUQ&@p{SRKJ)+`7)+!|@~L8#ynkO4dV< zq>+2~vM+yMu_Ke(VrbSKwx+)Ik(+&e)ik%w2^Pm&N?%9UYxe}#hUfH{LX=n zn;-OC8Wf|de82sxS@w;<*g{F)&B1$s=g*&ZWjDo9=ij>&djH0)jsQ@+U^$I_v*FdR zm!7sST-_4?*-cpM$x>?xL}zr;evVY?kt!$hf(NPYK{kdEG@(-wo0awZtY2lGNzZ;S zMWNn_UA=I+cw^dES85i^Ge!O_ZEW3HaI(=cF9kxY275XJmj+sH1uotAHC!l2&zsVI zrv06H?QLh;Q+%b3*wvkBlTJ$lSH9YvW~U%FE;fD)FjBT0-Fvxy`;sk34;+X#tOh_I z-1_q7y-NdwzlJwK)P0tK`^-R%37!23l`*vWHdT(rZ&wbdHJ0WgJ&CS3&r`{4J+z?Ie&?Z)rjk)LRb9+>C&(ZVF*XLR2x!s|dts3Tz zDrR<7!)EX{Ht3xVdJS{yOy<@R27rCp*zi2su!-NW+10SkWs-ay;(i_EKJs#hNP0z} zHDRDNp?mCoQ=4!}l+bCKdP$31cdF(FRC8Tp?|1z<&8i$5J-69E8!Y+xfq zBBnu4-#)WptFb}QtKyCXkpY5xH;8yhfuvxkv}w$ZvmXCg%oFYK1@c5dN8&Y`G}el-+4`h z^%mMbDx6GE_&?MF1sjFH$)oh2nlrXsJmiB4shm$eg^Cr0G2>lmMnT!T_0JQ3+O{fg z^}OL9-lOW)O;DJqI`%hE;Qqdjbpn34j(rVe|EzT^2kh-Xtz!rOx7IO{2#Pw|+{H z`gy+Q-_WuUcvGRt~IP z{fX85rm^U^9UoNwX|Sna)OpLB93we zhc72Q!!O^z>ZAB_!KckoipqL-+RwB8wsJ=H%B<|^&M(2-=Jwu;Unu@?yuJ6zQdTC| z`v2$r|3f)nOF)nbB!3=UgR?>2P6PL!*$u^1$bB(|Df^a6pH}wAwXRyjX#h=Fcs~o}MGW0)7SH=>?FPu3# zW5cQIx3dfIrX8>Ratx5<+oOyG+7TqjRvBj|^X(9ngs>p9M$@Pi6#q+*xs9xFLIjFj zpXpLOV!GYTBm@Cs`H2o@WJZ*ZoOzVwNs;>W@Sws-X@8P(Pa$RQAX{)~t;f`idu_A` zBf($u+aR+j<*~0y31QhYX@c%_SgFoWlrs$OBm@PpPG85&JX(rf49gkY+|!*AD+%_z zn`CzczdIVG$I_!*3{qVZWqJx&Ii}y=k-F-PH}WWpMqVa8J~ci|&N$(YJs4r<6A)@v zTxWZ{j5f-Sh3+s9;hqgK&QZLVy2HFI?%M?7n?)_0@1#LBmkS`hzX&|nFjNzG&N{R@@Z8R!rGe)x zL(;%u^GQLWp<8}iXHE@tS)O(}SX@J~KBzNC3wg~WE+WtcD`b<=P#ZC2AfrM<-4$!u zIhDqg7FgHmh2%IEeMsM+8Sge7!W)BTZ2`uGV4pX{GN9EJok2>;ex#>CK0Qr-X}idM zRquE5G~{Iu?vp5_YSXd8*Qa#K8Jn4nGjV!sU~2N~a;y|ny*|UjSU@weooApuj{PcJ z$c`8-1NKDB0Y<(+2$QY>!%mtE#bxwSejj(Z&Bz(hMZ^cX_*qWJoNAx|kMKp_F1S!8 z&L5U)O%4{@Q0yMiDW?m0$zW@~t}{(W2SZB;zi%L9li}euxzoWoN(gxc3R!{OVvrDw zQI|Y&SlXF9pi`uP^I7h6NRJc34U2&S8j>oF4wlnUY#Y#ZLFnAimu0~Z+H`vyQ(rL@qd^@_A$&G&^hO^r29ixC9=6@xj3zQbi{85~0{BueD9#~8~5J*V~hwIlY%unWzcP$Ht>2a@y z)T$9{()-tl_j%SFEy^?F*doIzk-;CO3C4B4+=0))v`XCEk0e3$m* z%!h4V*#kN=A!L83l1*-A8VWEBxD@eVPb_PqJG$pbRFX5GTO>bHZeHW2w{mH{2P$^g zZyd5MIJv2ZlQ`Nihx=Y{5+SXYW1SEdk4FeFuZI>@N7ukb-gWPal9xwdm4fDJuIXh+ zD*llGy7T`c}Yu)ZeF+nhMBFNq4w zmlR3Ml~C+G+e&BBM1X&Cz}C0})}>Q<XUmMvKd|OIt{HR!Hzzbd|B0u|XtUvgaosKLs4~8I`QyoXsH7b4FdkxM3+=0Ni2& zaH~H7AAd_E-0&Pw_%D5YZB>MjU8s_Y#R`K29nhJ;m34!mtHEF=d81|eMBwsy1G>T% z5V#LP;NBAnbv-KCbvIDJgE|W&tcqaqNjmZycy`iM6ycsLHQlejK>|!4GG^Kd;UAkZvWwtSA;!AEY7K*rx)YE zm{VZUa#Q$9XC0kwVpt8`B-rTbK$rR1CzABP0?96eguJag6Ww!HLzA+r6*52PjF+p? zG)cVcaw<^UTUO0F!NE+ll64rI`b!_mHzBFkzPa>w^hm06K@Pp0qVsd;zt>yik?oX2 zqmd_jc8-*Jdb`fel8Hfnao0N-CWFK|Wwb1q7Mh5fgkYzEcf*)CMc*L`>R)7 zKCS7nx2rkB=15tOABmz@4r(g;-lTj(~hw@9^zhW${M0qb7zKm7I75 zcipylo5b*Ri=&hX*nHHNiu~c;T>Tx{*DGXS@WxSm`3v6UWe;Y66cBroTS~~D*-7(r zOayWk>x;4TB_OfT&MgRT$v&O5V8}1~He(6N$u{o(SBrw-|1kXoEw(Jj^J@6aSgnt! zGh5dGQuugStoEnC@%W?eIU_r=KU1SVefH+Uj#>(6 z4@Dmq2s~p{As{}MUkJsoh1mO8Ghy?%yO0%Czl`|v%wb4)c&+#JFnQauOi!!uN1eez zuBTr}@)vMHdIjb3I@D!rbRMgoO@x0r?2!hn1j*w06afCNr zbW{+T#;v<<6II8B?O)6uVokWO zR+qIE5S~mvbHvW)Q(t(>MY%CQdC!5N6H=#1I<7QpWz`VU=sKAs>6bz zUOwXtd6R^z$jd$Z6l!JMMTsd}Ej{n&+f?I}JjAH2HM8-vKCWDC;;wEjj8iZoN_z_C zRd^_~@RZ|GXj_ST(M!$U>2}{6Nn70u4HBI7T^SJ#K4WGDGZK2E*$=Ly8%=!JA5}x+ zb*z*2c6L>mZC7jQ)3A>?4f5jbz{1kv>>fr1wg^v)F`UJ~|Cv6$WyyJGh_mQCj75FU zQXM2MECre-?=~kkU6q*lSS_LCYcy{0=W?cbCk;Vf80ORxWq{?K9cB@Kvr+4#d(_& zrPVACoP=z_<3MWhtjJB z?}aBF=gm4^5?o@QJ=!$;LGLX8N%KDGi(>?njl3MwULly2^2hL`-|^CZS3)c^ z=QQrkF)g3<1gw2UT6+n(Y$LhsPUf=K(wFgNnwNvi2>wQN1Ogub0WbT7soz48G3uhl zsOB{{hNL35f^fR0&d44q%X=3_ho%m(U_Xbp)aaAc3(CM*2wZhR9%|3~0R~kbEW4jHYpB13$G{D($I>dZg5Bu%Q@W?RW9dTaf28=a%>X#8I!pI?~eBPB(!+& ILl8Xw25S(5yZ`_I literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/path_coarse.hlsl b/piet-gpu/shader/gen/path_coarse.hlsl new file mode 100644 index 0000000..6025bde --- /dev/null +++ b/piet-gpu/shader/gen/path_coarse.hlsl @@ -0,0 +1,664 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct PathSegTag +{ + uint tag; + uint flags; +}; + +struct TileRef +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 _vector; + float y_edge; + TileSegRef next; +}; + +struct SubdivResult +{ + float val; + float a0; + float a2; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u); + +static const PathSegTag _721 = { 0u, 0u }; + +RWByteAddressBuffer _136 : register(u0, space0); +ByteAddressBuffer _710 : register(t1, space0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _136.Load(offset * 4 + 8); + return v; +} + +PathSegTag PathSeg_tag(Alloc a, PathSegRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + PathSegTag _367 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _367; +} + +PathCubic PathCubic_read(Alloc a, PathCubicRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21); + Alloc param_22 = a; + uint param_23 = ix + 11u; + uint raw11 = read_mem(param_22, param_23); + PathCubic s; + s.p0 = float2(asfloat(raw0), asfloat(raw1)); + s.p1 = float2(asfloat(raw2), asfloat(raw3)); + s.p2 = float2(asfloat(raw4), asfloat(raw5)); + s.p3 = float2(asfloat(raw6), asfloat(raw7)); + s.path_ix = raw8; + s.trans_ix = raw9; + s.stroke = float2(asfloat(raw10), asfloat(raw11)); + return s; +} + +PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref) +{ + PathCubicRef _373 = { ref.offset + 4u }; + Alloc param = a; + PathCubicRef param_1 = _373; + return PathCubic_read(param, param_1); +} + +float2 eval_cubic(float2 p0, float2 p1, float2 p2, float2 p3, float t) +{ + float mt = 1.0f - t; + return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0f)) + (((p2 * (mt * 3.0f)) + (p3 * t)) * t)) * t); +} + +float approx_parabola_integral(float x) +{ + return x * rsqrt(sqrt(0.3300000131130218505859375f + (0.201511204242706298828125f + ((0.25f * x) * x)))); +} + +SubdivResult estimate_subdiv(float2 p0, float2 p1, float2 p2, float sqrt_tol) +{ + float2 d01 = p1 - p0; + float2 d12 = p2 - p1; + float2 dd = d01 - d12; + float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x); + float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross; + float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross; + float scale = abs(_cross / (length(dd) * (x2 - x0))); + float param = x0; + float a0 = approx_parabola_integral(param); + float param_1 = x2; + float a2 = approx_parabola_integral(param_1); + float val = 0.0f; + if (scale < 1000000000.0f) + { + float da = abs(a2 - a0); + float sqrt_scale = sqrt(scale); + if (sign(x0) == sign(x2)) + { + val = da * sqrt_scale; + } + else + { + float xmin = sqrt_tol / sqrt_scale; + float param_2 = xmin; + val = (sqrt_tol * da) / approx_parabola_integral(param_2); + } + } + SubdivResult _695 = { val, a0, a2 }; + return _695; +} + +uint fill_mode_from_flags(uint flags) +{ + return flags & 1u; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _427 = { raw2 }; + s.tiles = _427; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +float approx_parabola_inv_integral(float x) +{ + return x * sqrt(0.61000001430511474609375f + (0.1520999968051910400390625f + ((0.25f * x) * x))); +} + +float2 eval_quad(float2 p0, float2 p1, float2 p2, float t) +{ + float mt = 1.0f - t; + return (p0 * (mt * mt)) + (((p1 * (mt * 2.0f)) + (p2 * t)) * t); +} + +MallocResult malloc(uint size) +{ + uint _142; + _136.InterlockedAdd(0, size, _142); + uint offset = _142; + uint _149; + _136.GetDimensions(_149); + _149 = (_149 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_149) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _171; + _136.InterlockedMax(4, 1u, _171); + return r; + } + return r; +} + +TileRef Tile_index(TileRef ref, uint index) +{ + TileRef _385 = { ref.offset + (index * 8u) }; + return _385; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _136.Store(offset * 4 + 8, val); +} + +void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.origin.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.origin.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s._vector.x); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s._vector.y); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.y_edge); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = s.next.offset; + write_mem(param_15, param_16, param_17); +} + +void comp_main() +{ + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef _718 = { _710.Load(28) + (element_ix * 52u) }; + PathSegRef ref = _718; + PathSegTag tag = _721; + if (element_ix < _710.Load(4)) + { + Alloc _731; + _731.offset = _710.Load(28); + Alloc param; + param.offset = _731.offset; + PathSegRef param_1 = ref; + tag = PathSeg_tag(param, param_1); + } + bool mem_ok = _136.Load(4) == 0u; + switch (tag.tag) + { + case 1u: + { + Alloc _748; + _748.offset = _710.Load(28); + Alloc param_2; + param_2.offset = _748.offset; + PathSegRef param_3 = ref; + PathCubic cubic = PathSeg_Cubic_read(param_2, param_3); + float2 err_v = (((cubic.p2 - cubic.p1) * 3.0f) + cubic.p0) - cubic.p3; + float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); + uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875f, 0.16666667163372039794921875f))), 1u); + n_quads = min(n_quads, 16u); + float val = 0.0f; + float2 qp0 = cubic.p0; + float _step = 1.0f / float(n_quads); + SubdivResult keep_params[16]; + for (uint i = 0u; i < n_quads; i++) + { + float t = float(i + 1u) * _step; + float2 param_4 = cubic.p0; + float2 param_5 = cubic.p1; + float2 param_6 = cubic.p2; + float2 param_7 = cubic.p3; + float param_8 = t; + float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8); + float2 param_9 = cubic.p0; + float2 param_10 = cubic.p1; + float2 param_11 = cubic.p2; + float2 param_12 = cubic.p3; + float param_13 = t - (0.5f * _step); + float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13); + qp1 = (qp1 * 2.0f) - ((qp0 + qp2) * 0.5f); + float2 param_14 = qp0; + float2 param_15 = qp1; + float2 param_16 = qp2; + float param_17 = 0.4743416607379913330078125f; + SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17); + keep_params[i] = params; + val += params.val; + qp0 = qp2; + } + uint n = max(uint(ceil((val * 0.5f) / 0.4743416607379913330078125f)), 1u); + uint param_18 = tag.flags; + bool is_stroke = fill_mode_from_flags(param_18) == 1u; + uint path_ix = cubic.path_ix; + PathRef _904 = { _710.Load(16) + (path_ix * 12u) }; + Alloc _907; + _907.offset = _710.Load(16); + Alloc param_19; + param_19.offset = _907.offset; + PathRef param_20 = _904; + Path path = Path_read(param_19, param_20); + uint param_21 = path.tiles.offset; + uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_23 = mem_ok; + Alloc path_alloc = new_alloc(param_21, param_22, param_23); + int4 bbox = int4(path.bbox); + float2 p0 = cubic.p0; + qp0 = cubic.p0; + float v_step = val / float(n); + int n_out = 1; + float val_sum = 0.0f; + float2 p1; + float _1147; + TileSeg tile_seg; + for (uint i_1 = 0u; i_1 < n_quads; i_1++) + { + float t_1 = float(i_1 + 1u) * _step; + float2 param_24 = cubic.p0; + float2 param_25 = cubic.p1; + float2 param_26 = cubic.p2; + float2 param_27 = cubic.p3; + float param_28 = t_1; + float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28); + float2 param_29 = cubic.p0; + float2 param_30 = cubic.p1; + float2 param_31 = cubic.p2; + float2 param_32 = cubic.p3; + float param_33 = t_1 - (0.5f * _step); + float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33); + qp1_1 = (qp1_1 * 2.0f) - ((qp0 + qp2_1) * 0.5f); + SubdivResult params_1 = keep_params[i_1]; + float param_34 = params_1.a0; + float u0 = approx_parabola_inv_integral(param_34); + float param_35 = params_1.a2; + float u2 = approx_parabola_inv_integral(param_35); + float uscale = 1.0f / (u2 - u0); + float target = float(n_out) * v_step; + for (;;) + { + bool _1040 = uint(n_out) == n; + bool _1050; + if (!_1040) + { + _1050 = target < (val_sum + params_1.val); + } + else + { + _1050 = _1040; + } + if (_1050) + { + if (uint(n_out) == n) + { + p1 = cubic.p3; + } + else + { + float u = (target - val_sum) / params_1.val; + float a = lerp(params_1.a0, params_1.a2, u); + float param_36 = a; + float au = approx_parabola_inv_integral(param_36); + float t_2 = (au - u0) * uscale; + float2 param_37 = qp0; + float2 param_38 = qp1_1; + float2 param_39 = qp2_1; + float param_40 = t_2; + p1 = eval_quad(param_37, param_38, param_39, param_40); + } + float xmin = min(p0.x, p1.x) - cubic.stroke.x; + float xmax = max(p0.x, p1.x) + cubic.stroke.x; + float ymin = min(p0.y, p1.y) - cubic.stroke.y; + float ymax = max(p0.y, p1.y) + cubic.stroke.y; + float dx = p1.x - p0.x; + float dy = p1.y - p0.y; + if (abs(dy) < 9.999999717180685365747194737196e-10f) + { + _1147 = 1000000000.0f; + } + else + { + _1147 = dx / dy; + } + float invslope = _1147; + float c = (cubic.stroke.x + (abs(invslope) * (8.0f + cubic.stroke.y))) * 0.0625f; + float b = invslope; + float a_1 = (p0.x - ((p0.y - 8.0f) * b)) * 0.0625f; + int x0 = int(floor(xmin * 0.0625f)); + int x1 = int(floor(xmax * 0.0625f) + 1.0f); + int y0 = int(floor(ymin * 0.0625f)); + int y1 = int(floor(ymax * 0.0625f) + 1.0f); + x0 = clamp(x0, bbox.x, bbox.z); + y0 = clamp(y0, bbox.y, bbox.w); + x1 = clamp(x1, bbox.x, bbox.z); + y1 = clamp(y1, bbox.y, bbox.w); + float xc = a_1 + (b * float(y0)); + int stride = bbox.z - bbox.x; + int base = ((y0 - bbox.y) * stride) - bbox.x; + uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); + uint param_41 = n_tile_alloc * 24u; + MallocResult _1263 = malloc(param_41); + MallocResult tile_alloc = _1263; + if (tile_alloc.failed || (!mem_ok)) + { + return; + } + uint tile_offset = tile_alloc.alloc.offset; + int xray = int(floor(p0.x * 0.0625f)); + int last_xray = int(floor(p1.x * 0.0625f)); + if (p0.y > p1.y) + { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } + for (int y = y0; y < y1; y++) + { + float tile_y0 = float(y * 16); + int xbackdrop = max((xray + 1), bbox.x); + bool _1319 = !is_stroke; + bool _1329; + if (_1319) + { + _1329 = min(p0.y, p1.y) < tile_y0; + } + else + { + _1329 = _1319; + } + bool _1336; + if (_1329) + { + _1336 = xbackdrop < bbox.z; + } + else + { + _1336 = _1329; + } + if (_1336) + { + int backdrop = (p1.y < p0.y) ? 1 : (-1); + TileRef param_42 = path.tiles; + uint param_43 = uint(base + xbackdrop); + TileRef tile_ref = Tile_index(param_42, param_43); + uint tile_el = tile_ref.offset >> uint(2); + Alloc param_44 = path_alloc; + uint param_45 = tile_el + 1u; + if (touch_mem(param_44, param_45)) + { + uint _1374; + _136.InterlockedAdd((tile_el + 1u) * 4 + 8, uint(backdrop), _1374); + } + } + int next_xray = last_xray; + if (y < (y1 - 1)) + { + float tile_y1 = float((y + 1) * 16); + float x_edge = lerp(p0.x, p1.x, (tile_y1 - p0.y) / dy); + next_xray = int(floor(x_edge * 0.0625f)); + } + int min_xray = min(xray, next_xray); + int max_xray = max(xray, next_xray); + int xx0 = min(int(floor(xc - c)), min_xray); + int xx1 = max(int(ceil(xc + c)), (max_xray + 1)); + xx0 = clamp(xx0, x0, x1); + xx1 = clamp(xx1, x0, x1); + for (int x = xx0; x < xx1; x++) + { + float tile_x0 = float(x * 16); + TileRef _1454 = { path.tiles.offset }; + TileRef param_46 = _1454; + uint param_47 = uint(base + x); + TileRef tile_ref_1 = Tile_index(param_46, param_47); + uint tile_el_1 = tile_ref_1.offset >> uint(2); + uint old = 0u; + Alloc param_48 = path_alloc; + uint param_49 = tile_el_1; + if (touch_mem(param_48, param_49)) + { + uint _1477; + _136.InterlockedExchange(tile_el_1 * 4 + 8, tile_offset, _1477); + old = _1477; + } + tile_seg.origin = p0; + tile_seg._vector = p1 - p0; + float y_edge = 0.0f; + if (!is_stroke) + { + y_edge = lerp(p0.y, p1.y, (tile_x0 - p0.x) / dx); + if (min(p0.x, p1.x) < tile_x0) + { + float2 p = float2(tile_x0, y_edge); + if (p0.x > p1.x) + { + tile_seg._vector = p - p0; + } + else + { + tile_seg.origin = p; + tile_seg._vector = p1 - p; + } + if (tile_seg._vector.x == 0.0f) + { + tile_seg._vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10f; + } + } + if ((x <= min_xray) || (max_xray < x)) + { + y_edge = 1000000000.0f; + } + } + tile_seg.y_edge = y_edge; + tile_seg.next.offset = old; + TileSegRef _1559 = { tile_offset }; + Alloc param_50 = tile_alloc.alloc; + TileSegRef param_51 = _1559; + TileSeg param_52 = tile_seg; + TileSeg_write(param_50, param_51, param_52); + tile_offset += 24u; + } + xc += b; + base += stride; + xray = next_xray; + } + n_out++; + target += v_step; + p0 = p1; + continue; + } + else + { + break; + } + } + val_sum += params_1.val; + qp0 = qp2_1; + } + break; + } + } +} + +[numthreads(32, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl new file mode 100644 index 0000000..d263f31 --- /dev/null +++ b/piet-gpu/shader/gen/path_coarse.msl @@ -0,0 +1,708 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct PathSegTag +{ + uint tag; + uint flags; +}; + +struct TileRef +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 vector; + float y_edge; + TileSegRef next; +}; + +struct SubdivResult +{ + float val; + float a0; + float a2; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(32u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_136.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_136, v_136BufferSize); + return PathSegTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_136, v_136BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_136, v_136BufferSize); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_136, v_136BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_136, v_136BufferSize); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15, v_136, v_136BufferSize); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17, v_136, v_136BufferSize); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19, v_136, v_136BufferSize); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21, v_136, v_136BufferSize); + Alloc param_22 = a; + uint param_23 = ix + 11u; + uint raw11 = read_mem(param_22, param_23, v_136, v_136BufferSize); + PathCubic s; + s.p0 = float2(as_type(raw0), as_type(raw1)); + s.p1 = float2(as_type(raw2), as_type(raw3)); + s.p2 = float2(as_type(raw4), as_type(raw5)); + s.p3 = float2(as_type(raw6), as_type(raw7)); + s.path_ix = raw8; + s.trans_ix = raw9; + s.stroke = float2(as_type(raw10), as_type(raw11)); + return s; +} + +static inline __attribute__((always_inline)) +PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = a; + PathCubicRef param_1 = PathCubicRef{ ref.offset + 4u }; + return PathCubic_read(param, param_1, v_136, v_136BufferSize); +} + +static inline __attribute__((always_inline)) +float2 eval_cubic(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float2& p3, thread const float& t) +{ + float mt = 1.0 - t; + return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0)) + (((p2 * (mt * 3.0)) + (p3 * t)) * t)) * t); +} + +static inline __attribute__((always_inline)) +float approx_parabola_integral(thread const float& x) +{ + return x * rsqrt(sqrt(0.3300000131130218505859375 + (0.201511204242706298828125 + ((0.25 * x) * x)))); +} + +static inline __attribute__((always_inline)) +SubdivResult estimate_subdiv(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& sqrt_tol) +{ + float2 d01 = p1 - p0; + float2 d12 = p2 - p1; + float2 dd = d01 - d12; + float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x); + float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross; + float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross; + float scale = abs(_cross / (length(dd) * (x2 - x0))); + float param = x0; + float a0 = approx_parabola_integral(param); + float param_1 = x2; + float a2 = approx_parabola_integral(param_1); + float val = 0.0; + if (scale < 1000000000.0) + { + float da = abs(a2 - a0); + float sqrt_scale = sqrt(scale); + if (sign(x0) == sign(x2)) + { + val = da * sqrt_scale; + } + else + { + float xmin = sqrt_tol / sqrt_scale; + float param_2 = xmin; + val = (sqrt_tol * da) / approx_parabola_integral(param_2); + } + } + return SubdivResult{ val, a0, a2 }; +} + +static inline __attribute__((always_inline)) +uint fill_mode_from_flags(thread const uint& flags) +{ + return flags & 1u; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +float approx_parabola_inv_integral(thread const float& x) +{ + return x * sqrt(0.61000001430511474609375 + (0.1520999968051910400390625 + ((0.25 * x) * x))); +} + +static inline __attribute__((always_inline)) +float2 eval_quad(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& t) +{ + float mt = 1.0 - t; + return (p0 * (mt * mt)) + (((p1 * (mt * 2.0)) + (p2 * t)) * t); +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint _142 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.mem_offset, size, memory_order_relaxed); + uint offset = _142; + MallocResult r; + r.failed = (offset + size) > uint(int((v_136BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _171 = atomic_fetch_max_explicit((device atomic_uint*)&v_136.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +TileRef Tile_index(thread const TileRef& ref, thread const uint& index) +{ + return TileRef{ ref.offset + (index * 8u) }; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_136.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.origin.x); + write_mem(param, param_1, param_2, v_136, v_136BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.origin.y); + write_mem(param_3, param_4, param_5, v_136, v_136BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.vector.x); + write_mem(param_6, param_7, param_8, v_136, v_136BufferSize); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.vector.y); + write_mem(param_9, param_10, param_11, v_136, v_136BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.y_edge); + write_mem(param_12, param_13, param_14, v_136, v_136BufferSize); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = s.next.offset; + write_mem(param_15, param_16, param_17, v_136, v_136BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_136 [[buffer(0)]], const device ConfigBuf& _710 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + constant uint& v_136BufferSize = spvBufferSizeConstants[0]; + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef ref = PathSegRef{ _710.conf.pathseg_alloc.offset + (element_ix * 52u) }; + PathSegTag tag = PathSegTag{ 0u, 0u }; + if (element_ix < _710.conf.n_pathseg) + { + Alloc param; + param.offset = _710.conf.pathseg_alloc.offset; + PathSegRef param_1 = ref; + tag = PathSeg_tag(param, param_1, v_136, v_136BufferSize); + } + bool mem_ok = v_136.mem_error == 0u; + switch (tag.tag) + { + case 1u: + { + Alloc param_2; + param_2.offset = _710.conf.pathseg_alloc.offset; + PathSegRef param_3 = ref; + PathCubic cubic = PathSeg_Cubic_read(param_2, param_3, v_136, v_136BufferSize); + float2 err_v = (((cubic.p2 - cubic.p1) * 3.0) + cubic.p0) - cubic.p3; + float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); + uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875, 0.16666667163372039794921875))), 1u); + n_quads = min(n_quads, 16u); + float val = 0.0; + float2 qp0 = cubic.p0; + float _step = 1.0 / float(n_quads); + spvUnsafeArray keep_params; + for (uint i = 0u; i < n_quads; i++) + { + float t = float(i + 1u) * _step; + float2 param_4 = cubic.p0; + float2 param_5 = cubic.p1; + float2 param_6 = cubic.p2; + float2 param_7 = cubic.p3; + float param_8 = t; + float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8); + float2 param_9 = cubic.p0; + float2 param_10 = cubic.p1; + float2 param_11 = cubic.p2; + float2 param_12 = cubic.p3; + float param_13 = t - (0.5 * _step); + float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13); + qp1 = (qp1 * 2.0) - ((qp0 + qp2) * 0.5); + float2 param_14 = qp0; + float2 param_15 = qp1; + float2 param_16 = qp2; + float param_17 = 0.4743416607379913330078125; + SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17); + keep_params[i] = params; + val += params.val; + qp0 = qp2; + } + uint n = max(uint(ceil((val * 0.5) / 0.4743416607379913330078125)), 1u); + uint param_18 = tag.flags; + bool is_stroke = fill_mode_from_flags(param_18) == 1u; + uint path_ix = cubic.path_ix; + Alloc param_19; + param_19.offset = _710.conf.tile_alloc.offset; + PathRef param_20 = PathRef{ _710.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_19, param_20, v_136, v_136BufferSize); + uint param_21 = path.tiles.offset; + uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_23 = mem_ok; + Alloc path_alloc = new_alloc(param_21, param_22, param_23); + int4 bbox = int4(path.bbox); + float2 p0 = cubic.p0; + qp0 = cubic.p0; + float v_step = val / float(n); + int n_out = 1; + float val_sum = 0.0; + float2 p1; + float _1147; + TileSeg tile_seg; + for (uint i_1 = 0u; i_1 < n_quads; i_1++) + { + float t_1 = float(i_1 + 1u) * _step; + float2 param_24 = cubic.p0; + float2 param_25 = cubic.p1; + float2 param_26 = cubic.p2; + float2 param_27 = cubic.p3; + float param_28 = t_1; + float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28); + float2 param_29 = cubic.p0; + float2 param_30 = cubic.p1; + float2 param_31 = cubic.p2; + float2 param_32 = cubic.p3; + float param_33 = t_1 - (0.5 * _step); + float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33); + qp1_1 = (qp1_1 * 2.0) - ((qp0 + qp2_1) * 0.5); + SubdivResult params_1 = keep_params[i_1]; + float param_34 = params_1.a0; + float u0 = approx_parabola_inv_integral(param_34); + float param_35 = params_1.a2; + float u2 = approx_parabola_inv_integral(param_35); + float uscale = 1.0 / (u2 - u0); + float target = float(n_out) * v_step; + for (;;) + { + bool _1040 = uint(n_out) == n; + bool _1050; + if (!_1040) + { + _1050 = target < (val_sum + params_1.val); + } + else + { + _1050 = _1040; + } + if (_1050) + { + if (uint(n_out) == n) + { + p1 = cubic.p3; + } + else + { + float u = (target - val_sum) / params_1.val; + float a = mix(params_1.a0, params_1.a2, u); + float param_36 = a; + float au = approx_parabola_inv_integral(param_36); + float t_2 = (au - u0) * uscale; + float2 param_37 = qp0; + float2 param_38 = qp1_1; + float2 param_39 = qp2_1; + float param_40 = t_2; + p1 = eval_quad(param_37, param_38, param_39, param_40); + } + float xmin = fast::min(p0.x, p1.x) - cubic.stroke.x; + float xmax = fast::max(p0.x, p1.x) + cubic.stroke.x; + float ymin = fast::min(p0.y, p1.y) - cubic.stroke.y; + float ymax = fast::max(p0.y, p1.y) + cubic.stroke.y; + float dx = p1.x - p0.x; + float dy = p1.y - p0.y; + if (abs(dy) < 9.999999717180685365747194737196e-10) + { + _1147 = 1000000000.0; + } + else + { + _1147 = dx / dy; + } + float invslope = _1147; + float c = (cubic.stroke.x + (abs(invslope) * (8.0 + cubic.stroke.y))) * 0.0625; + float b = invslope; + float a_1 = (p0.x - ((p0.y - 8.0) * b)) * 0.0625; + int x0 = int(floor(xmin * 0.0625)); + int x1 = int(floor(xmax * 0.0625) + 1.0); + int y0 = int(floor(ymin * 0.0625)); + int y1 = int(floor(ymax * 0.0625) + 1.0); + x0 = clamp(x0, bbox.x, bbox.z); + y0 = clamp(y0, bbox.y, bbox.w); + x1 = clamp(x1, bbox.x, bbox.z); + y1 = clamp(y1, bbox.y, bbox.w); + float xc = a_1 + (b * float(y0)); + int stride = bbox.z - bbox.x; + int base = ((y0 - bbox.y) * stride) - bbox.x; + uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); + uint param_41 = n_tile_alloc * 24u; + MallocResult _1263 = malloc(param_41, v_136, v_136BufferSize); + MallocResult tile_alloc = _1263; + if (tile_alloc.failed || (!mem_ok)) + { + return; + } + uint tile_offset = tile_alloc.alloc.offset; + int xray = int(floor(p0.x * 0.0625)); + int last_xray = int(floor(p1.x * 0.0625)); + if (p0.y > p1.y) + { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } + for (int y = y0; y < y1; y++) + { + float tile_y0 = float(y * 16); + int xbackdrop = max((xray + 1), bbox.x); + bool _1319 = !is_stroke; + bool _1329; + if (_1319) + { + _1329 = fast::min(p0.y, p1.y) < tile_y0; + } + else + { + _1329 = _1319; + } + bool _1336; + if (_1329) + { + _1336 = xbackdrop < bbox.z; + } + else + { + _1336 = _1329; + } + if (_1336) + { + int backdrop = (p1.y < p0.y) ? 1 : (-1); + TileRef param_42 = path.tiles; + uint param_43 = uint(base + xbackdrop); + TileRef tile_ref = Tile_index(param_42, param_43); + uint tile_el = tile_ref.offset >> uint(2); + Alloc param_44 = path_alloc; + uint param_45 = tile_el + 1u; + if (touch_mem(param_44, param_45)) + { + uint _1374 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed); + } + } + int next_xray = last_xray; + if (y < (y1 - 1)) + { + float tile_y1 = float((y + 1) * 16); + float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy); + next_xray = int(floor(x_edge * 0.0625)); + } + int min_xray = min(xray, next_xray); + int max_xray = max(xray, next_xray); + int xx0 = min(int(floor(xc - c)), min_xray); + int xx1 = max(int(ceil(xc + c)), (max_xray + 1)); + xx0 = clamp(xx0, x0, x1); + xx1 = clamp(xx1, x0, x1); + for (int x = xx0; x < xx1; x++) + { + float tile_x0 = float(x * 16); + TileRef param_46 = TileRef{ path.tiles.offset }; + uint param_47 = uint(base + x); + TileRef tile_ref_1 = Tile_index(param_46, param_47); + uint tile_el_1 = tile_ref_1.offset >> uint(2); + uint old = 0u; + Alloc param_48 = path_alloc; + uint param_49 = tile_el_1; + if (touch_mem(param_48, param_49)) + { + uint _1477 = atomic_exchange_explicit((device atomic_uint*)&v_136.memory[tile_el_1], tile_offset, memory_order_relaxed); + old = _1477; + } + tile_seg.origin = p0; + tile_seg.vector = p1 - p0; + float y_edge = 0.0; + if (!is_stroke) + { + y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx); + if (fast::min(p0.x, p1.x) < tile_x0) + { + float2 p = float2(tile_x0, y_edge); + if (p0.x > p1.x) + { + tile_seg.vector = p - p0; + } + else + { + tile_seg.origin = p; + tile_seg.vector = p1 - p; + } + if (tile_seg.vector.x == 0.0) + { + tile_seg.vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10; + } + } + if ((x <= min_xray) || (max_xray < x)) + { + y_edge = 1000000000.0; + } + } + tile_seg.y_edge = y_edge; + tile_seg.next.offset = old; + Alloc param_50 = tile_alloc.alloc; + TileSegRef param_51 = TileSegRef{ tile_offset }; + TileSeg param_52 = tile_seg; + TileSeg_write(param_50, param_51, param_52, v_136, v_136BufferSize); + tile_offset += 24u; + } + xc += b; + base += stride; + xray = next_xray; + } + n_out++; + target += v_step; + p0 = p1; + continue; + } + else + { + break; + } + } + val_sum += params_1.val; + qp0 = qp2_1; + } + break; + } + } +} + diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv similarity index 100% rename from piet-gpu/shader/path_coarse.spv rename to piet-gpu/shader/gen/path_coarse.spv diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl index c7f7df0..a9cee25 100644 --- a/piet-gpu/shader/gen/pathseg.hlsl +++ b/piet-gpu/shader/gen/pathseg.hlsl @@ -77,10 +77,10 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); static const TagMonoid _135 = { 0u, 0u, 0u, 0u, 0u }; static const Monoid _567 = { 0.0f.xxxx, 0u }; -RWByteAddressBuffer _111 : register(u0); -ByteAddressBuffer _574 : register(t2); -ByteAddressBuffer _639 : register(t1); -ByteAddressBuffer _709 : register(t3); +RWByteAddressBuffer _111 : register(u0, space0); +ByteAddressBuffer _574 : register(t2, space0); +ByteAddressBuffer _639 : register(t1, space0); +ByteAddressBuffer _709 : register(t3, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl index dd7c611..291243e 100644 --- a/piet-gpu/shader/gen/pathtag_reduce.hlsl +++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl @@ -36,10 +36,10 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u); -ByteAddressBuffer _139 : register(t1); -ByteAddressBuffer _150 : register(t2); -RWByteAddressBuffer _238 : register(u3); -RWByteAddressBuffer _258 : register(u0); +ByteAddressBuffer _139 : register(t1, space0); +ByteAddressBuffer _150 : register(t2, space0); +RWByteAddressBuffer _238 : register(u3, space0); +RWByteAddressBuffer _258 : register(u0, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/pathtag_root.hlsl b/piet-gpu/shader/gen/pathtag_root.hlsl index 388f99d..f1ec389 100644 --- a/piet-gpu/shader/gen/pathtag_root.hlsl +++ b/piet-gpu/shader/gen/pathtag_root.hlsl @@ -11,7 +11,7 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); static const TagMonoid _18 = { 0u, 0u, 0u, 0u, 0u }; -RWByteAddressBuffer _78 : register(u0); +RWByteAddressBuffer _78 : register(u0, space0); static uint3 gl_LocalInvocationID; static uint3 gl_GlobalInvocationID; diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil new file mode 100644 index 0000000000000000000000000000000000000000..d69db16005e41c4d04ccba4f03161ff131490b28 GIT binary patch literal 5048 zcmeHLeN+=y7Jr!xGZPZRgan$D4=3`midu&V2m)?C0E0jq4%Ei2ZIhsg6)=1p3es&S zA0*LYjT&1N+Y>}vx9hrCwSu*6BB@3EXt365OD(pDc8^=Bb-QY9yKfL|chA|s_Ut*` zJ$>iAdH4R_y?5??bCX{txq7wcrMQfb?G3XJo_I@GoYlSaaSi~WM+^WC*K9b8;9LS{ z6`WUDodBQ`&eHq>8ZP%nLFs~&Nu6CE`WIbSx`=LqIDOCKpM@Xc2>Ce#6Jc{Ugdb;`lO&PujC!X_=}_4~sH@5O5X-$ki7Vv4Kdqr-Zm#Ou?rGI2w5EZoQaXf;;eNUj2MNsmcY~=xu1%SFyg0y}md*In4aq8d+L>kR`lHAwGhXyk&eoQz>0)i2uiji3C!+Ht5+HUPcZ*{ao?qf&S|ti zEY!|Oxz%6Y#X3%V7t;dek`iJ(3~(K`Fnv3wZ?_-&tb0w+A)lJCqHC?yr31RAt*Ln) z`qqrYMsa7DCRckpKQctr3B0w>0||@+#F9Fnv%%m@uXHw7a-kgdXPvM*C|nJ%Abd5A z*Lm@J3a@qxYbS-KNukz&vj#!z%b%GwD)H3@ydj8 zI1>@Zk$K?>Zh3rOE@%XFW_bHHJbPq1!=^Ys4E^gJ#vfz)WnXVzmLOFPAES2dtAgYp)fPy z{e@6b#I;Lhq9S)2FKNM1RD)E2g$C3s9oSBWy1hWYu#*_@(gTaVzEFF&2qg3E{W1`- z9DrunYmLzVer}OF9}(xKNz3{5S=_xdHgy!X)N{Ag_HtC6{H%_a4auhD(5=b0xAt0^ zQwr<(<%Q=-RE!R9p+FEbQt1S$GFH`CL2k5zD^DU!084ng!ti7g!R(jIJ*`sA0n*Wv zs6{Ye_(Y~cjPCxVn+(q-x#B!gx7?jho*$pK#n z1wKGI-gLpIbQR@V_f5KFGp3u;NoV$UAD(?^U+){U_huh@y_bjRzUFP(#6SMs_N!4A?B}1{-#ZZurvxtMJc`X!Ib^1mwS*dZ{c@!ly4YKUodyB z3ZzD?sA;GdqlS3(BaXkn`)ySIQTrajwUy6*RlKJy@3F1XpAMm%;dH%`!pfD{G6S}p z!B%*&m0qkWh}Dc?W-nIjJLZomGnVgLSzi!8qrR&n0)4oDY{}Juk|F;_=N{^v?*FJ22DAdwlVR^`Q!{vT6JFn5D;Ndge=9#J1-&Nguq`X7$=uJ)K5WajbF)s2Ng54!OC>(H(pd+ujMMCPnhkj4G`#U< z2n_oa<9vc})=Hgq?ato28#KWk|{1=!h|v6*%w=YbgL!a7{E|md|3o-}&nKORL&X zKC|B9TYNxPvG23;70r(?|G?t&@<*)?0~z{E{f!r2u6XesWBPYF7_$7s3f6e%rRJB4 zam2Pq&(Tjtxpwohr=XuKwO#N#b=s1*zq#ofp))N%PV z5RJwktQMRx-#YaLURFYud${Y-eu^$nm3wt0*A9_cS{9j|i_#)_P{22a ze!#j!2hPX^gId=I+C@~@i`TM$Xn+PTHiBp=YyaC=5l8Sjm2Mc(t%sm2n*|?AQ`&F6 zTm1>}11(EWK&vX%j(=f8*=*lqyFIE;y+RK}l)2`)SgzGV;$yk@G-QXLb1x0?N$x!c z_x>-qciaCy_r3z3<1d?g*{-`gekt6`{gKc9 zdy6o|4aNS~5p|+OvR^)yL@YB@OAmz$(gjPTjNhRW1c2bR?!D&^^zPWOxp%ObXH8z( z^g!dXy80EXYD$sqJlV_7JtpmVY|D_xw@c%=tZ+u50k#_r>1+%_rab z@YuS|CyRMI=0V?OH4Symx>YP8mm*=m^4YI^_Fv&MBL#UKlDrtzvp=Z;`yPl&OiGnR zn<&Yj9$7jalT<1TJWIkn>hqEZZ-skQwU}p>@=HvjQ*Hk7M^&d}sWQ)_5s|6zt;^FU zVkqMCxG+tu1-@^(F%XaOfYsoB%aXRsa=*v#QXvc_U1~N*uJmCQw8;P{0>k5!RX>T< zv-L-QCf2wwtWb&V z6Ie*eMNFrPzB1wVEQyx{$K6z~?n93#Ln!nP(0egTT0@*+1QjxD>a?TbUhBMI=orWs zkO1{Fnvk3zQYtMbn5p-$M_cD#v6iveyxWs@^&vW7VDu(|5L?}f(6&~Z=8=_;+-U=<;JR3w-iO#ml7W6k-GwytS54hT2{ZlMkOd@9r7+ZaM2H=Mm?yBU&$~Y>ORz#A}gu zz26&go791}P+@yw1}xLhtQB$Dn#@Jk*t5oSjLyoTN2VC)Au`C7o={ME^pKK&8NfE+ zi6yndxu$Nmqe>TU<38ZBYu#u88js= znD!h!;a7B+o6+7^(2W-E4$Sl`s+1$S3ebnyA0)VRp|4FU1%%ZyM|_J3_lSMom%vf8 zCKzF$j|VgS6Wym!MT~Su&?&G#_o1g?^Q97VhR6U4lg)l)vWJVkfy{oi7-r)$_hq6> z>Eoc-Z`RB^!ILTwY z*py1Pv#!l*R3{(@4g7;<{=sQ$l7omN8nMIASMrSZ#2%W0Z~g3lW{)}^Ap(ZuZoWlY YC3{*<+Gg1bP*eA^o;6DN&(u%jAI3Osk^lez literal 0 HcmV?d00001 diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl new file mode 100644 index 0000000..010e714 --- /dev/null +++ b/piet-gpu/shader/gen/tile_alloc.hlsl @@ -0,0 +1,335 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoEndClipRef +{ + uint offset; +}; + +struct AnnoEndClip +{ + float4 bbox; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _92 : register(u0, space0); +ByteAddressBuffer _305 : register(t1, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared uint sh_tile_count[256]; +groupshared MallocResult sh_tile_alloc; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _92.Load(offset * 4 + 8); + return v; +} + +AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + AnnotatedTag _236 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _236; +} + +AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + AnnoEndClip s; + s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + return s; +} + +AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) +{ + AnnoEndClipRef _243 = { ref.offset + 4u }; + Alloc param = a; + AnnoEndClipRef param_1 = _243; + return AnnoEndClip_read(param, param_1); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +MallocResult malloc(uint size) +{ + uint _98; + _92.InterlockedAdd(0, size, _98); + uint offset = _98; + uint _105; + _92.GetDimensions(_105); + _105 = (_105 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_105) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _127; + _92.InterlockedMax(4, 1u, _127); + return r; + } + return r; +} + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _169 = { a.offset + offset }; + return _169; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _92.Store(offset * 4 + 8, val); +} + +void Path_write(Alloc a, PathRef ref, Path s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.bbox.x | (s.bbox.y << uint(16)); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = s.tiles.offset; + write_mem(param_6, param_7, param_8); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationID.x; + uint element_ix = gl_GlobalInvocationID.x; + PathRef _312 = { _305.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _312; + AnnotatedRef _321 = { _305.Load(32) + (element_ix * 40u) }; + AnnotatedRef ref = _321; + uint tag = 0u; + if (element_ix < _305.Load(0)) + { + Alloc _332; + _332.offset = _305.Load(32); + Alloc param; + param.offset = _332.offset; + AnnotatedRef param_1 = ref; + tag = Annotated_tag(param, param_1).tag; + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + switch (tag) + { + case 1u: + case 2u: + case 3u: + case 4u: + case 5u: + { + Alloc _350; + _350.offset = _305.Load(32); + Alloc param_2; + param_2.offset = _350.offset; + AnnotatedRef param_3 = ref; + AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3); + x0 = int(floor(clip.bbox.x * 0.0625f)); + y0 = int(floor(clip.bbox.y * 0.0625f)); + x1 = int(ceil(clip.bbox.z * 0.0625f)); + y1 = int(ceil(clip.bbox.w * 0.0625f)); + break; + } + } + x0 = clamp(x0, 0, int(_305.Load(8))); + y0 = clamp(y0, 0, int(_305.Load(12))); + x1 = clamp(x1, 0, int(_305.Load(8))); + y1 = clamp(y1, 0, int(_305.Load(12))); + Path path; + path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1)); + uint tile_count = uint((x1 - x0) * (y1 - y0)); + if (tag == 5u) + { + tile_count = 0u; + } + sh_tile_count[th_ix] = tile_count; + uint total_tile_count = tile_count; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if (th_ix >= uint(1 << int(i))) + { + total_tile_count += sh_tile_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + sh_tile_count[th_ix] = total_tile_count; + } + if (th_ix == 255u) + { + uint param_4 = total_tile_count * 8u; + MallocResult _477 = malloc(param_4); + sh_tile_alloc = _477; + } + GroupMemoryBarrierWithGroupSync(); + MallocResult alloc_start = sh_tile_alloc; + bool _488; + if (!alloc_start.failed) + { + _488 = _92.Load(4) != 0u; + } + else + { + _488 = alloc_start.failed; + } + if (_488) + { + return; + } + if (element_ix < _305.Load(0)) + { + uint _501; + if (th_ix > 0u) + { + _501 = sh_tile_count[th_ix - 1u]; + } + else + { + _501 = 0u; + } + uint tile_subix = _501; + Alloc param_5 = alloc_start.alloc; + uint param_6 = 8u * tile_subix; + uint param_7 = 8u * tile_count; + Alloc tiles_alloc = slice_mem(param_5, param_6, param_7); + TileRef _523 = { tiles_alloc.offset }; + path.tiles = _523; + Alloc _528; + _528.offset = _305.Load(16); + Alloc param_8; + param_8.offset = _528.offset; + PathRef param_9 = path_ref; + Path param_10 = path; + Path_write(param_8, param_9, param_10); + } + uint total_count = sh_tile_count[255] * 2u; + uint start_ix = alloc_start.alloc.offset >> uint(2); + for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) + { + Alloc param_11 = alloc_start.alloc; + uint param_12 = start_ix + i_1; + uint param_13 = 0u; + write_mem(param_11, param_12, param_13); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl new file mode 100644 index 0000000..3906536 --- /dev/null +++ b/piet-gpu/shader/gen/tile_alloc.msl @@ -0,0 +1,336 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct AnnoEndClipRef +{ + uint offset; +}; + +struct AnnoEndClip +{ + float4 bbox; +}; + +struct AnnotatedRef +{ + uint offset; +}; + +struct AnnotatedTag +{ + uint tag; + uint flags; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + Alloc_1 drawmonoid_alloc; + uint n_trans; + uint n_path; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_92, constant uint& v_92BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_92.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_92, v_92BufferSize); + return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_92, constant uint& v_92BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_92, v_92BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_92, v_92BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_92, v_92BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_92, v_92BufferSize); + AnnoEndClip s; + s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + return s; +} + +static inline __attribute__((always_inline)) +AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize) +{ + Alloc param = a; + AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u }; + return AnnoEndClip_read(param, param_1, v_92, v_92BufferSize); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_92, constant uint& v_92BufferSize) +{ + uint _98 = atomic_fetch_add_explicit((device atomic_uint*)&v_92.mem_offset, size, memory_order_relaxed); + uint offset = _98; + MallocResult r; + r.failed = (offset + size) > uint(int((v_92BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _127 = atomic_fetch_max_explicit((device atomic_uint*)&v_92.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_92, constant uint& v_92BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_92.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_92, constant uint& v_92BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.bbox.x | (s.bbox.y << uint(16)); + write_mem(param, param_1, param_2, v_92, v_92BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); + write_mem(param_3, param_4, param_5, v_92, v_92BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = s.tiles.offset; + write_mem(param_6, param_7, param_8, v_92, v_92BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _305 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + threadgroup uint sh_tile_count[256]; + threadgroup MallocResult sh_tile_alloc; + constant uint& v_92BufferSize = spvBufferSizeConstants[0]; + uint th_ix = gl_LocalInvocationID.x; + uint element_ix = gl_GlobalInvocationID.x; + PathRef path_ref = PathRef{ _305.conf.tile_alloc.offset + (element_ix * 12u) }; + AnnotatedRef ref = AnnotatedRef{ _305.conf.anno_alloc.offset + (element_ix * 40u) }; + uint tag = 0u; + if (element_ix < _305.conf.n_elements) + { + Alloc param; + param.offset = _305.conf.anno_alloc.offset; + AnnotatedRef param_1 = ref; + tag = Annotated_tag(param, param_1, v_92, v_92BufferSize).tag; + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + switch (tag) + { + case 1u: + case 2u: + case 3u: + case 4u: + case 5u: + { + Alloc param_2; + param_2.offset = _305.conf.anno_alloc.offset; + AnnotatedRef param_3 = ref; + AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_92, v_92BufferSize); + x0 = int(floor(clip.bbox.x * 0.0625)); + y0 = int(floor(clip.bbox.y * 0.0625)); + x1 = int(ceil(clip.bbox.z * 0.0625)); + y1 = int(ceil(clip.bbox.w * 0.0625)); + break; + } + } + x0 = clamp(x0, 0, int(_305.conf.width_in_tiles)); + y0 = clamp(y0, 0, int(_305.conf.height_in_tiles)); + x1 = clamp(x1, 0, int(_305.conf.width_in_tiles)); + y1 = clamp(y1, 0, int(_305.conf.height_in_tiles)); + Path path; + path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1)); + uint tile_count = uint((x1 - x0) * (y1 - y0)); + if (tag == 5u) + { + tile_count = 0u; + } + sh_tile_count[th_ix] = tile_count; + uint total_tile_count = tile_count; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix >= uint(1 << int(i))) + { + total_tile_count += sh_tile_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tile_count[th_ix] = total_tile_count; + } + if (th_ix == 255u) + { + uint param_4 = total_tile_count * 8u; + MallocResult _477 = malloc(param_4, v_92, v_92BufferSize); + sh_tile_alloc = _477; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + MallocResult alloc_start = sh_tile_alloc; + bool _488; + if (!alloc_start.failed) + { + _488 = v_92.mem_error != 0u; + } + else + { + _488 = alloc_start.failed; + } + if (_488) + { + return; + } + if (element_ix < _305.conf.n_elements) + { + uint _501; + if (th_ix > 0u) + { + _501 = sh_tile_count[th_ix - 1u]; + } + else + { + _501 = 0u; + } + uint tile_subix = _501; + Alloc param_5 = alloc_start.alloc; + uint param_6 = 8u * tile_subix; + uint param_7 = 8u * tile_count; + Alloc tiles_alloc = slice_mem(param_5, param_6, param_7); + path.tiles = TileRef{ tiles_alloc.offset }; + Alloc param_8; + param_8.offset = _305.conf.tile_alloc.offset; + PathRef param_9 = path_ref; + Path param_10 = path; + Path_write(param_8, param_9, param_10, v_92, v_92BufferSize); + } + uint total_count = sh_tile_count[255] * 2u; + uint start_ix = alloc_start.alloc.offset >> uint(2); + for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) + { + Alloc param_11 = alloc_start.alloc; + uint param_12 = start_ix + i_1; + uint param_13 = 0u; + write_mem(param_11, param_12, param_13, v_92, v_92BufferSize); + } +} + diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv similarity index 79% rename from piet-gpu/shader/tile_alloc.spv rename to piet-gpu/shader/gen/tile_alloc.spv index b443b031d0471f4fe289b6cae2795e3975dca240..d4a6e31710c5991852f7811fe336203f50b6c276 100644 GIT binary patch delta 2552 zcmZ9N+iz4=6vp>VJEg51p%kd43N3B5(juY)AqEo?P1Ofu@yRGDCxoPp)f!rfFXpK6 zA0Q__nCQETF*XyQ^ucIU#0!EK>J9IA@q(6{BL05UvvKGoyX*VDwf5R;ud{m2%%QW=9Q9${|2!Kiw^l~Wqs798%E5(ql!o7@ z^0VygqT(5FJ-C&1ZO-kDn*J8@(7G_h8h{UDJJ8%e7#hMVVmDMCSbAF_v2});#&+7e z$hF7j@!1-_48IMH@p79PxwaweZtqO>*wE;?0d7MhzX@(bBiGu{T5dCIxs8n6{5CUk zt@c6(1}-$Bsx zta#cXu-Z;4Gg9B7?6IEC^gWPgJWXAVcs~HEIYlRkJdE{sntSQ{kfnB#I>oWJkHOaV zNPfRhz-lkDJkjw{+l1d~{0zJ#gcVhX0gk1j4TuLE&W3wC)6Y@$8s*fLqIRW-(9(~7I{}e)6 zO(cnlPJ&Go|6NXj)wZ)aj_qlM^E(_C!rgF^^IA0=_`%!4BP4&aZB4#+DFuc(Cs_xwrxA zLE;5i6K-Nc+|vjiz#4Cye&c*s>hnO~Oif_5t>9Q_Gg_+SR-VZajtjtwZt}feX6<29 z{GXQ_;{@G|z7V`FPkL9Lcny3Jy1rrZ_$F%wtNDusEr#=&POBAJv4VkCcRhhiz=I^y z=lff(??!yfz!s*@ce&h-U_v{8Zo&EuaG!pkbsSo)MNDIt7rC~B>VbYN)a-(-T>(ClB^_<)N|5a3{N7z)wLSSxKPzhSq*Y)?-K>6M!W$OpmZjvs?H zU}Mznr6awz$ShZ98sbLB^GX8Nvb>=mO}W?)rb)zCVH0BG1H2mE1vlO}{odLRSg1A@ db`AJkVf}1ckD$1>rmbqVZSAW(y84wx{{eL6KQ;gW delta 2543 zcmZ9NO>C4^6o&6iXG&W;LMf%zQ7CP}N{dn}R$?$|qCs33iwhUp)CnQ6v06hbaiJMC zx^PKm?aGZ2V{9faOk5aE6hV=SKUGxxS2W^JM+hMwa^Z9I2TQw3_a{6b zj=~fF*wl1s*EBKtFkgARxwY*)9;Z*G^PQcmywbe7+>ou!kqb)~@fC~s%K!0v=zvd8 zPM0Rec1%9>z`~lV(2@$q#vi6Ms;CQHXya+!SO1rza%Z6>Z#9F}(ZWlm{v6kDyT5kIycV%0c z^2&ksLZds94_0T}pKU7JgMAdc6Z;nS5#06Qx54U9@J{6W!EZ9uhtX!R2N*9h^leG! zy_4#ZzYDK29!v8a1gjZ$W2mc7oChn%y81S~hqQ(fryc@3H7+*`b~)pi8svv6+q#Qu z-v_IKstGOv7^&PG}-CZny0CJ7zDW?JNM_@IN(J3P5us&zmj=ql>YNtas!Lha@ zXx8>*x~s|Cpb>V90R+qyD{GU*6=gd#~O_JiJ@i<;&UOBV70Wp%fl~_UdAwo&jOEE zMmyTezoHyxyp)#p8`wy7m-++i<5K#5XQ;W9IKI6MJIW& zU(DFUr1+ng8{-MO6@3|aAWeE-ns@`e1zq0=dA#VBgVlUuL9KA^>2gw$6)PBMbvF^% z2JR=BJ}+>&zFY9MgDp&-m$}@IV8SK*+=lfd;5_}l>v(9j7BPb@X4$p}$xl{n%Vppq zBfh`O!D_FO)CK)osM!TuyBd77QtD_ecY4P7J<=PqoaN+9L!9XG93WsL!@l{|l#8#ynnXN^EfE`E;I;5hxbep6ch`2n gLbb85P2dZK^)l%?1jW5IZB?sn>$>Vw>)%-RFN+O5sQ>@~ diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl index 6fa9267..7744e0f 100644 --- a/piet-gpu/shader/gen/transform_leaf.hlsl +++ b/piet-gpu/shader/gen/transform_leaf.hlsl @@ -51,10 +51,10 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; -RWByteAddressBuffer _71 : register(u0); -ByteAddressBuffer _96 : register(t2); -ByteAddressBuffer _278 : register(t1); -ByteAddressBuffer _377 : register(t3); +RWByteAddressBuffer _71 : register(u0, space0); +ByteAddressBuffer _96 : register(t2, space0); +ByteAddressBuffer _278 : register(t1, space0); +ByteAddressBuffer _377 : register(t3, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl index 60addf3..5ada811 100644 --- a/piet-gpu/shader/gen/transform_reduce.hlsl +++ b/piet-gpu/shader/gen/transform_reduce.hlsl @@ -38,10 +38,10 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); -ByteAddressBuffer _49 : register(t2); -ByteAddressBuffer _161 : register(t1); -RWByteAddressBuffer _251 : register(u3); -RWByteAddressBuffer _267 : register(u0); +ByteAddressBuffer _49 : register(t2, space0); +ByteAddressBuffer _161 : register(t1, space0); +RWByteAddressBuffer _251 : register(u3, space0); +RWByteAddressBuffer _267 : register(u0, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/transform_root.hlsl b/piet-gpu/shader/gen/transform_root.hlsl index 42bbd38..35961b1 100644 --- a/piet-gpu/shader/gen/transform_root.hlsl +++ b/piet-gpu/shader/gen/transform_root.hlsl @@ -8,7 +8,7 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; -RWByteAddressBuffer _89 : register(u0); +RWByteAddressBuffer _89 : register(u0, space0); static uint3 gl_LocalInvocationID; static uint3 gl_GlobalInvocationID; diff --git a/piet-gpu/shader/kernel4_idx.spv b/piet-gpu/shader/kernel4_idx.spv deleted file mode 100644 index 953eae12634cefc40bc72f7f66b2ecb73c362c80..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 34676 zcmb812Y_8w-R%#|P3Y34LkIza^cH#`w1D)EAPkdak_;p>VKNC-AfbbR(m|S_Ac%?$ zL+s#LpFP@^)I`ga?hQaj``PEpsMDp7Oobo z#tx~P$CB0jC{<;B=Q35(cEG*|ckXiNft?e&r;qRNnKsbZ-`UeUq5Jrr-bvLW)zGH@ z*gX%}bA#yv6E@gv^NmdOk*cGa&s>XF3sk0WP^NbE^lCVB{;Il#vIOz!OHvl3h>hvs zKg`vGkcR!+v)P5-zTO$VJrn!-r*ISA;Q@PAKkJ==5L=vH&mZO8GgP2Cz_M>P^`vqSiAT`)nr_Dq@59Etlx&12p; z8r25y+OIhVxkvjL)E&dPuJOlA=u5s`2N?|R? zzn<%DTX}b%+BK=UZS1R>N7>iy;k4iM)25%ANAu!E9nHC(7rR8Y6Fjm1weRUk(f2XL zr|&z0)AwCkdB;qdHn}VFtmZMV{UOy@IPEw6wAoKJk1m6$-4A*3I5jby z`242}1nsuE>0bQ=G3n!B;8q_Oua2s7c`p9f!OzWTKRT)-i=3TB&h9#A>%#dz&uUKZ zVd>|DBInU{&gK^RpQmj0lgsPeoIOR(zB*^~)c#-QoLc1UEpl?ZZLUr8l>1-i;r31Prv8t`}N6n+$pQpfUzvh0Fd-V19 zO!6toJ*ws&oykLt*Vbr08=cVEJz)|XB3cJpZ};&|6Ol_)v#IO+|9NQhIy>mxJB|44 zms#M}oDHeY0T1*|-+%0mBWF(B)aRUz>TKEQgrV-RODNrx|(|YwQ%2(R>!HpDm-$ zSL@yV{cZughcx}&1Nb*{*w1_69^#)-_pmAL$K&AsuH%}|Hhy9opWVjKXyc!5;}^E^%iH*sZTyBdeq$TIt&QJ4h<8+Xz%zf} z0+;i5*C5|Ub$1)Tr{zC){qJkD-`~a`Z251(Hg2A0kF?nzZR0;~`EMHkpSRf`YvaFc z`EM5gC)@15ZsWgc`EMTo=iBVR8^k-Rm*B1a(x~18kDoEXg|{`9`M5ZZ>z>l*J<@B= zP1Vf`Q5V+x$;}&%5U|s@d=XKZCMqyr=nIW73q)L;CuU8Qb4C zW7_UJxAL5eUcN_=dz8-u=b`P?*E_MN`Mf%`l6&-ac2DV^+TA-qpS&liGu`OZx&|gs z@5UWX!=9hoFxmRo*p3ABsM0 zVEmLA(*6SKtzN}6x6cb=>+0?8iy^rdLLcbw>YZL2{7ilR+_~6s23v|8Ee*&%=v6}L zs4l4I$e2sP-kZ}`*Y7H{a-Uq^#&2ljH`X~?&pM6jX87?Zbk6AM9iW8;oA0bx-#z=r zcTL&7cP7<9PhYRA-0IJrP5&0Y8{FE1jp_mLynQpIdK5l>O3yT2s!eOXqi9q=LTldt zk7>pK3hrkW$MFQX9LG~_{Eu*+VtAQiY@_;9oBfSJyrX&(PBZv+RPTZ(G|yXBl}*yz z2W%@I8t2aQg6Qrvb1w`o`?6RY|418OvW+h_h!3fjg*V^T@sOK&Tpq2}zeY6-JYia& zmzrk#BjNMf@2EC_`_SuH_9)^G0C(}KCvg+uKKF%Bg3rsty=)NgsHVU(N3+2h|LKGL z8r4~C{OmSAu5bvliglDXmgQuR*>EmW|tXB^5ZB*C7rw?@XXS_GUyL%@zeQyCD z$IA^by3WO2@c2CdKEA%DJuaTwdJpS*KGWn~euQbpy#OCq&yD8(1K5cb zufoM6^R($)+^BX2&wF3dsCI#u^F4MD=Q*Z`Jpx=_?~feh+o+Bj#QAIspU8#K@l0s5 zO@?QFr#1aLrTFq$M(*cxALyG}qdIp`e4{#l5NEyMt>=F}s|>O=s?QGM9o41qw0#Yj zdrjSTM|J01TSs*tctYK$N5JEzu&TVr=EG5QOpl|@o4W<7r{?-JXSnukt;g%wT5Wbz zZ-XZ^^LAA47*o&DyWn~GI;s!gnZK2|=+Cn?s#OMY-h;w(UE5IX{j#3BO$PZksx1pX zmc8OQ+FBpt7sd;a*4P)8b;70TF)Nrsz`06d%^nKri9L6=jtf;ZiJ|60g0uL#$&qMK9sKARB zc(DR6QQ##Dyi|dgDe&?IUa7!7XQka$3cPB8efEmaumZ1M;Nb;cv%qT=c`Rp2cOyk&v6DzMLpX=}Rzk1p``1s+r29Sgiufp;$OE(PAT zz+(%%TY)Fe#jWSg;jQj9_uA3e+-t49*wK70T^jD5(zbk!eLnS2pI+dB0?#P0&#&=6 zslcbs#f|1O>>@a=Nb%gsv#RHghnmlz9_?EDIi{bk`gx8e)^Vx%{a7^5OSL5mjmt+< zTc*%F*Y$G_61zaHtz2lfr{Azb^I21EL{sD1UXR1M-HmfTJh{V-@o=8iI-0oAeEuBX zv}kK_*Ufp5`yHoqxl)sNG@k>3%{)GL1#e?Q4)gom5`0*Jk1KG8lf|L`(ghw_;B^Z8 zu>yOqOzflrdvA<>Y=H*~JfpxT6!@e9pIqQi75J0_pITt=r)l?+0(-xVetm&&EbvVQ zzNNso7WlRTf4RW77x*g${z-wo@1_slbAz8P@N)%zzQEpZ8mkyN5no7kHZj zZ(HE)=i)}SE7r)S-%ttYw-xcetZoUnv)y!wC`8K0Y zK40#bPfeTO-)b{<3u?Ks`f1;mI*lH zirV>V&HvEmcSUN^^|SA%Qu{rU?VH2cGpLRAeDi!bliJ+stHaN#vAX^jQ0w2IXuptJ z%^2e^uCbc-OQ`L)y75=k+`ekRl3FeC*VI_eeAm|8Ia(KPuInhCqbtL&r?xFM$9My^ znt6;hW_{|LDIV)EQe$tYB;Q>cI99=y&k(DU>mD@Qv#8sj?@`-#ZQIoR-damr_tjkg z`)h2DwD$~nK8p4is2#ieC-HN@vA=I*oP+ZfGfDeYU; z_R%%pz2;+U{>hrV|I*(nXdiw!kI#U+{;@v{cTU283U@z+KMNlMxBVBv_V<*UzgBbi zs{W4SEi~iZFYkg~uk6o%*Z%7IJ1-xASKeT+ZPsI@&VYXiS4)ZC{B&~v@9aG$ejRYN zl=v-xw%x*KZy&z};c6-ITNrKTflrQ(-y(3el=yuF?W<$Ewu|55aJ7{9t$_B*SJ#~D zw<26EC4Ot5U47LGJ0#x-xLV2z+zBVaog1cM(1FaXwf6dCh<>h1j|L7k_qF`!TGKwY z#_>NM?mBv|xlX6U-Q!uOGvKzZZa(>$wPwDvYHWoSPB<nOJMFSv84o?P#NM;_C=ciLKr3H38b?8BX4ZW-%wMtRPx0avrc zXH?VYwC$7QDEOlI#{M+?-Yw4BJoft;`>ELNb~!f-TH=tdyl=b;NClKF1Xhy-{+U{cNg4q;emqd_h`X=*8EYywfjy#`Ru>% z^~3Ft@Abo9fd8@J`v0lm`oB?d<9x55c=wr~F~d*B?ui=iHQD$2;nu#}FS+mb!yS+B z_QUOu@AkvZ@4NkQ=g)Wh;nu#_57$1d;Of5HkNs7+@Akv3eZL>>`24IK{sDI1@rT<# z-|>e#U%ulHxAq->xbrc);Of5XkKNjL{o&4s@A|`S&v*Ud&WG>%!>xVSAMW^l-yd%6 z`~Gn4zV9#jjcwd_{-xb_{w05);IES3cmJ`=efMAT7uvY*|4X~y0hHYD0m7|Usn7eI zuYMm8yU!du7u@;sJAv3;AHNd_xAr@MaG!CGD7f)Q7Toxw3huM#_=1~fLL2wHf#lcU zcOc~{sR8}IvyaIe9>vk15LyMdCwU2x-lXHnYyo*-Pi?=Hff55Fr2cRu{CAl%yT3QF$x z1L0?4_Z@>GKV^dfNLY*!I-Z-nYQ!(zY-2tKJ@eK1_u+K8q{~+#r)b~<6wBJ|Tt+OWI z2b<##`sHV#`@w2HGrda=`8^bKnOmIL2f@ZVKc2Tg0Q;<>Zr|iSuiCc#Fy1=xkARKO zJpB-?Hiujpn_4+G`=RYoYR7Hc_f!8EelNAL$^R2H_1o&P{S>TboH=}MQZxV0sO_J% zbN37C$0#1!AFu7!=69}s2{woGB>p+YIP-}0^PG7S?73i@Pf*LXc@F#<9-MK=fNKKTibIKHTx}gycvgj z^8OBNUUU3`+T-86F;tf;y}K-e=jxBt>f$%4ofF6VSFr6` zyZ&!czfJMb{`cB$o%#3&*c>@`-T|xS+>!r{VlH!w6Z>zlvF_hjY2$tHyA*Z%CV!7& z+xEkF>%@NmHa_RhhhVjwJKEIBvDpvXbKd7;ah!|t`aD0Hy4Ppl>ve$D>`(I_mH8Uf z_Q%?}8cMwY#Y6j$+HRfp7ST>A+glW_Zhk)#$i;ph@Uy97b#E*V_B_+(X9IceM@z`@ zG2S@s`uUkbn>ifI(qQDK&Cd^V_sJ>9%YbK78>7v5KeMRYf1i;*3RcVY#cT3%aCQCt z3?di%`6K&k1+a5v?Od%yy)wnaxmu;RTW4)o1)C%LYBjK0_LY1^in+`!Hr76^4!)DQ znnl0mBj9R&md{w$1Ur_jg??+n)laYE)&?7^?wE#AtEG?YfbF9;KTpZGBK8V$eGGgx zwR-Mf>w?wO&-K9OOh4BLtLc~a)Z)J(*f|d02yFbNv^j#dHU{gX9-mFXWz42EPt0a; zebmpX^K1?_m$pUOedgW*>^05rN3=i0{Q4fme)`!_yKQ=1+Y;=&gl|=I=ic*tYq;mR zx<2Ojds+4Lds}e&t$jPV?OaF>*J?CaAN9m+4|YDXemj8mQ(u&xnP&{xaqLLZz9?4$ z^&M;7Yo>kCFa6yKY=0e>CO7a{EF7tc>?s}>x&tYJ7_l|iE2b)Vh=fEeyIR~`o z97z9;Ks$h9znwd|G0u(iC`#fcfD>msa{X+_XBhkGvy1)o*=;hmJbO$6%eDFJb~HH89z9^W_Y}WBIR<N?VTD!OUsHagp?ANii-8%Ev4>m{k)^xC1_LjVtVlH!w6FUGl z);Vdgs58L!&AHdd_t-Px>V5DL@Z-R0S?}Y)uD7`yk6a)7Yu|0tI&Gc+w#}@~iC{I` z%zUUBpZT!wS+|qG_OV>=li})lUilPQ&3yioo9`5A`(y3=o<=>3;-UT2+HRfp&d^RN z+dC7kZvNA$<>Irb7os>;uP^6-^Sp94Sf1yVbHP5Z7;l_*bDT%5%^Y6yJ`J{gZRbngap?AMjl@;oP84fZ)fJ*RCU zbzsMybu#99xO)0@1K7CS8$S=$N8Oy)Qme)PCa_xC_yX8Aa(>vx7vbvZ=gnZ_)XjM# zwVFA+?|%tw&b&MDoVpdR9-rI5#+5!_hO6&e&&Tax?C;m%>hbwT?Nj#mn`r8H(O=JnZ-I?d_jBW&)E?e< zwSAkS<~|Z<4ZZ`m{j_yAxPAV972-vd^&WUapkHkUTvo4FSEg0r4w-|s`$mVDm_ zt66+LbPn$an^T+fb{Dl;`uQN(ewOq05L`Vz4})Fj(&qgRCv^yL@e^u;#Jp-<-WF|hsB z&v|=*S}kq-60BCv$*5Df{sxntGmPp90$tpJk2L9^0=An|`AxdFJ^ISo_ly z`}a7tTH1aV+}^+E(A4AeeC<>A@3&~`>BH~9_HQJ`c-AT#xwOUaZ{Xn+ZL{k3-U8c4ZOQ*O*s&(p z-@*E+$LAkl=Rf=%a2fMYxIXIn?D#LRxwJXQf1p+~$56)pF8IQlI~V^3>y!QS9{3hY zc@De}SC7vJwNJUnK15T`cxbLNPCauoADZoHORo9B2cX%njI#qxJ@eN9s~MlTA>hP0 zPPu+QPuxi^ujSSm_fTw(JJ;U@;IrX#=uSD-1<|$T^Xfui_n>p9E$uH1zMS}S4i-UE zkI$lD+qjR|jBPPA^~}>pz{aU(yo-a?M-iKA+Y)g5;pczto(tMDw@aegSN+odQefxE zv8R21hFU#sEdy3FK5@%}6X)E@^>c17CZBU_ow@xeHs?0a)yu)v=FnH;wW%3@Q62wT zYTNVk_VU=Ar}Fdm3TW!a`%iBCEU@ic8}H}s6|w1)=P%c4B{cPX-d-83=30B-m!Cq+ zX>~qp+wyx|eQax0ux(lU+&YYUb&7}f;kDg5bFl{49Cy$+pFKu^)$-gbUyWiebBhzZ zCfL~UwQ8PqSsU)UsOvL={3F5Y*{|z>-6vE{Tdt)at99>5`nY$JcU`!7ZQnVN>+cvH zul=-6zt#iW&#d?QV6{2q&K#Nnk-N8pw8>3C1 zovGEuAE(aR?E!XvtX;dksQ0FL*spzRyLHC8FW4MeyZyjwSv&cj6mywdoY?)r#%Apf z0Nb}*TSn0CfpGQs93*EwjkW)Bee9oowGHdEaWL36vKEJc)#i{p<5x32eopsi4kv*>L#?iTJhhs5GIjP#57_y(cE21$ zJ%!@oIH%Tj>&#s**c{m}ePFfh7x~c?bD3Lgto=F`Y+sVAA8fAdjp<gmUcVDtL3%jw5SaP>Tk zoDBB5q@KQg3T!TI$$bjgT(j!_p9yf>T<&fZSWGtktN^GvW>c}|=KPn_+@^()T_zb7`Q=fru~a!#BJ zmTU8z_%t}@#Q9*k=fw8dE&vZu8>7v(&Y@N}{t9fL0sHe7+AadiwHbdgSbuGwrIwr9 zpHIF7d=s@X+Vr`QT3vi8ba-}c@_KT;hsb4`iwxk5v-nldK1_@>W=vu zYPID20@$44U#xlhbu--ks;-ayx&^GRkNx-(*u3RAbSqpv=g@6n&mr~n?aN?uX-n?g z!RDG(pF>{(>!+SReHCn*qSTX3n*B#`HC?W6XZP1FSZOzGok(8J~UNn%SP` z(AU8^7n1WEXzI!NO|aTgj4^TFf+x;)oeH}!WZ9>(5N z+pW`|?}5#cy?8HJEqhV^9g4ZkEl%wB!T!vS&&2NQ2jD&{uSL;zKgB-YN3AV!KLo3V zKMJ-F;XkT**6qh|*G*j?_tHi+!92>4IIY97&kR%`Yt+Rtl^baS;|)EY~`yRFvG z-vfW2I`^@>%!!TKBWAKKZPh&+flPdx~P8?1%kTGcNu8 z9oYV5O~>%_kZHa`3QC9v8Y`eeK|HRH37-E)b58SGw5{2aL2 zoH|~cTF#;0gL4ig$17;+$?+;!t$b#G4W2mrCD*S!_x!o0 zGuP|XIal8RJ4e=@tAC+>lj31pf35A-Y3pxbbL3oo3#^uNRsLs+xy&t2?BBuVx%v-y z&Q)#qQ|#m0)Y=mFPq13}zrgk(d;Q;F_qu-C;`bg{n>pX5mgm{?eX!4->e}C-R!i)M z;8q*0=P{0+j+?Oz8Kt?^4{q0YOCko zxH#B*qk7u)_qNTYExG;uZF9|H!feam;nq(*=kwBF&u9B(f93kQ7u}ccC+qZo8Ep1H z=hCupwK?QBUYnZn+3W7hoSz@Xmh&t*mqSxe&gH>s<@vb+JaM)o*RMQ3U#rhgpD$O& zmgh@<=3H)jK3}d1HkWz)`E$9?fc`GyYGD7Ck}=xybDG1z+C2P@+B~bn)y-o*dB!;$ zoc?Bd$r>>(9;S8Z#2CueH~hk5R8n z@i2D1+HRe`tq(Rwt}z>c)pCuIuR}4Hxy8m#qWz7){;rn%3fkBh?DKbWZwfcJ{q*Pg z<@w%tGw{{KsJs5gZw@wxwv)*@8f_Ftdb$?hLlSIj?pBtIes$tWC}M z>>1bheeMujQvCfo&n17x!DA817ObJqI{uzu@M1M?{=OI3e)xNj`JKBjf{jVPZUGyg z^WaNheR3Yi_0iw4DefV|@L&G*8~QaqNT*dKq##J@d}zWH}x?9WH3jraGt6Yq1} ziWHCKD2ey?C=>7Bn=pPQ>Wt@WV8@f+S^9b#zq{t?{~d61y8n%pXD+@0whe9Ok$Ww2 zY~Q3fK4Y|fiTP0f7Dd}T#N0{k(fkgc`r8yW`z&_MUKix*Y4`hJ+x32aKlKBYdnxx( zw8ig1ur~YWxa6@t1Xi;Rapu)E8%FV1nUZ;3rN++Pss&yRZ2wlL&YpM#tiO9A?fnp} z9{woUJlWqr0_&shHQIjsn4)eU#p&ak;B_b-Yf;k2wQIZv^+<~S`WSWA`e$JKAO3j3 zmjpjiaQ&WY<4+e{|K|#>|8EQa0{F$6XI_2|w_oMF`~t52Am{5)>c=Q*uBGc{pVcyN zzXYpg-hKsEbD!wrygf;AE{u0fa7z{_=R`g2{RUjN_bgm3-}%|z z^OUsboXE}PeCVI{eh0QaeYEN0oT{h2m%#QRbM`XWd34V7m+KS%--FHJy=DaUD-<>3 zU!{JHVtlTfa(#?3`zN?;?_Y4Wa*zERp7xv*xw)JV z{nOt2VB6D2n?BB|dfMam*z7~*Y(BX2m^~)fC;sz;&EXzfpSlCCW_*Ks2*vp9F}Xg* zyN`y}cK4V#du$Z8ttlQ`G)wCqxMhtur{0R<9@~aGdukzalzYs-fgbLj@o%7qTl+WA z!~MP5Wow@CFATSxa?dP+rk*{sC|E6f#y+TJjy?ib%br;rtXA%sCEdwQ|kYfRCXa*T6B!#g0e+tl64i*GwO6`Zy=*X>V>7z{_=TtrIZ2-0pnX?VS&SUn7T%Y)F z1UARR^}To#u$q5cC+GgAVD*Ra@mjDMSWTbg*&J-1#Eb%~C1wk-nlby5XEb@X1e;5n z{q=A7Y)!Ghxvt3dvmN*CHnrV7EzX`Ei){~z$8MDD>D_C*EA_`I?&&?Lv!_RsBYVzz z$AY^@cPY5`u?629{PCJ+zPDGyuiT?Mps8n%jsdG>kJ<;d%;ipCwd~QI!D{6m-4*WG z+ynMeF1COAXV2^gcF*XeO&`aop7!YI}NUo3ZW3lgD+g%fJ z)?^=S2T(lrrDRR^tMT5{`%_$#1F5qn`;o)B4nL^iuF0VV*YB```*)s>sCm|8f4J?G zYjS`ZC2MjZSS@Q}AJj5O2ZPnJCWnC4$~F0fKD6T;+ef+B{^_4JIUMYo=%Y;^$Ecq6 zjs&Y^44q)dl{Jv-qrdZfRBd++#94zwupLhEIFyn#_(Y8lrap|~8hnyEb3UFNSp(;M zQo+@GYM%a0fSaeB<8Czd%<)99nsaQwz4lFp+ZX4+HsoUS>z_G08tk0uqfH`LvlxIGDwmFL8aRepvd1Q^9&rXW- z*+uPqI)BHLBlEc){KT4P{-(o?E9Y+jO+9;VCRl9-#m|d=PjVbsJ@a`2*uFSFwka3e zp1#hfV>k)C3AH}j9H&~w>F0!QipMxg#yP&mj&lOVaZaSpIA@b1<6Hy%jGAYhr@)OX z$9XE6dd7JgSS{lmj{hv}l#KIquzhiCwka3ep1zLLF`Nlrhgu(Pj#Dk;9EKW&GV6}{M7yQqMt7n`SfbEN8 zvrW0!_Vjg}j^QG(_dk8KIZm~Va}>5dipLa6#yPddj}K z7sHJ!$9V~wdd7JvSS{n+4FAjE>KW%1VEf|OY*Q|_J$)UgW4H>uHMKt49H&~w=`+Fr z#p75?#@SzE$2pziIA>62oHvjoKW(f z!S==Nj`Kw7jPo{fWSo1W z-(K^K^Jchl@$Ks+8n1^ z#(4_1*%XgcDH-Q!H9nbo7R7O%PVG1y-yP)0IIX``aIeRA6ZTy~s>wjOt^}oN3 zKUi@6A1S#0kGAn27hM0J7hM0x+W0REuK$w-*Z}2!5&N*#}>T z&%&?V2j4(b&%N`TU^VaOuB~&f<{lV|_HD3Q_RXDOwY#uopM0ljZ^gSNa&x&J`nyle z^^j8$23Rd^ z{uykHdfNO8*f!_XZOZja&Nso%b#nd{tY-YH#QY6xe9mLJK8b%D?A-axc@qAA2df$H z_4glOh!DPGUdrS^LMZ(`n~c-@lg zGvEA!`Ouo%rg`7T-#qux$9KV`Yib{Y?W6hB;@<%4@9)(ZKLp-EQPSK$y>c60t>)?5@^JHHu2+Jqtw?cf=2MIR zDq#Kn-8K8ZDqPL^*5CHj;y(ymmI^v>dkqs*JMq2)A#=SBSWQ1;oolts^#%0xGZc>tDei0M^CF6K{aI?~`eN$!h%4uM zqng{c_r3Mu#`&{z=G_49n0BCO+lRkgrM@9po23Iz)%2rkj@fqfF|Ym`gVpl02AhD@ zEFQKYbiVA9_WZ2DreODydafaxfz6kD(Fp9D!`1UM1f#&ldPLiz)@(1@mbIp9ey(6E zi1GSsH@EYyo_Tk^E~9u{LUG=m<4Y;d`{mTRer!ve^Pc(G4z6G3V>DdN;!)0rvDz~q z+k?H1sb@ZR0GqFzk1=TK8Rw2*V?CnnRBPpY>|ATQWZKtZg048XVr20z}3pQec|de$mRXfe(eW0pEmm~Prt9neiOyx28#W@ zhWhh0_B)3gDURU_)b17c$bsa@Il2{XeX`){M;6@uHnHH|tNIImJoto~yI$t@dDx$6 zIS6dq*8UvSA=HObJhXqJwp(X@4+ERy4#vMAeusnAoJ)VkLVhsCT;>)V>s%cHcI?U3 z2{)Jha!m64+|yBD*IeD_dE>jl=FsM|#b~r~V0F&}=W0CoM)Wxp+tFtNn!0)9-C*O? zZQJ=(Gl%ng4ERRsn438OY#k$7Yb4{25_8O_4d^5qVe64fwIJla=4Sa`CJV!h)(;t0| zS3eQ#`Xt{;VDmW#?z@x0?k)X{m+Rx+aUZ$<*2(uNu=!jE^Pd7%bFGZmrdIAv_mT0Q POQ(Wu&+|;|dG>z*L#Hyh diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp index 6340683..3761e9e 100644 --- a/piet-gpu/shader/tile_alloc.comp +++ b/piet-gpu/shader/tile_alloc.comp @@ -73,7 +73,7 @@ void main() { for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) { barrier(); if (th_ix >= (1 << i)) { - total_tile_count += sh_tile_count[th_ix - (1 << i)]; + total_tile_count += sh_tile_count[th_ix - (1u << i)]; } barrier(); sh_tile_count[th_ix] = total_tile_count; diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 25627f6..e1bde6a 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -15,7 +15,7 @@ use piet::{ImageFormat, RenderContext}; use piet_gpu_hal::{ BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, ImageLayout, Pipeline, - QueryPool, Session, ShaderCode, + QueryPool, Session, ShaderCode, include_shader, }; use pico_svg::PicoSvg; @@ -161,23 +161,23 @@ impl Renderer { }) .collect(); - let tile_alloc_code = ShaderCode::Spv(include_bytes!("../shader/tile_alloc.spv")); + let tile_alloc_code = include_shader!(session, "../shader/gen/tile_alloc"); let tile_pipeline = session .create_compute_pipeline(tile_alloc_code, &[BindType::Buffer, BindType::Buffer])?; let tile_ds = session .create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf])?; - let path_alloc_code = ShaderCode::Spv(include_bytes!("../shader/path_coarse.spv")); + let path_alloc_code = include_shader!(session, "../shader/gen/path_coarse"); let path_pipeline = session .create_compute_pipeline(path_alloc_code, &[BindType::Buffer, BindType::Buffer])?; let path_ds = session .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?; let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 { - ShaderCode::Spv(include_bytes!("../shader/backdrop_lg.spv")) + include_shader!(session, "../shader/gen/backdrop_lg") } else { println!("using small workgroup backdrop kernel"); - ShaderCode::Spv(include_bytes!("../shader/backdrop.spv")) + include_shader!(session, "../shader/gen/backdrop") }; let backdrop_pipeline = session .create_compute_pipeline(backdrop_code, &[BindType::Buffer, BindType::Buffer])?; @@ -185,13 +185,13 @@ impl Renderer { .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?; // TODO: constants - let bin_code = ShaderCode::Spv(include_bytes!("../shader/binning.spv")); + let bin_code = include_shader!(session, "../shader/gen/binning"); let bin_pipeline = session.create_compute_pipeline(bin_code, &[BindType::Buffer, BindType::Buffer])?; let bin_ds = session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?; - let coarse_code = ShaderCode::Spv(include_bytes!("../shader/coarse.spv")); + let coarse_code = include_shader!(session, "../shader/gen/coarse"); let coarse_pipeline = session.create_compute_pipeline(coarse_code, &[BindType::Buffer, BindType::Buffer])?; let coarse_ds = session @@ -210,7 +210,7 @@ impl Renderer { .collect(); let gradients = Self::make_gradient_image(&session); - let k4_code = ShaderCode::Spv(include_bytes!("../shader/kernel4.spv")); + let k4_code = include_shader!(session, "../shader/gen/kernel4"); let k4_pipeline = session.create_compute_pipeline( k4_code, &[