#pragma clang diagnostic ignored "-Wmissing-prototypes"

#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

// Pair of unsigned counters combined by bic_combine().
// In main0 each input element is encoded as {1 - is_push, is_push}.
struct Bic
{
    uint a;
    uint b;
};

// One clip-stack record as read by load_clip_el(): 5 words in memory
// (parent index followed by four float bit patterns).
struct ClipEl
{
    uint parent_ix;
    float4 bbox;
};

// Byte offset into Memory::memory (every user shifts it right by 2 to get a
// word index).
struct Alloc
{
    uint offset;
};

// Scene configuration. Only clip_bic_alloc, clip_stack_alloc, clip_alloc,
// bbox_alloc, drawmonoid_alloc, clip_bbox_alloc and n_clip are read here.
struct Config
{
    uint n_elements;
    uint n_pathseg;
    uint width_in_tiles;
    uint height_in_tiles;
    Alloc tile_alloc;
    Alloc bin_alloc;
    Alloc ptcl_alloc;
    Alloc pathseg_alloc;
    Alloc anno_alloc;
    Alloc trans_alloc;
    Alloc bbox_alloc;
    Alloc drawmonoid_alloc;
    Alloc clip_alloc;
    Alloc clip_bic_alloc;
    Alloc clip_stack_alloc;
    Alloc clip_bbox_alloc;
    uint n_trans;
    uint n_path;
    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
    uint pathseg_offset;
};

struct ConfigBuf
{
    Config conf;
};

// Shared word-addressed buffer; `memory` is declared with one element but is
// indexed as a runtime-sized array.
struct Memory
{
    uint mem_offset;
    uint mem_error;
    uint memory[1];
};

constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);

// Reads the Bic stored at index `ix` of the clip_bic allocation (2 words each).
static inline __attribute__((always_inline))
Bic load_bic(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
{
    uint base = (v_80.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix);
    return Bic{ v_96.memory[base], v_96.memory[base + 1u] };
}

// Combines two Bic values: m = min(x.b, y.a) is subtracted from both summed
// fields, i.e. as many of x's `b` as possible cancel against y's `a`.
static inline __attribute__((always_inline))
Bic bic_combine(thread const Bic& x, thread const Bic& y)
{
    uint m = min(x.b, y.a);
    return Bic{ (x.a + y.a) - m, (x.b + y.b) - m };
}

// Reads the ClipEl stored at index `ix` of the clip_stack allocation
// (5 words each). Words 1..4 are reinterpreted bitwise as floats.
static inline __attribute__((always_inline))
ClipEl load_clip_el(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
{
    uint base = (v_80.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix);
    uint parent_ix = v_96.memory[base];
    float x0 = as_type<float>(v_96.memory[base + 1u]);
    float y0 = as_type<float>(v_96.memory[base + 2u]);
    float x1 = as_type<float>(v_96.memory[base + 3u]);
    float y1 = as_type<float>(v_96.memory[base + 4u]);
    float4 bbox = float4(x0, y0, x1, y1);
    return ClipEl{ parent_ix, bbox };
}

// Intersection of two boxes laid out as (x0, y0, x1, y1): component-wise max
// of the .xy corners and min of the .zw corners.
static inline __attribute__((always_inline))
float4 bbox_intersect(thread const float4& a, thread const float4& b)
{
    return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw));
}

// Reads word `ix` of the clip allocation. For ix >= n_clip no load is issued
// and 0x80000000 is returned, which main0 treats as "not a push" (sign bit
// set).
static inline __attribute__((always_inline))
uint load_path_ix(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
{
    if (ix < v_80.conf.n_clip)
    {
        return v_96.memory[(v_80.conf.clip_alloc.offset >> uint(2)) + ix];
    }
    else
    {
        return 2147483648u;
    }
}

// Reads the first four words of the 6-word bbox record for `path_ix` and
// converts each from an unsigned integer biased by 32768 to a float.
static inline __attribute__((always_inline))
float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_80, device Memory& v_96)
{
    uint base = (v_80.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix);
    float bbox_l = float(v_96.memory[base]) - 32768.0;
    float bbox_t = float(v_96.memory[base + 1u]) - 32768.0;
    float bbox_r = float(v_96.memory[base + 2u]) - 32768.0;
    float bbox_b = float(v_96.memory[base + 3u]) - 32768.0;
    float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
    return bbox;
}

// Searches the reduction tree held in `sh_bic` (level j starts at
// 512 - (2 << (8 - j))) backwards from this thread's position.
//
// `bic` is updated in place with the accumulated combination. The first loop
// climbs levels while the combined `.b` stays zero; the second descends again
// to narrow the position. Returns `ix - 1` when the search stops at a
// non-zero position inside the workgroup, otherwise `0xFFFFFFFF - bic.a`,
// which the caller distinguishes by testing the sign of `int(result)`.
static inline __attribute__((always_inline))
uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[510])
{
    uint ix = gl_LocalInvocationID.x;
    uint j = 0u;
    while (j < 8u)
    {
        uint base = 512u - (2u << (8u - j));
        if (((ix >> j) & 1u) != 0u)
        {
            Bic param = sh_bic[(base + (ix >> j)) - 1u];
            Bic param_1 = bic;
            Bic test = bic_combine(param, param_1);
            if (test.b > 0u)
            {
                break;
            }
            bic = test;
            ix -= (1u << j);
        }
        j++;
    }
    if (ix > 0u)
    {
        while (j > 0u)
        {
            j--;
            uint base_1 = 512u - (2u << (8u - j));
            Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
            Bic param_3 = bic;
            Bic test_1 = bic_combine(param_2, param_3);
            if (test_1.b == 0u)
            {
                bic = test_1;
                ix -= (1u << j);
            }
        }
    }
    if (ix > 0u)
    {
        return ix - 1u;
    }
    else
    {
        return 4294967295u - bic.a;
    }
}

// Writes `bbox` as four raw float bit patterns at index `ix` of the clip_bbox
// allocation (4 words each).
static inline __attribute__((always_inline))
void store_clip_bbox(thread const uint& ix, thread const float4& bbox, const device ConfigBuf& v_80, device Memory& v_96)
{
    uint base = (v_80.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * ix);
    v_96.memory[base] = as_type<uint>(bbox.x);
    v_96.memory[base + 1u] = as_type<uint>(bbox.y);
    v_96.memory[base + 2u] = as_type<uint>(bbox.z);
    v_96.memory[base + 3u] = as_type<uint>(bbox.w);
}

// Compute kernel; the indexing below assumes 256 threads per threadgroup
// (see gl_WorkGroupSize).
//
// For each global invocation it reads one clip input word, resolves a link to
// an earlier element (in this workgroup or in the stack rebuilt from previous
// workgroups), intersects bounding boxes along those links, and writes the
// resulting bbox with store_clip_bbox(). For non-push elements inside n_clip
// it also overwrites one drawmonoid word with the parent's path index.
kernel void main0(device Memory& v_96 [[buffer(0)]],
                  const device ConfigBuf& v_80 [[buffer(1)]],
                  uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]],
                  uint3 gl_WorkGroupID [[threadgroup_position_in_grid]],
                  uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
    // 256 leaves + 128 + 64 + ... + 2 internal nodes = 510 entries.
    threadgroup Bic sh_bic[510];
    threadgroup uint sh_stack[256];
    threadgroup float4 sh_stack_bbox[256];
    threadgroup uint sh_link[256];
    threadgroup float4 sh_bbox[256];

    // --- Reduce the Bic values of all preceding workgroups -----------------
    // Thread th loads the Bic of workgroup th (if it precedes this one); the
    // loop then leaves sh_bic[th] = combine(th .. 255).
    uint th = gl_LocalInvocationID.x;
    Bic bic = Bic{ 0u, 0u };
    if (th < gl_WorkGroupID.x)
    {
        uint param = th;
        bic = load_bic(param, v_80, v_96);
    }
    sh_bic[th] = bic;
    for (uint i = 0u; i < 8u; i++)
    {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        if ((th + (1u << i)) < 256u)
        {
            Bic other = sh_bic[th + (1u << i)];
            Bic param_1 = bic;
            Bic param_2 = other;
            bic = bic_combine(param_1, param_2);
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
        sh_bic[th] = bic;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    uint stack_size = sh_bic[0].b;

    // --- Locate this thread's stack slot -----------------------------------
    // Binary search (probes 128, 64, ..., 1) for the largest ix with
    // sp < sh_bic[ix].b; thread 255 maps to sp == 0.
    uint sp = 255u - th;
    uint ix = 0u;
    for (uint i_1 = 0u; i_1 < 8u; i_1++)
    {
        uint probe = ix + (128u >> i_1);
        if (sp < sh_bic[probe].b)
        {
            ix = probe;
        }
    }
    uint b = sh_bic[ix].b;
    // Default box is effectively unbounded, so it is neutral for intersection.
    float4 bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
    if (sp < b)
    {
        uint param_3 = (((ix * 256u) + b) - sp) - 1u;
        ClipEl el = load_clip_el(param_3, v_80, v_96);
        sh_stack[th] = el.parent_ix;
        bbox = el.bbox;
    }

    // --- Inclusive prefix intersection of the stack bboxes -----------------
    for (uint i_2 = 0u; i_2 < 8u; i_2++)
    {
        sh_stack_bbox[th] = bbox;
        threadgroup_barrier(mem_flags::mem_threadgroup);
        if (th >= (1u << i_2))
        {
            float4 param_4 = sh_stack_bbox[th - (1u << i_2)];
            float4 param_5 = bbox;
            bbox = bbox_intersect(param_4, param_5);
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    sh_stack_bbox[th] = bbox;

    // --- Read this invocation's input element ------------------------------
    // A clear sign bit marks a push whose value is a path index.
    uint param_6 = gl_GlobalInvocationID.x;
    uint inp = load_path_ix(param_6, v_80, v_96);
    bool is_push = int(inp) >= 0;
    bic = Bic{ 1u - uint(is_push), uint(is_push) };
    sh_bic[th] = bic;
    if (is_push)
    {
        uint param_7 = inp;
        bbox = load_path_bbox(param_7, v_80, v_96);
    }
    else
    {
        bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
    }

    // --- Build the pairwise reduction tree over sh_bic ---------------------
    // Level 0 occupies [0, 256); each further level halves in size and starts
    // at 512 - (1 << (8 - i)): 256, 384, 448, ..., 508.
    uint inbase = 0u;
    for (uint i_3 = 0u; i_3 < 7u; i_3++)
    {
        uint outbase = 512u - (1u << (8u - i_3));
        threadgroup_barrier(mem_flags::mem_threadgroup);
        if (th < (1u << (7u - i_3)))
        {
            Bic param_8 = sh_bic[inbase + (th * 2u)];
            Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
            sh_bic[outbase + th] = bic_combine(param_8, param_9);
        }
        inbase = outbase;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // --- Resolve link / grandparent / parent -------------------------------
    // int(link) >= 0 : index of another thread in this workgroup.
    // int(link) <  0 : reference relative to the end of the rebuilt stack
    //                  (used below as 256u + link).
    bic = Bic{ 0u, 0u };
    Bic param_10 = bic;
    uint _618 = search_link(param_10, gl_LocalInvocationID, sh_bic);
    bic = param_10;
    uint link = _618;
    sh_link[th] = link;
    threadgroup_barrier(mem_flags::mem_threadgroup);
    uint grandparent;
    if (int(link) >= 0)
    {
        grandparent = sh_link[link];
    }
    else
    {
        grandparent = link - 1u;
    }
    uint parent;
    if (int(link) >= 0)
    {
        parent = (gl_WorkGroupID.x * 256u) + link;
    }
    else
    {
        if (int(link + stack_size) >= 0)
        {
            parent = sh_stack[256u + link];
        }
        else
        {
            // No parent available; load_path_ix() will return 0x80000000.
            parent = 4294967295u;
        }
    }

    // --- Intersect bboxes along the link chain -----------------------------
    // Each round folds in the linked thread's bbox and then adopts that
    // thread's link, for 8 rounds.
    for (uint i_4 = 0u; i_4 < 8u; i_4++)
    {
        // sh_link already holds `link` for the first round.
        if (i_4 != 0u)
        {
            sh_link[th] = link;
        }
        sh_bbox[th] = bbox;
        threadgroup_barrier(mem_flags::mem_threadgroup);
        if (int(link) >= 0)
        {
            float4 param_11 = sh_bbox[link];
            float4 param_12 = bbox;
            bbox = bbox_intersect(param_11, param_12);
            link = sh_link[link];
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
    }
    // Fold in the stack prefix if the remaining link refers into it.
    if (int(link + stack_size) >= 0)
    {
        float4 param_13 = sh_stack_bbox[256u + link];
        float4 param_14 = bbox;
        bbox = bbox_intersect(param_13, param_14);
    }
    sh_bbox[th] = bbox;
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // --- Non-push elements: patch drawmonoid, take the grandparent bbox ----
    uint path_ix = inp;
    bool _717 = !is_push;
    bool _725;
    if (_717)
    {
        _725 = gl_GlobalInvocationID.x < v_80.conf.n_clip;
    }
    else
    {
        _725 = _717;
    }
    if (_725)
    {
        uint param_15 = parent;
        path_ix = load_path_ix(param_15, v_80, v_96);
        // ~inp selects the 2-word drawmonoid record to overwrite.
        uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * (~inp));
        v_96.memory[drawmonoid_out_base] = path_ix;
        if (int(grandparent) >= 0)
        {
            bbox = sh_bbox[grandparent];
        }
        else
        {
            if (int(grandparent + stack_size) >= 0)
            {
                bbox = sh_stack_bbox[256u + grandparent];
            }
            else
            {
                bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
            }
        }
    }

    uint param_16 = gl_GlobalInvocationID.x;
    float4 param_17 = bbox;
    store_clip_bbox(param_16, param_17, v_80, v_96);
}