Make shaders cross-platform

Translate all piet-gpu shaders into DXIL and MSL; move generated files
into the shader/gen directory.
Raph Levien 2021-12-03 15:49:58 -08:00
parent 44327fe49f
commit c503ff28b0
46 changed files with 8140 additions and 529 deletions
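For orientation, the per-shader translation chain set up by the new build rules can be sketched as plain shell commands. This is a hedged sketch, not part of the commit: the flags mirror the ninja rules shown in the build.ninja hunk below, and the tool names (glslangValidator, spirv-cross, dxc) are assumed to be installed and on PATH.

# GLSL -> SPIR-V (rule glsl)
glslangValidator -V -o gen/binning.spv binning.comp
# SPIR-V -> HLSL, shader model 6.0 (rule hlsl)
spirv-cross --hlsl --shader-model 60 gen/binning.spv --output gen/binning.hlsl
# HLSL -> DXIL (rule dxil)
dxc -T cs_6_0 gen/binning.hlsl -Fo gen/binning.dxil
# SPIR-V -> MSL (rule msl)
spirv-cross --msl gen/binning.spv --output gen/binning.msl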


@@ -87,8 +87,8 @@ void main() {
// Prefix sum of sh_row_count
for (uint i = 0; i < LG_BACKDROP_WG; i++) {
barrier();
if (gl_LocalInvocationID.y == 0 && th_ix >= (1 << i)) {
row_count += sh_row_count[th_ix - (1 << i)];
if (gl_LocalInvocationID.y == 0 && th_ix >= (1u << i)) {
row_count += sh_row_count[th_ix - (1u << i)];
}
barrier();
if (gl_LocalInvocationID.y == 0) {
@@ -102,7 +102,7 @@ void main() {
// Binary search to find element
uint el_ix = 0;
for (uint i = 0; i < LG_BACKDROP_WG; i++) {
uint probe = el_ix + ((BACKDROP_WG / 2) >> i);
uint probe = el_ix + (uint(BACKDROP_WG / 2) >> i);
if (row >= sh_row_count[probe - 1]) {
el_ix = probe;
}


@@ -14,7 +14,7 @@ rule glsl
command = $glslang_validator $flags -V -o $out $in
rule hlsl
command = $spirv_cross --hlsl $in --output $out
command = $spirv_cross --hlsl --shader-model 60 $in --output $out
rule dxil
command = $dxc -T cs_6_0 $in -Fo $out
@@ -22,23 +22,41 @@ rule dxil
rule msl
command = $spirv_cross --msl $in --output $out $msl_flags
build gen/binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h
build gen/binning.hlsl: hlsl gen/binning.spv
build gen/binning.dxil: dxil gen/binning.hlsl
build gen/binning.msl: msl gen/binning.spv
build elements.spv: glsl elements.comp | scene.h state.h annotated.h
build gen/tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h
build gen/tile_alloc.hlsl: hlsl gen/tile_alloc.spv
build gen/tile_alloc.dxil: dxil gen/tile_alloc.hlsl
build gen/tile_alloc.msl: msl gen/tile_alloc.spv
build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h
build gen/path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h
build gen/path_coarse.hlsl: hlsl gen/path_coarse.spv
build gen/path_coarse.dxil: dxil gen/path_coarse.hlsl
build gen/path_coarse.msl: msl gen/path_coarse.spv
build tile_alloc.spv: glsl tile_alloc.comp | annotated.h tile.h setup.h
build gen/backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
build gen/backdrop.hlsl: hlsl gen/backdrop.spv
build gen/backdrop.dxil: dxil gen/backdrop.hlsl
build gen/backdrop.msl: msl gen/backdrop.spv
build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h
build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
build gen/backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
flags = -DBACKDROP_DIST_FACTOR=4
build gen/backdrop_lg.hlsl: hlsl gen/backdrop_lg.spv
build gen/backdrop_lg.dxil: dxil gen/backdrop_lg.hlsl
build gen/backdrop_lg.msl: msl gen/backdrop_lg.spv
build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
build gen/coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
build gen/coarse.hlsl: hlsl gen/coarse.spv
build gen/coarse.dxil: dxil gen/coarse.hlsl
build gen/coarse.msl: msl gen/coarse.spv
build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
build gen/kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
build gen/kernel4.hlsl: hlsl gen/kernel4.spv
build gen/kernel4.dxil: dxil gen/kernel4.hlsl
build gen/kernel4.msl: msl gen/kernel4.spv
# New element pipeline follows


@@ -172,8 +172,8 @@ void main() {
}
barrier();
if (th_ix < N_PART_READ) {
if (th_ix >= (1 << i)) {
count += sh_part_count[th_ix - (1 << i)];
if (th_ix >= (1u << i)) {
count += sh_part_count[th_ix - (1u << i)];
}
}
barrier();
@@ -190,7 +190,7 @@ void main() {
if (ix >= wr_ix && ix < ready_ix && mem_ok) {
uint part_ix = 0;
for (uint i = 0; i < LG_N_PART_READ; i++) {
uint probe = part_ix + ((N_PART_READ / 2) >> i);
uint probe = part_ix + (uint(N_PART_READ / 2) >> i);
if (ix >= sh_part_count[probe - 1]) {
part_ix = probe;
}
@@ -257,8 +257,8 @@ void main() {
sh_tile_count[th_ix] = tile_count;
for (uint i = 0; i < LG_N_TILE; i++) {
barrier();
if (th_ix >= (1 << i)) {
tile_count += sh_tile_count[th_ix - (1 << i)];
if (th_ix >= (1u << i)) {
tile_count += sh_tile_count[th_ix - (1u << i)];
}
barrier();
sh_tile_count[th_ix] = tile_count;
@@ -269,7 +269,7 @@ void main() {
// Binary search to find element
uint el_ix = 0;
for (uint i = 0; i < LG_N_TILE; i++) {
uint probe = el_ix + ((N_TILE / 2) >> i);
uint probe = el_ix + (uint(N_TILE / 2) >> i);
if (ix >= sh_tile_count[probe - 1]) {
el_ix = probe;
}
@@ -292,7 +292,7 @@ void main() {
}
if (include_tile) {
uint el_slice = el_ix / 32;
uint el_mask = 1 << (el_ix & 31);
uint el_mask = 1u << (el_ix & 31);
atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
}
}
@@ -372,7 +372,7 @@ void main() {
if (tile.tile.offset == 0 && tile.backdrop == 0) {
clip_zero_depth = clip_depth + 1;
} else if (tile.tile.offset == 0 && clip_depth < 32) {
clip_one_mask |= (1 << clip_depth);
clip_one_mask |= (1u << clip_depth);
} else {
AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref);
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
@@ -382,14 +382,14 @@ void main() {
Cmd_BeginClip_write(cmd_alloc, cmd_ref);
cmd_ref.offset += 4;
if (clip_depth < 32) {
clip_one_mask &= ~(1 << clip_depth);
clip_one_mask &= ~(1u << clip_depth);
}
}
clip_depth++;
break;
case Annotated_EndClip:
clip_depth--;
if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
if (clip_depth >= 32 || (clip_one_mask & (1u << clip_depth)) == 0) {
if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
break;
}


@@ -1,467 +0,0 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The element processing stage, first in the pipeline.
//
// This stage is primarily about applying transforms and computing bounding
// boxes. It is organized as a scan over the input elements, producing
// annotated output elements.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define N_ROWS 4
#define WG_SIZE 32
#define LG_WG_SIZE 5
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf;
};
layout(set = 0, binding = 2) readonly buffer SceneBuf {
uint[] scene;
};
// It would be better to use the Vulkan memory model than
// "volatile" but shooting for compatibility here rather
// than doing things right.
layout(set = 0, binding = 3) volatile buffer StateBuf {
uint part_counter;
uint[] state;
};
#include "scene.h"
#include "state.h"
#include "annotated.h"
#include "pathseg.h"
#include "tile.h"
#define StateBuf_stride (4 + 2 * State_size)
StateRef state_aggregate_ref(uint partition_ix) {
return StateRef(4 + partition_ix * StateBuf_stride);
}
StateRef state_prefix_ref(uint partition_ix) {
return StateRef(4 + partition_ix * StateBuf_stride + State_size);
}
uint state_flag_index(uint partition_ix) {
return partition_ix * (StateBuf_stride / 4);
}
// These correspond to X, A, P respectively in the prefix sum paper.
#define FLAG_NOT_READY 0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY 2
#define FLAG_SET_LINEWIDTH 1
#define FLAG_SET_BBOX 2
#define FLAG_RESET_BBOX 4
#define FLAG_SET_FILL_MODE 8
// Fill modes take up the next bit. Non-zero fill is 0, stroke is 1.
#define LG_FILL_MODE 4
#define FILL_MODE_BITS 1
#define FILL_MODE_MASK (FILL_MODE_BITS << LG_FILL_MODE)
// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate)
State combine_state(State a, State b) {
State c;
c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
c.bbox = a.bbox;
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
(a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
{
c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
}
// It would be more concise to cast to matrix types; ah well.
c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX | FLAG_SET_FILL_MODE)) | b.flags;
c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
uint fill_mode = (b.flags & FLAG_SET_FILL_MODE) == 0 ? a.flags : b.flags;
fill_mode &= FILL_MODE_MASK;
c.flags = (c.flags & ~FILL_MODE_MASK) | fill_mode;
c.path_count = a.path_count + b.path_count;
c.pathseg_count = a.pathseg_count + b.pathseg_count;
c.trans_count = a.trans_count + b.trans_count;
return c;
}
State map_element(ElementRef ref) {
// TODO: it would *probably* be more efficient to make the memory read patterns less
// divergent, though it would be more wasted memory.
uint tag = Element_tag(ref).tag;
State c;
c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
c.mat = vec4(1.0, 0.0, 0.0, 1.0);
c.translate = vec2(0.0, 0.0);
c.linewidth = 1.0; // TODO should be 0.0
c.flags = 0;
c.path_count = 0;
c.pathseg_count = 0;
c.trans_count = 0;
switch (tag) {
case Element_Line:
LineSeg line = Element_Line_read(ref);
c.bbox.xy = min(line.p0, line.p1);
c.bbox.zw = max(line.p0, line.p1);
c.pathseg_count = 1;
break;
case Element_Quad:
QuadSeg quad = Element_Quad_read(ref);
c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
c.pathseg_count = 1;
break;
case Element_Cubic:
CubicSeg cubic = Element_Cubic_read(ref);
c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
c.pathseg_count = 1;
break;
case Element_FillColor:
case Element_FillLinGradient:
case Element_FillImage:
case Element_BeginClip:
c.flags = FLAG_RESET_BBOX;
c.path_count = 1;
break;
case Element_EndClip:
c.path_count = 1;
break;
case Element_SetLineWidth:
SetLineWidth lw = Element_SetLineWidth_read(ref);
c.linewidth = lw.width;
c.flags = FLAG_SET_LINEWIDTH;
break;
case Element_Transform:
Transform t = Element_Transform_read(ref);
c.mat = t.mat;
c.translate = t.translate;
c.trans_count = 1;
break;
case Element_SetFillMode:
SetFillMode fm = Element_SetFillMode_read(ref);
c.flags = FLAG_SET_FILL_MODE | (fm.fill_mode << LG_FILL_MODE);
break;
}
return c;
}
// Get the bounding box of a circle transformed by the matrix into an ellipse.
vec2 get_linewidth(State st) {
// See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw));
}
shared State sh_state[WG_SIZE];
shared uint sh_part_ix;
shared State sh_prefix;
shared uint sh_flag;
void main() {
State th_state[N_ROWS];
// Determine partition to process by atomic counter (described in Section
// 4.4 of prefix sum paper).
if (gl_LocalInvocationID.x == 0) {
sh_part_ix = atomicAdd(part_counter, 1);
}
barrier();
uint part_ix = sh_part_ix;
uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
ElementRef ref = ElementRef(ix * Element_size);
th_state[0] = map_element(ref);
for (uint i = 1; i < N_ROWS; i++) {
// discussion question: would it be faster to load using more coherent patterns
// into thread memory? This is kinda strided.
th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
}
State agg = th_state[N_ROWS - 1];
sh_state[gl_LocalInvocationID.x] = agg;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1 << i)) {
State other = sh_state[gl_LocalInvocationID.x - (1 << i)];
agg = combine_state(other, agg);
}
barrier();
sh_state[gl_LocalInvocationID.x] = agg;
}
State exclusive;
exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
exclusive.translate = vec2(0.0, 0.0);
exclusive.linewidth = 1.0; //TODO should be 0.0
exclusive.flags = 0;
exclusive.path_count = 0;
exclusive.pathseg_count = 0;
exclusive.trans_count = 0;
// Publish aggregate for this partition
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
State_write(state_aggregate_ref(part_ix), agg);
if (part_ix == 0) {
State_write(state_prefix_ref(part_ix), agg);
}
}
// Write flag with release semantics; this is done portably with a barrier.
memoryBarrierBuffer();
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
uint flag = FLAG_AGGREGATE_READY;
if (part_ix == 0) {
flag = FLAG_PREFIX_READY;
}
state[state_flag_index(part_ix)] = flag;
}
if (part_ix != 0) {
// step 4 of paper: decoupled lookback
uint look_back_ix = part_ix - 1;
State their_agg;
uint their_ix = 0;
while (true) {
// Read flag with acquire semantics.
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
sh_flag = state[state_flag_index(look_back_ix)];
}
// The flag load is done only in the last thread. However, because the
// translation of memoryBarrierBuffer to Metal requires uniform control
// flow, we broadcast it to all threads.
memoryBarrierBuffer();
barrier();
uint flag = sh_flag;
barrier();
if (flag == FLAG_PREFIX_READY) {
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
State their_prefix = State_read(state_prefix_ref(look_back_ix));
exclusive = combine_state(their_prefix, exclusive);
}
break;
} else if (flag == FLAG_AGGREGATE_READY) {
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
their_agg = State_read(state_aggregate_ref(look_back_ix));
exclusive = combine_state(their_agg, exclusive);
}
look_back_ix--;
their_ix = 0;
continue;
}
// else spin
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
// Unfortunately there's no guarantee of forward progress of other
// workgroups, so compute a bit of the aggregate before trying again.
// In the worst case, spinning stops when the aggregate is complete.
ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
State s = map_element(ref);
if (their_ix == 0) {
their_agg = s;
} else {
their_agg = combine_state(their_agg, s);
}
their_ix++;
if (their_ix == PARTITION_SIZE) {
exclusive = combine_state(their_agg, exclusive);
if (look_back_ix == 0) {
sh_flag = FLAG_PREFIX_READY;
} else {
look_back_ix--;
their_ix = 0;
}
}
}
barrier();
flag = sh_flag;
barrier();
if (flag == FLAG_PREFIX_READY) {
break;
}
}
// step 5 of paper: compute inclusive prefix
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
State inclusive_prefix = combine_state(exclusive, agg);
sh_prefix = exclusive;
State_write(state_prefix_ref(part_ix), inclusive_prefix);
}
memoryBarrierBuffer();
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
state[state_flag_index(part_ix)] = FLAG_PREFIX_READY;
}
}
barrier();
if (part_ix != 0) {
exclusive = sh_prefix;
}
State row = exclusive;
if (gl_LocalInvocationID.x > 0) {
State other = sh_state[gl_LocalInvocationID.x - 1];
row = combine_state(row, other);
}
for (uint i = 0; i < N_ROWS; i++) {
State st = combine_state(row, th_state[i]);
// Here we read again from the original scene. There may be
// gains to be had from stashing in shared memory or possibly
// registers (though register pressure is an issue).
ElementRef this_ref = Element_index(ref, i);
ElementTag tag = Element_tag(this_ref);
uint fill_mode = fill_mode_from_flags(st.flags >> LG_FILL_MODE);
bool is_stroke = fill_mode == MODE_STROKE;
switch (tag.tag) {
case Element_Line:
LineSeg line = Element_Line_read(this_ref);
PathCubic path_cubic;
path_cubic.p0 = line.p0;
path_cubic.p1 = mix(line.p0, line.p1, 1.0 / 3.0);
path_cubic.p2 = mix(line.p1, line.p0, 1.0 / 3.0);
path_cubic.p3 = line.p1;
path_cubic.path_ix = st.path_count;
path_cubic.trans_ix = st.trans_count;
if (is_stroke) {
path_cubic.stroke = get_linewidth(st);
} else {
path_cubic.stroke = vec2(0.0);
}
PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
break;
case Element_Quad:
QuadSeg quad = Element_Quad_read(this_ref);
path_cubic.p0 = quad.p0;
path_cubic.p1 = mix(quad.p1, quad.p0, 1.0 / 3.0);
path_cubic.p2 = mix(quad.p1, quad.p2, 1.0 / 3.0);
path_cubic.p3 = quad.p2;
path_cubic.path_ix = st.path_count;
path_cubic.trans_ix = st.trans_count;
if (is_stroke) {
path_cubic.stroke = get_linewidth(st);
} else {
path_cubic.stroke = vec2(0.0);
}
path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
break;
case Element_Cubic:
CubicSeg cubic = Element_Cubic_read(this_ref);
path_cubic.p0 = cubic.p0;
path_cubic.p1 = cubic.p1;
path_cubic.p2 = cubic.p2;
path_cubic.p3 = cubic.p3;
path_cubic.path_ix = st.path_count;
path_cubic.trans_ix = st.trans_count;
if (is_stroke) {
path_cubic.stroke = get_linewidth(st);
} else {
path_cubic.stroke = vec2(0.0);
}
path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
break;
case Element_FillColor:
FillColor fill = Element_FillColor_read(this_ref);
AnnoColor anno_fill;
anno_fill.rgba_color = fill.rgba_color;
if (is_stroke) {
vec2 lw = get_linewidth(st);
anno_fill.bbox = st.bbox + vec4(-lw, lw);
anno_fill.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
} else {
anno_fill.bbox = st.bbox;
anno_fill.linewidth = 0.0;
}
AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
break;
case Element_FillLinGradient:
FillLinGradient lin = Element_FillLinGradient_read(this_ref);
AnnoLinGradient anno_lin;
anno_lin.index = lin.index;
vec2 p0 = st.mat.xy * lin.p0.x + st.mat.zw * lin.p0.y + st.translate;
vec2 p1 = st.mat.xy * lin.p1.x + st.mat.zw * lin.p1.y + st.translate;
vec2 dxy = p1 - p0;
float scale = 1.0 / (dxy.x * dxy.x + dxy.y * dxy.y);
float line_x = dxy.x * scale;
float line_y = dxy.y * scale;
anno_lin.line_x = line_x;
anno_lin.line_y = line_y;
anno_lin.line_c = -(p0.x * line_x + p0.y * line_y);
// TODO: consider consolidating bbox calculation
if (is_stroke) {
vec2 lw = get_linewidth(st);
anno_lin.bbox = st.bbox + vec4(-lw, lw);
anno_lin.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
} else {
anno_lin.bbox = st.bbox;
anno_lin.linewidth = 0.0;
}
out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
Annotated_LinGradient_write(conf.anno_alloc, out_ref, fill_mode, anno_lin);
break;
case Element_FillImage:
FillImage fill_img = Element_FillImage_read(this_ref);
AnnoImage anno_img;
anno_img.index = fill_img.index;
anno_img.offset = fill_img.offset;
if (is_stroke) {
vec2 lw = get_linewidth(st);
anno_img.bbox = st.bbox + vec4(-lw, lw);
anno_img.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
} else {
anno_img.bbox = st.bbox;
anno_img.linewidth = 0.0;
}
out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
break;
case Element_BeginClip:
Clip begin_clip = Element_BeginClip_read(this_ref);
AnnoBeginClip anno_begin_clip;
// This is the absolute bbox, it's been transformed during encoding.
anno_begin_clip.bbox = begin_clip.bbox;
if (is_stroke) {
vec2 lw = get_linewidth(st);
anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
} else {
anno_begin_clip.linewidth = 0.0;
}
out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip);
break;
case Element_EndClip:
Clip end_clip = Element_EndClip_read(this_ref);
// This bbox is expected to be the same as the begin one.
AnnoEndClip anno_end_clip = AnnoEndClip(end_clip.bbox);
out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
break;
case Element_Transform:
TransformSeg transform = TransformSeg(st.mat, st.translate);
TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (st.trans_count - 1) * TransformSeg_size);
TransformSeg_write(conf.trans_alloc, trans_ref, transform);
break;
}
}
}
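The shader above is built around a single-pass, decoupled look-back prefix sum. As a reading aid, here is a condensed sketch of just the look-back protocol, reduced to one uint per partition and one thread per workgroup; it is not part of the diff. The flag names and publish ordering mirror the code above, local_value() is a hypothetical stand-in for the per-partition work, and the recompute-while-spinning fallback is only noted in a comment.

#version 450
layout(local_size_x = 1) in;
layout(set = 0, binding = 0) volatile buffer StateBuf {
    uint part_counter;
    // Three words per partition: flag, aggregate, inclusive prefix.
    uint state[];
};
// These correspond to X, A, P respectively in the prefix sum paper.
#define FLAG_NOT_READY 0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY 2

uint local_value(uint part_ix) {
    // Hypothetical per-partition reduction; stands in for map_element/combine_state.
    return part_ix + 1;
}

void main() {
    // Partition assignment by atomic counter, as in the shader above.
    uint part_ix = atomicAdd(part_counter, 1);
    uint base = part_ix * 3;
    uint agg = local_value(part_ix);
    state[base + 1] = agg;
    if (part_ix == 0) {
        // Partition 0's inclusive prefix is just its aggregate.
        state[base + 2] = agg;
    }
    // Publish the flag with release semantics (barrier, then flag write).
    memoryBarrierBuffer();
    uint flag = FLAG_AGGREGATE_READY;
    if (part_ix == 0) {
        flag = FLAG_PREFIX_READY;
    }
    state[base] = flag;
    if (part_ix != 0) {
        // Decoupled look-back: walk predecessors until a published prefix is found.
        uint exclusive = 0;
        uint look_back_ix = part_ix - 1;
        while (true) {
            uint their_flag = state[look_back_ix * 3];
            // Acquire: barrier between the flag read and the data read.
            memoryBarrierBuffer();
            if (their_flag == FLAG_PREFIX_READY) {
                exclusive += state[look_back_ix * 3 + 2];
                break;
            } else if (their_flag == FLAG_AGGREGATE_READY) {
                exclusive += state[look_back_ix * 3 + 1];
                look_back_ix--;
            }
            // else FLAG_NOT_READY: spin. The real shader also folds in part of
            // the neighbor's aggregate while spinning, since forward progress of
            // other workgroups is not guaranteed. Plain addition commutes here;
            // the real combine_state is ordered, so the full shader is careful
            // about combining in the right order.
        }
        state[base + 2] = exclusive + agg;
        memoryBarrierBuffer();
        state[base] = FLAG_PREFIX_READY;
    }
}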

Binary file not shown.

Binary file not shown.


@@ -0,0 +1,283 @@
struct Alloc
{
uint offset;
};
struct AnnotatedRef
{
uint offset;
};
struct AnnotatedTag
{
uint tag;
uint flags;
};
struct PathRef
{
uint offset;
};
struct TileRef
{
uint offset;
};
struct Path
{
uint4 bbox;
TileRef tiles;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
Alloc bbox_alloc;
Alloc drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
RWByteAddressBuffer _79 : register(u0, space0);
ByteAddressBuffer _186 : register(t1, space0);
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;
static uint gl_LocalInvocationIndex;
struct SPIRV_Cross_Input
{
uint3 gl_LocalInvocationID : SV_GroupThreadID;
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
uint gl_LocalInvocationIndex : SV_GroupIndex;
};
groupshared uint sh_row_width[256];
groupshared Alloc sh_row_alloc[256];
groupshared uint sh_row_count[256];
bool touch_mem(Alloc alloc, uint offset)
{
return true;
}
uint read_mem(Alloc alloc, uint offset)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = _79.Load(offset * 4 + 8);
return v;
}
AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1);
AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
return _121;
}
uint fill_mode_from_flags(uint flags)
{
return flags & 1u;
}
Path Path_read(Alloc a, PathRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5);
Path s;
s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
TileRef _165 = { raw2 };
s.tiles = _165;
return s;
}
Alloc new_alloc(uint offset, uint size, bool mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
void write_mem(Alloc alloc, uint offset, uint val)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
_79.Store(offset * 4 + 8, val);
}
void comp_main()
{
uint th_ix = gl_LocalInvocationIndex;
uint element_ix = gl_GlobalInvocationID.x;
AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) };
AnnotatedRef ref = _194;
uint row_count = 0u;
bool mem_ok = _79.Load(4) == 0u;
if (gl_LocalInvocationID.y == 0u)
{
if (element_ix < _186.Load(0))
{
Alloc _217;
_217.offset = _186.Load(32);
Alloc param;
param.offset = _217.offset;
AnnotatedRef param_1 = ref;
AnnotatedTag tag = Annotated_tag(param, param_1);
switch (tag.tag)
{
case 3u:
case 2u:
case 4u:
case 1u:
{
uint param_2 = tag.flags;
if (fill_mode_from_flags(param_2) != 0u)
{
break;
}
PathRef _243 = { _186.Load(16) + (element_ix * 12u) };
PathRef path_ref = _243;
Alloc _247;
_247.offset = _186.Load(16);
Alloc param_3;
param_3.offset = _247.offset;
PathRef param_4 = path_ref;
Path path = Path_read(param_3, param_4);
sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
row_count = path.bbox.w - path.bbox.y;
bool _272 = row_count == 1u;
bool _278;
if (_272)
{
_278 = path.bbox.y > 0u;
}
else
{
_278 = _272;
}
if (_278)
{
row_count = 0u;
}
uint param_5 = path.tiles.offset;
uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
bool param_7 = mem_ok;
Alloc path_alloc = new_alloc(param_5, param_6, param_7);
sh_row_alloc[th_ix] = path_alloc;
break;
}
}
}
sh_row_count[th_ix] = row_count;
}
for (uint i = 0u; i < 8u; i++)
{
GroupMemoryBarrierWithGroupSync();
bool _325 = gl_LocalInvocationID.y == 0u;
bool _332;
if (_325)
{
_332 = th_ix >= (1u << i);
}
else
{
_332 = _325;
}
if (_332)
{
row_count += sh_row_count[th_ix - (1u << i)];
}
GroupMemoryBarrierWithGroupSync();
if (gl_LocalInvocationID.y == 0u)
{
sh_row_count[th_ix] = row_count;
}
}
GroupMemoryBarrierWithGroupSync();
uint total_rows = sh_row_count[255];
uint _411;
for (uint row = th_ix; row < total_rows; row += 256u)
{
uint el_ix = 0u;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
uint probe = el_ix + (128u >> i_1);
if (row >= sh_row_count[probe - 1u])
{
el_ix = probe;
}
}
uint width = sh_row_width[el_ix];
if ((width > 0u) && mem_ok)
{
Alloc tiles_alloc = sh_row_alloc[el_ix];
if (el_ix > 0u)
{
_411 = sh_row_count[el_ix - 1u];
}
else
{
_411 = 0u;
}
uint seq_ix = row - _411;
uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
Alloc param_8 = tiles_alloc;
uint param_9 = tile_el_ix;
uint sum = read_mem(param_8, param_9);
for (uint x = 1u; x < width; x++)
{
tile_el_ix += 2u;
Alloc param_10 = tiles_alloc;
uint param_11 = tile_el_ix;
sum += read_mem(param_10, param_11);
Alloc param_12 = tiles_alloc;
uint param_13 = tile_el_ix;
uint param_14 = sum;
write_mem(param_12, param_13, param_14);
}
}
}
}
[numthreads(256, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
comp_main();
}


@@ -0,0 +1,284 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct Alloc
{
uint offset;
};
struct AnnotatedRef
{
uint offset;
};
struct AnnotatedTag
{
uint tag;
uint flags;
};
struct PathRef
{
uint offset;
};
struct TileRef
{
uint offset;
};
struct Path
{
uint4 bbox;
TileRef tiles;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
struct Alloc_1
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
Alloc_1 bbox_alloc;
Alloc_1 drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
struct ConfigBuf
{
Config conf;
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
return true;
}
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = v_79.memory[offset];
return v;
}
static inline __attribute__((always_inline))
AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1, v_79);
return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
}
static inline __attribute__((always_inline))
uint fill_mode_from_flags(thread const uint& flags)
{
return flags & 1u;
}
static inline __attribute__((always_inline))
Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_79);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_79);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_79);
Path s;
s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
s.tiles = TileRef{ raw2 };
return s;
}
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
v_79.memory[offset] = val;
}
kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
threadgroup uint sh_row_width[256];
threadgroup Alloc sh_row_alloc[256];
threadgroup uint sh_row_count[256];
uint th_ix = gl_LocalInvocationIndex;
uint element_ix = gl_GlobalInvocationID.x;
AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) };
uint row_count = 0u;
bool mem_ok = v_79.mem_error == 0u;
if (gl_LocalInvocationID.y == 0u)
{
if (element_ix < _186.conf.n_elements)
{
Alloc param;
param.offset = _186.conf.anno_alloc.offset;
AnnotatedRef param_1 = ref;
AnnotatedTag tag = Annotated_tag(param, param_1, v_79);
switch (tag.tag)
{
case 3u:
case 2u:
case 4u:
case 1u:
{
uint param_2 = tag.flags;
if (fill_mode_from_flags(param_2) != 0u)
{
break;
}
PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) };
Alloc param_3;
param_3.offset = _186.conf.tile_alloc.offset;
PathRef param_4 = path_ref;
Path path = Path_read(param_3, param_4, v_79);
sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
row_count = path.bbox.w - path.bbox.y;
bool _272 = row_count == 1u;
bool _278;
if (_272)
{
_278 = path.bbox.y > 0u;
}
else
{
_278 = _272;
}
if (_278)
{
row_count = 0u;
}
uint param_5 = path.tiles.offset;
uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
bool param_7 = mem_ok;
Alloc path_alloc = new_alloc(param_5, param_6, param_7);
sh_row_alloc[th_ix] = path_alloc;
break;
}
}
}
sh_row_count[th_ix] = row_count;
}
for (uint i = 0u; i < 8u; i++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
bool _325 = gl_LocalInvocationID.y == 0u;
bool _332;
if (_325)
{
_332 = th_ix >= (1u << i);
}
else
{
_332 = _325;
}
if (_332)
{
row_count += sh_row_count[th_ix - (1u << i)];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (gl_LocalInvocationID.y == 0u)
{
sh_row_count[th_ix] = row_count;
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint total_rows = sh_row_count[255];
uint _411;
for (uint row = th_ix; row < total_rows; row += 256u)
{
uint el_ix = 0u;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
uint probe = el_ix + (128u >> i_1);
if (row >= sh_row_count[probe - 1u])
{
el_ix = probe;
}
}
uint width = sh_row_width[el_ix];
if ((width > 0u) && mem_ok)
{
Alloc tiles_alloc = sh_row_alloc[el_ix];
if (el_ix > 0u)
{
_411 = sh_row_count[el_ix - 1u];
}
else
{
_411 = 0u;
}
uint seq_ix = row - _411;
uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
Alloc param_8 = tiles_alloc;
uint param_9 = tile_el_ix;
uint sum = read_mem(param_8, param_9, v_79);
for (uint x = 1u; x < width; x++)
{
tile_el_ix += 2u;
Alloc param_10 = tiles_alloc;
uint param_11 = tile_el_ix;
sum += read_mem(param_10, param_11, v_79);
Alloc param_12 = tiles_alloc;
uint param_13 = tile_el_ix;
uint param_14 = sum;
write_mem(param_12, param_13, param_14, v_79);
}
}
}
}

Binary file not shown.


@@ -0,0 +1,283 @@
struct Alloc
{
uint offset;
};
struct AnnotatedRef
{
uint offset;
};
struct AnnotatedTag
{
uint tag;
uint flags;
};
struct PathRef
{
uint offset;
};
struct TileRef
{
uint offset;
};
struct Path
{
uint4 bbox;
TileRef tiles;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
Alloc bbox_alloc;
Alloc drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
static const uint3 gl_WorkGroupSize = uint3(256u, 4u, 1u);
RWByteAddressBuffer _79 : register(u0, space0);
ByteAddressBuffer _186 : register(t1, space0);
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;
static uint gl_LocalInvocationIndex;
struct SPIRV_Cross_Input
{
uint3 gl_LocalInvocationID : SV_GroupThreadID;
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
uint gl_LocalInvocationIndex : SV_GroupIndex;
};
groupshared uint sh_row_width[256];
groupshared Alloc sh_row_alloc[256];
groupshared uint sh_row_count[256];
bool touch_mem(Alloc alloc, uint offset)
{
return true;
}
uint read_mem(Alloc alloc, uint offset)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = _79.Load(offset * 4 + 8);
return v;
}
AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1);
AnnotatedTag _121 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
return _121;
}
uint fill_mode_from_flags(uint flags)
{
return flags & 1u;
}
Path Path_read(Alloc a, PathRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5);
Path s;
s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
TileRef _165 = { raw2 };
s.tiles = _165;
return s;
}
Alloc new_alloc(uint offset, uint size, bool mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
void write_mem(Alloc alloc, uint offset, uint val)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
_79.Store(offset * 4 + 8, val);
}
void comp_main()
{
uint th_ix = gl_LocalInvocationIndex;
uint element_ix = gl_GlobalInvocationID.x;
AnnotatedRef _194 = { _186.Load(32) + (element_ix * 40u) };
AnnotatedRef ref = _194;
uint row_count = 0u;
bool mem_ok = _79.Load(4) == 0u;
if (gl_LocalInvocationID.y == 0u)
{
if (element_ix < _186.Load(0))
{
Alloc _217;
_217.offset = _186.Load(32);
Alloc param;
param.offset = _217.offset;
AnnotatedRef param_1 = ref;
AnnotatedTag tag = Annotated_tag(param, param_1);
switch (tag.tag)
{
case 3u:
case 2u:
case 4u:
case 1u:
{
uint param_2 = tag.flags;
if (fill_mode_from_flags(param_2) != 0u)
{
break;
}
PathRef _243 = { _186.Load(16) + (element_ix * 12u) };
PathRef path_ref = _243;
Alloc _247;
_247.offset = _186.Load(16);
Alloc param_3;
param_3.offset = _247.offset;
PathRef param_4 = path_ref;
Path path = Path_read(param_3, param_4);
sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
row_count = path.bbox.w - path.bbox.y;
bool _272 = row_count == 1u;
bool _278;
if (_272)
{
_278 = path.bbox.y > 0u;
}
else
{
_278 = _272;
}
if (_278)
{
row_count = 0u;
}
uint param_5 = path.tiles.offset;
uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
bool param_7 = mem_ok;
Alloc path_alloc = new_alloc(param_5, param_6, param_7);
sh_row_alloc[th_ix] = path_alloc;
break;
}
}
}
sh_row_count[th_ix] = row_count;
}
for (uint i = 0u; i < 8u; i++)
{
GroupMemoryBarrierWithGroupSync();
bool _325 = gl_LocalInvocationID.y == 0u;
bool _332;
if (_325)
{
_332 = th_ix >= (1u << i);
}
else
{
_332 = _325;
}
if (_332)
{
row_count += sh_row_count[th_ix - (1u << i)];
}
GroupMemoryBarrierWithGroupSync();
if (gl_LocalInvocationID.y == 0u)
{
sh_row_count[th_ix] = row_count;
}
}
GroupMemoryBarrierWithGroupSync();
uint total_rows = sh_row_count[255];
uint _411;
for (uint row = th_ix; row < total_rows; row += 1024u)
{
uint el_ix = 0u;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
uint probe = el_ix + (128u >> i_1);
if (row >= sh_row_count[probe - 1u])
{
el_ix = probe;
}
}
uint width = sh_row_width[el_ix];
if ((width > 0u) && mem_ok)
{
Alloc tiles_alloc = sh_row_alloc[el_ix];
if (el_ix > 0u)
{
_411 = sh_row_count[el_ix - 1u];
}
else
{
_411 = 0u;
}
uint seq_ix = row - _411;
uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
Alloc param_8 = tiles_alloc;
uint param_9 = tile_el_ix;
uint sum = read_mem(param_8, param_9);
for (uint x = 1u; x < width; x++)
{
tile_el_ix += 2u;
Alloc param_10 = tiles_alloc;
uint param_11 = tile_el_ix;
sum += read_mem(param_10, param_11);
Alloc param_12 = tiles_alloc;
uint param_13 = tile_el_ix;
uint param_14 = sum;
write_mem(param_12, param_13, param_14);
}
}
}
}
[numthreads(256, 4, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex;
comp_main();
}


@@ -0,0 +1,284 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct Alloc
{
uint offset;
};
struct AnnotatedRef
{
uint offset;
};
struct AnnotatedTag
{
uint tag;
uint flags;
};
struct PathRef
{
uint offset;
};
struct TileRef
{
uint offset;
};
struct Path
{
uint4 bbox;
TileRef tiles;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
struct Alloc_1
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
Alloc_1 bbox_alloc;
Alloc_1 drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
struct ConfigBuf
{
Config conf;
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 4u, 1u);
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
return true;
}
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = v_79.memory[offset];
return v;
}
static inline __attribute__((always_inline))
AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1, v_79);
return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
}
static inline __attribute__((always_inline))
uint fill_mode_from_flags(thread const uint& flags)
{
return flags & 1u;
}
static inline __attribute__((always_inline))
Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_79);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_79);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_79);
Path s;
s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
s.tiles = TileRef{ raw2 };
return s;
}
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
v_79.memory[offset] = val;
}
kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
threadgroup uint sh_row_width[256];
threadgroup Alloc sh_row_alloc[256];
threadgroup uint sh_row_count[256];
uint th_ix = gl_LocalInvocationIndex;
uint element_ix = gl_GlobalInvocationID.x;
AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) };
uint row_count = 0u;
bool mem_ok = v_79.mem_error == 0u;
if (gl_LocalInvocationID.y == 0u)
{
if (element_ix < _186.conf.n_elements)
{
Alloc param;
param.offset = _186.conf.anno_alloc.offset;
AnnotatedRef param_1 = ref;
AnnotatedTag tag = Annotated_tag(param, param_1, v_79);
switch (tag.tag)
{
case 3u:
case 2u:
case 4u:
case 1u:
{
uint param_2 = tag.flags;
if (fill_mode_from_flags(param_2) != 0u)
{
break;
}
PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) };
Alloc param_3;
param_3.offset = _186.conf.tile_alloc.offset;
PathRef param_4 = path_ref;
Path path = Path_read(param_3, param_4, v_79);
sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
row_count = path.bbox.w - path.bbox.y;
bool _272 = row_count == 1u;
bool _278;
if (_272)
{
_278 = path.bbox.y > 0u;
}
else
{
_278 = _272;
}
if (_278)
{
row_count = 0u;
}
uint param_5 = path.tiles.offset;
uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
bool param_7 = mem_ok;
Alloc path_alloc = new_alloc(param_5, param_6, param_7);
sh_row_alloc[th_ix] = path_alloc;
break;
}
}
}
sh_row_count[th_ix] = row_count;
}
for (uint i = 0u; i < 8u; i++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
bool _325 = gl_LocalInvocationID.y == 0u;
bool _332;
if (_325)
{
_332 = th_ix >= (1u << i);
}
else
{
_332 = _325;
}
if (_332)
{
row_count += sh_row_count[th_ix - (1u << i)];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (gl_LocalInvocationID.y == 0u)
{
sh_row_count[th_ix] = row_count;
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint total_rows = sh_row_count[255];
uint _411;
for (uint row = th_ix; row < total_rows; row += 1024u)
{
uint el_ix = 0u;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
uint probe = el_ix + (128u >> i_1);
if (row >= sh_row_count[probe - 1u])
{
el_ix = probe;
}
}
uint width = sh_row_width[el_ix];
if ((width > 0u) && mem_ok)
{
Alloc tiles_alloc = sh_row_alloc[el_ix];
if (el_ix > 0u)
{
_411 = sh_row_count[el_ix - 1u];
}
else
{
_411 = 0u;
}
uint seq_ix = row - _411;
uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
Alloc param_8 = tiles_alloc;
uint param_9 = tile_el_ix;
uint sum = read_mem(param_8, param_9, v_79);
for (uint x = 1u; x < width; x++)
{
tile_el_ix += 2u;
Alloc param_10 = tiles_alloc;
uint param_11 = tile_el_ix;
sum += read_mem(param_10, param_11, v_79);
Alloc param_12 = tiles_alloc;
uint param_13 = tile_el_ix;
uint param_14 = sum;
write_mem(param_12, param_13, param_14, v_79);
}
}
}
}


@@ -27,8 +27,8 @@ struct Config
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
ByteAddressBuffer _21 : register(t1);
RWByteAddressBuffer _45 : register(u0);
ByteAddressBuffer _21 : register(t1, space0);
RWByteAddressBuffer _45 : register(u0, space0);
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input

Binary file not shown.


@@ -0,0 +1,352 @@
struct Alloc
{
uint offset;
};
struct MallocResult
{
Alloc alloc;
bool failed;
};
struct AnnoEndClipRef
{
uint offset;
};
struct AnnoEndClip
{
float4 bbox;
};
struct AnnotatedRef
{
uint offset;
};
struct AnnotatedTag
{
uint tag;
uint flags;
};
struct BinInstanceRef
{
uint offset;
};
struct BinInstance
{
uint element_ix;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
Alloc bbox_alloc;
Alloc drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
RWByteAddressBuffer _84 : register(u0, space0);
ByteAddressBuffer _253 : register(t1, space0);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_WorkGroupID : SV_GroupID;
uint3 gl_LocalInvocationID : SV_GroupThreadID;
};
groupshared uint bitmaps[8][256];
groupshared bool sh_alloc_failed;
groupshared uint count[8][256];
groupshared Alloc sh_chunk_alloc[256];
bool touch_mem(Alloc alloc, uint offset)
{
return true;
}
uint read_mem(Alloc alloc, uint offset)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = _84.Load(offset * 4 + 8);
return v;
}
AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1);
AnnotatedTag _221 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
return _221;
}
AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7);
AnnoEndClip s;
s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
return s;
}
AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
{
AnnoEndClipRef _228 = { ref.offset + 4u };
Alloc param = a;
AnnoEndClipRef param_1 = _228;
return AnnoEndClip_read(param, param_1);
}
Alloc new_alloc(uint offset, uint size, bool mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
MallocResult malloc(uint size)
{
uint _90;
_84.InterlockedAdd(0, size, _90);
uint offset = _90;
uint _97;
_84.GetDimensions(_97);
_97 = (_97 - 8) / 4;
MallocResult r;
r.failed = (offset + size) > uint(int(_97) * 4);
uint param = offset;
uint param_1 = size;
bool param_2 = !r.failed;
r.alloc = new_alloc(param, param_1, param_2);
if (r.failed)
{
uint _119;
_84.InterlockedMax(4, 1u, _119);
return r;
}
return r;
}
void write_mem(Alloc alloc, uint offset, uint val)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
_84.Store(offset * 4 + 8, val);
}
void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.element_ix;
write_mem(param, param_1, param_2);
}
void comp_main()
{
uint my_n_elements = _253.Load(0);
uint my_partition = gl_WorkGroupID.x;
for (uint i = 0u; i < 8u; i++)
{
bitmaps[i][gl_LocalInvocationID.x] = 0u;
}
if (gl_LocalInvocationID.x == 0u)
{
sh_alloc_failed = false;
}
GroupMemoryBarrierWithGroupSync();
uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
AnnotatedRef _308 = { _253.Load(32) + (element_ix * 40u) };
AnnotatedRef ref = _308;
uint tag = 0u;
if (element_ix < my_n_elements)
{
Alloc _318;
_318.offset = _253.Load(32);
Alloc param;
param.offset = _318.offset;
AnnotatedRef param_1 = ref;
tag = Annotated_tag(param, param_1).tag;
}
int x0 = 0;
int y0 = 0;
int x1 = 0;
int y1 = 0;
switch (tag)
{
case 1u:
case 2u:
case 3u:
case 4u:
case 5u:
{
Alloc _336;
_336.offset = _253.Load(32);
Alloc param_2;
param_2.offset = _336.offset;
AnnotatedRef param_3 = ref;
AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3);
x0 = int(floor(clip.bbox.x * 0.00390625f));
y0 = int(floor(clip.bbox.y * 0.00390625f));
x1 = int(ceil(clip.bbox.z * 0.00390625f));
y1 = int(ceil(clip.bbox.w * 0.00390625f));
break;
}
}
uint width_in_bins = ((_253.Load(8) + 16u) - 1u) / 16u;
uint height_in_bins = ((_253.Load(12) + 16u) - 1u) / 16u;
x0 = clamp(x0, 0, int(width_in_bins));
x1 = clamp(x1, x0, int(width_in_bins));
y0 = clamp(y0, 0, int(height_in_bins));
y1 = clamp(y1, y0, int(height_in_bins));
if (x0 == x1)
{
y1 = y0;
}
int x = x0;
int y = y0;
uint my_slice = gl_LocalInvocationID.x / 32u;
uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u));
while (y < y1)
{
uint _438;
InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _438);
x++;
if (x == x1)
{
x = x0;
y++;
}
}
GroupMemoryBarrierWithGroupSync();
uint element_count = 0u;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x])));
count[i_1][gl_LocalInvocationID.x] = element_count;
}
uint param_4 = 0u;
uint param_5 = 0u;
bool param_6 = true;
Alloc chunk_alloc = new_alloc(param_4, param_5, param_6);
if (element_count != 0u)
{
uint param_7 = element_count * 4u;
MallocResult _488 = malloc(param_7);
MallocResult chunk = _488;
chunk_alloc = chunk.alloc;
sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
if (chunk.failed)
{
sh_alloc_failed = true;
}
}
uint out_ix = (_253.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
Alloc _517;
_517.offset = _253.Load(20);
Alloc param_8;
param_8.offset = _517.offset;
uint param_9 = out_ix;
uint param_10 = element_count;
write_mem(param_8, param_9, param_10);
Alloc _529;
_529.offset = _253.Load(20);
Alloc param_11;
param_11.offset = _529.offset;
uint param_12 = out_ix + 1u;
uint param_13 = chunk_alloc.offset;
write_mem(param_11, param_12, param_13);
GroupMemoryBarrierWithGroupSync();
bool _544;
if (!sh_alloc_failed)
{
_544 = _84.Load(4) != 0u;
}
else
{
_544 = sh_alloc_failed;
}
if (_544)
{
return;
}
x = x0;
y = y0;
while (y < y1)
{
uint bin_ix = (uint(y) * width_in_bins) + uint(x);
uint out_mask = bitmaps[my_slice][bin_ix];
if ((out_mask & my_mask) != 0u)
{
uint idx = uint(int(countbits(out_mask & (my_mask - 1u))));
if (my_slice > 0u)
{
idx += count[my_slice - 1u][bin_ix];
}
Alloc out_alloc = sh_chunk_alloc[bin_ix];
uint out_offset = out_alloc.offset + (idx * 4u);
BinInstanceRef _606 = { out_offset };
BinInstance _608 = { element_ix };
Alloc param_14 = out_alloc;
BinInstanceRef param_15 = _606;
BinInstance param_16 = _608;
BinInstance_write(param_14, param_15, param_16);
}
x++;
if (x == x1)
{
x = x0;
y++;
}
}
}
[numthreads(256, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_WorkGroupID = stage_input.gl_WorkGroupID;
gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
comp_main();
}


@@ -0,0 +1,350 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
struct Alloc
{
uint offset;
};
struct MallocResult
{
Alloc alloc;
bool failed;
};
struct AnnoEndClipRef
{
uint offset;
};
struct AnnoEndClip
{
float4 bbox;
};
struct AnnotatedRef
{
uint offset;
};
struct AnnotatedTag
{
uint tag;
uint flags;
};
struct BinInstanceRef
{
uint offset;
};
struct BinInstance
{
uint element_ix;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
struct Alloc_1
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
Alloc_1 bbox_alloc;
Alloc_1 drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
struct ConfigBuf
{
Config conf;
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
return true;
}
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_84, constant uint& v_84BufferSize)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = v_84.memory[offset];
return v;
}
static inline __attribute__((always_inline))
AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1, v_84, v_84BufferSize);
return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
}
static inline __attribute__((always_inline))
AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_84, constant uint& v_84BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_84, v_84BufferSize);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_84, v_84BufferSize);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_84, v_84BufferSize);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7, v_84, v_84BufferSize);
AnnoEndClip s;
s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
return s;
}
static inline __attribute__((always_inline))
AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize)
{
Alloc param = a;
AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
return AnnoEndClip_read(param, param_1, v_84, v_84BufferSize);
}
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
static inline __attribute__((always_inline))
MallocResult malloc(thread const uint& size, device Memory& v_84, constant uint& v_84BufferSize)
{
uint _90 = atomic_fetch_add_explicit((device atomic_uint*)&v_84.mem_offset, size, memory_order_relaxed);
uint offset = _90;
MallocResult r;
r.failed = (offset + size) > uint(int((v_84BufferSize - 8) / 4) * 4);
uint param = offset;
uint param_1 = size;
bool param_2 = !r.failed;
r.alloc = new_alloc(param, param_1, param_2);
if (r.failed)
{
uint _119 = atomic_fetch_max_explicit((device atomic_uint*)&v_84.mem_error, 1u, memory_order_relaxed);
return r;
}
return r;
}
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_84, constant uint& v_84BufferSize)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
v_84.memory[offset] = val;
}
static inline __attribute__((always_inline))
void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_84, constant uint& v_84BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.element_ix;
write_mem(param, param_1, param_2, v_84, v_84BufferSize);
}
kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_84 [[buffer(0)]], const device ConfigBuf& _253 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
threadgroup uint bitmaps[8][256];
threadgroup short sh_alloc_failed;
threadgroup uint count[8][256];
threadgroup Alloc sh_chunk_alloc[256];
constant uint& v_84BufferSize = spvBufferSizeConstants[0];
uint my_n_elements = _253.conf.n_elements;
uint my_partition = gl_WorkGroupID.x;
for (uint i = 0u; i < 8u; i++)
{
bitmaps[i][gl_LocalInvocationID.x] = 0u;
}
if (gl_LocalInvocationID.x == 0u)
{
sh_alloc_failed = short(false);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
AnnotatedRef ref = AnnotatedRef{ _253.conf.anno_alloc.offset + (element_ix * 40u) };
uint tag = 0u;
if (element_ix < my_n_elements)
{
Alloc param;
param.offset = _253.conf.anno_alloc.offset;
AnnotatedRef param_1 = ref;
tag = Annotated_tag(param, param_1, v_84, v_84BufferSize).tag;
}
int x0 = 0;
int y0 = 0;
int x1 = 0;
int y1 = 0;
switch (tag)
{
case 1u:
case 2u:
case 3u:
case 4u:
case 5u:
{
Alloc param_2;
param_2.offset = _253.conf.anno_alloc.offset;
AnnotatedRef param_3 = ref;
AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_84, v_84BufferSize);
x0 = int(floor(clip.bbox.x * 0.00390625));
y0 = int(floor(clip.bbox.y * 0.00390625));
x1 = int(ceil(clip.bbox.z * 0.00390625));
y1 = int(ceil(clip.bbox.w * 0.00390625));
break;
}
}
uint width_in_bins = ((_253.conf.width_in_tiles + 16u) - 1u) / 16u;
uint height_in_bins = ((_253.conf.height_in_tiles + 16u) - 1u) / 16u;
x0 = clamp(x0, 0, int(width_in_bins));
x1 = clamp(x1, x0, int(width_in_bins));
y0 = clamp(y0, 0, int(height_in_bins));
y1 = clamp(y1, y0, int(height_in_bins));
if (x0 == x1)
{
y1 = y0;
}
int x = x0;
int y = y0;
uint my_slice = gl_LocalInvocationID.x / 32u;
uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u));
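// The 256-element-per-bin bitmap is split into 8 uint slices of 32 bits each;
// my_slice/my_mask select this element's bit within the bitmap.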
while (y < y1)
{
uint _438 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
x++;
if (x == x1)
{
x = x0;
y++;
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint element_count = 0u;
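// Count the elements that landed in this thread's bin, keeping a running
// (inclusive) per-slice total in count[][] for the scatter pass below.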
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x])));
count[i_1][gl_LocalInvocationID.x] = element_count;
}
uint param_4 = 0u;
uint param_5 = 0u;
bool param_6 = true;
Alloc chunk_alloc = new_alloc(param_4, param_5, param_6);
if (element_count != 0u)
{
uint param_7 = element_count * 4u;
MallocResult _488 = malloc(param_7, v_84, v_84BufferSize);
MallocResult chunk = _488;
chunk_alloc = chunk.alloc;
sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
if (chunk.failed)
{
sh_alloc_failed = short(true);
}
}
uint out_ix = (_253.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
Alloc param_8;
param_8.offset = _253.conf.bin_alloc.offset;
uint param_9 = out_ix;
uint param_10 = element_count;
write_mem(param_8, param_9, param_10, v_84, v_84BufferSize);
Alloc param_11;
param_11.offset = _253.conf.bin_alloc.offset;
uint param_12 = out_ix + 1u;
uint param_13 = chunk_alloc.offset;
write_mem(param_11, param_12, param_13, v_84, v_84BufferSize);
threadgroup_barrier(mem_flags::mem_threadgroup);
bool _544;
if (!bool(sh_alloc_failed))
{
_544 = v_84.mem_error != 0u;
}
else
{
_544 = bool(sh_alloc_failed);
}
if (_544)
{
return;
}
x = x0;
y = y0;
while (y < y1)
{
uint bin_ix = (uint(y) * width_in_bins) + uint(x);
uint out_mask = bitmaps[my_slice][bin_ix];
if ((out_mask & my_mask) != 0u)
{
uint idx = uint(int(popcount(out_mask & (my_mask - 1u))));
if (my_slice > 0u)
{
idx += count[my_slice - 1u][bin_ix];
}
Alloc out_alloc = sh_chunk_alloc[bin_ix];
uint out_offset = out_alloc.offset + (idx * 4u);
Alloc param_14 = out_alloc;
BinInstanceRef param_15 = BinInstanceRef{ out_offset };
BinInstance param_16 = BinInstance{ element_ix };
BinInstance_write(param_14, param_15, param_16, v_84, v_84BufferSize);
}
x++;
if (x == x1)
{
x = x0;
y++;
}
}
}

Binary file not shown.

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -158,10 +158,10 @@ static const DrawMonoid _443 = { 1u, 0u };
static const DrawMonoid _445 = { 1u, 1u };
static const DrawMonoid _447 = { 0u, 1u };
RWByteAddressBuffer _201 : register(u0);
ByteAddressBuffer _225 : register(t2);
ByteAddressBuffer _1008 : register(t3);
ByteAddressBuffer _1042 : register(t1);
RWByteAddressBuffer _201 : register(u0, space0);
ByteAddressBuffer _225 : register(t2, space0);
ByteAddressBuffer _1008 : register(t3, space0);
ByteAddressBuffer _1042 : register(t1, space0);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;

View file

@ -49,10 +49,10 @@ static const DrawMonoid _90 = { 1u, 1u };
static const DrawMonoid _92 = { 0u, 1u };
static const DrawMonoid _94 = { 0u, 0u };
ByteAddressBuffer _46 : register(t2);
RWByteAddressBuffer _203 : register(u3);
RWByteAddressBuffer _217 : register(u0);
ByteAddressBuffer _223 : register(t1);
ByteAddressBuffer _46 : register(t2, space0);
RWByteAddressBuffer _203 : register(u3, space0);
RWByteAddressBuffer _217 : register(u0, space0);
ByteAddressBuffer _223 : register(t1, space0);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;

View file

@ -8,7 +8,7 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const DrawMonoid _18 = { 0u, 0u };
RWByteAddressBuffer _57 : register(u0);
RWByteAddressBuffer _57 : register(u0, space0);
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;

Binary file not shown.

View file

@ -0,0 +1,689 @@
struct Alloc
{
uint offset;
};
struct CmdStrokeRef
{
uint offset;
};
struct CmdStroke
{
uint tile_ref;
float half_width;
};
struct CmdFillRef
{
uint offset;
};
struct CmdFill
{
uint tile_ref;
int backdrop;
};
struct CmdColorRef
{
uint offset;
};
struct CmdColor
{
uint rgba_color;
};
struct CmdLinGradRef
{
uint offset;
};
struct CmdLinGrad
{
uint index;
float line_x;
float line_y;
float line_c;
};
struct CmdImageRef
{
uint offset;
};
struct CmdImage
{
uint index;
int2 offset;
};
struct CmdAlphaRef
{
uint offset;
};
struct CmdAlpha
{
float alpha;
};
struct CmdJumpRef
{
uint offset;
};
struct CmdJump
{
uint new_ref;
};
struct CmdRef
{
uint offset;
};
struct CmdTag
{
uint tag;
uint flags;
};
struct TileSegRef
{
uint offset;
};
struct TileSeg
{
float2 origin;
float2 _vector;
float y_edge;
TileSegRef next;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
Alloc bbox_alloc;
Alloc drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u);
RWByteAddressBuffer _202 : register(u0, space0);
ByteAddressBuffer _723 : register(t1, space0);
RWTexture2D<unorm float4> image_atlas : register(u3, space0);
RWTexture2D<unorm float4> gradients : register(u4, space0);
RWTexture2D<unorm float4> image : register(u2, space0);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_WorkGroupID : SV_GroupID;
uint3 gl_LocalInvocationID : SV_GroupThreadID;
};
uint spvPackUnorm4x8(float4 value)
{
uint4 Packed = uint4(round(saturate(value) * 255.0));
return Packed.x | (Packed.y << 8) | (Packed.z << 16) | (Packed.w << 24);
}
float4 spvUnpackUnorm4x8(uint value)
{
uint4 Packed = uint4(value & 0xff, (value >> 8) & 0xff, (value >> 16) & 0xff, value >> 24);
return float4(Packed) / 255.0;
}
Alloc slice_mem(Alloc a, uint offset, uint size)
{
Alloc _215 = { a.offset + offset };
return _215;
}
bool touch_mem(Alloc alloc, uint offset)
{
return true;
}
uint read_mem(Alloc alloc, uint offset)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = _202.Load(offset * 4 + 8);
return v;
}
CmdTag Cmd_tag(Alloc a, CmdRef ref)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1);
CmdTag _432 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
return _432;
}
CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
CmdStroke s;
s.tile_ref = raw0;
s.half_width = asfloat(raw1);
return s;
}
CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref)
{
CmdStrokeRef _449 = { ref.offset + 4u };
Alloc param = a;
CmdStrokeRef param_1 = _449;
return CmdStroke_read(param, param_1);
}
Alloc new_alloc(uint offset, uint size, bool mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
TileSeg TileSeg_read(Alloc a, TileSegRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7);
Alloc param_8 = a;
uint param_9 = ix + 4u;
uint raw4 = read_mem(param_8, param_9);
Alloc param_10 = a;
uint param_11 = ix + 5u;
uint raw5 = read_mem(param_10, param_11);
TileSeg s;
s.origin = float2(asfloat(raw0), asfloat(raw1));
s._vector = float2(asfloat(raw2), asfloat(raw3));
s.y_edge = asfloat(raw4);
TileSegRef _572 = { raw5 };
s.next = _572;
return s;
}
uint2 chunk_offset(uint i)
{
return uint2((i % 2u) * 8u, (i / 2u) * 4u);
}
CmdFill CmdFill_read(Alloc a, CmdFillRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
CmdFill s;
s.tile_ref = raw0;
s.backdrop = int(raw1);
return s;
}
CmdFill Cmd_Fill_read(Alloc a, CmdRef ref)
{
CmdFillRef _439 = { ref.offset + 4u };
Alloc param = a;
CmdFillRef param_1 = _439;
return CmdFill_read(param, param_1);
}
CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
CmdAlpha s;
s.alpha = asfloat(raw0);
return s;
}
CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref)
{
CmdAlphaRef _459 = { ref.offset + 4u };
Alloc param = a;
CmdAlphaRef param_1 = _459;
return CmdAlpha_read(param, param_1);
}
CmdColor CmdColor_read(Alloc a, CmdColorRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
CmdColor s;
s.rgba_color = raw0;
return s;
}
CmdColor Cmd_Color_read(Alloc a, CmdRef ref)
{
CmdColorRef _469 = { ref.offset + 4u };
Alloc param = a;
CmdColorRef param_1 = _469;
return CmdColor_read(param, param_1);
}
float3 fromsRGB(float3 srgb)
{
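// Standard sRGB-to-linear conversion: linear segment below 0.04045, gamma-2.4 power curve above.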
bool3 cutoff = bool3(srgb.x >= 0.040449999272823333740234375f.xxx.x, srgb.y >= 0.040449999272823333740234375f.xxx.y, srgb.z >= 0.040449999272823333740234375f.xxx.z);
float3 below = srgb / 12.9200000762939453125f.xxx;
float3 above = pow((srgb + 0.054999999701976776123046875f.xxx) / 1.05499994754791259765625f.xxx, 2.400000095367431640625f.xxx);
return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
}
float4 unpacksRGB(uint srgba)
{
float4 color = spvUnpackUnorm4x8(srgba).wzyx;
float3 param = color.xyz;
return float4(fromsRGB(param), color.w);
}
CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7);
CmdLinGrad s;
s.index = raw0;
s.line_x = asfloat(raw1);
s.line_y = asfloat(raw2);
s.line_c = asfloat(raw3);
return s;
}
CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref)
{
CmdLinGradRef _479 = { ref.offset + 4u };
Alloc param = a;
CmdLinGradRef param_1 = _479;
return CmdLinGrad_read(param, param_1);
}
CmdImage CmdImage_read(Alloc a, CmdImageRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
CmdImage s;
s.index = raw0;
s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
return s;
}
CmdImage Cmd_Image_read(Alloc a, CmdRef ref)
{
CmdImageRef _489 = { ref.offset + 4u };
Alloc param = a;
CmdImageRef param_1 = _489;
return CmdImage_read(param, param_1);
}
void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img)
{
float4 rgba[8];
for (uint i = 0u; i < 8u; i++)
{
uint param = i;
int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
float4 fg_rgba = image_atlas[uv];
float3 param_1 = fg_rgba.xyz;
float3 _695 = fromsRGB(param_1);
fg_rgba.x = _695.x;
fg_rgba.y = _695.y;
fg_rgba.z = _695.z;
rgba[i] = fg_rgba;
}
spvReturnValue = rgba;
}
float3 tosRGB(float3 rgb)
{
bool3 cutoff = bool3(rgb.x >= 0.003130800090730190277099609375f.xxx.x, rgb.y >= 0.003130800090730190277099609375f.xxx.y, rgb.z >= 0.003130800090730190277099609375f.xxx.z);
float3 below = 12.9200000762939453125f.xxx * rgb;
float3 above = (1.05499994754791259765625f.xxx * pow(rgb, 0.416660010814666748046875f.xxx)) - 0.054999999701976776123046875f.xxx;
return float3(cutoff.x ? above.x : below.x, cutoff.y ? above.y : below.y, cutoff.z ? above.z : below.z);
}
uint packsRGB(inout float4 rgba)
{
float3 param = rgba.xyz;
rgba = float4(tosRGB(param), rgba.w);
return spvPackUnorm4x8(rgba.wzyx);
}
CmdJump CmdJump_read(Alloc a, CmdJumpRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
CmdJump s;
s.new_ref = raw0;
return s;
}
CmdJump Cmd_Jump_read(Alloc a, CmdRef ref)
{
CmdJumpRef _499 = { ref.offset + 4u };
Alloc param = a;
CmdJumpRef param_1 = _499;
return CmdJump_read(param, param_1);
}
void comp_main()
{
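// Fine rasterization: each workgroup shades one 16x16 pixel tile by walking its
// per-tile command list (ptcl, 1024 bytes per tile); each of the 8x4 threads
// covers 8 pixels via chunk_offset.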
uint tile_ix = (gl_WorkGroupID.y * _723.Load(8)) + gl_WorkGroupID.x;
Alloc _738;
_738.offset = _723.Load(24);
Alloc param;
param.offset = _738.offset;
uint param_1 = tile_ix * 1024u;
uint param_2 = 1024u;
Alloc cmd_alloc = slice_mem(param, param_1, param_2);
CmdRef _747 = { cmd_alloc.offset };
CmdRef cmd_ref = _747;
uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
float2 xy = float2(xy_uint);
float4 rgba[8];
for (uint i = 0u; i < 8u; i++)
{
rgba[i] = 0.0f.xxxx;
}
uint clip_depth = 0u;
bool mem_ok = _202.Load(4) == 0u;
float df[8];
TileSegRef tile_seg_ref;
float area[8];
uint blend_stack[128][8];
float blend_alpha_stack[128][8];
while (mem_ok)
{
Alloc param_3 = cmd_alloc;
CmdRef param_4 = cmd_ref;
uint tag = Cmd_tag(param_3, param_4).tag;
if (tag == 0u)
{
break;
}
switch (tag)
{
case 2u:
{
Alloc param_5 = cmd_alloc;
CmdRef param_6 = cmd_ref;
CmdStroke stroke = Cmd_Stroke_read(param_5, param_6);
for (uint k = 0u; k < 8u; k++)
{
df[k] = 1000000000.0f;
}
TileSegRef _842 = { stroke.tile_ref };
tile_seg_ref = _842;
do
{
uint param_7 = tile_seg_ref.offset;
uint param_8 = 24u;
bool param_9 = mem_ok;
Alloc param_10 = new_alloc(param_7, param_8, param_9);
TileSegRef param_11 = tile_seg_ref;
TileSeg seg = TileSeg_read(param_10, param_11);
float2 line_vec = seg._vector;
for (uint k_1 = 0u; k_1 < 8u; k_1++)
{
float2 dpos = (xy + 0.5f.xx) - seg.origin;
uint param_12 = k_1;
dpos += float2(chunk_offset(param_12));
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0f, 1.0f);
df[k_1] = min(df[k_1], length((line_vec * t) - dpos));
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0u);
for (uint k_2 = 0u; k_2 < 8u; k_2++)
{
area[k_2] = clamp((stroke.half_width + 0.5f) - df[k_2], 0.0f, 1.0f);
}
cmd_ref.offset += 12u;
break;
}
case 1u:
{
Alloc param_13 = cmd_alloc;
CmdRef param_14 = cmd_ref;
CmdFill fill = Cmd_Fill_read(param_13, param_14);
for (uint k_3 = 0u; k_3 < 8u; k_3++)
{
area[k_3] = float(fill.backdrop);
}
TileSegRef _964 = { fill.tile_ref };
tile_seg_ref = _964;
do
{
uint param_15 = tile_seg_ref.offset;
uint param_16 = 24u;
bool param_17 = mem_ok;
Alloc param_18 = new_alloc(param_15, param_16, param_17);
TileSegRef param_19 = tile_seg_ref;
TileSeg seg_1 = TileSeg_read(param_18, param_19);
for (uint k_4 = 0u; k_4 < 8u; k_4++)
{
uint param_20 = k_4;
float2 my_xy = xy + float2(chunk_offset(param_20));
float2 start = seg_1.origin - my_xy;
float2 end = start + seg_1._vector;
float2 window = clamp(float2(start.y, end.y), 0.0f.xx, 1.0f.xx);
if (window.x != window.y)
{
float2 t_1 = (window - start.y.xx) / seg_1._vector.y.xx;
float2 xs = float2(lerp(start.x, end.x, t_1.x), lerp(start.x, end.x, t_1.y));
float xmin = min(min(xs.x, xs.y), 1.0f) - 9.9999999747524270787835121154785e-07f;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0f);
float c = max(b, 0.0f);
float d = max(xmin, 0.0f);
float a = ((b + (0.5f * ((d * d) - (c * c)))) - xmin) / (xmax - xmin);
area[k_4] += (a * (window.x - window.y));
}
area[k_4] += (sign(seg_1._vector.x) * clamp((my_xy.y - seg_1.y_edge) + 1.0f, 0.0f, 1.0f));
}
tile_seg_ref = seg_1.next;
} while (tile_seg_ref.offset != 0u);
for (uint k_5 = 0u; k_5 < 8u; k_5++)
{
area[k_5] = min(abs(area[k_5]), 1.0f);
}
cmd_ref.offset += 12u;
break;
}
case 3u:
{
for (uint k_6 = 0u; k_6 < 8u; k_6++)
{
area[k_6] = 1.0f;
}
cmd_ref.offset += 4u;
break;
}
case 4u:
{
Alloc param_21 = cmd_alloc;
CmdRef param_22 = cmd_ref;
CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22);
for (uint k_7 = 0u; k_7 < 8u; k_7++)
{
area[k_7] = alpha.alpha;
}
cmd_ref.offset += 8u;
break;
}
case 5u:
{
Alloc param_23 = cmd_alloc;
CmdRef param_24 = cmd_ref;
CmdColor color = Cmd_Color_read(param_23, param_24);
uint param_25 = color.rgba_color;
float4 fg = unpacksRGB(param_25);
for (uint k_8 = 0u; k_8 < 8u; k_8++)
{
float4 fg_k = fg * area[k_8];
rgba[k_8] = (rgba[k_8] * (1.0f - fg_k.w)) + fg_k;
}
cmd_ref.offset += 8u;
break;
}
case 6u:
{
Alloc param_26 = cmd_alloc;
CmdRef param_27 = cmd_ref;
CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27);
float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
for (uint k_9 = 0u; k_9 < 8u; k_9++)
{
uint param_28 = k_9;
float2 chunk_xy = float2(chunk_offset(param_28));
float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y);
int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f));
float4 fg_rgba = gradients[int2(x, int(lin.index))];
float3 param_29 = fg_rgba.xyz;
float3 _1298 = fromsRGB(param_29);
fg_rgba.x = _1298.x;
fg_rgba.y = _1298.y;
fg_rgba.z = _1298.z;
rgba[k_9] = fg_rgba;
}
cmd_ref.offset += 20u;
break;
}
case 7u:
{
Alloc param_30 = cmd_alloc;
CmdRef param_31 = cmd_ref;
CmdImage fill_img = Cmd_Image_read(param_30, param_31);
uint2 param_32 = xy_uint;
CmdImage param_33 = fill_img;
float4 _1327[8];
fillImage(_1327, param_32, param_33);
float4 img[8] = _1327;
for (uint k_10 = 0u; k_10 < 8u; k_10++)
{
float4 fg_k_1 = img[k_10] * area[k_10];
rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_1.w)) + fg_k_1;
}
cmd_ref.offset += 12u;
break;
}
case 8u:
{
for (uint k_11 = 0u; k_11 < 8u; k_11++)
{
uint d_2 = min(clip_depth, 127u);
float4 param_34 = float4(rgba[k_11]);
uint _1390 = packsRGB(param_34);
blend_stack[d_2][k_11] = _1390;
blend_alpha_stack[d_2][k_11] = clamp(abs(area[k_11]), 0.0f, 1.0f);
rgba[k_11] = 0.0f.xxxx;
}
clip_depth++;
cmd_ref.offset += 4u;
break;
}
case 9u:
{
clip_depth--;
for (uint k_12 = 0u; k_12 < 8u; k_12++)
{
uint d_3 = min(clip_depth, 127u);
uint param_35 = blend_stack[d_3][k_12];
float4 bg = unpacksRGB(param_35);
float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12];
rgba[k_12] = (bg * (1.0f - fg_1.w)) + fg_1;
}
cmd_ref.offset += 4u;
break;
}
case 10u:
{
Alloc param_36 = cmd_alloc;
CmdRef param_37 = cmd_ref;
CmdRef _1469 = { Cmd_Jump_read(param_36, param_37).new_ref };
cmd_ref = _1469;
cmd_alloc.offset = cmd_ref.offset;
break;
}
}
}
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
uint param_38 = i_1;
float3 param_39 = rgba[i_1].xyz;
image[int2(xy_uint + chunk_offset(param_38))] = float4(tosRGB(param_39), rgba[i_1].w);
}
}
[numthreads(8, 4, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_WorkGroupID = stage_input.gl_WorkGroupID;
gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
comp_main();
}

View file

@ -0,0 +1,728 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wmissing-braces"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
template<typename T, size_t Num>
struct spvUnsafeArray
{
T elements[Num ? Num : 1];
thread T& operator [] (size_t pos) thread
{
return elements[pos];
}
constexpr const thread T& operator [] (size_t pos) const thread
{
return elements[pos];
}
device T& operator [] (size_t pos) device
{
return elements[pos];
}
constexpr const device T& operator [] (size_t pos) const device
{
return elements[pos];
}
constexpr const constant T& operator [] (size_t pos) const constant
{
return elements[pos];
}
threadgroup T& operator [] (size_t pos) threadgroup
{
return elements[pos];
}
constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
{
return elements[pos];
}
};
struct Alloc
{
uint offset;
};
struct CmdStrokeRef
{
uint offset;
};
struct CmdStroke
{
uint tile_ref;
float half_width;
};
struct CmdFillRef
{
uint offset;
};
struct CmdFill
{
uint tile_ref;
int backdrop;
};
struct CmdColorRef
{
uint offset;
};
struct CmdColor
{
uint rgba_color;
};
struct CmdLinGradRef
{
uint offset;
};
struct CmdLinGrad
{
uint index;
float line_x;
float line_y;
float line_c;
};
struct CmdImageRef
{
uint offset;
};
struct CmdImage
{
uint index;
int2 offset;
};
struct CmdAlphaRef
{
uint offset;
};
struct CmdAlpha
{
float alpha;
};
struct CmdJumpRef
{
uint offset;
};
struct CmdJump
{
uint new_ref;
};
struct CmdRef
{
uint offset;
};
struct CmdTag
{
uint tag;
uint flags;
};
struct TileSegRef
{
uint offset;
};
struct TileSeg
{
float2 origin;
float2 vector;
float y_edge;
TileSegRef next;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
struct Alloc_1
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
Alloc_1 bbox_alloc;
Alloc_1 drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
struct ConfigBuf
{
Config conf;
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u);
static inline __attribute__((always_inline))
Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
{
return Alloc{ a.offset + offset };
}
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
return true;
}
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_202)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = v_202.memory[offset];
return v;
}
static inline __attribute__((always_inline))
CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1, v_202);
return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
}
static inline __attribute__((always_inline))
CmdStroke CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_202)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_202);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_202);
CmdStroke s;
s.tile_ref = raw0;
s.half_width = as_type<float>(raw1);
return s;
}
static inline __attribute__((always_inline))
CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
{
Alloc param = a;
CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u };
return CmdStroke_read(param, param_1, v_202);
}
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
static inline __attribute__((always_inline))
TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_202)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_202);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_202);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_202);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7, v_202);
Alloc param_8 = a;
uint param_9 = ix + 4u;
uint raw4 = read_mem(param_8, param_9, v_202);
Alloc param_10 = a;
uint param_11 = ix + 5u;
uint raw5 = read_mem(param_10, param_11, v_202);
TileSeg s;
s.origin = float2(as_type<float>(raw0), as_type<float>(raw1));
s.vector = float2(as_type<float>(raw2), as_type<float>(raw3));
s.y_edge = as_type<float>(raw4);
s.next = TileSegRef{ raw5 };
return s;
}
static inline __attribute__((always_inline))
uint2 chunk_offset(thread const uint& i)
{
return uint2((i % 2u) * 8u, (i / 2u) * 4u);
}
static inline __attribute__((always_inline))
CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_202)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_202);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_202);
CmdFill s;
s.tile_ref = raw0;
s.backdrop = int(raw1);
return s;
}
static inline __attribute__((always_inline))
CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
{
Alloc param = a;
CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u };
return CmdFill_read(param, param_1, v_202);
}
static inline __attribute__((always_inline))
CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_202)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_202);
CmdAlpha s;
s.alpha = as_type<float>(raw0);
return s;
}
static inline __attribute__((always_inline))
CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
{
Alloc param = a;
CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u };
return CmdAlpha_read(param, param_1, v_202);
}
static inline __attribute__((always_inline))
CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_202)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_202);
CmdColor s;
s.rgba_color = raw0;
return s;
}
static inline __attribute__((always_inline))
CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
{
Alloc param = a;
CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u };
return CmdColor_read(param, param_1, v_202);
}
static inline __attribute__((always_inline))
float3 fromsRGB(thread const float3& srgb)
{
bool3 cutoff = srgb >= float3(0.040449999272823333740234375);
float3 below = srgb / float3(12.9200000762939453125);
float3 above = pow((srgb + float3(0.054999999701976776123046875)) / float3(1.05499994754791259765625), float3(2.400000095367431640625));
return select(below, above, cutoff);
}
static inline __attribute__((always_inline))
float4 unpacksRGB(thread const uint& srgba)
{
float4 color = unpack_unorm4x8_to_float(srgba).wzyx;
float3 param = color.xyz;
return float4(fromsRGB(param), color.w);
}
static inline __attribute__((always_inline))
CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_202)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_202);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_202);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_202);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7, v_202);
CmdLinGrad s;
s.index = raw0;
s.line_x = as_type<float>(raw1);
s.line_y = as_type<float>(raw2);
s.line_c = as_type<float>(raw3);
return s;
}
static inline __attribute__((always_inline))
CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
{
Alloc param = a;
CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u };
return CmdLinGrad_read(param, param_1, v_202);
}
static inline __attribute__((always_inline))
CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_202)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_202);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_202);
CmdImage s;
s.index = raw0;
s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
return s;
}
static inline __attribute__((always_inline))
CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
{
Alloc param = a;
CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u };
return CmdImage_read(param, param_1, v_202);
}
static inline __attribute__((always_inline))
spvUnsafeArray<float4, 8> fillImage(thread const uint2& xy, thread const CmdImage& cmd_img, thread texture2d<float> image_atlas)
{
spvUnsafeArray<float4, 8> rgba;
for (uint i = 0u; i < 8u; i++)
{
uint param = i;
int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset;
float4 fg_rgba = image_atlas.read(uint2(uv));
float3 param_1 = fg_rgba.xyz;
float3 _695 = fromsRGB(param_1);
fg_rgba.x = _695.x;
fg_rgba.y = _695.y;
fg_rgba.z = _695.z;
rgba[i] = fg_rgba;
}
return rgba;
}
static inline __attribute__((always_inline))
float3 tosRGB(thread const float3& rgb)
{
bool3 cutoff = rgb >= float3(0.003130800090730190277099609375);
float3 below = float3(12.9200000762939453125) * rgb;
float3 above = (float3(1.05499994754791259765625) * pow(rgb, float3(0.416660010814666748046875))) - float3(0.054999999701976776123046875);
return select(below, above, cutoff);
}
static inline __attribute__((always_inline))
uint packsRGB(thread float4& rgba)
{
float3 param = rgba.xyz;
rgba = float4(tosRGB(param), rgba.w);
return pack_float_to_unorm4x8(rgba.wzyx);
}
static inline __attribute__((always_inline))
CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_202)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_202);
CmdJump s;
s.new_ref = raw0;
return s;
}
static inline __attribute__((always_inline))
CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_202)
{
Alloc param = a;
CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u };
return CmdJump_read(param, param_1, v_202);
}
kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _723 [[buffer(1)]], texture2d<float, access::write> image [[texture(2)]], texture2d<float> image_atlas [[texture(3)]], texture2d<float> gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
uint tile_ix = (gl_WorkGroupID.y * _723.conf.width_in_tiles) + gl_WorkGroupID.x;
Alloc param;
param.offset = _723.conf.ptcl_alloc.offset;
uint param_1 = tile_ix * 1024u;
uint param_2 = 1024u;
Alloc cmd_alloc = slice_mem(param, param_1, param_2);
CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y));
float2 xy = float2(xy_uint);
spvUnsafeArray<float4, 8> rgba;
for (uint i = 0u; i < 8u; i++)
{
rgba[i] = float4(0.0);
}
uint clip_depth = 0u;
bool mem_ok = v_202.mem_error == 0u;
spvUnsafeArray<float, 8> df;
TileSegRef tile_seg_ref;
spvUnsafeArray<float, 8> area;
spvUnsafeArray<spvUnsafeArray<uint, 8>, 128> blend_stack;
spvUnsafeArray<spvUnsafeArray<float, 8>, 128> blend_alpha_stack;
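// Per-pixel clip/blend stack; clip_depth is clamped to 127, so at most 128 nested clips are tracked.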
while (mem_ok)
{
Alloc param_3 = cmd_alloc;
CmdRef param_4 = cmd_ref;
uint tag = Cmd_tag(param_3, param_4, v_202).tag;
if (tag == 0u)
{
break;
}
switch (tag)
{
case 2u:
{
Alloc param_5 = cmd_alloc;
CmdRef param_6 = cmd_ref;
CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_202);
for (uint k = 0u; k < 8u; k++)
{
df[k] = 1000000000.0;
}
tile_seg_ref = TileSegRef{ stroke.tile_ref };
do
{
uint param_7 = tile_seg_ref.offset;
uint param_8 = 24u;
bool param_9 = mem_ok;
Alloc param_10 = new_alloc(param_7, param_8, param_9);
TileSegRef param_11 = tile_seg_ref;
TileSeg seg = TileSeg_read(param_10, param_11, v_202);
float2 line_vec = seg.vector;
for (uint k_1 = 0u; k_1 < 8u; k_1++)
{
float2 dpos = (xy + float2(0.5)) - seg.origin;
uint param_12 = k_1;
dpos += float2(chunk_offset(param_12));
float t = fast::clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k_1] = fast::min(df[k_1], length((line_vec * t) - dpos));
}
tile_seg_ref = seg.next;
} while (tile_seg_ref.offset != 0u);
for (uint k_2 = 0u; k_2 < 8u; k_2++)
{
area[k_2] = fast::clamp((stroke.half_width + 0.5) - df[k_2], 0.0, 1.0);
}
cmd_ref.offset += 12u;
break;
}
case 1u:
{
Alloc param_13 = cmd_alloc;
CmdRef param_14 = cmd_ref;
CmdFill fill = Cmd_Fill_read(param_13, param_14, v_202);
for (uint k_3 = 0u; k_3 < 8u; k_3++)
{
area[k_3] = float(fill.backdrop);
}
tile_seg_ref = TileSegRef{ fill.tile_ref };
do
{
uint param_15 = tile_seg_ref.offset;
uint param_16 = 24u;
bool param_17 = mem_ok;
Alloc param_18 = new_alloc(param_15, param_16, param_17);
TileSegRef param_19 = tile_seg_ref;
TileSeg seg_1 = TileSeg_read(param_18, param_19, v_202);
for (uint k_4 = 0u; k_4 < 8u; k_4++)
{
uint param_20 = k_4;
float2 my_xy = xy + float2(chunk_offset(param_20));
float2 start = seg_1.origin - my_xy;
float2 end = start + seg_1.vector;
float2 window = fast::clamp(float2(start.y, end.y), float2(0.0), float2(1.0));
if ((isunordered(window.x, window.y) || window.x != window.y))
{
float2 t_1 = (window - float2(start.y)) / float2(seg_1.vector.y);
float2 xs = float2(mix(start.x, end.x, t_1.x), mix(start.x, end.x, t_1.y));
float xmin = fast::min(fast::min(xs.x, xs.y), 1.0) - 9.9999999747524270787835121154785e-07;
float xmax = fast::max(xs.x, xs.y);
float b = fast::min(xmax, 1.0);
float c = fast::max(b, 0.0);
float d = fast::max(xmin, 0.0);
float a = ((b + (0.5 * ((d * d) - (c * c)))) - xmin) / (xmax - xmin);
area[k_4] += (a * (window.x - window.y));
}
area[k_4] += (sign(seg_1.vector.x) * fast::clamp((my_xy.y - seg_1.y_edge) + 1.0, 0.0, 1.0));
}
tile_seg_ref = seg_1.next;
} while (tile_seg_ref.offset != 0u);
for (uint k_5 = 0u; k_5 < 8u; k_5++)
{
area[k_5] = fast::min(abs(area[k_5]), 1.0);
}
cmd_ref.offset += 12u;
break;
}
case 3u:
{
for (uint k_6 = 0u; k_6 < 8u; k_6++)
{
area[k_6] = 1.0;
}
cmd_ref.offset += 4u;
break;
}
case 4u:
{
Alloc param_21 = cmd_alloc;
CmdRef param_22 = cmd_ref;
CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_202);
for (uint k_7 = 0u; k_7 < 8u; k_7++)
{
area[k_7] = alpha.alpha;
}
cmd_ref.offset += 8u;
break;
}
case 5u:
{
Alloc param_23 = cmd_alloc;
CmdRef param_24 = cmd_ref;
CmdColor color = Cmd_Color_read(param_23, param_24, v_202);
uint param_25 = color.rgba_color;
float4 fg = unpacksRGB(param_25);
for (uint k_8 = 0u; k_8 < 8u; k_8++)
{
float4 fg_k = fg * area[k_8];
rgba[k_8] = (rgba[k_8] * (1.0 - fg_k.w)) + fg_k;
}
cmd_ref.offset += 8u;
break;
}
case 6u:
{
Alloc param_26 = cmd_alloc;
CmdRef param_27 = cmd_ref;
CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_202);
float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c;
for (uint k_9 = 0u; k_9 < 8u; k_9++)
{
uint param_28 = k_9;
float2 chunk_xy = float2(chunk_offset(param_28));
float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y);
int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0));
float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index))));
float3 param_29 = fg_rgba.xyz;
float3 _1298 = fromsRGB(param_29);
fg_rgba.x = _1298.x;
fg_rgba.y = _1298.y;
fg_rgba.z = _1298.z;
rgba[k_9] = fg_rgba;
}
cmd_ref.offset += 20u;
break;
}
case 7u:
{
Alloc param_30 = cmd_alloc;
CmdRef param_31 = cmd_ref;
CmdImage fill_img = Cmd_Image_read(param_30, param_31, v_202);
uint2 param_32 = xy_uint;
CmdImage param_33 = fill_img;
spvUnsafeArray<float4, 8> img;
img = fillImage(param_32, param_33, image_atlas);
for (uint k_10 = 0u; k_10 < 8u; k_10++)
{
float4 fg_k_1 = img[k_10] * area[k_10];
rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_1.w)) + fg_k_1;
}
cmd_ref.offset += 12u;
break;
}
case 8u:
{
for (uint k_11 = 0u; k_11 < 8u; k_11++)
{
uint d_2 = min(clip_depth, 127u);
float4 param_34 = float4(rgba[k_11]);
uint _1390 = packsRGB(param_34);
blend_stack[d_2][k_11] = _1390;
blend_alpha_stack[d_2][k_11] = fast::clamp(abs(area[k_11]), 0.0, 1.0);
rgba[k_11] = float4(0.0);
}
clip_depth++;
cmd_ref.offset += 4u;
break;
}
case 9u:
{
clip_depth--;
for (uint k_12 = 0u; k_12 < 8u; k_12++)
{
uint d_3 = min(clip_depth, 127u);
uint param_35 = blend_stack[d_3][k_12];
float4 bg = unpacksRGB(param_35);
float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12];
rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1;
}
cmd_ref.offset += 4u;
break;
}
case 10u:
{
Alloc param_36 = cmd_alloc;
CmdRef param_37 = cmd_ref;
cmd_ref = CmdRef{ Cmd_Jump_read(param_36, param_37, v_202).new_ref };
cmd_alloc.offset = cmd_ref.offset;
break;
}
}
}
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
uint param_38 = i_1;
float3 param_39 = rgba[i_1].xyz;
image.write(float4(tosRGB(param_39), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_38))));
}
}

Binary file not shown.

View file

@ -0,0 +1,664 @@
struct Alloc
{
uint offset;
};
struct MallocResult
{
Alloc alloc;
bool failed;
};
struct PathCubicRef
{
uint offset;
};
struct PathCubic
{
float2 p0;
float2 p1;
float2 p2;
float2 p3;
uint path_ix;
uint trans_ix;
float2 stroke;
};
struct PathSegRef
{
uint offset;
};
struct PathSegTag
{
uint tag;
uint flags;
};
struct TileRef
{
uint offset;
};
struct PathRef
{
uint offset;
};
struct Path
{
uint4 bbox;
TileRef tiles;
};
struct TileSegRef
{
uint offset;
};
struct TileSeg
{
float2 origin;
float2 _vector;
float y_edge;
TileSegRef next;
};
struct SubdivResult
{
float val;
float a0;
float a2;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
Alloc bbox_alloc;
Alloc drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u);
static const PathSegTag _721 = { 0u, 0u };
RWByteAddressBuffer _136 : register(u0, space0);
ByteAddressBuffer _710 : register(t1, space0);
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
bool touch_mem(Alloc alloc, uint offset)
{
return true;
}
uint read_mem(Alloc alloc, uint offset)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = _136.Load(offset * 4 + 8);
return v;
}
PathSegTag PathSeg_tag(Alloc a, PathSegRef ref)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1);
PathSegTag _367 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
return _367;
}
PathCubic PathCubic_read(Alloc a, PathCubicRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7);
Alloc param_8 = a;
uint param_9 = ix + 4u;
uint raw4 = read_mem(param_8, param_9);
Alloc param_10 = a;
uint param_11 = ix + 5u;
uint raw5 = read_mem(param_10, param_11);
Alloc param_12 = a;
uint param_13 = ix + 6u;
uint raw6 = read_mem(param_12, param_13);
Alloc param_14 = a;
uint param_15 = ix + 7u;
uint raw7 = read_mem(param_14, param_15);
Alloc param_16 = a;
uint param_17 = ix + 8u;
uint raw8 = read_mem(param_16, param_17);
Alloc param_18 = a;
uint param_19 = ix + 9u;
uint raw9 = read_mem(param_18, param_19);
Alloc param_20 = a;
uint param_21 = ix + 10u;
uint raw10 = read_mem(param_20, param_21);
Alloc param_22 = a;
uint param_23 = ix + 11u;
uint raw11 = read_mem(param_22, param_23);
PathCubic s;
s.p0 = float2(asfloat(raw0), asfloat(raw1));
s.p1 = float2(asfloat(raw2), asfloat(raw3));
s.p2 = float2(asfloat(raw4), asfloat(raw5));
s.p3 = float2(asfloat(raw6), asfloat(raw7));
s.path_ix = raw8;
s.trans_ix = raw9;
s.stroke = float2(asfloat(raw10), asfloat(raw11));
return s;
}
PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref)
{
PathCubicRef _373 = { ref.offset + 4u };
Alloc param = a;
PathCubicRef param_1 = _373;
return PathCubic_read(param, param_1);
}
float2 eval_cubic(float2 p0, float2 p1, float2 p2, float2 p3, float t)
{
float mt = 1.0f - t;
return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0f)) + (((p2 * (mt * 3.0f)) + (p3 * t)) * t)) * t);
}
float approx_parabola_integral(float x)
{
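// Approximation of the integral used when flattening quadratics to lines;
// estimate_subdiv below uses it to decide how many subdivisions each quadratic needs.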
return x * rsqrt(sqrt(0.3300000131130218505859375f + (0.201511204242706298828125f + ((0.25f * x) * x))));
}
SubdivResult estimate_subdiv(float2 p0, float2 p1, float2 p2, float sqrt_tol)
{
float2 d01 = p1 - p0;
float2 d12 = p2 - p1;
float2 dd = d01 - d12;
float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x);
float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross;
float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross;
float scale = abs(_cross / (length(dd) * (x2 - x0)));
float param = x0;
float a0 = approx_parabola_integral(param);
float param_1 = x2;
float a2 = approx_parabola_integral(param_1);
float val = 0.0f;
if (scale < 1000000000.0f)
{
float da = abs(a2 - a0);
float sqrt_scale = sqrt(scale);
if (sign(x0) == sign(x2))
{
val = da * sqrt_scale;
}
else
{
float xmin = sqrt_tol / sqrt_scale;
float param_2 = xmin;
val = (sqrt_tol * da) / approx_parabola_integral(param_2);
}
}
SubdivResult _695 = { val, a0, a2 };
return _695;
}
uint fill_mode_from_flags(uint flags)
{
return flags & 1u;
}
Path Path_read(Alloc a, PathRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5);
Path s;
s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
TileRef _427 = { raw2 };
s.tiles = _427;
return s;
}
Alloc new_alloc(uint offset, uint size, bool mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
float approx_parabola_inv_integral(float x)
{
return x * sqrt(0.61000001430511474609375f + (0.1520999968051910400390625f + ((0.25f * x) * x)));
}
float2 eval_quad(float2 p0, float2 p1, float2 p2, float t)
{
float mt = 1.0f - t;
return (p0 * (mt * mt)) + (((p1 * (mt * 2.0f)) + (p2 * t)) * t);
}
MallocResult malloc(uint size)
{
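// HLSL variant of the bump allocator: query the buffer's byte size with
// GetDimensions, subtract the 8-byte header, and fail (raising mem_error via
// InterlockedMax) if the allocation would overflow.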
uint _142;
_136.InterlockedAdd(0, size, _142);
uint offset = _142;
uint _149;
_136.GetDimensions(_149);
_149 = (_149 - 8) / 4;
MallocResult r;
r.failed = (offset + size) > uint(int(_149) * 4);
uint param = offset;
uint param_1 = size;
bool param_2 = !r.failed;
r.alloc = new_alloc(param, param_1, param_2);
if (r.failed)
{
uint _171;
_136.InterlockedMax(4, 1u, _171);
return r;
}
return r;
}
TileRef Tile_index(TileRef ref, uint index)
{
TileRef _385 = { ref.offset + (index * 8u) };
return _385;
}
void write_mem(Alloc alloc, uint offset, uint val)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
_136.Store(offset * 4 + 8, val);
}
void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = asuint(s.origin.x);
write_mem(param, param_1, param_2);
Alloc param_3 = a;
uint param_4 = ix + 1u;
uint param_5 = asuint(s.origin.y);
write_mem(param_3, param_4, param_5);
Alloc param_6 = a;
uint param_7 = ix + 2u;
uint param_8 = asuint(s._vector.x);
write_mem(param_6, param_7, param_8);
Alloc param_9 = a;
uint param_10 = ix + 3u;
uint param_11 = asuint(s._vector.y);
write_mem(param_9, param_10, param_11);
Alloc param_12 = a;
uint param_13 = ix + 4u;
uint param_14 = asuint(s.y_edge);
write_mem(param_12, param_13, param_14);
Alloc param_15 = a;
uint param_16 = ix + 5u;
uint param_17 = s.next.offset;
write_mem(param_15, param_16, param_17);
}
void comp_main()
{
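// One thread per path segment: subdivide the cubic into quadratics, flatten those
// into line segments, and for every 16 px tile a segment crosses write a TileSeg
// record and accumulate backdrop deltas for fills.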
uint element_ix = gl_GlobalInvocationID.x;
PathSegRef _718 = { _710.Load(28) + (element_ix * 52u) };
PathSegRef ref = _718;
PathSegTag tag = _721;
if (element_ix < _710.Load(4))
{
Alloc _731;
_731.offset = _710.Load(28);
Alloc param;
param.offset = _731.offset;
PathSegRef param_1 = ref;
tag = PathSeg_tag(param, param_1);
}
bool mem_ok = _136.Load(4) == 0u;
switch (tag.tag)
{
case 1u:
{
Alloc _748;
_748.offset = _710.Load(28);
Alloc param_2;
param_2.offset = _748.offset;
PathSegRef param_3 = ref;
PathCubic cubic = PathSeg_Cubic_read(param_2, param_3);
float2 err_v = (((cubic.p2 - cubic.p1) * 3.0f) + cubic.p0) - cubic.p3;
float err = (err_v.x * err_v.x) + (err_v.y * err_v.y);
uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875f, 0.16666667163372039794921875f))), 1u);
n_quads = min(n_quads, 16u);
float val = 0.0f;
float2 qp0 = cubic.p0;
float _step = 1.0f / float(n_quads);
SubdivResult keep_params[16];
for (uint i = 0u; i < n_quads; i++)
{
float t = float(i + 1u) * _step;
float2 param_4 = cubic.p0;
float2 param_5 = cubic.p1;
float2 param_6 = cubic.p2;
float2 param_7 = cubic.p3;
float param_8 = t;
float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8);
float2 param_9 = cubic.p0;
float2 param_10 = cubic.p1;
float2 param_11 = cubic.p2;
float2 param_12 = cubic.p3;
float param_13 = t - (0.5f * _step);
float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13);
qp1 = (qp1 * 2.0f) - ((qp0 + qp2) * 0.5f);
float2 param_14 = qp0;
float2 param_15 = qp1;
float2 param_16 = qp2;
float param_17 = 0.4743416607379913330078125f;
SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17);
keep_params[i] = params;
val += params.val;
qp0 = qp2;
}
uint n = max(uint(ceil((val * 0.5f) / 0.4743416607379913330078125f)), 1u);
uint param_18 = tag.flags;
bool is_stroke = fill_mode_from_flags(param_18) == 1u;
uint path_ix = cubic.path_ix;
PathRef _904 = { _710.Load(16) + (path_ix * 12u) };
Alloc _907;
_907.offset = _710.Load(16);
Alloc param_19;
param_19.offset = _907.offset;
PathRef param_20 = _904;
Path path = Path_read(param_19, param_20);
uint param_21 = path.tiles.offset;
uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
bool param_23 = mem_ok;
Alloc path_alloc = new_alloc(param_21, param_22, param_23);
int4 bbox = int4(path.bbox);
float2 p0 = cubic.p0;
qp0 = cubic.p0;
float v_step = val / float(n);
int n_out = 1;
float val_sum = 0.0f;
float2 p1;
float _1147;
TileSeg tile_seg;
for (uint i_1 = 0u; i_1 < n_quads; i_1++)
{
float t_1 = float(i_1 + 1u) * _step;
float2 param_24 = cubic.p0;
float2 param_25 = cubic.p1;
float2 param_26 = cubic.p2;
float2 param_27 = cubic.p3;
float param_28 = t_1;
float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28);
float2 param_29 = cubic.p0;
float2 param_30 = cubic.p1;
float2 param_31 = cubic.p2;
float2 param_32 = cubic.p3;
float param_33 = t_1 - (0.5f * _step);
float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33);
qp1_1 = (qp1_1 * 2.0f) - ((qp0 + qp2_1) * 0.5f);
SubdivResult params_1 = keep_params[i_1];
float param_34 = params_1.a0;
float u0 = approx_parabola_inv_integral(param_34);
float param_35 = params_1.a2;
float u2 = approx_parabola_inv_integral(param_35);
float uscale = 1.0f / (u2 - u0);
float target = float(n_out) * v_step;
for (;;)
{
bool _1040 = uint(n_out) == n;
bool _1050;
if (!_1040)
{
_1050 = target < (val_sum + params_1.val);
}
else
{
_1050 = _1040;
}
if (_1050)
{
if (uint(n_out) == n)
{
p1 = cubic.p3;
}
else
{
float u = (target - val_sum) / params_1.val;
float a = lerp(params_1.a0, params_1.a2, u);
float param_36 = a;
float au = approx_parabola_inv_integral(param_36);
float t_2 = (au - u0) * uscale;
float2 param_37 = qp0;
float2 param_38 = qp1_1;
float2 param_39 = qp2_1;
float param_40 = t_2;
p1 = eval_quad(param_37, param_38, param_39, param_40);
}
float xmin = min(p0.x, p1.x) - cubic.stroke.x;
float xmax = max(p0.x, p1.x) + cubic.stroke.x;
float ymin = min(p0.y, p1.y) - cubic.stroke.y;
float ymax = max(p0.y, p1.y) + cubic.stroke.y;
float dx = p1.x - p0.x;
float dy = p1.y - p0.y;
if (abs(dy) < 9.999999717180685365747194737196e-10f)
{
_1147 = 1000000000.0f;
}
else
{
_1147 = dx / dy;
}
float invslope = _1147;
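// 0.0625 == 1.0/16.0: convert pixel coordinates to tile coordinates (16 px tiles).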
float c = (cubic.stroke.x + (abs(invslope) * (8.0f + cubic.stroke.y))) * 0.0625f;
float b = invslope;
float a_1 = (p0.x - ((p0.y - 8.0f) * b)) * 0.0625f;
int x0 = int(floor(xmin * 0.0625f));
int x1 = int(floor(xmax * 0.0625f) + 1.0f);
int y0 = int(floor(ymin * 0.0625f));
int y1 = int(floor(ymax * 0.0625f) + 1.0f);
x0 = clamp(x0, bbox.x, bbox.z);
y0 = clamp(y0, bbox.y, bbox.w);
x1 = clamp(x1, bbox.x, bbox.z);
y1 = clamp(y1, bbox.y, bbox.w);
float xc = a_1 + (b * float(y0));
int stride = bbox.z - bbox.x;
int base = ((y0 - bbox.y) * stride) - bbox.x;
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
uint param_41 = n_tile_alloc * 24u;
MallocResult _1263 = malloc(param_41);
MallocResult tile_alloc = _1263;
if (tile_alloc.failed || (!mem_ok))
{
return;
}
uint tile_offset = tile_alloc.alloc.offset;
int xray = int(floor(p0.x * 0.0625f));
int last_xray = int(floor(p1.x * 0.0625f));
if (p0.y > p1.y)
{
int tmp = xray;
xray = last_xray;
last_xray = tmp;
}
for (int y = y0; y < y1; y++)
{
float tile_y0 = float(y * 16);
int xbackdrop = max((xray + 1), bbox.x);
bool _1319 = !is_stroke;
bool _1329;
if (_1319)
{
_1329 = min(p0.y, p1.y) < tile_y0;
}
else
{
_1329 = _1319;
}
bool _1336;
if (_1329)
{
_1336 = xbackdrop < bbox.z;
}
else
{
_1336 = _1329;
}
if (_1336)
{
int backdrop = (p1.y < p0.y) ? 1 : (-1);
TileRef param_42 = path.tiles;
uint param_43 = uint(base + xbackdrop);
TileRef tile_ref = Tile_index(param_42, param_43);
uint tile_el = tile_ref.offset >> uint(2);
Alloc param_44 = path_alloc;
uint param_45 = tile_el + 1u;
if (touch_mem(param_44, param_45))
{
uint _1374;
_136.InterlockedAdd((tile_el + 1u) * 4 + 8, uint(backdrop), _1374);
}
}
int next_xray = last_xray;
if (y < (y1 - 1))
{
float tile_y1 = float((y + 1) * 16);
float x_edge = lerp(p0.x, p1.x, (tile_y1 - p0.y) / dy);
next_xray = int(floor(x_edge * 0.0625f));
}
int min_xray = min(xray, next_xray);
int max_xray = max(xray, next_xray);
int xx0 = min(int(floor(xc - c)), min_xray);
int xx1 = max(int(ceil(xc + c)), (max_xray + 1));
xx0 = clamp(xx0, x0, x1);
xx1 = clamp(xx1, x0, x1);
for (int x = xx0; x < xx1; x++)
{
float tile_x0 = float(x * 16);
TileRef _1454 = { path.tiles.offset };
TileRef param_46 = _1454;
uint param_47 = uint(base + x);
TileRef tile_ref_1 = Tile_index(param_46, param_47);
uint tile_el_1 = tile_ref_1.offset >> uint(2);
uint old = 0u;
Alloc param_48 = path_alloc;
uint param_49 = tile_el_1;
if (touch_mem(param_48, param_49))
{
uint _1477;
_136.InterlockedExchange(tile_el_1 * 4 + 8, tile_offset, _1477);
old = _1477;
}
tile_seg.origin = p0;
tile_seg._vector = p1 - p0;
float y_edge = 0.0f;
if (!is_stroke)
{
y_edge = lerp(p0.y, p1.y, (tile_x0 - p0.x) / dx);
if (min(p0.x, p1.x) < tile_x0)
{
float2 p = float2(tile_x0, y_edge);
if (p0.x > p1.x)
{
tile_seg._vector = p - p0;
}
else
{
tile_seg.origin = p;
tile_seg._vector = p1 - p;
}
if (tile_seg._vector.x == 0.0f)
{
tile_seg._vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10f;
}
}
if ((x <= min_xray) || (max_xray < x))
{
y_edge = 1000000000.0f;
}
}
tile_seg.y_edge = y_edge;
tile_seg.next.offset = old;
TileSegRef _1559 = { tile_offset };
Alloc param_50 = tile_alloc.alloc;
TileSegRef param_51 = _1559;
TileSeg param_52 = tile_seg;
TileSeg_write(param_50, param_51, param_52);
tile_offset += 24u;
}
xc += b;
base += stride;
xray = next_xray;
}
n_out++;
target += v_step;
p0 = p1;
continue;
}
else
{
break;
}
}
val_sum += params_1.val;
qp0 = qp2_1;
}
break;
}
}
}
[numthreads(32, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
comp_main();
}

View file

@ -0,0 +1,708 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wmissing-braces"
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
template<typename T, size_t Num>
struct spvUnsafeArray
{
T elements[Num ? Num : 1];
thread T& operator [] (size_t pos) thread
{
return elements[pos];
}
constexpr const thread T& operator [] (size_t pos) const thread
{
return elements[pos];
}
device T& operator [] (size_t pos) device
{
return elements[pos];
}
constexpr const device T& operator [] (size_t pos) const device
{
return elements[pos];
}
constexpr const constant T& operator [] (size_t pos) const constant
{
return elements[pos];
}
threadgroup T& operator [] (size_t pos) threadgroup
{
return elements[pos];
}
constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
{
return elements[pos];
}
};
struct Alloc
{
uint offset;
};
struct MallocResult
{
Alloc alloc;
bool failed;
};
struct PathCubicRef
{
uint offset;
};
struct PathCubic
{
float2 p0;
float2 p1;
float2 p2;
float2 p3;
uint path_ix;
uint trans_ix;
float2 stroke;
};
struct PathSegRef
{
uint offset;
};
struct PathSegTag
{
uint tag;
uint flags;
};
struct TileRef
{
uint offset;
};
struct PathRef
{
uint offset;
};
struct Path
{
uint4 bbox;
TileRef tiles;
};
struct TileSegRef
{
uint offset;
};
struct TileSeg
{
float2 origin;
float2 vector;
float y_edge;
TileSegRef next;
};
struct SubdivResult
{
float val;
float a0;
float a2;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
struct Alloc_1
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
Alloc_1 bbox_alloc;
Alloc_1 drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
struct ConfigBuf
{
Config conf;
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(32u, 1u, 1u);
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
return true;
}
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_136, constant uint& v_136BufferSize)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = v_136.memory[offset];
return v;
}
static inline __attribute__((always_inline))
PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1, v_136, v_136BufferSize);
return PathSegTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
}
static inline __attribute__((always_inline))
PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7, v_136, v_136BufferSize);
Alloc param_8 = a;
uint param_9 = ix + 4u;
uint raw4 = read_mem(param_8, param_9, v_136, v_136BufferSize);
Alloc param_10 = a;
uint param_11 = ix + 5u;
uint raw5 = read_mem(param_10, param_11, v_136, v_136BufferSize);
Alloc param_12 = a;
uint param_13 = ix + 6u;
uint raw6 = read_mem(param_12, param_13, v_136, v_136BufferSize);
Alloc param_14 = a;
uint param_15 = ix + 7u;
uint raw7 = read_mem(param_14, param_15, v_136, v_136BufferSize);
Alloc param_16 = a;
uint param_17 = ix + 8u;
uint raw8 = read_mem(param_16, param_17, v_136, v_136BufferSize);
Alloc param_18 = a;
uint param_19 = ix + 9u;
uint raw9 = read_mem(param_18, param_19, v_136, v_136BufferSize);
Alloc param_20 = a;
uint param_21 = ix + 10u;
uint raw10 = read_mem(param_20, param_21, v_136, v_136BufferSize);
Alloc param_22 = a;
uint param_23 = ix + 11u;
uint raw11 = read_mem(param_22, param_23, v_136, v_136BufferSize);
PathCubic s;
s.p0 = float2(as_type<float>(raw0), as_type<float>(raw1));
s.p1 = float2(as_type<float>(raw2), as_type<float>(raw3));
s.p2 = float2(as_type<float>(raw4), as_type<float>(raw5));
s.p3 = float2(as_type<float>(raw6), as_type<float>(raw7));
s.path_ix = raw8;
s.trans_ix = raw9;
s.stroke = float2(as_type<float>(raw10), as_type<float>(raw11));
return s;
}
static inline __attribute__((always_inline))
PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
{
Alloc param = a;
PathCubicRef param_1 = PathCubicRef{ ref.offset + 4u };
return PathCubic_read(param, param_1, v_136, v_136BufferSize);
}
static inline __attribute__((always_inline))
float2 eval_cubic(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float2& p3, thread const float& t)
{
float mt = 1.0 - t;
return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0)) + (((p2 * (mt * 3.0)) + (p3 * t)) * t)) * t);
}
static inline __attribute__((always_inline))
float approx_parabola_integral(thread const float& x)
{
return x * rsqrt(sqrt(0.3300000131130218505859375 + (0.201511204242706298828125 + ((0.25 * x) * x))));
}
static inline __attribute__((always_inline))
SubdivResult estimate_subdiv(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& sqrt_tol)
{
float2 d01 = p1 - p0;
float2 d12 = p2 - p1;
float2 dd = d01 - d12;
float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x);
float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross;
float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross;
float scale = abs(_cross / (length(dd) * (x2 - x0)));
float param = x0;
float a0 = approx_parabola_integral(param);
float param_1 = x2;
float a2 = approx_parabola_integral(param_1);
float val = 0.0;
if (scale < 1000000000.0)
{
float da = abs(a2 - a0);
float sqrt_scale = sqrt(scale);
if (sign(x0) == sign(x2))
{
val = da * sqrt_scale;
}
else
{
float xmin = sqrt_tol / sqrt_scale;
float param_2 = xmin;
val = (sqrt_tol * da) / approx_parabola_integral(param_2);
}
}
return SubdivResult{ val, a0, a2 };
}
static inline __attribute__((always_inline))
uint fill_mode_from_flags(thread const uint& flags)
{
return flags & 1u;
}
static inline __attribute__((always_inline))
Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize);
Path s;
s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
s.tiles = TileRef{ raw2 };
return s;
}
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
static inline __attribute__((always_inline))
float approx_parabola_inv_integral(thread const float& x)
{
return x * sqrt(0.61000001430511474609375 + (0.1520999968051910400390625 + ((0.25 * x) * x)));
}
static inline __attribute__((always_inline))
float2 eval_quad(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& t)
{
float mt = 1.0 - t;
return (p0 * (mt * mt)) + (((p1 * (mt * 2.0)) + (p2 * t)) * t);
}
static inline __attribute__((always_inline))
MallocResult malloc(thread const uint& size, device Memory& v_136, constant uint& v_136BufferSize)
{
uint _142 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.mem_offset, size, memory_order_relaxed);
uint offset = _142;
MallocResult r;
r.failed = (offset + size) > uint(int((v_136BufferSize - 8) / 4) * 4);
uint param = offset;
uint param_1 = size;
bool param_2 = !r.failed;
r.alloc = new_alloc(param, param_1, param_2);
if (r.failed)
{
uint _171 = atomic_fetch_max_explicit((device atomic_uint*)&v_136.mem_error, 1u, memory_order_relaxed);
return r;
}
return r;
}
static inline __attribute__((always_inline))
TileRef Tile_index(thread const TileRef& ref, thread const uint& index)
{
return TileRef{ ref.offset + (index * 8u) };
}
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_136, constant uint& v_136BufferSize)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
v_136.memory[offset] = val;
}
static inline __attribute__((always_inline))
void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_136, constant uint& v_136BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = as_type<uint>(s.origin.x);
write_mem(param, param_1, param_2, v_136, v_136BufferSize);
Alloc param_3 = a;
uint param_4 = ix + 1u;
uint param_5 = as_type<uint>(s.origin.y);
write_mem(param_3, param_4, param_5, v_136, v_136BufferSize);
Alloc param_6 = a;
uint param_7 = ix + 2u;
uint param_8 = as_type<uint>(s.vector.x);
write_mem(param_6, param_7, param_8, v_136, v_136BufferSize);
Alloc param_9 = a;
uint param_10 = ix + 3u;
uint param_11 = as_type<uint>(s.vector.y);
write_mem(param_9, param_10, param_11, v_136, v_136BufferSize);
Alloc param_12 = a;
uint param_13 = ix + 4u;
uint param_14 = as_type<uint>(s.y_edge);
write_mem(param_12, param_13, param_14, v_136, v_136BufferSize);
Alloc param_15 = a;
uint param_16 = ix + 5u;
uint param_17 = s.next.offset;
write_mem(param_15, param_16, param_17, v_136, v_136BufferSize);
}
kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_136 [[buffer(0)]], const device ConfigBuf& _710 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
constant uint& v_136BufferSize = spvBufferSizeConstants[0];
uint element_ix = gl_GlobalInvocationID.x;
PathSegRef ref = PathSegRef{ _710.conf.pathseg_alloc.offset + (element_ix * 52u) };
PathSegTag tag = PathSegTag{ 0u, 0u };
if (element_ix < _710.conf.n_pathseg)
{
Alloc param;
param.offset = _710.conf.pathseg_alloc.offset;
PathSegRef param_1 = ref;
tag = PathSeg_tag(param, param_1, v_136, v_136BufferSize);
}
bool mem_ok = v_136.mem_error == 0u;
switch (tag.tag)
{
case 1u:
{
Alloc param_2;
param_2.offset = _710.conf.pathseg_alloc.offset;
PathSegRef param_3 = ref;
PathCubic cubic = PathSeg_Cubic_read(param_2, param_3, v_136, v_136BufferSize);
float2 err_v = (((cubic.p2 - cubic.p1) * 3.0) + cubic.p0) - cubic.p3;
float err = (err_v.x * err_v.x) + (err_v.y * err_v.y);
uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875, 0.16666667163372039794921875))), 1u);
n_quads = min(n_quads, 16u);
float val = 0.0;
float2 qp0 = cubic.p0;
float _step = 1.0 / float(n_quads);
spvUnsafeArray<SubdivResult, 16> keep_params;
for (uint i = 0u; i < n_quads; i++)
{
float t = float(i + 1u) * _step;
float2 param_4 = cubic.p0;
float2 param_5 = cubic.p1;
float2 param_6 = cubic.p2;
float2 param_7 = cubic.p3;
float param_8 = t;
float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8);
float2 param_9 = cubic.p0;
float2 param_10 = cubic.p1;
float2 param_11 = cubic.p2;
float2 param_12 = cubic.p3;
float param_13 = t - (0.5 * _step);
float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13);
qp1 = (qp1 * 2.0) - ((qp0 + qp2) * 0.5);
float2 param_14 = qp0;
float2 param_15 = qp1;
float2 param_16 = qp2;
float param_17 = 0.4743416607379913330078125;
SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17);
keep_params[i] = params;
val += params.val;
qp0 = qp2;
}
uint n = max(uint(ceil((val * 0.5) / 0.4743416607379913330078125)), 1u);
uint param_18 = tag.flags;
bool is_stroke = fill_mode_from_flags(param_18) == 1u;
uint path_ix = cubic.path_ix;
Alloc param_19;
param_19.offset = _710.conf.tile_alloc.offset;
PathRef param_20 = PathRef{ _710.conf.tile_alloc.offset + (path_ix * 12u) };
Path path = Path_read(param_19, param_20, v_136, v_136BufferSize);
uint param_21 = path.tiles.offset;
uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
bool param_23 = mem_ok;
Alloc path_alloc = new_alloc(param_21, param_22, param_23);
int4 bbox = int4(path.bbox);
float2 p0 = cubic.p0;
qp0 = cubic.p0;
float v_step = val / float(n);
int n_out = 1;
float val_sum = 0.0;
float2 p1;
float _1147;
TileSeg tile_seg;
for (uint i_1 = 0u; i_1 < n_quads; i_1++)
{
float t_1 = float(i_1 + 1u) * _step;
float2 param_24 = cubic.p0;
float2 param_25 = cubic.p1;
float2 param_26 = cubic.p2;
float2 param_27 = cubic.p3;
float param_28 = t_1;
float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28);
float2 param_29 = cubic.p0;
float2 param_30 = cubic.p1;
float2 param_31 = cubic.p2;
float2 param_32 = cubic.p3;
float param_33 = t_1 - (0.5 * _step);
float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33);
qp1_1 = (qp1_1 * 2.0) - ((qp0 + qp2_1) * 0.5);
SubdivResult params_1 = keep_params[i_1];
float param_34 = params_1.a0;
float u0 = approx_parabola_inv_integral(param_34);
float param_35 = params_1.a2;
float u2 = approx_parabola_inv_integral(param_35);
float uscale = 1.0 / (u2 - u0);
float target = float(n_out) * v_step;
for (;;)
{
bool _1040 = uint(n_out) == n;
bool _1050;
if (!_1040)
{
_1050 = target < (val_sum + params_1.val);
}
else
{
_1050 = _1040;
}
if (_1050)
{
if (uint(n_out) == n)
{
p1 = cubic.p3;
}
else
{
float u = (target - val_sum) / params_1.val;
float a = mix(params_1.a0, params_1.a2, u);
float param_36 = a;
float au = approx_parabola_inv_integral(param_36);
float t_2 = (au - u0) * uscale;
float2 param_37 = qp0;
float2 param_38 = qp1_1;
float2 param_39 = qp2_1;
float param_40 = t_2;
p1 = eval_quad(param_37, param_38, param_39, param_40);
}
float xmin = fast::min(p0.x, p1.x) - cubic.stroke.x;
float xmax = fast::max(p0.x, p1.x) + cubic.stroke.x;
float ymin = fast::min(p0.y, p1.y) - cubic.stroke.y;
float ymax = fast::max(p0.y, p1.y) + cubic.stroke.y;
float dx = p1.x - p0.x;
float dy = p1.y - p0.y;
if (abs(dy) < 9.999999717180685365747194737196e-10)
{
_1147 = 1000000000.0;
}
else
{
_1147 = dx / dy;
}
float invslope = _1147;
float c = (cubic.stroke.x + (abs(invslope) * (8.0 + cubic.stroke.y))) * 0.0625;
float b = invslope;
float a_1 = (p0.x - ((p0.y - 8.0) * b)) * 0.0625;
int x0 = int(floor(xmin * 0.0625));
int x1 = int(floor(xmax * 0.0625) + 1.0);
int y0 = int(floor(ymin * 0.0625));
int y1 = int(floor(ymax * 0.0625) + 1.0);
x0 = clamp(x0, bbox.x, bbox.z);
y0 = clamp(y0, bbox.y, bbox.w);
x1 = clamp(x1, bbox.x, bbox.z);
y1 = clamp(y1, bbox.y, bbox.w);
float xc = a_1 + (b * float(y0));
int stride = bbox.z - bbox.x;
int base = ((y0 - bbox.y) * stride) - bbox.x;
uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
uint param_41 = n_tile_alloc * 24u;
MallocResult _1263 = malloc(param_41, v_136, v_136BufferSize);
MallocResult tile_alloc = _1263;
if (tile_alloc.failed || (!mem_ok))
{
return;
}
uint tile_offset = tile_alloc.alloc.offset;
int xray = int(floor(p0.x * 0.0625));
int last_xray = int(floor(p1.x * 0.0625));
if (p0.y > p1.y)
{
int tmp = xray;
xray = last_xray;
last_xray = tmp;
}
for (int y = y0; y < y1; y++)
{
float tile_y0 = float(y * 16);
int xbackdrop = max((xray + 1), bbox.x);
bool _1319 = !is_stroke;
bool _1329;
if (_1319)
{
_1329 = fast::min(p0.y, p1.y) < tile_y0;
}
else
{
_1329 = _1319;
}
bool _1336;
if (_1329)
{
_1336 = xbackdrop < bbox.z;
}
else
{
_1336 = _1329;
}
if (_1336)
{
int backdrop = (p1.y < p0.y) ? 1 : (-1);
TileRef param_42 = path.tiles;
uint param_43 = uint(base + xbackdrop);
TileRef tile_ref = Tile_index(param_42, param_43);
uint tile_el = tile_ref.offset >> uint(2);
Alloc param_44 = path_alloc;
uint param_45 = tile_el + 1u;
if (touch_mem(param_44, param_45))
{
uint _1374 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed);
}
}
int next_xray = last_xray;
if (y < (y1 - 1))
{
float tile_y1 = float((y + 1) * 16);
float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
next_xray = int(floor(x_edge * 0.0625));
}
int min_xray = min(xray, next_xray);
int max_xray = max(xray, next_xray);
int xx0 = min(int(floor(xc - c)), min_xray);
int xx1 = max(int(ceil(xc + c)), (max_xray + 1));
xx0 = clamp(xx0, x0, x1);
xx1 = clamp(xx1, x0, x1);
for (int x = xx0; x < xx1; x++)
{
float tile_x0 = float(x * 16);
TileRef param_46 = TileRef{ path.tiles.offset };
uint param_47 = uint(base + x);
TileRef tile_ref_1 = Tile_index(param_46, param_47);
uint tile_el_1 = tile_ref_1.offset >> uint(2);
uint old = 0u;
Alloc param_48 = path_alloc;
uint param_49 = tile_el_1;
if (touch_mem(param_48, param_49))
{
uint _1477 = atomic_exchange_explicit((device atomic_uint*)&v_136.memory[tile_el_1], tile_offset, memory_order_relaxed);
old = _1477;
}
tile_seg.origin = p0;
tile_seg.vector = p1 - p0;
float y_edge = 0.0;
if (!is_stroke)
{
y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
if (fast::min(p0.x, p1.x) < tile_x0)
{
float2 p = float2(tile_x0, y_edge);
if (p0.x > p1.x)
{
tile_seg.vector = p - p0;
}
else
{
tile_seg.origin = p;
tile_seg.vector = p1 - p;
}
if (tile_seg.vector.x == 0.0)
{
tile_seg.vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10;
}
}
if ((x <= min_xray) || (max_xray < x))
{
y_edge = 1000000000.0;
}
}
tile_seg.y_edge = y_edge;
tile_seg.next.offset = old;
Alloc param_50 = tile_alloc.alloc;
TileSegRef param_51 = TileSegRef{ tile_offset };
TileSeg param_52 = tile_seg;
TileSeg_write(param_50, param_51, param_52, v_136, v_136BufferSize);
tile_offset += 24u;
}
xc += b;
base += stride;
xray = next_xray;
}
n_out++;
target += v_step;
p0 = p1;
continue;
}
else
{
break;
}
}
val_sum += params_1.val;
qp0 = qp2_1;
}
break;
}
}
}

View file

@@ -77,10 +77,10 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const TagMonoid _135 = { 0u, 0u, 0u, 0u, 0u };
static const Monoid _567 = { 0.0f.xxxx, 0u };
RWByteAddressBuffer _111 : register(u0);
ByteAddressBuffer _574 : register(t2);
ByteAddressBuffer _639 : register(t1);
ByteAddressBuffer _709 : register(t3);
RWByteAddressBuffer _111 : register(u0, space0);
ByteAddressBuffer _574 : register(t2, space0);
ByteAddressBuffer _639 : register(t1, space0);
ByteAddressBuffer _709 : register(t3, space0);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;

View file

@@ -36,10 +36,10 @@ struct Config
static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u);
ByteAddressBuffer _139 : register(t1);
ByteAddressBuffer _150 : register(t2);
RWByteAddressBuffer _238 : register(u3);
RWByteAddressBuffer _258 : register(u0);
ByteAddressBuffer _139 : register(t1, space0);
ByteAddressBuffer _150 : register(t2, space0);
RWByteAddressBuffer _238 : register(u3, space0);
RWByteAddressBuffer _258 : register(u0, space0);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;

View file

@@ -11,7 +11,7 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const TagMonoid _18 = { 0u, 0u, 0u, 0u, 0u };
RWByteAddressBuffer _78 : register(u0);
RWByteAddressBuffer _78 : register(u0, space0);
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;

Binary file not shown.

View file

@@ -0,0 +1,335 @@
struct Alloc
{
uint offset;
};
struct MallocResult
{
Alloc alloc;
bool failed;
};
struct AnnoEndClipRef
{
uint offset;
};
struct AnnoEndClip
{
float4 bbox;
};
struct AnnotatedRef
{
uint offset;
};
struct AnnotatedTag
{
uint tag;
uint flags;
};
struct PathRef
{
uint offset;
};
struct TileRef
{
uint offset;
};
struct Path
{
uint4 bbox;
TileRef tiles;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
Alloc bbox_alloc;
Alloc drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
RWByteAddressBuffer _92 : register(u0, space0);
ByteAddressBuffer _305 : register(t1, space0);
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_LocalInvocationID : SV_GroupThreadID;
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
groupshared uint sh_tile_count[256];
groupshared MallocResult sh_tile_alloc;
bool touch_mem(Alloc alloc, uint offset)
{
return true;
}
uint read_mem(Alloc alloc, uint offset)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = _92.Load(offset * 4 + 8);
return v;
}
AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1);
AnnotatedTag _236 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
return _236;
}
AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7);
AnnoEndClip s;
s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
return s;
}
AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref)
{
AnnoEndClipRef _243 = { ref.offset + 4u };
Alloc param = a;
AnnoEndClipRef param_1 = _243;
return AnnoEndClip_read(param, param_1);
}
Alloc new_alloc(uint offset, uint size, bool mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
MallocResult malloc(uint size)
{
uint _98;
_92.InterlockedAdd(0, size, _98);
uint offset = _98;
uint _105;
_92.GetDimensions(_105);
_105 = (_105 - 8) / 4;
MallocResult r;
r.failed = (offset + size) > uint(int(_105) * 4);
uint param = offset;
uint param_1 = size;
bool param_2 = !r.failed;
r.alloc = new_alloc(param, param_1, param_2);
if (r.failed)
{
uint _127;
_92.InterlockedMax(4, 1u, _127);
return r;
}
return r;
}
Alloc slice_mem(Alloc a, uint offset, uint size)
{
Alloc _169 = { a.offset + offset };
return _169;
}
void write_mem(Alloc alloc, uint offset, uint val)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
_92.Store(offset * 4 + 8, val);
}
void Path_write(Alloc a, PathRef ref, Path s)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.bbox.x | (s.bbox.y << uint(16));
write_mem(param, param_1, param_2);
Alloc param_3 = a;
uint param_4 = ix + 1u;
uint param_5 = s.bbox.z | (s.bbox.w << uint(16));
write_mem(param_3, param_4, param_5);
Alloc param_6 = a;
uint param_7 = ix + 2u;
uint param_8 = s.tiles.offset;
write_mem(param_6, param_7, param_8);
}
void comp_main()
{
uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x;
PathRef _312 = { _305.Load(16) + (element_ix * 12u) };
PathRef path_ref = _312;
AnnotatedRef _321 = { _305.Load(32) + (element_ix * 40u) };
AnnotatedRef ref = _321;
uint tag = 0u;
if (element_ix < _305.Load(0))
{
Alloc _332;
_332.offset = _305.Load(32);
Alloc param;
param.offset = _332.offset;
AnnotatedRef param_1 = ref;
tag = Annotated_tag(param, param_1).tag;
}
int x0 = 0;
int y0 = 0;
int x1 = 0;
int y1 = 0;
switch (tag)
{
case 1u:
case 2u:
case 3u:
case 4u:
case 5u:
{
Alloc _350;
_350.offset = _305.Load(32);
Alloc param_2;
param_2.offset = _350.offset;
AnnotatedRef param_3 = ref;
AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3);
x0 = int(floor(clip.bbox.x * 0.0625f));
y0 = int(floor(clip.bbox.y * 0.0625f));
x1 = int(ceil(clip.bbox.z * 0.0625f));
y1 = int(ceil(clip.bbox.w * 0.0625f));
break;
}
}
x0 = clamp(x0, 0, int(_305.Load(8)));
y0 = clamp(y0, 0, int(_305.Load(12)));
x1 = clamp(x1, 0, int(_305.Load(8)));
y1 = clamp(y1, 0, int(_305.Load(12)));
Path path;
path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
uint tile_count = uint((x1 - x0) * (y1 - y0));
if (tag == 5u)
{
tile_count = 0u;
}
sh_tile_count[th_ix] = tile_count;
uint total_tile_count = tile_count;
for (uint i = 0u; i < 8u; i++)
{
GroupMemoryBarrierWithGroupSync();
if (th_ix >= uint(1 << int(i)))
{
total_tile_count += sh_tile_count[th_ix - (1u << i)];
}
GroupMemoryBarrierWithGroupSync();
sh_tile_count[th_ix] = total_tile_count;
}
if (th_ix == 255u)
{
uint param_4 = total_tile_count * 8u;
MallocResult _477 = malloc(param_4);
sh_tile_alloc = _477;
}
GroupMemoryBarrierWithGroupSync();
MallocResult alloc_start = sh_tile_alloc;
bool _488;
if (!alloc_start.failed)
{
_488 = _92.Load(4) != 0u;
}
else
{
_488 = alloc_start.failed;
}
if (_488)
{
return;
}
if (element_ix < _305.Load(0))
{
uint _501;
if (th_ix > 0u)
{
_501 = sh_tile_count[th_ix - 1u];
}
else
{
_501 = 0u;
}
uint tile_subix = _501;
Alloc param_5 = alloc_start.alloc;
uint param_6 = 8u * tile_subix;
uint param_7 = 8u * tile_count;
Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
TileRef _523 = { tiles_alloc.offset };
path.tiles = _523;
Alloc _528;
_528.offset = _305.Load(16);
Alloc param_8;
param_8.offset = _528.offset;
PathRef param_9 = path_ref;
Path param_10 = path;
Path_write(param_8, param_9, param_10);
}
uint total_count = sh_tile_count[255] * 2u;
uint start_ix = alloc_start.alloc.offset >> uint(2);
for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
{
Alloc param_11 = alloc_start.alloc;
uint param_12 = start_ix + i_1;
uint param_13 = 0u;
write_mem(param_11, param_12, param_13);
}
}
[numthreads(256, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
comp_main();
}

View file

@@ -0,0 +1,336 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
struct Alloc
{
uint offset;
};
struct MallocResult
{
Alloc alloc;
bool failed;
};
struct AnnoEndClipRef
{
uint offset;
};
struct AnnoEndClip
{
float4 bbox;
};
struct AnnotatedRef
{
uint offset;
};
struct AnnotatedTag
{
uint tag;
uint flags;
};
struct PathRef
{
uint offset;
};
struct TileRef
{
uint offset;
};
struct Path
{
uint4 bbox;
TileRef tiles;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
struct Alloc_1
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
Alloc_1 bbox_alloc;
Alloc_1 drawmonoid_alloc;
uint n_trans;
uint n_path;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
struct ConfigBuf
{
Config conf;
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
return true;
}
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_92, constant uint& v_92BufferSize)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = v_92.memory[offset];
return v;
}
static inline __attribute__((always_inline))
AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1, v_92, v_92BufferSize);
return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
}
static inline __attribute__((always_inline))
AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_92, v_92BufferSize);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_92, v_92BufferSize);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_92, v_92BufferSize);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7, v_92, v_92BufferSize);
AnnoEndClip s;
s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
return s;
}
static inline __attribute__((always_inline))
AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_92, constant uint& v_92BufferSize)
{
Alloc param = a;
AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u };
return AnnoEndClip_read(param, param_1, v_92, v_92BufferSize);
}
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
static inline __attribute__((always_inline))
MallocResult malloc(thread const uint& size, device Memory& v_92, constant uint& v_92BufferSize)
{
uint _98 = atomic_fetch_add_explicit((device atomic_uint*)&v_92.mem_offset, size, memory_order_relaxed);
uint offset = _98;
MallocResult r;
r.failed = (offset + size) > uint(int((v_92BufferSize - 8) / 4) * 4);
uint param = offset;
uint param_1 = size;
bool param_2 = !r.failed;
r.alloc = new_alloc(param, param_1, param_2);
if (r.failed)
{
uint _127 = atomic_fetch_max_explicit((device atomic_uint*)&v_92.mem_error, 1u, memory_order_relaxed);
return r;
}
return r;
}
static inline __attribute__((always_inline))
Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
{
return Alloc{ a.offset + offset };
}
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_92, constant uint& v_92BufferSize)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
v_92.memory[offset] = val;
}
static inline __attribute__((always_inline))
void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_92, constant uint& v_92BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.bbox.x | (s.bbox.y << uint(16));
write_mem(param, param_1, param_2, v_92, v_92BufferSize);
Alloc param_3 = a;
uint param_4 = ix + 1u;
uint param_5 = s.bbox.z | (s.bbox.w << uint(16));
write_mem(param_3, param_4, param_5, v_92, v_92BufferSize);
Alloc param_6 = a;
uint param_7 = ix + 2u;
uint param_8 = s.tiles.offset;
write_mem(param_6, param_7, param_8, v_92, v_92BufferSize);
}
kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_92 [[buffer(0)]], const device ConfigBuf& _305 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
threadgroup uint sh_tile_count[256];
threadgroup MallocResult sh_tile_alloc;
constant uint& v_92BufferSize = spvBufferSizeConstants[0];
uint th_ix = gl_LocalInvocationID.x;
uint element_ix = gl_GlobalInvocationID.x;
PathRef path_ref = PathRef{ _305.conf.tile_alloc.offset + (element_ix * 12u) };
AnnotatedRef ref = AnnotatedRef{ _305.conf.anno_alloc.offset + (element_ix * 40u) };
uint tag = 0u;
if (element_ix < _305.conf.n_elements)
{
Alloc param;
param.offset = _305.conf.anno_alloc.offset;
AnnotatedRef param_1 = ref;
tag = Annotated_tag(param, param_1, v_92, v_92BufferSize).tag;
}
int x0 = 0;
int y0 = 0;
int x1 = 0;
int y1 = 0;
switch (tag)
{
case 1u:
case 2u:
case 3u:
case 4u:
case 5u:
{
Alloc param_2;
param_2.offset = _305.conf.anno_alloc.offset;
AnnotatedRef param_3 = ref;
AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_92, v_92BufferSize);
x0 = int(floor(clip.bbox.x * 0.0625));
y0 = int(floor(clip.bbox.y * 0.0625));
x1 = int(ceil(clip.bbox.z * 0.0625));
y1 = int(ceil(clip.bbox.w * 0.0625));
break;
}
}
x0 = clamp(x0, 0, int(_305.conf.width_in_tiles));
y0 = clamp(y0, 0, int(_305.conf.height_in_tiles));
x1 = clamp(x1, 0, int(_305.conf.width_in_tiles));
y1 = clamp(y1, 0, int(_305.conf.height_in_tiles));
Path path;
path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
uint tile_count = uint((x1 - x0) * (y1 - y0));
if (tag == 5u)
{
tile_count = 0u;
}
sh_tile_count[th_ix] = tile_count;
uint total_tile_count = tile_count;
for (uint i = 0u; i < 8u; i++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if (th_ix >= uint(1 << int(i)))
{
total_tile_count += sh_tile_count[th_ix - (1u << i)];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sh_tile_count[th_ix] = total_tile_count;
}
if (th_ix == 255u)
{
uint param_4 = total_tile_count * 8u;
MallocResult _477 = malloc(param_4, v_92, v_92BufferSize);
sh_tile_alloc = _477;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
MallocResult alloc_start = sh_tile_alloc;
bool _488;
if (!alloc_start.failed)
{
_488 = v_92.mem_error != 0u;
}
else
{
_488 = alloc_start.failed;
}
if (_488)
{
return;
}
if (element_ix < _305.conf.n_elements)
{
uint _501;
if (th_ix > 0u)
{
_501 = sh_tile_count[th_ix - 1u];
}
else
{
_501 = 0u;
}
uint tile_subix = _501;
Alloc param_5 = alloc_start.alloc;
uint param_6 = 8u * tile_subix;
uint param_7 = 8u * tile_count;
Alloc tiles_alloc = slice_mem(param_5, param_6, param_7);
path.tiles = TileRef{ tiles_alloc.offset };
Alloc param_8;
param_8.offset = _305.conf.tile_alloc.offset;
PathRef param_9 = path_ref;
Path param_10 = path;
Path_write(param_8, param_9, param_10, v_92, v_92BufferSize);
}
uint total_count = sh_tile_count[255] * 2u;
uint start_ix = alloc_start.alloc.offset >> uint(2);
for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
{
Alloc param_11 = alloc_start.alloc;
uint param_12 = start_ix + i_1;
uint param_13 = 0u;
write_mem(param_11, param_12, param_13, v_92, v_92BufferSize);
}
}

View file

@@ -51,10 +51,10 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
RWByteAddressBuffer _71 : register(u0);
ByteAddressBuffer _96 : register(t2);
ByteAddressBuffer _278 : register(t1);
ByteAddressBuffer _377 : register(t3);
RWByteAddressBuffer _71 : register(u0, space0);
ByteAddressBuffer _96 : register(t2, space0);
ByteAddressBuffer _278 : register(t1, space0);
ByteAddressBuffer _377 : register(t3, space0);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;

View file

@@ -38,10 +38,10 @@ struct Config
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
ByteAddressBuffer _49 : register(t2);
ByteAddressBuffer _161 : register(t1);
RWByteAddressBuffer _251 : register(u3);
RWByteAddressBuffer _267 : register(u0);
ByteAddressBuffer _49 : register(t2, space0);
ByteAddressBuffer _161 : register(t1, space0);
RWByteAddressBuffer _251 : register(u3, space0);
RWByteAddressBuffer _267 : register(u0, space0);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;

View file

@@ -8,7 +8,7 @@ static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
RWByteAddressBuffer _89 : register(u0);
RWByteAddressBuffer _89 : register(u0, space0);
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;

Binary file not shown.

View file

@@ -73,7 +73,7 @@ void main() {
for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
barrier();
if (th_ix >= (1 << i)) {
total_tile_count += sh_tile_count[th_ix - (1 << i)];
total_tile_count += sh_tile_count[th_ix - (1u << i)];
}
barrier();
sh_tile_count[th_ix] = total_tile_count;

View file

@@ -15,7 +15,7 @@ use piet::{ImageFormat, RenderContext};
use piet_gpu_hal::{
BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Error, Image, ImageLayout, Pipeline,
QueryPool, Session, ShaderCode,
QueryPool, Session, ShaderCode, include_shader,
};
use pico_svg::PicoSvg;
@@ -161,23 +161,23 @@ impl Renderer {
})
.collect();
let tile_alloc_code = ShaderCode::Spv(include_bytes!("../shader/tile_alloc.spv"));
let tile_alloc_code = include_shader!(session, "../shader/gen/tile_alloc");
let tile_pipeline = session
.create_compute_pipeline(tile_alloc_code, &[BindType::Buffer, BindType::Buffer])?;
let tile_ds = session
.create_simple_descriptor_set(&tile_pipeline, &[&memory_buf_dev, &config_buf])?;
let path_alloc_code = ShaderCode::Spv(include_bytes!("../shader/path_coarse.spv"));
let path_alloc_code = include_shader!(session, "../shader/gen/path_coarse");
let path_pipeline = session
.create_compute_pipeline(path_alloc_code, &[BindType::Buffer, BindType::Buffer])?;
let path_ds = session
.create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
ShaderCode::Spv(include_bytes!("../shader/backdrop_lg.spv"))
include_shader!(session, "../shader/gen/backdrop_lg")
} else {
println!("using small workgroup backdrop kernel");
ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"))
include_shader!(session, "../shader/gen/backdrop")
};
let backdrop_pipeline = session
.create_compute_pipeline(backdrop_code, &[BindType::Buffer, BindType::Buffer])?;
@@ -185,13 +185,13 @@ impl Renderer {
.create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;
// TODO: constants
let bin_code = ShaderCode::Spv(include_bytes!("../shader/binning.spv"));
let bin_code = include_shader!(session, "../shader/gen/binning");
let bin_pipeline =
session.create_compute_pipeline(bin_code, &[BindType::Buffer, BindType::Buffer])?;
let bin_ds =
session.create_simple_descriptor_set(&bin_pipeline, &[&memory_buf_dev, &config_buf])?;
let coarse_code = ShaderCode::Spv(include_bytes!("../shader/coarse.spv"));
let coarse_code = include_shader!(session, "../shader/gen/coarse");
let coarse_pipeline =
session.create_compute_pipeline(coarse_code, &[BindType::Buffer, BindType::Buffer])?;
let coarse_ds = session
@@ -210,7 +210,7 @@ impl Renderer {
.collect();
let gradients = Self::make_gradient_image(&session);
let k4_code = ShaderCode::Spv(include_bytes!("../shader/kernel4.spv"));
let k4_code = include_shader!(session, "../shader/gen/kernel4");
let k4_pipeline = session.create_compute_pipeline(
k4_code,
&[