vello/piet-gpu/shader/gen/coarse.msl
Raph Levien 3b67a4e7c1 New clip implementation
This PR reworks the clip implementation. The highlight is that clip bounding box accounting is now done on GPU rather than CPU. The clip mask is also rasterized on EndClip rather than BeginClip, which decreases memory traffic needed for the clip stack.

This is a pretty good working state, but not all cleanup has been applied. An important next step is to remove the CPU clip accounting (it is computed and encoded, but that result is not used). Another step is to remove the Annotated structure entirely.

Fixes #88. Also relevant to #119
2022-02-17 17:13:28 -08:00

1307 lines
45 KiB
Plaintext
Generated

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
// Implementation of the GLSL findLSB() function
template<typename T>
inline T spvFindLSB(T x)
{
return select(ctz(x), T(-1), x == T(0));
}
struct Alloc
{
uint offset;
};
struct MallocResult
{
Alloc alloc;
bool failed;
};
struct AnnoImageRef
{
uint offset;
};
struct AnnoImage
{
float4 bbox;
float linewidth;
uint index;
int2 offset;
};
struct AnnoColorRef
{
uint offset;
};
struct AnnoColor
{
float4 bbox;
float linewidth;
uint rgba_color;
};
struct AnnoLinGradientRef
{
uint offset;
};
struct AnnoLinGradient
{
float4 bbox;
float linewidth;
uint index;
float line_x;
float line_y;
float line_c;
};
struct AnnotatedRef
{
uint offset;
};
struct AnnotatedTag
{
uint tag;
uint flags;
};
struct BinInstanceRef
{
uint offset;
};
struct BinInstance
{
uint element_ix;
};
struct PathRef
{
uint offset;
};
struct TileRef
{
uint offset;
};
struct Path
{
uint4 bbox;
TileRef tiles;
};
struct TileSegRef
{
uint offset;
};
struct Tile
{
TileSegRef tile;
int backdrop;
};
struct CmdStrokeRef
{
uint offset;
};
struct CmdStroke
{
uint tile_ref;
float half_width;
};
struct CmdFillRef
{
uint offset;
};
struct CmdFill
{
uint tile_ref;
int backdrop;
};
struct CmdColorRef
{
uint offset;
};
struct CmdColor
{
uint rgba_color;
};
struct CmdLinGradRef
{
uint offset;
};
struct CmdLinGrad
{
uint index;
float line_x;
float line_y;
float line_c;
};
struct CmdImageRef
{
uint offset;
};
struct CmdImage
{
uint index;
int2 offset;
};
struct CmdJumpRef
{
uint offset;
};
struct CmdJump
{
uint new_ref;
};
struct CmdRef
{
uint offset;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
struct Alloc_1
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
Alloc_1 bbox_alloc;
Alloc_1 drawmonoid_alloc;
Alloc_1 clip_alloc;
Alloc_1 clip_bic_alloc;
Alloc_1 clip_stack_alloc;
Alloc_1 clip_bbox_alloc;
uint n_trans;
uint n_path;
uint n_clip;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
struct ConfigBuf
{
Config conf;
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
static inline __attribute__((always_inline))
Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
{
return Alloc{ a.offset + offset };
}
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
return true;
}
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = v_283.memory[offset];
return v;
}
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
static inline __attribute__((always_inline))
BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const uint& index)
{
return BinInstanceRef{ ref.offset + (index * 4u) };
}
static inline __attribute__((always_inline))
BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
BinInstance s;
s.element_ix = raw0;
return s;
}
static inline __attribute__((always_inline))
AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1, v_283, v_283BufferSize);
return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
}
static inline __attribute__((always_inline))
Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
Path s;
s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
s.tiles = TileRef{ raw2 };
return s;
}
static inline __attribute__((always_inline))
void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a)
{
}
static inline __attribute__((always_inline))
Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_283, constant uint& v_283BufferSize)
{
uint param = 0u;
uint param_1 = uint(int((v_283BufferSize - 8) / 4) * 4);
bool param_2 = mem_ok;
return new_alloc(param, param_1, param_2);
}
static inline __attribute__((always_inline))
Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
Tile s;
s.tile = TileSegRef{ raw0 };
s.backdrop = int(raw1);
return s;
}
static inline __attribute__((always_inline))
AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize);
Alloc param_8 = a;
uint param_9 = ix + 4u;
uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize);
Alloc param_10 = a;
uint param_11 = ix + 5u;
uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize);
AnnoColor s;
s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
s.linewidth = as_type<float>(raw4);
s.rgba_color = raw5;
return s;
}
static inline __attribute__((always_inline))
AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
AnnoColorRef param_1 = AnnoColorRef{ ref.offset + 4u };
return AnnoColor_read(param, param_1, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
MallocResult malloc(thread const uint& size, device Memory& v_283, constant uint& v_283BufferSize)
{
uint _289 = atomic_fetch_add_explicit((device atomic_uint*)&v_283.mem_offset, size, memory_order_relaxed);
uint offset = _289;
MallocResult r;
r.failed = (offset + size) > uint(int((v_283BufferSize - 8) / 4) * 4);
uint param = offset;
uint param_1 = size;
bool param_2 = !r.failed;
r.alloc = new_alloc(param, param_1, param_2);
if (r.failed)
{
uint _318 = atomic_fetch_max_explicit((device atomic_uint*)&v_283.mem_error, 1u, memory_order_relaxed);
return r;
}
return r;
}
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
v_283.memory[offset] = val;
}
static inline __attribute__((always_inline))
void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.new_ref;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint param_2 = 10u;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
Alloc param_3 = a;
CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u };
CmdJump param_5 = s;
CmdJump_write(param_3, param_4, param_5, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_283, constant uint& v_283BufferSize)
{
if (cmd_ref.offset < cmd_limit)
{
return true;
}
uint param = 1024u;
MallocResult _1076 = malloc(param, v_283, v_283BufferSize);
MallocResult new_cmd = _1076;
if (new_cmd.failed)
{
return false;
}
CmdJump jump = CmdJump{ new_cmd.alloc.offset };
Alloc param_1 = cmd_alloc;
CmdRef param_2 = cmd_ref;
CmdJump param_3 = jump;
Cmd_Jump_write(param_1, param_2, param_3, v_283, v_283BufferSize);
cmd_alloc = new_cmd.alloc;
cmd_ref = CmdRef{ cmd_alloc.offset };
cmd_limit = (cmd_alloc.offset + 1024u) - 60u;
return true;
}
static inline __attribute__((always_inline))
uint fill_mode_from_flags(thread const uint& flags)
{
return flags & 1u;
}
static inline __attribute__((always_inline))
void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.tile_ref;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
Alloc param_3 = a;
uint param_4 = ix + 1u;
uint param_5 = uint(s.backdrop);
write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint param_2 = 1u;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
Alloc param_3 = a;
CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u };
CmdFill param_5 = s;
CmdFill_write(param_3, param_4, param_5, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint param_2 = 3u;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.tile_ref;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
Alloc param_3 = a;
uint param_4 = ix + 1u;
uint param_5 = as_type<uint>(s.half_width);
write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint param_2 = 2u;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
Alloc param_3 = a;
CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u };
CmdStroke param_5 = s;
CmdStroke_write(param_3, param_4, param_5, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_283, constant uint& v_283BufferSize)
{
uint param = flags;
if (fill_mode_from_flags(param) == 0u)
{
if (tile.tile.offset != 0u)
{
CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop };
Alloc param_1 = alloc;
CmdRef param_2 = cmd_ref;
CmdFill param_3 = cmd_fill;
Cmd_Fill_write(param_1, param_2, param_3, v_283, v_283BufferSize);
cmd_ref.offset += 12u;
}
else
{
Alloc param_4 = alloc;
CmdRef param_5 = cmd_ref;
Cmd_Solid_write(param_4, param_5, v_283, v_283BufferSize);
cmd_ref.offset += 4u;
}
}
else
{
CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth };
Alloc param_6 = alloc;
CmdRef param_7 = cmd_ref;
CmdStroke param_8 = cmd_stroke;
Cmd_Stroke_write(param_6, param_7, param_8, v_283, v_283BufferSize);
cmd_ref.offset += 12u;
}
}
static inline __attribute__((always_inline))
void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.rgba_color;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint param_2 = 5u;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
Alloc param_3 = a;
CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u };
CmdColor param_5 = s;
CmdColor_write(param_3, param_4, param_5, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize);
Alloc param_8 = a;
uint param_9 = ix + 4u;
uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize);
Alloc param_10 = a;
uint param_11 = ix + 5u;
uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize);
Alloc param_12 = a;
uint param_13 = ix + 6u;
uint raw6 = read_mem(param_12, param_13, v_283, v_283BufferSize);
Alloc param_14 = a;
uint param_15 = ix + 7u;
uint raw7 = read_mem(param_14, param_15, v_283, v_283BufferSize);
Alloc param_16 = a;
uint param_17 = ix + 8u;
uint raw8 = read_mem(param_16, param_17, v_283, v_283BufferSize);
AnnoLinGradient s;
s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
s.linewidth = as_type<float>(raw4);
s.index = raw5;
s.line_x = as_type<float>(raw6);
s.line_y = as_type<float>(raw7);
s.line_c = as_type<float>(raw8);
return s;
}
static inline __attribute__((always_inline))
AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
AnnoLinGradientRef param_1 = AnnoLinGradientRef{ ref.offset + 4u };
return AnnoLinGradient_read(param, param_1, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.index;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
Alloc param_3 = a;
uint param_4 = ix + 1u;
uint param_5 = as_type<uint>(s.line_x);
write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
Alloc param_6 = a;
uint param_7 = ix + 2u;
uint param_8 = as_type<uint>(s.line_y);
write_mem(param_6, param_7, param_8, v_283, v_283BufferSize);
Alloc param_9 = a;
uint param_10 = ix + 3u;
uint param_11 = as_type<uint>(s.line_c);
write_mem(param_9, param_10, param_11, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint param_2 = 6u;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
Alloc param_3 = a;
CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u };
CmdLinGrad param_5 = s;
CmdLinGrad_write(param_3, param_4, param_5, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize);
Alloc param_6 = a;
uint param_7 = ix + 3u;
uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize);
Alloc param_8 = a;
uint param_9 = ix + 4u;
uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize);
Alloc param_10 = a;
uint param_11 = ix + 5u;
uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize);
Alloc param_12 = a;
uint param_13 = ix + 6u;
uint raw6 = read_mem(param_12, param_13, v_283, v_283BufferSize);
AnnoImage s;
s.bbox = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
s.linewidth = as_type<float>(raw4);
s.index = raw5;
s.offset = int2(int(raw6 << uint(16)) >> 16, int(raw6) >> 16);
return s;
}
static inline __attribute__((always_inline))
AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
AnnoImageRef param_1 = AnnoImageRef{ ref.offset + 4u };
return AnnoImage_read(param, param_1, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_283, constant uint& v_283BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.index;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
Alloc param_3 = a;
uint param_4 = ix + 1u;
uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16));
write_mem(param_3, param_4, param_5, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint param_2 = 7u;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
Alloc param_3 = a;
CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u };
CmdImage param_5 = s;
CmdImage_write(param_3, param_4, param_5, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint param_2 = 8u;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint param_2 = 9u;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
}
static inline __attribute__((always_inline))
void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint param_2 = 0u;
write_mem(param, param_1, param_2, v_283, v_283BufferSize);
}
kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_283 [[buffer(0)]], const device ConfigBuf& _1169 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
threadgroup uint sh_bitmaps[8][256];
threadgroup Alloc sh_part_elements[256];
threadgroup uint sh_part_count[256];
threadgroup uint sh_elements[256];
threadgroup uint sh_tile_stride[256];
threadgroup uint sh_tile_width[256];
threadgroup uint sh_tile_x0[256];
threadgroup uint sh_tile_y0[256];
threadgroup uint sh_tile_base[256];
threadgroup uint sh_tile_count[256];
constant uint& v_283BufferSize = spvBufferSizeConstants[0];
uint width_in_bins = ((_1169.conf.width_in_tiles + 16u) - 1u) / 16u;
uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
uint partition_ix = 0u;
uint n_partitions = ((_1169.conf.n_elements + 256u) - 1u) / 256u;
uint th_ix = gl_LocalInvocationID.x;
uint bin_tile_x = 16u * gl_WorkGroupID.x;
uint bin_tile_y = 16u * gl_WorkGroupID.y;
uint tile_x = gl_LocalInvocationID.x % 16u;
uint tile_y = gl_LocalInvocationID.x / 16u;
uint this_tile_ix = (((bin_tile_y + tile_y) * _1169.conf.width_in_tiles) + bin_tile_x) + tile_x;
Alloc param;
param.offset = _1169.conf.ptcl_alloc.offset;
uint param_1 = this_tile_ix * 1024u;
uint param_2 = 1024u;
Alloc cmd_alloc = slice_mem(param, param_1, param_2);
CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
uint cmd_limit = (cmd_ref.offset + 1024u) - 60u;
uint clip_depth = 0u;
uint clip_zero_depth = 0u;
uint rd_ix = 0u;
uint wr_ix = 0u;
uint part_start_ix = 0u;
uint ready_ix = 0u;
bool mem_ok = v_283.mem_error == 0u;
Alloc param_3;
Alloc param_5;
uint _1448;
uint element_ix;
AnnotatedRef ref;
Alloc param_14;
Alloc param_16;
uint tile_count;
Alloc param_23;
uint _1770;
Alloc param_29;
Tile tile_1;
AnnoColor fill;
Alloc param_35;
Alloc param_52;
CmdLinGrad cmd_lin;
Alloc param_69;
while (true)
{
for (uint i = 0u; i < 8u; i++)
{
sh_bitmaps[i][th_ix] = 0u;
}
bool _1500;
for (;;)
{
if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
{
part_start_ix = ready_ix;
uint count = 0u;
bool _1298 = th_ix < 256u;
bool _1306;
if (_1298)
{
_1306 = (partition_ix + th_ix) < n_partitions;
}
else
{
_1306 = _1298;
}
if (_1306)
{
uint in_ix = (_1169.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
param_3.offset = _1169.conf.bin_alloc.offset;
uint param_4 = in_ix;
count = read_mem(param_3, param_4, v_283, v_283BufferSize);
param_5.offset = _1169.conf.bin_alloc.offset;
uint param_6 = in_ix + 1u;
uint offset = read_mem(param_5, param_6, v_283, v_283BufferSize);
uint param_7 = offset;
uint param_8 = count * 4u;
bool param_9 = mem_ok;
sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9);
}
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
if (th_ix < 256u)
{
sh_part_count[th_ix] = count;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (th_ix < 256u)
{
if (th_ix >= (1u << i_1))
{
count += sh_part_count[th_ix - (1u << i_1)];
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
if (th_ix < 256u)
{
sh_part_count[th_ix] = part_start_ix + count;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
ready_ix = sh_part_count[255];
partition_ix += 256u;
}
uint ix = rd_ix + th_ix;
if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok)
{
uint part_ix = 0u;
for (uint i_2 = 0u; i_2 < 8u; i_2++)
{
uint probe = part_ix + (128u >> i_2);
if (ix >= sh_part_count[probe - 1u])
{
part_ix = probe;
}
}
if (part_ix > 0u)
{
_1448 = sh_part_count[part_ix - 1u];
}
else
{
_1448 = part_start_ix;
}
ix -= _1448;
Alloc bin_alloc = sh_part_elements[part_ix];
BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
BinInstanceRef param_10 = inst_ref;
uint param_11 = ix;
Alloc param_12 = bin_alloc;
BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
BinInstance inst = BinInstance_read(param_12, param_13, v_283, v_283BufferSize);
sh_elements[th_ix] = inst.element_ix;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
wr_ix = min((rd_ix + 256u), ready_ix);
bool _1490 = (wr_ix - rd_ix) < 256u;
if (_1490)
{
_1500 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
}
else
{
_1500 = _1490;
}
if (_1500)
{
continue;
}
else
{
break;
}
}
uint tag = 0u;
if ((th_ix + rd_ix) < wr_ix)
{
element_ix = sh_elements[th_ix];
ref = AnnotatedRef{ _1169.conf.anno_alloc.offset + (element_ix * 40u) };
param_14.offset = _1169.conf.anno_alloc.offset;
AnnotatedRef param_15 = ref;
tag = Annotated_tag(param_14, param_15, v_283, v_283BufferSize).tag;
}
switch (tag)
{
case 1u:
case 3u:
case 2u:
case 4u:
case 5u:
{
uint drawmonoid_base = (_1169.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix);
uint path_ix = v_283.memory[drawmonoid_base];
param_16.offset = _1169.conf.tile_alloc.offset;
PathRef param_17 = PathRef{ _1169.conf.tile_alloc.offset + (path_ix * 12u) };
Path path = Path_read(param_16, param_17, v_283, v_283BufferSize);
uint stride = path.bbox.z - path.bbox.x;
sh_tile_stride[th_ix] = stride;
int dx = int(path.bbox.x) - int(bin_tile_x);
int dy = int(path.bbox.y) - int(bin_tile_y);
int x0 = clamp(dx, 0, 16);
int y0 = clamp(dy, 0, 16);
int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16);
int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16);
sh_tile_width[th_ix] = uint(x1 - x0);
sh_tile_x0[th_ix] = uint(x0);
sh_tile_y0[th_ix] = uint(y0);
tile_count = uint(x1 - x0) * uint(y1 - y0);
uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
sh_tile_base[th_ix] = base;
uint param_18 = path.tiles.offset;
uint param_19 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
bool param_20 = mem_ok;
Alloc path_alloc = new_alloc(param_18, param_19, param_20);
uint param_21 = th_ix;
Alloc param_22 = path_alloc;
write_tile_alloc(param_21, param_22);
break;
}
default:
{
tile_count = 0u;
break;
}
}
sh_tile_count[th_ix] = tile_count;
for (uint i_3 = 0u; i_3 < 8u; i_3++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if (th_ix >= (1u << i_3))
{
tile_count += sh_tile_count[th_ix - (1u << i_3)];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sh_tile_count[th_ix] = tile_count;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint total_tile_count = sh_tile_count[255];
for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u)
{
uint el_ix = 0u;
for (uint i_4 = 0u; i_4 < 8u; i_4++)
{
uint probe_1 = el_ix + (128u >> i_4);
if (ix_1 >= sh_tile_count[probe_1 - 1u])
{
el_ix = probe_1;
}
}
AnnotatedRef ref_1 = AnnotatedRef{ _1169.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) };
param_23.offset = _1169.conf.anno_alloc.offset;
AnnotatedRef param_24 = ref_1;
uint tag_1 = Annotated_tag(param_23, param_24, v_283, v_283BufferSize).tag;
if (el_ix > 0u)
{
_1770 = sh_tile_count[el_ix - 1u];
}
else
{
_1770 = 0u;
}
uint seq_ix = ix_1 - _1770;
uint width = sh_tile_width[el_ix];
uint x = sh_tile_x0[el_ix] + (seq_ix % width);
uint y = sh_tile_y0[el_ix] + (seq_ix / width);
bool include_tile = false;
if (mem_ok)
{
uint param_25 = el_ix;
bool param_26 = mem_ok;
Alloc param_27 = read_tile_alloc(param_25, param_26, v_283, v_283BufferSize);
TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
Tile tile = Tile_read(param_27, param_28, v_283, v_283BufferSize);
bool is_clip = (tag_1 == 4u) || (tag_1 == 5u);
bool _1834 = tile.tile.offset != 0u;
bool _1843;
if (!_1834)
{
_1843 = (tile.backdrop == 0) == is_clip;
}
else
{
_1843 = _1834;
}
include_tile = _1843;
}
if (include_tile)
{
uint el_slice = el_ix / 32u;
uint el_mask = 1u << (el_ix & 31u);
uint _1863 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint slice_ix = 0u;
uint bitmap = sh_bitmaps[0][th_ix];
while (mem_ok)
{
if (bitmap == 0u)
{
slice_ix++;
if (slice_ix == 8u)
{
break;
}
bitmap = sh_bitmaps[slice_ix][th_ix];
if (bitmap == 0u)
{
continue;
}
}
uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
uint element_ix_1 = sh_elements[element_ref_ix];
bitmap &= (bitmap - 1u);
ref = AnnotatedRef{ _1169.conf.anno_alloc.offset + (element_ix_1 * 40u) };
param_29.offset = _1169.conf.anno_alloc.offset;
AnnotatedRef param_30 = ref;
AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_283, v_283BufferSize);
if (clip_zero_depth == 0u)
{
switch (tag_2.tag)
{
case 1u:
{
uint param_31 = element_ref_ix;
bool param_32 = mem_ok;
Alloc param_33 = read_tile_alloc(param_31, param_32, v_283, v_283BufferSize);
TileRef param_34 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
tile_1 = Tile_read(param_33, param_34, v_283, v_283BufferSize);
param_35.offset = _1169.conf.anno_alloc.offset;
AnnotatedRef param_36 = ref;
fill = Annotated_Color_read(param_35, param_36, v_283, v_283BufferSize);
Alloc param_37 = cmd_alloc;
CmdRef param_38 = cmd_ref;
uint param_39 = cmd_limit;
bool _1977 = alloc_cmd(param_37, param_38, param_39, v_283, v_283BufferSize);
cmd_alloc = param_37;
cmd_ref = param_38;
cmd_limit = param_39;
if (!_1977)
{
break;
}
Alloc param_40 = cmd_alloc;
CmdRef param_41 = cmd_ref;
uint param_42 = tag_2.flags;
Tile param_43 = tile_1;
float param_44 = fill.linewidth;
write_fill(param_40, param_41, param_42, param_43, param_44, v_283, v_283BufferSize);
cmd_ref = param_41;
Alloc param_45 = cmd_alloc;
CmdRef param_46 = cmd_ref;
CmdColor param_47 = CmdColor{ fill.rgba_color };
Cmd_Color_write(param_45, param_46, param_47, v_283, v_283BufferSize);
cmd_ref.offset += 8u;
break;
}
case 2u:
{
uint param_48 = element_ref_ix;
bool param_49 = mem_ok;
Alloc param_50 = read_tile_alloc(param_48, param_49, v_283, v_283BufferSize);
TileRef param_51 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
tile_1 = Tile_read(param_50, param_51, v_283, v_283BufferSize);
param_52.offset = _1169.conf.anno_alloc.offset;
AnnotatedRef param_53 = ref;
AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_283, v_283BufferSize);
Alloc param_54 = cmd_alloc;
CmdRef param_55 = cmd_ref;
uint param_56 = cmd_limit;
bool _2049 = alloc_cmd(param_54, param_55, param_56, v_283, v_283BufferSize);
cmd_alloc = param_54;
cmd_ref = param_55;
cmd_limit = param_56;
if (!_2049)
{
break;
}
Alloc param_57 = cmd_alloc;
CmdRef param_58 = cmd_ref;
uint param_59 = tag_2.flags;
Tile param_60 = tile_1;
float param_61 = fill.linewidth;
write_fill(param_57, param_58, param_59, param_60, param_61, v_283, v_283BufferSize);
cmd_ref = param_58;
cmd_lin.index = lin.index;
cmd_lin.line_x = lin.line_x;
cmd_lin.line_y = lin.line_y;
cmd_lin.line_c = lin.line_c;
Alloc param_62 = cmd_alloc;
CmdRef param_63 = cmd_ref;
CmdLinGrad param_64 = cmd_lin;
Cmd_LinGrad_write(param_62, param_63, param_64, v_283, v_283BufferSize);
cmd_ref.offset += 20u;
break;
}
case 3u:
{
uint param_65 = element_ref_ix;
bool param_66 = mem_ok;
Alloc param_67 = read_tile_alloc(param_65, param_66, v_283, v_283BufferSize);
TileRef param_68 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
tile_1 = Tile_read(param_67, param_68, v_283, v_283BufferSize);
param_69.offset = _1169.conf.anno_alloc.offset;
AnnotatedRef param_70 = ref;
AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_283, v_283BufferSize);
Alloc param_71 = cmd_alloc;
CmdRef param_72 = cmd_ref;
uint param_73 = cmd_limit;
bool _2133 = alloc_cmd(param_71, param_72, param_73, v_283, v_283BufferSize);
cmd_alloc = param_71;
cmd_ref = param_72;
cmd_limit = param_73;
if (!_2133)
{
break;
}
Alloc param_74 = cmd_alloc;
CmdRef param_75 = cmd_ref;
uint param_76 = tag_2.flags;
Tile param_77 = tile_1;
float param_78 = fill_img.linewidth;
write_fill(param_74, param_75, param_76, param_77, param_78, v_283, v_283BufferSize);
cmd_ref = param_75;
Alloc param_79 = cmd_alloc;
CmdRef param_80 = cmd_ref;
CmdImage param_81 = CmdImage{ fill_img.index, fill_img.offset };
Cmd_Image_write(param_79, param_80, param_81, v_283, v_283BufferSize);
cmd_ref.offset += 12u;
break;
}
case 4u:
{
uint param_82 = element_ref_ix;
bool param_83 = mem_ok;
Alloc param_84 = read_tile_alloc(param_82, param_83, v_283, v_283BufferSize);
TileRef param_85 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
tile_1 = Tile_read(param_84, param_85, v_283, v_283BufferSize);
bool _2194 = tile_1.tile.offset == 0u;
bool _2200;
if (_2194)
{
_2200 = tile_1.backdrop == 0;
}
else
{
_2200 = _2194;
}
if (_2200)
{
clip_zero_depth = clip_depth + 1u;
}
else
{
Alloc param_86 = cmd_alloc;
CmdRef param_87 = cmd_ref;
uint param_88 = cmd_limit;
bool _2212 = alloc_cmd(param_86, param_87, param_88, v_283, v_283BufferSize);
cmd_alloc = param_86;
cmd_ref = param_87;
cmd_limit = param_88;
if (!_2212)
{
break;
}
Alloc param_89 = cmd_alloc;
CmdRef param_90 = cmd_ref;
Cmd_BeginClip_write(param_89, param_90, v_283, v_283BufferSize);
cmd_ref.offset += 4u;
}
clip_depth++;
break;
}
case 5u:
{
uint param_91 = element_ref_ix;
bool param_92 = mem_ok;
Alloc param_93 = read_tile_alloc(param_91, param_92, v_283, v_283BufferSize);
TileRef param_94 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
tile_1 = Tile_read(param_93, param_94, v_283, v_283BufferSize);
clip_depth--;
Alloc param_95 = cmd_alloc;
CmdRef param_96 = cmd_ref;
uint param_97 = cmd_limit;
bool _2261 = alloc_cmd(param_95, param_96, param_97, v_283, v_283BufferSize);
cmd_alloc = param_95;
cmd_ref = param_96;
cmd_limit = param_97;
if (!_2261)
{
break;
}
Alloc param_98 = cmd_alloc;
CmdRef param_99 = cmd_ref;
uint param_100 = 0u;
Tile param_101 = tile_1;
float param_102 = 0.0;
write_fill(param_98, param_99, param_100, param_101, param_102, v_283, v_283BufferSize);
cmd_ref = param_99;
Alloc param_103 = cmd_alloc;
CmdRef param_104 = cmd_ref;
Cmd_EndClip_write(param_103, param_104, v_283, v_283BufferSize);
cmd_ref.offset += 4u;
break;
}
}
}
else
{
switch (tag_2.tag)
{
case 4u:
{
clip_depth++;
break;
}
case 5u:
{
if (clip_depth == clip_zero_depth)
{
clip_zero_depth = 0u;
}
clip_depth--;
break;
}
}
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
rd_ix += 256u;
if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions))
{
break;
}
}
bool _2326 = (bin_tile_x + tile_x) < _1169.conf.width_in_tiles;
bool _2335;
if (_2326)
{
_2335 = (bin_tile_y + tile_y) < _1169.conf.height_in_tiles;
}
else
{
_2335 = _2326;
}
if (_2335)
{
Alloc param_105 = cmd_alloc;
CmdRef param_106 = cmd_ref;
Cmd_End_write(param_105, param_106, v_283, v_283BufferSize);
}
}