// vello/piet-gpu/shader/gen/coarse.msl

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wunused-variable"

#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>

using namespace metal;

// GLSL findLSB() emulation: yields the count of trailing zero bits of `x`,
// or T(-1) when `x` is zero (no bit set).
template<typename T>
inline T spvFindLSB(T x)
{
    const T trailing_zeros = ctz(x);
    const T none_set = T(-1);
    // select(a, b, c) picks `b` where `c` is true, `a` otherwise.
    return select(trailing_zeros, none_set, x == T(0));
}

// Handle to a region of the shared `Memory` buffer. Only the starting offset
// is stored: the helpers in this file that build one (`slice_mem`,
// `new_alloc`) accept a size argument but do not record it.
struct Alloc
{
    uint offset;
};

// Return value of `malloc`: the allocation handle plus a flag that `malloc`
// sets when the request extends past the limit derived from the buffer size.
struct MallocResult
{
    Alloc alloc;
    bool failed;
};

// Reference to a `BinInstance` record. `BinInstance_read` shifts `offset`
// right by 2 to index `Memory::memory`; `BinInstance_index` advances it by
// 4 per element.
struct BinInstanceRef
{
    uint offset;
};

// One-word record produced by `BinInstance_read`. `main0` stores
// `element_ix` in `sh_elements` and uses it to index the scene's draw tags.
struct BinInstance
{
    uint element_ix;
};

// Reference to a `Path` record; consumed by `Path_read`, which shifts
// `offset` right by 2 to index `Memory::memory`.
struct PathRef
{
    uint offset;
};

// Reference to a `Tile` record; consumed by `Tile_read`, which shifts
// `offset` right by 2 to index `Memory::memory`.
struct TileRef
{
    uint offset;
};

// Decoded path record as produced by `Path_read` from three consecutive
// words: `bbox` holds the low/high 16-bit halves of words 0 and 1
// (x, y, z, w in that order) and `tiles` wraps word 2.
struct Path
{
    uint4 bbox;
    TileRef tiles;
};

// Reference stored in `Tile::tile`. `write_fill` compares `offset` against
// zero to choose between emitting a fill command and a solid command.
struct TileSegRef
{
    uint offset;
};

// Decoded tile record as produced by `Tile_read` from two consecutive words:
// word 0 becomes `tile.offset`, word 1 is converted to the signed `backdrop`.
struct Tile
{
    TileSegRef tile;
    int backdrop;
};

// Location of a `CmdStroke` payload; `CmdStroke_write` shifts `offset`
// right by 2 to index `Memory::memory`.
struct CmdStrokeRef
{
    uint offset;
};

// Stroke command payload, serialized by `CmdStroke_write` as two words
// (`tile_ref`, then the bit pattern of `half_width`). `write_fill` fills it
// with the tile's segment offset and 0.5 * linewidth.
struct CmdStroke
{
    uint tile_ref;
    float half_width;
};

// Location of a `CmdFill` payload; `CmdFill_write` shifts `offset` right by
// 2 to index `Memory::memory`.
struct CmdFillRef
{
    uint offset;
};

// Fill command payload, serialized by `CmdFill_write` as two words
// (`tile_ref`, then `backdrop` converted to uint). `write_fill` fills it
// from the tile's segment offset and backdrop.
struct CmdFill
{
    uint tile_ref;
    int backdrop;
};

// Location of a `CmdColor` payload; `CmdColor_write` shifts `offset` right
// by 2 to index `Memory::memory`.
struct CmdColorRef
{
    uint offset;
};

// Color command payload: a single word written verbatim by `CmdColor_write`.
struct CmdColor
{
    uint rgba_color;
};

// Location of a `CmdLinGrad` payload; `CmdLinGrad_write` shifts `offset`
// right by 2 to index `Memory::memory`.
struct CmdLinGradRef
{
    uint offset;
};

// Linear-gradient command payload. `CmdLinGrad_write` serializes it as four
// consecutive words: `index`, then the bit patterns of `line_x`, `line_y`
// and `line_c`.
struct CmdLinGrad
{
    uint index;
    float line_x;
    float line_y;
    float line_c;
};

// Location of a `CmdRadGrad` payload; `CmdRadGrad_write` shifts `offset`
// right by 2 to index `Memory::memory`.
struct CmdRadGradRef
{
    uint offset;
};

// Radial-gradient command payload. `CmdRadGrad_write` serializes it as
// eleven consecutive words: `index`, the four components of `mat`, the two
// of `xlat`, the two of `c1`, then `ra` and `roff` (floats as bit patterns).
struct CmdRadGrad
{
    uint index;
    float4 mat;
    float2 xlat;
    float2 c1;
    float ra;
    float roff;
};

// Location of a `CmdImage` payload; `CmdImage_write` shifts `offset` right
// by 2 to index `Memory::memory`.
struct CmdImageRef
{
    uint offset;
};

// Image command payload. `CmdImage_write` serializes it as two words:
// `index`, then `offset` packed with x in the low 16 bits and y shifted into
// the high 16 bits.
struct CmdImage
{
    uint index;
    int2 offset;
};

// Location of a `CmdEndClip` payload; `CmdEndClip_write` shifts `offset`
// right by 2 to index `Memory::memory`.
struct CmdEndClipRef
{
    uint offset;
};

// End-clip command payload: a single word written verbatim by
// `CmdEndClip_write`.
struct CmdEndClip
{
    uint blend;
};

// Location of a `CmdJump` payload; `CmdJump_write` shifts `offset` right by
// 2 to index `Memory::memory`.
struct CmdJumpRef
{
    uint offset;
};

// Jump command payload. `alloc_cmd` sets `new_ref` to the offset of the
// freshly allocated command chunk that the stream continues in.
struct CmdJump
{
    uint new_ref;
};

// Position in a command stream. The `Cmd_*_write` helpers write a tag word
// at `offset` and, for commands with a payload, the payload at `offset + 4`.
struct CmdRef
{
    uint offset;
};

// Layout of the read/write buffer bound at buffer(0) in `main0`.
struct Memory
{
    // Bump pointer: `malloc` atomically adds the request size to it.
    uint mem_offset;
    // Error flag: `malloc` atomically raises it to at least 1 on failure.
    uint mem_error;
    // Word storage. Declared with one element but indexed with arbitrary
    // offsets by `read_mem`/`write_mem`; the usable size is derived from the
    // buffer size minus 8 (presumably the two header words above — confirm).
    uint memory[1];
};

// Same single-field shape as `Alloc`; used for the allocation members of
// `Config`. `main0` copies `.offset` out of these into `Alloc` values.
struct Alloc_1
{
    uint offset;
};

// Configuration block read by `main0` through `ConfigBuf`. Only members that
// the visible part of this file reads are annotated; the others are not
// referenced in the visible code.
struct Config
{
    // Number of elements; main0 divides it (rounded up) by 256 to get the
    // partition count.
    uint n_elements;
    uint n_pathseg;
    // Row stride used for the per-tile index, and (rounded up to a multiple
    // of 16) for the number of bins per row.
    uint width_in_tiles;
    uint height_in_tiles;
    // Base of the path records read with Path_read (12 bytes per path).
    Alloc_1 tile_alloc;
    // Base of the per-partition, per-bin (count, offset) word pairs.
    Alloc_1 bin_alloc;
    // Base of the per-tile command lists; main0 slices 1024 bytes per tile.
    Alloc_1 ptcl_alloc;
    Alloc_1 pathseg_alloc;
    Alloc_1 anno_alloc;
    Alloc_1 trans_alloc;
    Alloc_1 path_bbox_alloc;
    // Shifted right by 2 in main0 to form a word index into Memory::memory
    // (4 words per element).
    Alloc_1 drawmonoid_alloc;
    Alloc_1 clip_alloc;
    Alloc_1 clip_bic_alloc;
    Alloc_1 clip_stack_alloc;
    Alloc_1 clip_bbox_alloc;
    Alloc_1 draw_bbox_alloc;
    // Shifted right by 2 in main0 to form a word index into Memory::memory.
    Alloc_1 drawinfo_alloc;
    uint n_trans;
    uint n_path;
    uint n_clip;
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
    uint pathseg_offset;
    // Shifted right by 2 in main0 to form a word index into SceneBuf::scene.
    uint drawtag_offset;
    // Shifted right by 2 in main0 to form a word index into SceneBuf::scene.
    uint drawdata_offset;
};

// Wrapper for the read-only configuration buffer bound at buffer(1) in
// `main0`.
struct ConfigBuf
{
    Config conf;
};

// Read-only scene buffer bound at buffer(2) in `main0`, which reads draw
// tags and draw data from `scene`. Declared with one element but indexed
// with arbitrary word offsets.
struct SceneBuf
{
    uint scene[1];
};

// Workgroup size constant (256 x 1 x 1). `main0` declares its threadgroup
// arrays with 256 entries to match.
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);

// Returns a handle that starts `offset` past the start of `a`.
// `size` is accepted but not used.
static inline __attribute__((always_inline))
Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
{
    Alloc sub;
    sub.offset = a.offset + offset;
    return sub;
}

// Access-check hook consulted by `read_mem` and `write_mem`. No check is
// performed here: every access is reported as allowed.
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
    const bool access_allowed = true;
    return access_allowed;
}

// Reads the word at index `offset` of `v_260.memory`. `alloc` is only
// forwarded to `touch_mem`; when that rejects the access, 0 is returned.
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_260, constant uint& v_260BufferSize)
{
    if (touch_mem(alloc, offset))
    {
        return v_260.memory[offset];
    }
    return 0u;
}

// Builds a handle starting at `offset`. `size` and `mem_ok` are accepted
// but not used.
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
    return Alloc{ offset };
}

// Returns the reference `index` records after `ref`, stepping 4 per record.
static inline __attribute__((always_inline))
BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const uint& index)
{
    BinInstanceRef stepped;
    stepped.offset = ref.offset + (index * 4u);
    return stepped;
}

// Loads the single-word BinInstance record that `ref` points at.
static inline __attribute__((always_inline))
BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
{
    // `ref.offset` is shifted right by 2 to index the word array.
    const uint word_ix = ref.offset >> 2u;
    BinInstance inst;
    inst.element_ix = read_mem(a, word_ix, v_260, v_260BufferSize);
    return inst;
}

// Loads the three-word Path record that `ref` points at and unpacks it:
// words 0 and 1 each hold two 16-bit bbox components (low half first),
// word 2 is the tiles reference.
static inline __attribute__((always_inline))
Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint word0_ix = ref.offset >> 2u;
    const uint word1_ix = word0_ix + 1u;
    const uint word2_ix = word0_ix + 2u;

    const uint packed_xy = read_mem(a, word0_ix, v_260, v_260BufferSize);
    const uint packed_zw = read_mem(a, word1_ix, v_260, v_260BufferSize);
    const uint tiles_offset = read_mem(a, word2_ix, v_260, v_260BufferSize);

    Path result;
    result.bbox = uint4(packed_xy & 65535u, packed_xy >> 16u, packed_zw & 65535u, packed_zw >> 16u);
    result.tiles = TileRef{ tiles_offset };
    return result;
}

// Intentionally empty: neither `el_ix` nor `a` is used and nothing is
// stored. `read_tile_alloc` correspondingly returns a handle that is
// independent of the element index.
static inline __attribute__((always_inline))
void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a)
{
}

// Returns a handle starting at offset 0, with a size argument derived from
// the buffer size (minus 8, rounded down to a multiple of 4). `el_ix` is
// not used, so the same handle is returned for every element.
static inline __attribute__((always_inline))
Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint region_start = 0u;
    const uint region_size = uint(int((v_260BufferSize - 8) / 4) * 4);
    return new_alloc(region_start, region_size, mem_ok);
}

// Loads the two-word Tile record that `ref` points at: word 0 is the
// segment reference, word 1 is the backdrop (converted to int).
static inline __attribute__((always_inline))
Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint seg_word_ix = ref.offset >> 2u;
    const uint backdrop_word_ix = seg_word_ix + 1u;

    const uint seg_offset = read_mem(a, seg_word_ix, v_260, v_260BufferSize);
    const uint backdrop_bits = read_mem(a, backdrop_word_ix, v_260, v_260BufferSize);

    Tile result;
    result.tile = TileSegRef{ seg_offset };
    result.backdrop = int(backdrop_bits);
    return result;
}

// Bump allocator over `v_260`. Atomically advances `mem_offset` by `size`
// and uses the previous value as the allocation offset. The result is
// flagged as failed when offset + size exceeds the limit derived from the
// buffer size; in that case `mem_error` is atomically raised to at least 1.
static inline __attribute__((always_inline))
MallocResult malloc(thread const uint& size, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint start = atomic_fetch_add_explicit((device atomic_uint*)&v_260.mem_offset, size, memory_order_relaxed);
    const uint limit = uint(int((v_260BufferSize - 8) / 4) * 4);

    MallocResult result;
    result.failed = (start + size) > limit;
    const bool succeeded = !result.failed;
    result.alloc = new_alloc(start, size, succeeded);

    if (result.failed)
    {
        // Record the failure; the previous flag value is not needed.
        atomic_fetch_max_explicit((device atomic_uint*)&v_260.mem_error, 1u, memory_order_relaxed);
    }
    return result;
}

// Stores `val` at index `offset` of `v_260.memory`. `alloc` is only
// forwarded to `touch_mem`; when that rejects the access nothing is written.
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_260, constant uint& v_260BufferSize)
{
    if (touch_mem(alloc, offset))
    {
        v_260.memory[offset] = val;
    }
}

// Serializes a CmdJump payload (one word: `new_ref`) at `ref`.
static inline __attribute__((always_inline))
void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint word_ix = ref.offset >> 2u;
    write_mem(a, word_ix, s.new_ref, v_260, v_260BufferSize);
}

// Emits a jump command at `ref`: tag word 11, then the CmdJump payload at
// `ref.offset + 4`.
static inline __attribute__((always_inline))
void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 11u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);

    const CmdJumpRef payload_ref = CmdJumpRef{ ref.offset + 4u };
    CmdJump_write(a, payload_ref, s, v_260, v_260BufferSize);
}

// Ensures there is room to append a command at `cmd_ref`.
// While `cmd_ref` is below `cmd_limit` nothing happens. Otherwise a new
// 1024-byte chunk is allocated, a jump to it is written at the current
// position, and `cmd_alloc`, `cmd_ref` and `cmd_limit` are re-pointed at the
// new chunk (the limit leaves 144 bytes of headroom at its end).
// Returns false only when the allocation fails; the in/out arguments are
// left untouched in that case.
static inline __attribute__((always_inline))
bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_260, constant uint& v_260BufferSize)
{
    const bool has_room = cmd_ref.offset < cmd_limit;
    if (!has_room)
    {
        const uint chunk_bytes = 1024u;
        const MallocResult chunk = malloc(chunk_bytes, v_260, v_260BufferSize);
        if (chunk.failed)
        {
            return false;
        }

        // Link the old chunk to the new one before switching over.
        const CmdJump link = CmdJump{ chunk.alloc.offset };
        Cmd_Jump_write(cmd_alloc, cmd_ref, link, v_260, v_260BufferSize);

        cmd_alloc = chunk.alloc;
        cmd_ref = CmdRef{ cmd_alloc.offset };
        cmd_limit = (cmd_alloc.offset + chunk_bytes) - 144u;
    }
    return true;
}

// Serializes a CmdFill payload at `ref` as two words: `tile_ref`, then
// `backdrop` converted to uint.
static inline __attribute__((always_inline))
void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tile_word_ix = ref.offset >> 2u;
    const uint backdrop_word_ix = tile_word_ix + 1u;
    const uint backdrop_bits = uint(s.backdrop);

    write_mem(a, tile_word_ix, s.tile_ref, v_260, v_260BufferSize);
    write_mem(a, backdrop_word_ix, backdrop_bits, v_260, v_260BufferSize);
}

// Emits a fill command at `ref`: tag word 1, then the CmdFill payload at
// `ref.offset + 4`.
static inline __attribute__((always_inline))
void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 1u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);

    const CmdFillRef payload_ref = CmdFillRef{ ref.offset + 4u };
    CmdFill_write(a, payload_ref, s, v_260, v_260BufferSize);
}

// Emits a solid command at `ref`: just the tag word 3, no payload.
static inline __attribute__((always_inline))
void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 3u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);
}

// Serializes a CmdStroke payload at `ref` as two words: `tile_ref`, then
// the bit pattern of `half_width`.
static inline __attribute__((always_inline))
void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tile_word_ix = ref.offset >> 2u;
    const uint width_word_ix = tile_word_ix + 1u;
    const uint width_bits = as_type<uint>(s.half_width);

    write_mem(a, tile_word_ix, s.tile_ref, v_260, v_260BufferSize);
    write_mem(a, width_word_ix, width_bits, v_260, v_260BufferSize);
}

// Emits a stroke command at `ref`: tag word 2, then the CmdStroke payload
// at `ref.offset + 4`.
static inline __attribute__((always_inline))
void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 2u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);

    const CmdStrokeRef payload_ref = CmdStrokeRef{ ref.offset + 4u };
    CmdStroke_write(a, payload_ref, s, v_260, v_260BufferSize);
}

// Appends the coverage command for `tile` at `cmd_ref` and advances
// `cmd_ref` past it.
//   - linewidth not below zero: stroke command with half the line width
//     (advance 12).
//   - linewidth below zero, tile has a non-zero segment offset: fill command
//     carrying the segment offset and backdrop (advance 12).
//   - linewidth below zero, segment offset is zero: solid command (advance 4).
static inline __attribute__((always_inline))
void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_260, constant uint& v_260BufferSize)
{
    const bool is_fill = linewidth < 0.0;
    if (!is_fill)
    {
        const CmdStroke stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth };
        Cmd_Stroke_write(alloc, cmd_ref, stroke, v_260, v_260BufferSize);
        cmd_ref.offset += 12u;
        return;
    }

    if (tile.tile.offset == 0u)
    {
        Cmd_Solid_write(alloc, cmd_ref, v_260, v_260BufferSize);
        cmd_ref.offset += 4u;
        return;
    }

    const CmdFill fill = CmdFill{ tile.tile.offset, tile.backdrop };
    Cmd_Fill_write(alloc, cmd_ref, fill, v_260, v_260BufferSize);
    cmd_ref.offset += 12u;
}

// Serializes a CmdColor payload (one word: `rgba_color`) at `ref`.
static inline __attribute__((always_inline))
void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint word_ix = ref.offset >> 2u;
    write_mem(a, word_ix, s.rgba_color, v_260, v_260BufferSize);
}

// Emits a color command at `ref`: tag word 5, then the CmdColor payload at
// `ref.offset + 4`.
static inline __attribute__((always_inline))
void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 5u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);

    const CmdColorRef payload_ref = CmdColorRef{ ref.offset + 4u };
    CmdColor_write(a, payload_ref, s, v_260, v_260BufferSize);
}

// Serializes a CmdLinGrad payload at `ref` as four consecutive words:
// `index`, then the bit patterns of `line_x`, `line_y` and `line_c`.
static inline __attribute__((always_inline))
void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint first_word_ix = ref.offset >> 2u;

    // Payload words in the order they are laid out in memory.
    uint payload[4] = {
        s.index,
        as_type<uint>(s.line_x),
        as_type<uint>(s.line_y),
        as_type<uint>(s.line_c)
    };

    for (uint k = 0u; k < 4u; k++)
    {
        const uint word_ix = first_word_ix + k;
        write_mem(a, word_ix, payload[k], v_260, v_260BufferSize);
    }
}

// Emits a linear-gradient command at `ref`: tag word 6, then the CmdLinGrad
// payload at `ref.offset + 4`.
static inline __attribute__((always_inline))
void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 6u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);

    const CmdLinGradRef payload_ref = CmdLinGradRef{ ref.offset + 4u };
    CmdLinGrad_write(a, payload_ref, s, v_260, v_260BufferSize);
}

// Serializes a CmdRadGrad payload at `ref` as eleven consecutive words:
// `index`, mat.xyzw, xlat.xy, c1.xy, `ra`, `roff` (floats as bit patterns).
static inline __attribute__((always_inline))
void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint first_word_ix = ref.offset >> 2u;

    // Payload words in the order they are laid out in memory.
    uint payload[11] = {
        s.index,
        as_type<uint>(s.mat.x),
        as_type<uint>(s.mat.y),
        as_type<uint>(s.mat.z),
        as_type<uint>(s.mat.w),
        as_type<uint>(s.xlat.x),
        as_type<uint>(s.xlat.y),
        as_type<uint>(s.c1.x),
        as_type<uint>(s.c1.y),
        as_type<uint>(s.ra),
        as_type<uint>(s.roff)
    };

    for (uint k = 0u; k < 11u; k++)
    {
        const uint word_ix = first_word_ix + k;
        write_mem(a, word_ix, payload[k], v_260, v_260BufferSize);
    }
}

// Emits a radial-gradient command at `ref`: tag word 7, then the CmdRadGrad
// payload at `ref.offset + 4`.
static inline __attribute__((always_inline))
void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 7u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);

    const CmdRadGradRef payload_ref = CmdRadGradRef{ ref.offset + 4u };
    CmdRadGrad_write(a, payload_ref, s, v_260, v_260BufferSize);
}

// Serializes a CmdImage payload at `ref` as two words: `index`, then the
// offset packed with x in the low 16 bits and y shifted into the high 16.
static inline __attribute__((always_inline))
void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint index_word_ix = ref.offset >> 2u;
    const uint offset_word_ix = index_word_ix + 1u;

    const uint low_half = uint(s.offset.x) & 65535u;
    const uint high_half = uint(s.offset.y) << 16u;
    const uint packed_offset = low_half | high_half;

    write_mem(a, index_word_ix, s.index, v_260, v_260BufferSize);
    write_mem(a, offset_word_ix, packed_offset, v_260, v_260BufferSize);
}

// Emits an image command at `ref`: tag word 8, then the CmdImage payload at
// `ref.offset + 4`.
static inline __attribute__((always_inline))
void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 8u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);

    const CmdImageRef payload_ref = CmdImageRef{ ref.offset + 4u };
    CmdImage_write(a, payload_ref, s, v_260, v_260BufferSize);
}

// Emits a begin-clip command at `ref`: just the tag word 9, no payload.
static inline __attribute__((always_inline))
void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 9u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);
}

// Serializes a CmdEndClip payload (one word: `blend`) at `ref`.
static inline __attribute__((always_inline))
void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint word_ix = ref.offset >> 2u;
    write_mem(a, word_ix, s.blend, v_260, v_260BufferSize);
}

// Emits an end-clip command at `ref`: tag word 10, then the CmdEndClip
// payload at `ref.offset + 4`.
static inline __attribute__((always_inline))
void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 10u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);

    const CmdEndClipRef payload_ref = CmdEndClipRef{ ref.offset + 4u };
    CmdEndClip_write(a, payload_ref, s, v_260, v_260BufferSize);
}

// Emits an end command at `ref`: just the tag word 0, no payload.
static inline __attribute__((always_inline))
void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize)
{
    const uint tag_word_ix = ref.offset >> 2u;
    const uint tag = 0u;
    write_mem(a, tag_word_ix, tag, v_260, v_260BufferSize);
}

kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1378 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
    threadgroup uint sh_bitmaps[8][256];
    threadgroup Alloc sh_part_elements[256];
    threadgroup uint sh_part_count[256];
    threadgroup uint sh_elements[256];
    threadgroup uint sh_tile_stride[256];
    threadgroup uint sh_tile_width[256];
    threadgroup uint sh_tile_x0[256];
    threadgroup uint sh_tile_y0[256];
    threadgroup uint sh_tile_base[256];
    threadgroup uint sh_tile_count[256];
    constant uint& v_260BufferSize = spvBufferSizeConstants[0];
    uint width_in_bins = ((_1005.conf.width_in_tiles + 16u) - 1u) / 16u;
    uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x;
    uint partition_ix = 0u;
    uint n_partitions = ((_1005.conf.n_elements + 256u) - 1u) / 256u;
    uint th_ix = gl_LocalInvocationID.x;
    uint bin_tile_x = 16u * gl_WorkGroupID.x;
    uint bin_tile_y = 16u * gl_WorkGroupID.y;
    uint tile_x = gl_LocalInvocationID.x % 16u;
    uint tile_y = gl_LocalInvocationID.x / 16u;
    uint this_tile_ix = (((bin_tile_y + tile_y) * _1005.conf.width_in_tiles) + bin_tile_x) + tile_x;
    Alloc param;
    param.offset = _1005.conf.ptcl_alloc.offset;
    uint param_1 = this_tile_ix * 1024u;
    uint param_2 = 1024u;
    Alloc cmd_alloc = slice_mem(param, param_1, param_2);
    CmdRef cmd_ref = CmdRef{ cmd_alloc.offset };
    uint cmd_limit = (cmd_ref.offset + 1024u) - 144u;
    uint clip_depth = 0u;
    uint clip_zero_depth = 0u;
    uint rd_ix = 0u;
    uint wr_ix = 0u;
    uint part_start_ix = 0u;
    uint ready_ix = 0u;
    cmd_ref.offset += 4u;
    uint render_blend_depth = 0u;
    uint max_blend_depth = 0u;
    uint drawmonoid_start = _1005.conf.drawmonoid_alloc.offset >> uint(2);
    uint drawtag_start = _1005.conf.drawtag_offset >> uint(2);
    uint drawdata_start = _1005.conf.drawdata_offset >> uint(2);
    uint drawinfo_start = _1005.conf.drawinfo_alloc.offset >> uint(2);
    bool mem_ok = v_260.mem_error == 0u;
    Alloc param_3;
    Alloc param_5;
    uint _1310;
    uint element_ix;
    Alloc param_14;
    uint tile_count;
    uint _1611;
    float linewidth;
    CmdLinGrad cmd_lin;
    CmdRadGrad cmd_rad;
    while (true)
    {
        for (uint i = 0u; i < 8u; i++)
        {
            sh_bitmaps[i][th_ix] = 0u;
        }
        bool _1362;
        for (;;)
        {
            if ((ready_ix == wr_ix) && (partition_ix < n_partitions))
            {
                part_start_ix = ready_ix;
                uint count = 0u;
                bool _1160 = th_ix < 256u;
                bool _1168;
                if (_1160)
                {
                    _1168 = (partition_ix + th_ix) < n_partitions;
                }
                else
                {
                    _1168 = _1160;
                }
                if (_1168)
                {
                    uint in_ix = (_1005.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u);
                    param_3.offset = _1005.conf.bin_alloc.offset;
                    uint param_4 = in_ix;
                    count = read_mem(param_3, param_4, v_260, v_260BufferSize);
                    param_5.offset = _1005.conf.bin_alloc.offset;
                    uint param_6 = in_ix + 1u;
                    uint offset = read_mem(param_5, param_6, v_260, v_260BufferSize);
                    uint param_7 = offset;
                    uint param_8 = count * 4u;
                    bool param_9 = mem_ok;
                    sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9);
                }
                for (uint i_1 = 0u; i_1 < 8u; i_1++)
                {
                    if (th_ix < 256u)
                    {
                        sh_part_count[th_ix] = count;
                    }
                    threadgroup_barrier(mem_flags::mem_threadgroup);
                    if (th_ix < 256u)
                    {
                        if (th_ix >= (1u << i_1))
                        {
                            count += sh_part_count[th_ix - (1u << i_1)];
                        }
                    }
                    threadgroup_barrier(mem_flags::mem_threadgroup);
                }
                if (th_ix < 256u)
                {
                    sh_part_count[th_ix] = part_start_ix + count;
                }
                threadgroup_barrier(mem_flags::mem_threadgroup);
                ready_ix = sh_part_count[255];
                partition_ix += 256u;
            }
            uint ix = rd_ix + th_ix;
            if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok)
            {
                uint part_ix = 0u;
                for (uint i_2 = 0u; i_2 < 8u; i_2++)
                {
                    uint probe = part_ix + (128u >> i_2);
                    if (ix >= sh_part_count[probe - 1u])
                    {
                        part_ix = probe;
                    }
                }
                if (part_ix > 0u)
                {
                    _1310 = sh_part_count[part_ix - 1u];
                }
                else
                {
                    _1310 = part_start_ix;
                }
                ix -= _1310;
                Alloc bin_alloc = sh_part_elements[part_ix];
                BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset };
                BinInstanceRef param_10 = inst_ref;
                uint param_11 = ix;
                Alloc param_12 = bin_alloc;
                BinInstanceRef param_13 = BinInstance_index(param_10, param_11);
                BinInstance inst = BinInstance_read(param_12, param_13, v_260, v_260BufferSize);
                sh_elements[th_ix] = inst.element_ix;
            }
            threadgroup_barrier(mem_flags::mem_threadgroup);
            wr_ix = min((rd_ix + 256u), ready_ix);
            bool _1352 = (wr_ix - rd_ix) < 256u;
            if (_1352)
            {
                _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions);
            }
            else
            {
                _1362 = _1352;
            }
            if (_1362)
            {
                continue;
            }
            else
            {
                break;
            }
        }
        uint tag = 0u;
        if ((th_ix + rd_ix) < wr_ix)
        {
            element_ix = sh_elements[th_ix];
            tag = _1378.scene[drawtag_start + element_ix];
        }
        switch (tag)
        {
            case 68u:
            case 72u:
            case 276u:
            case 732u:
            case 5u:
            case 37u:
            {
                uint drawmonoid_base = drawmonoid_start + (4u * element_ix);
                uint path_ix = v_260.memory[drawmonoid_base];
                param_14.offset = _1005.conf.tile_alloc.offset;
                PathRef param_15 = PathRef{ _1005.conf.tile_alloc.offset + (path_ix * 12u) };
                Path path = Path_read(param_14, param_15, v_260, v_260BufferSize);
                uint stride = path.bbox.z - path.bbox.x;
                sh_tile_stride[th_ix] = stride;
                int dx = int(path.bbox.x) - int(bin_tile_x);
                int dy = int(path.bbox.y) - int(bin_tile_y);
                int x0 = clamp(dx, 0, 16);
                int y0 = clamp(dy, 0, 16);
                int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16);
                int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16);
                sh_tile_width[th_ix] = uint(x1 - x0);
                sh_tile_x0[th_ix] = uint(x0);
                sh_tile_y0[th_ix] = uint(y0);
                tile_count = uint(x1 - x0) * uint(y1 - y0);
                uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u);
                sh_tile_base[th_ix] = base;
                uint param_16 = path.tiles.offset;
                uint param_17 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
                bool param_18 = mem_ok;
                Alloc path_alloc = new_alloc(param_16, param_17, param_18);
                uint param_19 = th_ix;
                Alloc param_20 = path_alloc;
                write_tile_alloc(param_19, param_20);
                break;
            }
            default:
            {
                tile_count = 0u;
                break;
            }
        }
        sh_tile_count[th_ix] = tile_count;
        for (uint i_3 = 0u; i_3 < 8u; i_3++)
        {
            threadgroup_barrier(mem_flags::mem_threadgroup);
            if (th_ix >= (1u << i_3))
            {
                tile_count += sh_tile_count[th_ix - (1u << i_3)];
            }
            threadgroup_barrier(mem_flags::mem_threadgroup);
            sh_tile_count[th_ix] = tile_count;
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
        uint total_tile_count = sh_tile_count[255];
        for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u)
        {
            uint el_ix = 0u;
            for (uint i_4 = 0u; i_4 < 8u; i_4++)
            {
                uint probe_1 = el_ix + (128u >> i_4);
                if (ix_1 >= sh_tile_count[probe_1 - 1u])
                {
                    el_ix = probe_1;
                }
            }
            uint element_ix_1 = sh_elements[el_ix];
            uint tag_1 = _1378.scene[drawtag_start + element_ix_1];
            if (el_ix > 0u)
            {
                _1611 = sh_tile_count[el_ix - 1u];
            }
            else
            {
                _1611 = 0u;
            }
            uint seq_ix = ix_1 - _1611;
            uint width = sh_tile_width[el_ix];
            uint x = sh_tile_x0[el_ix] + (seq_ix % width);
            uint y = sh_tile_y0[el_ix] + (seq_ix / width);
            bool include_tile = false;
            if (mem_ok)
            {
                uint param_21 = el_ix;
                bool param_22 = mem_ok;
                Alloc param_23 = read_tile_alloc(param_21, param_22, v_260, v_260BufferSize);
                TileRef param_24 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) };
                Tile tile = Tile_read(param_23, param_24, v_260, v_260BufferSize);
                bool is_clip = (tag_1 & 1u) != 0u;
                bool is_blend = false;
                if (is_clip)
                {
                    uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1);
                    uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u];
                    uint dd = drawdata_start + (scene_offset >> uint(2));
                    uint blend = _1378.scene[dd];
                    is_blend = blend != 3u;
                }
                bool _1698 = tile.tile.offset != 0u;
                bool _1707;
                if (!_1698)
                {
                    _1707 = (tile.backdrop == 0) == is_clip;
                }
                else
                {
                    _1707 = _1698;
                }
                include_tile = _1707 || is_blend;
            }
            if (include_tile)
            {
                uint el_slice = el_ix / 32u;
                uint el_mask = 1u << (el_ix & 31u);
                uint _1729 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed);
            }
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
        uint slice_ix = 0u;
        uint bitmap = sh_bitmaps[0][th_ix];
        while (mem_ok)
        {
            if (bitmap == 0u)
            {
                slice_ix++;
                if (slice_ix == 8u)
                {
                    break;
                }
                bitmap = sh_bitmaps[slice_ix][th_ix];
                if (bitmap == 0u)
                {
                    continue;
                }
            }
            uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap)));
            uint element_ix_2 = sh_elements[element_ref_ix];
            bitmap &= (bitmap - 1u);
            uint drawtag = _1378.scene[drawtag_start + element_ix_2];
            if (clip_zero_depth == 0u)
            {
                uint param_25 = element_ref_ix;
                bool param_26 = mem_ok;
                Alloc param_27 = read_tile_alloc(param_25, param_26, v_260, v_260BufferSize);
                TileRef param_28 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) };
                Tile tile_1 = Tile_read(param_27, param_28, v_260, v_260BufferSize);
                uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2);
                uint scene_offset_1 = v_260.memory[drawmonoid_base_2 + 2u];
                uint info_offset = v_260.memory[drawmonoid_base_2 + 3u];
                uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2));
                uint di = drawinfo_start + (info_offset >> uint(2));
                switch (drawtag)
                {
                    case 68u:
                    {
                        linewidth = as_type<float>(v_260.memory[di]);
                        Alloc param_29 = cmd_alloc;
                        CmdRef param_30 = cmd_ref;
                        uint param_31 = cmd_limit;
                        bool _1854 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize);
                        cmd_alloc = param_29;
                        cmd_ref = param_30;
                        cmd_limit = param_31;
                        if (!_1854)
                        {
                            break;
                        }
                        Alloc param_32 = cmd_alloc;
                        CmdRef param_33 = cmd_ref;
                        Tile param_34 = tile_1;
                        float param_35 = linewidth;
                        write_fill(param_32, param_33, param_34, param_35, v_260, v_260BufferSize);
                        cmd_ref = param_33;
                        uint rgba = _1378.scene[dd_1];
                        Alloc param_36 = cmd_alloc;
                        CmdRef param_37 = cmd_ref;
                        CmdColor param_38 = CmdColor{ rgba };
                        Cmd_Color_write(param_36, param_37, param_38, v_260, v_260BufferSize);
                        cmd_ref.offset += 8u;
                        break;
                    }
                    case 276u:
                    {
                        Alloc param_39 = cmd_alloc;
                        CmdRef param_40 = cmd_ref;
                        uint param_41 = cmd_limit;
                        bool _1895 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize);
                        cmd_alloc = param_39;
                        cmd_ref = param_40;
                        cmd_limit = param_41;
                        if (!_1895)
                        {
                            break;
                        }
                        linewidth = as_type<float>(v_260.memory[di]);
                        Alloc param_42 = cmd_alloc;
                        CmdRef param_43 = cmd_ref;
                        Tile param_44 = tile_1;
                        float param_45 = linewidth;
                        write_fill(param_42, param_43, param_44, param_45, v_260, v_260BufferSize);
                        cmd_ref = param_43;
                        cmd_lin.index = _1378.scene[dd_1];
                        cmd_lin.line_x = as_type<float>(v_260.memory[di + 1u]);
                        cmd_lin.line_y = as_type<float>(v_260.memory[di + 2u]);
                        cmd_lin.line_c = as_type<float>(v_260.memory[di + 3u]);
                        Alloc param_46 = cmd_alloc;
                        CmdRef param_47 = cmd_ref;
                        CmdLinGrad param_48 = cmd_lin;
                        Cmd_LinGrad_write(param_46, param_47, param_48, v_260, v_260BufferSize);
                        cmd_ref.offset += 20u;
                        break;
                    }
                    case 732u:
                    {
                        Alloc param_49 = cmd_alloc;
                        CmdRef param_50 = cmd_ref;
                        uint param_51 = cmd_limit;
                        bool _1959 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize);
                        cmd_alloc = param_49;
                        cmd_ref = param_50;
                        cmd_limit = param_51;
                        if (!_1959)
                        {
                            break;
                        }
                        linewidth = as_type<float>(v_260.memory[di]);
                        Alloc param_52 = cmd_alloc;
                        CmdRef param_53 = cmd_ref;
                        Tile param_54 = tile_1;
                        float param_55 = linewidth;
                        write_fill(param_52, param_53, param_54, param_55, v_260, v_260BufferSize);
                        cmd_ref = param_53;
                        cmd_rad.index = _1378.scene[dd_1];
                        cmd_rad.mat = as_type<float4>(uint4(v_260.memory[di + 1u], v_260.memory[di + 2u], v_260.memory[di + 3u], v_260.memory[di + 4u]));
                        cmd_rad.xlat = as_type<float2>(uint2(v_260.memory[di + 5u], v_260.memory[di + 6u]));
                        cmd_rad.c1 = as_type<float2>(uint2(v_260.memory[di + 7u], v_260.memory[di + 8u]));
                        cmd_rad.ra = as_type<float>(v_260.memory[di + 9u]);
                        cmd_rad.roff = as_type<float>(v_260.memory[di + 10u]);
                        Alloc param_56 = cmd_alloc;
                        CmdRef param_57 = cmd_ref;
                        CmdRadGrad param_58 = cmd_rad;
                        Cmd_RadGrad_write(param_56, param_57, param_58, v_260, v_260BufferSize);
                        cmd_ref.offset += 48u;
                        break;
                    }
                    case 72u:
                    {
                        linewidth = as_type<float>(v_260.memory[di]);
                        Alloc param_59 = cmd_alloc;
                        CmdRef param_60 = cmd_ref;
                        uint param_61 = cmd_limit;
                        bool _2065 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize);
                        cmd_alloc = param_59;
                        cmd_ref = param_60;
                        cmd_limit = param_61;
                        if (!_2065)
                        {
                            break;
                        }
                        Alloc param_62 = cmd_alloc;
                        CmdRef param_63 = cmd_ref;
                        Tile param_64 = tile_1;
                        float param_65 = linewidth;
                        write_fill(param_62, param_63, param_64, param_65, v_260, v_260BufferSize);
                        cmd_ref = param_63;
                        uint index = _1378.scene[dd_1];
                        uint raw1 = _1378.scene[dd_1 + 1u];
                        int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16);
                        Alloc param_66 = cmd_alloc;
                        CmdRef param_67 = cmd_ref;
                        CmdImage param_68 = CmdImage{ index, offset_1 };
                        Cmd_Image_write(param_66, param_67, param_68, v_260, v_260BufferSize);
                        cmd_ref.offset += 12u;
                        break;
                    }
                    case 5u:
                    {
                        bool _2118 = tile_1.tile.offset == 0u;
                        bool _2124;
                        if (_2118)
                        {
                            _2124 = tile_1.backdrop == 0;
                        }
                        else
                        {
                            _2124 = _2118;
                        }
                        if (_2124)
                        {
                            clip_zero_depth = clip_depth + 1u;
                        }
                        else
                        {
                            Alloc param_69 = cmd_alloc;
                            CmdRef param_70 = cmd_ref;
                            uint param_71 = cmd_limit;
                            bool _2136 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize);
                            cmd_alloc = param_69;
                            cmd_ref = param_70;
                            cmd_limit = param_71;
                            if (!_2136)
                            {
                                break;
                            }
                            Alloc param_72 = cmd_alloc;
                            CmdRef param_73 = cmd_ref;
                            Cmd_BeginClip_write(param_72, param_73, v_260, v_260BufferSize);
                            cmd_ref.offset += 4u;
                            render_blend_depth++;
                            max_blend_depth = max(max_blend_depth, render_blend_depth);
                        }
                        clip_depth++;
                        break;
                    }
                    case 37u:
                    {
                        clip_depth--;
                        Alloc param_74 = cmd_alloc;
                        CmdRef param_75 = cmd_ref;
                        uint param_76 = cmd_limit;
                        bool _2169 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize);
                        cmd_alloc = param_74;
                        cmd_ref = param_75;
                        cmd_limit = param_76;
                        if (!_2169)
                        {
                            break;
                        }
                        Alloc param_77 = cmd_alloc;
                        CmdRef param_78 = cmd_ref;
                        Tile param_79 = tile_1;
                        float param_80 = -1.0;
                        write_fill(param_77, param_78, param_79, param_80, v_260, v_260BufferSize);
                        cmd_ref = param_78;
                        uint blend_1 = _1378.scene[dd_1];
                        Alloc param_81 = cmd_alloc;
                        CmdRef param_82 = cmd_ref;
                        CmdEndClip param_83 = CmdEndClip{ blend_1 };
                        Cmd_EndClip_write(param_81, param_82, param_83, v_260, v_260BufferSize);
                        cmd_ref.offset += 8u;
                        render_blend_depth--;
                        break;
                    }
                }
            }
            else
            {
                switch (drawtag)
                {
                    case 5u:
                    {
                        clip_depth++;
                        break;
                    }
                    case 37u:
                    {
                        if (clip_depth == clip_zero_depth)
                        {
                            clip_zero_depth = 0u;
                        }
                        clip_depth--;
                        break;
                    }
                }
            }
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
        rd_ix += 256u;
        if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions))
        {
            break;
        }
    }
    bool _2241 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles;
    bool _2250;
    if (_2241)
    {
        _2250 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles;
    }
    else
    {
        _2250 = _2241;
    }
    if (_2250)
    {
        Alloc param_84 = cmd_alloc;
        CmdRef param_85 = cmd_ref;
        Cmd_End_write(param_84, param_85, v_260, v_260BufferSize);
        if (max_blend_depth > 4u)
        {
        }
    }
}