mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-25 18:56:35 +11:00
7134be2329
We always do BeginClip/EndClip if it's a solid tile and the blend mode is not default. Also fix missing entry in pipeline layout (affects Vulkan but not Metal).
273 lines
7.5 KiB
Text
Generated
273 lines
7.5 KiB
Text
Generated
#pragma clang diagnostic ignored "-Wmissing-prototypes"
|
|
#pragma clang diagnostic ignored "-Wunused-variable"
|
|
|
|
#include <metal_stdlib>
|
|
#include <simd/simd.h>
|
|
#include <metal_atomic>
|
|
|
|
using namespace metal;
|
|
|
|
struct Alloc
|
|
{
|
|
uint offset;
|
|
};
|
|
|
|
struct MallocResult
|
|
{
|
|
Alloc alloc;
|
|
bool failed;
|
|
};
|
|
|
|
struct PathRef
|
|
{
|
|
uint offset;
|
|
};
|
|
|
|
struct TileRef
|
|
{
|
|
uint offset;
|
|
};
|
|
|
|
struct Path
|
|
{
|
|
uint4 bbox;
|
|
TileRef tiles;
|
|
};
|
|
|
|
struct Memory
|
|
{
|
|
uint mem_offset;
|
|
uint mem_error;
|
|
uint memory[1];
|
|
};
|
|
|
|
struct Alloc_1
|
|
{
|
|
uint offset;
|
|
};
|
|
|
|
struct Config
|
|
{
|
|
uint n_elements;
|
|
uint n_pathseg;
|
|
uint width_in_tiles;
|
|
uint height_in_tiles;
|
|
Alloc_1 tile_alloc;
|
|
Alloc_1 bin_alloc;
|
|
Alloc_1 ptcl_alloc;
|
|
Alloc_1 pathseg_alloc;
|
|
Alloc_1 anno_alloc;
|
|
Alloc_1 trans_alloc;
|
|
Alloc_1 path_bbox_alloc;
|
|
Alloc_1 drawmonoid_alloc;
|
|
Alloc_1 clip_alloc;
|
|
Alloc_1 clip_bic_alloc;
|
|
Alloc_1 clip_stack_alloc;
|
|
Alloc_1 clip_bbox_alloc;
|
|
Alloc_1 draw_bbox_alloc;
|
|
Alloc_1 drawinfo_alloc;
|
|
uint n_trans;
|
|
uint n_path;
|
|
uint n_clip;
|
|
uint trans_offset;
|
|
uint linewidth_offset;
|
|
uint pathtag_offset;
|
|
uint pathseg_offset;
|
|
uint drawtag_offset;
|
|
uint drawdata_offset;
|
|
};
|
|
|
|
struct ConfigBuf
|
|
{
|
|
Config conf;
|
|
};
|
|
|
|
struct SceneBuf
|
|
{
|
|
uint scene[1];
|
|
};
|
|
|
|
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
|
|
|
|
static inline __attribute__((always_inline))
|
|
float4 load_draw_bbox(thread const uint& draw_ix, device Memory& v_70, constant uint& v_70BufferSize, const device ConfigBuf& v_181)
|
|
{
|
|
uint base = (v_181.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix);
|
|
float x0 = as_type<float>(v_70.memory[base]);
|
|
float y0 = as_type<float>(v_70.memory[base + 1u]);
|
|
float x1 = as_type<float>(v_70.memory[base + 2u]);
|
|
float y1 = as_type<float>(v_70.memory[base + 3u]);
|
|
float4 bbox = float4(x0, y0, x1, y1);
|
|
return bbox;
|
|
}
|
|
|
|
static inline __attribute__((always_inline))
|
|
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
|
|
{
|
|
Alloc a;
|
|
a.offset = offset;
|
|
return a;
|
|
}
|
|
|
|
static inline __attribute__((always_inline))
|
|
MallocResult malloc(thread const uint& size, device Memory& v_70, constant uint& v_70BufferSize)
|
|
{
|
|
uint _76 = atomic_fetch_add_explicit((device atomic_uint*)&v_70.mem_offset, size, memory_order_relaxed);
|
|
uint offset = _76;
|
|
MallocResult r;
|
|
r.failed = (offset + size) > uint(int((v_70BufferSize - 8) / 4) * 4);
|
|
uint param = offset;
|
|
uint param_1 = size;
|
|
bool param_2 = !r.failed;
|
|
r.alloc = new_alloc(param, param_1, param_2);
|
|
if (r.failed)
|
|
{
|
|
uint _105 = atomic_fetch_max_explicit((device atomic_uint*)&v_70.mem_error, 1u, memory_order_relaxed);
|
|
return r;
|
|
}
|
|
return r;
|
|
}
|
|
|
|
static inline __attribute__((always_inline))
|
|
Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size)
|
|
{
|
|
return Alloc{ a.offset + offset };
|
|
}
|
|
|
|
static inline __attribute__((always_inline))
|
|
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
static inline __attribute__((always_inline))
|
|
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_70, constant uint& v_70BufferSize)
|
|
{
|
|
Alloc param = alloc;
|
|
uint param_1 = offset;
|
|
if (!touch_mem(param, param_1))
|
|
{
|
|
return;
|
|
}
|
|
v_70.memory[offset] = val;
|
|
}
|
|
|
|
static inline __attribute__((always_inline))
|
|
void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_70, constant uint& v_70BufferSize)
|
|
{
|
|
uint ix = ref.offset >> uint(2);
|
|
Alloc param = a;
|
|
uint param_1 = ix + 0u;
|
|
uint param_2 = s.bbox.x | (s.bbox.y << uint(16));
|
|
write_mem(param, param_1, param_2, v_70, v_70BufferSize);
|
|
Alloc param_3 = a;
|
|
uint param_4 = ix + 1u;
|
|
uint param_5 = s.bbox.z | (s.bbox.w << uint(16));
|
|
write_mem(param_3, param_4, param_5, v_70, v_70BufferSize);
|
|
Alloc param_6 = a;
|
|
uint param_7 = ix + 2u;
|
|
uint param_8 = s.tiles.offset;
|
|
write_mem(param_6, param_7, param_8, v_70, v_70BufferSize);
|
|
}
|
|
|
|
kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_70 [[buffer(0)]], const device ConfigBuf& v_181 [[buffer(1)]], const device SceneBuf& _257 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
|
|
{
|
|
threadgroup uint sh_tile_count[256];
|
|
threadgroup MallocResult sh_tile_alloc;
|
|
constant uint& v_70BufferSize = spvBufferSizeConstants[0];
|
|
uint th_ix = gl_LocalInvocationID.x;
|
|
uint element_ix = gl_GlobalInvocationID.x;
|
|
PathRef path_ref = PathRef{ v_181.conf.tile_alloc.offset + (element_ix * 12u) };
|
|
uint drawtag_base = v_181.conf.drawtag_offset >> uint(2);
|
|
uint drawtag = 0u;
|
|
if (element_ix < v_181.conf.n_elements)
|
|
{
|
|
drawtag = _257.scene[drawtag_base + element_ix];
|
|
}
|
|
int x0 = 0;
|
|
int y0 = 0;
|
|
int x1 = 0;
|
|
int y1 = 0;
|
|
if ((drawtag != 0u) && (drawtag != 37u))
|
|
{
|
|
uint param = element_ix;
|
|
float4 bbox = load_draw_bbox(param, v_70, v_70BufferSize, v_181);
|
|
x0 = int(floor(bbox.x * 0.0625));
|
|
y0 = int(floor(bbox.y * 0.0625));
|
|
x1 = int(ceil(bbox.z * 0.0625));
|
|
y1 = int(ceil(bbox.w * 0.0625));
|
|
}
|
|
x0 = clamp(x0, 0, int(v_181.conf.width_in_tiles));
|
|
y0 = clamp(y0, 0, int(v_181.conf.height_in_tiles));
|
|
x1 = clamp(x1, 0, int(v_181.conf.width_in_tiles));
|
|
y1 = clamp(y1, 0, int(v_181.conf.height_in_tiles));
|
|
Path path;
|
|
path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1));
|
|
uint tile_count = uint((x1 - x0) * (y1 - y0));
|
|
sh_tile_count[th_ix] = tile_count;
|
|
uint total_tile_count = tile_count;
|
|
for (uint i = 0u; i < 8u; i++)
|
|
{
|
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
if (th_ix >= (1u << i))
|
|
{
|
|
total_tile_count += sh_tile_count[th_ix - (1u << i)];
|
|
}
|
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
sh_tile_count[th_ix] = total_tile_count;
|
|
}
|
|
if (th_ix == 255u)
|
|
{
|
|
uint param_1 = total_tile_count * 8u;
|
|
MallocResult _392 = malloc(param_1, v_70, v_70BufferSize);
|
|
sh_tile_alloc = _392;
|
|
}
|
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
MallocResult alloc_start = sh_tile_alloc;
|
|
bool _403;
|
|
if (!alloc_start.failed)
|
|
{
|
|
_403 = v_70.mem_error != 0u;
|
|
}
|
|
else
|
|
{
|
|
_403 = alloc_start.failed;
|
|
}
|
|
if (_403)
|
|
{
|
|
return;
|
|
}
|
|
if (element_ix < v_181.conf.n_elements)
|
|
{
|
|
uint _416;
|
|
if (th_ix > 0u)
|
|
{
|
|
_416 = sh_tile_count[th_ix - 1u];
|
|
}
|
|
else
|
|
{
|
|
_416 = 0u;
|
|
}
|
|
uint tile_subix = _416;
|
|
Alloc param_2 = alloc_start.alloc;
|
|
uint param_3 = 8u * tile_subix;
|
|
uint param_4 = 8u * tile_count;
|
|
Alloc tiles_alloc = slice_mem(param_2, param_3, param_4);
|
|
path.tiles = TileRef{ tiles_alloc.offset };
|
|
Alloc param_5;
|
|
param_5.offset = v_181.conf.tile_alloc.offset;
|
|
PathRef param_6 = path_ref;
|
|
Path param_7 = path;
|
|
Path_write(param_5, param_6, param_7, v_70, v_70BufferSize);
|
|
}
|
|
uint total_count = sh_tile_count[255] * 2u;
|
|
uint start_ix = alloc_start.alloc.offset >> uint(2);
|
|
for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u)
|
|
{
|
|
Alloc param_8 = alloc_start.alloc;
|
|
uint param_9 = start_ix + i_1;
|
|
uint param_10 = 0u;
|
|
write_mem(param_8, param_9, param_10, v_70, v_70BufferSize);
|
|
}
|
|
}
|
|
|