vello/piet-gpu/shader/gen/backdrop.msl
Raph Levien 3b67a4e7c1 New clip implementation
This PR reworks the clip implementation. The highlight is that clip bounding box accounting is now done on GPU rather than CPU. The clip mask is also rasterized on EndClip rather than BeginClip, which decreases memory traffic needed for the clip stack.

This is a pretty good working state, but not all cleanup has been applied. An important next step is to remove the CPU clip accounting (it is computed and encoded, but that result is not used). Another step is to remove the Annotated structure entirely.

Fixes #88. Also relevant to #119
2022-02-17 17:13:28 -08:00

290 lines
7.7 KiB
Plaintext
Generated

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct Alloc
{
uint offset;
};
struct AnnotatedRef
{
uint offset;
};
struct AnnotatedTag
{
uint tag;
uint flags;
};
struct PathRef
{
uint offset;
};
struct TileRef
{
uint offset;
};
struct Path
{
uint4 bbox;
TileRef tiles;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
struct Alloc_1
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
Alloc_1 bbox_alloc;
Alloc_1 drawmonoid_alloc;
Alloc_1 clip_alloc;
Alloc_1 clip_bic_alloc;
Alloc_1 clip_stack_alloc;
Alloc_1 clip_bbox_alloc;
uint n_trans;
uint n_path;
uint n_clip;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
struct ConfigBuf
{
Config conf;
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
return true;
}
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_79)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return 0u;
}
uint v = v_79.memory[offset];
return v;
}
static inline __attribute__((always_inline))
AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_79)
{
Alloc param = a;
uint param_1 = ref.offset >> uint(2);
uint tag_and_flags = read_mem(param, param_1, v_79);
return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) };
}
static inline __attribute__((always_inline))
uint fill_mode_from_flags(thread const uint& flags)
{
return flags & 1u;
}
static inline __attribute__((always_inline))
Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_79)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint raw0 = read_mem(param, param_1, v_79);
Alloc param_2 = a;
uint param_3 = ix + 1u;
uint raw1 = read_mem(param_2, param_3, v_79);
Alloc param_4 = a;
uint param_5 = ix + 2u;
uint raw2 = read_mem(param_4, param_5, v_79);
Path s;
s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
s.tiles = TileRef{ raw2 };
return s;
}
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_79)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
v_79.memory[offset] = val;
}
kernel void main0(device Memory& v_79 [[buffer(0)]], const device ConfigBuf& _186 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
threadgroup uint sh_row_width[256];
threadgroup Alloc sh_row_alloc[256];
threadgroup uint sh_row_count[256];
uint th_ix = gl_LocalInvocationIndex;
uint element_ix = gl_GlobalInvocationID.x;
AnnotatedRef ref = AnnotatedRef{ _186.conf.anno_alloc.offset + (element_ix * 40u) };
uint row_count = 0u;
bool mem_ok = v_79.mem_error == 0u;
if (gl_LocalInvocationID.y == 0u)
{
if (element_ix < _186.conf.n_elements)
{
Alloc param;
param.offset = _186.conf.anno_alloc.offset;
AnnotatedRef param_1 = ref;
AnnotatedTag tag = Annotated_tag(param, param_1, v_79);
switch (tag.tag)
{
case 3u:
case 2u:
case 4u:
case 1u:
{
uint param_2 = tag.flags;
if (fill_mode_from_flags(param_2) != 0u)
{
break;
}
PathRef path_ref = PathRef{ _186.conf.tile_alloc.offset + (element_ix * 12u) };
Alloc param_3;
param_3.offset = _186.conf.tile_alloc.offset;
PathRef param_4 = path_ref;
Path path = Path_read(param_3, param_4, v_79);
sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
row_count = path.bbox.w - path.bbox.y;
bool _272 = row_count == 1u;
bool _278;
if (_272)
{
_278 = path.bbox.y > 0u;
}
else
{
_278 = _272;
}
if (_278)
{
row_count = 0u;
}
uint param_5 = path.tiles.offset;
uint param_6 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
bool param_7 = mem_ok;
Alloc path_alloc = new_alloc(param_5, param_6, param_7);
sh_row_alloc[th_ix] = path_alloc;
break;
}
}
}
sh_row_count[th_ix] = row_count;
}
for (uint i = 0u; i < 8u; i++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
bool _325 = gl_LocalInvocationID.y == 0u;
bool _332;
if (_325)
{
_332 = th_ix >= (1u << i);
}
else
{
_332 = _325;
}
if (_332)
{
row_count += sh_row_count[th_ix - (1u << i)];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (gl_LocalInvocationID.y == 0u)
{
sh_row_count[th_ix] = row_count;
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint total_rows = sh_row_count[255];
uint _411;
for (uint row = th_ix; row < total_rows; row += 256u)
{
uint el_ix = 0u;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
uint probe = el_ix + (128u >> i_1);
if (row >= sh_row_count[probe - 1u])
{
el_ix = probe;
}
}
uint width = sh_row_width[el_ix];
if ((width > 0u) && mem_ok)
{
Alloc tiles_alloc = sh_row_alloc[el_ix];
if (el_ix > 0u)
{
_411 = sh_row_count[el_ix - 1u];
}
else
{
_411 = 0u;
}
uint seq_ix = row - _411;
uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
Alloc param_8 = tiles_alloc;
uint param_9 = tile_el_ix;
uint sum = read_mem(param_8, param_9, v_79);
for (uint x = 1u; x < width; x++)
{
tile_el_ix += 2u;
Alloc param_10 = tiles_alloc;
uint param_11 = tile_el_ix;
sum += read_mem(param_10, param_11, v_79);
Alloc param_12 = tiles_alloc;
uint param_13 = tile_el_ix;
uint param_14 = sum;
write_mem(param_12, param_13, param_14, v_79);
}
}
}
}