vello/piet-gpu/shader/gen/clip_leaf.msl
Raph Levien 3b67a4e7c1 New clip implementation
This PR reworks the clip implementation. The highlight is that clip bounding box accounting is now done on GPU rather than CPU. The clip mask is also rasterized on EndClip rather than BeginClip, which decreases memory traffic needed for the clip stack.

This is a pretty good working state, but not all cleanup has been applied. An important next step is to remove the CPU clip accounting (it is computed and encoded, but that result is not used). Another step is to remove the Annotated structure entirely.

Fixes #88. Also relevant to #119
2022-02-17 17:13:28 -08:00

367 lines
9.9 KiB
Plaintext
Generated

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct Bic
{
uint a;
uint b;
};
struct ClipEl
{
uint parent_ix;
float4 bbox;
};
struct Alloc
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
Alloc bbox_alloc;
Alloc drawmonoid_alloc;
Alloc clip_alloc;
Alloc clip_bic_alloc;
Alloc clip_stack_alloc;
Alloc clip_bbox_alloc;
uint n_trans;
uint n_path;
uint n_clip;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
struct ConfigBuf
{
Config conf;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
static inline __attribute__((always_inline))
Bic load_bic(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
{
uint base = (v_80.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix);
return Bic{ v_96.memory[base], v_96.memory[base + 1u] };
}
static inline __attribute__((always_inline))
Bic bic_combine(thread const Bic& x, thread const Bic& y)
{
uint m = min(x.b, y.a);
return Bic{ (x.a + y.a) - m, (x.b + y.b) - m };
}
static inline __attribute__((always_inline))
ClipEl load_clip_el(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
{
uint base = (v_80.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix);
uint parent_ix = v_96.memory[base];
float x0 = as_type<float>(v_96.memory[base + 1u]);
float y0 = as_type<float>(v_96.memory[base + 2u]);
float x1 = as_type<float>(v_96.memory[base + 3u]);
float y1 = as_type<float>(v_96.memory[base + 4u]);
float4 bbox = float4(x0, y0, x1, y1);
return ClipEl{ parent_ix, bbox };
}
static inline __attribute__((always_inline))
float4 bbox_intersect(thread const float4& a, thread const float4& b)
{
return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw));
}
static inline __attribute__((always_inline))
uint load_path_ix(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96)
{
if (ix < v_80.conf.n_clip)
{
return v_96.memory[(v_80.conf.clip_alloc.offset >> uint(2)) + ix];
}
else
{
return 2147483648u;
}
}
static inline __attribute__((always_inline))
float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_80, device Memory& v_96)
{
uint base = (v_80.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix);
float bbox_l = float(v_96.memory[base]) - 32768.0;
float bbox_t = float(v_96.memory[base + 1u]) - 32768.0;
float bbox_r = float(v_96.memory[base + 2u]) - 32768.0;
float bbox_b = float(v_96.memory[base + 3u]) - 32768.0;
float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
return bbox;
}
static inline __attribute__((always_inline))
uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[510])
{
uint ix = gl_LocalInvocationID.x;
uint j = 0u;
while (j < 8u)
{
uint base = 512u - (2u << (8u - j));
if (((ix >> j) & 1u) != 0u)
{
Bic param = sh_bic[(base + (ix >> j)) - 1u];
Bic param_1 = bic;
Bic test = bic_combine(param, param_1);
if (test.b > 0u)
{
break;
}
bic = test;
ix -= (1u << j);
}
j++;
}
if (ix > 0u)
{
while (j > 0u)
{
j--;
uint base_1 = 512u - (2u << (8u - j));
Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
Bic param_3 = bic;
Bic test_1 = bic_combine(param_2, param_3);
if (test_1.b == 0u)
{
bic = test_1;
ix -= (1u << j);
}
}
}
if (ix > 0u)
{
return ix - 1u;
}
else
{
return 4294967295u - bic.a;
}
}
static inline __attribute__((always_inline))
void store_clip_bbox(thread const uint& ix, thread const float4& bbox, const device ConfigBuf& v_80, device Memory& v_96)
{
uint base = (v_80.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * ix);
v_96.memory[base] = as_type<uint>(bbox.x);
v_96.memory[base + 1u] = as_type<uint>(bbox.y);
v_96.memory[base + 2u] = as_type<uint>(bbox.z);
v_96.memory[base + 3u] = as_type<uint>(bbox.w);
}
kernel void main0(device Memory& v_96 [[buffer(0)]], const device ConfigBuf& v_80 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
threadgroup Bic sh_bic[510];
threadgroup uint sh_stack[256];
threadgroup float4 sh_stack_bbox[256];
threadgroup uint sh_link[256];
threadgroup float4 sh_bbox[256];
uint th = gl_LocalInvocationID.x;
Bic bic = Bic{ 0u, 0u };
if (th < gl_WorkGroupID.x)
{
uint param = th;
bic = load_bic(param, v_80, v_96);
}
sh_bic[th] = bic;
for (uint i = 0u; i < 8u; i++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if ((th + (1u << i)) < 256u)
{
Bic other = sh_bic[th + (1u << i)];
Bic param_1 = bic;
Bic param_2 = other;
bic = bic_combine(param_1, param_2);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sh_bic[th] = bic;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint stack_size = sh_bic[0].b;
uint sp = 255u - th;
uint ix = 0u;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
uint probe = ix + (128u >> i_1);
if (sp < sh_bic[probe].b)
{
ix = probe;
}
}
uint b = sh_bic[ix].b;
float4 bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
if (sp < b)
{
uint param_3 = (((ix * 256u) + b) - sp) - 1u;
ClipEl el = load_clip_el(param_3, v_80, v_96);
sh_stack[th] = el.parent_ix;
bbox = el.bbox;
}
for (uint i_2 = 0u; i_2 < 8u; i_2++)
{
sh_stack_bbox[th] = bbox;
threadgroup_barrier(mem_flags::mem_threadgroup);
if (th >= (1u << i_2))
{
float4 param_4 = sh_stack_bbox[th - (1u << i_2)];
float4 param_5 = bbox;
bbox = bbox_intersect(param_4, param_5);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
sh_stack_bbox[th] = bbox;
uint param_6 = gl_GlobalInvocationID.x;
uint inp = load_path_ix(param_6, v_80, v_96);
bool is_push = int(inp) >= 0;
bic = Bic{ 1u - uint(is_push), uint(is_push) };
sh_bic[th] = bic;
if (is_push)
{
uint param_7 = inp;
bbox = load_path_bbox(param_7, v_80, v_96);
}
else
{
bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
}
uint inbase = 0u;
for (uint i_3 = 0u; i_3 < 7u; i_3++)
{
uint outbase = 512u - (1u << (8u - i_3));
threadgroup_barrier(mem_flags::mem_threadgroup);
if (th < (1u << (7u - i_3)))
{
Bic param_8 = sh_bic[inbase + (th * 2u)];
Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
sh_bic[outbase + th] = bic_combine(param_8, param_9);
}
inbase = outbase;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
bic = Bic{ 0u, 0u };
Bic param_10 = bic;
uint _618 = search_link(param_10, gl_LocalInvocationID, sh_bic);
bic = param_10;
uint link = _618;
sh_link[th] = link;
threadgroup_barrier(mem_flags::mem_threadgroup);
uint grandparent;
if (int(link) >= 0)
{
grandparent = sh_link[link];
}
else
{
grandparent = link - 1u;
}
uint parent;
if (int(link) >= 0)
{
parent = (gl_WorkGroupID.x * 256u) + link;
}
else
{
if (int(link + stack_size) >= 0)
{
parent = sh_stack[256u + link];
}
else
{
parent = 4294967295u;
}
}
for (uint i_4 = 0u; i_4 < 8u; i_4++)
{
if (i_4 != 0u)
{
sh_link[th] = link;
}
sh_bbox[th] = bbox;
threadgroup_barrier(mem_flags::mem_threadgroup);
if (int(link) >= 0)
{
float4 param_11 = sh_bbox[link];
float4 param_12 = bbox;
bbox = bbox_intersect(param_11, param_12);
link = sh_link[link];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
if (int(link + stack_size) >= 0)
{
float4 param_13 = sh_stack_bbox[256u + link];
float4 param_14 = bbox;
bbox = bbox_intersect(param_13, param_14);
}
sh_bbox[th] = bbox;
threadgroup_barrier(mem_flags::mem_threadgroup);
uint path_ix = inp;
bool _717 = !is_push;
bool _725;
if (_717)
{
_725 = gl_GlobalInvocationID.x < v_80.conf.n_clip;
}
else
{
_725 = _717;
}
if (_725)
{
uint param_15 = parent;
path_ix = load_path_ix(param_15, v_80, v_96);
uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * (~inp));
v_96.memory[drawmonoid_out_base] = path_ix;
if (int(grandparent) >= 0)
{
bbox = sh_bbox[grandparent];
}
else
{
if (int(grandparent + stack_size) >= 0)
{
bbox = sh_stack_bbox[256u + grandparent];
}
else
{
bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
}
}
}
uint param_16 = gl_GlobalInvocationID.x;
float4 param_17 = bbox;
store_clip_bbox(param_16, param_17, v_80, v_96);
}