vello/piet-gpu/shader/gen/binning.msl
Raph Levien 3b67a4e7c1 New clip implementation
This PR reworks the clip implementation. The highlight is that clip bounding box accounting is now done on GPU rather than CPU. The clip mask is also rasterized on EndClip rather than BeginClip, which decreases memory traffic needed for the clip stack.

This is a pretty good working state, but not all cleanup has been applied. An important next step is to remove the CPU clip accounting (it is computed and encoded, but that result is not used). Another step is to remove the Annotated structure entirely.

Fixes #88. Also relevant to #119
2022-02-17 17:13:28 -08:00

391 lines
12 KiB
Plaintext
Generated

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
// Handle to a region inside the shared `Memory` buffer.
// In this file `offset` is treated as a byte offset: callers shift it right
// by 2 before using it to index the uint array `Memory::memory`.
struct Alloc
{
uint offset;
};
// Value returned by `malloc`: the allocation handle plus a flag that is set
// when the requested range does not fit in the buffer.
struct MallocResult
{
Alloc alloc;
bool failed;
};
// Reference to one annotated record in `Memory`. `offset` is shifted right by 2
// before indexing the uint array, i.e. it is used as a byte offset.
struct AnnotatedRef
{
uint offset;
};
// Decoded first word of an annotated record (see `Annotated_tag`):
// `tag` holds the low 16 bits and `flags` the high 16 bits.
struct AnnotatedTag
{
uint tag;
uint flags;
};
// Reference to a `BinInstance` slot in `Memory`; `offset` is used as a byte
// offset (shifted right by 2 in `BinInstance_write`).
struct BinInstanceRef
{
uint offset;
};
// One entry of a bin's output list: the index of an element assigned to the bin.
struct BinInstance
{
uint element_ix;
};
// Per-element pair of indices loaded by `load_draw_monoid` from two consecutive
// words. The kernel uses `path_ix` to look up the path bounding box and
// `clip_ix` (when non-zero, minus one) to look up the clip bounding box.
struct DrawMonoid
{
uint path_ix;
uint clip_ix;
};
// Layout of the read/write buffer bound at buffer(0).
struct Memory
{
// Bump-allocation cursor; `malloc` advances it with an atomic fetch-add.
uint mem_offset;
// Error flag; `malloc` raises it to at least 1 (atomic max) when an allocation fails.
uint mem_error;
// Payload words. Declared with length 1 but indexed with arbitrary word
// indices throughout this file — presumably a stand-in for a runtime-sized array.
uint memory[1];
};
// Same single-field layout as `Alloc`; this is the type used for the
// allocation fields stored inside `Config`.
struct Alloc_1
{
uint offset;
};
// Configuration block read through `ConfigBuf`. Only the fields commented
// below are read by the code in this file; the others are not referenced here.
struct Config
{
// Number of elements; threads whose element index is >= this skip the tag read.
uint n_elements;
uint n_pathseg;
// Target size in tiles; the kernel divides by 16 (rounding up) to get the bin grid size.
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
// Base of the per-bin (count, chunk offset) output pairs written by the kernel.
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
// Base of the annotated records; the kernel steps through them with a 40-byte stride.
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
// Base of the path bounding boxes (6 words per path, see `load_path_bbox`).
Alloc_1 bbox_alloc;
// Base of the draw monoids (2 words per element, see `load_draw_monoid`).
Alloc_1 drawmonoid_alloc;
Alloc_1 clip_alloc;
Alloc_1 clip_bic_alloc;
Alloc_1 clip_stack_alloc;
// Base of the clip bounding boxes (4 float words per clip, see `load_clip_bbox`).
Alloc_1 clip_bbox_alloc;
uint n_trans;
uint n_path;
uint n_clip;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
// Wrapper for the read-only buffer bound at buffer(1) of `main0`; holds the `Config`.
struct ConfigBuf
{
Config conf;
};
// Workgroup size constant (256 x 1 x 1). Nothing in this file reads it, hence
// [[maybe_unused]]; the kernel hard-codes 256 where it needs the group width.
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
// Access gate consulted by read_mem/write_mem before they touch `Memory`.
// This variant performs no check at all: both arguments are ignored and every
// access is reported as permitted (presumably a bounds check that is compiled
// out in this build — confirm against the shader source).
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
    const bool access_ok = true;
    return access_ok;
}
// Reads one uint from `v_94.memory` at word index `offset`.
// Returns 0 when touch_mem refuses the access; otherwise the stored word.
// `v_94BufferSize` is accepted for signature uniformity and is not used here.
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_94, constant uint& v_94BufferSize)
{
    if (touch_mem(alloc, offset))
    {
        return v_94.memory[offset];
    }
    return 0u;
}
// Loads the first word of the annotated record at `ref` and splits it:
// the low 16 bits become `tag`, the high 16 bits become `flags`.
// `ref.offset` is a byte offset, so it is converted to a word index first.
static inline __attribute__((always_inline))
AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_94, constant uint& v_94BufferSize)
{
    const uint word_ix = ref.offset >> 2u;
    const uint packed = read_mem(a, word_ix, v_94, v_94BufferSize);

    AnnotatedTag decoded;
    decoded.tag = packed & 0xFFFFu;
    decoded.flags = packed >> 16u;
    return decoded;
}
// Fetches the draw monoid for `element_ix`. Entries occupy two consecutive
// words each, starting at `conf.drawmonoid_alloc`: path_ix, then clip_ix.
static inline __attribute__((always_inline))
DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
{
    const uint table_start = v_202.conf.drawmonoid_alloc.offset >> 2u;
    const uint entry = table_start + (2u * element_ix);

    DrawMonoid monoid;
    monoid.path_ix = v_94.memory[entry];
    monoid.clip_ix = v_94.memory[entry + 1u];
    return monoid;
}
// Fetches the bounding box of clip `clip_ix` as (x0, y0, x1, y1).
// Each box is four consecutive words starting at `conf.clip_bbox_alloc`;
// the words are reinterpreted bit-for-bit as floats.
static inline __attribute__((always_inline))
float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
{
    const uint table_start = v_202.conf.clip_bbox_alloc.offset >> 2u;
    const uint entry = table_start + (4u * clip_ix);

    const float min_x = as_type<float>(v_94.memory[entry]);
    const float min_y = as_type<float>(v_94.memory[entry + 1u]);
    const float max_x = as_type<float>(v_94.memory[entry + 2u]);
    const float max_y = as_type<float>(v_94.memory[entry + 3u]);
    return float4(min_x, min_y, max_x, max_y);
}
// Fetches the bounding box of path `path_ix` as (left, top, right, bottom).
// Entries are spaced six words apart starting at `conf.bbox_alloc`; only the
// first four words are read here. Each is stored as an unsigned integer and
// has 32768 subtracted after conversion to float.
static inline __attribute__((always_inline))
float4 load_path_bbox(thread const uint& path_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202)
{
    const uint table_start = v_202.conf.bbox_alloc.offset >> 2u;
    const uint entry = table_start + (6u * path_ix);

    const float left = float(v_94.memory[entry]) - 32768.0;
    const float top = float(v_94.memory[entry + 1u]) - 32768.0;
    const float right = float(v_94.memory[entry + 2u]) - 32768.0;
    const float bottom = float(v_94.memory[entry + 3u]) - 32768.0;
    return float4(left, top, right, bottom);
}
// Intersection of two boxes laid out as (x0, y0, x1, y1): the larger of the
// two minimum corners and the smaller of the two maximum corners. The result
// may be inverted (max < min) when the boxes do not overlap.
static inline __attribute__((always_inline))
float4 bbox_intersect(thread const float4& a, thread const float4& b)
{
    const float2 min_corner = fast::max(a.xy, b.xy);
    const float2 max_corner = fast::min(a.zw, b.zw);
    return float4(min_corner, max_corner);
}
// Writes `bbox` into the annotated record at `ref`, as raw float bits in the
// four words that follow the record's first word (word 0 is left untouched).
static inline __attribute__((always_inline))
void store_path_bbox(thread const AnnotatedRef& ref, thread const float4& bbox, device Memory& v_94, constant uint& v_94BufferSize)
{
    const uint record = ref.offset >> 2u;
    v_94.memory[record + 1u] = as_type<uint>(bbox.x);
    v_94.memory[record + 2u] = as_type<uint>(bbox.y);
    v_94.memory[record + 3u] = as_type<uint>(bbox.z);
    v_94.memory[record + 4u] = as_type<uint>(bbox.w);
}
// Builds an `Alloc` that starts at `offset`. `size` and `mem_ok` are part of
// the signature but are not recorded in this variant.
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
    return Alloc{ offset };
}
// Bump allocator over the `Memory` buffer.
// Atomically advances `mem_offset` by `size` and returns the previous cursor
// as the allocation start. The request fails when start + size exceeds the
// capacity derived from the buffer size (buffer bytes minus the 8 bytes taken
// off for the two header words, rounded down to a multiple of 4); on failure
// `mem_error` is raised to at least 1. The cursor is advanced even on failure.
static inline __attribute__((always_inline))
MallocResult malloc(thread const uint& size, device Memory& v_94, constant uint& v_94BufferSize)
{
    const uint start = atomic_fetch_add_explicit((device atomic_uint*)&v_94.mem_offset, size, memory_order_relaxed);
    const uint capacity = uint(int((v_94BufferSize - 8) / 4) * 4);

    MallocResult result;
    result.failed = (start + size) > capacity;

    const uint requested = size;
    const bool fits = !result.failed;
    result.alloc = new_alloc(start, requested, fits);

    if (result.failed)
    {
        // Publish the failure so other workgroups can observe it.
        atomic_fetch_max_explicit((device atomic_uint*)&v_94.mem_error, 1u, memory_order_relaxed);
    }
    return result;
}
// Stores `val` into `v_94.memory` at word index `offset`, unless touch_mem
// refuses the access, in which case nothing is written.
// `v_94BufferSize` is accepted for signature uniformity and is not used here.
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_94, constant uint& v_94BufferSize)
{
    if (touch_mem(alloc, offset))
    {
        v_94.memory[offset] = val;
    }
}
// Serializes `s` at `ref`: its single field, element_ix, goes into the first
// word of the slot. `ref.offset` is a byte offset and is converted to a word
// index before the store.
static inline __attribute__((always_inline))
void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_94, constant uint& v_94BufferSize)
{
    const uint slot_base = ref.offset >> 2u;
    const uint field_ix = slot_base + 0u;
    const uint element = s.element_ix;
    write_mem(a, field_ix, element, v_94, v_94BufferSize);
}
// Binning kernel entry point.
//
// Each threadgroup handles a partition of 256 consecutive elements (one per
// thread). The kernel runs in three barrier-separated phases:
//   1. Per element: compute the clipped bounding box, convert it to bin
//      coordinates and set this element's bit in the bitmap of every bin it covers.
//   2. Per bin (thread index reused as bin index): count the elements in the bin,
//      allocate an output chunk and write the (count, chunk offset) pair.
//   3. Per element again: write the element index into the chunk of each
//      covered bin, at the position given by its rank within that bin.
//
// Buffers: buffer(0) is the read/write `Memory`, buffer(1) the read-only
// config, buffer(25) supplies the byte size of buffer(0) in its first word.
kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_94 [[buffer(0)]], const device ConfigBuf& v_202 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
// bitmaps[slice][bin]: bit b of slice s is set when element (s * 32 + b) of
// this partition covers `bin`. 8 slices x 32 bits = 256 elements.
threadgroup uint bitmaps[8][256];
// Set when any thread of the group fails to allocate its chunk.
threadgroup short sh_alloc_failed;
// count[slice][bin]: inclusive running total of set bits over slices 0..slice.
threadgroup uint count[8][256];
// Output chunk allocated for each bin (only written for non-empty bins).
threadgroup Alloc sh_chunk_alloc[256];
constant uint& v_94BufferSize = spvBufferSizeConstants[0];
uint my_n_elements = v_202.conf.n_elements;
uint my_partition = gl_WorkGroupID.x;
// Each thread clears its own column of the bitmaps.
for (uint i = 0u; i < 8u; i++)
{
bitmaps[i][gl_LocalInvocationID.x] = 0u;
}
if (gl_LocalInvocationID.x == 0u)
{
sh_alloc_failed = short(false);
}
// Make the cleared shared state visible before any thread sets bits.
threadgroup_barrier(mem_flags::mem_threadgroup);
// Phase 1: this thread's element and its annotated record (40-byte stride).
uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
AnnotatedRef ref = AnnotatedRef{ v_202.conf.anno_alloc.offset + (element_ix * 40u) };
// Threads past the end of the element list keep tag 0 and an empty bin range.
uint tag = 0u;
if (element_ix < my_n_elements)
{
Alloc param;
param.offset = v_202.conf.anno_alloc.offset;
AnnotatedRef param_1 = ref;
tag = Annotated_tag(param, param_1, v_94, v_94BufferSize).tag;
}
// Bin-coordinate bounds of the element; stay (0,0,0,0) for unhandled tags.
int x0 = 0;
int y0 = 0;
int x1 = 0;
int y1 = 0;
// Only tags 1 through 5 produce a bounding box.
switch (tag)
{
case 1u:
case 2u:
case 3u:
case 4u:
case 5u:
{
uint param_2 = element_ix;
DrawMonoid draw_monoid = load_draw_monoid(param_2, v_94, v_94BufferSize, v_202);
uint path_ix = draw_monoid.path_ix;
// Default clip box is effectively unbounded; used when clip_ix is 0.
float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
uint clip_ix = draw_monoid.clip_ix;
if (clip_ix > 0u)
{
// clip_ix is 1-based here: entry clip_ix - 1 holds the box.
uint param_3 = clip_ix - 1u;
clip_bbox = load_clip_bbox(param_3, v_94, v_94BufferSize, v_202);
}
uint param_4 = path_ix;
float4 path_bbox = load_path_bbox(param_4, v_94, v_94BufferSize, v_202);
float4 param_5 = path_bbox;
float4 param_6 = clip_bbox;
float4 bbox = bbox_intersect(param_5, param_6);
// Force max corner >= min corner so a non-overlapping intersection
// collapses to an empty box instead of an inverted one.
float4 _473 = bbox;
float4 _475 = bbox;
float2 _477 = fast::max(_473.xy, _475.zw);
bbox.z = _477.x;
bbox.w = _477.y;
// Persist the clipped box back into the annotated record.
AnnotatedRef param_7 = ref;
float4 param_8 = bbox;
store_path_bbox(param_7, param_8, v_94, v_94BufferSize);
// Scale by 1/256 to get bin coordinates (presumably 256 pixels per bin
// edge — confirm); round the min corner down and the max corner up.
x0 = int(floor(bbox.x * 0.00390625));
y0 = int(floor(bbox.y * 0.00390625));
x1 = int(ceil(bbox.z * 0.00390625));
y1 = int(ceil(bbox.w * 0.00390625));
break;
}
}
// Bin grid size: 16 tiles per bin along each axis, rounded up.
uint width_in_bins = ((v_202.conf.width_in_tiles + 16u) - 1u) / 16u;
uint height_in_bins = ((v_202.conf.height_in_tiles + 16u) - 1u) / 16u;
// Clamp to the grid; the upper bounds are also clamped to be >= the lower bounds.
x0 = clamp(x0, 0, int(width_in_bins));
x1 = clamp(x1, x0, int(width_in_bins));
y0 = clamp(y0, 0, int(height_in_bins));
y1 = clamp(y1, y0, int(height_in_bins));
// A zero-width range must also be zero-height: the loops below only advance
// y when x reaches x1, which would never happen if x started at x1.
if (x0 == x1)
{
y1 = y0;
}
int x = x0;
int y = y0;
// Which 32-bit slice and which bit within it represent this thread's element.
uint my_slice = gl_LocalInvocationID.x / 32u;
uint my_mask = 1u << (gl_LocalInvocationID.x & 31u);
// Row-major walk over the covered bins, marking this element in each.
while (y < y1)
{
uint _581 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
x++;
if (x == x1)
{
x = x0;
y++;
}
}
// All bits must be set before the per-bin counting starts.
threadgroup_barrier(mem_flags::mem_threadgroup);
// Phase 2: the thread index is now used as a bin index. Accumulate the
// population count across slices, recording the inclusive running total.
uint element_count = 0u;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x])));
count[i_1][gl_LocalInvocationID.x] = element_count;
}
// Empty bins keep a zero-offset placeholder allocation.
uint param_9 = 0u;
uint param_10 = 0u;
bool param_11 = true;
Alloc chunk_alloc = new_alloc(param_9, param_10, param_11);
if (element_count != 0u)
{
// One 4-byte BinInstance per element in the bin.
uint param_12 = element_count * 4u;
MallocResult _631 = malloc(param_12, v_94, v_94BufferSize);
MallocResult chunk = _631;
chunk_alloc = chunk.alloc;
sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
if (chunk.failed)
{
sh_alloc_failed = short(true);
}
}
// Bin header: two words per (partition, bin) pair — element count, then chunk offset.
uint out_ix = (v_202.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
Alloc param_13;
param_13.offset = v_202.conf.bin_alloc.offset;
uint param_14 = out_ix;
uint param_15 = element_count;
write_mem(param_13, param_14, param_15, v_94, v_94BufferSize);
Alloc param_16;
param_16.offset = v_202.conf.bin_alloc.offset;
uint param_17 = out_ix + 1u;
uint param_18 = chunk_alloc.offset;
write_mem(param_16, param_17, param_18, v_94, v_94BufferSize);
// Chunk allocations and the failure flag must be visible before phase 3.
threadgroup_barrier(mem_flags::mem_threadgroup);
// Bail out if this group failed to allocate, or if the global error flag is
// already set (it is only consulted when the local flag is clear).
bool _687;
if (!bool(sh_alloc_failed))
{
_687 = v_94.mem_error != 0u;
}
else
{
_687 = bool(sh_alloc_failed);
}
if (_687)
{
return;
}
// Phase 3: back to per-element work. Revisit the same bins as in phase 1.
x = x0;
y = y0;
while (y < y1)
{
uint bin_ix = (uint(y) * width_in_bins) + uint(x);
uint out_mask = bitmaps[my_slice][bin_ix];
if ((out_mask & my_mask) != 0u)
{
// Rank of this element within the bin: set bits below ours in our own
// slice, plus the running total of all lower slices.
uint idx = uint(int(popcount(out_mask & (my_mask - 1u))));
if (my_slice > 0u)
{
idx += count[my_slice - 1u][bin_ix];
}
Alloc out_alloc = sh_chunk_alloc[bin_ix];
uint out_offset = out_alloc.offset + (idx * 4u);
Alloc param_19 = out_alloc;
BinInstanceRef param_20 = BinInstanceRef{ out_offset };
BinInstance param_21 = BinInstance{ element_ix };
BinInstance_write(param_19, param_20, param_21, v_94, v_94BufferSize);
}
x++;
if (x == x1)
{
x = x0;
y++;
}
}
}