vello/piet-gpu/shader/gen/binning.msl
Raph Levien acb3933d94 Variable size encoding of draw objects
This patch switches to a variable size encoding of draw objects.

In addition to the CPU-side scene encoding, it changes the representation of intermediate per draw object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of "arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying).

Prefix sums, computed in `DrawMonoid` track the offset of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style).

This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken.

Closes #152
2022-03-14 16:32:08 -07:00

348 lines
11 KiB
Plaintext
Generated

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
struct Alloc
{
uint offset;
};
struct MallocResult
{
Alloc alloc;
bool failed;
};
struct BinInstanceRef
{
uint offset;
};
struct BinInstance
{
uint element_ix;
};
struct DrawMonoid
{
uint path_ix;
uint clip_ix;
uint scene_offset;
uint info_offset;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
struct Alloc_1
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
Alloc_1 path_bbox_alloc;
Alloc_1 drawmonoid_alloc;
Alloc_1 clip_alloc;
Alloc_1 clip_bic_alloc;
Alloc_1 clip_stack_alloc;
Alloc_1 clip_bbox_alloc;
Alloc_1 draw_bbox_alloc;
Alloc_1 drawinfo_alloc;
uint n_trans;
uint n_path;
uint n_clip;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
uint drawtag_offset;
uint drawdata_offset;
};
struct ConfigBuf
{
Config conf;
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
static inline __attribute__((always_inline))
DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
{
uint base = (v_156.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * element_ix);
uint path_ix = v_81.memory[base];
uint clip_ix = v_81.memory[base + 1u];
uint scene_offset = v_81.memory[base + 2u];
uint info_offset = v_81.memory[base + 3u];
return DrawMonoid{ path_ix, clip_ix, scene_offset, info_offset };
}
static inline __attribute__((always_inline))
float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
{
uint base = (v_156.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix);
float x0 = as_type<float>(v_81.memory[base]);
float y0 = as_type<float>(v_81.memory[base + 1u]);
float x1 = as_type<float>(v_81.memory[base + 2u]);
float y1 = as_type<float>(v_81.memory[base + 3u]);
float4 bbox = float4(x0, y0, x1, y1);
return bbox;
}
static inline __attribute__((always_inline))
float4 load_path_bbox(thread const uint& path_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
{
uint base = (v_156.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix);
float bbox_l = float(v_81.memory[base]) - 32768.0;
float bbox_t = float(v_81.memory[base + 1u]) - 32768.0;
float bbox_r = float(v_81.memory[base + 2u]) - 32768.0;
float bbox_b = float(v_81.memory[base + 3u]) - 32768.0;
float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
return bbox;
}
static inline __attribute__((always_inline))
float4 bbox_intersect(thread const float4& a, thread const float4& b)
{
return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw));
}
static inline __attribute__((always_inline))
void store_draw_bbox(thread const uint& draw_ix, thread const float4& bbox, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156)
{
uint base = (v_156.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix);
v_81.memory[base] = as_type<uint>(bbox.x);
v_81.memory[base + 1u] = as_type<uint>(bbox.y);
v_81.memory[base + 2u] = as_type<uint>(bbox.z);
v_81.memory[base + 3u] = as_type<uint>(bbox.w);
}
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
Alloc a;
a.offset = offset;
return a;
}
static inline __attribute__((always_inline))
MallocResult malloc(thread const uint& size, device Memory& v_81, constant uint& v_81BufferSize)
{
uint _87 = atomic_fetch_add_explicit((device atomic_uint*)&v_81.mem_offset, size, memory_order_relaxed);
uint offset = _87;
MallocResult r;
r.failed = (offset + size) > uint(int((v_81BufferSize - 8) / 4) * 4);
uint param = offset;
uint param_1 = size;
bool param_2 = !r.failed;
r.alloc = new_alloc(param, param_1, param_2);
if (r.failed)
{
uint _116 = atomic_fetch_max_explicit((device atomic_uint*)&v_81.mem_error, 1u, memory_order_relaxed);
return r;
}
return r;
}
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
return true;
}
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_81, constant uint& v_81BufferSize)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
v_81.memory[offset] = val;
}
static inline __attribute__((always_inline))
void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_81, constant uint& v_81BufferSize)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = s.element_ix;
write_mem(param, param_1, param_2, v_81, v_81BufferSize);
}
kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_81 [[buffer(0)]], const device ConfigBuf& v_156 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
threadgroup uint bitmaps[8][256];
threadgroup short sh_alloc_failed;
threadgroup uint count[8][256];
threadgroup Alloc sh_chunk_alloc[256];
constant uint& v_81BufferSize = spvBufferSizeConstants[0];
uint my_partition = gl_WorkGroupID.x;
for (uint i = 0u; i < 8u; i++)
{
bitmaps[i][gl_LocalInvocationID.x] = 0u;
}
if (gl_LocalInvocationID.x == 0u)
{
sh_alloc_failed = short(false);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x;
int x0 = 0;
int y0 = 0;
int x1 = 0;
int y1 = 0;
if (element_ix < v_156.conf.n_elements)
{
uint param = element_ix;
DrawMonoid draw_monoid = load_draw_monoid(param, v_81, v_81BufferSize, v_156);
uint path_ix = draw_monoid.path_ix;
float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0);
uint clip_ix = draw_monoid.clip_ix;
if (clip_ix > 0u)
{
uint param_1 = clip_ix - 1u;
clip_bbox = load_clip_bbox(param_1, v_81, v_81BufferSize, v_156);
}
uint param_2 = path_ix;
float4 path_bbox = load_path_bbox(param_2, v_81, v_81BufferSize, v_156);
float4 param_3 = path_bbox;
float4 param_4 = clip_bbox;
float4 bbox = bbox_intersect(param_3, param_4);
float4 _417 = bbox;
float4 _419 = bbox;
float2 _421 = fast::max(_417.xy, _419.zw);
bbox.z = _421.x;
bbox.w = _421.y;
uint param_5 = element_ix;
float4 param_6 = bbox;
store_draw_bbox(param_5, param_6, v_81, v_81BufferSize, v_156);
x0 = int(floor(bbox.x * 0.00390625));
y0 = int(floor(bbox.y * 0.00390625));
x1 = int(ceil(bbox.z * 0.00390625));
y1 = int(ceil(bbox.w * 0.00390625));
}
uint width_in_bins = ((v_156.conf.width_in_tiles + 16u) - 1u) / 16u;
uint height_in_bins = ((v_156.conf.height_in_tiles + 16u) - 1u) / 16u;
x0 = clamp(x0, 0, int(width_in_bins));
x1 = clamp(x1, x0, int(width_in_bins));
y0 = clamp(y0, 0, int(height_in_bins));
y1 = clamp(y1, y0, int(height_in_bins));
if (x0 == x1)
{
y1 = y0;
}
int x = x0;
int y = y0;
uint my_slice = gl_LocalInvocationID.x / 32u;
uint my_mask = 1u << (gl_LocalInvocationID.x & 31u);
while (y < y1)
{
uint _523 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed);
x++;
if (x == x1)
{
x = x0;
y++;
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint element_count = 0u;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x])));
count[i_1][gl_LocalInvocationID.x] = element_count;
}
uint param_7 = 0u;
uint param_8 = 0u;
bool param_9 = true;
Alloc chunk_alloc = new_alloc(param_7, param_8, param_9);
if (element_count != 0u)
{
uint param_10 = element_count * 4u;
MallocResult _573 = malloc(param_10, v_81, v_81BufferSize);
MallocResult chunk = _573;
chunk_alloc = chunk.alloc;
sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
if (chunk.failed)
{
sh_alloc_failed = short(true);
}
}
uint out_ix = (v_156.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u);
Alloc param_11;
param_11.offset = v_156.conf.bin_alloc.offset;
uint param_12 = out_ix;
uint param_13 = element_count;
write_mem(param_11, param_12, param_13, v_81, v_81BufferSize);
Alloc param_14;
param_14.offset = v_156.conf.bin_alloc.offset;
uint param_15 = out_ix + 1u;
uint param_16 = chunk_alloc.offset;
write_mem(param_14, param_15, param_16, v_81, v_81BufferSize);
threadgroup_barrier(mem_flags::mem_threadgroup);
bool _630;
if (!bool(sh_alloc_failed))
{
_630 = v_81.mem_error != 0u;
}
else
{
_630 = bool(sh_alloc_failed);
}
if (_630)
{
return;
}
x = x0;
y = y0;
while (y < y1)
{
uint bin_ix = (uint(y) * width_in_bins) + uint(x);
uint out_mask = bitmaps[my_slice][bin_ix];
if ((out_mask & my_mask) != 0u)
{
uint idx = uint(int(popcount(out_mask & (my_mask - 1u))));
if (my_slice > 0u)
{
idx += count[my_slice - 1u][bin_ix];
}
Alloc out_alloc = sh_chunk_alloc[bin_ix];
uint out_offset = out_alloc.offset + (idx * 4u);
Alloc param_17 = out_alloc;
BinInstanceRef param_18 = BinInstanceRef{ out_offset };
BinInstance param_19 = BinInstance{ element_ix };
BinInstance_write(param_17, param_18, param_19, v_81, v_81BufferSize);
}
x++;
if (x == x1)
{
x = x0;
y++;
}
}
}