Raph Levien acb3933d94 Variable size encoding of draw objects
This patch switches to a variable size encoding of draw objects.

In addition to the CPU-side scene encoding, it changes the representation of intermediate per-draw-object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying).

Prefix sums, computed in `DrawMonoid`, track the offsets of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style).

This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken.

Closes #152
2022-03-14 16:32:08 -07:00

718 lines
26 KiB

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wmissing-braces"
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
// Fixed-size array wrapper with an indexing operator for each Metal address
// space (thread, device, constant, threadgroup). Indexing is unchecked.
// Presumably emitted by the shader cross-compiler (note the `spv` prefix).
template<typename T, size_t Num>
struct spvUnsafeArray
{
    // Backing storage; a requested length of zero still reserves one element.
    T elements[Num ? Num : 1];

    // thread address space
    thread T& operator [] (size_t pos) thread
    {
        return elements[pos];
    }
    constexpr const thread T& operator [] (size_t pos) const thread
    {
        return elements[pos];
    }

    // device address space
    device T& operator [] (size_t pos) device
    {
        return elements[pos];
    }
    constexpr const device T& operator [] (size_t pos) const device
    {
        return elements[pos];
    }

    // constant address space (read-only access only)
    constexpr const constant T& operator [] (size_t pos) const constant
    {
        return elements[pos];
    }

    // threadgroup address space
    threadgroup T& operator [] (size_t pos) threadgroup
    {
        return elements[pos];
    }
    constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
    {
        return elements[pos];
    }
};
// Handle to a region of the shared Memory buffer.
struct Alloc
{
    // Start of the region; the kernel treats it as a byte offset (refs built
    // from it are shifted right by 2 before indexing 32-bit words).
    uint offset;
};
// Outcome of a malloc() call.
struct MallocResult
{
    // Region handed out by the allocator (filled in even when `failed` is set).
    Alloc alloc;
    // True when the request did not fit in the buffer.
    bool failed;
};
// Reference to a serialized PathCubic record.
struct PathCubicRef
{
    // Byte offset of the record (PathCubic_read shifts it right by 2 to index words).
    uint offset;
};
// Decoded cubic path segment; PathCubic_read fills it from 12 consecutive
// 32-bit words in the order the fields are declared.
struct PathCubic
{
    // Control points (words 0-7, two floats each).
    float2 p0;
    float2 p1;
    float2 p2;
    float2 p3;
    // Index of the owning path (word 8).
    uint path_ix;
    // Word 9; not used by the visible kernel.
    uint trans_ix;
    // Words 10-11; the kernel pads the segment's x/y extent by this amount.
    float2 stroke;
};
// Reference to a serialized path segment (tag word followed by its payload).
struct PathSegRef
{
    // Byte offset of the segment's tag word.
    uint offset;
};
// Unpacked header word of a path segment (see PathSeg_tag).
struct PathSegTag
{
    // Low 16 bits of the header word; the kernel processes tag value 1 as a cubic.
    uint tag;
    // High 16 bits of the header word; bit 0 is read by fill_mode_from_flags.
    uint flags;
};
// Reference to a tile record; Tile_index steps between records in 8-byte units.
struct TileRef
{
    // Byte offset of the tile record.
    uint offset;
};
// Reference to a serialized Path record.
struct PathRef
{
    // Byte offset of the record (Path_read shifts it right by 2 to index words).
    uint offset;
};
// Decoded per-path record (see Path_read).
struct Path
{
    // Bounding box unpacked from two words of 16-bit pairs: (x, y, z, w).
    // The kernel clamps tile coordinates to x..z and y..w.
    uint4 bbox;
    // Reference to the path's first tile record.
    TileRef tiles;
};
// Reference to a serialized TileSeg record.
struct TileSegRef
{
    // Byte offset of the record (TileSeg_write shifts it right by 2 to index words).
    uint offset;
};
// Line segment attached to a tile. The kernel allocates 24 bytes per record
// and TileSeg_write stores the fields in declaration order.
struct TileSeg
{
    // Start point of the segment.
    float2 origin;
    // Displacement from `origin` to the segment's end.
    float2 vector;
    // Set by the kernel per tile; 1e9 is stored when the crossing is rejected.
    float y_edge;
    // Reference to another TileSeg record.
    TileSegRef next;
};
// Result of estimate_subdiv() for one quadratic.
struct SubdivResult
{
    // Subdivision estimate; the kernel sums these over all quadratics.
    float val;
    // approx_parabola_integral() evaluated at the two end parameters.
    float a0;
    float a2;
};
// Layout of the shared read/write buffer bound at buffer(0).
struct Memory
{
    // Allocation cursor; malloc() advances it with an atomic add.
    uint mem_offset;
    // Error flag; malloc() raises it to 1 on failure and the kernel reads it on entry.
    uint mem_error;
    // Start of the word-addressed payload; declared with one element but
    // indexed well past it by read_mem/write_mem.
    uint memory[1];
};
// Same single-field layout as Alloc; this is the type used for the
// allocation entries inside Config.
struct Alloc_1
{
    // Start offset of the region.
    uint offset;
};
// Scene configuration block read through ConfigBuf. The visible kernel only
// consults n_pathseg, pathseg_alloc and tile_alloc; the remaining fields are
// part of the buffer layout and must keep their order.
struct Config
{
    // Counts and tile-grid dimensions.
    uint n_elements;
    uint n_pathseg;          // upper bound checked against the invocation index
    uint width_in_tiles;
    uint height_in_tiles;

    // Regions inside the Memory buffer.
    Alloc_1 tile_alloc;      // base of the per-path records (12-byte stride in the kernel)
    Alloc_1 bin_alloc;
    Alloc_1 ptcl_alloc;
    Alloc_1 pathseg_alloc;   // base of the path segments (52-byte stride in the kernel)
    Alloc_1 anno_alloc;
    Alloc_1 trans_alloc;
    Alloc_1 path_bbox_alloc;
    Alloc_1 drawmonoid_alloc;
    Alloc_1 clip_alloc;
    Alloc_1 clip_bic_alloc;
    Alloc_1 clip_stack_alloc;
    Alloc_1 clip_bbox_alloc;
    Alloc_1 draw_bbox_alloc;
    Alloc_1 drawinfo_alloc;

    // Further counts.
    uint n_trans;
    uint n_path;
    uint n_clip;

    // Stream offsets (presumably into the scene buffer; not used here).
    uint trans_offset;
    uint linewidth_offset;
    uint pathtag_offset;
    uint pathseg_offset;
    uint drawtag_offset;
    uint drawdata_offset;
};
// Buffer wrapper around Config; the kernel receives it at buffer(1).
struct ConfigBuf
{
    Config conf;
};
// Workgroup size constant (32 x 1 x 1). Marked maybe_unused; nothing in the
// visible code reads it.
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(32u, 1u, 1u);
// Access check consulted before every buffer read, write and atomic.
// In this build it ignores both arguments and always grants access.
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
    return true;
}
// Reads one 32-bit word from the shared buffer.
//   alloc  - region the access belongs to (only passed to touch_mem)
//   offset - word index into v_136.memory
// Returns the stored word, or 0 when touch_mem refuses the access.
static inline __attribute__((always_inline))
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_136, constant uint& v_136BufferSize)
{
    const bool accessible = touch_mem(alloc, offset);
    // The buffer is only dereferenced when the access was granted.
    return accessible ? v_136.memory[offset] : 0u;
}
// Loads the header word of the path segment at `ref` and splits it into
// tag (low 16 bits) and flags (high 16 bits).
static inline __attribute__((always_inline))
PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
{
    // Byte offset -> word index.
    const uint word_ix = ref.offset >> 2u;
    const uint packed = read_mem(a, word_ix, v_136, v_136BufferSize);

    PathSegTag result;
    result.tag = packed & 0xffffu;
    result.flags = packed >> 16u;
    return result;
}
// Deserializes a PathCubic from 12 consecutive words starting at `ref`.
// Word layout: p0.xy, p1.xy, p2.xy, p3.xy (floats), path_ix, trans_ix (uints),
// stroke.xy (floats).
static inline __attribute__((always_inline))
PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
{
    // Byte offset -> word index of the first field.
    const uint first_word = ref.offset >> 2u;

    // Fetch the raw words in ascending order.
    uint raw[12];
    for (uint k = 0u; k < 12u; ++k)
    {
        raw[k] = read_mem(a, first_word + k, v_136, v_136BufferSize);
    }

    // Reinterpret the bit patterns; no numeric conversion takes place.
    PathCubic s;
    s.p0 = float2(as_type<float>(raw[0]), as_type<float>(raw[1]));
    s.p1 = float2(as_type<float>(raw[2]), as_type<float>(raw[3]));
    s.p2 = float2(as_type<float>(raw[4]), as_type<float>(raw[5]));
    s.p3 = float2(as_type<float>(raw[6]), as_type<float>(raw[7]));
    s.path_ix = raw[8];
    s.trans_ix = raw[9];
    s.stroke = float2(as_type<float>(raw[10]), as_type<float>(raw[11]));
    return s;
}
// Reads the cubic payload of the path segment at `ref`; the payload begins
// 4 bytes past the segment's tag word.
static inline __attribute__((always_inline))
PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
{
    PathCubicRef payload;
    payload.offset = ref.offset + 4u;
    return PathCubic_read(a, payload, v_136, v_136BufferSize);
}
// Evaluates the cubic Bezier with control points p0..p3 at parameter t:
//   (1-t)^3 p0 + 3(1-t)^2 t p1 + 3(1-t) t^2 p2 + t^3 p3
// The nesting below keeps the original order of arithmetic operations.
static inline __attribute__((always_inline))
float2 eval_cubic(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float2& p3, thread const float& t)
{
    const float mt = 1.0 - t;
    const float mt_sq = mt * mt;

    const float2 tail = (p2 * (mt * 3.0)) + (p3 * t);
    const float2 middle = (p1 * (mt_sq * 3.0)) + (tail * t);
    return (p0 * (mt_sq * mt)) + (middle * t);
}
// Returns x / (0.33 + (0.2015... + 0.25 x^2))^(1/4).
// Going by the name, a closed-form approximation of a parabola integral;
// the constants are kept exactly as given.
static inline __attribute__((always_inline))
float approx_parabola_integral(thread const float& x)
{
    const float inner = 0.201511204242706298828125 + ((0.25 * x) * x);
    const float radicand = 0.3300000131130218505859375 + inner;
    // rsqrt(sqrt(v)) == v^(-1/4)
    return x * rsqrt(sqrt(radicand));
}
// Computes a subdivision estimate for the quadratic with control points
// p0, p1, p2.
//   sqrt_tol - square root of the tolerance (the kernel passes 0.4743...)
// Returns { val, a0, a2 }, where a0/a2 are approx_parabola_integral() at the
// two end parameters x0/x2 and val is the estimate itself. val stays 0 when
// `scale` is not below 1e9, which also covers the NaN that results when
// `_cross` is zero.
//
// NOTE(review): the `else` below was missing from the text under review, which
// left `val = da * sqrt_scale` as a dead store that was always overwritten.
// It is restored here; confirm against the shader this file is generated from.
static inline __attribute__((always_inline))
SubdivResult estimate_subdiv(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& sqrt_tol)
{
    float2 d01 = p1 - p0;
    float2 d12 = p2 - p1;
    float2 dd = d01 - d12;
    float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x);
    float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross;
    float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross;
    float scale = abs(_cross / (length(dd) * (x2 - x0)));
    float param = x0;
    float a0 = approx_parabola_integral(param);
    float param_1 = x2;
    float a2 = approx_parabola_integral(param_1);
    float val = 0.0;
    if (scale < 1000000000.0)
    {
        float da = abs(a2 - a0);
        float sqrt_scale = sqrt(scale);
        if (sign(x0) == sign(x2))
        {
            // Both end parameters lie on the same side of zero.
            val = da * sqrt_scale;
        }
        else
        {
            // End parameters straddle zero.
            float xmin = sqrt_tol / sqrt_scale;
            float param_2 = xmin;
            val = (sqrt_tol * da) / approx_parabola_integral(param_2);
        }
    }
    return SubdivResult{ val, a0, a2 };
}
// Extracts the fill mode (bit 0) from a segment's flags word.
// The kernel treats a result of 1 as "stroke".
static inline __attribute__((always_inline))
uint fill_mode_from_flags(thread const uint& flags)
{
    const uint fill_mode_mask = 1u;
    return flags & fill_mode_mask;
}
// Deserializes a Path from three consecutive words at `ref`:
//   word 0 - bbox.x (low 16 bits) and bbox.y (high 16 bits)
//   word 1 - bbox.z (low 16 bits) and bbox.w (high 16 bits)
//   word 2 - offset of the path's tile records
static inline __attribute__((always_inline))
Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_136, constant uint& v_136BufferSize)
{
    // Byte offset -> word index of the first field.
    const uint first_word = ref.offset >> 2u;

    const uint packed_xy = read_mem(a, first_word + 0u, v_136, v_136BufferSize);
    const uint packed_zw = read_mem(a, first_word + 1u, v_136, v_136BufferSize);
    const uint tiles_offset = read_mem(a, first_word + 2u, v_136, v_136BufferSize);

    Path s;
    s.bbox = uint4(packed_xy & 0xffffu, packed_xy >> 16u, packed_zw & 0xffffu, packed_zw >> 16u);
    s.tiles = TileRef{ tiles_offset };
    return s;
}
// Builds an Alloc handle starting at `offset`.
// `size` and `mem_ok` are accepted but not recorded in this build.
static inline __attribute__((always_inline))
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
{
    return Alloc{ offset };
}
// Returns x * sqrt(0.61 + (0.1521 + 0.25 x^2)).
// Going by the name, the approximate inverse of approx_parabola_integral;
// the constants are kept exactly as given.
static inline __attribute__((always_inline))
float approx_parabola_inv_integral(thread const float& x)
{
    const float inner = 0.1520999968051910400390625 + ((0.25 * x) * x);
    const float radicand = 0.61000001430511474609375 + inner;
    return x * sqrt(radicand);
}
// Evaluates the quadratic Bezier with control points p0..p2 at parameter t:
//   (1-t)^2 p0 + 2(1-t) t p1 + t^2 p2
// The nesting below keeps the original order of arithmetic operations.
static inline __attribute__((always_inline))
float2 eval_quad(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& t)
{
    const float mt = 1.0 - t;
    const float2 tail = (p1 * (mt * 2.0)) + (p2 * t);
    return (p0 * (mt * mt)) + (tail * t);
}
// Allocator over the shared buffer: atomically advances v_136.mem_offset by
// `size` and returns the previous cursor as the start of the new region.
//   size - number of bytes requested
// On overflow the result has failed == true and v_136.mem_error is raised to
// at least 1. The cursor is advanced in either case.
static inline __attribute__((always_inline))
MallocResult malloc(thread const uint& size, device Memory& v_136, constant uint& v_136BufferSize)
{
    const uint offset = atomic_fetch_add_explicit((device atomic_uint*)&v_136.mem_offset, size, memory_order_relaxed);

    // Usable payload: buffer size minus 8 bytes (presumably the two header
    // words ahead of `memory`), rounded down to a multiple of 4.
    const uint usable_bytes = uint(int((v_136BufferSize - 8) / 4) * 4);

    MallocResult r;
    r.failed = (offset + size) > usable_bytes;
    r.alloc = new_alloc(offset, size, !r.failed);
    if (r.failed)
    {
        // Record the failure; the previous flag value is not needed.
        atomic_fetch_max_explicit((device atomic_uint*)&v_136.mem_error, 1u, memory_order_relaxed);
    }
    return r;
}
// Returns a reference to the tile record `index` entries after `ref`;
// tile records are 8 bytes apart.
static inline __attribute__((always_inline))
TileRef Tile_index(thread const TileRef& ref, thread const uint& index)
{
    TileRef result;
    result.offset = ref.offset + (index * 8u);
    return result;
}
// Stores one 32-bit word into the shared buffer.
//   alloc  - region the access belongs to (only passed to touch_mem)
//   offset - word index into v_136.memory
//   val    - word to store
// The store is skipped when touch_mem refuses the access.
//
// NOTE(review): the early `return` was missing from the text under review,
// which made the store execute only when touch_mem *failed*. It is restored
// here to match the guard in read_mem.
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_136, constant uint& v_136BufferSize)
{
    Alloc param = alloc;
    uint param_1 = offset;
    if (!touch_mem(param, param_1))
    {
        return;
    }
    v_136.memory[offset] = val;
}
// Serializes a TileSeg into six consecutive words starting at `ref`:
//   origin.x, origin.y, vector.x, vector.y, y_edge (float bit patterns),
//   next.offset (uint).
//
// NOTE(review): the source expression of the sixth word was missing from the
// text under review (`uint param_17 =;`). `s.next.offset` is the only TileSeg
// field not yet written and completes the 24-byte record the kernel allocates.
static inline __attribute__((always_inline))
void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_136, constant uint& v_136BufferSize)
{
    // Byte offset -> word index of the first field.
    uint ix = ref.offset >> uint(2);

    write_mem(a, ix + 0u, as_type<uint>(s.origin.x), v_136, v_136BufferSize);
    write_mem(a, ix + 1u, as_type<uint>(s.origin.y), v_136, v_136BufferSize);
    write_mem(a, ix + 2u, as_type<uint>(s.vector.x), v_136, v_136BufferSize);
    write_mem(a, ix + 3u, as_type<uint>(s.vector.y), v_136, v_136BufferSize);
    write_mem(a, ix + 4u, as_type<uint>(s.y_edge), v_136, v_136BufferSize);
    write_mem(a, ix + 5u, s.next.offset, v_136, v_136BufferSize);
}
// Kernel entry point: coarse rasterization of path segments into tiles.
//
// Each invocation handles the path segment whose index equals its global x
// coordinate. For a cubic segment (tag 1) it:
//   1. approximates the cubic by up to 16 quadratics and accumulates a
//      subdivision estimate for each one (first loop);
//   2. walks the quadratics again and emits `n` line segments spaced evenly
//      in the accumulated estimate (second loop);
//   3. for every emitted line, allocates 24-byte TileSeg records, links them
//      into the tile records of the owning path and adjusts tile backdrops.
// Tiles are 16 units wide and high (coordinates are scaled by 0.0625).
//
// Bindings: buffer(0) shared Memory, buffer(1) ConfigBuf, buffer(25) table of
// buffer sizes (entry 0 is the size of the Memory buffer).
//
// NOTE(review): the `else` arms, the early `return` after a failed
// allocation, `n_out++`, the `continue`/`break` of the inner `for (;;)`, the
// switch-case `break` and the `tile_seg.next.offset` assignment target were
// missing from the text under review. They are reconstructed here from the
// surrounding data flow (dead stores, an otherwise non-terminating loop, a
// dangling `= old;`); confirm against the shader this file is generated from.
kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_136 [[buffer(0)]], const device ConfigBuf& _710 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
    constant uint& v_136BufferSize = spvBufferSizeConstants[0];
    uint element_ix = gl_GlobalInvocationID.x;
    // Path segments are laid out with a 52-byte stride.
    PathSegRef ref = PathSegRef{ _710.conf.pathseg_alloc.offset + (element_ix * 52u) };
    PathSegTag tag = PathSegTag{ 0u, 0u };
    if (element_ix < _710.conf.n_pathseg)
    {
        Alloc param;
        param.offset = _710.conf.pathseg_alloc.offset;
        PathSegRef param_1 = ref;
        tag = PathSeg_tag(param, param_1, v_136, v_136BufferSize);
    }
    // Snapshot of the error flag on entry.
    bool mem_ok = v_136.mem_error == 0u;
    switch (tag.tag)
    {
        case 1u:
        {
            Alloc param_2;
            param_2.offset = _710.conf.pathseg_alloc.offset;
            PathSegRef param_3 = ref;
            PathCubic cubic = PathSeg_Cubic_read(param_2, param_3, v_136, v_136BufferSize);

            // Pick the number of quadratics (1..16) from the squared length of err_v.
            float2 err_v = (((cubic.p2 - cubic.p1) * 3.0) + cubic.p0) - cubic.p3;
            float err = (err_v.x * err_v.x) + (err_v.y * err_v.y);
            uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875, 0.16666667163372039794921875))), 1u);
            n_quads = min(n_quads, 16u);
            float val = 0.0;
            float2 qp0 = cubic.p0;
            float _step = 1.0 / float(n_quads);
            spvUnsafeArray<SubdivResult, 16> keep_params;

            // Pass 1: per-quadratic subdivision estimates, summed into `val`.
            for (uint i = 0u; i < n_quads; i++)
            {
                float t = float(i + 1u) * _step;
                float2 param_4 = cubic.p0;
                float2 param_5 = cubic.p1;
                float2 param_6 = cubic.p2;
                float2 param_7 = cubic.p3;
                float param_8 = t;
                float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8);
                float2 param_9 = cubic.p0;
                float2 param_10 = cubic.p1;
                float2 param_11 = cubic.p2;
                float2 param_12 = cubic.p3;
                float param_13 = t - (0.5 * _step);
                float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13);
                // Derive the quadratic's middle control point from the cubic's midpoint sample.
                qp1 = (qp1 * 2.0) - ((qp0 + qp2) * 0.5);
                float2 param_14 = qp0;
                float2 param_15 = qp1;
                float2 param_16 = qp2;
                float param_17 = 0.4743416607379913330078125;
                SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17);
                keep_params[i] = params;
                val += params.val;
                qp0 = qp2;
            }

            // Total number of line segments to emit (at least one).
            uint n = max(uint(ceil((val * 0.5) / 0.4743416607379913330078125)), 1u);
            uint param_18 = tag.flags;
            bool is_stroke = fill_mode_from_flags(param_18) == 1u;
            uint path_ix = cubic.path_ix;
            Alloc param_19;
            param_19.offset = _710.conf.tile_alloc.offset;
            // Path records are laid out with a 12-byte stride.
            PathRef param_20 = PathRef{ _710.conf.tile_alloc.offset + (path_ix * 12u) };
            Path path = Path_read(param_19, param_20, v_136, v_136BufferSize);
            uint param_21 = path.tiles.offset;
            uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
            bool param_23 = mem_ok;
            Alloc path_alloc = new_alloc(param_21, param_22, param_23);
            int4 bbox = int4(path.bbox);
            float2 p0 = cubic.p0;
            qp0 = cubic.p0;
            float v_step = val / float(n);
            int n_out = 1;
            float val_sum = 0.0;
            float2 p1;
            float _1147;
            TileSeg tile_seg;

            // Pass 2: emit the line segments and bin them into tiles.
            for (uint i_1 = 0u; i_1 < n_quads; i_1++)
            {
                float t_1 = float(i_1 + 1u) * _step;
                float2 param_24 = cubic.p0;
                float2 param_25 = cubic.p1;
                float2 param_26 = cubic.p2;
                float2 param_27 = cubic.p3;
                float param_28 = t_1;
                float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28);
                float2 param_29 = cubic.p0;
                float2 param_30 = cubic.p1;
                float2 param_31 = cubic.p2;
                float2 param_32 = cubic.p3;
                float param_33 = t_1 - (0.5 * _step);
                float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33);
                qp1_1 = (qp1_1 * 2.0) - ((qp0 + qp2_1) * 0.5);
                SubdivResult params_1 = keep_params[i_1];
                float param_34 = params_1.a0;
                float u0 = approx_parabola_inv_integral(param_34);
                float param_35 = params_1.a2;
                float u2 = approx_parabola_inv_integral(param_35);
                float uscale = 1.0 / (u2 - u0);
                float target = float(n_out) * v_step;
                for (;;)
                {
                    // Lowered form of: (n_out == n) || (target < val_sum + params_1.val)
                    bool _1040 = uint(n_out) == n;
                    bool _1050;
                    if (!_1040)
                    {
                        _1050 = target < (val_sum + params_1.val);
                    }
                    else
                    {
                        _1050 = _1040;
                    }
                    if (_1050)
                    {
                        if (uint(n_out) == n)
                        {
                            // Last segment: end exactly on the cubic's end point.
                            p1 = cubic.p3;
                        }
                        else
                        {
                            float u = (target - val_sum) / params_1.val;
                            float a = mix(params_1.a0, params_1.a2, u);
                            float param_36 = a;
                            float au = approx_parabola_inv_integral(param_36);
                            float t_2 = (au - u0) * uscale;
                            float2 param_37 = qp0;
                            float2 param_38 = qp1_1;
                            float2 param_39 = qp2_1;
                            float param_40 = t_2;
                            p1 = eval_quad(param_37, param_38, param_39, param_40);
                        }

                        // Extent of the line p0 -> p1, padded by the stroke amount.
                        float xmin = fast::min(p0.x, p1.x) - cubic.stroke.x;
                        float xmax = fast::max(p0.x, p1.x) + cubic.stroke.x;
                        float ymin = fast::min(p0.y, p1.y) - cubic.stroke.y;
                        float ymax = fast::max(p0.y, p1.y) + cubic.stroke.y;
                        float dx = p1.x - p0.x;
                        float dy = p1.y - p0.y;
                        // Avoid dividing by a (near-)zero dy.
                        if (abs(dy) < 9.999999717180685365747194737196e-10)
                        {
                            _1147 = 1000000000.0;
                        }
                        else
                        {
                            _1147 = dx / dy;
                        }
                        float invslope = _1147;
                        float c = (cubic.stroke.x + (abs(invslope) * (8.0 + cubic.stroke.y))) * 0.0625;
                        float b = invslope;
                        float a_1 = (p0.x - ((p0.y - 8.0) * b)) * 0.0625;

                        // Tile-space bounds of the line, clamped to the path's bbox.
                        int x0 = int(floor(xmin * 0.0625));
                        int x1 = int(floor(xmax * 0.0625) + 1.0);
                        int y0 = int(floor(ymin * 0.0625));
                        int y1 = int(floor(ymax * 0.0625) + 1.0);
                        x0 = clamp(x0, bbox.x, bbox.z);
                        y0 = clamp(y0, bbox.y, bbox.w);
                        x1 = clamp(x1, bbox.x, bbox.z);
                        y1 = clamp(y1, bbox.y, bbox.w);
                        float xc = a_1 + (b * float(y0));
                        int stride = bbox.z - bbox.x;
                        int base = ((y0 - bbox.y) * stride) - bbox.x;

                        // Reserve one 24-byte TileSeg per tile of the clamped rectangle.
                        uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
                        uint param_41 = n_tile_alloc * 24u;
                        MallocResult _1263 = malloc(param_41, v_136, v_136BufferSize);
                        MallocResult tile_alloc = _1263;
                        if (tile_alloc.failed || (!mem_ok))
                        {
                            return;
                        }
                        uint tile_offset = tile_alloc.alloc.offset;
                        int xray = int(floor(p0.x * 0.0625));
                        int last_xray = int(floor(p1.x * 0.0625));
                        if (p0.y > p1.y)
                        {
                            int tmp = xray;
                            xray = last_xray;
                            last_xray = tmp;
                        }
                        for (int y = y0; y < y1; y++)
                        {
                            float tile_y0 = float(y * 16);
                            int xbackdrop = max((xray + 1), bbox.x);
                            // Lowered form of:
                            //   !is_stroke && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z
                            bool _1319 = !is_stroke;
                            bool _1329;
                            if (_1319)
                            {
                                _1329 = fast::min(p0.y, p1.y) < tile_y0;
                            }
                            else
                            {
                                _1329 = _1319;
                            }
                            bool _1336;
                            if (_1329)
                            {
                                _1336 = xbackdrop < bbox.z;
                            }
                            else
                            {
                                _1336 = _1329;
                            }
                            if (_1336)
                            {
                                // Add +1/-1 to the second word of the tile at xbackdrop.
                                int backdrop = (p1.y < p0.y) ? 1 : (-1);
                                TileRef param_42 = path.tiles;
                                uint param_43 = uint(base + xbackdrop);
                                TileRef tile_ref = Tile_index(param_42, param_43);
                                uint tile_el = tile_ref.offset >> uint(2);
                                Alloc param_44 = path_alloc;
                                uint param_45 = tile_el + 1u;
                                if (touch_mem(param_44, param_45))
                                {
                                    uint _1374 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed);
                                }
                            }
                            int next_xray = last_xray;
                            if (y < (y1 - 1))
                            {
                                float tile_y1 = float((y + 1) * 16);
                                float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
                                next_xray = int(floor(x_edge * 0.0625));
                            }
                            int min_xray = min(xray, next_xray);
                            int max_xray = max(xray, next_xray);
                            int xx0 = min(int(floor(xc - c)), min_xray);
                            int xx1 = max(int(ceil(xc + c)), (max_xray + 1));
                            xx0 = clamp(xx0, x0, x1);
                            xx1 = clamp(xx1, x0, x1);
                            for (int x = xx0; x < xx1; x++)
                            {
                                float tile_x0 = float(x * 16);
                                TileRef param_46 = TileRef{ path.tiles.offset };
                                uint param_47 = uint(base + x);
                                TileRef tile_ref_1 = Tile_index(param_46, param_47);
                                uint tile_el_1 = tile_ref_1.offset >> uint(2);
                                // Swap the new record's offset into the tile's first word;
                                // the previous value becomes this record's `next`.
                                uint old = 0u;
                                Alloc param_48 = path_alloc;
                                uint param_49 = tile_el_1;
                                if (touch_mem(param_48, param_49))
                                {
                                    uint _1477 = atomic_exchange_explicit((device atomic_uint*)&v_136.memory[tile_el_1], tile_offset, memory_order_relaxed);
                                    old = _1477;
                                }
                                tile_seg.origin = p0;
                                tile_seg.vector = p1 - p0;
                                float y_edge = 0.0;
                                if (!is_stroke)
                                {
                                    y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
                                    if (fast::min(p0.x, p1.x) < tile_x0)
                                    {
                                        // Clip the segment at the tile's left edge.
                                        float2 p = float2(tile_x0, y_edge);
                                        if (p0.x > p1.x)
                                        {
                                            tile_seg.vector = p - p0;
                                        }
                                        else
                                        {
                                            tile_seg.origin = p;
                                            tile_seg.vector = p1 - p;
                                        }
                                        // Keep vector.x non-zero, carrying the sign of the x direction.
                                        if (tile_seg.vector.x == 0.0)
                                        {
                                            tile_seg.vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10;
                                        }
                                    }
                                    if ((x <= min_xray) || (max_xray < x))
                                    {
                                        y_edge = 1000000000.0;
                                    }
                                }
                                tile_seg.y_edge = y_edge;
                                tile_seg.next.offset = old;
                                Alloc param_50 = tile_alloc.alloc;
                                TileSegRef param_51 = TileSegRef{ tile_offset };
                                TileSeg param_52 = tile_seg;
                                TileSeg_write(param_50, param_51, param_52, v_136, v_136BufferSize);
                                tile_offset += 24u;
                            }
                            xc += b;
                            base += stride;
                            xray = next_xray;
                        }
                        // Advance to the next output segment.
                        n_out++;
                        target += v_step;
                        p0 = p1;
                        continue;
                    }
                    else
                    {
                        break;
                    }
                }
                val_sum += params_1.val;
                qp0 = qp2_1;
            }
            break;
        }
    }
}