vello/piet-gpu/shader/gen/draw_leaf.msl
Raph Levien acb3933d94 Variable size encoding of draw objects
This patch switches to a variable size encoding of draw objects.

In addition to the CPU-side scene encoding, it changes the representation of intermediate per draw object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of "arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying).

Prefix sums, computed in `DrawMonoid` track the offset of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style).

This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken.

Closes #152
2022-03-14 16:32:08 -07:00

287 lines
8.9 KiB
Plaintext
Generated

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wmissing-braces"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
template<typename T, size_t Num>
struct spvUnsafeArray
{
T elements[Num ? Num : 1];
thread T& operator [] (size_t pos) thread
{
return elements[pos];
}
constexpr const thread T& operator [] (size_t pos) const thread
{
return elements[pos];
}
device T& operator [] (size_t pos) device
{
return elements[pos];
}
constexpr const device T& operator [] (size_t pos) const device
{
return elements[pos];
}
constexpr const constant T& operator [] (size_t pos) const constant
{
return elements[pos];
}
threadgroup T& operator [] (size_t pos) threadgroup
{
return elements[pos];
}
constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
{
return elements[pos];
}
};
struct DrawMonoid
{
uint path_ix;
uint clip_ix;
uint scene_offset;
uint info_offset;
};
struct Alloc
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
Alloc path_bbox_alloc;
Alloc drawmonoid_alloc;
Alloc clip_alloc;
Alloc clip_bic_alloc;
Alloc clip_stack_alloc;
Alloc clip_bbox_alloc;
Alloc draw_bbox_alloc;
Alloc drawinfo_alloc;
uint n_trans;
uint n_path;
uint n_clip;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
uint drawtag_offset;
uint drawdata_offset;
};
struct ConfigBuf
{
Config conf;
};
struct SceneBuf
{
uint scene[1];
};
struct DrawMonoid_1
{
uint path_ix;
uint clip_ix;
uint scene_offset;
uint info_offset;
};
struct ParentBuf
{
DrawMonoid_1 parent[1];
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
static inline __attribute__((always_inline))
DrawMonoid map_tag(thread const uint& tag_word)
{
uint has_path = uint(tag_word != 0u);
return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 28u };
}
static inline __attribute__((always_inline))
DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b)
{
DrawMonoid c;
c.path_ix = a.path_ix + b.path_ix;
c.clip_ix = a.clip_ix + b.clip_ix;
c.scene_offset = a.scene_offset + b.scene_offset;
c.info_offset = a.info_offset + b.info_offset;
return c;
}
static inline __attribute__((always_inline))
DrawMonoid draw_monoid_identity()
{
return DrawMonoid{ 0u, 0u, 0u, 0u };
}
kernel void main0(device Memory& _284 [[buffer(0)]], const device ConfigBuf& _92 [[buffer(1)]], const device SceneBuf& _102 [[buffer(2)]], const device ParentBuf& _202 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
{
threadgroup DrawMonoid sh_scratch[256];
uint ix = gl_GlobalInvocationID.x * 8u;
uint drawtag_base = _92.conf.drawtag_offset >> uint(2);
uint tag_word = _102.scene[drawtag_base + ix];
uint param = tag_word;
DrawMonoid agg = map_tag(param);
spvUnsafeArray<DrawMonoid, 8> local;
local[0] = agg;
for (uint i = 1u; i < 8u; i++)
{
tag_word = _102.scene[(drawtag_base + ix) + i];
uint param_1 = tag_word;
DrawMonoid param_2 = agg;
DrawMonoid param_3 = map_tag(param_1);
agg = combine_draw_monoid(param_2, param_3);
local[i] = agg;
}
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if (gl_LocalInvocationID.x >= (1u << i_1))
{
DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
DrawMonoid param_4 = other;
DrawMonoid param_5 = agg;
agg = combine_draw_monoid(param_4, param_5);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sh_scratch[gl_LocalInvocationID.x] = agg;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
DrawMonoid row = draw_monoid_identity();
if (gl_WorkGroupID.x > 0u)
{
uint _205 = gl_WorkGroupID.x - 1u;
row.path_ix = _202.parent[_205].path_ix;
row.clip_ix = _202.parent[_205].clip_ix;
row.scene_offset = _202.parent[_205].scene_offset;
row.info_offset = _202.parent[_205].info_offset;
}
if (gl_LocalInvocationID.x > 0u)
{
DrawMonoid param_6 = row;
DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u];
row = combine_draw_monoid(param_6, param_7);
}
uint drawdata_base = _92.conf.drawdata_offset >> uint(2);
uint drawinfo_base = _92.conf.drawinfo_alloc.offset >> uint(2);
uint out_ix = gl_GlobalInvocationID.x * 8u;
uint out_base = (_92.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u);
uint clip_out_base = _92.conf.clip_alloc.offset >> uint(2);
float4 mat;
float2 translate;
for (uint i_2 = 0u; i_2 < 8u; i_2++)
{
DrawMonoid m = row;
if (i_2 > 0u)
{
DrawMonoid param_8 = m;
DrawMonoid param_9 = local[i_2 - 1u];
m = combine_draw_monoid(param_8, param_9);
}
_284.memory[out_base + (i_2 * 4u)] = m.path_ix;
_284.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix;
_284.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset;
_284.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset;
uint dd = drawdata_base + (m.scene_offset >> uint(2));
uint di = drawinfo_base + (m.info_offset >> uint(2));
tag_word = _102.scene[(drawtag_base + ix) + i_2];
if ((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 72u)) || (tag_word == 5u))
{
uint bbox_offset = (_92.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix);
float bbox_l = float(_284.memory[bbox_offset]) - 32768.0;
float bbox_t = float(_284.memory[bbox_offset + 1u]) - 32768.0;
float bbox_r = float(_284.memory[bbox_offset + 2u]) - 32768.0;
float bbox_b = float(_284.memory[bbox_offset + 3u]) - 32768.0;
float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b);
float linewidth = as_type<float>(_284.memory[bbox_offset + 4u]);
uint fill_mode = uint(linewidth >= 0.0);
if ((linewidth >= 0.0) || (tag_word == 276u))
{
uint trans_ix = _284.memory[bbox_offset + 5u];
uint t = (_92.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix);
mat = as_type<float4>(uint4(_284.memory[t], _284.memory[t + 1u], _284.memory[t + 2u], _284.memory[t + 3u]));
if (tag_word == 276u)
{
translate = as_type<float2>(uint2(_284.memory[t + 4u], _284.memory[t + 5u]));
}
}
if (linewidth >= 0.0)
{
linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z)));
}
switch (tag_word)
{
case 68u:
case 72u:
{
_284.memory[di] = as_type<uint>(linewidth);
break;
}
case 276u:
{
_284.memory[di] = as_type<uint>(linewidth);
uint index = _102.scene[dd];
float2 p0 = as_type<float2>(uint2(_102.scene[dd + 1u], _102.scene[dd + 2u]));
float2 p1 = as_type<float2>(uint2(_102.scene[dd + 3u], _102.scene[dd + 4u]));
p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate;
p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate;
float2 dxy = p1 - p0;
float scale = 1.0 / ((dxy.x * dxy.x) + (dxy.y * dxy.y));
float line_x = dxy.x * scale;
float line_y = dxy.y * scale;
float line_c = -((p0.x * line_x) + (p0.y * line_y));
_284.memory[di + 1u] = as_type<uint>(line_x);
_284.memory[di + 2u] = as_type<uint>(line_y);
_284.memory[di + 3u] = as_type<uint>(line_c);
break;
}
case 5u:
{
break;
}
}
}
if ((tag_word == 5u) || (tag_word == 37u))
{
uint path_ix = ~(out_ix + i_2);
if (tag_word == 5u)
{
path_ix = m.path_ix;
}
_284.memory[clip_out_base + m.clip_ix] = path_ix;
}
}
}