mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-25 18:56:35 +11:00
acb3933d94
This patch switches to a variable size encoding of draw objects. In addition to the CPU-side scene encoding, it changes the representation of intermediate per draw object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of "arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying). Prefix sums, computed in `DrawMonoid` track the offset of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style). This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken. Closes #152
247 lines
6.4 KiB
Text
Generated
247 lines
6.4 KiB
Text
Generated
#pragma clang diagnostic ignored "-Wmissing-prototypes"
|
|
|
|
#include <metal_stdlib>
|
|
#include <simd/simd.h>
|
|
|
|
using namespace metal;
|
|
|
|
struct Alloc
|
|
{
|
|
uint offset;
|
|
};
|
|
|
|
struct PathRef
|
|
{
|
|
uint offset;
|
|
};
|
|
|
|
struct TileRef
|
|
{
|
|
uint offset;
|
|
};
|
|
|
|
struct Path
|
|
{
|
|
uint4 bbox;
|
|
TileRef tiles;
|
|
};
|
|
|
|
struct Memory
|
|
{
|
|
uint mem_offset;
|
|
uint mem_error;
|
|
uint memory[1];
|
|
};
|
|
|
|
struct Alloc_1
|
|
{
|
|
uint offset;
|
|
};
|
|
|
|
struct Config
|
|
{
|
|
uint n_elements;
|
|
uint n_pathseg;
|
|
uint width_in_tiles;
|
|
uint height_in_tiles;
|
|
Alloc_1 tile_alloc;
|
|
Alloc_1 bin_alloc;
|
|
Alloc_1 ptcl_alloc;
|
|
Alloc_1 pathseg_alloc;
|
|
Alloc_1 anno_alloc;
|
|
Alloc_1 trans_alloc;
|
|
Alloc_1 path_bbox_alloc;
|
|
Alloc_1 drawmonoid_alloc;
|
|
Alloc_1 clip_alloc;
|
|
Alloc_1 clip_bic_alloc;
|
|
Alloc_1 clip_stack_alloc;
|
|
Alloc_1 clip_bbox_alloc;
|
|
Alloc_1 draw_bbox_alloc;
|
|
Alloc_1 drawinfo_alloc;
|
|
uint n_trans;
|
|
uint n_path;
|
|
uint n_clip;
|
|
uint trans_offset;
|
|
uint linewidth_offset;
|
|
uint pathtag_offset;
|
|
uint pathseg_offset;
|
|
uint drawtag_offset;
|
|
uint drawdata_offset;
|
|
};
|
|
|
|
struct ConfigBuf
|
|
{
|
|
Config conf;
|
|
};
|
|
|
|
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 4u, 1u);
|
|
|
|
static inline __attribute__((always_inline))
|
|
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
static inline __attribute__((always_inline))
|
|
uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67)
|
|
{
|
|
Alloc param = alloc;
|
|
uint param_1 = offset;
|
|
if (!touch_mem(param, param_1))
|
|
{
|
|
return 0u;
|
|
}
|
|
uint v = v_67.memory[offset];
|
|
return v;
|
|
}
|
|
|
|
static inline __attribute__((always_inline))
|
|
Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67)
|
|
{
|
|
uint ix = ref.offset >> uint(2);
|
|
Alloc param = a;
|
|
uint param_1 = ix + 0u;
|
|
uint raw0 = read_mem(param, param_1, v_67);
|
|
Alloc param_2 = a;
|
|
uint param_3 = ix + 1u;
|
|
uint raw1 = read_mem(param_2, param_3, v_67);
|
|
Alloc param_4 = a;
|
|
uint param_5 = ix + 2u;
|
|
uint raw2 = read_mem(param_4, param_5, v_67);
|
|
Path s;
|
|
s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16));
|
|
s.tiles = TileRef{ raw2 };
|
|
return s;
|
|
}
|
|
|
|
static inline __attribute__((always_inline))
|
|
Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok)
|
|
{
|
|
Alloc a;
|
|
a.offset = offset;
|
|
return a;
|
|
}
|
|
|
|
static inline __attribute__((always_inline))
|
|
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67)
|
|
{
|
|
Alloc param = alloc;
|
|
uint param_1 = offset;
|
|
if (!touch_mem(param, param_1))
|
|
{
|
|
return;
|
|
}
|
|
v_67.memory[offset] = val;
|
|
}
|
|
|
|
kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
|
|
{
|
|
threadgroup uint sh_row_width[256];
|
|
threadgroup Alloc sh_row_alloc[256];
|
|
threadgroup uint sh_row_count[256];
|
|
uint th_ix = gl_LocalInvocationIndex;
|
|
uint element_ix = gl_GlobalInvocationID.x;
|
|
uint row_count = 0u;
|
|
bool mem_ok = v_67.mem_error == 0u;
|
|
if (gl_LocalInvocationID.y == 0u)
|
|
{
|
|
if (element_ix < _166.conf.n_elements)
|
|
{
|
|
PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) };
|
|
Alloc param;
|
|
param.offset = _166.conf.tile_alloc.offset;
|
|
PathRef param_1 = path_ref;
|
|
Path path = Path_read(param, param_1, v_67);
|
|
sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
|
|
row_count = path.bbox.w - path.bbox.y;
|
|
bool _210 = row_count == 1u;
|
|
bool _216;
|
|
if (_210)
|
|
{
|
|
_216 = path.bbox.y > 0u;
|
|
}
|
|
else
|
|
{
|
|
_216 = _210;
|
|
}
|
|
if (_216)
|
|
{
|
|
row_count = 0u;
|
|
}
|
|
uint param_2 = path.tiles.offset;
|
|
uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u;
|
|
bool param_4 = mem_ok;
|
|
Alloc path_alloc = new_alloc(param_2, param_3, param_4);
|
|
sh_row_alloc[th_ix] = path_alloc;
|
|
}
|
|
sh_row_count[th_ix] = row_count;
|
|
}
|
|
for (uint i = 0u; i < 8u; i++)
|
|
{
|
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
bool _262 = gl_LocalInvocationID.y == 0u;
|
|
bool _269;
|
|
if (_262)
|
|
{
|
|
_269 = th_ix >= (1u << i);
|
|
}
|
|
else
|
|
{
|
|
_269 = _262;
|
|
}
|
|
if (_269)
|
|
{
|
|
row_count += sh_row_count[th_ix - (1u << i)];
|
|
}
|
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
if (gl_LocalInvocationID.y == 0u)
|
|
{
|
|
sh_row_count[th_ix] = row_count;
|
|
}
|
|
}
|
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
uint total_rows = sh_row_count[255];
|
|
uint _348;
|
|
for (uint row = th_ix; row < total_rows; row += 1024u)
|
|
{
|
|
uint el_ix = 0u;
|
|
for (uint i_1 = 0u; i_1 < 8u; i_1++)
|
|
{
|
|
uint probe = el_ix + (128u >> i_1);
|
|
if (row >= sh_row_count[probe - 1u])
|
|
{
|
|
el_ix = probe;
|
|
}
|
|
}
|
|
uint width = sh_row_width[el_ix];
|
|
if ((width > 0u) && mem_ok)
|
|
{
|
|
Alloc tiles_alloc = sh_row_alloc[el_ix];
|
|
if (el_ix > 0u)
|
|
{
|
|
_348 = sh_row_count[el_ix - 1u];
|
|
}
|
|
else
|
|
{
|
|
_348 = 0u;
|
|
}
|
|
uint seq_ix = row - _348;
|
|
uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width);
|
|
Alloc param_5 = tiles_alloc;
|
|
uint param_6 = tile_el_ix;
|
|
uint sum = read_mem(param_5, param_6, v_67);
|
|
for (uint x = 1u; x < width; x++)
|
|
{
|
|
tile_el_ix += 2u;
|
|
Alloc param_7 = tiles_alloc;
|
|
uint param_8 = tile_el_ix;
|
|
sum += read_mem(param_7, param_8, v_67);
|
|
Alloc param_9 = tiles_alloc;
|
|
uint param_10 = tile_el_ix;
|
|
uint param_11 = sum;
|
|
write_mem(param_9, param_10, param_11, v_67);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|