vello/piet-gpu/shader/gen/clip_leaf.hlsl
Raph Levien acb3933d94 Variable size encoding of draw objects
This patch switches to a variable size encoding of draw objects.

In addition to the CPU-side scene encoding, it changes the representation of intermediate per draw object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying).

Prefix sums, computed in `DrawMonoid`, track the offset of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style).

This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken.

Closes #152
2022-03-14 16:32:08 -07:00

372 lines
9 KiB
HLSL
Generated

// Pair of counters combined by bic_combine().
// comp_main() builds {0, 1} for a "push" input element and {1, 0} for any
// other element; bic_combine() cancels min(x.b, y.a) between its operands.
struct Bic
{
uint a;
uint b;
};
// One clip-stack record as read by load_clip_el(): five consecutive 32-bit
// words, the first being parent_ix and the next four the bbox components.
struct ClipEl
{
uint parent_ix;
// Filled as (x0, y0, x1, y1) by load_clip_el().
float4 bbox;
};
// Wrapper around a single offset value.
// NOTE(review): the loaders in this file shift the config offsets right by 2
// before using them as 32-bit word indices, which suggests a byte offset.
struct Alloc
{
uint offset;
};
// Configuration record.
// NOTE(review): nothing in this file binds this struct to a buffer; the code
// reads buffer _80 with raw byte offsets instead. The offsets annotated below
// match those reads if the fields are packed as consecutive 4-byte words
// starting at byte 0 - confirm against the producer of the buffer.
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
Alloc path_bbox_alloc; // byte 40: read by load_path_bbox()
Alloc drawmonoid_alloc; // byte 44: read by comp_main()
Alloc clip_alloc; // byte 48: read by load_path_ix()
Alloc clip_bic_alloc; // byte 52: read by load_bic()
Alloc clip_stack_alloc; // byte 56: read by load_clip_el()
Alloc clip_bbox_alloc; // byte 60: read by store_clip_bbox()
Alloc draw_bbox_alloc;
Alloc drawinfo_alloc;
uint n_trans;
uint n_path;
uint n_clip; // byte 80: read by load_path_ix() and comp_main()
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
uint drawtag_offset;
uint drawdata_offset;
};
// Workgroup dimensions; matches the [numthreads(256, 1, 1)] attribute on main().
static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
// All-zero Bic, used by comp_main() as the starting value for its scans.
static const Bic _393 = { 0u, 0u };
// Read-only raw buffer; the loaders read offsets and n_clip from it.
ByteAddressBuffer _80 : register(t1, space0);
// Read/write raw buffer holding the data addressed through the offsets in _80.
// Every access in this file uses byte address (word_index * 4 + 8).
RWByteAddressBuffer _96 : register(u0, space0);
// Copies of the dispatch IDs, filled in by main() before comp_main() runs.
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;
// Entry-point input: the system-value IDs that main() copies into the
// file-scope gl_* variables.
struct SPIRV_Cross_Input
{
uint3 gl_WorkGroupID : SV_GroupID;
uint3 gl_LocalInvocationID : SV_GroupThreadID;
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
// Group-shared scratch space for comp_main() / search_link().
// sh_bic: slots [0, 256) hold one Bic per thread; comp_main() later appends
// reduction levels at 256 (128 entries), 384 (64), 448, 480, 496, 504 and
// 508 (2 entries), for 510 entries in total.
groupshared Bic sh_bic[510];
// parent_ix of the clip element each thread loaded (written only when one was loaded).
groupshared uint sh_stack[256];
// Running intersection of the bboxes loaded alongside sh_stack.
groupshared float4 sh_stack_bbox[256];
// Per-thread link value returned by search_link(), updated during the bbox scan.
groupshared uint sh_link[256];
// Per-thread bbox for the input elements of this workgroup.
groupshared float4 sh_bbox[256];
// Reads the ix-th Bic (two consecutive 32-bit words) from buffer _96.
// The region start comes from byte 52 of _80, converted to a word index.
Bic load_bic(uint ix)
{
    uint word_ix = (_80.Load(52) >> uint(2)) + (ix * 2u);
    Bic result;
    result.a = _96.Load(word_ix * 4 + 8);
    result.b = _96.Load((word_ix + 1u) * 4 + 8);
    return result;
}
// Combines two Bic values, x on the left and y on the right.
// min(x.b, y.a) is subtracted from both summed counters.
Bic bic_combine(Bic x, Bic y)
{
    uint cancelled = min(x.b, y.a);
    Bic combined;
    combined.a = (x.a + y.a) - cancelled;
    combined.b = (x.b + y.b) - cancelled;
    return combined;
}
// Reads the ix-th ClipEl (five consecutive 32-bit words) from buffer _96.
// Word 0 is parent_ix; words 1..4 are reinterpreted as the floats x0, y0, x1, y1.
// The region start comes from byte 56 of _80, converted to a word index.
ClipEl load_clip_el(uint ix)
{
    uint first_word = (_80.Load(56) >> uint(2)) + (5u * ix);
    ClipEl el;
    el.parent_ix = _96.Load(first_word * 4 + 8);
    el.bbox = float4(
        asfloat(_96.Load((first_word + 1u) * 4 + 8)),
        asfloat(_96.Load((first_word + 2u) * 4 + 8)),
        asfloat(_96.Load((first_word + 3u) * 4 + 8)),
        asfloat(_96.Load((first_word + 4u) * 4 + 8)));
    return el;
}
// Intersects two boxes stored as (x0, y0, x1, y1): component-wise max of the
// .xy pairs and component-wise min of the .zw pairs.
float4 bbox_intersect(float4 a, float4 b)
{
    float2 lower = max(a.xy, b.xy);
    float2 upper = min(a.zw, b.zw);
    return float4(lower, upper);
}
// Returns the ix-th word of the region whose offset is stored at byte 48 of
// _80, or 0x80000000 when ix is not below the count stored at byte 80 of _80.
// The sentinel has the sign bit set, so callers that test int(value) >= 0
// treat out-of-range indices as "not a push".
uint load_path_ix(uint ix)
{
    if (ix >= _80.Load(80))
    {
        return 0x80000000u;
    }
    uint word_ix = (_80.Load(48) >> uint(2)) + ix;
    return _96.Load(word_ix * 4 + 8);
}
// Reads the bounding box of path path_ix from buffer _96.
// Each path occupies six 32-bit words, of which the first four are used here:
// left, top, right, bottom, stored as unsigned integers biased by 32768.
// The region start comes from byte 40 of _80, converted to a word index.
float4 load_path_bbox(uint path_ix)
{
    uint first_word = (_80.Load(40) >> uint(2)) + (6u * path_ix);
    float4 biased = float4(
        float(_96.Load(first_word * 4 + 8)),
        float(_96.Load((first_word + 1u) * 4 + 8)),
        float(_96.Load((first_word + 2u) * 4 + 8)),
        float(_96.Load((first_word + 3u) * 4 + 8)));
    // Remove the bias from every component.
    return biased - 32768.0f;
}
// Walks the Bic reduction tree in sh_bic to the left of the calling thread and
// returns a link value for it.
//
// bic: running combination, updated in place (the caller passes a zero Bic).
// Returns ix - 1 when the search ends at a position ix > 0 inside the
// workgroup; otherwise 0xFFFFFFFF - bic.a, which is negative when viewed as
// an int. comp_main() tells the two cases apart with int(link) >= 0.
uint search_link(inout Bic bic)
{
uint ix = gl_LocalInvocationID.x;
uint j = 0u;
// Upward pass: climb one tree level per iteration (at most 8 levels).
while (j < 8u)
{
// Start of level j inside sh_bic: 0, 256, 384, 448, ...
uint base = 512u - (2u << (8u - j));
// Only when bit j of ix is set is there a level-j node directly to the left.
if (((ix >> j) & 1u) != 0u)
{
Bic param = sh_bic[(base + (ix >> j)) - 1u];
Bic param_1 = bic;
Bic test = bic_combine(param, param_1);
// Stop climbing as soon as absorbing that node would leave b > 0;
// j keeps its value so the downward pass starts from this level.
if (test.b > 0u)
{
break;
}
bic = test;
ix -= (1u << j);
}
j++;
}
if (ix > 0u)
{
// Downward pass: descend level by level, absorbing every node to the left
// whose combination still has b == 0.
while (j > 0u)
{
j--;
uint base_1 = 512u - (2u << (8u - j));
Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u];
Bic param_3 = bic;
Bic test_1 = bic_combine(param_2, param_3);
if (test_1.b == 0u)
{
bic = test_1;
ix -= (1u << j);
}
}
}
if (ix > 0u)
{
// Link to the element just left of the absorbed range.
return ix - 1u;
}
else
{
// The whole prefix was absorbed: encode the remaining bic.a as a negative value.
return 4294967295u - bic.a;
}
}
// Writes bbox as four consecutive 32-bit words (bit patterns of x, y, z, w)
// into slot ix of the region whose offset is stored at byte 60 of _80.
void store_clip_bbox(uint ix, float4 bbox)
{
    uint first_word = (_80.Load(60) >> uint(2)) + (4u * ix);
    uint4 bits = asuint(bbox);
    for (uint c = 0u; c < 4u; c++)
    {
        _96.Store((first_word + c) * 4 + 8, bits[c]);
    }
}
// Body of the compute shader; one invocation per input element, 256 per
// workgroup. Outline of what the code below does:
//   1. Combine the per-workgroup Bic values of all earlier workgroups.
//   2. Load the clip elements those values select and scan their bboxes.
//   3. Classify this thread's input element and build a Bic reduction tree.
//   4. Find each element's link, then its parent and grandparent.
//   5. Intersect bboxes along the links and store the result.
void comp_main()
{
uint th = gl_LocalInvocationID.x;
// Step 1: threads with th < workgroup index load the Bic of workgroup th;
// the others contribute the zero Bic.
Bic bic = _393;
if (th < gl_WorkGroupID.x)
{
uint param = th;
bic = load_bic(param);
}
sh_bic[th] = bic;
// Doubling scan: in round i each thread folds in the slot (1 << i) to its
// right, so after 8 rounds sh_bic[th] is the combination of slots th..255.
for (uint i = 0u; i < 8u; i++)
{
GroupMemoryBarrierWithGroupSync();
if ((th + (1u << i)) < 256u)
{
Bic other = sh_bic[th + (1u << i)];
Bic param_1 = bic;
Bic param_2 = other;
bic = bic_combine(param_1, param_2);
}
GroupMemoryBarrierWithGroupSync();
sh_bic[th] = bic;
}
GroupMemoryBarrierWithGroupSync();
// b of the combination over all slots.
uint stack_size = sh_bic[0].b;
// Step 2: binary search (probe steps 128, 64, ..., 1) for an index ix with
// sp < sh_bic[ix].b.
uint sp = 255u - th;
uint ix = 0u;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
uint probe = ix + (128u >> i_1);
if (sp < sh_bic[probe].b)
{
ix = probe;
}
}
uint b = sh_bic[ix].b;
// Default box is effectively unbounded, so intersecting with it changes nothing.
float4 bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
if (sp < b)
{
// Element index within the clip element region: 256 slots per workgroup ix.
uint param_3 = (((ix * 256u) + b) - sp) - 1u;
ClipEl el = load_clip_el(param_3);
sh_stack[th] = el.parent_ix;
bbox = el.bbox;
}
// Doubling scan of bbox intersection towards higher th: afterwards bbox is
// the intersection of the boxes of threads 0..th.
for (uint i_2 = 0u; i_2 < 8u; i_2++)
{
sh_stack_bbox[th] = bbox;
GroupMemoryBarrierWithGroupSync();
if (th >= (1u << i_2))
{
float4 param_4 = sh_stack_bbox[th - (1u << i_2)];
float4 param_5 = bbox;
bbox = bbox_intersect(param_4, param_5);
}
GroupMemoryBarrierWithGroupSync();
}
sh_stack_bbox[th] = bbox;
// Step 3: read this invocation's input. A value with the sign bit clear is
// treated as a push; load_path_ix() returns 0x80000000 past the end of input.
uint param_6 = gl_GlobalInvocationID.x;
uint inp = load_path_ix(param_6);
bool is_push = int(inp) >= 0;
// Push -> {0, 1}; anything else -> {1, 0}.
Bic _559 = { 1u - uint(is_push), uint(is_push) };
bic = _559;
sh_bic[th] = bic;
if (is_push)
{
uint param_7 = inp;
bbox = load_path_bbox(param_7);
}
else
{
bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
}
// Build the reduction tree above the 256 leaves: each level combines adjacent
// pairs of the previous level and is stored at outbase (256, 384, 448, ...).
uint inbase = 0u;
for (uint i_3 = 0u; i_3 < 7u; i_3++)
{
uint outbase = 512u - (1u << (8u - i_3));
GroupMemoryBarrierWithGroupSync();
if (th < (1u << (7u - i_3)))
{
Bic param_8 = sh_bic[inbase + (th * 2u)];
Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u];
sh_bic[outbase + th] = bic_combine(param_8, param_9);
}
inbase = outbase;
}
GroupMemoryBarrierWithGroupSync();
// Step 4: search the tree for this thread's link, starting from the zero Bic.
// link >= 0 (as int) is an index inside this workgroup; negative values
// index backwards into the sh_stack / sh_stack_bbox arrays.
bic = _393;
Bic param_10 = bic;
uint _618 = search_link(param_10);
bic = param_10;
uint link = _618;
sh_link[th] = link;
GroupMemoryBarrierWithGroupSync();
// Grandparent: the link of the linked element, or one step further back
// when the link is already negative.
uint grandparent;
if (int(link) >= 0)
{
grandparent = sh_link[link];
}
else
{
grandparent = link - 1u;
}
// Parent as a global element index, via sh_stack, or 0xFFFFFFFF when there is none.
uint parent;
if (int(link) >= 0)
{
parent = (gl_WorkGroupID.x * 256u) + link;
}
else
{
if (int(link + stack_size) >= 0)
{
parent = sh_stack[256u + link];
}
else
{
parent = 4294967295u;
}
}
// Step 5: follow links for 8 rounds, intersecting this thread's bbox with the
// linked element's bbox and then adopting that element's own link.
for (uint i_4 = 0u; i_4 < 8u; i_4++)
{
// sh_link[th] was already stored before the first round.
if (i_4 != 0u)
{
sh_link[th] = link;
}
sh_bbox[th] = bbox;
GroupMemoryBarrierWithGroupSync();
if (int(link) >= 0)
{
float4 param_11 = sh_bbox[link];
float4 param_12 = bbox;
bbox = bbox_intersect(param_11, param_12);
link = sh_link[link];
}
GroupMemoryBarrierWithGroupSync();
}
// If the chain ends in a valid negative link, also intersect with the
// scanned bbox stored in sh_stack_bbox.
if (int(link + stack_size) >= 0)
{
float4 param_13 = sh_stack_bbox[256u + link];
float4 param_14 = bbox;
bbox = bbox_intersect(param_13, param_14);
}
sh_bbox[th] = bbox;
GroupMemoryBarrierWithGroupSync();
uint path_ix = inp;
// _725 is (!is_push && global index < count at byte 80 of _80).
bool _717 = !is_push;
bool _725;
if (_717)
{
_725 = gl_GlobalInvocationID.x < _80.Load(80);
}
else
{
_725 = _717;
}
if (_725)
{
// Non-push element inside the input range: look up the value stored for its
// parent and write it to word 4 * (~inp) of the region at byte 44 of _80.
uint param_15 = parent;
path_ix = load_path_ix(param_15);
uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (4u * (~inp));
_96.Store(drawmonoid_out_base * 4 + 8, path_ix);
// Such elements output the grandparent's bbox rather than their own.
if (int(grandparent) >= 0)
{
bbox = sh_bbox[grandparent];
}
else
{
if (int(grandparent + stack_size) >= 0)
{
bbox = sh_stack_bbox[256u + grandparent];
}
else
{
bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f);
}
}
}
// Every invocation writes one bbox, indexed by its global invocation ID.
uint param_16 = gl_GlobalInvocationID.x;
float4 param_17 = bbox;
store_clip_bbox(param_16, param_17);
}
// Shader entry point: publishes the dispatch IDs through the file-scope gl_*
// variables, then runs comp_main().
[numthreads(256, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
    gl_WorkGroupID = stage_input.gl_WorkGroupID;

    comp_main();
}