mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-26 19:26:33 +11:00
33d7b25a92
This adds a prefix sum test. This patch is also trying to get a little more serious about structuring both the test runner (toward the goal of collecting proper statistics) and pipeline stages for the tests. Still WIP but giving good results.
223 lines
6.3 KiB
HLSL
223 lines
6.3 KiB
HLSL
// Value type carried through the scan. In this shader it wraps a single
// 32-bit unsigned integer, and combine_monoid() merges two of them by addition.
struct Monoid
{
    uint element;
};
|
|
|
|
// Per-partition bookkeeping record (three 32-bit words, 12 bytes).
//
// The visible code never names this struct; it addresses buffer `_43` with raw
// byte offsets instead. Those offsets (part_ix * 12 + 4 / + 8 / + 12) line up
// with the fields below if the records start at byte 4, right after the 4-byte
// counter that comp_main() bumps at byte 0.
struct State
{
    uint flag;         // stored at  +4 : 1u or 2u is written here by comp_main()
    Monoid aggregate;  // stored at  +8 : scan total of one partition
    Monoid prefix;     // stored at +12 : inclusive prefix up to and including the partition
};
|
|
|
|
// Thread-group dimensions; mirrors the [numthreads(512, 1, 1)] attribute on main().
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);

// Zero-valued Monoid, used by comp_main() as the starting value of the
// exclusive prefix.
static const Monoid _187 = { 0u };
|
|
|
|
// Shared state buffer (u2). Byte 0 holds a counter that comp_main() increments
// atomically; 12-byte records follow from byte 4 on (flag / aggregate / prefix).
// Declared globallycoherent because several thread groups read and write it.
globallycoherent RWByteAddressBuffer _43 : register(u2);

// Read-only input buffer (t0); comp_main() loads one 32-bit word per element.
ByteAddressBuffer _67 : register(t0);

// Output buffer (u1); comp_main() stores one 32-bit word per element.
RWByteAddressBuffer _374 : register(u1);
|
|
|
|
// Copy of the thread's position inside its group. main() fills it in from the
// stage input before calling comp_main().
static uint3 gl_LocalInvocationID;

// Entry-point input: only the SV_GroupThreadID system value is consumed.
struct SPIRV_Cross_Input
{
    uint3 gl_LocalInvocationID : SV_GroupThreadID;
};
|
|
|
|
// Group-shared storage used by comp_main().

// Partition index claimed by thread 0 and read by every thread of the group.
groupshared uint sh_part_ix;

// One slot per thread (512) for the in-group scan of per-thread totals.
groupshared Monoid sh_scratch[512];

// Flag value written by thread 511 inside the look-back loop and read by all threads.
groupshared uint sh_flag;

// Exclusive prefix of this partition, written by thread 511 and read by all threads.
groupshared Monoid sh_prefix;
|
|
|
|
// Combines two Monoid values: the result's element is a.element + b.element
// (ordinary 32-bit unsigned addition).
Monoid combine_monoid(Monoid a, Monoid b)
{
    Monoid combined;
    combined.element = a.element + b.element;
    return combined;
}
|
|
|
|
// Body of the prefix-sum kernel, run by every thread of a 512-thread group.
//
// Outline, as implemented below:
//   1. Thread 0 claims a partition index by atomically incrementing the
//      counter at byte 0 of `_43`; the index is shared through sh_part_ix.
//   2. Each thread loads 16 consecutive words from `_67` and scans them
//      sequentially into `local` (a partition is 512 * 16 = 8192 words).
//   3. The per-thread totals are scanned across the group via sh_scratch.
//   4. Thread 511 stores the partition aggregate to `_43`, then a flag
//      (1u, or 2u for partition 0 whose aggregate is also its prefix).
//   5. Partitions other than 0 walk backwards over earlier partitions to
//      accumulate their exclusive prefix, then store their inclusive prefix
//      and set their flag to 2u.
//   6. Every thread writes its 16 results to `_374`.
//
// Record layout in `_43` for partition p: flag at p*12+4, aggregate at p*12+8,
// prefix at p*12+12.
void comp_main()
{
    const uint lane = gl_LocalInvocationID.x;
    const bool is_last_lane = (lane == 511u);

    // --- 1. Claim a partition -------------------------------------------
    if (lane == 0u)
    {
        uint claimed;
        _43.InterlockedAdd(0, 1u, claimed);
        sh_part_ix = claimed;
    }
    GroupMemoryBarrierWithGroupSync();
    uint part_ix = sh_part_ix;

    // Index (in words) of this thread's first element.
    uint ix = (part_ix * 8192u) + (lane * 16u);

    // --- 2. Sequential scan of this thread's 16 elements -----------------
    Monoid local[16];
    local[0].element = _67.Load(ix * 4 + 0);
    for (uint k = 1u; k < 16u; k++)
    {
        Monoid loaded;
        loaded.element = _67.Load((ix + k) * 4 + 0);
        local[k] = combine_monoid(local[k - 1u], loaded);
    }

    // --- 3. Scan of per-thread totals across the group -------------------
    // Nine doubling steps (strides 1, 2, ..., 256) cover all 512 threads.
    Monoid agg = local[15];
    sh_scratch[lane] = agg;
    for (uint step = 0u; step < 9u; step++)
    {
        GroupMemoryBarrierWithGroupSync();
        uint stride = uint(1 << int(step));
        if (lane >= stride)
        {
            agg = combine_monoid(sh_scratch[lane - stride], agg);
        }
        // Everyone must have read the old slot values before they are overwritten.
        GroupMemoryBarrierWithGroupSync();
        sh_scratch[lane] = agg;
    }

    // --- 4. Publish this partition's aggregate ---------------------------
    // After the scan, thread 511 holds the total of the whole partition.
    if (is_last_lane)
    {
        _43.Store(part_ix * 12 + 8, agg.element);
        if (part_ix == 0u)
        {
            // Partition 0 has nothing before it, so its aggregate is also
            // its inclusive prefix (record 0, prefix field at byte 12).
            _43.Store(12, agg.element);
        }
    }
    // Barrier sits between the data stores and the flag store.
    DeviceMemoryBarrier();
    if (is_last_lane)
    {
        _43.Store(part_ix * 12 + 4, (part_ix == 0u) ? 2u : 1u);
    }

    // --- 5. Look back over earlier partitions ----------------------------
    Monoid exclusive = _187;
    if (part_ix != 0u)
    {
        uint look_back_ix = part_ix - 1u;
        uint their_ix = 0u;
        Monoid their_agg;
        while (true)
        {
            // Only thread 511 reads the flag; it is broadcast through
            // sh_flag so that all threads take the same branch below.
            // NOTE(review): sh_flag is overwritten here with no group barrier
            // after the reads of sh_flag in the previous iteration; this looks
            // like a potential race between thread 511 and slower threads -
            // confirm against upstream before changing.
            if (is_last_lane)
            {
                sh_flag = _43.Load(look_back_ix * 12 + 4);
            }
            GroupMemoryBarrierWithGroupSync();
            DeviceMemoryBarrier();
            uint seen_flag = sh_flag;

            if (seen_flag == 2u)
            {
                // That partition's prefix is available: fold it in and stop.
                if (is_last_lane)
                {
                    Monoid their_prefix;
                    their_prefix.element = _43.Load(look_back_ix * 12 + 12);
                    exclusive = combine_monoid(their_prefix, exclusive);
                }
                break;
            }

            if (seen_flag == 1u)
            {
                // Only the aggregate is available: fold it in and move one
                // partition further back.
                if (is_last_lane)
                {
                    their_agg.element = _43.Load(look_back_ix * 12 + 8);
                    exclusive = combine_monoid(their_agg, exclusive);
                }
                look_back_ix--;
                their_ix = 0u;
                continue;
            }

            // Neither flag value seen: instead of only re-polling, thread 511
            // adds one more input element of that partition to a locally
            // recomputed aggregate on each pass.
            if (is_last_lane)
            {
                Monoid m;
                m.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
                if (their_ix == 0u)
                {
                    their_agg = m;
                }
                else
                {
                    their_agg = combine_monoid(their_agg, m);
                }
                their_ix++;
                if (their_ix == 8192u)
                {
                    // The whole partition has been summed locally.
                    exclusive = combine_monoid(their_agg, exclusive);
                    if (look_back_ix == 0u)
                    {
                        // Nothing further back: tell the group to leave the loop.
                        sh_flag = 2u;
                    }
                    else
                    {
                        look_back_ix--;
                        their_ix = 0u;
                    }
                }
            }
            GroupMemoryBarrierWithGroupSync();
            seen_flag = sh_flag;
            if (seen_flag == 2u)
            {
                break;
            }
        }

        // Publish the inclusive prefix, then raise the flag to 2u.
        if (is_last_lane)
        {
            Monoid inclusive_prefix = combine_monoid(exclusive, agg);
            sh_prefix = exclusive;
            _43.Store(part_ix * 12 + 12, inclusive_prefix.element);
        }
        DeviceMemoryBarrier();
        if (is_last_lane)
        {
            _43.Store(part_ix * 12 + 4, 2u);
        }
    }
    GroupMemoryBarrierWithGroupSync();
    if (part_ix != 0u)
    {
        // Only thread 511 accumulated `exclusive`; pick up its result.
        exclusive = sh_prefix;
    }

    // --- 6. Write results -------------------------------------------------
    // Offset for this thread = partition's exclusive prefix combined with the
    // scanned total of all lower-numbered threads in the group.
    Monoid row = exclusive;
    if (lane > 0u)
    {
        row = combine_monoid(row, sh_scratch[lane - 1u]);
    }
    for (uint j = 0u; j < 16u; j++)
    {
        Monoid result = combine_monoid(row, local[j]);
        _374.Store((ix + j) * 4 + 0, result.element);
    }
}
|
|
|
|
// Compute-shader entry point: 512 threads per group along X.
// Copies the SV_GroupThreadID input into the file-scope gl_LocalInvocationID
// and then runs the kernel body.
[numthreads(512, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
    gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
    comp_main();
}
|