vello/piet-gpu/shader/gen/draw_reduce.hlsl
Raph Levien 3b67a4e7c1 New clip implementation
This PR reworks the clip implementation. The highlight is that clip bounding box accounting is now done on GPU rather than CPU. The clip mask is also rasterized on EndClip rather than BeginClip, which decreases memory traffic needed for the clip stack.

This is a pretty good working state, but not all cleanup has been applied. An important next step is to remove the CPU clip accounting (it is computed and encoded, but that result is not used). Another step is to remove the Annotated structure entirely.

Fixes #88. Also relevant to #119
2022-02-17 17:13:28 -08:00

165 lines
3.8 KiB
HLSL
Generated

struct ElementRef
{
uint offset;
};
struct ElementTag
{
uint tag;
uint flags;
};
struct DrawMonoid
{
uint path_ix;
uint clip_ix;
};
static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
struct Alloc
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
Alloc bbox_alloc;
Alloc drawmonoid_alloc;
Alloc clip_alloc;
Alloc clip_bic_alloc;
Alloc clip_stack_alloc;
Alloc clip_bbox_alloc;
uint n_trans;
uint n_path;
uint n_clip;
uint trans_offset;
uint linewidth_offset;
uint pathtag_offset;
uint pathseg_offset;
};
static const DrawMonoid _87 = { 1u, 0u };
static const DrawMonoid _89 = { 1u, 1u };
static const DrawMonoid _91 = { 0u, 0u };
ByteAddressBuffer _46 : register(t2, space0);
RWByteAddressBuffer _199 : register(u3, space0);
RWByteAddressBuffer _213 : register(u0, space0);
ByteAddressBuffer _219 : register(t1, space0);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_WorkGroupID : SV_GroupID;
uint3 gl_LocalInvocationID : SV_GroupThreadID;
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
groupshared DrawMonoid sh_scratch[256];
ElementTag Element_tag(ElementRef ref)
{
uint tag_and_flags = _46.Load((ref.offset >> uint(2)) * 4 + 0);
ElementTag _60 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) };
return _60;
}
DrawMonoid map_tag(uint tag_word)
{
switch (tag_word)
{
case 4u:
case 5u:
case 6u:
{
return _87;
}
case 9u:
case 10u:
{
return _89;
}
default:
{
return _91;
}
}
}
ElementRef Element_index(ElementRef ref, uint index)
{
ElementRef _39 = { ref.offset + (index * 36u) };
return _39;
}
DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b)
{
DrawMonoid c;
c.path_ix = a.path_ix + b.path_ix;
c.clip_ix = a.clip_ix + b.clip_ix;
return c;
}
void comp_main()
{
uint ix = gl_GlobalInvocationID.x * 8u;
ElementRef _107 = { ix * 36u };
ElementRef ref = _107;
ElementRef param = ref;
uint tag_word = Element_tag(param).tag;
uint param_1 = tag_word;
DrawMonoid agg = map_tag(param_1);
for (uint i = 1u; i < 8u; i++)
{
ElementRef param_2 = ref;
uint param_3 = i;
ElementRef param_4 = Element_index(param_2, param_3);
tag_word = Element_tag(param_4).tag;
uint param_5 = tag_word;
DrawMonoid param_6 = agg;
DrawMonoid param_7 = map_tag(param_5);
agg = combine_tag_monoid(param_6, param_7);
}
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i_1 = 0u; i_1 < 8u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u)
{
DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
DrawMonoid param_8 = agg;
DrawMonoid param_9 = other;
agg = combine_tag_monoid(param_8, param_9);
}
GroupMemoryBarrierWithGroupSync();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
if (gl_LocalInvocationID.x == 0u)
{
_199.Store(gl_WorkGroupID.x * 8 + 0, agg.path_ix);
_199.Store(gl_WorkGroupID.x * 8 + 4, agg.clip_ix);
}
}
[numthreads(256, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_WorkGroupID = stage_input.gl_WorkGroupID;
gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
comp_main();
}