diff --git a/.gitignore b/.gitignore index e0229c8..6853bbc 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,3 @@ **/*.rs.bk .ninja_deps .ninja_log -**/shader/gen diff --git a/piet-gpu-hal/examples/shader/gen/collatz.dxil b/piet-gpu-hal/examples/shader/gen/collatz.dxil new file mode 100644 index 0000000..a03f96a Binary files /dev/null and b/piet-gpu-hal/examples/shader/gen/collatz.dxil differ diff --git a/piet-gpu-hal/examples/shader/gen/collatz.hlsl b/piet-gpu-hal/examples/shader/gen/collatz.hlsl new file mode 100644 index 0000000..762f06d --- /dev/null +++ b/piet-gpu-hal/examples/shader/gen/collatz.hlsl @@ -0,0 +1,62 @@ +static const uint3 gl_WorkGroupSize = uint3(1u, 1u, 1u); + +RWByteAddressBuffer _57 : register(u0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +float mod(float x, float y) +{ + return x - y * floor(x / y); +} + +float2 mod(float2 x, float2 y) +{ + return x - y * floor(x / y); +} + +float3 mod(float3 x, float3 y) +{ + return x - y * floor(x / y); +} + +float4 mod(float4 x, float4 y) +{ + return x - y * floor(x / y); +} + +uint collatz_iterations(inout uint n) +{ + uint i = 0u; + while (n != 1u) + { + if (mod(float(n), 2.0f) == 0.0f) + { + n /= 2u; + } + else + { + n = (3u * n) + 1u; + } + i++; + } + return i; +} + +void comp_main() +{ + uint index = gl_GlobalInvocationID.x; + uint param = _57.Load(index * 4 + 0); + uint _65 = collatz_iterations(param); + _57.Store(index * 4 + 0, _65); +} + +[numthreads(1, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu-hal/examples/shader/gen/collatz.msl b/piet-gpu-hal/examples/shader/gen/collatz.msl new file mode 100644 index 0000000..1b75efe --- /dev/null +++ b/piet-gpu-hal/examples/shader/gen/collatz.msl @@ -0,0 +1,48 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace 
metal; + +// Implementation of the GLSL mod() function, which is slightly different than Metal fmod() +template +inline Tx mod(Tx x, Ty y) +{ + return x - y * floor(x / y); +} + +struct PrimeIndices +{ + uint indices[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u); + +static inline __attribute__((always_inline)) +uint collatz_iterations(thread uint& n) +{ + uint i = 0u; + while (n != 1u) + { + if (mod(float(n), 2.0) == 0.0) + { + n /= 2u; + } + else + { + n = (3u * n) + 1u; + } + i++; + } + return i; +} + +kernel void main0(device PrimeIndices& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint index = gl_GlobalInvocationID.x; + uint param = _57.indices[index]; + uint _65 = collatz_iterations(param); + _57.indices[index] = _65; +} + diff --git a/piet-gpu-hal/examples/shader/gen/collatz.spv b/piet-gpu-hal/examples/shader/gen/collatz.spv new file mode 100644 index 0000000..886797e Binary files /dev/null and b/piet-gpu-hal/examples/shader/gen/collatz.spv differ diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil new file mode 100644 index 0000000..0fb9622 Binary files /dev/null and b/piet-gpu/shader/gen/backdrop.dxil differ diff --git a/piet-gpu/shader/gen/backdrop.hlsl b/piet-gpu/shader/gen/backdrop.hlsl new file mode 100644 index 0000000..aba3cff --- /dev/null +++ b/piet-gpu/shader/gen/backdrop.hlsl @@ -0,0 +1,244 @@ +struct Alloc +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + 
Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _67 : register(u0, space0); +ByteAddressBuffer _166 : register(t1, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +static uint gl_LocalInvocationIndex; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; + uint gl_LocalInvocationIndex : SV_GroupIndex; +}; + +groupshared uint sh_row_width[256]; +groupshared Alloc sh_row_alloc[256]; +groupshared uint sh_row_count[256]; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _67.Load(offset * 4 + 8); + return v; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _134 = { raw2 }; + s.tiles = _134; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _67.Store(offset * 4 + 8, val); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + uint row_count = 0u; + bool mem_ok = 
_67.Load(4) == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _166.Load(0)) + { + PathRef _180 = { _166.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _180; + Alloc _185; + _185.offset = _166.Load(16); + Alloc param; + param.offset = _185.offset; + PathRef param_1 = path_ref; + Path path = Path_read(param, param_1); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _210 = row_count == 1u; + bool _216; + if (_210) + { + _216 = path.bbox.y > 0u; + } + else + { + _216 = _210; + } + if (_216) + { + row_count = 0u; + } + uint param_2 = path.tiles.offset; + uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_4 = mem_ok; + Alloc path_alloc = new_alloc(param_2, param_3, param_4); + sh_row_alloc[th_ix] = path_alloc; + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + bool _262 = gl_LocalInvocationID.y == 0u; + bool _269; + if (_262) + { + _269 = th_ix >= (1u << i); + } + else + { + _269 = _262; + } + if (_269) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + GroupMemoryBarrierWithGroupSync(); + uint total_rows = sh_row_count[255]; + uint _348; + for (uint row = th_ix; row < total_rows; row += 256u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _348 = sh_row_count[el_ix - 1u]; + } + else + { + _348 = 0u; + } + uint seq_ix = row - _348; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_5 = tiles_alloc; + uint param_6 = tile_el_ix; + uint sum = read_mem(param_5, 
param_6); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_7 = tiles_alloc; + uint param_8 = tile_el_ix; + sum += read_mem(param_7, param_8); + Alloc param_9 = tiles_alloc; + uint param_10 = tile_el_ix; + uint param_11 = sum; + write_mem(param_9, param_10, param_11); + } + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex; + comp_main(); +} diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl new file mode 100644 index 0000000..1c0a0bb --- /dev/null +++ b/piet-gpu/shader/gen/backdrop.msl @@ -0,0 +1,247 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool 
touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_67.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_67); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_67); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_67); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_67.memory[offset] = val; +} + +kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_row_width[256]; + threadgroup Alloc sh_row_alloc[256]; + threadgroup uint sh_row_count[256]; + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + uint row_count = 0u; + bool mem_ok = v_67.mem_error 
== 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _166.conf.n_elements) + { + PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) }; + Alloc param; + param.offset = _166.conf.tile_alloc.offset; + PathRef param_1 = path_ref; + Path path = Path_read(param, param_1, v_67); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _210 = row_count == 1u; + bool _216; + if (_210) + { + _216 = path.bbox.y > 0u; + } + else + { + _216 = _210; + } + if (_216) + { + row_count = 0u; + } + uint param_2 = path.tiles.offset; + uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_4 = mem_ok; + Alloc path_alloc = new_alloc(param_2, param_3, param_4); + sh_row_alloc[th_ix] = path_alloc; + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _262 = gl_LocalInvocationID.y == 0u; + bool _269; + if (_262) + { + _269 = th_ix >= (1u << i); + } + else + { + _269 = _262; + } + if (_269) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_rows = sh_row_count[255]; + uint _348; + for (uint row = th_ix; row < total_rows; row += 256u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _348 = sh_row_count[el_ix - 1u]; + } + else + { + _348 = 0u; + } + uint seq_ix = row - _348; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_5 = tiles_alloc; + uint param_6 = tile_el_ix; + uint sum = 
read_mem(param_5, param_6, v_67); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_7 = tiles_alloc; + uint param_8 = tile_el_ix; + sum += read_mem(param_7, param_8, v_67); + Alloc param_9 = tiles_alloc; + uint param_10 = tile_el_ix; + uint param_11 = sum; + write_mem(param_9, param_10, param_11, v_67); + } + } + } +} + diff --git a/piet-gpu/shader/gen/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv new file mode 100644 index 0000000..2bd17d8 Binary files /dev/null and b/piet-gpu/shader/gen/backdrop.spv differ diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil new file mode 100644 index 0000000..e24a6d3 Binary files /dev/null and b/piet-gpu/shader/gen/backdrop_lg.dxil differ diff --git a/piet-gpu/shader/gen/backdrop_lg.hlsl b/piet-gpu/shader/gen/backdrop_lg.hlsl new file mode 100644 index 0000000..c506403 --- /dev/null +++ b/piet-gpu/shader/gen/backdrop_lg.hlsl @@ -0,0 +1,244 @@ +struct Alloc +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 4u, 1u); + +RWByteAddressBuffer _67 : register(u0, space0); +ByteAddressBuffer _166 : register(t1, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +static uint 
gl_LocalInvocationIndex; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; + uint gl_LocalInvocationIndex : SV_GroupIndex; +}; + +groupshared uint sh_row_width[256]; +groupshared Alloc sh_row_alloc[256]; +groupshared uint sh_row_count[256]; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _67.Load(offset * 4 + 8); + return v; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _134 = { raw2 }; + s.tiles = _134; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _67.Store(offset * 4 + 8, val); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + uint row_count = 0u; + bool mem_ok = _67.Load(4) == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _166.Load(0)) + { + PathRef _180 = { _166.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _180; + Alloc _185; + _185.offset = _166.Load(16); + Alloc param; + param.offset = _185.offset; + PathRef param_1 = path_ref; + Path path = Path_read(param, param_1); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _210 = row_count == 1u; + 
bool _216; + if (_210) + { + _216 = path.bbox.y > 0u; + } + else + { + _216 = _210; + } + if (_216) + { + row_count = 0u; + } + uint param_2 = path.tiles.offset; + uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_4 = mem_ok; + Alloc path_alloc = new_alloc(param_2, param_3, param_4); + sh_row_alloc[th_ix] = path_alloc; + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + bool _262 = gl_LocalInvocationID.y == 0u; + bool _269; + if (_262) + { + _269 = th_ix >= (1u << i); + } + else + { + _269 = _262; + } + if (_269) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + GroupMemoryBarrierWithGroupSync(); + uint total_rows = sh_row_count[255]; + uint _348; + for (uint row = th_ix; row < total_rows; row += 1024u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _348 = sh_row_count[el_ix - 1u]; + } + else + { + _348 = 0u; + } + uint seq_ix = row - _348; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_5 = tiles_alloc; + uint param_6 = tile_el_ix; + uint sum = read_mem(param_5, param_6); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_7 = tiles_alloc; + uint param_8 = tile_el_ix; + sum += read_mem(param_7, param_8); + Alloc param_9 = tiles_alloc; + uint param_10 = tile_el_ix; + uint param_11 = sum; + write_mem(param_9, param_10, param_11); + } + } + } +} + +[numthreads(256, 4, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = 
stage_input.gl_GlobalInvocationID; + gl_LocalInvocationIndex = stage_input.gl_LocalInvocationIndex; + comp_main(); +} diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl new file mode 100644 index 0000000..de43ebe --- /dev/null +++ b/piet-gpu/shader/gen/backdrop_lg.msl @@ -0,0 +1,247 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 4u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_67.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread 
const PathRef& ref, device Memory& v_67) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_67); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_67); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_67); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_67.memory[offset] = val; +} + +kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_row_width[256]; + threadgroup Alloc sh_row_alloc[256]; + threadgroup uint sh_row_count[256]; + uint th_ix = gl_LocalInvocationIndex; + uint element_ix = gl_GlobalInvocationID.x; + uint row_count = 0u; + bool mem_ok = v_67.mem_error == 0u; + if (gl_LocalInvocationID.y == 0u) + { + if (element_ix < _166.conf.n_elements) + { + PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) }; + Alloc param; + param.offset = _166.conf.tile_alloc.offset; + PathRef param_1 = path_ref; + Path path = Path_read(param, param_1, v_67); + sh_row_width[th_ix] = path.bbox.z - path.bbox.x; + row_count = path.bbox.w - path.bbox.y; + bool _210 = row_count == 1u; + bool _216; + if (_210) + { + _216 
= path.bbox.y > 0u; + } + else + { + _216 = _210; + } + if (_216) + { + row_count = 0u; + } + uint param_2 = path.tiles.offset; + uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_4 = mem_ok; + Alloc path_alloc = new_alloc(param_2, param_3, param_4); + sh_row_alloc[th_ix] = path_alloc; + } + sh_row_count[th_ix] = row_count; + } + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _262 = gl_LocalInvocationID.y == 0u; + bool _269; + if (_262) + { + _269 = th_ix >= (1u << i); + } + else + { + _269 = _262; + } + if (_269) + { + row_count += sh_row_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.y == 0u) + { + sh_row_count[th_ix] = row_count; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_rows = sh_row_count[255]; + uint _348; + for (uint row = th_ix; row < total_rows; row += 1024u) + { + uint el_ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = el_ix + (128u >> i_1); + if (row >= sh_row_count[probe - 1u]) + { + el_ix = probe; + } + } + uint width = sh_row_width[el_ix]; + if ((width > 0u) && mem_ok) + { + Alloc tiles_alloc = sh_row_alloc[el_ix]; + if (el_ix > 0u) + { + _348 = sh_row_count[el_ix - 1u]; + } + else + { + _348 = 0u; + } + uint seq_ix = row - _348; + uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); + Alloc param_5 = tiles_alloc; + uint param_6 = tile_el_ix; + uint sum = read_mem(param_5, param_6, v_67); + for (uint x = 1u; x < width; x++) + { + tile_el_ix += 2u; + Alloc param_7 = tiles_alloc; + uint param_8 = tile_el_ix; + sum += read_mem(param_7, param_8, v_67); + Alloc param_9 = tiles_alloc; + uint param_10 = tile_el_ix; + uint param_11 = sum; + write_mem(param_9, param_10, param_11, v_67); + } + } + } +} + diff --git a/piet-gpu/shader/gen/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv new file mode 100644 index 0000000..ff2b1d7 
Binary files /dev/null and b/piet-gpu/shader/gen/backdrop_lg.spv differ diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil new file mode 100644 index 0000000..6655b7f Binary files /dev/null and b/piet-gpu/shader/gen/bbox_clear.dxil differ diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl new file mode 100644 index 0000000..8a884d3 --- /dev/null +++ b/piet-gpu/shader/gen/bbox_clear.hlsl @@ -0,0 +1,66 @@ +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +ByteAddressBuffer _21 : register(t1, space0); +RWByteAddressBuffer _45 : register(u0, space0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _21.Load(76)) + { + uint out_ix = (_21.Load(40) >> uint(2)) + (6u * ix); + _45.Store(out_ix * 4 + 8, 65535u); + _45.Store((out_ix + 1u) * 4 + 8, 65535u); + _45.Store((out_ix + 2u) * 4 + 8, 0u); + _45.Store((out_ix + 3u) * 4 + 8, 0u); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl new file mode 100644 index 
0000000..c278c68 --- /dev/null +++ b/piet-gpu/shader/gen/bbox_clear.msl @@ -0,0 +1,68 @@ +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +kernel void main0(device Memory& _45 [[buffer(0)]], const device ConfigBuf& _21 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _21.conf.n_path) + { + uint out_ix = (_21.conf.path_bbox_alloc.offset >> uint(2)) + (6u * ix); + _45.memory[out_ix] = 65535u; + _45.memory[out_ix + 1u] = 65535u; + _45.memory[out_ix + 2u] = 0u; + _45.memory[out_ix + 3u] = 0u; + } +} + diff --git a/piet-gpu/shader/gen/bbox_clear.spv b/piet-gpu/shader/gen/bbox_clear.spv new file mode 100644 index 0000000..58a270e Binary files /dev/null and b/piet-gpu/shader/gen/bbox_clear.spv differ diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil new file mode 100644 index 0000000..3050aa8 Binary files /dev/null and b/piet-gpu/shader/gen/binning.dxil differ diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl new file mode 100644 index 0000000..986f42b --- /dev/null +++ b/piet-gpu/shader/gen/binning.hlsl @@ 
-0,0 +1,342 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _81 : register(u0, space0); +ByteAddressBuffer _156 : register(t1, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint bitmaps[8][256]; +groupshared bool sh_alloc_failed; +groupshared uint count[8][256]; +groupshared Alloc sh_chunk_alloc[256]; + +DrawMonoid load_draw_monoid(uint element_ix) +{ + uint base = (_156.Load(44) >> uint(2)) + (4u * element_ix); + uint path_ix = _81.Load(base * 4 + 8); + uint clip_ix = _81.Load((base + 1u) * 4 + 8); + uint scene_offset = _81.Load((base + 2u) * 4 + 8); + uint info_offset = _81.Load((base + 3u) * 4 + 8); + DrawMonoid _190 = { path_ix, clip_ix, scene_offset, info_offset }; + return _190; +} + +float4 load_clip_bbox(uint clip_ix) +{ + uint base = (_156.Load(60) >> uint(2)) + (4u * clip_ix); + float x0 = asfloat(_81.Load(base * 4 + 8)); + float y0 = asfloat(_81.Load((base + 1u) * 4 + 
8)); + float x1 = asfloat(_81.Load((base + 2u) * 4 + 8)); + float y1 = asfloat(_81.Load((base + 3u) * 4 + 8)); + float4 bbox = float4(x0, y0, x1, y1); + return bbox; +} + +float4 load_path_bbox(uint path_ix) +{ + uint base = (_156.Load(40) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_81.Load(base * 4 + 8)) - 32768.0f; + float bbox_t = float(_81.Load((base + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_81.Load((base + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_81.Load((base + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +float4 bbox_intersect(float4 a, float4 b) +{ + return float4(max(a.xy, b.xy), min(a.zw, b.zw)); +} + +void store_draw_bbox(uint draw_ix, float4 bbox) +{ + uint base = (_156.Load(64) >> uint(2)) + (4u * draw_ix); + _81.Store(base * 4 + 8, asuint(bbox.x)); + _81.Store((base + 1u) * 4 + 8, asuint(bbox.y)); + _81.Store((base + 2u) * 4 + 8, asuint(bbox.z)); + _81.Store((base + 3u) * 4 + 8, asuint(bbox.w)); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +MallocResult malloc(uint size) +{ + uint _87; + _81.InterlockedAdd(0, size, _87); + uint offset = _87; + uint _94; + _81.GetDimensions(_94); + _94 = (_94 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_94) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _116; + _81.InterlockedMax(4, 1u, _116); + return r; + } + return r; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _81.Store(offset * 4 + 8, val); +} + +void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint 
param_2 = s.element_ix; + write_mem(param, param_1, param_2); +} + +void comp_main() +{ + uint my_partition = gl_WorkGroupID.x; + for (uint i = 0u; i < 8u; i++) + { + bitmaps[i][gl_LocalInvocationID.x] = 0u; + } + if (gl_LocalInvocationID.x == 0u) + { + sh_alloc_failed = false; + } + GroupMemoryBarrierWithGroupSync(); + uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + if (element_ix < _156.Load(0)) + { + uint param = element_ix; + DrawMonoid draw_monoid = load_draw_monoid(param); + uint path_ix = draw_monoid.path_ix; + float4 clip_bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + uint clip_ix = draw_monoid.clip_ix; + if (clip_ix > 0u) + { + uint param_1 = clip_ix - 1u; + clip_bbox = load_clip_bbox(param_1); + } + uint param_2 = path_ix; + float4 path_bbox = load_path_bbox(param_2); + float4 param_3 = path_bbox; + float4 param_4 = clip_bbox; + float4 bbox = bbox_intersect(param_3, param_4); + float4 _417 = bbox; + float4 _419 = bbox; + float2 _421 = max(_417.xy, _419.zw); + bbox.z = _421.x; + bbox.w = _421.y; + uint param_5 = element_ix; + float4 param_6 = bbox; + store_draw_bbox(param_5, param_6); + x0 = int(floor(bbox.x * 0.00390625f)); + y0 = int(floor(bbox.y * 0.00390625f)); + x1 = int(ceil(bbox.z * 0.00390625f)); + y1 = int(ceil(bbox.w * 0.00390625f)); + } + uint width_in_bins = ((_156.Load(8) + 16u) - 1u) / 16u; + uint height_in_bins = ((_156.Load(12) + 16u) - 1u) / 16u; + x0 = clamp(x0, 0, int(width_in_bins)); + x1 = clamp(x1, x0, int(width_in_bins)); + y0 = clamp(y0, 0, int(height_in_bins)); + y1 = clamp(y1, y0, int(height_in_bins)); + if (x0 == x1) + { + y1 = y0; + } + int x = x0; + int y = y0; + uint my_slice = gl_LocalInvocationID.x / 32u; + uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); + while (y < y1) + { + uint _523; + InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _523); + x++; + if (x == x1) + { + x = x0; + 
y++; + } + } + GroupMemoryBarrierWithGroupSync(); + uint element_count = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x]))); + count[i_1][gl_LocalInvocationID.x] = element_count; + } + uint param_7 = 0u; + uint param_8 = 0u; + bool param_9 = true; + Alloc chunk_alloc = new_alloc(param_7, param_8, param_9); + if (element_count != 0u) + { + uint param_10 = element_count * 4u; + MallocResult _573 = malloc(param_10); + MallocResult chunk = _573; + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) + { + sh_alloc_failed = true; + } + } + uint out_ix = (_156.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc _603; + _603.offset = _156.Load(20); + Alloc param_11; + param_11.offset = _603.offset; + uint param_12 = out_ix; + uint param_13 = element_count; + write_mem(param_11, param_12, param_13); + Alloc _615; + _615.offset = _156.Load(20); + Alloc param_14; + param_14.offset = _615.offset; + uint param_15 = out_ix + 1u; + uint param_16 = chunk_alloc.offset; + write_mem(param_14, param_15, param_16); + GroupMemoryBarrierWithGroupSync(); + bool _630; + if (!sh_alloc_failed) + { + _630 = _81.Load(4) != 0u; + } + else + { + _630 = sh_alloc_failed; + } + if (_630) + { + return; + } + x = x0; + y = y0; + while (y < y1) + { + uint bin_ix = (uint(y) * width_in_bins) + uint(x); + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0u) + { + uint idx = uint(int(countbits(out_mask & (my_mask - 1u)))); + if (my_slice > 0u) + { + idx += count[my_slice - 1u][bin_ix]; + } + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + (idx * 4u); + BinInstanceRef _692 = { out_offset }; + BinInstance _694 = { element_ix }; + Alloc param_17 = out_alloc; + BinInstanceRef param_18 = _692; + BinInstance param_19 = _694; + BinInstance_write(param_17, param_18, param_19); + } + x++; + if (x == 
x1) + { + x = x0; + y++; + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl new file mode 100644 index 0000000..2ee5168 --- /dev/null +++ b/piet-gpu/shader/gen/binning.msl @@ -0,0 +1,347 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +{ + uint base = 
(v_156.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * element_ix); + uint path_ix = v_81.memory[base]; + uint clip_ix = v_81.memory[base + 1u]; + uint scene_offset = v_81.memory[base + 2u]; + uint info_offset = v_81.memory[base + 3u]; + return DrawMonoid{ path_ix, clip_ix, scene_offset, info_offset }; +} + +static inline __attribute__((always_inline)) +float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +{ + uint base = (v_156.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix); + float x0 = as_type(v_81.memory[base]); + float y0 = as_type(v_81.memory[base + 1u]); + float x1 = as_type(v_81.memory[base + 2u]); + float y1 = as_type(v_81.memory[base + 3u]); + float4 bbox = float4(x0, y0, x1, y1); + return bbox; +} + +static inline __attribute__((always_inline)) +float4 load_path_bbox(thread const uint& path_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +{ + uint base = (v_156.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); + float bbox_l = float(v_81.memory[base]) - 32768.0; + float bbox_t = float(v_81.memory[base + 1u]) - 32768.0; + float bbox_r = float(v_81.memory[base + 2u]) - 32768.0; + float bbox_b = float(v_81.memory[base + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +static inline __attribute__((always_inline)) +float4 bbox_intersect(thread const float4& a, thread const float4& b) +{ + return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw)); +} + +static inline __attribute__((always_inline)) +void store_draw_bbox(thread const uint& draw_ix, thread const float4& bbox, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +{ + uint base = (v_156.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix); + v_81.memory[base] = as_type(bbox.x); + v_81.memory[base + 1u] = as_type(bbox.y); + v_81.memory[base + 2u] = as_type(bbox.z); + v_81.memory[base + 
3u] = as_type(bbox.w); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_81, constant uint& v_81BufferSize) +{ + uint _87 = atomic_fetch_add_explicit((device atomic_uint*)&v_81.mem_offset, size, memory_order_relaxed); + uint offset = _87; + MallocResult r; + r.failed = (offset + size) > uint(int((v_81BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _116 = atomic_fetch_max_explicit((device atomic_uint*)&v_81.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_81, constant uint& v_81BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_81.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_81, constant uint& v_81BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.element_ix; + write_mem(param, param_1, param_2, v_81, v_81BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_81 [[buffer(0)]], const device ConfigBuf& v_156 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + 
threadgroup uint bitmaps[8][256]; + threadgroup short sh_alloc_failed; + threadgroup uint count[8][256]; + threadgroup Alloc sh_chunk_alloc[256]; + constant uint& v_81BufferSize = spvBufferSizeConstants[0]; + uint my_partition = gl_WorkGroupID.x; + for (uint i = 0u; i < 8u; i++) + { + bitmaps[i][gl_LocalInvocationID.x] = 0u; + } + if (gl_LocalInvocationID.x == 0u) + { + sh_alloc_failed = short(false); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + if (element_ix < v_156.conf.n_elements) + { + uint param = element_ix; + DrawMonoid draw_monoid = load_draw_monoid(param, v_81, v_81BufferSize, v_156); + uint path_ix = draw_monoid.path_ix; + float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + uint clip_ix = draw_monoid.clip_ix; + if (clip_ix > 0u) + { + uint param_1 = clip_ix - 1u; + clip_bbox = load_clip_bbox(param_1, v_81, v_81BufferSize, v_156); + } + uint param_2 = path_ix; + float4 path_bbox = load_path_bbox(param_2, v_81, v_81BufferSize, v_156); + float4 param_3 = path_bbox; + float4 param_4 = clip_bbox; + float4 bbox = bbox_intersect(param_3, param_4); + float4 _417 = bbox; + float4 _419 = bbox; + float2 _421 = fast::max(_417.xy, _419.zw); + bbox.z = _421.x; + bbox.w = _421.y; + uint param_5 = element_ix; + float4 param_6 = bbox; + store_draw_bbox(param_5, param_6, v_81, v_81BufferSize, v_156); + x0 = int(floor(bbox.x * 0.00390625)); + y0 = int(floor(bbox.y * 0.00390625)); + x1 = int(ceil(bbox.z * 0.00390625)); + y1 = int(ceil(bbox.w * 0.00390625)); + } + uint width_in_bins = ((v_156.conf.width_in_tiles + 16u) - 1u) / 16u; + uint height_in_bins = ((v_156.conf.height_in_tiles + 16u) - 1u) / 16u; + x0 = clamp(x0, 0, int(width_in_bins)); + x1 = clamp(x1, x0, int(width_in_bins)); + y0 = clamp(y0, 0, int(height_in_bins)); + y1 = clamp(y1, y0, int(height_in_bins)); + if (x0 == x1) + { + y1 = y0; + } + 
int x = x0; + int y = y0; + uint my_slice = gl_LocalInvocationID.x / 32u; + uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); + while (y < y1) + { + uint _523 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); + x++; + if (x == x1) + { + x = x0; + y++; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint element_count = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x]))); + count[i_1][gl_LocalInvocationID.x] = element_count; + } + uint param_7 = 0u; + uint param_8 = 0u; + bool param_9 = true; + Alloc chunk_alloc = new_alloc(param_7, param_8, param_9); + if (element_count != 0u) + { + uint param_10 = element_count * 4u; + MallocResult _573 = malloc(param_10, v_81, v_81BufferSize); + MallocResult chunk = _573; + chunk_alloc = chunk.alloc; + sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; + if (chunk.failed) + { + sh_alloc_failed = short(true); + } + } + uint out_ix = (v_156.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc param_11; + param_11.offset = v_156.conf.bin_alloc.offset; + uint param_12 = out_ix; + uint param_13 = element_count; + write_mem(param_11, param_12, param_13, v_81, v_81BufferSize); + Alloc param_14; + param_14.offset = v_156.conf.bin_alloc.offset; + uint param_15 = out_ix + 1u; + uint param_16 = chunk_alloc.offset; + write_mem(param_14, param_15, param_16, v_81, v_81BufferSize); + threadgroup_barrier(mem_flags::mem_threadgroup); + bool _630; + if (!bool(sh_alloc_failed)) + { + _630 = v_81.mem_error != 0u; + } + else + { + _630 = bool(sh_alloc_failed); + } + if (_630) + { + return; + } + x = x0; + y = y0; + while (y < y1) + { + uint bin_ix = (uint(y) * width_in_bins) + uint(x); + uint out_mask = bitmaps[my_slice][bin_ix]; + if ((out_mask & my_mask) != 0u) + { + uint idx = uint(int(popcount(out_mask & (my_mask - 
1u)))); + if (my_slice > 0u) + { + idx += count[my_slice - 1u][bin_ix]; + } + Alloc out_alloc = sh_chunk_alloc[bin_ix]; + uint out_offset = out_alloc.offset + (idx * 4u); + Alloc param_17 = out_alloc; + BinInstanceRef param_18 = BinInstanceRef{ out_offset }; + BinInstance param_19 = BinInstance{ element_ix }; + BinInstance_write(param_17, param_18, param_19, v_81, v_81BufferSize); + } + x++; + if (x == x1) + { + x = x0; + y++; + } + } +} + diff --git a/piet-gpu/shader/gen/binning.spv b/piet-gpu/shader/gen/binning.spv new file mode 100644 index 0000000..30eacd6 Binary files /dev/null and b/piet-gpu/shader/gen/binning.spv differ diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil new file mode 100644 index 0000000..29a158e Binary files /dev/null and b/piet-gpu/shader/gen/clip_leaf.dxil differ diff --git a/piet-gpu/shader/gen/clip_leaf.hlsl b/piet-gpu/shader/gen/clip_leaf.hlsl new file mode 100644 index 0000000..ed45bf1 --- /dev/null +++ b/piet-gpu/shader/gen/clip_leaf.hlsl @@ -0,0 +1,371 @@ +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const Bic _393 = { 0u, 0u }; + +ByteAddressBuffer _80 : register(t1, space0); +RWByteAddressBuffer _96 : register(u0, space0); + +static uint3 
gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Bic sh_bic[510]; +groupshared uint sh_stack[256]; +groupshared float4 sh_stack_bbox[256]; +groupshared uint sh_link[256]; +groupshared float4 sh_bbox[256]; + +Bic load_bic(uint ix) +{ + uint base = (_80.Load(52) >> uint(2)) + (2u * ix); + Bic _286 = { _96.Load(base * 4 + 8), _96.Load((base + 1u) * 4 + 8) }; + return _286; +} + +Bic bic_combine(Bic x, Bic y) +{ + uint m = min(x.b, y.a); + Bic _72 = { (x.a + y.a) - m, (x.b + y.b) - m }; + return _72; +} + +ClipEl load_clip_el(uint ix) +{ + uint base = (_80.Load(56) >> uint(2)) + (5u * ix); + uint parent_ix = _96.Load(base * 4 + 8); + float x0 = asfloat(_96.Load((base + 1u) * 4 + 8)); + float y0 = asfloat(_96.Load((base + 2u) * 4 + 8)); + float x1 = asfloat(_96.Load((base + 3u) * 4 + 8)); + float y1 = asfloat(_96.Load((base + 4u) * 4 + 8)); + float4 bbox = float4(x0, y0, x1, y1); + ClipEl _335 = { parent_ix, bbox }; + return _335; +} + +float4 bbox_intersect(float4 a, float4 b) +{ + return float4(max(a.xy, b.xy), min(a.zw, b.zw)); +} + +uint load_path_ix(uint ix) +{ + if (ix < _80.Load(80)) + { + return _96.Load(((_80.Load(48) >> uint(2)) + ix) * 4 + 8); + } + else + { + return 2147483648u; + } +} + +float4 load_path_bbox(uint path_ix) +{ + uint base = (_80.Load(40) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_96.Load(base * 4 + 8)) - 32768.0f; + float bbox_t = float(_96.Load((base + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_96.Load((base + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_96.Load((base + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +uint search_link(inout Bic bic) +{ + uint ix = gl_LocalInvocationID.x; + uint j = 0u; + while (j < 8u) + { + uint base = 512u 
- (2u << (8u - j)); + if (((ix >> j) & 1u) != 0u) + { + Bic param = sh_bic[(base + (ix >> j)) - 1u]; + Bic param_1 = bic; + Bic test = bic_combine(param, param_1); + if (test.b > 0u) + { + break; + } + bic = test; + ix -= (1u << j); + } + j++; + } + if (ix > 0u) + { + while (j > 0u) + { + j--; + uint base_1 = 512u - (2u << (8u - j)); + Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u]; + Bic param_3 = bic; + Bic test_1 = bic_combine(param_2, param_3); + if (test_1.b == 0u) + { + bic = test_1; + ix -= (1u << j); + } + } + } + if (ix > 0u) + { + return ix - 1u; + } + else + { + return 4294967295u - bic.a; + } +} + +void store_clip_bbox(uint ix, float4 bbox) +{ + uint base = (_80.Load(60) >> uint(2)) + (4u * ix); + _96.Store(base * 4 + 8, asuint(bbox.x)); + _96.Store((base + 1u) * 4 + 8, asuint(bbox.y)); + _96.Store((base + 2u) * 4 + 8, asuint(bbox.z)); + _96.Store((base + 3u) * 4 + 8, asuint(bbox.w)); +} + +void comp_main() +{ + uint th = gl_LocalInvocationID.x; + Bic bic = _393; + if (th < gl_WorkGroupID.x) + { + uint param = th; + bic = load_bic(param); + } + sh_bic[th] = bic; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[th + (1u << i)]; + Bic param_1 = bic; + Bic param_2 = other; + bic = bic_combine(param_1, param_2); + } + GroupMemoryBarrierWithGroupSync(); + sh_bic[th] = bic; + } + GroupMemoryBarrierWithGroupSync(); + uint stack_size = sh_bic[0].b; + uint sp = 255u - th; + uint ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = ix + (128u >> i_1); + if (sp < sh_bic[probe].b) + { + ix = probe; + } + } + uint b = sh_bic[ix].b; + float4 bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + if (sp < b) + { + uint param_3 = (((ix * 256u) + b) - sp) - 1u; + ClipEl el = load_clip_el(param_3); + sh_stack[th] = el.parent_ix; + bbox = el.bbox; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + sh_stack_bbox[th] = bbox; + 
GroupMemoryBarrierWithGroupSync(); + if (th >= (1u << i_2)) + { + float4 param_4 = sh_stack_bbox[th - (1u << i_2)]; + float4 param_5 = bbox; + bbox = bbox_intersect(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + } + sh_stack_bbox[th] = bbox; + uint param_6 = gl_GlobalInvocationID.x; + uint inp = load_path_ix(param_6); + bool is_push = int(inp) >= 0; + Bic _559 = { 1u - uint(is_push), uint(is_push) }; + bic = _559; + sh_bic[th] = bic; + if (is_push) + { + uint param_7 = inp; + bbox = load_path_bbox(param_7); + } + else + { + bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + } + uint inbase = 0u; + for (uint i_3 = 0u; i_3 < 7u; i_3++) + { + uint outbase = 512u - (1u << (8u - i_3)); + GroupMemoryBarrierWithGroupSync(); + if (th < (1u << (7u - i_3))) + { + Bic param_8 = sh_bic[inbase + (th * 2u)]; + Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u]; + sh_bic[outbase + th] = bic_combine(param_8, param_9); + } + inbase = outbase; + } + GroupMemoryBarrierWithGroupSync(); + bic = _393; + Bic param_10 = bic; + uint _618 = search_link(param_10); + bic = param_10; + uint link = _618; + sh_link[th] = link; + GroupMemoryBarrierWithGroupSync(); + uint grandparent; + if (int(link) >= 0) + { + grandparent = sh_link[link]; + } + else + { + grandparent = link - 1u; + } + uint parent; + if (int(link) >= 0) + { + parent = (gl_WorkGroupID.x * 256u) + link; + } + else + { + if (int(link + stack_size) >= 0) + { + parent = sh_stack[256u + link]; + } + else + { + parent = 4294967295u; + } + } + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + if (i_4 != 0u) + { + sh_link[th] = link; + } + sh_bbox[th] = bbox; + GroupMemoryBarrierWithGroupSync(); + if (int(link) >= 0) + { + float4 param_11 = sh_bbox[link]; + float4 param_12 = bbox; + bbox = bbox_intersect(param_11, param_12); + link = sh_link[link]; + } + GroupMemoryBarrierWithGroupSync(); + } + if (int(link + stack_size) >= 0) + { + float4 param_13 = sh_stack_bbox[256u + link]; + float4 param_14 = bbox; + 
bbox = bbox_intersect(param_13, param_14); + } + sh_bbox[th] = bbox; + GroupMemoryBarrierWithGroupSync(); + uint path_ix = inp; + bool _717 = !is_push; + bool _725; + if (_717) + { + _725 = gl_GlobalInvocationID.x < _80.Load(80); + } + else + { + _725 = _717; + } + if (_725) + { + uint param_15 = parent; + path_ix = load_path_ix(param_15); + uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (4u * (~inp)); + _96.Store(drawmonoid_out_base * 4 + 8, path_ix); + if (int(grandparent) >= 0) + { + bbox = sh_bbox[grandparent]; + } + else + { + if (int(grandparent + stack_size) >= 0) + { + bbox = sh_stack_bbox[256u + grandparent]; + } + else + { + bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + } + } + } + uint param_16 = gl_GlobalInvocationID.x; + float4 param_17 = bbox; + store_clip_bbox(param_16, param_17); +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/clip_leaf.msl b/piet-gpu/shader/gen/clip_leaf.msl new file mode 100644 index 0000000..5f5e0a7 --- /dev/null +++ b/piet-gpu/shader/gen/clip_leaf.msl @@ -0,0 +1,370 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint 
n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Bic load_bic(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix); + return Bic{ v_96.memory[base], v_96.memory[base + 1u] }; +} + +static inline __attribute__((always_inline)) +Bic bic_combine(thread const Bic& x, thread const Bic& y) +{ + uint m = min(x.b, y.a); + return Bic{ (x.a + y.a) - m, (x.b + y.b) - m }; +} + +static inline __attribute__((always_inline)) +ClipEl load_clip_el(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix); + uint parent_ix = v_96.memory[base]; + float x0 = as_type(v_96.memory[base + 1u]); + float y0 = as_type(v_96.memory[base + 2u]); + float x1 = as_type(v_96.memory[base + 3u]); + float y1 = as_type(v_96.memory[base + 4u]); + float4 bbox = float4(x0, y0, x1, y1); + return ClipEl{ parent_ix, bbox }; +} + +static inline __attribute__((always_inline)) +float4 bbox_intersect(thread const float4& a, thread const float4& b) +{ + return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw)); +} + +static inline __attribute__((always_inline)) +uint load_path_ix(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + if (ix < v_80.conf.n_clip) + { + return v_96.memory[(v_80.conf.clip_alloc.offset >> uint(2)) + ix]; + } + else + { + return 2147483648u; + } +} + +static inline __attribute__((always_inline)) +float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = 
(v_80.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); + float bbox_l = float(v_96.memory[base]) - 32768.0; + float bbox_t = float(v_96.memory[base + 1u]) - 32768.0; + float bbox_r = float(v_96.memory[base + 2u]) - 32768.0; + float bbox_b = float(v_96.memory[base + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +static inline __attribute__((always_inline)) +uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[510]) +{ + uint ix = gl_LocalInvocationID.x; + uint j = 0u; + while (j < 8u) + { + uint base = 512u - (2u << (8u - j)); + if (((ix >> j) & 1u) != 0u) + { + Bic param = sh_bic[(base + (ix >> j)) - 1u]; + Bic param_1 = bic; + Bic test = bic_combine(param, param_1); + if (test.b > 0u) + { + break; + } + bic = test; + ix -= (1u << j); + } + j++; + } + if (ix > 0u) + { + while (j > 0u) + { + j--; + uint base_1 = 512u - (2u << (8u - j)); + Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u]; + Bic param_3 = bic; + Bic test_1 = bic_combine(param_2, param_3); + if (test_1.b == 0u) + { + bic = test_1; + ix -= (1u << j); + } + } + } + if (ix > 0u) + { + return ix - 1u; + } + else + { + return 4294967295u - bic.a; + } +} + +static inline __attribute__((always_inline)) +void store_clip_bbox(thread const uint& ix, thread const float4& bbox, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * ix); + v_96.memory[base] = as_type(bbox.x); + v_96.memory[base + 1u] = as_type(bbox.y); + v_96.memory[base + 2u] = as_type(bbox.z); + v_96.memory[base + 3u] = as_type(bbox.w); +} + +kernel void main0(device Memory& v_96 [[buffer(0)]], const device ConfigBuf& v_80 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + threadgroup Bic sh_bic[510]; + threadgroup uint sh_stack[256]; + 
threadgroup float4 sh_stack_bbox[256]; + threadgroup uint sh_link[256]; + threadgroup float4 sh_bbox[256]; + uint th = gl_LocalInvocationID.x; + Bic bic = Bic{ 0u, 0u }; + if (th < gl_WorkGroupID.x) + { + uint param = th; + bic = load_bic(param, v_80, v_96); + } + sh_bic[th] = bic; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[th + (1u << i)]; + Bic param_1 = bic; + Bic param_2 = other; + bic = bic_combine(param_1, param_2); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_bic[th] = bic; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint stack_size = sh_bic[0].b; + uint sp = 255u - th; + uint ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = ix + (128u >> i_1); + if (sp < sh_bic[probe].b) + { + ix = probe; + } + } + uint b = sh_bic[ix].b; + float4 bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + if (sp < b) + { + uint param_3 = (((ix * 256u) + b) - sp) - 1u; + ClipEl el = load_clip_el(param_3, v_80, v_96); + sh_stack[th] = el.parent_ix; + bbox = el.bbox; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + sh_stack_bbox[th] = bbox; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th >= (1u << i_2)) + { + float4 param_4 = sh_stack_bbox[th - (1u << i_2)]; + float4 param_5 = bbox; + bbox = bbox_intersect(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + sh_stack_bbox[th] = bbox; + uint param_6 = gl_GlobalInvocationID.x; + uint inp = load_path_ix(param_6, v_80, v_96); + bool is_push = int(inp) >= 0; + bic = Bic{ 1u - uint(is_push), uint(is_push) }; + sh_bic[th] = bic; + if (is_push) + { + uint param_7 = inp; + bbox = load_path_bbox(param_7, v_80, v_96); + } + else + { + bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + } + uint inbase = 0u; + for (uint i_3 = 0u; i_3 < 7u; i_3++) + { + uint outbase = 512u - (1u << (8u - i_3)); + 
threadgroup_barrier(mem_flags::mem_threadgroup); + if (th < (1u << (7u - i_3))) + { + Bic param_8 = sh_bic[inbase + (th * 2u)]; + Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u]; + sh_bic[outbase + th] = bic_combine(param_8, param_9); + } + inbase = outbase; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + bic = Bic{ 0u, 0u }; + Bic param_10 = bic; + uint _618 = search_link(param_10, gl_LocalInvocationID, sh_bic); + bic = param_10; + uint link = _618; + sh_link[th] = link; + threadgroup_barrier(mem_flags::mem_threadgroup); + uint grandparent; + if (int(link) >= 0) + { + grandparent = sh_link[link]; + } + else + { + grandparent = link - 1u; + } + uint parent; + if (int(link) >= 0) + { + parent = (gl_WorkGroupID.x * 256u) + link; + } + else + { + if (int(link + stack_size) >= 0) + { + parent = sh_stack[256u + link]; + } + else + { + parent = 4294967295u; + } + } + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + if (i_4 != 0u) + { + sh_link[th] = link; + } + sh_bbox[th] = bbox; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (int(link) >= 0) + { + float4 param_11 = sh_bbox[link]; + float4 param_12 = bbox; + bbox = bbox_intersect(param_11, param_12); + link = sh_link[link]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + if (int(link + stack_size) >= 0) + { + float4 param_13 = sh_stack_bbox[256u + link]; + float4 param_14 = bbox; + bbox = bbox_intersect(param_13, param_14); + } + sh_bbox[th] = bbox; + threadgroup_barrier(mem_flags::mem_threadgroup); + uint path_ix = inp; + bool _717 = !is_push; + bool _725; + if (_717) + { + _725 = gl_GlobalInvocationID.x < v_80.conf.n_clip; + } + else + { + _725 = _717; + } + if (_725) + { + uint param_15 = parent; + path_ix = load_path_ix(param_15, v_80, v_96); + uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * (~inp)); + v_96.memory[drawmonoid_out_base] = path_ix; + if (int(grandparent) >= 0) + { + bbox = sh_bbox[grandparent]; + } + else + { + if (int(grandparent + stack_size) 
>= 0) + { + bbox = sh_stack_bbox[256u + grandparent]; + } + else + { + bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + } + } + } + uint param_16 = gl_GlobalInvocationID.x; + float4 param_17 = bbox; + store_clip_bbox(param_16, param_17, v_80, v_96); +} + diff --git a/piet-gpu/shader/gen/clip_leaf.spv b/piet-gpu/shader/gen/clip_leaf.spv new file mode 100644 index 0000000..beac64b Binary files /dev/null and b/piet-gpu/shader/gen/clip_leaf.spv differ diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil new file mode 100644 index 0000000..0dff71b Binary files /dev/null and b/piet-gpu/shader/gen/clip_reduce.dxil differ diff --git a/piet-gpu/shader/gen/clip_reduce.hlsl b/piet-gpu/shader/gen/clip_reduce.hlsl new file mode 100644 index 0000000..1276b5f --- /dev/null +++ b/piet-gpu/shader/gen/clip_reduce.hlsl @@ -0,0 +1,181 @@ +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const Bic _267 = { 0u, 0u }; + +ByteAddressBuffer _64 : register(t1, space0); +RWByteAddressBuffer _80 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : 
SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Bic sh_bic[256]; +groupshared uint sh_parent[256]; +groupshared uint sh_path_ix[256]; +groupshared float4 sh_bbox[256]; + +Bic bic_combine(Bic x, Bic y) +{ + uint m = min(x.b, y.a); + Bic _56 = { (x.a + y.a) - m, (x.b + y.b) - m }; + return _56; +} + +void store_bic(uint ix, Bic bic) +{ + uint base = (_64.Load(52) >> uint(2)) + (2u * ix); + _80.Store(base * 4 + 8, bic.a); + _80.Store((base + 1u) * 4 + 8, bic.b); +} + +float4 load_path_bbox(uint path_ix) +{ + uint base = (_64.Load(40) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_80.Load(base * 4 + 8)) - 32768.0f; + float bbox_t = float(_80.Load((base + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_80.Load((base + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_80.Load((base + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +void store_clip_el(uint ix, ClipEl el) +{ + uint base = (_64.Load(56) >> uint(2)) + (5u * ix); + _80.Store(base * 4 + 8, el.parent_ix); + _80.Store((base + 1u) * 4 + 8, asuint(el.bbox.x)); + _80.Store((base + 2u) * 4 + 8, asuint(el.bbox.y)); + _80.Store((base + 3u) * 4 + 8, asuint(el.bbox.z)); + _80.Store((base + 4u) * 4 + 8, asuint(el.bbox.w)); +} + +void comp_main() +{ + uint th = gl_LocalInvocationID.x; + uint inp = _80.Load(((_64.Load(48) >> uint(2)) + gl_GlobalInvocationID.x) * 4 + 8); + bool is_push = int(inp) >= 0; + Bic _207 = { 1u - uint(is_push), uint(is_push) }; + Bic bic = _207; + sh_bic[gl_LocalInvocationID.x] = bic; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)]; + Bic param = bic; + Bic param_1 = other; + bic = bic_combine(param, param_1); + } + GroupMemoryBarrierWithGroupSync(); + sh_bic[th] = bic; + } + if (th == 0u) + { + uint param_2 = gl_WorkGroupID.x; + Bic 
param_3 = bic; + store_bic(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + uint size = sh_bic[0].b; + bic = _267; + if ((th + 1u) < 256u) + { + bic = sh_bic[th + 1u]; + } + bool _283; + if (is_push) + { + _283 = bic.a == 0u; + } + else + { + _283 = is_push; + } + if (_283) + { + uint local_ix = (size - bic.b) - 1u; + sh_parent[local_ix] = th; + sh_path_ix[local_ix] = inp; + } + GroupMemoryBarrierWithGroupSync(); + float4 bbox; + if (th < size) + { + uint path_ix = sh_path_ix[th]; + uint param_4 = path_ix; + bbox = load_path_bbox(param_4); + } + if (th < size) + { + uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); + ClipEl _331 = { parent_ix, bbox }; + ClipEl el = _331; + uint param_5 = gl_GlobalInvocationID.x; + ClipEl param_6 = el; + store_clip_el(param_5, param_6); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/clip_reduce.msl b/piet-gpu/shader/gen/clip_reduce.msl new file mode 100644 index 0000000..26214f1 --- /dev/null +++ b/piet-gpu/shader/gen/clip_reduce.msl @@ -0,0 +1,177 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint 
trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Bic bic_combine(thread const Bic& x, thread const Bic& y) +{ + uint m = min(x.b, y.a); + return Bic{ (x.a + y.a) - m, (x.b + y.b) - m }; +} + +static inline __attribute__((always_inline)) +void store_bic(thread const uint& ix, thread const Bic& bic, const device ConfigBuf& v_64, device Memory& v_80) +{ + uint base = (v_64.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix); + v_80.memory[base] = bic.a; + v_80.memory[base + 1u] = bic.b; +} + +static inline __attribute__((always_inline)) +float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_64, device Memory& v_80) +{ + uint base = (v_64.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); + float bbox_l = float(v_80.memory[base]) - 32768.0; + float bbox_t = float(v_80.memory[base + 1u]) - 32768.0; + float bbox_r = float(v_80.memory[base + 2u]) - 32768.0; + float bbox_b = float(v_80.memory[base + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +static inline __attribute__((always_inline)) +void store_clip_el(thread const uint& ix, thread const ClipEl& el, const device ConfigBuf& v_64, device Memory& v_80) +{ + uint base = (v_64.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix); + v_80.memory[base] = el.parent_ix; + v_80.memory[base + 1u] = as_type(el.bbox.x); + v_80.memory[base + 2u] = as_type(el.bbox.y); + v_80.memory[base + 3u] = as_type(el.bbox.z); + v_80.memory[base + 4u] = as_type(el.bbox.w); +} + +kernel void main0(device Memory& v_80 [[buffer(0)]], const device ConfigBuf& v_64 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 
gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Bic sh_bic[256]; + threadgroup uint sh_parent[256]; + threadgroup uint sh_path_ix[256]; + threadgroup float4 sh_bbox[256]; + uint th = gl_LocalInvocationID.x; + uint inp = v_80.memory[(v_64.conf.clip_alloc.offset >> uint(2)) + gl_GlobalInvocationID.x]; + bool is_push = int(inp) >= 0; + Bic bic = Bic{ 1u - uint(is_push), uint(is_push) }; + sh_bic[gl_LocalInvocationID.x] = bic; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)]; + Bic param = bic; + Bic param_1 = other; + bic = bic_combine(param, param_1); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_bic[th] = bic; + } + if (th == 0u) + { + uint param_2 = gl_WorkGroupID.x; + Bic param_3 = bic; + store_bic(param_2, param_3, v_64, v_80); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint size = sh_bic[0].b; + bic = Bic{ 0u, 0u }; + if ((th + 1u) < 256u) + { + bic = sh_bic[th + 1u]; + } + bool _283; + if (is_push) + { + _283 = bic.a == 0u; + } + else + { + _283 = is_push; + } + if (_283) + { + uint local_ix = (size - bic.b) - 1u; + sh_parent[local_ix] = th; + sh_path_ix[local_ix] = inp; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + float4 bbox; + if (th < size) + { + uint path_ix = sh_path_ix[th]; + uint param_4 = path_ix; + bbox = load_path_bbox(param_4, v_64, v_80); + } + if (th < size) + { + uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); + ClipEl el = ClipEl{ parent_ix, bbox }; + uint param_5 = gl_GlobalInvocationID.x; + ClipEl param_6 = el; + store_clip_el(param_5, param_6, v_64, v_80); + } +} + diff --git a/piet-gpu/shader/gen/clip_reduce.spv b/piet-gpu/shader/gen/clip_reduce.spv new file mode 100644 index 0000000..ce0b9bb Binary files /dev/null and b/piet-gpu/shader/gen/clip_reduce.spv differ diff --git 
a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil new file mode 100644 index 0000000..f71cc04 Binary files /dev/null and b/piet-gpu/shader/gen/coarse.dxil differ diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl new file mode 100644 index 0000000..a7f769f --- /dev/null +++ b/piet-gpu/shader/gen/coarse.hlsl @@ -0,0 +1,1254 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct Tile +{ + TileSegRef tile; + int backdrop; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + 
Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _266 : register(u0, space0); +ByteAddressBuffer _1020 : register(t1, space0); +ByteAddressBuffer _1399 : register(t2, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint sh_bitmaps[8][256]; +groupshared Alloc sh_part_elements[256]; +groupshared uint sh_part_count[256]; +groupshared uint sh_elements[256]; +groupshared uint sh_tile_stride[256]; +groupshared uint sh_tile_width[256]; +groupshared uint sh_tile_x0[256]; +groupshared uint sh_tile_y0[256]; +groupshared uint sh_tile_base[256]; +groupshared uint sh_tile_count[256]; + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _343 = { a.offset + offset }; + return _343; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _266.Load(offset * 4 + 8); + return v; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) +{ + BinInstanceRef _361 = { ref.offset + (index * 4u) }; + return _361; +} + +BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + BinInstance s; + s.element_ix = raw0; + return s; +} + +Path Path_read(Alloc a, PathRef ref) +{ + 
uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _424 = { raw2 }; + s.tiles = _424; + return s; +} + +void write_tile_alloc(uint el_ix, Alloc a) +{ +} + +Alloc read_tile_alloc(uint el_ix, bool mem_ok) +{ + uint _907; + _266.GetDimensions(_907); + _907 = (_907 - 8) / 4; + uint param = 0u; + uint param_1 = uint(int(_907) * 4); + bool param_2 = mem_ok; + return new_alloc(param, param_1, param_2); +} + +Tile Tile_read(Alloc a, TileRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + TileSegRef _449 = { raw0 }; + Tile s; + s.tile = _449; + s.backdrop = int(raw1); + return s; +} + +MallocResult malloc(uint size) +{ + uint _272; + _266.InterlockedAdd(0, size, _272); + uint offset = _272; + uint _279; + _266.GetDimensions(_279); + _279 = (_279 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_279) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _301; + _266.InterlockedMax(4, 1u, _301); + return r; + } + return r; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _266.Store(offset * 4 + 8, val); +} + +void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.new_ref; + write_mem(param, param_1, param_2); +} + +void 
Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 11u; + write_mem(param, param_1, param_2); + CmdJumpRef _900 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdJumpRef param_4 = _900; + CmdJump param_5 = s; + CmdJump_write(param_3, param_4, param_5); +} + +bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) +{ + if (cmd_ref.offset < cmd_limit) + { + return true; + } + uint param = 1024u; + MallocResult _928 = malloc(param); + MallocResult new_cmd = _928; + if (new_cmd.failed) + { + return false; + } + CmdJump _938 = { new_cmd.alloc.offset }; + CmdJump jump = _938; + Alloc param_1 = cmd_alloc; + CmdRef param_2 = cmd_ref; + CmdJump param_3 = jump; + Cmd_Jump_write(param_1, param_2, param_3); + cmd_alloc = new_cmd.alloc; + CmdRef _950 = { cmd_alloc.offset }; + cmd_ref = _950; + cmd_limit = (cmd_alloc.offset + 1024u) - 144u; + return true; +} + +void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = uint(s.backdrop); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 1u; + write_mem(param, param_1, param_2); + CmdFillRef _757 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdFillRef param_4 = _757; + CmdFill param_5 = s; + CmdFill_write(param_3, param_4, param_5); +} + +void Cmd_Solid_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 3u; + write_mem(param, param_1, param_2); +} + +void CmdStroke_write(Alloc a, CmdStrokeRef ref, CmdStroke s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, 
param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.half_width); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 2u; + write_mem(param, param_1, param_2); + CmdStrokeRef _775 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdStrokeRef param_4 = _775; + CmdStroke param_5 = s; + CmdStroke_write(param_3, param_4, param_5); +} + +void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) +{ + if (linewidth < 0.0f) + { + if (tile.tile.offset != 0u) + { + CmdFill _973 = { tile.tile.offset, tile.backdrop }; + CmdFill cmd_fill = _973; + Alloc param = alloc; + CmdRef param_1 = cmd_ref; + CmdFill param_2 = cmd_fill; + Cmd_Fill_write(param, param_1, param_2); + cmd_ref.offset += 12u; + } + else + { + Alloc param_3 = alloc; + CmdRef param_4 = cmd_ref; + Cmd_Solid_write(param_3, param_4); + cmd_ref.offset += 4u; + } + } + else + { + CmdStroke _1003 = { tile.tile.offset, 0.5f * linewidth }; + CmdStroke cmd_stroke = _1003; + Alloc param_5 = alloc; + CmdRef param_6 = cmd_ref; + CmdStroke param_7 = cmd_stroke; + Cmd_Stroke_write(param_5, param_6, param_7); + cmd_ref.offset += 12u; + } +} + +void CmdColor_write(Alloc a, CmdColorRef ref, CmdColor s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.rgba_color; + write_mem(param, param_1, param_2); +} + +void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 5u; + write_mem(param, param_1, param_2); + CmdColorRef _801 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdColorRef param_4 = _801; + CmdColor param_5 = s; + CmdColor_write(param_3, param_4, param_5); +} + +void CmdLinGrad_write(Alloc a, CmdLinGradRef ref, CmdLinGrad s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = 
s.index; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.line_x); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.line_y); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.line_c); + write_mem(param_9, param_10, param_11); +} + +void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 6u; + write_mem(param, param_1, param_2); + CmdLinGradRef _819 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdLinGradRef param_4 = _819; + CmdLinGrad param_5 = s; + CmdLinGrad_write(param_3, param_4, param_5); +} + +void CmdRadGrad_write(Alloc a, CmdRadGradRef ref, CmdRadGrad s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.mat.x); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.mat.y); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.mat.z); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.mat.w); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.xlat.x); + write_mem(param_15, param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = asuint(s.xlat.y); + write_mem(param_18, param_19, param_20); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = asuint(s.c1.x); + write_mem(param_21, param_22, param_23); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = asuint(s.c1.y); + write_mem(param_24, param_25, param_26); + Alloc 
param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = asuint(s.ra); + write_mem(param_27, param_28, param_29); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = asuint(s.roff); + write_mem(param_30, param_31, param_32); +} + +void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 7u; + write_mem(param, param_1, param_2); + CmdRadGradRef _837 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdRadGradRef param_4 = _837; + CmdRadGrad param_5 = s; + CmdRadGrad_write(param_3, param_4, param_5); +} + +void CmdImage_write(Alloc a, CmdImageRef ref, CmdImage s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); + write_mem(param_3, param_4, param_5); +} + +void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 8u; + write_mem(param, param_1, param_2); + CmdImageRef _855 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdImageRef param_4 = _855; + CmdImage param_5 = s; + CmdImage_write(param_3, param_4, param_5); +} + +void Cmd_BeginClip_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 9u; + write_mem(param, param_1, param_2); +} + +void CmdEndClip_write(Alloc a, CmdEndClipRef ref, CmdEndClip s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.blend; + write_mem(param, param_1, param_2); +} + +void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 10u; + write_mem(param, param_1, param_2); + CmdEndClipRef _881 = { ref.offset + 4u }; + Alloc param_3 = a; + CmdEndClipRef param_4 = _881; + 
CmdEndClip param_5 = s; + CmdEndClip_write(param_3, param_4, param_5); +} + +void Cmd_End_write(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 0u; + write_mem(param, param_1, param_2); +} + +void alloc_write(Alloc a, uint offset, Alloc alloc) +{ + Alloc param = a; + uint param_1 = offset >> uint(2); + uint param_2 = alloc.offset; + write_mem(param, param_1, param_2); +} + +void comp_main() +{ + uint width_in_bins = ((_1020.Load(8) + 16u) - 1u) / 16u; + uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; + uint partition_ix = 0u; + uint n_partitions = ((_1020.Load(0) + 256u) - 1u) / 256u; + uint th_ix = gl_LocalInvocationID.x; + uint bin_tile_x = 16u * gl_WorkGroupID.x; + uint bin_tile_y = 16u * gl_WorkGroupID.y; + uint tile_x = gl_LocalInvocationID.x % 16u; + uint tile_y = gl_LocalInvocationID.x / 16u; + uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.Load(8)) + bin_tile_x) + tile_x; + Alloc _1085; + _1085.offset = _1020.Load(24); + Alloc param; + param.offset = _1085.offset; + uint param_1 = this_tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef _1094 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1094; + uint cmd_limit = (cmd_ref.offset + 1024u) - 144u; + uint clip_depth = 0u; + uint clip_zero_depth = 0u; + uint rd_ix = 0u; + uint wr_ix = 0u; + uint part_start_ix = 0u; + uint ready_ix = 0u; + Alloc param_3 = cmd_alloc; + uint param_4 = 0u; + uint param_5 = 8u; + Alloc scratch_alloc = slice_mem(param_3, param_4, param_5); + cmd_ref.offset += 4u; + uint render_blend_depth = 0u; + uint max_blend_depth = 0u; + uint drawmonoid_start = _1020.Load(44) >> uint(2); + uint drawtag_start = _1020.Load(100) >> uint(2); + uint drawdata_start = _1020.Load(104) >> uint(2); + uint drawinfo_start = _1020.Load(68) >> uint(2); + bool mem_ok = _266.Load(4) == 0u; + Alloc param_6; + Alloc param_8; + uint _1331; + uint element_ix; + Alloc param_17; + uint 
tile_count; + uint _1632; + float linewidth; + CmdLinGrad cmd_lin; + CmdRadGrad cmd_rad; + while (true) + { + for (uint i = 0u; i < 8u; i++) + { + sh_bitmaps[i][th_ix] = 0u; + } + bool _1383; + for (;;) + { + if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) + { + part_start_ix = ready_ix; + uint count = 0u; + bool _1181 = th_ix < 256u; + bool _1189; + if (_1181) + { + _1189 = (partition_ix + th_ix) < n_partitions; + } + else + { + _1189 = _1181; + } + if (_1189) + { + uint in_ix = (_1020.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + Alloc _1206; + _1206.offset = _1020.Load(20); + param_6.offset = _1206.offset; + uint param_7 = in_ix; + count = read_mem(param_6, param_7); + Alloc _1217; + _1217.offset = _1020.Load(20); + param_8.offset = _1217.offset; + uint param_9 = in_ix + 1u; + uint offset = read_mem(param_8, param_9); + uint param_10 = offset; + uint param_11 = count * 4u; + bool param_12 = mem_ok; + sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12); + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + if (th_ix < 256u) + { + sh_part_count[th_ix] = count; + } + GroupMemoryBarrierWithGroupSync(); + if (th_ix < 256u) + { + if (th_ix >= (1u << i_1)) + { + count += sh_part_count[th_ix - (1u << i_1)]; + } + } + GroupMemoryBarrierWithGroupSync(); + } + if (th_ix < 256u) + { + sh_part_count[th_ix] = part_start_ix + count; + } + GroupMemoryBarrierWithGroupSync(); + ready_ix = sh_part_count[255]; + partition_ix += 256u; + } + uint ix = rd_ix + th_ix; + if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok) + { + uint part_ix = 0u; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + uint probe = part_ix + (128u >> i_2); + if (ix >= sh_part_count[probe - 1u]) + { + part_ix = probe; + } + } + if (part_ix > 0u) + { + _1331 = sh_part_count[part_ix - 1u]; + } + else + { + _1331 = part_start_ix; + } + ix -= _1331; + Alloc bin_alloc = sh_part_elements[part_ix]; + BinInstanceRef _1350 = { bin_alloc.offset }; + BinInstanceRef inst_ref = 
_1350; + BinInstanceRef param_13 = inst_ref; + uint param_14 = ix; + Alloc param_15 = bin_alloc; + BinInstanceRef param_16 = BinInstance_index(param_13, param_14); + BinInstance inst = BinInstance_read(param_15, param_16); + sh_elements[th_ix] = inst.element_ix; + } + GroupMemoryBarrierWithGroupSync(); + wr_ix = min((rd_ix + 256u), ready_ix); + bool _1373 = (wr_ix - rd_ix) < 256u; + if (_1373) + { + _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + } + else + { + _1383 = _1373; + } + if (_1383) + { + continue; + } + else + { + break; + } + } + uint tag = 0u; + if ((th_ix + rd_ix) < wr_ix) + { + element_ix = sh_elements[th_ix]; + tag = _1399.Load((drawtag_start + element_ix) * 4 + 0); + } + switch (tag) + { + case 68u: + case 72u: + case 276u: + case 732u: + case 5u: + case 37u: + { + uint drawmonoid_base = drawmonoid_start + (4u * element_ix); + uint path_ix = _266.Load(drawmonoid_base * 4 + 8); + PathRef _1424 = { _1020.Load(16) + (path_ix * 12u) }; + Alloc _1427; + _1427.offset = _1020.Load(16); + param_17.offset = _1427.offset; + PathRef param_18 = _1424; + Path path = Path_read(param_17, param_18); + uint stride = path.bbox.z - path.bbox.x; + sh_tile_stride[th_ix] = stride; + int dx = int(path.bbox.x) - int(bin_tile_x); + int dy = int(path.bbox.y) - int(bin_tile_y); + int x0 = clamp(dx, 0, 16); + int y0 = clamp(dy, 0, 16); + int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16); + int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16); + sh_tile_width[th_ix] = uint(x1 - x0); + sh_tile_x0[th_ix] = uint(x0); + sh_tile_y0[th_ix] = uint(y0); + tile_count = uint(x1 - x0) * uint(y1 - y0); + uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); + sh_tile_base[th_ix] = base; + uint param_19 = path.tiles.offset; + uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_21 = mem_ok; + Alloc path_alloc = new_alloc(param_19, param_20, param_21); + uint param_22 = th_ix; + Alloc param_23 = 
path_alloc; + write_tile_alloc(param_22, param_23); + break; + } + default: + { + tile_count = 0u; + break; + } + } + sh_tile_count[th_ix] = tile_count; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + GroupMemoryBarrierWithGroupSync(); + if (th_ix >= (1u << i_3)) + { + tile_count += sh_tile_count[th_ix - (1u << i_3)]; + } + GroupMemoryBarrierWithGroupSync(); + sh_tile_count[th_ix] = tile_count; + } + GroupMemoryBarrierWithGroupSync(); + uint total_tile_count = sh_tile_count[255]; + for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u) + { + uint el_ix = 0u; + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + uint probe_1 = el_ix + (128u >> i_4); + if (ix_1 >= sh_tile_count[probe_1 - 1u]) + { + el_ix = probe_1; + } + } + uint element_ix_1 = sh_elements[el_ix]; + uint tag_1 = _1399.Load((drawtag_start + element_ix_1) * 4 + 0); + if (el_ix > 0u) + { + _1632 = sh_tile_count[el_ix - 1u]; + } + else + { + _1632 = 0u; + } + uint seq_ix = ix_1 - _1632; + uint width = sh_tile_width[el_ix]; + uint x = sh_tile_x0[el_ix] + (seq_ix % width); + uint y = sh_tile_y0[el_ix] + (seq_ix / width); + bool include_tile = false; + if (mem_ok) + { + uint param_24 = el_ix; + bool param_25 = mem_ok; + TileRef _1684 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Alloc param_26 = read_tile_alloc(param_24, param_25); + TileRef param_27 = _1684; + Tile tile = Tile_read(param_26, param_27); + bool is_clip = (tag_1 & 1u) != 0u; + bool is_blend = false; + if (is_clip) + { + uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); + uint scene_offset = _266.Load((drawmonoid_base_1 + 2u) * 4 + 8); + uint dd = drawdata_start + (scene_offset >> uint(2)); + uint blend = _1399.Load(dd * 4 + 0); + is_blend = blend != 32771u; + } + bool _1720 = tile.tile.offset != 0u; + bool _1729; + if (!_1720) + { + _1729 = (tile.backdrop == 0) == is_clip; + } + else + { + _1729 = _1720; + } + include_tile = _1729 || is_blend; + } + if (include_tile) + { + uint el_slice = el_ix / 32u; + 
uint el_mask = 1u << (el_ix & 31u); + uint _1751; + InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1751); + } + } + GroupMemoryBarrierWithGroupSync(); + uint slice_ix = 0u; + uint bitmap = sh_bitmaps[0][th_ix]; + while (mem_ok) + { + if (bitmap == 0u) + { + slice_ix++; + if (slice_ix == 8u) + { + break; + } + bitmap = sh_bitmaps[slice_ix][th_ix]; + if (bitmap == 0u) + { + continue; + } + } + uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap))); + uint element_ix_2 = sh_elements[element_ref_ix]; + bitmap &= (bitmap - 1u); + uint drawtag = _1399.Load((drawtag_start + element_ix_2) * 4 + 0); + if (clip_zero_depth == 0u) + { + uint param_28 = element_ref_ix; + bool param_29 = mem_ok; + TileRef _1828 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_30 = read_tile_alloc(param_28, param_29); + TileRef param_31 = _1828; + Tile tile_1 = Tile_read(param_30, param_31); + uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); + uint scene_offset_1 = _266.Load((drawmonoid_base_2 + 2u) * 4 + 8); + uint info_offset = _266.Load((drawmonoid_base_2 + 3u) * 4 + 8); + uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2)); + uint di = drawinfo_start + (info_offset >> uint(2)); + switch (drawtag) + { + case 68u: + { + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_32 = cmd_alloc; + CmdRef param_33 = cmd_ref; + uint param_34 = cmd_limit; + bool _1876 = alloc_cmd(param_32, param_33, param_34); + cmd_alloc = param_32; + cmd_ref = param_33; + cmd_limit = param_34; + if (!_1876) + { + break; + } + Alloc param_35 = cmd_alloc; + CmdRef param_36 = cmd_ref; + Tile param_37 = tile_1; + float param_38 = linewidth; + write_fill(param_35, param_36, param_37, param_38); + cmd_ref = param_36; + uint rgba = _1399.Load(dd_1 * 4 + 0); + CmdColor _1899 = { rgba }; + Alloc param_39 = cmd_alloc; + CmdRef param_40 = cmd_ref; + CmdColor param_41 = _1899; + Cmd_Color_write(param_39, param_40, 
param_41); + cmd_ref.offset += 8u; + break; + } + case 276u: + { + Alloc param_42 = cmd_alloc; + CmdRef param_43 = cmd_ref; + uint param_44 = cmd_limit; + bool _1917 = alloc_cmd(param_42, param_43, param_44); + cmd_alloc = param_42; + cmd_ref = param_43; + cmd_limit = param_44; + if (!_1917) + { + break; + } + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_45 = cmd_alloc; + CmdRef param_46 = cmd_ref; + Tile param_47 = tile_1; + float param_48 = linewidth; + write_fill(param_45, param_46, param_47, param_48); + cmd_ref = param_46; + cmd_lin.index = _1399.Load(dd_1 * 4 + 0); + cmd_lin.line_x = asfloat(_266.Load((di + 1u) * 4 + 8)); + cmd_lin.line_y = asfloat(_266.Load((di + 2u) * 4 + 8)); + cmd_lin.line_c = asfloat(_266.Load((di + 3u) * 4 + 8)); + Alloc param_49 = cmd_alloc; + CmdRef param_50 = cmd_ref; + CmdLinGrad param_51 = cmd_lin; + Cmd_LinGrad_write(param_49, param_50, param_51); + cmd_ref.offset += 20u; + break; + } + case 732u: + { + Alloc param_52 = cmd_alloc; + CmdRef param_53 = cmd_ref; + uint param_54 = cmd_limit; + bool _1981 = alloc_cmd(param_52, param_53, param_54); + cmd_alloc = param_52; + cmd_ref = param_53; + cmd_limit = param_54; + if (!_1981) + { + break; + } + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_55 = cmd_alloc; + CmdRef param_56 = cmd_ref; + Tile param_57 = tile_1; + float param_58 = linewidth; + write_fill(param_55, param_56, param_57, param_58); + cmd_ref = param_56; + cmd_rad.index = _1399.Load(dd_1 * 4 + 0); + cmd_rad.mat = asfloat(uint4(_266.Load((di + 1u) * 4 + 8), _266.Load((di + 2u) * 4 + 8), _266.Load((di + 3u) * 4 + 8), _266.Load((di + 4u) * 4 + 8))); + cmd_rad.xlat = asfloat(uint2(_266.Load((di + 5u) * 4 + 8), _266.Load((di + 6u) * 4 + 8))); + cmd_rad.c1 = asfloat(uint2(_266.Load((di + 7u) * 4 + 8), _266.Load((di + 8u) * 4 + 8))); + cmd_rad.ra = asfloat(_266.Load((di + 9u) * 4 + 8)); + cmd_rad.roff = asfloat(_266.Load((di + 10u) * 4 + 8)); + Alloc param_59 = cmd_alloc; + CmdRef param_60 = cmd_ref; + 
CmdRadGrad param_61 = cmd_rad; + Cmd_RadGrad_write(param_59, param_60, param_61); + cmd_ref.offset += 48u; + break; + } + case 72u: + { + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_62 = cmd_alloc; + CmdRef param_63 = cmd_ref; + uint param_64 = cmd_limit; + bool _2087 = alloc_cmd(param_62, param_63, param_64); + cmd_alloc = param_62; + cmd_ref = param_63; + cmd_limit = param_64; + if (!_2087) + { + break; + } + Alloc param_65 = cmd_alloc; + CmdRef param_66 = cmd_ref; + Tile param_67 = tile_1; + float param_68 = linewidth; + write_fill(param_65, param_66, param_67, param_68); + cmd_ref = param_66; + uint index = _1399.Load(dd_1 * 4 + 0); + uint raw1 = _1399.Load((dd_1 + 1u) * 4 + 0); + int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + CmdImage _2126 = { index, offset_1 }; + Alloc param_69 = cmd_alloc; + CmdRef param_70 = cmd_ref; + CmdImage param_71 = _2126; + Cmd_Image_write(param_69, param_70, param_71); + cmd_ref.offset += 12u; + break; + } + case 5u: + { + bool _2140 = tile_1.tile.offset == 0u; + bool _2146; + if (_2140) + { + _2146 = tile_1.backdrop == 0; + } + else + { + _2146 = _2140; + } + if (_2146) + { + clip_zero_depth = clip_depth + 1u; + } + else + { + Alloc param_72 = cmd_alloc; + CmdRef param_73 = cmd_ref; + uint param_74 = cmd_limit; + bool _2158 = alloc_cmd(param_72, param_73, param_74); + cmd_alloc = param_72; + cmd_ref = param_73; + cmd_limit = param_74; + if (!_2158) + { + break; + } + Alloc param_75 = cmd_alloc; + CmdRef param_76 = cmd_ref; + Cmd_BeginClip_write(param_75, param_76); + cmd_ref.offset += 4u; + render_blend_depth++; + max_blend_depth = max(max_blend_depth, render_blend_depth); + } + clip_depth++; + break; + } + case 37u: + { + clip_depth--; + Alloc param_77 = cmd_alloc; + CmdRef param_78 = cmd_ref; + uint param_79 = cmd_limit; + bool _2191 = alloc_cmd(param_77, param_78, param_79); + cmd_alloc = param_77; + cmd_ref = param_78; + cmd_limit = param_79; + if (!_2191) + { + break; + } + Alloc param_80 
= cmd_alloc; + CmdRef param_81 = cmd_ref; + Tile param_82 = tile_1; + float param_83 = -1.0f; + write_fill(param_80, param_81, param_82, param_83); + cmd_ref = param_81; + uint blend_1 = _1399.Load(dd_1 * 4 + 0); + CmdEndClip _2214 = { blend_1 }; + Alloc param_84 = cmd_alloc; + CmdRef param_85 = cmd_ref; + CmdEndClip param_86 = _2214; + Cmd_EndClip_write(param_84, param_85, param_86); + cmd_ref.offset += 8u; + render_blend_depth--; + break; + } + } + } + else + { + switch (drawtag) + { + case 5u: + { + clip_depth++; + break; + } + case 37u: + { + if (clip_depth == clip_zero_depth) + { + clip_zero_depth = 0u; + } + clip_depth--; + break; + } + } + } + } + GroupMemoryBarrierWithGroupSync(); + rd_ix += 256u; + if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions)) + { + break; + } + } + bool _2263 = (bin_tile_x + tile_x) < _1020.Load(8); + bool _2272; + if (_2263) + { + _2272 = (bin_tile_y + tile_y) < _1020.Load(12); + } + else + { + _2272 = _2263; + } + if (_2272) + { + Alloc param_87 = cmd_alloc; + CmdRef param_88 = cmd_ref; + Cmd_End_write(param_87, param_88); + if (max_blend_depth > 4u) + { + uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u; + uint param_89 = scratch_size; + MallocResult _2293 = malloc(param_89); + MallocResult scratch = _2293; + Alloc param_90 = scratch_alloc; + uint param_91 = scratch_alloc.offset; + Alloc param_92 = scratch.alloc; + alloc_write(param_90, param_91, param_92); + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl new file mode 100644 index 0000000..d84add1 --- /dev/null +++ b/piet-gpu/shader/gen/coarse.msl @@ -0,0 +1,1266 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace 
metal; + +// Implementation of the GLSL findLSB() function +template +inline T spvFindLSB(T x) +{ + return select(ctz(x), T(-1), x == T(0)); +} + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct BinInstanceRef +{ + uint offset; +}; + +struct BinInstance +{ + uint element_ix; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct Tile +{ + TileSegRef tile; + int backdrop; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 
draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_266.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const uint& index) +{ + return BinInstanceRef{ ref.offset + (index * 4u) }; +} + +static inline __attribute__((always_inline)) +BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); + BinInstance s; + s.element_ix = raw0; + return s; +} + +static inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + 
uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_266, v_266BufferSize); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a) +{ +} + +static inline __attribute__((always_inline)) +Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint param = 0u; + uint param_1 = uint(int((v_266BufferSize - 8) / 4) * 4); + bool param_2 = mem_ok; + return new_alloc(param, param_1, param_2); +} + +static inline __attribute__((always_inline)) +Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize); + Tile s; + s.tile = TileSegRef{ raw0 }; + s.backdrop = int(raw1); + return s; +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint _272 = atomic_fetch_add_explicit((device atomic_uint*)&v_266.mem_offset, size, memory_order_relaxed); + uint offset = _272; + MallocResult r; + r.failed = (offset + size) > uint(int((v_266BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint 
_301 = atomic_fetch_max_explicit((device atomic_uint*)&v_266.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_266.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.new_ref; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 11u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u }; + CmdJump param_5 = s; + CmdJump_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_266, constant uint& v_266BufferSize) +{ + if (cmd_ref.offset < cmd_limit) + { + return true; + } + uint param = 1024u; + MallocResult _928 = malloc(param, v_266, v_266BufferSize); + MallocResult new_cmd = _928; + if (new_cmd.failed) + { + return false; + } + CmdJump jump = CmdJump{ new_cmd.alloc.offset }; + Alloc param_1 = cmd_alloc; + CmdRef param_2 = cmd_ref; + CmdJump param_3 = jump; + Cmd_Jump_write(param_1, param_2, param_3, v_266, v_266BufferSize); + cmd_alloc = new_cmd.alloc; + 
cmd_ref = CmdRef{ cmd_alloc.offset }; + cmd_limit = (cmd_alloc.offset + 1024u) - 144u; + return true; +} + +static inline __attribute__((always_inline)) +void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = uint(s.backdrop); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 1u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u }; + CmdFill param_5 = s; + CmdFill_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 3u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.tile_ref; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.half_width); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); +} + 
+static inline __attribute__((always_inline)) +void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 2u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u }; + CmdStroke param_5 = s; + CmdStroke_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_266, constant uint& v_266BufferSize) +{ + if (linewidth < 0.0) + { + if (tile.tile.offset != 0u) + { + CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop }; + Alloc param = alloc; + CmdRef param_1 = cmd_ref; + CmdFill param_2 = cmd_fill; + Cmd_Fill_write(param, param_1, param_2, v_266, v_266BufferSize); + cmd_ref.offset += 12u; + } + else + { + Alloc param_3 = alloc; + CmdRef param_4 = cmd_ref; + Cmd_Solid_write(param_3, param_4, v_266, v_266BufferSize); + cmd_ref.offset += 4u; + } + } + else + { + CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth }; + Alloc param_5 = alloc; + CmdRef param_6 = cmd_ref; + CmdStroke param_7 = cmd_stroke; + Cmd_Stroke_write(param_5, param_6, param_7, v_266, v_266BufferSize); + cmd_ref.offset += 12u; + } +} + +static inline __attribute__((always_inline)) +void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.rgba_color; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const 
CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 5u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u }; + CmdColor param_5 = s; + CmdColor_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.line_x); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.line_y); + write_mem(param_6, param_7, param_8, v_266, v_266BufferSize); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.line_c); + write_mem(param_9, param_10, param_11, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 6u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u }; + CmdLinGrad param_5 = s; + CmdLinGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc 
param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.mat.x); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.mat.y); + write_mem(param_6, param_7, param_8, v_266, v_266BufferSize); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.mat.z); + write_mem(param_9, param_10, param_11, v_266, v_266BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.mat.w); + write_mem(param_12, param_13, param_14, v_266, v_266BufferSize); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.xlat.x); + write_mem(param_15, param_16, param_17, v_266, v_266BufferSize); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = as_type(s.xlat.y); + write_mem(param_18, param_19, param_20, v_266, v_266BufferSize); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = as_type(s.c1.x); + write_mem(param_21, param_22, param_23, v_266, v_266BufferSize); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = as_type(s.c1.y); + write_mem(param_24, param_25, param_26, v_266, v_266BufferSize); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = as_type(s.ra); + write_mem(param_27, param_28, param_29, v_266, v_266BufferSize); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = as_type(s.roff); + write_mem(param_30, param_31, param_32, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 7u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; 
+ CmdRadGradRef param_4 = CmdRadGradRef{ ref.offset + 4u }; + CmdRadGrad param_5 = s; + CmdRadGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.index; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 8u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u }; + CmdImage param_5 = s; + CmdImage_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 9u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.blend; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) 
+void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 10u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); + Alloc param_3 = a; + CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u }; + CmdEndClip param_5 = s; + CmdEndClip_write(param_3, param_4, param_5, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = 0u; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +static inline __attribute__((always_inline)) +void alloc_write(thread const Alloc& a, thread const uint& offset, thread const Alloc& alloc, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = offset >> uint(2); + uint param_2 = alloc.offset; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_266 [[buffer(0)]], const device ConfigBuf& _1020 [[buffer(1)]], const device SceneBuf& _1399 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup uint sh_bitmaps[8][256]; + threadgroup Alloc sh_part_elements[256]; + threadgroup uint sh_part_count[256]; + threadgroup uint sh_elements[256]; + threadgroup uint sh_tile_stride[256]; + threadgroup uint sh_tile_width[256]; + threadgroup uint sh_tile_x0[256]; + threadgroup uint sh_tile_y0[256]; + threadgroup uint sh_tile_base[256]; + threadgroup uint sh_tile_count[256]; + constant uint& v_266BufferSize = spvBufferSizeConstants[0]; + uint width_in_bins = ((_1020.conf.width_in_tiles + 16u) - 1u) / 16u; + uint 
bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; + uint partition_ix = 0u; + uint n_partitions = ((_1020.conf.n_elements + 256u) - 1u) / 256u; + uint th_ix = gl_LocalInvocationID.x; + uint bin_tile_x = 16u * gl_WorkGroupID.x; + uint bin_tile_y = 16u * gl_WorkGroupID.y; + uint tile_x = gl_LocalInvocationID.x % 16u; + uint tile_y = gl_LocalInvocationID.x / 16u; + uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.conf.width_in_tiles) + bin_tile_x) + tile_x; + Alloc param; + param.offset = _1020.conf.ptcl_alloc.offset; + uint param_1 = this_tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint cmd_limit = (cmd_ref.offset + 1024u) - 144u; + uint clip_depth = 0u; + uint clip_zero_depth = 0u; + uint rd_ix = 0u; + uint wr_ix = 0u; + uint part_start_ix = 0u; + uint ready_ix = 0u; + Alloc param_3 = cmd_alloc; + uint param_4 = 0u; + uint param_5 = 8u; + Alloc scratch_alloc = slice_mem(param_3, param_4, param_5); + cmd_ref.offset += 4u; + uint render_blend_depth = 0u; + uint max_blend_depth = 0u; + uint drawmonoid_start = _1020.conf.drawmonoid_alloc.offset >> uint(2); + uint drawtag_start = _1020.conf.drawtag_offset >> uint(2); + uint drawdata_start = _1020.conf.drawdata_offset >> uint(2); + uint drawinfo_start = _1020.conf.drawinfo_alloc.offset >> uint(2); + bool mem_ok = v_266.mem_error == 0u; + Alloc param_6; + Alloc param_8; + uint _1331; + uint element_ix; + Alloc param_17; + uint tile_count; + uint _1632; + float linewidth; + CmdLinGrad cmd_lin; + CmdRadGrad cmd_rad; + while (true) + { + for (uint i = 0u; i < 8u; i++) + { + sh_bitmaps[i][th_ix] = 0u; + } + bool _1383; + for (;;) + { + if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) + { + part_start_ix = ready_ix; + uint count = 0u; + bool _1181 = th_ix < 256u; + bool _1189; + if (_1181) + { + _1189 = (partition_ix + th_ix) < n_partitions; + } + else + { + _1189 = _1181; + } + if (_1189) + { + uint 
in_ix = (_1020.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + param_6.offset = _1020.conf.bin_alloc.offset; + uint param_7 = in_ix; + count = read_mem(param_6, param_7, v_266, v_266BufferSize); + param_8.offset = _1020.conf.bin_alloc.offset; + uint param_9 = in_ix + 1u; + uint offset = read_mem(param_8, param_9, v_266, v_266BufferSize); + uint param_10 = offset; + uint param_11 = count * 4u; + bool param_12 = mem_ok; + sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12); + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + if (th_ix < 256u) + { + sh_part_count[th_ix] = count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix < 256u) + { + if (th_ix >= (1u << i_1)) + { + count += sh_part_count[th_ix - (1u << i_1)]; + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + if (th_ix < 256u) + { + sh_part_count[th_ix] = part_start_ix + count; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + ready_ix = sh_part_count[255]; + partition_ix += 256u; + } + uint ix = rd_ix + th_ix; + if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok) + { + uint part_ix = 0u; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + uint probe = part_ix + (128u >> i_2); + if (ix >= sh_part_count[probe - 1u]) + { + part_ix = probe; + } + } + if (part_ix > 0u) + { + _1331 = sh_part_count[part_ix - 1u]; + } + else + { + _1331 = part_start_ix; + } + ix -= _1331; + Alloc bin_alloc = sh_part_elements[part_ix]; + BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset }; + BinInstanceRef param_13 = inst_ref; + uint param_14 = ix; + Alloc param_15 = bin_alloc; + BinInstanceRef param_16 = BinInstance_index(param_13, param_14); + BinInstance inst = BinInstance_read(param_15, param_16, v_266, v_266BufferSize); + sh_elements[th_ix] = inst.element_ix; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + wr_ix = min((rd_ix + 256u), ready_ix); + bool _1373 = (wr_ix - rd_ix) < 256u; + if (_1373) + { + _1383 = (wr_ix < ready_ix) || 
(partition_ix < n_partitions); + } + else + { + _1383 = _1373; + } + if (_1383) + { + continue; + } + else + { + break; + } + } + uint tag = 0u; + if ((th_ix + rd_ix) < wr_ix) + { + element_ix = sh_elements[th_ix]; + tag = _1399.scene[drawtag_start + element_ix]; + } + switch (tag) + { + case 68u: + case 72u: + case 276u: + case 732u: + case 5u: + case 37u: + { + uint drawmonoid_base = drawmonoid_start + (4u * element_ix); + uint path_ix = v_266.memory[drawmonoid_base]; + param_17.offset = _1020.conf.tile_alloc.offset; + PathRef param_18 = PathRef{ _1020.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_17, param_18, v_266, v_266BufferSize); + uint stride = path.bbox.z - path.bbox.x; + sh_tile_stride[th_ix] = stride; + int dx = int(path.bbox.x) - int(bin_tile_x); + int dy = int(path.bbox.y) - int(bin_tile_y); + int x0 = clamp(dx, 0, 16); + int y0 = clamp(dy, 0, 16); + int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, 16); + int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, 16); + sh_tile_width[th_ix] = uint(x1 - x0); + sh_tile_x0[th_ix] = uint(x0); + sh_tile_y0[th_ix] = uint(y0); + tile_count = uint(x1 - x0) * uint(y1 - y0); + uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); + sh_tile_base[th_ix] = base; + uint param_19 = path.tiles.offset; + uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_21 = mem_ok; + Alloc path_alloc = new_alloc(param_19, param_20, param_21); + uint param_22 = th_ix; + Alloc param_23 = path_alloc; + write_tile_alloc(param_22, param_23); + break; + } + default: + { + tile_count = 0u; + break; + } + } + sh_tile_count[th_ix] = tile_count; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix >= (1u << i_3)) + { + tile_count += sh_tile_count[th_ix - (1u << i_3)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tile_count[th_ix] = tile_count; + } + 
threadgroup_barrier(mem_flags::mem_threadgroup); + uint total_tile_count = sh_tile_count[255]; + for (uint ix_1 = th_ix; ix_1 < total_tile_count; ix_1 += 256u) + { + uint el_ix = 0u; + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + uint probe_1 = el_ix + (128u >> i_4); + if (ix_1 >= sh_tile_count[probe_1 - 1u]) + { + el_ix = probe_1; + } + } + uint element_ix_1 = sh_elements[el_ix]; + uint tag_1 = _1399.scene[drawtag_start + element_ix_1]; + if (el_ix > 0u) + { + _1632 = sh_tile_count[el_ix - 1u]; + } + else + { + _1632 = 0u; + } + uint seq_ix = ix_1 - _1632; + uint width = sh_tile_width[el_ix]; + uint x = sh_tile_x0[el_ix] + (seq_ix % width); + uint y = sh_tile_y0[el_ix] + (seq_ix / width); + bool include_tile = false; + if (mem_ok) + { + uint param_24 = el_ix; + bool param_25 = mem_ok; + Alloc param_26 = read_tile_alloc(param_24, param_25, v_266, v_266BufferSize); + TileRef param_27 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Tile tile = Tile_read(param_26, param_27, v_266, v_266BufferSize); + bool is_clip = (tag_1 & 1u) != 0u; + bool is_blend = false; + if (is_clip) + { + uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); + uint scene_offset = v_266.memory[drawmonoid_base_1 + 2u]; + uint dd = drawdata_start + (scene_offset >> uint(2)); + uint blend = _1399.scene[dd]; + is_blend = blend != 32771u; + } + bool _1720 = tile.tile.offset != 0u; + bool _1729; + if (!_1720) + { + _1729 = (tile.backdrop == 0) == is_clip; + } + else + { + _1729 = _1720; + } + include_tile = _1729 || is_blend; + } + if (include_tile) + { + uint el_slice = el_ix / 32u; + uint el_mask = 1u << (el_ix & 31u); + uint _1751 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint slice_ix = 0u; + uint bitmap = sh_bitmaps[0][th_ix]; + while (mem_ok) + { + if (bitmap == 0u) + { + slice_ix++; + if (slice_ix == 8u) + { + 
break; + } + bitmap = sh_bitmaps[slice_ix][th_ix]; + if (bitmap == 0u) + { + continue; + } + } + uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap))); + uint element_ix_2 = sh_elements[element_ref_ix]; + bitmap &= (bitmap - 1u); + uint drawtag = _1399.scene[drawtag_start + element_ix_2]; + if (clip_zero_depth == 0u) + { + uint param_28 = element_ref_ix; + bool param_29 = mem_ok; + Alloc param_30 = read_tile_alloc(param_28, param_29, v_266, v_266BufferSize); + TileRef param_31 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Tile tile_1 = Tile_read(param_30, param_31, v_266, v_266BufferSize); + uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); + uint scene_offset_1 = v_266.memory[drawmonoid_base_2 + 2u]; + uint info_offset = v_266.memory[drawmonoid_base_2 + 3u]; + uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2)); + uint di = drawinfo_start + (info_offset >> uint(2)); + switch (drawtag) + { + case 68u: + { + linewidth = as_type(v_266.memory[di]); + Alloc param_32 = cmd_alloc; + CmdRef param_33 = cmd_ref; + uint param_34 = cmd_limit; + bool _1876 = alloc_cmd(param_32, param_33, param_34, v_266, v_266BufferSize); + cmd_alloc = param_32; + cmd_ref = param_33; + cmd_limit = param_34; + if (!_1876) + { + break; + } + Alloc param_35 = cmd_alloc; + CmdRef param_36 = cmd_ref; + Tile param_37 = tile_1; + float param_38 = linewidth; + write_fill(param_35, param_36, param_37, param_38, v_266, v_266BufferSize); + cmd_ref = param_36; + uint rgba = _1399.scene[dd_1]; + Alloc param_39 = cmd_alloc; + CmdRef param_40 = cmd_ref; + CmdColor param_41 = CmdColor{ rgba }; + Cmd_Color_write(param_39, param_40, param_41, v_266, v_266BufferSize); + cmd_ref.offset += 8u; + break; + } + case 276u: + { + Alloc param_42 = cmd_alloc; + CmdRef param_43 = cmd_ref; + uint param_44 = cmd_limit; + bool _1917 = alloc_cmd(param_42, param_43, param_44, v_266, v_266BufferSize); + cmd_alloc = param_42; + 
cmd_ref = param_43; + cmd_limit = param_44; + if (!_1917) + { + break; + } + linewidth = as_type(v_266.memory[di]); + Alloc param_45 = cmd_alloc; + CmdRef param_46 = cmd_ref; + Tile param_47 = tile_1; + float param_48 = linewidth; + write_fill(param_45, param_46, param_47, param_48, v_266, v_266BufferSize); + cmd_ref = param_46; + cmd_lin.index = _1399.scene[dd_1]; + cmd_lin.line_x = as_type(v_266.memory[di + 1u]); + cmd_lin.line_y = as_type(v_266.memory[di + 2u]); + cmd_lin.line_c = as_type(v_266.memory[di + 3u]); + Alloc param_49 = cmd_alloc; + CmdRef param_50 = cmd_ref; + CmdLinGrad param_51 = cmd_lin; + Cmd_LinGrad_write(param_49, param_50, param_51, v_266, v_266BufferSize); + cmd_ref.offset += 20u; + break; + } + case 732u: + { + Alloc param_52 = cmd_alloc; + CmdRef param_53 = cmd_ref; + uint param_54 = cmd_limit; + bool _1981 = alloc_cmd(param_52, param_53, param_54, v_266, v_266BufferSize); + cmd_alloc = param_52; + cmd_ref = param_53; + cmd_limit = param_54; + if (!_1981) + { + break; + } + linewidth = as_type(v_266.memory[di]); + Alloc param_55 = cmd_alloc; + CmdRef param_56 = cmd_ref; + Tile param_57 = tile_1; + float param_58 = linewidth; + write_fill(param_55, param_56, param_57, param_58, v_266, v_266BufferSize); + cmd_ref = param_56; + cmd_rad.index = _1399.scene[dd_1]; + cmd_rad.mat = as_type(uint4(v_266.memory[di + 1u], v_266.memory[di + 2u], v_266.memory[di + 3u], v_266.memory[di + 4u])); + cmd_rad.xlat = as_type(uint2(v_266.memory[di + 5u], v_266.memory[di + 6u])); + cmd_rad.c1 = as_type(uint2(v_266.memory[di + 7u], v_266.memory[di + 8u])); + cmd_rad.ra = as_type(v_266.memory[di + 9u]); + cmd_rad.roff = as_type(v_266.memory[di + 10u]); + Alloc param_59 = cmd_alloc; + CmdRef param_60 = cmd_ref; + CmdRadGrad param_61 = cmd_rad; + Cmd_RadGrad_write(param_59, param_60, param_61, v_266, v_266BufferSize); + cmd_ref.offset += 48u; + break; + } + case 72u: + { + linewidth = as_type(v_266.memory[di]); + Alloc param_62 = cmd_alloc; + CmdRef param_63 = 
cmd_ref; + uint param_64 = cmd_limit; + bool _2087 = alloc_cmd(param_62, param_63, param_64, v_266, v_266BufferSize); + cmd_alloc = param_62; + cmd_ref = param_63; + cmd_limit = param_64; + if (!_2087) + { + break; + } + Alloc param_65 = cmd_alloc; + CmdRef param_66 = cmd_ref; + Tile param_67 = tile_1; + float param_68 = linewidth; + write_fill(param_65, param_66, param_67, param_68, v_266, v_266BufferSize); + cmd_ref = param_66; + uint index = _1399.scene[dd_1]; + uint raw1 = _1399.scene[dd_1 + 1u]; + int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + Alloc param_69 = cmd_alloc; + CmdRef param_70 = cmd_ref; + CmdImage param_71 = CmdImage{ index, offset_1 }; + Cmd_Image_write(param_69, param_70, param_71, v_266, v_266BufferSize); + cmd_ref.offset += 12u; + break; + } + case 5u: + { + bool _2140 = tile_1.tile.offset == 0u; + bool _2146; + if (_2140) + { + _2146 = tile_1.backdrop == 0; + } + else + { + _2146 = _2140; + } + if (_2146) + { + clip_zero_depth = clip_depth + 1u; + } + else + { + Alloc param_72 = cmd_alloc; + CmdRef param_73 = cmd_ref; + uint param_74 = cmd_limit; + bool _2158 = alloc_cmd(param_72, param_73, param_74, v_266, v_266BufferSize); + cmd_alloc = param_72; + cmd_ref = param_73; + cmd_limit = param_74; + if (!_2158) + { + break; + } + Alloc param_75 = cmd_alloc; + CmdRef param_76 = cmd_ref; + Cmd_BeginClip_write(param_75, param_76, v_266, v_266BufferSize); + cmd_ref.offset += 4u; + render_blend_depth++; + max_blend_depth = max(max_blend_depth, render_blend_depth); + } + clip_depth++; + break; + } + case 37u: + { + clip_depth--; + Alloc param_77 = cmd_alloc; + CmdRef param_78 = cmd_ref; + uint param_79 = cmd_limit; + bool _2191 = alloc_cmd(param_77, param_78, param_79, v_266, v_266BufferSize); + cmd_alloc = param_77; + cmd_ref = param_78; + cmd_limit = param_79; + if (!_2191) + { + break; + } + Alloc param_80 = cmd_alloc; + CmdRef param_81 = cmd_ref; + Tile param_82 = tile_1; + float param_83 = -1.0; + write_fill(param_80, 
param_81, param_82, param_83, v_266, v_266BufferSize); + cmd_ref = param_81; + uint blend_1 = _1399.scene[dd_1]; + Alloc param_84 = cmd_alloc; + CmdRef param_85 = cmd_ref; + CmdEndClip param_86 = CmdEndClip{ blend_1 }; + Cmd_EndClip_write(param_84, param_85, param_86, v_266, v_266BufferSize); + cmd_ref.offset += 8u; + render_blend_depth--; + break; + } + } + } + else + { + switch (drawtag) + { + case 5u: + { + clip_depth++; + break; + } + case 37u: + { + if (clip_depth == clip_zero_depth) + { + clip_zero_depth = 0u; + } + clip_depth--; + break; + } + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + rd_ix += 256u; + if ((rd_ix >= ready_ix) && (partition_ix >= n_partitions)) + { + break; + } + } + bool _2263 = (bin_tile_x + tile_x) < _1020.conf.width_in_tiles; + bool _2272; + if (_2263) + { + _2272 = (bin_tile_y + tile_y) < _1020.conf.height_in_tiles; + } + else + { + _2272 = _2263; + } + if (_2272) + { + Alloc param_87 = cmd_alloc; + CmdRef param_88 = cmd_ref; + Cmd_End_write(param_87, param_88, v_266, v_266BufferSize); + if (max_blend_depth > 4u) + { + uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u; + uint param_89 = scratch_size; + MallocResult _2293 = malloc(param_89, v_266, v_266BufferSize); + MallocResult scratch = _2293; + Alloc param_90 = scratch_alloc; + uint param_91 = scratch_alloc.offset; + Alloc param_92 = scratch.alloc; + alloc_write(param_90, param_91, param_92, v_266, v_266BufferSize); + } + } +} + diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv new file mode 100644 index 0000000..fe5eeee Binary files /dev/null and b/piet-gpu/shader/gen/coarse.spv differ diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil new file mode 100644 index 0000000..200f169 Binary files /dev/null and b/piet-gpu/shader/gen/draw_leaf.dxil differ diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl new file mode 100644 index 0000000..734d21e --- /dev/null 
+++ b/piet-gpu/shader/gen/draw_leaf.hlsl @@ -0,0 +1,268 @@ +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const DrawMonoid _23 = { 0u, 0u, 0u, 0u }; + +ByteAddressBuffer _93 : register(t1, space0); +ByteAddressBuffer _103 : register(t2, space0); +ByteAddressBuffer _203 : register(t3, space0); +RWByteAddressBuffer _285 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared DrawMonoid sh_scratch[256]; + +DrawMonoid map_tag(uint tag_word) +{ + uint has_path = uint(tag_word != 0u); + DrawMonoid _76 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; + return _76; +} + +DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +DrawMonoid draw_monoid_identity() +{ + return _23; +} + +void comp_main() +{ + uint ix = 
gl_GlobalInvocationID.x * 8u; + uint drawtag_base = _93.Load(100) >> uint(2); + uint tag_word = _103.Load((drawtag_base + ix) * 4 + 0); + uint param = tag_word; + DrawMonoid agg = map_tag(param); + DrawMonoid local[8]; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + tag_word = _103.Load(((drawtag_base + ix) + i) * 4 + 0); + uint param_1 = tag_word; + DrawMonoid param_2 = agg; + DrawMonoid param_3 = map_tag(param_1); + agg = combine_draw_monoid(param_2, param_3); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + DrawMonoid param_4 = other; + DrawMonoid param_5 = agg; + agg = combine_draw_monoid(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + DrawMonoid row = draw_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + DrawMonoid _209; + _209.path_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 0); + _209.clip_ix = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 4); + _209.scene_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 8); + _209.info_offset = _203.Load((gl_WorkGroupID.x - 1u) * 16 + 12); + row.path_ix = _209.path_ix; + row.clip_ix = _209.clip_ix; + row.scene_offset = _209.scene_offset; + row.info_offset = _209.info_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + DrawMonoid param_6 = row; + DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_draw_monoid(param_6, param_7); + } + uint drawdata_base = _93.Load(104) >> uint(2); + uint drawinfo_base = _93.Load(68) >> uint(2); + uint out_ix = gl_GlobalInvocationID.x * 8u; + uint out_base = (_93.Load(44) >> uint(2)) + (out_ix * 4u); + uint clip_out_base = _93.Load(48) >> uint(2); + float4 mat; + float2 translate; + float2 p0; + float2 p1; + for (uint i_2 = 0u; i_2 < 8u; 
i_2++) + { + DrawMonoid m = row; + if (i_2 > 0u) + { + DrawMonoid param_8 = m; + DrawMonoid param_9 = local[i_2 - 1u]; + m = combine_draw_monoid(param_8, param_9); + } + _285.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix); + _285.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix); + _285.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset); + _285.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset); + uint dd = drawdata_base + (m.scene_offset >> uint(2)); + uint di = drawinfo_base + (m.info_offset >> uint(2)); + tag_word = _103.Load(((drawtag_base + ix) + i_2) * 4 + 0); + if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u)) + { + uint bbox_offset = (_93.Load(40) >> uint(2)) + (6u * m.path_ix); + float bbox_l = float(_285.Load(bbox_offset * 4 + 8)) - 32768.0f; + float bbox_t = float(_285.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_285.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_285.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + float linewidth = asfloat(_285.Load((bbox_offset + 4u) * 4 + 8)); + uint fill_mode = uint(linewidth >= 0.0f); + if (((linewidth >= 0.0f) || (tag_word == 276u)) || (tag_word == 732u)) + { + uint trans_ix = _285.Load((bbox_offset + 5u) * 4 + 8); + uint t = (_93.Load(36) >> uint(2)) + (6u * trans_ix); + mat = asfloat(uint4(_285.Load(t * 4 + 8), _285.Load((t + 1u) * 4 + 8), _285.Load((t + 2u) * 4 + 8), _285.Load((t + 3u) * 4 + 8))); + if ((tag_word == 276u) || (tag_word == 732u)) + { + translate = asfloat(uint2(_285.Load((t + 4u) * 4 + 8), _285.Load((t + 5u) * 4 + 8))); + } + } + if (linewidth >= 0.0f) + { + linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z))); + } + switch (tag_word) + { + case 68u: + case 72u: + { + _285.Store(di * 4 + 8, asuint(linewidth)); + break; + } + case 276u: + { + _285.Store(di * 4 + 8, asuint(linewidth)); + p0 = 
asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0))); + p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0))); + p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate; + p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate; + float2 dxy = p1 - p0; + float scale = 1.0f / ((dxy.x * dxy.x) + (dxy.y * dxy.y)); + float line_x = dxy.x * scale; + float line_y = dxy.y * scale; + float line_c = -((p0.x * line_x) + (p0.y * line_y)); + _285.Store((di + 1u) * 4 + 8, asuint(line_x)); + _285.Store((di + 2u) * 4 + 8, asuint(line_y)); + _285.Store((di + 3u) * 4 + 8, asuint(line_c)); + break; + } + case 732u: + { + p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0))); + p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0))); + float r0 = asfloat(_103.Load((dd + 5u) * 4 + 0)); + float r1 = asfloat(_103.Load((dd + 6u) * 4 + 0)); + float inv_det = 1.0f / ((mat.x * mat.w) - (mat.y * mat.z)); + float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det; + float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y); + inv_tr += p0; + float2 center1 = p1 - p0; + float rr = r1 / (r1 - r0); + float rainv = rr / ((r1 * r1) - dot(center1, center1)); + float2 c1 = center1 * rainv; + float ra = rr * rainv; + float roff = rr - 1.0f; + _285.Store(di * 4 + 8, asuint(linewidth)); + _285.Store((di + 1u) * 4 + 8, asuint(inv_mat.x)); + _285.Store((di + 2u) * 4 + 8, asuint(inv_mat.y)); + _285.Store((di + 3u) * 4 + 8, asuint(inv_mat.z)); + _285.Store((di + 4u) * 4 + 8, asuint(inv_mat.w)); + _285.Store((di + 5u) * 4 + 8, asuint(inv_tr.x)); + _285.Store((di + 6u) * 4 + 8, asuint(inv_tr.y)); + _285.Store((di + 7u) * 4 + 8, asuint(c1.x)); + _285.Store((di + 8u) * 4 + 8, asuint(c1.y)); + _285.Store((di + 9u) * 4 + 8, asuint(ra)); + _285.Store((di + 10u) * 4 + 8, asuint(roff)); + break; + } + case 5u: + { + break; + } + } + } + if ((tag_word == 5u) || (tag_word == 37u)) + { + uint path_ix = 
~(out_ix + i_2); + if (tag_word == 5u) + { + path_ix = m.path_ix; + } + _285.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix); + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl new file mode 100644 index 0000000..c11e21b --- /dev/null +++ b/piet-gpu/shader/gen/draw_leaf.msl @@ -0,0 +1,316 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc 
draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct DrawMonoid_1 +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct ParentBuf +{ + DrawMonoid_1 parent[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +DrawMonoid map_tag(thread const uint& tag_word) +{ + uint has_path = uint(tag_word != 0u); + return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; +} + +static inline __attribute__((always_inline)) +DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +static inline __attribute__((always_inline)) +DrawMonoid draw_monoid_identity() +{ + return DrawMonoid{ 0u, 0u, 0u, 0u }; +} + +kernel void main0(device Memory& _285 [[buffer(0)]], const device ConfigBuf& _93 [[buffer(1)]], const device SceneBuf& _103 [[buffer(2)]], const device ParentBuf& _203 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup DrawMonoid sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + uint drawtag_base = _93.conf.drawtag_offset >> uint(2); + uint tag_word = _103.scene[drawtag_base + ix]; + uint param = tag_word; + DrawMonoid agg = map_tag(param); + spvUnsafeArray local; + local[0] = agg; + for 
(uint i = 1u; i < 8u; i++) + { + tag_word = _103.scene[(drawtag_base + ix) + i]; + uint param_1 = tag_word; + DrawMonoid param_2 = agg; + DrawMonoid param_3 = map_tag(param_1); + agg = combine_draw_monoid(param_2, param_3); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + DrawMonoid param_4 = other; + DrawMonoid param_5 = agg; + agg = combine_draw_monoid(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + DrawMonoid row = draw_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _206 = gl_WorkGroupID.x - 1u; + row.path_ix = _203.parent[_206].path_ix; + row.clip_ix = _203.parent[_206].clip_ix; + row.scene_offset = _203.parent[_206].scene_offset; + row.info_offset = _203.parent[_206].info_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + DrawMonoid param_6 = row; + DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_draw_monoid(param_6, param_7); + } + uint drawdata_base = _93.conf.drawdata_offset >> uint(2); + uint drawinfo_base = _93.conf.drawinfo_alloc.offset >> uint(2); + uint out_ix = gl_GlobalInvocationID.x * 8u; + uint out_base = (_93.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 4u); + uint clip_out_base = _93.conf.clip_alloc.offset >> uint(2); + float4 mat; + float2 translate; + float2 p0; + float2 p1; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + DrawMonoid m = row; + if (i_2 > 0u) + { + DrawMonoid param_8 = m; + DrawMonoid param_9 = local[i_2 - 1u]; + m = combine_draw_monoid(param_8, param_9); + } + _285.memory[out_base + (i_2 * 4u)] = m.path_ix; + _285.memory[(out_base + (i_2 * 4u)) + 1u] = m.clip_ix; + _285.memory[(out_base + (i_2 * 4u)) + 2u] = m.scene_offset; + 
_285.memory[(out_base + (i_2 * 4u)) + 3u] = m.info_offset; + uint dd = drawdata_base + (m.scene_offset >> uint(2)); + uint di = drawinfo_base + (m.info_offset >> uint(2)); + tag_word = _103.scene[(drawtag_base + ix) + i_2]; + if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u)) + { + uint bbox_offset = (_93.conf.path_bbox_alloc.offset >> uint(2)) + (6u * m.path_ix); + float bbox_l = float(_285.memory[bbox_offset]) - 32768.0; + float bbox_t = float(_285.memory[bbox_offset + 1u]) - 32768.0; + float bbox_r = float(_285.memory[bbox_offset + 2u]) - 32768.0; + float bbox_b = float(_285.memory[bbox_offset + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + float linewidth = as_type(_285.memory[bbox_offset + 4u]); + uint fill_mode = uint(linewidth >= 0.0); + if (((linewidth >= 0.0) || (tag_word == 276u)) || (tag_word == 732u)) + { + uint trans_ix = _285.memory[bbox_offset + 5u]; + uint t = (_93.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix); + mat = as_type(uint4(_285.memory[t], _285.memory[t + 1u], _285.memory[t + 2u], _285.memory[t + 3u])); + if ((tag_word == 276u) || (tag_word == 732u)) + { + translate = as_type(uint2(_285.memory[t + 4u], _285.memory[t + 5u])); + } + } + if (linewidth >= 0.0) + { + linewidth *= sqrt(abs((mat.x * mat.w) - (mat.y * mat.z))); + } + switch (tag_word) + { + case 68u: + case 72u: + { + _285.memory[di] = as_type(linewidth); + break; + } + case 276u: + { + _285.memory[di] = as_type(linewidth); + p0 = as_type(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u])); + p1 = as_type(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u])); + p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate; + p1 = ((mat.xy * p1.x) + (mat.zw * p1.y)) + translate; + float2 dxy = p1 - p0; + float scale = 1.0 / ((dxy.x * dxy.x) + (dxy.y * dxy.y)); + float line_x = dxy.x * scale; + float line_y = dxy.y * scale; + float line_c = -((p0.x * line_x) + (p0.y * line_y)); + _285.memory[di + 1u] = 
as_type(line_x); + _285.memory[di + 2u] = as_type(line_y); + _285.memory[di + 3u] = as_type(line_c); + break; + } + case 732u: + { + p0 = as_type(uint2(_103.scene[dd + 1u], _103.scene[dd + 2u])); + p1 = as_type(uint2(_103.scene[dd + 3u], _103.scene[dd + 4u])); + float r0 = as_type(_103.scene[dd + 5u]); + float r1 = as_type(_103.scene[dd + 6u]); + float inv_det = 1.0 / ((mat.x * mat.w) - (mat.y * mat.z)); + float4 inv_mat = float4(mat.w, -mat.y, -mat.z, mat.x) * inv_det; + float2 inv_tr = (inv_mat.xz * translate.x) + (inv_mat.yw * translate.y); + inv_tr += p0; + float2 center1 = p1 - p0; + float rr = r1 / (r1 - r0); + float rainv = rr / ((r1 * r1) - dot(center1, center1)); + float2 c1 = center1 * rainv; + float ra = rr * rainv; + float roff = rr - 1.0; + _285.memory[di] = as_type(linewidth); + _285.memory[di + 1u] = as_type(inv_mat.x); + _285.memory[di + 2u] = as_type(inv_mat.y); + _285.memory[di + 3u] = as_type(inv_mat.z); + _285.memory[di + 4u] = as_type(inv_mat.w); + _285.memory[di + 5u] = as_type(inv_tr.x); + _285.memory[di + 6u] = as_type(inv_tr.y); + _285.memory[di + 7u] = as_type(c1.x); + _285.memory[di + 8u] = as_type(c1.y); + _285.memory[di + 9u] = as_type(ra); + _285.memory[di + 10u] = as_type(roff); + break; + } + case 5u: + { + break; + } + } + } + if ((tag_word == 5u) || (tag_word == 37u)) + { + uint path_ix = ~(out_ix + i_2); + if (tag_word == 5u) + { + path_ix = m.path_ix; + } + _285.memory[clip_out_base + m.clip_ix] = path_ix; + } + } +} + diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv new file mode 100644 index 0000000..58dde43 Binary files /dev/null and b/piet-gpu/shader/gen/draw_leaf.spv differ diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil new file mode 100644 index 0000000..be69aad Binary files /dev/null and b/piet-gpu/shader/gen/draw_reduce.dxil differ diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl new file mode 100644 index 
0000000..8311155 --- /dev/null +++ b/piet-gpu/shader/gen/draw_reduce.hlsl @@ -0,0 +1,126 @@ +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +ByteAddressBuffer _87 : register(t1, space0); +ByteAddressBuffer _97 : register(t2, space0); +RWByteAddressBuffer _188 : register(u3, space0); +RWByteAddressBuffer _206 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared DrawMonoid sh_scratch[256]; + +DrawMonoid map_tag(uint tag_word) +{ + uint has_path = uint(tag_word != 0u); + DrawMonoid _70 = { has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; + return _70; +} + +DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + uint drawtag_base = _87.Load(100) >> uint(2); + uint 
tag_word = _97.Load((drawtag_base + ix) * 4 + 0); + uint param = tag_word; + DrawMonoid agg = map_tag(param); + for (uint i = 1u; i < 8u; i++) + { + uint tag_word_1 = _97.Load(((drawtag_base + ix) + i) * 4 + 0); + uint param_1 = tag_word_1; + DrawMonoid param_2 = agg; + DrawMonoid param_3 = map_tag(param_1); + agg = combine_draw_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + DrawMonoid param_4 = agg; + DrawMonoid param_5 = other; + agg = combine_draw_monoid(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _188.Store(gl_WorkGroupID.x * 16 + 0, agg.path_ix); + _188.Store(gl_WorkGroupID.x * 16 + 4, agg.clip_ix); + _188.Store(gl_WorkGroupID.x * 16 + 8, agg.scene_offset); + _188.Store(gl_WorkGroupID.x * 16 + 12, agg.info_offset); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl new file mode 100644 index 0000000..759267c --- /dev/null +++ b/piet-gpu/shader/gen/draw_reduce.msl @@ -0,0 +1,140 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc 
trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct DrawMonoid_1 +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct OutBuf +{ + DrawMonoid_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +DrawMonoid map_tag(thread const uint& tag_word) +{ + uint has_path = uint(tag_word != 0u); + return DrawMonoid{ has_path, tag_word & 1u, tag_word & 28u, (tag_word >> uint(4)) & 60u }; +} + +static inline __attribute__((always_inline)) +DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +kernel void main0(const device ConfigBuf& _87 [[buffer(1)]], const device SceneBuf& _97 [[buffer(2)]], device OutBuf& _188 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup DrawMonoid sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + uint drawtag_base = _87.conf.drawtag_offset >> uint(2); + uint tag_word = _97.scene[drawtag_base + ix]; + uint param = tag_word; + DrawMonoid agg = map_tag(param); + for (uint i = 1u; i < 8u; i++) + { + uint tag_word_1 = 
_97.scene[(drawtag_base + ix) + i]; + uint param_1 = tag_word_1; + DrawMonoid param_2 = agg; + DrawMonoid param_3 = map_tag(param_1); + agg = combine_draw_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + DrawMonoid param_4 = agg; + DrawMonoid param_5 = other; + agg = combine_draw_monoid(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _188.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _188.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix; + _188.outbuf[gl_WorkGroupID.x].scene_offset = agg.scene_offset; + _188.outbuf[gl_WorkGroupID.x].info_offset = agg.info_offset; + } +} + diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv new file mode 100644 index 0000000..d6c6fb7 Binary files /dev/null and b/piet-gpu/shader/gen/draw_reduce.spv differ diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil new file mode 100644 index 0000000..4ea23f7 Binary files /dev/null and b/piet-gpu/shader/gen/draw_root.dxil differ diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl new file mode 100644 index 0000000..b4cb7e4 --- /dev/null +++ b/piet-gpu/shader/gen/draw_root.hlsl @@ -0,0 +1,108 @@ +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const DrawMonoid _18 = { 0u, 0u, 0u, 0u }; + +RWByteAddressBuffer _71 : register(u0, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID 
: SV_DispatchThreadID; +}; + +groupshared DrawMonoid sh_scratch[256]; + +DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +DrawMonoid draw_monoid_identity() +{ + return _18; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + DrawMonoid _75; + _75.path_ix = _71.Load(ix * 16 + 0); + _75.clip_ix = _71.Load(ix * 16 + 4); + _75.scene_offset = _71.Load(ix * 16 + 8); + _75.info_offset = _71.Load(ix * 16 + 12); + DrawMonoid local[8]; + local[0].path_ix = _75.path_ix; + local[0].clip_ix = _75.clip_ix; + local[0].scene_offset = _75.scene_offset; + local[0].info_offset = _75.info_offset; + DrawMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + DrawMonoid param = local[i - 1u]; + DrawMonoid _106; + _106.path_ix = _71.Load((ix + i) * 16 + 0); + _106.clip_ix = _71.Load((ix + i) * 16 + 4); + _106.scene_offset = _71.Load((ix + i) * 16 + 8); + _106.info_offset = _71.Load((ix + i) * 16 + 12); + param_1.path_ix = _106.path_ix; + param_1.clip_ix = _106.clip_ix; + param_1.scene_offset = _106.scene_offset; + param_1.info_offset = _106.info_offset; + local[i] = combine_draw_monoid(param, param_1); + } + DrawMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + DrawMonoid param_2 = other; + DrawMonoid param_3 = agg; + agg = combine_draw_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + DrawMonoid row = draw_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 
0u; i_2 < 8u; i_2++) + { + DrawMonoid param_4 = row; + DrawMonoid param_5 = local[i_2]; + DrawMonoid m = combine_draw_monoid(param_4, param_5); + uint _199 = ix + i_2; + _71.Store(_199 * 16 + 0, m.path_ix); + _71.Store(_199 * 16 + 4, m.clip_ix); + _71.Store(_199 * 16 + 8, m.scene_offset); + _71.Store(_199 * 16 + 12, m.info_offset); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/draw_root.msl b/piet-gpu/shader/gen/draw_root.msl new file mode 100644 index 0000000..9ee8cfe --- /dev/null +++ b/piet-gpu/shader/gen/draw_root.msl @@ -0,0 +1,140 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct DrawMonoid_1 +{ + uint path_ix; + uint clip_ix; + uint scene_offset; + uint info_offset; +}; + +struct DataBuf +{ + DrawMonoid_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline 
__attribute__((always_inline)) +DrawMonoid combine_draw_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b) +{ + DrawMonoid c; + c.path_ix = a.path_ix + b.path_ix; + c.clip_ix = a.clip_ix + b.clip_ix; + c.scene_offset = a.scene_offset + b.scene_offset; + c.info_offset = a.info_offset + b.info_offset; + return c; +} + +static inline __attribute__((always_inline)) +DrawMonoid draw_monoid_identity() +{ + return DrawMonoid{ 0u, 0u, 0u, 0u }; +} + +kernel void main0(device DataBuf& _71 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup DrawMonoid sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].path_ix = _71.data[ix].path_ix; + local[0].clip_ix = _71.data[ix].clip_ix; + local[0].scene_offset = _71.data[ix].scene_offset; + local[0].info_offset = _71.data[ix].info_offset; + DrawMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _100 = ix + i; + DrawMonoid param = local[i - 1u]; + param_1.path_ix = _71.data[_100].path_ix; + param_1.clip_ix = _71.data[_100].clip_ix; + param_1.scene_offset = _71.data[_100].scene_offset; + param_1.info_offset = _71.data[_100].info_offset; + local[i] = combine_draw_monoid(param, param_1); + } + DrawMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + DrawMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + DrawMonoid param_2 = other; + DrawMonoid param_3 = agg; + agg = combine_draw_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + DrawMonoid row = draw_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 
0u; i_2 < 8u; i_2++) + { + DrawMonoid param_4 = row; + DrawMonoid param_5 = local[i_2]; + DrawMonoid m = combine_draw_monoid(param_4, param_5); + uint _199 = ix + i_2; + _71.data[_199].path_ix = m.path_ix; + _71.data[_199].clip_ix = m.clip_ix; + _71.data[_199].scene_offset = m.scene_offset; + _71.data[_199].info_offset = m.info_offset; + } +} + diff --git a/piet-gpu/shader/gen/draw_root.spv b/piet-gpu/shader/gen/draw_root.spv new file mode 100644 index 0000000..e6a53e5 Binary files /dev/null and b/piet-gpu/shader/gen/draw_root.spv differ diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil new file mode 100644 index 0000000..c48d59d Binary files /dev/null and b/piet-gpu/shader/gen/kernel4.dxil differ diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl new file mode 100644 index 0000000..0a6c022 --- /dev/null +++ b/piet-gpu/shader/gen/kernel4.hlsl @@ -0,0 +1,1303 @@ +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint 
flags; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 _vector; + float y_edge; + TileSegRef next; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); + +RWByteAddressBuffer _297 : register(u0, space0); +ByteAddressBuffer _1681 : register(t1, space0); +RWTexture2D image_atlas : register(u3, space0); +RWTexture2D gradients : register(u4, space0); +RWTexture2D image : register(u2, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +uint spvPackUnorm4x8(float4 value) +{ + uint4 Packed = uint4(round(saturate(value) * 255.0)); + return Packed.x | (Packed.y << 8) | (Packed.z << 16) | (Packed.w << 24); +} + +float4 spvUnpackUnorm4x8(uint value) +{ + uint4 Packed = uint4(value & 0xff, (value >> 8) & 0xff, (value >> 16) & 0xff, value >> 24); + return float4(Packed) / 255.0; +} + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _310 = { a.offset + offset }; + return _310; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _297.Load(offset * 4 + 8); + return v; +} + +CmdTag Cmd_tag(Alloc a, 
CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + CmdTag _669 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _669; +} + +CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = asfloat(raw1); + return s; +} + +CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) +{ + CmdStrokeRef _685 = { ref.offset + 4u }; + Alloc param = a; + CmdStrokeRef param_1 = _685; + return CmdStroke_read(param, param_1); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +TileSeg TileSeg_read(Alloc a, TileSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TileSeg s; + s.origin = float2(asfloat(raw0), asfloat(raw1)); + s._vector = float2(asfloat(raw2), asfloat(raw3)); + s.y_edge = asfloat(raw4); + TileSegRef _826 = { raw5 }; + s.next = _826; + return s; +} + +uint2 chunk_offset(uint i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +CmdFill CmdFill_read(Alloc a, CmdFillRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = 
read_mem(param_2, param_3); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) +{ + CmdFillRef _675 = { ref.offset + 4u }; + Alloc param = a; + CmdFillRef param_1 = _675; + return CmdFill_read(param, param_1); +} + +CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdAlpha s; + s.alpha = asfloat(raw0); + return s; +} + +CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref) +{ + CmdAlphaRef _695 = { ref.offset + 4u }; + Alloc param = a; + CmdAlphaRef param_1 = _695; + return CmdAlpha_read(param, param_1); +} + +CmdColor CmdColor_read(Alloc a, CmdColorRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +CmdColor Cmd_Color_read(Alloc a, CmdRef ref) +{ + CmdColorRef _705 = { ref.offset + 4u }; + Alloc param = a; + CmdColorRef param_1 = _705; + return CmdColor_read(param, param_1); +} + +float3 fromsRGB(float3 srgb) +{ + return srgb; +} + +float4 unpacksRGB(uint srgba) +{ + float4 color = spvUnpackUnorm4x8(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + CmdLinGrad s; + s.index = raw0; + s.line_x = asfloat(raw1); + s.line_y = asfloat(raw2); + s.line_c = asfloat(raw3); + return s; +} + +CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref) +{ + CmdLinGradRef _715 = { ref.offset + 4u 
}; + Alloc param = a; + CmdLinGradRef param_1 = _715; + return CmdLinGrad_read(param, param_1); +} + +CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21); + CmdRadGrad s; + s.index = raw0; + s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4)); + s.xlat = float2(asfloat(raw5), asfloat(raw6)); + s.c1 = float2(asfloat(raw7), asfloat(raw8)); + s.ra = asfloat(raw9); + s.roff = asfloat(raw10); + return s; +} + +CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref) +{ + CmdRadGradRef _725 = { ref.offset + 4u }; + Alloc param = a; + CmdRadGradRef param_1 = _725; + return CmdRadGrad_read(param, param_1); +} + +CmdImage CmdImage_read(Alloc a, CmdImageRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << 
uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +CmdImage Cmd_Image_read(Alloc a, CmdRef ref) +{ + CmdImageRef _735 = { ref.offset + 4u }; + Alloc param = a; + CmdImageRef param_1 = _735; + return CmdImage_read(param, param_1); +} + +void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img) +{ + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas[uv]; + float3 param_1 = fg_rgba.xyz; + float3 _1653 = fromsRGB(param_1); + fg_rgba.x = _1653.x; + fg_rgba.y = _1653.y; + fg_rgba.z = _1653.z; + rgba[i] = fg_rgba; + } + spvReturnValue = rgba; +} + +float3 tosRGB(float3 rgb) +{ + return rgb; +} + +uint packsRGB(inout float4 rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return spvPackUnorm4x8(rgba.wzyx); +} + +CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdEndClip s; + s.blend = raw0; + return s; +} + +CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) +{ + CmdEndClipRef _745 = { ref.offset + 4u }; + Alloc param = a; + CmdEndClipRef param_1 = _745; + return CmdEndClip_read(param, param_1); +} + +float3 screen(float3 cb, float3 cs) +{ + return (cb + cs) - (cb * cs); +} + +float3 hard_light(float3 cb, float3 cs) +{ + float3 param = cb; + float3 param_1 = (cs * 2.0f) - 1.0f.xxx; + float3 _889 = screen(param, param_1); + float3 _893 = (cb * 2.0f) * cs; + bool3 _898 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z); + return float3(_898.x ? _893.x : _889.x, _898.y ? _893.y : _889.y, _898.z ? 
_893.z : _889.z); +} + +float color_dodge(float cb, float cs) +{ + if (cb == 0.0f) + { + return 0.0f; + } + else + { + if (cs == 1.0f) + { + return 1.0f; + } + else + { + return min(1.0f, cb / (1.0f - cs)); + } + } +} + +float color_burn(float cb, float cs) +{ + if (cb == 1.0f) + { + return 1.0f; + } + else + { + if (cs == 0.0f) + { + return 0.0f; + } + else + { + return 1.0f - min(1.0f, (1.0f - cb) / cs); + } + } +} + +float3 soft_light(float3 cb, float3 cs) +{ + float3 _904 = sqrt(cb); + float3 _917 = ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb; + bool3 _921 = bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z); + float3 d = float3(_921.x ? _917.x : _904.x, _921.y ? _917.y : _904.y, _921.z ? _917.z : _904.z); + float3 _932 = cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)); + float3 _942 = cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)); + bool3 _944 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z); + return float3(_944.x ? _942.x : _932.x, _944.y ? _942.y : _932.y, _944.z ? 
_942.z : _932.z); +} + +float sat(float3 c) +{ + return max(c.x, max(c.y, c.z)) - min(c.x, min(c.y, c.z)); +} + +void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s) +{ + if (cmax > cmin) + { + cmid = ((cmid - cmin) * s) / (cmax - cmin); + cmax = s; + } + else + { + cmid = 0.0f; + cmax = 0.0f; + } + cmin = 0.0f; +} + +float3 set_sat(inout float3 c, float s) +{ + if (c.x <= c.y) + { + if (c.y <= c.z) + { + float param = c.x; + float param_1 = c.y; + float param_2 = c.z; + float param_3 = s; + set_sat_inner(param, param_1, param_2, param_3); + c.x = param; + c.y = param_1; + c.z = param_2; + } + else + { + if (c.x <= c.z) + { + float param_4 = c.x; + float param_5 = c.z; + float param_6 = c.y; + float param_7 = s; + set_sat_inner(param_4, param_5, param_6, param_7); + c.x = param_4; + c.z = param_5; + c.y = param_6; + } + else + { + float param_8 = c.z; + float param_9 = c.x; + float param_10 = c.y; + float param_11 = s; + set_sat_inner(param_8, param_9, param_10, param_11); + c.z = param_8; + c.x = param_9; + c.y = param_10; + } + } + } + else + { + if (c.x <= c.z) + { + float param_12 = c.y; + float param_13 = c.x; + float param_14 = c.z; + float param_15 = s; + set_sat_inner(param_12, param_13, param_14, param_15); + c.y = param_12; + c.x = param_13; + c.z = param_14; + } + else + { + if (c.y <= c.z) + { + float param_16 = c.y; + float param_17 = c.z; + float param_18 = c.x; + float param_19 = s; + set_sat_inner(param_16, param_17, param_18, param_19); + c.y = param_16; + c.z = param_17; + c.x = param_18; + } + else + { + float param_20 = c.z; + float param_21 = c.y; + float param_22 = c.x; + float param_23 = s; + set_sat_inner(param_20, param_21, param_22, param_23); + c.z = param_20; + c.y = param_21; + c.x = param_22; + } + } + } + return c; +} + +float lum(float3 c) +{ + float3 f = float3(0.300000011920928955078125f, 0.589999973773956298828125f, 0.10999999940395355224609375f); + return dot(c, f); +} + +float3 clip_color(inout 
float3 c) +{ + float3 param = c; + float L = lum(param); + float n = min(c.x, min(c.y, c.z)); + float x = max(c.x, max(c.y, c.z)); + if (n < 0.0f) + { + c = L.xxx + (((c - L.xxx) * L) / (L - n).xxx); + } + if (x > 1.0f) + { + c = L.xxx + (((c - L.xxx) * (1.0f - L)) / (x - L).xxx); + } + return c; +} + +float3 set_lum(float3 c, float l) +{ + float3 param = c; + float3 param_1 = c + (l - lum(param)).xxx; + float3 _1048 = clip_color(param_1); + return _1048; +} + +float3 mix_blend(float3 cb, float3 cs, uint mode) +{ + float3 b = 0.0f.xxx; + switch (mode) + { + case 1u: + { + b = cb * cs; + break; + } + case 2u: + { + float3 param = cb; + float3 param_1 = cs; + b = screen(param, param_1); + break; + } + case 3u: + { + float3 param_2 = cs; + float3 param_3 = cb; + b = hard_light(param_2, param_3); + break; + } + case 4u: + { + b = min(cb, cs); + break; + } + case 5u: + { + b = max(cb, cs); + break; + } + case 6u: + { + float param_4 = cb.x; + float param_5 = cs.x; + float param_6 = cb.y; + float param_7 = cs.y; + float param_8 = cb.z; + float param_9 = cs.z; + b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9)); + break; + } + case 7u: + { + float param_10 = cb.x; + float param_11 = cs.x; + float param_12 = cb.y; + float param_13 = cs.y; + float param_14 = cb.z; + float param_15 = cs.z; + b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15)); + break; + } + case 8u: + { + float3 param_16 = cb; + float3 param_17 = cs; + b = hard_light(param_16, param_17); + break; + } + case 9u: + { + float3 param_18 = cb; + float3 param_19 = cs; + b = soft_light(param_18, param_19); + break; + } + case 10u: + { + b = abs(cb - cs); + break; + } + case 11u: + { + b = (cb + cs) - ((cb * 2.0f) * cs); + break; + } + case 12u: + { + float3 param_20 = cb; + float3 param_21 = cs; + float param_22 = sat(param_20); + float3 _1340 = set_sat(param_21, param_22); + float3 param_23 = cb; + float3 
param_24 = _1340; + float param_25 = lum(param_23); + b = set_lum(param_24, param_25); + break; + } + case 13u: + { + float3 param_26 = cs; + float3 param_27 = cb; + float param_28 = sat(param_26); + float3 _1354 = set_sat(param_27, param_28); + float3 param_29 = cb; + float3 param_30 = _1354; + float param_31 = lum(param_29); + b = set_lum(param_30, param_31); + break; + } + case 14u: + { + float3 param_32 = cb; + float3 param_33 = cs; + float param_34 = lum(param_32); + b = set_lum(param_33, param_34); + break; + } + case 15u: + { + float3 param_35 = cs; + float3 param_36 = cb; + float param_37 = lum(param_35); + b = set_lum(param_36, param_37); + break; + } + default: + { + b = cs; + break; + } + } + return b; +} + +float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode) +{ + float fa = 0.0f; + float fb = 0.0f; + switch (mode) + { + case 1u: + { + fa = 1.0f; + fb = 0.0f; + break; + } + case 2u: + { + fa = 0.0f; + fb = 1.0f; + break; + } + case 3u: + { + fa = 1.0f; + fb = 1.0f - as; + break; + } + case 4u: + { + fa = 1.0f - ab; + fb = 1.0f; + break; + } + case 5u: + { + fa = ab; + fb = 0.0f; + break; + } + case 6u: + { + fa = 0.0f; + fb = as; + break; + } + case 7u: + { + fa = 1.0f - ab; + fb = 0.0f; + break; + } + case 8u: + { + fa = 0.0f; + fb = 1.0f - as; + break; + } + case 9u: + { + fa = ab; + fb = 1.0f - as; + break; + } + case 10u: + { + fa = 1.0f - ab; + fb = as; + break; + } + case 11u: + { + fa = 1.0f - ab; + fb = 1.0f - as; + break; + } + case 12u: + { + fa = 1.0f; + fb = 1.0f; + break; + } + case 13u: + { + return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab)); + } + default: + { + break; + } + } + float as_fa = as * fa; + float ab_fb = ab * fb; + float3 co = (cs * as_fa) + (cb * ab_fb); + return float4(co, as_fa + ab_fb); +} + +float4 mix_blend_compose(float4 backdrop, float4 src, uint mode) +{ + if ((mode & 32767u) == 3u) + { + return (backdrop * (1.0f - src.w)) + src; + } + float inv_src_a = 1.0f / (src.w + 
1.0000000036274937255387218471014e-15f); + float3 cs = src.xyz * inv_src_a; + float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f); + float3 cb = backdrop.xyz * inv_backdrop_a; + uint blend_mode = mode >> uint(8); + float3 param = cb; + float3 param_1 = cs; + uint param_2 = blend_mode; + float3 blended = mix_blend(param, param_1, param_2); + cs = lerp(cs, blended, backdrop.w.xxx); + uint comp_mode = mode & 255u; + if (comp_mode == 3u) + { + float3 co = lerp(backdrop.xyz, cs, src.w.xxx); + return float4(co, src.w + (backdrop.w * (1.0f - src.w))); + } + else + { + float3 param_3 = cb; + float3 param_4 = cs; + float param_5 = backdrop.w; + float param_6 = src.w; + uint param_7 = comp_mode; + return mix_compose(param_3, param_4, param_5, param_6, param_7); + } +} + +CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdJump s; + s.new_ref = raw0; + return s; +} + +CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) +{ + CmdJumpRef _755 = { ref.offset + 4u }; + Alloc param = a; + CmdJumpRef param_1 = _755; + return CmdJump_read(param, param_1); +} + +void comp_main() +{ + uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x; + Alloc _1696; + _1696.offset = _1681.Load(24); + Alloc param; + param.offset = _1696.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef _1705 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1705; + uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8); + cmd_ref.offset += 4u; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = 0.0f.xxxx; + } + uint clip_depth = 0u; + bool mem_ok = _297.Load(4) == 0u; + float df[8]; + TileSegRef 
tile_seg_ref; + float area[8]; + uint blend_stack[4][8]; + uint base_ix_1; + uint bg_rgba; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4).tag; + if (tag == 0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0f; + } + TileSegRef _1810 = { stroke.tile_ref }; + tile_seg_ref = _1810; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11); + float2 line_vec = seg._vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + 0.5f.xx) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0f, 1.0f); + df[k_1] = min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = clamp((stroke.half_width + 0.5f) - df[k_2], 0.0f, 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + TileSegRef _1930 = { fill.tile_ref }; + tile_seg_ref = _1930; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 
end = start + seg_1._vector; + float2 window = clamp(float2(start.y, end.y), 0.0f.xx, 1.0f.xx); + if (window.x != window.y) + { + float2 t_1 = (window - start.y.xx) / seg_1._vector.y.xx; + float2 xs = float2(lerp(start.x, end.x, t_1.x), lerp(start.x, end.x, t_1.y)); + float xmin = min(min(xs.x, xs.y), 1.0f) - 9.9999999747524270787835121154785e-07f; + float xmax = max(xs.x, xs.y); + float b = min(xmax, 1.0f); + float c = max(b, 0.0f); + float d = max(xmin, 0.0f); + float a = ((b + (0.5f * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1._vector.x) * clamp((my_xy.y - seg_1.y_edge) + 1.0f, 0.0f, 1.0f)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = min(abs(area[k_5]), 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0f; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0f - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * 
chunk_xy.y); + int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); + float4 fg_rgba = gradients[int2(x, int(lin.index))]; + float3 param_29 = fg_rgba.xyz; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; + float4 fg_k_1 = fg_rgba * area[k_9]; + rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31); + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + uint param_32 = k_10; + float2 my_xy_1 = xy + float2(chunk_offset(param_32)); + my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; + float ba = dot(my_xy_1, rad.c1); + float ca = rad.ra * dot(my_xy_1, my_xy_1); + float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff; + int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); + float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))]; + float3 param_33 = fg_rgba_1.xyz; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; + float4 fg_k_2 = fg_rgba_1 * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2; + } + cmd_ref.offset += 48u; + break; + } + case 8u: + { + Alloc param_34 = cmd_alloc; + CmdRef param_35 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_34, param_35); + uint2 param_36 = xy_uint; + CmdImage param_37 = fill_img; + float4 _2417[8]; + fillImage(_2417, param_36, param_37); + float4 img[8] = _2417; + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + float4 fg_k_3 = img[k_11] * area[k_11]; + rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3; + } + cmd_ref.offset += 12u; + break; + } + case 9u: + { + if (clip_depth < 4u) + { + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = 0.0f.xxxx; + } + } + else + { + uint 
base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + _297.Store((base_ix + k_13) * 4 + 8, _2522); + rgba[k_13] = 0.0f.xxxx; + } + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41); + clip_depth--; + if (clip_depth >= 4u) + { + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8); + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); + } + cmd_ref.offset += 8u; + break; + } + case 11u: + { + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref }; + cmd_ref = _2621; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_48 = i_1; + float3 param_49 = rgba[i_1].xyz; + image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w); + } +} + +[numthreads(8, 4, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl new file mode 100644 index 0000000..f60ea81 --- /dev/null +++ b/piet-gpu/shader/gen/kernel4.msl @@ -0,0 
+1,1349 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint 
offset; +}; + +struct TileSeg +{ + float2 origin; + float2 vector; + float y_edge; + TileSegRef next; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u); + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_297) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_297.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_297); + return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +CmdStroke 
CmdStroke_read(thread const Alloc& a, thread const CmdStrokeRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = as_type(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u }; + return CmdStroke_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_297); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_297); + TileSeg s; + s.origin = float2(as_type(raw0), as_type(raw1)); + s.vector = float2(as_type(raw2), as_type(raw3)); + s.y_edge = as_type(raw4); + s.next = TileSegRef{ raw5 }; + return s; +} + +static inline __attribute__((always_inline)) +uint2 chunk_offset(thread const uint& i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +static inline 
__attribute__((always_inline)) +CmdFill CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u }; + return CmdFill_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdAlpha s; + s.alpha = as_type(raw0); + return s; +} + +static inline __attribute__((always_inline)) +CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u }; + return CmdAlpha_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u }; + return CmdColor_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +float3 fromsRGB(thread const float3& srgb) +{ 
+ return srgb; +} + +static inline __attribute__((always_inline)) +float4 unpacksRGB(thread const uint& srgba) +{ + float4 color = unpack_unorm4x8_to_float(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +static inline __attribute__((always_inline)) +CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + CmdLinGrad s; + s.index = raw0; + s.line_x = as_type(raw1); + s.line_y = as_type(raw2); + s.line_c = as_type(raw3); + return s; +} + +static inline __attribute__((always_inline)) +CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u }; + return CmdLinGrad_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_297); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_297); + 
Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_297); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15, v_297); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17, v_297); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19, v_297); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21, v_297); + CmdRadGrad s; + s.index = raw0; + s.mat = float4(as_type(raw1), as_type(raw2), as_type(raw3), as_type(raw4)); + s.xlat = float2(as_type(raw5), as_type(raw6)); + s.c1 = float2(as_type(raw7), as_type(raw8)); + s.ra = as_type(raw9); + s.roff = as_type(raw10); + return s; +} + +static inline __attribute__((always_inline)) +CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u }; + return CmdRadGrad_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +static inline __attribute__((always_inline)) +CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u }; + return CmdImage_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +spvUnsafeArray fillImage(thread const uint2& xy, thread const CmdImage& cmd_img, texture2d image_atlas) +{ + spvUnsafeArray rgba; + for 
(uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas.read(uint2(uv)); + float3 param_1 = fg_rgba.xyz; + float3 _1653 = fromsRGB(param_1); + fg_rgba.x = _1653.x; + fg_rgba.y = _1653.y; + fg_rgba.z = _1653.z; + rgba[i] = fg_rgba; + } + return rgba; +} + +static inline __attribute__((always_inline)) +float3 tosRGB(thread const float3& rgb) +{ + return rgb; +} + +static inline __attribute__((always_inline)) +uint packsRGB(thread float4& rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return pack_float_to_unorm4x8(rgba.wzyx); +} + +static inline __attribute__((always_inline)) +CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdEndClip s; + s.blend = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u }; + return CmdEndClip_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +float3 screen(thread const float3& cb, thread const float3& cs) +{ + return (cb + cs) - (cb * cs); +} + +static inline __attribute__((always_inline)) +float3 hard_light(thread const float3& cb, thread const float3& cs) +{ + float3 param = cb; + float3 param_1 = (cs * 2.0) - float3(1.0); + return select(screen(param, param_1), (cb * 2.0) * cs, cs <= float3(0.5)); +} + +static inline __attribute__((always_inline)) +float color_dodge(thread const float& cb, thread const float& cs) +{ + if (cb == 0.0) + { + return 0.0; + } + else + { + if (cs == 1.0) + { + return 1.0; + } + else + { + return fast::min(1.0, cb / (1.0 - cs)); + } + } +} + +static inline __attribute__((always_inline)) 
+float color_burn(thread const float& cb, thread const float& cs) +{ + if (cb == 1.0) + { + return 1.0; + } + else + { + if (cs == 0.0) + { + return 0.0; + } + else + { + return 1.0 - fast::min(1.0, (1.0 - cb) / cs); + } + } +} + +static inline __attribute__((always_inline)) +float3 soft_light(thread const float3& cb, thread const float3& cs) +{ + float3 d = select(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, cb <= float3(0.25)); + return select(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), cs <= float3(0.5)); +} + +static inline __attribute__((always_inline)) +float sat(thread const float3& c) +{ + return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z)); +} + +static inline __attribute__((always_inline)) +void set_sat_inner(thread float& cmin, thread float& cmid, thread float& cmax, thread const float& s) +{ + if (cmax > cmin) + { + cmid = ((cmid - cmin) * s) / (cmax - cmin); + cmax = s; + } + else + { + cmid = 0.0; + cmax = 0.0; + } + cmin = 0.0; +} + +static inline __attribute__((always_inline)) +float3 set_sat(thread float3& c, thread const float& s) +{ + if (c.x <= c.y) + { + if (c.y <= c.z) + { + float param = c.x; + float param_1 = c.y; + float param_2 = c.z; + float param_3 = s; + set_sat_inner(param, param_1, param_2, param_3); + c.x = param; + c.y = param_1; + c.z = param_2; + } + else + { + if (c.x <= c.z) + { + float param_4 = c.x; + float param_5 = c.z; + float param_6 = c.y; + float param_7 = s; + set_sat_inner(param_4, param_5, param_6, param_7); + c.x = param_4; + c.z = param_5; + c.y = param_6; + } + else + { + float param_8 = c.z; + float param_9 = c.x; + float param_10 = c.y; + float param_11 = s; + set_sat_inner(param_8, param_9, param_10, param_11); + c.z = param_8; + c.x = param_9; + c.y = param_10; + } + } + } + else + { + if (c.x <= c.z) + { + float param_12 = c.y; + float param_13 = c.x; + float param_14 = c.z; + float param_15 = 
s; + set_sat_inner(param_12, param_13, param_14, param_15); + c.y = param_12; + c.x = param_13; + c.z = param_14; + } + else + { + if (c.y <= c.z) + { + float param_16 = c.y; + float param_17 = c.z; + float param_18 = c.x; + float param_19 = s; + set_sat_inner(param_16, param_17, param_18, param_19); + c.y = param_16; + c.z = param_17; + c.x = param_18; + } + else + { + float param_20 = c.z; + float param_21 = c.y; + float param_22 = c.x; + float param_23 = s; + set_sat_inner(param_20, param_21, param_22, param_23); + c.z = param_20; + c.y = param_21; + c.x = param_22; + } + } + } + return c; +} + +static inline __attribute__((always_inline)) +float lum(thread const float3& c) +{ + float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375); + return dot(c, f); +} + +static inline __attribute__((always_inline)) +float3 clip_color(thread float3& c) +{ + float3 param = c; + float L = lum(param); + float n = fast::min(c.x, fast::min(c.y, c.z)); + float x = fast::max(c.x, fast::max(c.y, c.z)); + if (n < 0.0) + { + c = float3(L) + (((c - float3(L)) * L) / float3(L - n)); + } + if (x > 1.0) + { + c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L)); + } + return c; +} + +static inline __attribute__((always_inline)) +float3 set_lum(thread const float3& c, thread const float& l) +{ + float3 param = c; + float3 param_1 = c + float3(l - lum(param)); + float3 _1048 = clip_color(param_1); + return _1048; +} + +static inline __attribute__((always_inline)) +float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode) +{ + float3 b = float3(0.0); + switch (mode) + { + case 1u: + { + b = cb * cs; + break; + } + case 2u: + { + float3 param = cb; + float3 param_1 = cs; + b = screen(param, param_1); + break; + } + case 3u: + { + float3 param_2 = cs; + float3 param_3 = cb; + b = hard_light(param_2, param_3); + break; + } + case 4u: + { + b = fast::min(cb, cs); + break; + } + case 5u: + { + b = 
fast::max(cb, cs); + break; + } + case 6u: + { + float param_4 = cb.x; + float param_5 = cs.x; + float param_6 = cb.y; + float param_7 = cs.y; + float param_8 = cb.z; + float param_9 = cs.z; + b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9)); + break; + } + case 7u: + { + float param_10 = cb.x; + float param_11 = cs.x; + float param_12 = cb.y; + float param_13 = cs.y; + float param_14 = cb.z; + float param_15 = cs.z; + b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15)); + break; + } + case 8u: + { + float3 param_16 = cb; + float3 param_17 = cs; + b = hard_light(param_16, param_17); + break; + } + case 9u: + { + float3 param_18 = cb; + float3 param_19 = cs; + b = soft_light(param_18, param_19); + break; + } + case 10u: + { + b = abs(cb - cs); + break; + } + case 11u: + { + b = (cb + cs) - ((cb * 2.0) * cs); + break; + } + case 12u: + { + float3 param_20 = cb; + float3 param_21 = cs; + float param_22 = sat(param_20); + float3 _1340 = set_sat(param_21, param_22); + float3 param_23 = cb; + float3 param_24 = _1340; + float param_25 = lum(param_23); + b = set_lum(param_24, param_25); + break; + } + case 13u: + { + float3 param_26 = cs; + float3 param_27 = cb; + float param_28 = sat(param_26); + float3 _1354 = set_sat(param_27, param_28); + float3 param_29 = cb; + float3 param_30 = _1354; + float param_31 = lum(param_29); + b = set_lum(param_30, param_31); + break; + } + case 14u: + { + float3 param_32 = cb; + float3 param_33 = cs; + float param_34 = lum(param_32); + b = set_lum(param_33, param_34); + break; + } + case 15u: + { + float3 param_35 = cs; + float3 param_36 = cb; + float param_37 = lum(param_35); + b = set_lum(param_36, param_37); + break; + } + default: + { + b = cs; + break; + } + } + return b; +} + +static inline __attribute__((always_inline)) +float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const 
float& as, thread const uint& mode) +{ + float fa = 0.0; + float fb = 0.0; + switch (mode) + { + case 1u: + { + fa = 1.0; + fb = 0.0; + break; + } + case 2u: + { + fa = 0.0; + fb = 1.0; + break; + } + case 3u: + { + fa = 1.0; + fb = 1.0 - as; + break; + } + case 4u: + { + fa = 1.0 - ab; + fb = 1.0; + break; + } + case 5u: + { + fa = ab; + fb = 0.0; + break; + } + case 6u: + { + fa = 0.0; + fb = as; + break; + } + case 7u: + { + fa = 1.0 - ab; + fb = 0.0; + break; + } + case 8u: + { + fa = 0.0; + fb = 1.0 - as; + break; + } + case 9u: + { + fa = ab; + fb = 1.0 - as; + break; + } + case 10u: + { + fa = 1.0 - ab; + fb = as; + break; + } + case 11u: + { + fa = 1.0 - ab; + fb = 1.0 - as; + break; + } + case 12u: + { + fa = 1.0; + fb = 1.0; + break; + } + case 13u: + { + return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab)); + } + default: + { + break; + } + } + float as_fa = as * fa; + float ab_fb = ab * fb; + float3 co = (cs * as_fa) + (cb * ab_fb); + return float4(co, as_fa + ab_fb); +} + +static inline __attribute__((always_inline)) +float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode) +{ + if ((mode & 32767u) == 3u) + { + return (backdrop * (1.0 - src.w)) + src; + } + float inv_src_a = 1.0 / (src.w + 1.0000000036274937255387218471014e-15); + float3 cs = src.xyz * inv_src_a; + float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15); + float3 cb = backdrop.xyz * inv_backdrop_a; + uint blend_mode = mode >> uint(8); + float3 param = cb; + float3 param_1 = cs; + uint param_2 = blend_mode; + float3 blended = mix_blend(param, param_1, param_2); + cs = mix(cs, blended, float3(backdrop.w)); + uint comp_mode = mode & 255u; + if (comp_mode == 3u) + { + float3 co = mix(backdrop.xyz, cs, float3(src.w)); + return float4(co, src.w + (backdrop.w * (1.0 - src.w))); + } + else + { + float3 param_3 = cb; + float3 param_4 = cs; + float param_5 = backdrop.w; + float param_6 = src.w; + uint 
param_7 = comp_mode; + return mix_compose(param_3, param_4, param_5, param_6, param_7); + } +} + +static inline __attribute__((always_inline)) +CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdJump s; + s.new_ref = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u }; + return CmdJump_read(param, param_1, v_297); +} + +kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x; + Alloc param; + param.offset = _1681.conf.ptcl_alloc.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)]; + cmd_ref.offset += 4u; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = float4(0.0); + } + uint clip_depth = 0u; + bool mem_ok = v_297.mem_error == 0u; + spvUnsafeArray df; + TileSegRef tile_seg_ref; + spvUnsafeArray area; + spvUnsafeArray, 4> blend_stack; + uint base_ix_1; + uint bg_rgba; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4, v_297).tag; + if (tag == 
0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_297); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0; + } + tile_seg_ref = TileSegRef{ stroke.tile_ref }; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11, v_297); + float2 line_vec = seg.vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + float2(0.5)) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = fast::clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df[k_1] = fast::min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = fast::clamp((stroke.half_width + 0.5) - df[k_2], 0.0, 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14, v_297); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + tile_seg_ref = TileSegRef{ fill.tile_ref }; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1.vector; + float2 window = fast::clamp(float2(start.y, end.y), float2(0.0), float2(1.0)); + if ((isunordered(window.x, window.y) || window.x != window.y)) + { + float2 t_1 = (window - 
float2(start.y)) / float2(seg_1.vector.y); + float2 xs = float2(mix(start.x, end.x, t_1.x), mix(start.x, end.x, t_1.y)); + float xmin = fast::min(fast::min(xs.x, xs.y), 1.0) - 9.9999999747524270787835121154785e-07; + float xmax = fast::max(xs.x, xs.y); + float b = fast::min(xmax, 1.0); + float c = fast::max(b, 0.0); + float d = fast::max(xmin, 0.0); + float a = ((b + (0.5 * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1.vector.x) * fast::clamp((my_xy.y - seg_1.y_edge) + 1.0, 0.0, 1.0)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = fast::min(abs(area[k_5]), 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_297); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24, v_297); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0 - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_297); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); + float4 fg_rgba = 
gradients.read(uint2(int2(x, int(lin.index)))); + float3 param_29 = fg_rgba.xyz; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; + float4 fg_k_1 = fg_rgba * area[k_9]; + rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_297); + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + uint param_32 = k_10; + float2 my_xy_1 = xy + float2(chunk_offset(param_32)); + my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; + float ba = dot(my_xy_1, rad.c1); + float ca = rad.ra * dot(my_xy_1, my_xy_1); + float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff; + int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); + float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); + float3 param_33 = fg_rgba_1.xyz; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; + float4 fg_k_2 = fg_rgba_1 * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2; + } + cmd_ref.offset += 48u; + break; + } + case 8u: + { + Alloc param_34 = cmd_alloc; + CmdRef param_35 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_297); + uint2 param_36 = xy_uint; + CmdImage param_37 = fill_img; + spvUnsafeArray img; + img = fillImage(param_36, param_37, image_atlas); + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + float4 fg_k_3 = img[k_11] * area[k_11]; + rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3; + } + cmd_ref.offset += 12u; + break; + } + case 9u: + { + if (clip_depth < 4u) + { + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = float4(0.0); + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 
16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + v_297.memory[base_ix + k_13] = _2522; + rgba[k_13] = float4(0.0); + } + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297); + clip_depth--; + if (clip_depth >= 4u) + { + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = v_297.memory[base_ix_1 + k_14]; + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); + } + cmd_ref.offset += 8u; + break; + } + case 11u: + { + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref }; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_48 = i_1; + float3 param_49 = rgba[i_1].xyz; + image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48)))); + } +} + diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv new file mode 100644 index 0000000..c388941 Binary files /dev/null and b/piet-gpu/shader/gen/kernel4.spv differ diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil new file mode 100644 index 0000000..7390167 Binary files /dev/null and b/piet-gpu/shader/gen/kernel4_gray.dxil differ diff --git 
a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl new file mode 100644 index 0000000..ffada37 --- /dev/null +++ b/piet-gpu/shader/gen/kernel4_gray.hlsl @@ -0,0 +1,1302 @@ +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 _vector; + float y_edge; + TileSegRef next; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + 
uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); + +RWByteAddressBuffer _297 : register(u0, space0); +ByteAddressBuffer _1681 : register(t1, space0); +RWTexture2D image_atlas : register(u3, space0); +RWTexture2D gradients : register(u4, space0); +RWTexture2D image : register(u2, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +uint spvPackUnorm4x8(float4 value) +{ + uint4 Packed = uint4(round(saturate(value) * 255.0)); + return Packed.x | (Packed.y << 8) | (Packed.z << 16) | (Packed.w << 24); +} + +float4 spvUnpackUnorm4x8(uint value) +{ + uint4 Packed = uint4(value & 0xff, (value >> 8) & 0xff, (value >> 16) & 0xff, value >> 24); + return float4(Packed) / 255.0; +} + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _310 = { a.offset + offset }; + return _310; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _297.Load(offset * 4 + 8); + return v; +} + +CmdTag Cmd_tag(Alloc a, CmdRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + CmdTag _669 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _669; +} + +CmdStroke CmdStroke_read(Alloc a, CmdStrokeRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = asfloat(raw1); + return s; +} + +CmdStroke Cmd_Stroke_read(Alloc a, CmdRef ref) +{ + CmdStrokeRef _685 = { ref.offset + 4u }; + Alloc param = a; + CmdStrokeRef param_1 = _685; + return 
CmdStroke_read(param, param_1); +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +TileSeg TileSeg_read(Alloc a, TileSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TileSeg s; + s.origin = float2(asfloat(raw0), asfloat(raw1)); + s._vector = float2(asfloat(raw2), asfloat(raw3)); + s.y_edge = asfloat(raw4); + TileSegRef _826 = { raw5 }; + s.next = _826; + return s; +} + +uint2 chunk_offset(uint i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +CmdFill CmdFill_read(Alloc a, CmdFillRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +CmdFill Cmd_Fill_read(Alloc a, CmdRef ref) +{ + CmdFillRef _675 = { ref.offset + 4u }; + Alloc param = a; + CmdFillRef param_1 = _675; + return CmdFill_read(param, param_1); +} + +CmdAlpha CmdAlpha_read(Alloc a, CmdAlphaRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdAlpha s; + s.alpha = asfloat(raw0); + return s; +} + +CmdAlpha Cmd_Alpha_read(Alloc a, CmdRef ref) +{ + CmdAlphaRef _695 = { ref.offset + 4u }; + Alloc param = a; + CmdAlphaRef param_1 = _695; + return CmdAlpha_read(param, param_1); +} + +CmdColor 
CmdColor_read(Alloc a, CmdColorRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +CmdColor Cmd_Color_read(Alloc a, CmdRef ref) +{ + CmdColorRef _705 = { ref.offset + 4u }; + Alloc param = a; + CmdColorRef param_1 = _705; + return CmdColor_read(param, param_1); +} + +float3 fromsRGB(float3 srgb) +{ + return srgb; +} + +float4 unpacksRGB(uint srgba) +{ + float4 color = spvUnpackUnorm4x8(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +CmdLinGrad CmdLinGrad_read(Alloc a, CmdLinGradRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + CmdLinGrad s; + s.index = raw0; + s.line_x = asfloat(raw1); + s.line_y = asfloat(raw2); + s.line_c = asfloat(raw3); + return s; +} + +CmdLinGrad Cmd_LinGrad_read(Alloc a, CmdRef ref) +{ + CmdLinGradRef _715 = { ref.offset + 4u }; + Alloc param = a; + CmdLinGradRef param_1 = _715; + return CmdLinGrad_read(param, param_1); +} + +CmdRadGrad CmdRadGrad_read(Alloc a, CmdRadGradRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = 
read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21); + CmdRadGrad s; + s.index = raw0; + s.mat = float4(asfloat(raw1), asfloat(raw2), asfloat(raw3), asfloat(raw4)); + s.xlat = float2(asfloat(raw5), asfloat(raw6)); + s.c1 = float2(asfloat(raw7), asfloat(raw8)); + s.ra = asfloat(raw9); + s.roff = asfloat(raw10); + return s; +} + +CmdRadGrad Cmd_RadGrad_read(Alloc a, CmdRef ref) +{ + CmdRadGradRef _725 = { ref.offset + 4u }; + Alloc param = a; + CmdRadGradRef param_1 = _725; + return CmdRadGrad_read(param, param_1); +} + +CmdImage CmdImage_read(Alloc a, CmdImageRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +CmdImage Cmd_Image_read(Alloc a, CmdRef ref) +{ + CmdImageRef _735 = { ref.offset + 4u }; + Alloc param = a; + CmdImageRef param_1 = _735; + return CmdImage_read(param, param_1); +} + +void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img) +{ + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas[uv]; + float3 param_1 = fg_rgba.xyz; + float3 _1653 = fromsRGB(param_1); + fg_rgba.x = _1653.x; + fg_rgba.y = _1653.y; + fg_rgba.z = _1653.z; + rgba[i] = fg_rgba; + } + spvReturnValue = rgba; +} + +float3 tosRGB(float3 rgb) +{ + 
return rgb; +} + +uint packsRGB(inout float4 rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return spvPackUnorm4x8(rgba.wzyx); +} + +CmdEndClip CmdEndClip_read(Alloc a, CmdEndClipRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdEndClip s; + s.blend = raw0; + return s; +} + +CmdEndClip Cmd_EndClip_read(Alloc a, CmdRef ref) +{ + CmdEndClipRef _745 = { ref.offset + 4u }; + Alloc param = a; + CmdEndClipRef param_1 = _745; + return CmdEndClip_read(param, param_1); +} + +float3 screen(float3 cb, float3 cs) +{ + return (cb + cs) - (cb * cs); +} + +float3 hard_light(float3 cb, float3 cs) +{ + float3 param = cb; + float3 param_1 = (cs * 2.0f) - 1.0f.xxx; + float3 _889 = screen(param, param_1); + float3 _893 = (cb * 2.0f) * cs; + bool3 _898 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z); + return float3(_898.x ? _893.x : _889.x, _898.y ? _893.y : _889.y, _898.z ? _893.z : _889.z); +} + +float color_dodge(float cb, float cs) +{ + if (cb == 0.0f) + { + return 0.0f; + } + else + { + if (cs == 1.0f) + { + return 1.0f; + } + else + { + return min(1.0f, cb / (1.0f - cs)); + } + } +} + +float color_burn(float cb, float cs) +{ + if (cb == 1.0f) + { + return 1.0f; + } + else + { + if (cs == 0.0f) + { + return 0.0f; + } + else + { + return 1.0f - min(1.0f, (1.0f - cb) / cs); + } + } +} + +float3 soft_light(float3 cb, float3 cs) +{ + float3 _904 = sqrt(cb); + float3 _917 = ((((cb * 16.0f) - 12.0f.xxx) * cb) + 4.0f.xxx) * cb; + bool3 _921 = bool3(cb.x <= 0.25f.xxx.x, cb.y <= 0.25f.xxx.y, cb.z <= 0.25f.xxx.z); + float3 d = float3(_921.x ? _917.x : _904.x, _921.y ? _917.y : _904.y, _921.z ? _917.z : _904.z); + float3 _932 = cb + (((cs * 2.0f) - 1.0f.xxx) * (d - cb)); + float3 _942 = cb - (((1.0f.xxx - (cs * 2.0f)) * cb) * (1.0f.xxx - cb)); + bool3 _944 = bool3(cs.x <= 0.5f.xxx.x, cs.y <= 0.5f.xxx.y, cs.z <= 0.5f.xxx.z); + return float3(_944.x ? 
_942.x : _932.x, _944.y ? _942.y : _932.y, _944.z ? _942.z : _932.z); +} + +float sat(float3 c) +{ + return max(c.x, max(c.y, c.z)) - min(c.x, min(c.y, c.z)); +} + +void set_sat_inner(inout float cmin, inout float cmid, inout float cmax, float s) +{ + if (cmax > cmin) + { + cmid = ((cmid - cmin) * s) / (cmax - cmin); + cmax = s; + } + else + { + cmid = 0.0f; + cmax = 0.0f; + } + cmin = 0.0f; +} + +float3 set_sat(inout float3 c, float s) +{ + if (c.x <= c.y) + { + if (c.y <= c.z) + { + float param = c.x; + float param_1 = c.y; + float param_2 = c.z; + float param_3 = s; + set_sat_inner(param, param_1, param_2, param_3); + c.x = param; + c.y = param_1; + c.z = param_2; + } + else + { + if (c.x <= c.z) + { + float param_4 = c.x; + float param_5 = c.z; + float param_6 = c.y; + float param_7 = s; + set_sat_inner(param_4, param_5, param_6, param_7); + c.x = param_4; + c.z = param_5; + c.y = param_6; + } + else + { + float param_8 = c.z; + float param_9 = c.x; + float param_10 = c.y; + float param_11 = s; + set_sat_inner(param_8, param_9, param_10, param_11); + c.z = param_8; + c.x = param_9; + c.y = param_10; + } + } + } + else + { + if (c.x <= c.z) + { + float param_12 = c.y; + float param_13 = c.x; + float param_14 = c.z; + float param_15 = s; + set_sat_inner(param_12, param_13, param_14, param_15); + c.y = param_12; + c.x = param_13; + c.z = param_14; + } + else + { + if (c.y <= c.z) + { + float param_16 = c.y; + float param_17 = c.z; + float param_18 = c.x; + float param_19 = s; + set_sat_inner(param_16, param_17, param_18, param_19); + c.y = param_16; + c.z = param_17; + c.x = param_18; + } + else + { + float param_20 = c.z; + float param_21 = c.y; + float param_22 = c.x; + float param_23 = s; + set_sat_inner(param_20, param_21, param_22, param_23); + c.z = param_20; + c.y = param_21; + c.x = param_22; + } + } + } + return c; +} + +float lum(float3 c) +{ + float3 f = float3(0.300000011920928955078125f, 0.589999973773956298828125f, 0.10999999940395355224609375f); + 
return dot(c, f); +} + +float3 clip_color(inout float3 c) +{ + float3 param = c; + float L = lum(param); + float n = min(c.x, min(c.y, c.z)); + float x = max(c.x, max(c.y, c.z)); + if (n < 0.0f) + { + c = L.xxx + (((c - L.xxx) * L) / (L - n).xxx); + } + if (x > 1.0f) + { + c = L.xxx + (((c - L.xxx) * (1.0f - L)) / (x - L).xxx); + } + return c; +} + +float3 set_lum(float3 c, float l) +{ + float3 param = c; + float3 param_1 = c + (l - lum(param)).xxx; + float3 _1048 = clip_color(param_1); + return _1048; +} + +float3 mix_blend(float3 cb, float3 cs, uint mode) +{ + float3 b = 0.0f.xxx; + switch (mode) + { + case 1u: + { + b = cb * cs; + break; + } + case 2u: + { + float3 param = cb; + float3 param_1 = cs; + b = screen(param, param_1); + break; + } + case 3u: + { + float3 param_2 = cs; + float3 param_3 = cb; + b = hard_light(param_2, param_3); + break; + } + case 4u: + { + b = min(cb, cs); + break; + } + case 5u: + { + b = max(cb, cs); + break; + } + case 6u: + { + float param_4 = cb.x; + float param_5 = cs.x; + float param_6 = cb.y; + float param_7 = cs.y; + float param_8 = cb.z; + float param_9 = cs.z; + b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9)); + break; + } + case 7u: + { + float param_10 = cb.x; + float param_11 = cs.x; + float param_12 = cb.y; + float param_13 = cs.y; + float param_14 = cb.z; + float param_15 = cs.z; + b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15)); + break; + } + case 8u: + { + float3 param_16 = cb; + float3 param_17 = cs; + b = hard_light(param_16, param_17); + break; + } + case 9u: + { + float3 param_18 = cb; + float3 param_19 = cs; + b = soft_light(param_18, param_19); + break; + } + case 10u: + { + b = abs(cb - cs); + break; + } + case 11u: + { + b = (cb + cs) - ((cb * 2.0f) * cs); + break; + } + case 12u: + { + float3 param_20 = cb; + float3 param_21 = cs; + float param_22 = sat(param_20); + float3 _1340 = 
set_sat(param_21, param_22); + float3 param_23 = cb; + float3 param_24 = _1340; + float param_25 = lum(param_23); + b = set_lum(param_24, param_25); + break; + } + case 13u: + { + float3 param_26 = cs; + float3 param_27 = cb; + float param_28 = sat(param_26); + float3 _1354 = set_sat(param_27, param_28); + float3 param_29 = cb; + float3 param_30 = _1354; + float param_31 = lum(param_29); + b = set_lum(param_30, param_31); + break; + } + case 14u: + { + float3 param_32 = cb; + float3 param_33 = cs; + float param_34 = lum(param_32); + b = set_lum(param_33, param_34); + break; + } + case 15u: + { + float3 param_35 = cs; + float3 param_36 = cb; + float param_37 = lum(param_35); + b = set_lum(param_36, param_37); + break; + } + default: + { + b = cs; + break; + } + } + return b; +} + +float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode) +{ + float fa = 0.0f; + float fb = 0.0f; + switch (mode) + { + case 1u: + { + fa = 1.0f; + fb = 0.0f; + break; + } + case 2u: + { + fa = 0.0f; + fb = 1.0f; + break; + } + case 3u: + { + fa = 1.0f; + fb = 1.0f - as; + break; + } + case 4u: + { + fa = 1.0f - ab; + fb = 1.0f; + break; + } + case 5u: + { + fa = ab; + fb = 0.0f; + break; + } + case 6u: + { + fa = 0.0f; + fb = as; + break; + } + case 7u: + { + fa = 1.0f - ab; + fb = 0.0f; + break; + } + case 8u: + { + fa = 0.0f; + fb = 1.0f - as; + break; + } + case 9u: + { + fa = ab; + fb = 1.0f - as; + break; + } + case 10u: + { + fa = 1.0f - ab; + fb = as; + break; + } + case 11u: + { + fa = 1.0f - ab; + fb = 1.0f - as; + break; + } + case 12u: + { + fa = 1.0f; + fb = 1.0f; + break; + } + case 13u: + { + return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab)); + } + default: + { + break; + } + } + float as_fa = as * fa; + float ab_fb = ab * fb; + float3 co = (cs * as_fa) + (cb * ab_fb); + return float4(co, as_fa + ab_fb); +} + +float4 mix_blend_compose(float4 backdrop, float4 src, uint mode) +{ + if ((mode & 32767u) == 3u) + { + return (backdrop * (1.0f - src.w)) + 
src; + } + float inv_src_a = 1.0f / (src.w + 1.0000000036274937255387218471014e-15f); + float3 cs = src.xyz * inv_src_a; + float inv_backdrop_a = 1.0f / (backdrop.w + 1.0000000036274937255387218471014e-15f); + float3 cb = backdrop.xyz * inv_backdrop_a; + uint blend_mode = mode >> uint(8); + float3 param = cb; + float3 param_1 = cs; + uint param_2 = blend_mode; + float3 blended = mix_blend(param, param_1, param_2); + cs = lerp(cs, blended, backdrop.w.xxx); + uint comp_mode = mode & 255u; + if (comp_mode == 3u) + { + float3 co = lerp(backdrop.xyz, cs, src.w.xxx); + return float4(co, src.w + (backdrop.w * (1.0f - src.w))); + } + else + { + float3 param_3 = cb; + float3 param_4 = cs; + float param_5 = backdrop.w; + float param_6 = src.w; + uint param_7 = comp_mode; + return mix_compose(param_3, param_4, param_5, param_6, param_7); + } +} + +CmdJump CmdJump_read(Alloc a, CmdJumpRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + CmdJump s; + s.new_ref = raw0; + return s; +} + +CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) +{ + CmdJumpRef _755 = { ref.offset + 4u }; + Alloc param = a; + CmdJumpRef param_1 = _755; + return CmdJump_read(param, param_1); +} + +void comp_main() +{ + uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x; + Alloc _1696; + _1696.offset = _1681.Load(24); + Alloc param; + param.offset = _1696.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef _1705 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1705; + uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8); + cmd_ref.offset += 4u; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + float4 rgba[8]; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = 0.0f.xxxx; + } + uint clip_depth = 0u; + bool mem_ok = 
_297.Load(4) == 0u; + float df[8]; + TileSegRef tile_seg_ref; + float area[8]; + uint blend_stack[4][8]; + uint base_ix_1; + uint bg_rgba; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4).tag; + if (tag == 0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0f; + } + TileSegRef _1810 = { stroke.tile_ref }; + tile_seg_ref = _1810; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11); + float2 line_vec = seg._vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + 0.5f.xx) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0f, 1.0f); + df[k_1] = min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = clamp((stroke.half_width + 0.5f) - df[k_2], 0.0f, 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + TileSegRef _1930 = { fill.tile_ref }; + tile_seg_ref = _1930; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); 
+ float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1._vector; + float2 window = clamp(float2(start.y, end.y), 0.0f.xx, 1.0f.xx); + if (window.x != window.y) + { + float2 t_1 = (window - start.y.xx) / seg_1._vector.y.xx; + float2 xs = float2(lerp(start.x, end.x, t_1.x), lerp(start.x, end.x, t_1.y)); + float xmin = min(min(xs.x, xs.y), 1.0f) - 9.9999999747524270787835121154785e-07f; + float xmax = max(xs.x, xs.y); + float b = min(xmax, 1.0f); + float c = max(b, 0.0f); + float d = max(xmin, 0.0f); + float a = ((b + (0.5f * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1._vector.x) * clamp((my_xy.y - seg_1.y_edge) + 1.0f, 0.0f, 1.0f)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = min(abs(area[k_5]), 1.0f); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0f; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0f - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + 
(lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); + float4 fg_rgba = gradients[int2(x, int(lin.index))]; + float3 param_29 = fg_rgba.xyz; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; + float4 fg_k_1 = fg_rgba * area[k_9]; + rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31); + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + uint param_32 = k_10; + float2 my_xy_1 = xy + float2(chunk_offset(param_32)); + my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; + float ba = dot(my_xy_1, rad.c1); + float ca = rad.ra * dot(my_xy_1, my_xy_1); + float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff; + int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); + float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))]; + float3 param_33 = fg_rgba_1.xyz; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; + float4 fg_k_2 = fg_rgba_1 * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2; + } + cmd_ref.offset += 48u; + break; + } + case 8u: + { + Alloc param_34 = cmd_alloc; + CmdRef param_35 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_34, param_35); + uint2 param_36 = xy_uint; + CmdImage param_37 = fill_img; + float4 _2417[8]; + fillImage(_2417, param_36, param_37); + float4 img[8] = _2417; + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + float4 fg_k_3 = img[k_11] * area[k_11]; + rgba[k_11] = (rgba[k_11] * (1.0f - fg_k_3.w)) + fg_k_3; + } + cmd_ref.offset += 12u; + break; + } + case 9u: + { + if (clip_depth < 4u) + { + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = 
0.0f.xxxx; + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + _297.Store((base_ix + k_13) * 4 + 8, _2522); + rgba[k_13] = 0.0f.xxxx; + } + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41); + clip_depth--; + if (clip_depth >= 4u) + { + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8); + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); + } + cmd_ref.offset += 8u; + break; + } + case 11u: + { + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref }; + cmd_ref = _2621; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_48 = i_1; + image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x; + } +} + +[numthreads(8, 4, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl new file mode 100644 index 0000000..e174713 --- /dev/null +++ b/piet-gpu/shader/gen/kernel4_gray.msl @@ -0,0 +1,1348 @@ 
+#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct CmdStrokeRef +{ + uint offset; +}; + +struct CmdStroke +{ + uint tile_ref; + float half_width; +}; + +struct CmdFillRef +{ + uint offset; +}; + +struct CmdFill +{ + uint tile_ref; + int backdrop; +}; + +struct CmdColorRef +{ + uint offset; +}; + +struct CmdColor +{ + uint rgba_color; +}; + +struct CmdLinGradRef +{ + uint offset; +}; + +struct CmdLinGrad +{ + uint index; + float line_x; + float line_y; + float line_c; +}; + +struct CmdRadGradRef +{ + uint offset; +}; + +struct CmdRadGrad +{ + uint index; + float4 mat; + float2 xlat; + float2 c1; + float ra; + float roff; +}; + +struct CmdImageRef +{ + uint offset; +}; + +struct CmdImage +{ + uint index; + int2 offset; +}; + +struct CmdAlphaRef +{ + uint offset; +}; + +struct CmdAlpha +{ + float alpha; +}; + +struct CmdEndClipRef +{ + uint offset; +}; + +struct CmdEndClip +{ + uint blend; +}; + +struct CmdJumpRef +{ + uint offset; +}; + +struct CmdJump +{ + uint new_ref; +}; + +struct CmdRef +{ + uint offset; +}; + +struct CmdTag +{ + uint tag; + uint flags; +}; + +struct TileSegRef +{ + uint offset; +}; + 
+struct TileSeg +{ + float2 origin; + float2 vector; + float y_edge; + TileSegRef next; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u); + +static inline __attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_297) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_297.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +CmdTag Cmd_tag(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_297); + return CmdTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +CmdStroke CmdStroke_read(thread const Alloc& 
a, thread const CmdStrokeRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdStroke s; + s.tile_ref = raw0; + s.half_width = as_type(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdStroke Cmd_Stroke_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdStrokeRef param_1 = CmdStrokeRef{ ref.offset + 4u }; + return CmdStroke_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +TileSeg TileSeg_read(thread const Alloc& a, thread const TileSegRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_297); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_297); + TileSeg s; + s.origin = float2(as_type(raw0), as_type(raw1)); + s.vector = float2(as_type(raw2), as_type(raw3)); + s.y_edge = as_type(raw4); + s.next = TileSegRef{ raw5 }; + return s; +} + +static inline __attribute__((always_inline)) +uint2 chunk_offset(thread const uint& i) +{ + return uint2((i % 2u) * 8u, (i / 2u) * 4u); +} + +static inline __attribute__((always_inline)) +CmdFill 
CmdFill_read(thread const Alloc& a, thread const CmdFillRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdFill s; + s.tile_ref = raw0; + s.backdrop = int(raw1); + return s; +} + +static inline __attribute__((always_inline)) +CmdFill Cmd_Fill_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdFillRef param_1 = CmdFillRef{ ref.offset + 4u }; + return CmdFill_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdAlpha CmdAlpha_read(thread const Alloc& a, thread const CmdAlphaRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdAlpha s; + s.alpha = as_type(raw0); + return s; +} + +static inline __attribute__((always_inline)) +CmdAlpha Cmd_Alpha_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdAlphaRef param_1 = CmdAlphaRef{ ref.offset + 4u }; + return CmdAlpha_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdColor CmdColor_read(thread const Alloc& a, thread const CmdColorRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdColor s; + s.rgba_color = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdColor Cmd_Color_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdColorRef param_1 = CmdColorRef{ ref.offset + 4u }; + return CmdColor_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +float3 fromsRGB(thread const float3& srgb) +{ + return srgb; +} + +static inline 
__attribute__((always_inline)) +float4 unpacksRGB(thread const uint& srgba) +{ + float4 color = unpack_unorm4x8_to_float(srgba).wzyx; + float3 param = color.xyz; + return float4(fromsRGB(param), color.w); +} + +static inline __attribute__((always_inline)) +CmdLinGrad CmdLinGrad_read(thread const Alloc& a, thread const CmdLinGradRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + CmdLinGrad s; + s.index = raw0; + s.line_x = as_type(raw1); + s.line_y = as_type(raw2); + s.line_c = as_type(raw3); + return s; +} + +static inline __attribute__((always_inline)) +CmdLinGrad Cmd_LinGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdLinGradRef param_1 = CmdLinGradRef{ ref.offset + 4u }; + return CmdLinGrad_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdRadGrad CmdRadGrad_read(thread const Alloc& a, thread const CmdRadGradRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_297); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_297); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_297); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_297); + Alloc param_12 = a; + uint 
param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_297); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15, v_297); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17, v_297); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19, v_297); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21, v_297); + CmdRadGrad s; + s.index = raw0; + s.mat = float4(as_type(raw1), as_type(raw2), as_type(raw3), as_type(raw4)); + s.xlat = float2(as_type(raw5), as_type(raw6)); + s.c1 = float2(as_type(raw7), as_type(raw8)); + s.ra = as_type(raw9); + s.roff = as_type(raw10); + return s; +} + +static inline __attribute__((always_inline)) +CmdRadGrad Cmd_RadGrad_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdRadGradRef param_1 = CmdRadGradRef{ ref.offset + 4u }; + return CmdRadGrad_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +CmdImage CmdImage_read(thread const Alloc& a, thread const CmdImageRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_297); + CmdImage s; + s.index = raw0; + s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); + return s; +} + +static inline __attribute__((always_inline)) +CmdImage Cmd_Image_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdImageRef param_1 = CmdImageRef{ ref.offset + 4u }; + return CmdImage_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +spvUnsafeArray fillImage(thread const uint2& xy, thread const CmdImage& cmd_img, texture2d image_atlas) +{ + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) 
+ { + uint param = i; + int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; + float4 fg_rgba = image_atlas.read(uint2(uv)); + float3 param_1 = fg_rgba.xyz; + float3 _1653 = fromsRGB(param_1); + fg_rgba.x = _1653.x; + fg_rgba.y = _1653.y; + fg_rgba.z = _1653.z; + rgba[i] = fg_rgba; + } + return rgba; +} + +static inline __attribute__((always_inline)) +float3 tosRGB(thread const float3& rgb) +{ + return rgb; +} + +static inline __attribute__((always_inline)) +uint packsRGB(thread float4& rgba) +{ + float3 param = rgba.xyz; + rgba = float4(tosRGB(param), rgba.w); + return pack_float_to_unorm4x8(rgba.wzyx); +} + +static inline __attribute__((always_inline)) +CmdEndClip CmdEndClip_read(thread const Alloc& a, thread const CmdEndClipRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdEndClip s; + s.blend = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdEndClip Cmd_EndClip_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdEndClipRef param_1 = CmdEndClipRef{ ref.offset + 4u }; + return CmdEndClip_read(param, param_1, v_297); +} + +static inline __attribute__((always_inline)) +float3 screen(thread const float3& cb, thread const float3& cs) +{ + return (cb + cs) - (cb * cs); +} + +static inline __attribute__((always_inline)) +float3 hard_light(thread const float3& cb, thread const float3& cs) +{ + float3 param = cb; + float3 param_1 = (cs * 2.0) - float3(1.0); + return select(screen(param, param_1), (cb * 2.0) * cs, cs <= float3(0.5)); +} + +static inline __attribute__((always_inline)) +float color_dodge(thread const float& cb, thread const float& cs) +{ + if (cb == 0.0) + { + return 0.0; + } + else + { + if (cs == 1.0) + { + return 1.0; + } + else + { + return fast::min(1.0, cb / (1.0 - cs)); + } + } +} + +static inline __attribute__((always_inline)) +float color_burn(thread 
const float& cb, thread const float& cs) +{ + if (cb == 1.0) + { + return 1.0; + } + else + { + if (cs == 0.0) + { + return 0.0; + } + else + { + return 1.0 - fast::min(1.0, (1.0 - cb) / cs); + } + } +} + +static inline __attribute__((always_inline)) +float3 soft_light(thread const float3& cb, thread const float3& cs) +{ + float3 d = select(sqrt(cb), ((((cb * 16.0) - float3(12.0)) * cb) + float3(4.0)) * cb, cb <= float3(0.25)); + return select(cb + (((cs * 2.0) - float3(1.0)) * (d - cb)), cb - (((float3(1.0) - (cs * 2.0)) * cb) * (float3(1.0) - cb)), cs <= float3(0.5)); +} + +static inline __attribute__((always_inline)) +float sat(thread const float3& c) +{ + return fast::max(c.x, fast::max(c.y, c.z)) - fast::min(c.x, fast::min(c.y, c.z)); +} + +static inline __attribute__((always_inline)) +void set_sat_inner(thread float& cmin, thread float& cmid, thread float& cmax, thread const float& s) +{ + if (cmax > cmin) + { + cmid = ((cmid - cmin) * s) / (cmax - cmin); + cmax = s; + } + else + { + cmid = 0.0; + cmax = 0.0; + } + cmin = 0.0; +} + +static inline __attribute__((always_inline)) +float3 set_sat(thread float3& c, thread const float& s) +{ + if (c.x <= c.y) + { + if (c.y <= c.z) + { + float param = c.x; + float param_1 = c.y; + float param_2 = c.z; + float param_3 = s; + set_sat_inner(param, param_1, param_2, param_3); + c.x = param; + c.y = param_1; + c.z = param_2; + } + else + { + if (c.x <= c.z) + { + float param_4 = c.x; + float param_5 = c.z; + float param_6 = c.y; + float param_7 = s; + set_sat_inner(param_4, param_5, param_6, param_7); + c.x = param_4; + c.z = param_5; + c.y = param_6; + } + else + { + float param_8 = c.z; + float param_9 = c.x; + float param_10 = c.y; + float param_11 = s; + set_sat_inner(param_8, param_9, param_10, param_11); + c.z = param_8; + c.x = param_9; + c.y = param_10; + } + } + } + else + { + if (c.x <= c.z) + { + float param_12 = c.y; + float param_13 = c.x; + float param_14 = c.z; + float param_15 = s; + 
set_sat_inner(param_12, param_13, param_14, param_15); + c.y = param_12; + c.x = param_13; + c.z = param_14; + } + else + { + if (c.y <= c.z) + { + float param_16 = c.y; + float param_17 = c.z; + float param_18 = c.x; + float param_19 = s; + set_sat_inner(param_16, param_17, param_18, param_19); + c.y = param_16; + c.z = param_17; + c.x = param_18; + } + else + { + float param_20 = c.z; + float param_21 = c.y; + float param_22 = c.x; + float param_23 = s; + set_sat_inner(param_20, param_21, param_22, param_23); + c.z = param_20; + c.y = param_21; + c.x = param_22; + } + } + } + return c; +} + +static inline __attribute__((always_inline)) +float lum(thread const float3& c) +{ + float3 f = float3(0.300000011920928955078125, 0.589999973773956298828125, 0.10999999940395355224609375); + return dot(c, f); +} + +static inline __attribute__((always_inline)) +float3 clip_color(thread float3& c) +{ + float3 param = c; + float L = lum(param); + float n = fast::min(c.x, fast::min(c.y, c.z)); + float x = fast::max(c.x, fast::max(c.y, c.z)); + if (n < 0.0) + { + c = float3(L) + (((c - float3(L)) * L) / float3(L - n)); + } + if (x > 1.0) + { + c = float3(L) + (((c - float3(L)) * (1.0 - L)) / float3(x - L)); + } + return c; +} + +static inline __attribute__((always_inline)) +float3 set_lum(thread const float3& c, thread const float& l) +{ + float3 param = c; + float3 param_1 = c + float3(l - lum(param)); + float3 _1048 = clip_color(param_1); + return _1048; +} + +static inline __attribute__((always_inline)) +float3 mix_blend(thread const float3& cb, thread const float3& cs, thread const uint& mode) +{ + float3 b = float3(0.0); + switch (mode) + { + case 1u: + { + b = cb * cs; + break; + } + case 2u: + { + float3 param = cb; + float3 param_1 = cs; + b = screen(param, param_1); + break; + } + case 3u: + { + float3 param_2 = cs; + float3 param_3 = cb; + b = hard_light(param_2, param_3); + break; + } + case 4u: + { + b = fast::min(cb, cs); + break; + } + case 5u: + { + b = 
fast::max(cb, cs); + break; + } + case 6u: + { + float param_4 = cb.x; + float param_5 = cs.x; + float param_6 = cb.y; + float param_7 = cs.y; + float param_8 = cb.z; + float param_9 = cs.z; + b = float3(color_dodge(param_4, param_5), color_dodge(param_6, param_7), color_dodge(param_8, param_9)); + break; + } + case 7u: + { + float param_10 = cb.x; + float param_11 = cs.x; + float param_12 = cb.y; + float param_13 = cs.y; + float param_14 = cb.z; + float param_15 = cs.z; + b = float3(color_burn(param_10, param_11), color_burn(param_12, param_13), color_burn(param_14, param_15)); + break; + } + case 8u: + { + float3 param_16 = cb; + float3 param_17 = cs; + b = hard_light(param_16, param_17); + break; + } + case 9u: + { + float3 param_18 = cb; + float3 param_19 = cs; + b = soft_light(param_18, param_19); + break; + } + case 10u: + { + b = abs(cb - cs); + break; + } + case 11u: + { + b = (cb + cs) - ((cb * 2.0) * cs); + break; + } + case 12u: + { + float3 param_20 = cb; + float3 param_21 = cs; + float param_22 = sat(param_20); + float3 _1340 = set_sat(param_21, param_22); + float3 param_23 = cb; + float3 param_24 = _1340; + float param_25 = lum(param_23); + b = set_lum(param_24, param_25); + break; + } + case 13u: + { + float3 param_26 = cs; + float3 param_27 = cb; + float param_28 = sat(param_26); + float3 _1354 = set_sat(param_27, param_28); + float3 param_29 = cb; + float3 param_30 = _1354; + float param_31 = lum(param_29); + b = set_lum(param_30, param_31); + break; + } + case 14u: + { + float3 param_32 = cb; + float3 param_33 = cs; + float param_34 = lum(param_32); + b = set_lum(param_33, param_34); + break; + } + case 15u: + { + float3 param_35 = cs; + float3 param_36 = cb; + float param_37 = lum(param_35); + b = set_lum(param_36, param_37); + break; + } + default: + { + b = cs; + break; + } + } + return b; +} + +static inline __attribute__((always_inline)) +float4 mix_compose(thread const float3& cb, thread const float3& cs, thread const float& ab, thread const 
float& as, thread const uint& mode) +{ + float fa = 0.0; + float fb = 0.0; + switch (mode) + { + case 1u: + { + fa = 1.0; + fb = 0.0; + break; + } + case 2u: + { + fa = 0.0; + fb = 1.0; + break; + } + case 3u: + { + fa = 1.0; + fb = 1.0 - as; + break; + } + case 4u: + { + fa = 1.0 - ab; + fb = 1.0; + break; + } + case 5u: + { + fa = ab; + fb = 0.0; + break; + } + case 6u: + { + fa = 0.0; + fb = as; + break; + } + case 7u: + { + fa = 1.0 - ab; + fb = 0.0; + break; + } + case 8u: + { + fa = 0.0; + fb = 1.0 - as; + break; + } + case 9u: + { + fa = ab; + fb = 1.0 - as; + break; + } + case 10u: + { + fa = 1.0 - ab; + fb = as; + break; + } + case 11u: + { + fa = 1.0 - ab; + fb = 1.0 - as; + break; + } + case 12u: + { + fa = 1.0; + fb = 1.0; + break; + } + case 13u: + { + return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab)); + } + default: + { + break; + } + } + float as_fa = as * fa; + float ab_fb = ab * fb; + float3 co = (cs * as_fa) + (cb * ab_fb); + return float4(co, as_fa + ab_fb); +} + +static inline __attribute__((always_inline)) +float4 mix_blend_compose(thread const float4& backdrop, thread const float4& src, thread const uint& mode) +{ + if ((mode & 32767u) == 3u) + { + return (backdrop * (1.0 - src.w)) + src; + } + float inv_src_a = 1.0 / (src.w + 1.0000000036274937255387218471014e-15); + float3 cs = src.xyz * inv_src_a; + float inv_backdrop_a = 1.0 / (backdrop.w + 1.0000000036274937255387218471014e-15); + float3 cb = backdrop.xyz * inv_backdrop_a; + uint blend_mode = mode >> uint(8); + float3 param = cb; + float3 param_1 = cs; + uint param_2 = blend_mode; + float3 blended = mix_blend(param, param_1, param_2); + cs = mix(cs, blended, float3(backdrop.w)); + uint comp_mode = mode & 255u; + if (comp_mode == 3u) + { + float3 co = mix(backdrop.xyz, cs, float3(src.w)); + return float4(co, src.w + (backdrop.w * (1.0 - src.w))); + } + else + { + float3 param_3 = cb; + float3 param_4 = cs; + float param_5 = backdrop.w; + float param_6 = src.w; + uint 
param_7 = comp_mode; + return mix_compose(param_3, param_4, param_5, param_6, param_7); + } +} + +static inline __attribute__((always_inline)) +CmdJump CmdJump_read(thread const Alloc& a, thread const CmdJumpRef& ref, device Memory& v_297) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_297); + CmdJump s; + s.new_ref = raw0; + return s; +} + +static inline __attribute__((always_inline)) +CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_297) +{ + Alloc param = a; + CmdJumpRef param_1 = CmdJumpRef{ ref.offset + 4u }; + return CmdJump_read(param, param_1, v_297); +} + +kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x; + Alloc param; + param.offset = _1681.conf.ptcl_alloc.offset; + uint param_1 = tile_ix * 1024u; + uint param_2 = 1024u; + Alloc cmd_alloc = slice_mem(param, param_1, param_2); + CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)]; + cmd_ref.offset += 4u; + uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); + float2 xy = float2(xy_uint); + spvUnsafeArray rgba; + for (uint i = 0u; i < 8u; i++) + { + rgba[i] = float4(0.0); + } + uint clip_depth = 0u; + bool mem_ok = v_297.mem_error == 0u; + spvUnsafeArray df; + TileSegRef tile_seg_ref; + spvUnsafeArray area; + spvUnsafeArray, 4> blend_stack; + uint base_ix_1; + uint bg_rgba; + while (mem_ok) + { + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + uint tag = Cmd_tag(param_3, param_4, v_297).tag; + if (tag == 
0u) + { + break; + } + switch (tag) + { + case 2u: + { + Alloc param_5 = cmd_alloc; + CmdRef param_6 = cmd_ref; + CmdStroke stroke = Cmd_Stroke_read(param_5, param_6, v_297); + for (uint k = 0u; k < 8u; k++) + { + df[k] = 1000000000.0; + } + tile_seg_ref = TileSegRef{ stroke.tile_ref }; + do + { + uint param_7 = tile_seg_ref.offset; + uint param_8 = 24u; + bool param_9 = mem_ok; + Alloc param_10 = new_alloc(param_7, param_8, param_9); + TileSegRef param_11 = tile_seg_ref; + TileSeg seg = TileSeg_read(param_10, param_11, v_297); + float2 line_vec = seg.vector; + for (uint k_1 = 0u; k_1 < 8u; k_1++) + { + float2 dpos = (xy + float2(0.5)) - seg.origin; + uint param_12 = k_1; + dpos += float2(chunk_offset(param_12)); + float t = fast::clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0); + df[k_1] = fast::min(df[k_1], length((line_vec * t) - dpos)); + } + tile_seg_ref = seg.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_2 = 0u; k_2 < 8u; k_2++) + { + area[k_2] = fast::clamp((stroke.half_width + 0.5) - df[k_2], 0.0, 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 1u: + { + Alloc param_13 = cmd_alloc; + CmdRef param_14 = cmd_ref; + CmdFill fill = Cmd_Fill_read(param_13, param_14, v_297); + for (uint k_3 = 0u; k_3 < 8u; k_3++) + { + area[k_3] = float(fill.backdrop); + } + tile_seg_ref = TileSegRef{ fill.tile_ref }; + do + { + uint param_15 = tile_seg_ref.offset; + uint param_16 = 24u; + bool param_17 = mem_ok; + Alloc param_18 = new_alloc(param_15, param_16, param_17); + TileSegRef param_19 = tile_seg_ref; + TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297); + for (uint k_4 = 0u; k_4 < 8u; k_4++) + { + uint param_20 = k_4; + float2 my_xy = xy + float2(chunk_offset(param_20)); + float2 start = seg_1.origin - my_xy; + float2 end = start + seg_1.vector; + float2 window = fast::clamp(float2(start.y, end.y), float2(0.0), float2(1.0)); + if ((isunordered(window.x, window.y) || window.x != window.y)) + { + float2 t_1 = (window - 
float2(start.y)) / float2(seg_1.vector.y); + float2 xs = float2(mix(start.x, end.x, t_1.x), mix(start.x, end.x, t_1.y)); + float xmin = fast::min(fast::min(xs.x, xs.y), 1.0) - 9.9999999747524270787835121154785e-07; + float xmax = fast::max(xs.x, xs.y); + float b = fast::min(xmax, 1.0); + float c = fast::max(b, 0.0); + float d = fast::max(xmin, 0.0); + float a = ((b + (0.5 * ((d * d) - (c * c)))) - xmin) / (xmax - xmin); + area[k_4] += (a * (window.x - window.y)); + } + area[k_4] += (sign(seg_1.vector.x) * fast::clamp((my_xy.y - seg_1.y_edge) + 1.0, 0.0, 1.0)); + } + tile_seg_ref = seg_1.next; + } while (tile_seg_ref.offset != 0u); + for (uint k_5 = 0u; k_5 < 8u; k_5++) + { + area[k_5] = fast::min(abs(area[k_5]), 1.0); + } + cmd_ref.offset += 12u; + break; + } + case 3u: + { + for (uint k_6 = 0u; k_6 < 8u; k_6++) + { + area[k_6] = 1.0; + } + cmd_ref.offset += 4u; + break; + } + case 4u: + { + Alloc param_21 = cmd_alloc; + CmdRef param_22 = cmd_ref; + CmdAlpha alpha = Cmd_Alpha_read(param_21, param_22, v_297); + for (uint k_7 = 0u; k_7 < 8u; k_7++) + { + area[k_7] = alpha.alpha; + } + cmd_ref.offset += 8u; + break; + } + case 5u: + { + Alloc param_23 = cmd_alloc; + CmdRef param_24 = cmd_ref; + CmdColor color = Cmd_Color_read(param_23, param_24, v_297); + uint param_25 = color.rgba_color; + float4 fg = unpacksRGB(param_25); + for (uint k_8 = 0u; k_8 < 8u; k_8++) + { + float4 fg_k = fg * area[k_8]; + rgba[k_8] = (rgba[k_8] * (1.0 - fg_k.w)) + fg_k; + } + cmd_ref.offset += 8u; + break; + } + case 6u: + { + Alloc param_26 = cmd_alloc; + CmdRef param_27 = cmd_ref; + CmdLinGrad lin = Cmd_LinGrad_read(param_26, param_27, v_297); + float d_1 = ((lin.line_x * xy.x) + (lin.line_y * xy.y)) + lin.line_c; + for (uint k_9 = 0u; k_9 < 8u; k_9++) + { + uint param_28 = k_9; + float2 chunk_xy = float2(chunk_offset(param_28)); + float my_d = (d_1 + (lin.line_x * chunk_xy.x)) + (lin.line_y * chunk_xy.y); + int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); + float4 fg_rgba = 
gradients.read(uint2(int2(x, int(lin.index)))); + float3 param_29 = fg_rgba.xyz; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; + float4 fg_k_1 = fg_rgba * area[k_9]; + rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1; + } + cmd_ref.offset += 20u; + break; + } + case 7u: + { + Alloc param_30 = cmd_alloc; + CmdRef param_31 = cmd_ref; + CmdRadGrad rad = Cmd_RadGrad_read(param_30, param_31, v_297); + for (uint k_10 = 0u; k_10 < 8u; k_10++) + { + uint param_32 = k_10; + float2 my_xy_1 = xy + float2(chunk_offset(param_32)); + my_xy_1 = ((rad.mat.xz * my_xy_1.x) + (rad.mat.yw * my_xy_1.y)) - rad.xlat; + float ba = dot(my_xy_1, rad.c1); + float ca = rad.ra * dot(my_xy_1, my_xy_1); + float t_2 = (sqrt((ba * ba) + ca) - ba) - rad.roff; + int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); + float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); + float3 param_33 = fg_rgba_1.xyz; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; + float4 fg_k_2 = fg_rgba_1 * area[k_10]; + rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2; + } + cmd_ref.offset += 48u; + break; + } + case 8u: + { + Alloc param_34 = cmd_alloc; + CmdRef param_35 = cmd_ref; + CmdImage fill_img = Cmd_Image_read(param_34, param_35, v_297); + uint2 param_36 = xy_uint; + CmdImage param_37 = fill_img; + spvUnsafeArray img; + img = fillImage(param_36, param_37, image_atlas); + for (uint k_11 = 0u; k_11 < 8u; k_11++) + { + float4 fg_k_3 = img[k_11] * area[k_11]; + rgba[k_11] = (rgba[k_11] * (1.0 - fg_k_3.w)) + fg_k_3; + } + cmd_ref.offset += 12u; + break; + } + case 9u: + { + if (clip_depth < 4u) + { + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = float4(0.0); + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 
16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + v_297.memory[base_ix + k_13] = _2522; + rgba[k_13] = float4(0.0); + } + } + clip_depth++; + cmd_ref.offset += 4u; + break; + } + case 10u: + { + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297); + clip_depth--; + if (clip_depth >= 4u) + { + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = v_297.memory[base_ix_1 + k_14]; + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); + } + cmd_ref.offset += 8u; + break; + } + case 11u: + { + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref }; + cmd_alloc.offset = cmd_ref.offset; + break; + } + } + } + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint param_48 = i_1; + image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48)))); + } +} + diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv new file mode 100644 index 0000000..17c7531 Binary files /dev/null and b/piet-gpu/shader/gen/kernel4_gray.spv differ diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil new file mode 100644 index 0000000..9fd593c Binary files /dev/null and b/piet-gpu/shader/gen/path_coarse.dxil differ diff --git a/piet-gpu/shader/gen/path_coarse.hlsl 
b/piet-gpu/shader/gen/path_coarse.hlsl new file mode 100644 index 0000000..93ee8f0 --- /dev/null +++ b/piet-gpu/shader/gen/path_coarse.hlsl @@ -0,0 +1,673 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct PathSegTag +{ + uint tag; + uint flags; +}; + +struct TileRef +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 _vector; + float y_edge; + TileSegRef next; +}; + +struct SubdivResult +{ + float val; + float a0; + float a2; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u); + +static const PathSegTag _721 = { 0u, 0u }; + +RWByteAddressBuffer _136 : register(u0, space0); +ByteAddressBuffer _710 : register(t1, space0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) 
+ { + return 0u; + } + uint v = _136.Load(offset * 4 + 8); + return v; +} + +PathSegTag PathSeg_tag(Alloc a, PathSegRef ref) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1); + PathSegTag _367 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _367; +} + +PathCubic PathCubic_read(Alloc a, PathCubicRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21); + Alloc param_22 = a; + uint param_23 = ix + 11u; + uint raw11 = read_mem(param_22, param_23); + PathCubic s; + s.p0 = float2(asfloat(raw0), asfloat(raw1)); + s.p1 = float2(asfloat(raw2), asfloat(raw3)); + s.p2 = float2(asfloat(raw4), asfloat(raw5)); + s.p3 = float2(asfloat(raw6), asfloat(raw7)); + s.path_ix = raw8; + s.trans_ix = raw9; + s.stroke = float2(asfloat(raw10), asfloat(raw11)); + return s; +} + +PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref) +{ + PathCubicRef _373 = { ref.offset + 4u }; + Alloc param = a; + PathCubicRef param_1 = _373; + return 
PathCubic_read(param, param_1); +} + +float2 eval_cubic(float2 p0, float2 p1, float2 p2, float2 p3, float t) +{ + float mt = 1.0f - t; + return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0f)) + (((p2 * (mt * 3.0f)) + (p3 * t)) * t)) * t); +} + +float approx_parabola_integral(float x) +{ + return x * rsqrt(sqrt(0.3300000131130218505859375f + (0.201511204242706298828125f + ((0.25f * x) * x)))); +} + +SubdivResult estimate_subdiv(float2 p0, float2 p1, float2 p2, float sqrt_tol) +{ + float2 d01 = p1 - p0; + float2 d12 = p2 - p1; + float2 dd = d01 - d12; + float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x); + float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross; + float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross; + float scale = abs(_cross / (length(dd) * (x2 - x0))); + float param = x0; + float a0 = approx_parabola_integral(param); + float param_1 = x2; + float a2 = approx_parabola_integral(param_1); + float val = 0.0f; + if (scale < 1000000000.0f) + { + float da = abs(a2 - a0); + float sqrt_scale = sqrt(scale); + if (sign(x0) == sign(x2)) + { + val = da * sqrt_scale; + } + else + { + float xmin = sqrt_tol / sqrt_scale; + float param_2 = xmin; + val = (sqrt_tol * da) / approx_parabola_integral(param_2); + } + } + SubdivResult _695 = { val, a0, a2 }; + return _695; +} + +uint fill_mode_from_flags(uint flags) +{ + return flags & 1u; +} + +Path Path_read(Alloc a, PathRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + TileRef _427 = { raw2 }; + s.tiles = _427; + return s; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +float 
approx_parabola_inv_integral(float x) +{ + return x * sqrt(0.61000001430511474609375f + (0.1520999968051910400390625f + ((0.25f * x) * x))); +} + +float2 eval_quad(float2 p0, float2 p1, float2 p2, float t) +{ + float mt = 1.0f - t; + return (p0 * (mt * mt)) + (((p1 * (mt * 2.0f)) + (p2 * t)) * t); +} + +MallocResult malloc(uint size) +{ + uint _142; + _136.InterlockedAdd(0, size, _142); + uint offset = _142; + uint _149; + _136.GetDimensions(_149); + _149 = (_149 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_149) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _171; + _136.InterlockedMax(4, 1u, _171); + return r; + } + return r; +} + +TileRef Tile_index(TileRef ref, uint index) +{ + TileRef _385 = { ref.offset + (index * 8u) }; + return _385; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _136.Store(offset * 4 + 8, val); +} + +void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.origin.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.origin.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s._vector.x); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s._vector.y); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.y_edge); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = s.next.offset; + write_mem(param_15, param_16, param_17); +} + +void comp_main() +{ + uint element_ix = gl_GlobalInvocationID.x; 
+ PathSegRef _718 = { _710.Load(28) + (element_ix * 52u) }; + PathSegRef ref = _718; + PathSegTag tag = _721; + if (element_ix < _710.Load(4)) + { + Alloc _731; + _731.offset = _710.Load(28); + Alloc param; + param.offset = _731.offset; + PathSegRef param_1 = ref; + tag = PathSeg_tag(param, param_1); + } + bool mem_ok = _136.Load(4) == 0u; + switch (tag.tag) + { + case 1u: + { + Alloc _748; + _748.offset = _710.Load(28); + Alloc param_2; + param_2.offset = _748.offset; + PathSegRef param_3 = ref; + PathCubic cubic = PathSeg_Cubic_read(param_2, param_3); + float2 err_v = (((cubic.p2 - cubic.p1) * 3.0f) + cubic.p0) - cubic.p3; + float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); + uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875f, 0.16666667163372039794921875f))), 1u); + n_quads = min(n_quads, 16u); + float val = 0.0f; + float2 qp0 = cubic.p0; + float _step = 1.0f / float(n_quads); + SubdivResult keep_params[16]; + for (uint i = 0u; i < n_quads; i++) + { + float t = float(i + 1u) * _step; + float2 param_4 = cubic.p0; + float2 param_5 = cubic.p1; + float2 param_6 = cubic.p2; + float2 param_7 = cubic.p3; + float param_8 = t; + float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8); + float2 param_9 = cubic.p0; + float2 param_10 = cubic.p1; + float2 param_11 = cubic.p2; + float2 param_12 = cubic.p3; + float param_13 = t - (0.5f * _step); + float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13); + qp1 = (qp1 * 2.0f) - ((qp0 + qp2) * 0.5f); + float2 param_14 = qp0; + float2 param_15 = qp1; + float2 param_16 = qp2; + float param_17 = 0.4743416607379913330078125f; + SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17); + keep_params[i] = params; + val += params.val; + qp0 = qp2; + } + uint n = max(uint(ceil((val * 0.5f) / 0.4743416607379913330078125f)), 1u); + uint param_18 = tag.flags; + bool is_stroke = fill_mode_from_flags(param_18) == 1u; + uint path_ix = cubic.path_ix; + PathRef _904 = { 
_710.Load(16) + (path_ix * 12u) }; + Alloc _907; + _907.offset = _710.Load(16); + Alloc param_19; + param_19.offset = _907.offset; + PathRef param_20 = _904; + Path path = Path_read(param_19, param_20); + uint param_21 = path.tiles.offset; + uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_23 = mem_ok; + Alloc path_alloc = new_alloc(param_21, param_22, param_23); + int4 bbox = int4(path.bbox); + float2 p0 = cubic.p0; + qp0 = cubic.p0; + float v_step = val / float(n); + int n_out = 1; + float val_sum = 0.0f; + float2 p1; + float _1147; + TileSeg tile_seg; + for (uint i_1 = 0u; i_1 < n_quads; i_1++) + { + float t_1 = float(i_1 + 1u) * _step; + float2 param_24 = cubic.p0; + float2 param_25 = cubic.p1; + float2 param_26 = cubic.p2; + float2 param_27 = cubic.p3; + float param_28 = t_1; + float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28); + float2 param_29 = cubic.p0; + float2 param_30 = cubic.p1; + float2 param_31 = cubic.p2; + float2 param_32 = cubic.p3; + float param_33 = t_1 - (0.5f * _step); + float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33); + qp1_1 = (qp1_1 * 2.0f) - ((qp0 + qp2_1) * 0.5f); + SubdivResult params_1 = keep_params[i_1]; + float param_34 = params_1.a0; + float u0 = approx_parabola_inv_integral(param_34); + float param_35 = params_1.a2; + float u2 = approx_parabola_inv_integral(param_35); + float uscale = 1.0f / (u2 - u0); + float target = float(n_out) * v_step; + for (;;) + { + bool _1040 = uint(n_out) == n; + bool _1050; + if (!_1040) + { + _1050 = target < (val_sum + params_1.val); + } + else + { + _1050 = _1040; + } + if (_1050) + { + if (uint(n_out) == n) + { + p1 = cubic.p3; + } + else + { + float u = (target - val_sum) / params_1.val; + float a = lerp(params_1.a0, params_1.a2, u); + float param_36 = a; + float au = approx_parabola_inv_integral(param_36); + float t_2 = (au - u0) * uscale; + float2 param_37 = qp0; + float2 param_38 = qp1_1; + float2 
param_39 = qp2_1; + float param_40 = t_2; + p1 = eval_quad(param_37, param_38, param_39, param_40); + } + float xmin = min(p0.x, p1.x) - cubic.stroke.x; + float xmax = max(p0.x, p1.x) + cubic.stroke.x; + float ymin = min(p0.y, p1.y) - cubic.stroke.y; + float ymax = max(p0.y, p1.y) + cubic.stroke.y; + float dx = p1.x - p0.x; + float dy = p1.y - p0.y; + if (abs(dy) < 9.999999717180685365747194737196e-10f) + { + _1147 = 1000000000.0f; + } + else + { + _1147 = dx / dy; + } + float invslope = _1147; + float c = (cubic.stroke.x + (abs(invslope) * (8.0f + cubic.stroke.y))) * 0.0625f; + float b = invslope; + float a_1 = (p0.x - ((p0.y - 8.0f) * b)) * 0.0625f; + int x0 = int(floor(xmin * 0.0625f)); + int x1 = int(floor(xmax * 0.0625f) + 1.0f); + int y0 = int(floor(ymin * 0.0625f)); + int y1 = int(floor(ymax * 0.0625f) + 1.0f); + x0 = clamp(x0, bbox.x, bbox.z); + y0 = clamp(y0, bbox.y, bbox.w); + x1 = clamp(x1, bbox.x, bbox.z); + y1 = clamp(y1, bbox.y, bbox.w); + float xc = a_1 + (b * float(y0)); + int stride = bbox.z - bbox.x; + int base = ((y0 - bbox.y) * stride) - bbox.x; + uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); + uint param_41 = n_tile_alloc * 24u; + MallocResult _1263 = malloc(param_41); + MallocResult tile_alloc = _1263; + if (tile_alloc.failed || (!mem_ok)) + { + return; + } + uint tile_offset = tile_alloc.alloc.offset; + int xray = int(floor(p0.x * 0.0625f)); + int last_xray = int(floor(p1.x * 0.0625f)); + if (p0.y > p1.y) + { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } + for (int y = y0; y < y1; y++) + { + float tile_y0 = float(y * 16); + int xbackdrop = max((xray + 1), bbox.x); + bool _1319 = !is_stroke; + bool _1329; + if (_1319) + { + _1329 = min(p0.y, p1.y) < tile_y0; + } + else + { + _1329 = _1319; + } + bool _1336; + if (_1329) + { + _1336 = xbackdrop < bbox.z; + } + else + { + _1336 = _1329; + } + if (_1336) + { + int backdrop = (p1.y < p0.y) ? 
1 : (-1); + TileRef param_42 = path.tiles; + uint param_43 = uint(base + xbackdrop); + TileRef tile_ref = Tile_index(param_42, param_43); + uint tile_el = tile_ref.offset >> uint(2); + Alloc param_44 = path_alloc; + uint param_45 = tile_el + 1u; + if (touch_mem(param_44, param_45)) + { + uint _1374; + _136.InterlockedAdd((tile_el + 1u) * 4 + 8, uint(backdrop), _1374); + } + } + int next_xray = last_xray; + if (y < (y1 - 1)) + { + float tile_y1 = float((y + 1) * 16); + float x_edge = lerp(p0.x, p1.x, (tile_y1 - p0.y) / dy); + next_xray = int(floor(x_edge * 0.0625f)); + } + int min_xray = min(xray, next_xray); + int max_xray = max(xray, next_xray); + int xx0 = min(int(floor(xc - c)), min_xray); + int xx1 = max(int(ceil(xc + c)), (max_xray + 1)); + xx0 = clamp(xx0, x0, x1); + xx1 = clamp(xx1, x0, x1); + for (int x = xx0; x < xx1; x++) + { + float tile_x0 = float(x * 16); + TileRef _1454 = { path.tiles.offset }; + TileRef param_46 = _1454; + uint param_47 = uint(base + x); + TileRef tile_ref_1 = Tile_index(param_46, param_47); + uint tile_el_1 = tile_ref_1.offset >> uint(2); + uint old = 0u; + Alloc param_48 = path_alloc; + uint param_49 = tile_el_1; + if (touch_mem(param_48, param_49)) + { + uint _1477; + _136.InterlockedExchange(tile_el_1 * 4 + 8, tile_offset, _1477); + old = _1477; + } + tile_seg.origin = p0; + tile_seg._vector = p1 - p0; + float y_edge = 0.0f; + if (!is_stroke) + { + y_edge = lerp(p0.y, p1.y, (tile_x0 - p0.x) / dx); + if (min(p0.x, p1.x) < tile_x0) + { + float2 p = float2(tile_x0, y_edge); + if (p0.x > p1.x) + { + tile_seg._vector = p - p0; + } + else + { + tile_seg.origin = p; + tile_seg._vector = p1 - p; + } + if (tile_seg._vector.x == 0.0f) + { + tile_seg._vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10f; + } + } + if ((x <= min_xray) || (max_xray < x)) + { + y_edge = 1000000000.0f; + } + } + tile_seg.y_edge = y_edge; + tile_seg.next.offset = old; + TileSegRef _1559 = { tile_offset }; + Alloc param_50 = tile_alloc.alloc; + 
TileSegRef param_51 = _1559; + TileSeg param_52 = tile_seg; + TileSeg_write(param_50, param_51, param_52); + tile_offset += 24u; + } + xc += b; + base += stride; + xray = next_xray; + } + n_out++; + target += v_step; + p0 = p1; + continue; + } + else + { + break; + } + } + val_sum += params_1.val; + qp0 = qp2_1; + } + break; + } + } +} + +[numthreads(32, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl new file mode 100644 index 0000000..26aa33a --- /dev/null +++ b/piet-gpu/shader/gen/path_coarse.msl @@ -0,0 +1,717 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct PathSegTag +{ + uint tag; + uint 
flags; +}; + +struct TileRef +{ + uint offset; +}; + +struct PathRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct TileSegRef +{ + uint offset; +}; + +struct TileSeg +{ + float2 origin; + float2 vector; + float y_edge; + TileSegRef next; +}; + +struct SubdivResult +{ + float val; + float a0; + float a2; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(32u, 1u, 1u); + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_136.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint tag_and_flags = read_mem(param, param_1, v_136, v_136BufferSize); 
+ return PathSegTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; +} + +static inline __attribute__((always_inline)) +PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_136, v_136BufferSize); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_136, v_136BufferSize); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_136, v_136BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 6u; + uint raw6 = read_mem(param_12, param_13, v_136, v_136BufferSize); + Alloc param_14 = a; + uint param_15 = ix + 7u; + uint raw7 = read_mem(param_14, param_15, v_136, v_136BufferSize); + Alloc param_16 = a; + uint param_17 = ix + 8u; + uint raw8 = read_mem(param_16, param_17, v_136, v_136BufferSize); + Alloc param_18 = a; + uint param_19 = ix + 9u; + uint raw9 = read_mem(param_18, param_19, v_136, v_136BufferSize); + Alloc param_20 = a; + uint param_21 = ix + 10u; + uint raw10 = read_mem(param_20, param_21, v_136, v_136BufferSize); + Alloc param_22 = a; + uint param_23 = ix + 11u; + uint raw11 = read_mem(param_22, param_23, v_136, v_136BufferSize); + PathCubic s; + s.p0 = float2(as_type(raw0), as_type(raw1)); + s.p1 = float2(as_type(raw2), as_type(raw3)); + s.p2 = float2(as_type(raw4), as_type(raw5)); + s.p3 = float2(as_type(raw6), as_type(raw7)); + s.path_ix = raw8; + s.trans_ix = raw9; + s.stroke = float2(as_type(raw10), as_type(raw11)); + return s; +} + 
+static inline __attribute__((always_inline)) +PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = a; + PathCubicRef param_1 = PathCubicRef{ ref.offset + 4u }; + return PathCubic_read(param, param_1, v_136, v_136BufferSize); +} + +static inline __attribute__((always_inline)) +float2 eval_cubic(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float2& p3, thread const float& t) +{ + float mt = 1.0 - t; + return (p0 * ((mt * mt) * mt)) + (((p1 * ((mt * mt) * 3.0)) + (((p2 * (mt * 3.0)) + (p3 * t)) * t)) * t); +} + +static inline __attribute__((always_inline)) +float approx_parabola_integral(thread const float& x) +{ + return x * rsqrt(sqrt(0.3300000131130218505859375 + (0.201511204242706298828125 + ((0.25 * x) * x)))); +} + +static inline __attribute__((always_inline)) +SubdivResult estimate_subdiv(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& sqrt_tol) +{ + float2 d01 = p1 - p0; + float2 d12 = p2 - p1; + float2 dd = d01 - d12; + float _cross = ((p2.x - p0.x) * dd.y) - ((p2.y - p0.y) * dd.x); + float x0 = ((d01.x * dd.x) + (d01.y * dd.y)) / _cross; + float x2 = ((d12.x * dd.x) + (d12.y * dd.y)) / _cross; + float scale = abs(_cross / (length(dd) * (x2 - x0))); + float param = x0; + float a0 = approx_parabola_integral(param); + float param_1 = x2; + float a2 = approx_parabola_integral(param_1); + float val = 0.0; + if (scale < 1000000000.0) + { + float da = abs(a2 - a0); + float sqrt_scale = sqrt(scale); + if (sign(x0) == sign(x2)) + { + val = da * sqrt_scale; + } + else + { + float xmin = sqrt_tol / sqrt_scale; + float param_2 = xmin; + val = (sqrt_tol * da) / approx_parabola_integral(param_2); + } + } + return SubdivResult{ val, a0, a2 }; +} + +static inline __attribute__((always_inline)) +uint fill_mode_from_flags(thread const uint& flags) +{ + return flags & 1u; +} + +static 
inline __attribute__((always_inline)) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize); + Path s; + s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); + s.tiles = TileRef{ raw2 }; + return s; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +float approx_parabola_inv_integral(thread const float& x) +{ + return x * sqrt(0.61000001430511474609375 + (0.1520999968051910400390625 + ((0.25 * x) * x))); +} + +static inline __attribute__((always_inline)) +float2 eval_quad(thread const float2& p0, thread const float2& p1, thread const float2& p2, thread const float& t) +{ + float mt = 1.0 - t; + return (p0 * (mt * mt)) + (((p1 * (mt * 2.0)) + (p2 * t)) * t); +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint _142 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.mem_offset, size, memory_order_relaxed); + uint offset = _142; + MallocResult r; + r.failed = (offset + size) > uint(int((v_136BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _171 = atomic_fetch_max_explicit((device atomic_uint*)&v_136.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline 
__attribute__((always_inline)) +TileRef Tile_index(thread const TileRef& ref, thread const uint& index) +{ + return TileRef{ ref.offset + (index * 8u) }; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_136, constant uint& v_136BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_136.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_136, constant uint& v_136BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.origin.x); + write_mem(param, param_1, param_2, v_136, v_136BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.origin.y); + write_mem(param_3, param_4, param_5, v_136, v_136BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.vector.x); + write_mem(param_6, param_7, param_8, v_136, v_136BufferSize); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.vector.y); + write_mem(param_9, param_10, param_11, v_136, v_136BufferSize); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.y_edge); + write_mem(param_12, param_13, param_14, v_136, v_136BufferSize); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = s.next.offset; + write_mem(param_15, param_16, param_17, v_136, v_136BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_136 [[buffer(0)]], const device ConfigBuf& _710 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + constant uint& v_136BufferSize = spvBufferSizeConstants[0]; + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef ref = PathSegRef{ 
_710.conf.pathseg_alloc.offset + (element_ix * 52u) }; + PathSegTag tag = PathSegTag{ 0u, 0u }; + if (element_ix < _710.conf.n_pathseg) + { + Alloc param; + param.offset = _710.conf.pathseg_alloc.offset; + PathSegRef param_1 = ref; + tag = PathSeg_tag(param, param_1, v_136, v_136BufferSize); + } + bool mem_ok = v_136.mem_error == 0u; + switch (tag.tag) + { + case 1u: + { + Alloc param_2; + param_2.offset = _710.conf.pathseg_alloc.offset; + PathSegRef param_3 = ref; + PathCubic cubic = PathSeg_Cubic_read(param_2, param_3, v_136, v_136BufferSize); + float2 err_v = (((cubic.p2 - cubic.p1) * 3.0) + cubic.p0) - cubic.p3; + float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); + uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875, 0.16666667163372039794921875))), 1u); + n_quads = min(n_quads, 16u); + float val = 0.0; + float2 qp0 = cubic.p0; + float _step = 1.0 / float(n_quads); + spvUnsafeArray keep_params; + for (uint i = 0u; i < n_quads; i++) + { + float t = float(i + 1u) * _step; + float2 param_4 = cubic.p0; + float2 param_5 = cubic.p1; + float2 param_6 = cubic.p2; + float2 param_7 = cubic.p3; + float param_8 = t; + float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8); + float2 param_9 = cubic.p0; + float2 param_10 = cubic.p1; + float2 param_11 = cubic.p2; + float2 param_12 = cubic.p3; + float param_13 = t - (0.5 * _step); + float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13); + qp1 = (qp1 * 2.0) - ((qp0 + qp2) * 0.5); + float2 param_14 = qp0; + float2 param_15 = qp1; + float2 param_16 = qp2; + float param_17 = 0.4743416607379913330078125; + SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17); + keep_params[i] = params; + val += params.val; + qp0 = qp2; + } + uint n = max(uint(ceil((val * 0.5) / 0.4743416607379913330078125)), 1u); + uint param_18 = tag.flags; + bool is_stroke = fill_mode_from_flags(param_18) == 1u; + uint path_ix = cubic.path_ix; + Alloc param_19; + param_19.offset = 
_710.conf.tile_alloc.offset; + PathRef param_20 = PathRef{ _710.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_19, param_20, v_136, v_136BufferSize); + uint param_21 = path.tiles.offset; + uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_23 = mem_ok; + Alloc path_alloc = new_alloc(param_21, param_22, param_23); + int4 bbox = int4(path.bbox); + float2 p0 = cubic.p0; + qp0 = cubic.p0; + float v_step = val / float(n); + int n_out = 1; + float val_sum = 0.0; + float2 p1; + float _1147; + TileSeg tile_seg; + for (uint i_1 = 0u; i_1 < n_quads; i_1++) + { + float t_1 = float(i_1 + 1u) * _step; + float2 param_24 = cubic.p0; + float2 param_25 = cubic.p1; + float2 param_26 = cubic.p2; + float2 param_27 = cubic.p3; + float param_28 = t_1; + float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28); + float2 param_29 = cubic.p0; + float2 param_30 = cubic.p1; + float2 param_31 = cubic.p2; + float2 param_32 = cubic.p3; + float param_33 = t_1 - (0.5 * _step); + float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33); + qp1_1 = (qp1_1 * 2.0) - ((qp0 + qp2_1) * 0.5); + SubdivResult params_1 = keep_params[i_1]; + float param_34 = params_1.a0; + float u0 = approx_parabola_inv_integral(param_34); + float param_35 = params_1.a2; + float u2 = approx_parabola_inv_integral(param_35); + float uscale = 1.0 / (u2 - u0); + float target = float(n_out) * v_step; + for (;;) + { + bool _1040 = uint(n_out) == n; + bool _1050; + if (!_1040) + { + _1050 = target < (val_sum + params_1.val); + } + else + { + _1050 = _1040; + } + if (_1050) + { + if (uint(n_out) == n) + { + p1 = cubic.p3; + } + else + { + float u = (target - val_sum) / params_1.val; + float a = mix(params_1.a0, params_1.a2, u); + float param_36 = a; + float au = approx_parabola_inv_integral(param_36); + float t_2 = (au - u0) * uscale; + float2 param_37 = qp0; + float2 param_38 = qp1_1; + float2 param_39 = qp2_1; + float 
param_40 = t_2; + p1 = eval_quad(param_37, param_38, param_39, param_40); + } + float xmin = fast::min(p0.x, p1.x) - cubic.stroke.x; + float xmax = fast::max(p0.x, p1.x) + cubic.stroke.x; + float ymin = fast::min(p0.y, p1.y) - cubic.stroke.y; + float ymax = fast::max(p0.y, p1.y) + cubic.stroke.y; + float dx = p1.x - p0.x; + float dy = p1.y - p0.y; + if (abs(dy) < 9.999999717180685365747194737196e-10) + { + _1147 = 1000000000.0; + } + else + { + _1147 = dx / dy; + } + float invslope = _1147; + float c = (cubic.stroke.x + (abs(invslope) * (8.0 + cubic.stroke.y))) * 0.0625; + float b = invslope; + float a_1 = (p0.x - ((p0.y - 8.0) * b)) * 0.0625; + int x0 = int(floor(xmin * 0.0625)); + int x1 = int(floor(xmax * 0.0625) + 1.0); + int y0 = int(floor(ymin * 0.0625)); + int y1 = int(floor(ymax * 0.0625) + 1.0); + x0 = clamp(x0, bbox.x, bbox.z); + y0 = clamp(y0, bbox.y, bbox.w); + x1 = clamp(x1, bbox.x, bbox.z); + y1 = clamp(y1, bbox.y, bbox.w); + float xc = a_1 + (b * float(y0)); + int stride = bbox.z - bbox.x; + int base = ((y0 - bbox.y) * stride) - bbox.x; + uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); + uint param_41 = n_tile_alloc * 24u; + MallocResult _1263 = malloc(param_41, v_136, v_136BufferSize); + MallocResult tile_alloc = _1263; + if (tile_alloc.failed || (!mem_ok)) + { + return; + } + uint tile_offset = tile_alloc.alloc.offset; + int xray = int(floor(p0.x * 0.0625)); + int last_xray = int(floor(p1.x * 0.0625)); + if (p0.y > p1.y) + { + int tmp = xray; + xray = last_xray; + last_xray = tmp; + } + for (int y = y0; y < y1; y++) + { + float tile_y0 = float(y * 16); + int xbackdrop = max((xray + 1), bbox.x); + bool _1319 = !is_stroke; + bool _1329; + if (_1319) + { + _1329 = fast::min(p0.y, p1.y) < tile_y0; + } + else + { + _1329 = _1319; + } + bool _1336; + if (_1329) + { + _1336 = xbackdrop < bbox.z; + } + else + { + _1336 = _1329; + } + if (_1336) + { + int backdrop = (p1.y < p0.y) ? 
1 : (-1); + TileRef param_42 = path.tiles; + uint param_43 = uint(base + xbackdrop); + TileRef tile_ref = Tile_index(param_42, param_43); + uint tile_el = tile_ref.offset >> uint(2); + Alloc param_44 = path_alloc; + uint param_45 = tile_el + 1u; + if (touch_mem(param_44, param_45)) + { + uint _1374 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed); + } + } + int next_xray = last_xray; + if (y < (y1 - 1)) + { + float tile_y1 = float((y + 1) * 16); + float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy); + next_xray = int(floor(x_edge * 0.0625)); + } + int min_xray = min(xray, next_xray); + int max_xray = max(xray, next_xray); + int xx0 = min(int(floor(xc - c)), min_xray); + int xx1 = max(int(ceil(xc + c)), (max_xray + 1)); + xx0 = clamp(xx0, x0, x1); + xx1 = clamp(xx1, x0, x1); + for (int x = xx0; x < xx1; x++) + { + float tile_x0 = float(x * 16); + TileRef param_46 = TileRef{ path.tiles.offset }; + uint param_47 = uint(base + x); + TileRef tile_ref_1 = Tile_index(param_46, param_47); + uint tile_el_1 = tile_ref_1.offset >> uint(2); + uint old = 0u; + Alloc param_48 = path_alloc; + uint param_49 = tile_el_1; + if (touch_mem(param_48, param_49)) + { + uint _1477 = atomic_exchange_explicit((device atomic_uint*)&v_136.memory[tile_el_1], tile_offset, memory_order_relaxed); + old = _1477; + } + tile_seg.origin = p0; + tile_seg.vector = p1 - p0; + float y_edge = 0.0; + if (!is_stroke) + { + y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx); + if (fast::min(p0.x, p1.x) < tile_x0) + { + float2 p = float2(tile_x0, y_edge); + if (p0.x > p1.x) + { + tile_seg.vector = p - p0; + } + else + { + tile_seg.origin = p; + tile_seg.vector = p1 - p; + } + if (tile_seg.vector.x == 0.0) + { + tile_seg.vector.x = sign(p1.x - p0.x) * 9.999999717180685365747194737196e-10; + } + } + if ((x <= min_xray) || (max_xray < x)) + { + y_edge = 1000000000.0; + } + } + tile_seg.y_edge = y_edge; + tile_seg.next.offset = old; + Alloc 
param_50 = tile_alloc.alloc; + TileSegRef param_51 = TileSegRef{ tile_offset }; + TileSeg param_52 = tile_seg; + TileSeg_write(param_50, param_51, param_52, v_136, v_136BufferSize); + tile_offset += 24u; + } + xc += b; + base += stride; + xray = next_xray; + } + n_out++; + target += v_step; + p0 = p1; + continue; + } + else + { + break; + } + } + val_sum += params_1.val; + qp0 = qp2_1; + } + break; + } + } +} + diff --git a/piet-gpu/shader/gen/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv new file mode 100644 index 0000000..5e6beda Binary files /dev/null and b/piet-gpu/shader/gen/path_coarse.spv differ diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil new file mode 100644 index 0000000..6130712 Binary files /dev/null and b/piet-gpu/shader/gen/pathseg.dxil differ diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl new file mode 100644 index 0000000..578417f --- /dev/null +++ b/piet-gpu/shader/gen/pathseg.hlsl @@ -0,0 +1,661 @@ +struct Alloc +{ + uint offset; +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct Monoid +{ + float4 bbox; + uint flags; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint 
n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const TagMonoid _135 = { 0u, 0u, 0u, 0u, 0u }; +static const Monoid _567 = { 0.0f.xxxx, 0u }; + +RWByteAddressBuffer _111 : register(u0, space0); +ByteAddressBuffer _574 : register(t2, space0); +ByteAddressBuffer _639 : register(t1, space0); +ByteAddressBuffer _710 : register(t3, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared TagMonoid sh_tag[256]; +groupshared Monoid sh_scratch[256]; + +TagMonoid reduce_tag(uint tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(countbits((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(countbits(tag_word & 1077952576u))); + c.path_ix = uint(int(countbits(tag_word & 269488144u))); + c.trans_ix = uint(int(countbits(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +TagMonoid tag_monoid_identity() +{ + return _135; +} + +float2 read_f32_point(uint ix) +{ + float x = asfloat(_574.Load(ix * 4 + 0)); + float y = asfloat(_574.Load((ix + 1u) * 4 + 0)); + return 
float2(x, y); +} + +float2 read_i16_point(uint ix) +{ + uint raw = _574.Load(ix * 4 + 0); + float x = float(int(raw << uint(16)) >> 16); + float y = float(int(raw) >> 16); + return float2(x, y); +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _111.Load(offset * 4 + 8); + return v; +} + +TransformSeg TransformSeg_read(Alloc a, TransformSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TransformSeg s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _111.Store(offset * 4 + 8, val); +} + +void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.p0.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.p0.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.p1.x); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = 
asuint(s.p1.y); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.p2.x); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.p2.y); + write_mem(param_15, param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = asuint(s.p3.x); + write_mem(param_18, param_19, param_20); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = asuint(s.p3.y); + write_mem(param_21, param_22, param_23); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = s.path_ix; + write_mem(param_24, param_25, param_26); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = s.trans_ix; + write_mem(param_27, param_28, param_29); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = asuint(s.stroke.x); + write_mem(param_30, param_31, param_32); + Alloc param_33 = a; + uint param_34 = ix + 11u; + uint param_35 = asuint(s.stroke.y); + write_mem(param_33, param_34, param_35); +} + +void PathSeg_Cubic_write(Alloc a, PathSegRef ref, uint flags, PathCubic s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = (flags << uint(16)) | 1u; + write_mem(param, param_1, param_2); + PathCubicRef _458 = { ref.offset + 4u }; + Alloc param_3 = a; + PathCubicRef param_4 = _458; + PathCubic param_5 = s; + PathCubic_write(param_3, param_4, param_5); +} + +Monoid combine_monoid(Monoid a, Monoid b) +{ + Monoid c; + c.bbox = b.bbox; + bool _472 = (a.flags & 1u) == 0u; + bool _480; + if (_472) + { + _480 = b.bbox.z <= b.bbox.x; + } + else + { + _480 = _472; + } + bool _488; + if (_480) + { + _488 = b.bbox.w <= b.bbox.y; + } + else + { + _488 = _480; + } + if (_488) + { + c.bbox = a.bbox; + } + else + { + bool _498 = (a.flags & 1u) == 0u; + bool _505; + if (_498) + { + _505 = (b.flags & 2u) == 0u; + } + else + { + _505 = _498; + } + bool _522; + if (_505) + { + bool _512 = a.bbox.z > 
a.bbox.x; + bool _521; + if (!_512) + { + _521 = a.bbox.w > a.bbox.y; + } + else + { + _521 = _512; + } + _522 = _521; + } + else + { + _522 = _505; + } + if (_522) + { + float4 _529 = c.bbox; + float2 _531 = min(a.bbox.xy, _529.xy); + c.bbox.x = _531.x; + c.bbox.y = _531.y; + float4 _540 = c.bbox; + float2 _542 = max(a.bbox.zw, _540.zw); + c.bbox.z = _542.x; + c.bbox.w = _542.y; + } + } + c.flags = (a.flags & 2u) | b.flags; + c.flags |= ((a.flags & 1u) << uint(1)); + return c; +} + +Monoid monoid_identity() +{ + return _567; +} + +uint round_down(float x) +{ + return uint(max(0.0f, floor(x) + 32768.0f)); +} + +uint round_up(float x) +{ + return uint(min(65535.0f, ceil(x) + 32768.0f)); +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 4u; + uint tag_word = _574.Load(((_639.Load(92) >> uint(2)) + (ix >> uint(2))) * 4 + 0); + uint param = tag_word; + TagMonoid local_tm = reduce_tag(param); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i)) + { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + TagMonoid param_1 = other; + TagMonoid param_2 = local_tm; + local_tm = combine_tag_monoid(param_1, param_2); + } + GroupMemoryBarrierWithGroupSync(); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + GroupMemoryBarrierWithGroupSync(); + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + TagMonoid _716; + _716.trans_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 0); + _716.linewidth_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 4); + _716.pathseg_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 8); + _716.path_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 12); + _716.pathseg_offset = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 16); + tm.trans_ix = _716.trans_ix; + tm.linewidth_ix = _716.linewidth_ix; + tm.pathseg_ix = _716.pathseg_ix; + tm.path_ix = _716.path_ix; + tm.pathseg_offset = _716.pathseg_offset; + } + if 
(gl_LocalInvocationID.x > 0u) + { + TagMonoid param_3 = tm; + TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; + tm = combine_tag_monoid(param_3, param_4); + } + uint ps_ix = (_639.Load(96) >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.Load(88) >> uint(2)) + tm.linewidth_ix; + uint save_path_ix = tm.path_ix; + uint trans_ix = tm.trans_ix; + TransformSegRef _771 = { _639.Load(36) + (trans_ix * 24u) }; + TransformSegRef trans_ref = _771; + PathSegRef _781 = { _639.Load(28) + (tm.pathseg_ix * 52u) }; + PathSegRef ps_ref = _781; + float linewidth[4]; + uint save_trans_ix[4]; + float2 p0; + float2 p1; + float2 p2; + float2 p3; + Alloc param_13; + Monoid local[4]; + PathCubic cubic; + Alloc param_15; + for (uint i_1 = 0u; i_1 < 4u; i_1++) + { + linewidth[i_1] = asfloat(_574.Load(lw_ix * 4 + 0)); + save_trans_ix[i_1] = trans_ix; + uint tag_byte = tag_word >> (i_1 * 8u); + uint seg_type = tag_byte & 3u; + if (seg_type != 0u) + { + if ((tag_byte & 8u) != 0u) + { + uint param_5 = ps_ix; + p0 = read_f32_point(param_5); + uint param_6 = ps_ix + 2u; + p1 = read_f32_point(param_6); + if (seg_type >= 2u) + { + uint param_7 = ps_ix + 4u; + p2 = read_f32_point(param_7); + if (seg_type == 3u) + { + uint param_8 = ps_ix + 6u; + p3 = read_f32_point(param_8); + } + } + } + else + { + uint param_9 = ps_ix; + p0 = read_i16_point(param_9); + uint param_10 = ps_ix + 1u; + p1 = read_i16_point(param_10); + if (seg_type >= 2u) + { + uint param_11 = ps_ix + 2u; + p2 = read_i16_point(param_11); + if (seg_type == 3u) + { + uint param_12 = ps_ix + 3u; + p3 = read_i16_point(param_12); + } + } + } + Alloc _877; + _877.offset = _639.Load(36); + param_13.offset = _877.offset; + TransformSegRef param_14 = trans_ref; + TransformSeg transform = TransformSeg_read(param_13, param_14); + p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; + p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate; + float4 bbox = float4(min(p0, p1), 
max(p0, p1)); + if (seg_type >= 2u) + { + p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; + float4 _947 = bbox; + float2 _950 = min(_947.xy, p2); + bbox.x = _950.x; + bbox.y = _950.y; + float4 _955 = bbox; + float2 _958 = max(_955.zw, p2); + bbox.z = _958.x; + bbox.w = _958.y; + if (seg_type == 3u) + { + p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; + float4 _983 = bbox; + float2 _986 = min(_983.xy, p3); + bbox.x = _986.x; + bbox.y = _986.y; + float4 _991 = bbox; + float2 _994 = max(_991.zw, p3); + bbox.z = _994.x; + bbox.w = _994.y; + } + else + { + p3 = p2; + p2 = lerp(p1, p2, 0.3333333432674407958984375f.xx); + p1 = lerp(p1, p0, 0.3333333432674407958984375f.xx); + } + } + else + { + p3 = p1; + p2 = lerp(p3, p0, 0.3333333432674407958984375f.xx); + p1 = lerp(p0, p3, 0.3333333432674407958984375f.xx); + } + float2 stroke = 0.0f.xx; + if (linewidth[i_1] >= 0.0f) + { + stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5f * linewidth[i_1]); + bbox += float4(-stroke, stroke); + } + local[i_1].bbox = bbox; + local[i_1].flags = 0u; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth[i_1] >= 0.0f); + Alloc _1089; + _1089.offset = _639.Load(28); + param_15.offset = _1089.offset; + PathSegRef param_16 = ps_ref; + uint param_17 = fill_mode; + PathCubic param_18 = cubic; + PathSeg_Cubic_write(param_15, param_16, param_17, param_18); + ps_ref.offset += 52u; + uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); + uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); + ps_ix += n_words; + } + else + { + local[i_1].bbox = 0.0f.xxxx; + uint is_path = (tag_byte >> uint(4)) & 1u; + local[i_1].flags = is_path; + tm.path_ix += is_path; + trans_ix += ((tag_byte >> uint(5)) & 1u); + trans_ref.offset 
+= (((tag_byte >> uint(5)) & 1u) * 24u); + lw_ix += ((tag_byte >> uint(6)) & 1u); + } + } + Monoid agg = local[0]; + for (uint i_2 = 1u; i_2 < 4u; i_2++) + { + Monoid param_19 = agg; + Monoid param_20 = local[i_2]; + agg = combine_monoid(param_19, param_20); + local[i_2] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_3)) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)]; + Monoid param_21 = other_1; + Monoid param_22 = agg; + agg = combine_monoid(param_21, param_22); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + uint path_ix = save_path_ix; + uint bbox_out_ix = (_639.Load(40) >> uint(2)) + (path_ix * 6u); + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_4 = 0u; i_4 < 4u; i_4++) + { + Monoid param_23 = row; + Monoid param_24 = local[i_4]; + Monoid m = combine_monoid(param_23, param_24); + bool do_atomic = false; + bool _1264 = i_4 == 3u; + bool _1270; + if (_1264) + { + _1270 = gl_LocalInvocationID.x == 255u; + } + else + { + _1270 = _1264; + } + if (_1270) + { + do_atomic = true; + } + if ((m.flags & 1u) != 0u) + { + _111.Store((bbox_out_ix + 4u) * 4 + 8, asuint(linewidth[i_4])); + _111.Store((bbox_out_ix + 5u) * 4 + 8, save_trans_ix[i_4]); + if ((m.flags & 2u) == 0u) + { + do_atomic = true; + } + else + { + float param_25 = m.bbox.x; + _111.Store(bbox_out_ix * 4 + 8, round_down(param_25)); + float param_26 = m.bbox.y; + _111.Store((bbox_out_ix + 1u) * 4 + 8, round_down(param_26)); + float param_27 = m.bbox.z; + _111.Store((bbox_out_ix + 2u) * 4 + 8, round_up(param_27)); + float param_28 = m.bbox.w; + _111.Store((bbox_out_ix + 3u) * 4 + 8, round_up(param_28)); + bbox_out_ix += 6u; + do_atomic = false; + } + } + if (do_atomic) + { + bool _1335 = 
m.bbox.z > m.bbox.x; + bool _1344; + if (!_1335) + { + _1344 = m.bbox.w > m.bbox.y; + } + else + { + _1344 = _1335; + } + if (_1344) + { + float param_29 = m.bbox.x; + uint _1353; + _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1353); + float param_30 = m.bbox.y; + uint _1361; + _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1361); + float param_31 = m.bbox.z; + uint _1369; + _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1369); + float param_32 = m.bbox.w; + uint _1377; + _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1377); + } + bbox_out_ix += 6u; + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl new file mode 100644 index 0000000..9f6328e --- /dev/null +++ b/piet-gpu/shader/gen/pathseg.msl @@ -0,0 +1,717 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? 
Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct Monoid +{ + float4 bbox; + uint flags; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint 
drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct ParentBuf +{ + TagMonoid_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid reduce_tag(thread const uint& tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); + c.path_ix = uint(int(popcount(tag_word & 269488144u))); + c.trans_ix = uint(int(popcount(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid tag_monoid_identity() +{ + return TagMonoid{ 0u, 0u, 0u, 0u, 0u }; +} + +static inline __attribute__((always_inline)) +float2 read_f32_point(thread const uint& ix, const device SceneBuf& v_574) +{ + float x = as_type(v_574.scene[ix]); + float y = as_type(v_574.scene[ix + 1u]); + return float2(x, y); +} + +static inline __attribute__((always_inline)) +float2 read_i16_point(thread const uint& ix, const device SceneBuf& v_574) +{ + uint raw = v_574.scene[ix]; + float x = float(int(raw << uint(16)) >> 16); + float y = float(int(raw) >> 16); + return float2(x, y); +} + 
+static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_111) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_111.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +TransformSeg TransformSeg_read(thread const Alloc& a, thread const TransformSegRef& ref, device Memory& v_111) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_111); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_111); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_111); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_111); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_111); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_111); + TransformSeg s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_111) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_111.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void PathCubic_write(thread const Alloc& a, thread const PathCubicRef& ref, thread const PathCubic& s, device Memory& v_111) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.p0.x); + write_mem(param, param_1, param_2, 
v_111); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.p0.y); + write_mem(param_3, param_4, param_5, v_111); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.p1.x); + write_mem(param_6, param_7, param_8, v_111); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.p1.y); + write_mem(param_9, param_10, param_11, v_111); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.p2.x); + write_mem(param_12, param_13, param_14, v_111); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.p2.y); + write_mem(param_15, param_16, param_17, v_111); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = as_type(s.p3.x); + write_mem(param_18, param_19, param_20, v_111); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = as_type(s.p3.y); + write_mem(param_21, param_22, param_23, v_111); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = s.path_ix; + write_mem(param_24, param_25, param_26, v_111); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = s.trans_ix; + write_mem(param_27, param_28, param_29, v_111); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = as_type(s.stroke.x); + write_mem(param_30, param_31, param_32, v_111); + Alloc param_33 = a; + uint param_34 = ix + 11u; + uint param_35 = as_type(s.stroke.y); + write_mem(param_33, param_34, param_35, v_111); +} + +static inline __attribute__((always_inline)) +void PathSeg_Cubic_write(thread const Alloc& a, thread const PathSegRef& ref, thread const uint& flags, thread const PathCubic& s, device Memory& v_111) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = (flags << uint(16)) | 1u; + write_mem(param, param_1, param_2, v_111); + Alloc param_3 = a; + PathCubicRef param_4 = PathCubicRef{ ref.offset + 4u }; + PathCubic param_5 = s; + PathCubic_write(param_3, param_4, param_5, v_111); +} + +static inline 
__attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) +{ + Monoid c; + c.bbox = b.bbox; + bool _472 = (a.flags & 1u) == 0u; + bool _480; + if (_472) + { + _480 = b.bbox.z <= b.bbox.x; + } + else + { + _480 = _472; + } + bool _488; + if (_480) + { + _488 = b.bbox.w <= b.bbox.y; + } + else + { + _488 = _480; + } + if (_488) + { + c.bbox = a.bbox; + } + else + { + bool _498 = (a.flags & 1u) == 0u; + bool _505; + if (_498) + { + _505 = (b.flags & 2u) == 0u; + } + else + { + _505 = _498; + } + bool _522; + if (_505) + { + bool _512 = a.bbox.z > a.bbox.x; + bool _521; + if (!_512) + { + _521 = a.bbox.w > a.bbox.y; + } + else + { + _521 = _512; + } + _522 = _521; + } + else + { + _522 = _505; + } + if (_522) + { + float4 _529 = c.bbox; + float2 _531 = fast::min(a.bbox.xy, _529.xy); + c.bbox.x = _531.x; + c.bbox.y = _531.y; + float4 _540 = c.bbox; + float2 _542 = fast::max(a.bbox.zw, _540.zw); + c.bbox.z = _542.x; + c.bbox.w = _542.y; + } + } + c.flags = (a.flags & 2u) | b.flags; + c.flags |= ((a.flags & 1u) << uint(1)); + return c; +} + +static inline __attribute__((always_inline)) +Monoid monoid_identity() +{ + return Monoid{ float4(0.0), 0u }; +} + +static inline __attribute__((always_inline)) +uint round_down(thread const float& x) +{ + return uint(fast::max(0.0, floor(x) + 32768.0)); +} + +static inline __attribute__((always_inline)) +uint round_up(thread const float& x) +{ + return uint(fast::min(65535.0, ceil(x) + 32768.0)); +} + +kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _710 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup TagMonoid sh_tag[256]; + threadgroup Monoid sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 4u; + uint tag_word 
= v_574.scene[(_639.conf.pathtag_offset >> uint(2)) + (ix >> uint(2))]; + uint param = tag_word; + TagMonoid local_tm = reduce_tag(param); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i)) + { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + TagMonoid param_1 = other; + TagMonoid param_2 = local_tm; + local_tm = combine_tag_monoid(param_1, param_2); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _713 = gl_WorkGroupID.x - 1u; + tm.trans_ix = _710.parent[_713].trans_ix; + tm.linewidth_ix = _710.parent[_713].linewidth_ix; + tm.pathseg_ix = _710.parent[_713].pathseg_ix; + tm.path_ix = _710.parent[_713].path_ix; + tm.pathseg_offset = _710.parent[_713].pathseg_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + TagMonoid param_3 = tm; + TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; + tm = combine_tag_monoid(param_3, param_4); + } + uint ps_ix = (_639.conf.pathseg_offset >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.conf.linewidth_offset >> uint(2)) + tm.linewidth_ix; + uint save_path_ix = tm.path_ix; + uint trans_ix = tm.trans_ix; + TransformSegRef trans_ref = TransformSegRef{ _639.conf.trans_alloc.offset + (trans_ix * 24u) }; + PathSegRef ps_ref = PathSegRef{ _639.conf.pathseg_alloc.offset + (tm.pathseg_ix * 52u) }; + spvUnsafeArray linewidth; + spvUnsafeArray save_trans_ix; + float2 p0; + float2 p1; + float2 p2; + float2 p3; + Alloc param_13; + spvUnsafeArray local; + PathCubic cubic; + Alloc param_15; + for (uint i_1 = 0u; i_1 < 4u; i_1++) + { + linewidth[i_1] = as_type(v_574.scene[lw_ix]); + save_trans_ix[i_1] = trans_ix; + uint tag_byte = tag_word >> (i_1 * 8u); + uint seg_type = tag_byte & 3u; + if (seg_type != 0u) + { + 
if ((tag_byte & 8u) != 0u) + { + uint param_5 = ps_ix; + p0 = read_f32_point(param_5, v_574); + uint param_6 = ps_ix + 2u; + p1 = read_f32_point(param_6, v_574); + if (seg_type >= 2u) + { + uint param_7 = ps_ix + 4u; + p2 = read_f32_point(param_7, v_574); + if (seg_type == 3u) + { + uint param_8 = ps_ix + 6u; + p3 = read_f32_point(param_8, v_574); + } + } + } + else + { + uint param_9 = ps_ix; + p0 = read_i16_point(param_9, v_574); + uint param_10 = ps_ix + 1u; + p1 = read_i16_point(param_10, v_574); + if (seg_type >= 2u) + { + uint param_11 = ps_ix + 2u; + p2 = read_i16_point(param_11, v_574); + if (seg_type == 3u) + { + uint param_12 = ps_ix + 3u; + p3 = read_i16_point(param_12, v_574); + } + } + } + param_13.offset = _639.conf.trans_alloc.offset; + TransformSegRef param_14 = trans_ref; + TransformSeg transform = TransformSeg_read(param_13, param_14, v_111); + p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; + p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate; + float4 bbox = float4(fast::min(p0, p1), fast::max(p0, p1)); + if (seg_type >= 2u) + { + p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; + float4 _947 = bbox; + float2 _950 = fast::min(_947.xy, p2); + bbox.x = _950.x; + bbox.y = _950.y; + float4 _955 = bbox; + float2 _958 = fast::max(_955.zw, p2); + bbox.z = _958.x; + bbox.w = _958.y; + if (seg_type == 3u) + { + p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; + float4 _983 = bbox; + float2 _986 = fast::min(_983.xy, p3); + bbox.x = _986.x; + bbox.y = _986.y; + float4 _991 = bbox; + float2 _994 = fast::max(_991.zw, p3); + bbox.z = _994.x; + bbox.w = _994.y; + } + else + { + p3 = p2; + p2 = mix(p1, p2, float2(0.3333333432674407958984375)); + p1 = mix(p1, p0, float2(0.3333333432674407958984375)); + } + } + else + { + p3 = p1; + p2 = mix(p3, p0, float2(0.3333333432674407958984375)); + p1 = mix(p0, p3, 
float2(0.3333333432674407958984375)); + } + float2 stroke = float2(0.0); + if (linewidth[i_1] >= 0.0) + { + stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5 * linewidth[i_1]); + bbox += float4(-stroke, stroke); + } + local[i_1].bbox = bbox; + local[i_1].flags = 0u; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth[i_1] >= 0.0); + param_15.offset = _639.conf.pathseg_alloc.offset; + PathSegRef param_16 = ps_ref; + uint param_17 = fill_mode; + PathCubic param_18 = cubic; + PathSeg_Cubic_write(param_15, param_16, param_17, param_18, v_111); + ps_ref.offset += 52u; + uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); + uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); + ps_ix += n_words; + } + else + { + local[i_1].bbox = float4(0.0); + uint is_path = (tag_byte >> uint(4)) & 1u; + local[i_1].flags = is_path; + tm.path_ix += is_path; + trans_ix += ((tag_byte >> uint(5)) & 1u); + trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u); + lw_ix += ((tag_byte >> uint(6)) & 1u); + } + } + Monoid agg = local[0]; + for (uint i_2 = 1u; i_2 < 4u; i_2++) + { + Monoid param_19 = agg; + Monoid param_20 = local[i_2]; + agg = combine_monoid(param_19, param_20); + local[i_2] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_3 = 0u; i_3 < 8u; i_3++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_3)) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)]; + Monoid param_21 = other_1; + Monoid param_22 = agg; + agg = combine_monoid(param_21, param_22); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint path_ix = save_path_ix; + uint bbox_out_ix = 
(_639.conf.path_bbox_alloc.offset >> uint(2)) + (path_ix * 6u); + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_4 = 0u; i_4 < 4u; i_4++) + { + Monoid param_23 = row; + Monoid param_24 = local[i_4]; + Monoid m = combine_monoid(param_23, param_24); + bool do_atomic = false; + bool _1264 = i_4 == 3u; + bool _1270; + if (_1264) + { + _1270 = gl_LocalInvocationID.x == 255u; + } + else + { + _1270 = _1264; + } + if (_1270) + { + do_atomic = true; + } + if ((m.flags & 1u) != 0u) + { + v_111.memory[bbox_out_ix + 4u] = as_type(linewidth[i_4]); + v_111.memory[bbox_out_ix + 5u] = save_trans_ix[i_4]; + if ((m.flags & 2u) == 0u) + { + do_atomic = true; + } + else + { + float param_25 = m.bbox.x; + v_111.memory[bbox_out_ix] = round_down(param_25); + float param_26 = m.bbox.y; + v_111.memory[bbox_out_ix + 1u] = round_down(param_26); + float param_27 = m.bbox.z; + v_111.memory[bbox_out_ix + 2u] = round_up(param_27); + float param_28 = m.bbox.w; + v_111.memory[bbox_out_ix + 3u] = round_up(param_28); + bbox_out_ix += 6u; + do_atomic = false; + } + } + if (do_atomic) + { + bool _1335 = m.bbox.z > m.bbox.x; + bool _1344; + if (!_1335) + { + _1344 = m.bbox.w > m.bbox.y; + } + else + { + _1344 = _1335; + } + if (_1344) + { + float param_29 = m.bbox.x; + uint _1353 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); + float param_30 = m.bbox.y; + uint _1361 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); + float param_31 = m.bbox.z; + uint _1369 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); + float param_32 = m.bbox.w; + uint _1377 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); + } + bbox_out_ix 
+= 6u; + } + } +} + diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv new file mode 100644 index 0000000..4e2e9d5 Binary files /dev/null and b/piet-gpu/shader/gen/pathseg.spv differ diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil new file mode 100644 index 0000000..4c2bd23 Binary files /dev/null and b/piet-gpu/shader/gen/pathtag_reduce.dxil differ diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl new file mode 100644 index 0000000..5f7d125 --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl @@ -0,0 +1,138 @@ +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u); + +ByteAddressBuffer _139 : register(t1, space0); +ByteAddressBuffer _151 : register(t2, space0); +RWByteAddressBuffer _238 : register(u3, space0); +RWByteAddressBuffer _258 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared TagMonoid sh_scratch[128]; + 
+TagMonoid reduce_tag(uint tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(countbits((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(countbits(tag_word & 1077952576u))); + c.path_ix = uint(int(countbits(tag_word & 269488144u))); + c.trans_ix = uint(int(countbits(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 2u; + uint scene_ix = (_139.Load(92) >> uint(2)) + ix; + uint tag_word = _151.Load(scene_ix * 4 + 0); + uint param = tag_word; + TagMonoid agg = reduce_tag(param); + for (uint i = 1u; i < 2u; i++) + { + tag_word = _151.Load((scene_ix + i) * 4 + 0); + uint param_1 = tag_word; + TagMonoid param_2 = agg; + TagMonoid param_3 = reduce_tag(param_1); + agg = combine_tag_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 7u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + TagMonoid param_4 = agg; + TagMonoid param_5 = other; + agg = combine_tag_monoid(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _238.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); + _238.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); + 
_238.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); + _238.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); + _238.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); + } +} + +[numthreads(128, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl new file mode 100644 index 0000000..91e0cca --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_reduce.msl @@ -0,0 +1,154 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct OutBuf +{ + TagMonoid_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(128u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid 
reduce_tag(thread const uint& tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); + c.path_ix = uint(int(popcount(tag_word & 269488144u))); + c.trans_ix = uint(int(popcount(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _151 [[buffer(2)]], device OutBuf& _238 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup TagMonoid sh_scratch[128]; + uint ix = gl_GlobalInvocationID.x * 2u; + uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix; + uint tag_word = _151.scene[scene_ix]; + uint param = tag_word; + TagMonoid agg = reduce_tag(param); + for (uint i = 1u; i < 2u; i++) + { + tag_word = _151.scene[scene_ix + i]; + uint param_1 = tag_word; + TagMonoid param_2 = agg; + TagMonoid param_3 = reduce_tag(param_1); + agg = combine_tag_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 7u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u) + { + TagMonoid other = 
sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + TagMonoid param_4 = agg; + TagMonoid param_5 = other; + agg = combine_tag_monoid(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _238.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; + _238.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; + _238.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; + _238.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _238.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; + } +} + diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv new file mode 100644 index 0000000..f1d8679 Binary files /dev/null and b/piet-gpu/shader/gen/pathtag_reduce.spv differ diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil new file mode 100644 index 0000000..77f12e6 Binary files /dev/null and b/piet-gpu/shader/gen/pathtag_root.dxil differ diff --git a/piet-gpu/shader/gen/pathtag_root.hlsl b/piet-gpu/shader/gen/pathtag_root.hlsl new file mode 100644 index 0000000..7ad806c --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_root.hlsl @@ -0,0 +1,115 @@ +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const TagMonoid _18 = { 0u, 0u, 0u, 0u, 0u }; + +RWByteAddressBuffer _78 : register(u0, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared TagMonoid sh_scratch[256]; + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; 
+ c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +TagMonoid tag_monoid_identity() +{ + return _18; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TagMonoid _82; + _82.trans_ix = _78.Load(ix * 20 + 0); + _82.linewidth_ix = _78.Load(ix * 20 + 4); + _82.pathseg_ix = _78.Load(ix * 20 + 8); + _82.path_ix = _78.Load(ix * 20 + 12); + _82.pathseg_offset = _78.Load(ix * 20 + 16); + TagMonoid local[8]; + local[0].trans_ix = _82.trans_ix; + local[0].linewidth_ix = _82.linewidth_ix; + local[0].pathseg_ix = _82.pathseg_ix; + local[0].path_ix = _82.path_ix; + local[0].pathseg_offset = _82.pathseg_offset; + TagMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + TagMonoid param = local[i - 1u]; + TagMonoid _115; + _115.trans_ix = _78.Load((ix + i) * 20 + 0); + _115.linewidth_ix = _78.Load((ix + i) * 20 + 4); + _115.pathseg_ix = _78.Load((ix + i) * 20 + 8); + _115.path_ix = _78.Load((ix + i) * 20 + 12); + _115.pathseg_offset = _78.Load((ix + i) * 20 + 16); + param_1.trans_ix = _115.trans_ix; + param_1.linewidth_ix = _115.linewidth_ix; + param_1.pathseg_ix = _115.pathseg_ix; + param_1.path_ix = _115.path_ix; + param_1.pathseg_offset = _115.pathseg_offset; + local[i] = combine_tag_monoid(param, param_1); + } + TagMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + TagMonoid param_2 = other; + TagMonoid param_3 = agg; + agg = combine_tag_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + TagMonoid row = tag_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + TagMonoid param_4 = row; 
+ TagMonoid param_5 = local[i_2]; + TagMonoid m = combine_tag_monoid(param_4, param_5); + uint _210 = ix + i_2; + _78.Store(_210 * 20 + 0, m.trans_ix); + _78.Store(_210 * 20 + 4, m.linewidth_ix); + _78.Store(_210 * 20 + 8, m.pathseg_ix); + _78.Store(_210 * 20 + 12, m.path_ix); + _78.Store(_210 * 20 + 16, m.pathseg_offset); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathtag_root.msl b/piet-gpu/shader/gen/pathtag_root.msl new file mode 100644 index 0000000..65e3741 --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_root.msl @@ -0,0 +1,146 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct DataBuf +{ + TagMonoid_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 
1u); + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid tag_monoid_identity() +{ + return TagMonoid{ 0u, 0u, 0u, 0u, 0u }; +} + +kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup TagMonoid sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].trans_ix = _78.data[ix].trans_ix; + local[0].linewidth_ix = _78.data[ix].linewidth_ix; + local[0].pathseg_ix = _78.data[ix].pathseg_ix; + local[0].path_ix = _78.data[ix].path_ix; + local[0].pathseg_offset = _78.data[ix].pathseg_offset; + TagMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _109 = ix + i; + TagMonoid param = local[i - 1u]; + param_1.trans_ix = _78.data[_109].trans_ix; + param_1.linewidth_ix = _78.data[_109].linewidth_ix; + param_1.pathseg_ix = _78.data[_109].pathseg_ix; + param_1.path_ix = _78.data[_109].path_ix; + param_1.pathseg_offset = _78.data[_109].pathseg_offset; + local[i] = combine_tag_monoid(param, param_1); + } + TagMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + TagMonoid param_2 = other; + TagMonoid param_3 = agg; + agg = combine_tag_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + 
threadgroup_barrier(mem_flags::mem_threadgroup); + TagMonoid row = tag_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + TagMonoid param_4 = row; + TagMonoid param_5 = local[i_2]; + TagMonoid m = combine_tag_monoid(param_4, param_5); + uint _210 = ix + i_2; + _78.data[_210].trans_ix = m.trans_ix; + _78.data[_210].linewidth_ix = m.linewidth_ix; + _78.data[_210].pathseg_ix = m.pathseg_ix; + _78.data[_210].path_ix = m.path_ix; + _78.data[_210].pathseg_offset = m.pathseg_offset; + } +} + diff --git a/piet-gpu/shader/gen/pathtag_root.spv b/piet-gpu/shader/gen/pathtag_root.spv new file mode 100644 index 0000000..3783b49 Binary files /dev/null and b/piet-gpu/shader/gen/pathtag_root.spv differ diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil new file mode 100644 index 0000000..7759910 Binary files /dev/null and b/piet-gpu/shader/gen/tile_alloc.dxil differ diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl new file mode 100644 index 0000000..73e0a8e --- /dev/null +++ b/piet-gpu/shader/gen/tile_alloc.hlsl @@ -0,0 +1,264 @@ +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + 
uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _70 : register(u0, space0); +ByteAddressBuffer _181 : register(t1, space0); +ByteAddressBuffer _257 : register(t2, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared uint sh_tile_count[256]; +groupshared MallocResult sh_tile_alloc; + +float4 load_draw_bbox(uint draw_ix) +{ + uint base = (_181.Load(64) >> uint(2)) + (4u * draw_ix); + float x0 = asfloat(_70.Load(base * 4 + 8)); + float y0 = asfloat(_70.Load((base + 1u) * 4 + 8)); + float x1 = asfloat(_70.Load((base + 2u) * 4 + 8)); + float y1 = asfloat(_70.Load((base + 3u) * 4 + 8)); + float4 bbox = float4(x0, y0, x1, y1); + return bbox; +} + +Alloc new_alloc(uint offset, uint size, bool mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +MallocResult malloc(uint size) +{ + uint _76; + _70.InterlockedAdd(0, size, _76); + uint offset = _76; + uint _83; + _70.GetDimensions(_83); + _83 = (_83 - 8) / 4; + MallocResult r; + r.failed = (offset + size) > uint(int(_83) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _105; + _70.InterlockedMax(4, 1u, _105); + return r; + } + return r; +} + +Alloc slice_mem(Alloc a, uint offset, uint size) +{ + Alloc _131 = { a.offset + offset }; + return _131; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _70.Store(offset * 4 + 8, val); +} + +void Path_write(Alloc a, PathRef ref, Path s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = 
s.bbox.x | (s.bbox.y << uint(16)); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = s.tiles.offset; + write_mem(param_6, param_7, param_8); +} + +void comp_main() +{ + uint th_ix = gl_LocalInvocationID.x; + uint element_ix = gl_GlobalInvocationID.x; + PathRef _241 = { _181.Load(16) + (element_ix * 12u) }; + PathRef path_ref = _241; + uint drawtag_base = _181.Load(100) >> uint(2); + uint drawtag = 0u; + if (element_ix < _181.Load(0)) + { + drawtag = _257.Load((drawtag_base + element_ix) * 4 + 0); + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + if ((drawtag != 0u) && (drawtag != 37u)) + { + uint param = element_ix; + float4 bbox = load_draw_bbox(param); + x0 = int(floor(bbox.x * 0.0625f)); + y0 = int(floor(bbox.y * 0.0625f)); + x1 = int(ceil(bbox.z * 0.0625f)); + y1 = int(ceil(bbox.w * 0.0625f)); + } + x0 = clamp(x0, 0, int(_181.Load(8))); + y0 = clamp(y0, 0, int(_181.Load(12))); + x1 = clamp(x1, 0, int(_181.Load(8))); + y1 = clamp(y1, 0, int(_181.Load(12))); + Path path; + path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1)); + uint tile_count = uint((x1 - x0) * (y1 - y0)); + sh_tile_count[th_ix] = tile_count; + uint total_tile_count = tile_count; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if (th_ix >= (1u << i)) + { + total_tile_count += sh_tile_count[th_ix - (1u << i)]; + } + GroupMemoryBarrierWithGroupSync(); + sh_tile_count[th_ix] = total_tile_count; + } + if (th_ix == 255u) + { + uint param_1 = total_tile_count * 8u; + MallocResult _392 = malloc(param_1); + sh_tile_alloc = _392; + } + GroupMemoryBarrierWithGroupSync(); + MallocResult alloc_start = sh_tile_alloc; + bool _403; + if (!alloc_start.failed) + { + _403 = _70.Load(4) != 0u; + } + else + { + _403 = alloc_start.failed; + } + if (_403) + { + return; + } + if 
(element_ix < _181.Load(0)) + { + uint _416; + if (th_ix > 0u) + { + _416 = sh_tile_count[th_ix - 1u]; + } + else + { + _416 = 0u; + } + uint tile_subix = _416; + Alloc param_2 = alloc_start.alloc; + uint param_3 = 8u * tile_subix; + uint param_4 = 8u * tile_count; + Alloc tiles_alloc = slice_mem(param_2, param_3, param_4); + TileRef _438 = { tiles_alloc.offset }; + path.tiles = _438; + Alloc _444; + _444.offset = _181.Load(16); + Alloc param_5; + param_5.offset = _444.offset; + PathRef param_6 = path_ref; + Path param_7 = path; + Path_write(param_5, param_6, param_7); + } + uint total_count = sh_tile_count[255] * 2u; + uint start_ix = alloc_start.alloc.offset >> uint(2); + for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) + { + Alloc param_8 = alloc_start.alloc; + uint param_9 = start_ix + i_1; + uint param_10 = 0u; + write_mem(param_8, param_9, param_10); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl new file mode 100644 index 0000000..961be50 --- /dev/null +++ b/piet-gpu/shader/gen/tile_alloc.msl @@ -0,0 +1,273 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct MallocResult +{ + Alloc alloc; + bool failed; +}; + +struct PathRef +{ + uint offset; +}; + +struct TileRef +{ + uint offset; +}; + +struct Path +{ + uint4 bbox; + TileRef tiles; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 
pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +float4 load_draw_bbox(thread const uint& draw_ix, device Memory& v_70, constant uint& v_70BufferSize, const device ConfigBuf& v_181) +{ + uint base = (v_181.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix); + float x0 = as_type(v_70.memory[base]); + float y0 = as_type(v_70.memory[base + 1u]); + float x1 = as_type(v_70.memory[base + 2u]); + float y1 = as_type(v_70.memory[base + 3u]); + float4 bbox = float4(x0, y0, x1, y1); + return bbox; +} + +static inline __attribute__((always_inline)) +Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +{ + Alloc a; + a.offset = offset; + return a; +} + +static inline __attribute__((always_inline)) +MallocResult malloc(thread const uint& size, device Memory& v_70, constant uint& v_70BufferSize) +{ + uint _76 = atomic_fetch_add_explicit((device atomic_uint*)&v_70.mem_offset, size, memory_order_relaxed); + uint offset = _76; + MallocResult r; + r.failed = (offset + size) > uint(int((v_70BufferSize - 8) / 4) * 4); + uint param = offset; + uint param_1 = size; + bool param_2 = !r.failed; + r.alloc = new_alloc(param, param_1, param_2); + if (r.failed) + { + uint _105 = atomic_fetch_max_explicit((device atomic_uint*)&v_70.mem_error, 1u, memory_order_relaxed); + return r; + } + return r; +} + +static inline 
__attribute__((always_inline)) +Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) +{ + return Alloc{ a.offset + offset }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_70, constant uint& v_70BufferSize) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_70.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_70, constant uint& v_70BufferSize) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = s.bbox.x | (s.bbox.y << uint(16)); + write_mem(param, param_1, param_2, v_70, v_70BufferSize); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); + write_mem(param_3, param_4, param_5, v_70, v_70BufferSize); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = s.tiles.offset; + write_mem(param_6, param_7, param_8, v_70, v_70BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_70 [[buffer(0)]], const device ConfigBuf& v_181 [[buffer(1)]], const device SceneBuf& _257 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + threadgroup uint sh_tile_count[256]; + threadgroup MallocResult sh_tile_alloc; + constant uint& v_70BufferSize = spvBufferSizeConstants[0]; + uint th_ix = gl_LocalInvocationID.x; + uint element_ix = gl_GlobalInvocationID.x; + PathRef path_ref = PathRef{ v_181.conf.tile_alloc.offset + (element_ix * 12u) }; + uint drawtag_base = 
v_181.conf.drawtag_offset >> uint(2); + uint drawtag = 0u; + if (element_ix < v_181.conf.n_elements) + { + drawtag = _257.scene[drawtag_base + element_ix]; + } + int x0 = 0; + int y0 = 0; + int x1 = 0; + int y1 = 0; + if ((drawtag != 0u) && (drawtag != 37u)) + { + uint param = element_ix; + float4 bbox = load_draw_bbox(param, v_70, v_70BufferSize, v_181); + x0 = int(floor(bbox.x * 0.0625)); + y0 = int(floor(bbox.y * 0.0625)); + x1 = int(ceil(bbox.z * 0.0625)); + y1 = int(ceil(bbox.w * 0.0625)); + } + x0 = clamp(x0, 0, int(v_181.conf.width_in_tiles)); + y0 = clamp(y0, 0, int(v_181.conf.height_in_tiles)); + x1 = clamp(x1, 0, int(v_181.conf.width_in_tiles)); + y1 = clamp(y1, 0, int(v_181.conf.height_in_tiles)); + Path path; + path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1)); + uint tile_count = uint((x1 - x0) * (y1 - y0)); + sh_tile_count[th_ix] = tile_count; + uint total_tile_count = tile_count; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th_ix >= (1u << i)) + { + total_tile_count += sh_tile_count[th_ix - (1u << i)]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tile_count[th_ix] = total_tile_count; + } + if (th_ix == 255u) + { + uint param_1 = total_tile_count * 8u; + MallocResult _392 = malloc(param_1, v_70, v_70BufferSize); + sh_tile_alloc = _392; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + MallocResult alloc_start = sh_tile_alloc; + bool _403; + if (!alloc_start.failed) + { + _403 = v_70.mem_error != 0u; + } + else + { + _403 = alloc_start.failed; + } + if (_403) + { + return; + } + if (element_ix < v_181.conf.n_elements) + { + uint _416; + if (th_ix > 0u) + { + _416 = sh_tile_count[th_ix - 1u]; + } + else + { + _416 = 0u; + } + uint tile_subix = _416; + Alloc param_2 = alloc_start.alloc; + uint param_3 = 8u * tile_subix; + uint param_4 = 8u * tile_count; + Alloc tiles_alloc = slice_mem(param_2, param_3, param_4); + path.tiles = TileRef{ tiles_alloc.offset }; + Alloc param_5; + 
param_5.offset = v_181.conf.tile_alloc.offset; + PathRef param_6 = path_ref; + Path param_7 = path; + Path_write(param_5, param_6, param_7, v_70, v_70BufferSize); + } + uint total_count = sh_tile_count[255] * 2u; + uint start_ix = alloc_start.alloc.offset >> uint(2); + for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) + { + Alloc param_8 = alloc_start.alloc; + uint param_9 = start_ix + i_1; + uint param_10 = 0u; + write_mem(param_8, param_9, param_10, v_70, v_70BufferSize); + } +} + diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv new file mode 100644 index 0000000..dbc02a8 Binary files /dev/null and b/piet-gpu/shader/gen/tile_alloc.spv differ diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil new file mode 100644 index 0000000..f9f31e6 Binary files /dev/null and b/piet-gpu/shader/gen/transform_leaf.dxil differ diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl new file mode 100644 index 0000000..8a3b3d5 --- /dev/null +++ b/piet-gpu/shader/gen/transform_leaf.hlsl @@ -0,0 +1,234 @@ +struct Alloc +{ + uint offset; +}; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static 
const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; + +RWByteAddressBuffer _71 : register(u0, space0); +ByteAddressBuffer _96 : register(t2, space0); +ByteAddressBuffer _278 : register(t1, space0); +ByteAddressBuffer _376 : register(t3, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[256]; + +Transform Transform_read(TransformRef ref) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = _96.Load((ix + 0u) * 4 + 0); + uint raw1 = _96.Load((ix + 1u) * 4 + 0); + uint raw2 = _96.Load((ix + 2u) * 4 + 0); + uint raw3 = _96.Load((ix + 3u) * 4 + 0); + uint raw4 = _96.Load((ix + 4u) * 4 + 0); + uint raw5 = _96.Load((ix + 5u) * 4 + 0); + Transform s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +TransformRef Transform_index(TransformRef ref, uint index) +{ + TransformRef _85 = { ref.offset + (index * 24u) }; + return _85; +} + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +Transform monoid_identity() +{ + return _224; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _71.Store(offset * 4 + 8, val); +} + +void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint 
param_2 = asuint(s.mat.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.mat.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.mat.z); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.mat.w); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.translate.x); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.translate.y); + write_mem(param_15, param_16, param_17); +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef _285 = { _278.Load(84) + (ix * 24u) }; + TransformRef ref = _285; + TransformRef param = ref; + Transform agg = Transform_read(param); + Transform local[8]; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3); + agg = combine_monoid(param_4, param_5); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_6 = other; + Transform param_7 = agg; + agg = combine_monoid(param_6, param_7); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Transform row = monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + Transform _382; + _382.mat = asfloat(_376.Load4((gl_WorkGroupID.x - 1u) * 32 + 0)); + _382.translate = asfloat(_376.Load2((gl_WorkGroupID.x - 1u) * 32 + 16)); + row.mat = _382.mat; + row.translate = _382.translate; + } 
+ if (gl_LocalInvocationID.x > 0u) + { + Transform param_8 = row; + Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_8, param_9); + } + Alloc param_12; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_10 = row; + Transform param_11 = local[i_2]; + Transform m = combine_monoid(param_10, param_11); + TransformSeg _422 = { m.mat, m.translate }; + TransformSeg transform = _422; + TransformSegRef _432 = { _278.Load(36) + ((ix + i_2) * 24u) }; + TransformSegRef trans_ref = _432; + Alloc _436; + _436.offset = _278.Load(36); + param_12.offset = _436.offset; + TransformSegRef param_13 = trans_ref; + TransformSeg param_14 = transform; + TransformSeg_write(param_12, param_13, param_14); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl new file mode 100644 index 0000000..fe45438 --- /dev/null +++ b/piet-gpu/shader/gen/transform_leaf.msl @@ -0,0 +1,287 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? 
Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 path_bbox_alloc; + Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; + Alloc_1 draw_bbox_alloc; + Alloc_1 drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct ParentBuf +{ + Transform_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static 
inline __attribute__((always_inline)) +Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = v_96.scene[ix + 0u]; + uint raw1 = v_96.scene[ix + 1u]; + uint raw2 = v_96.scene[ix + 2u]; + uint raw3 = v_96.scene[ix + 3u]; + uint raw4 = v_96.scene[ix + 4u]; + uint raw5 = v_96.scene[ix + 5u]; + Transform s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index) +{ + return TransformRef{ ref.offset + (index * 24u) }; +} + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +static inline __attribute__((always_inline)) +Transform monoid_identity() +{ + return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_71) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_71.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref, thread const TransformSeg& s, device Memory& v_71) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.mat.x); + write_mem(param, param_1, param_2, v_71); + Alloc param_3 = a; + uint 
param_4 = ix + 1u; + uint param_5 = as_type(s.mat.y); + write_mem(param_3, param_4, param_5, v_71); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.mat.z); + write_mem(param_6, param_7, param_8, v_71); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.mat.w); + write_mem(param_9, param_10, param_11, v_71); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.translate.x); + write_mem(param_12, param_13, param_14, v_71); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.translate.y); + write_mem(param_15, param_16, param_17, v_71); +} + +kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _376 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Transform sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) }; + TransformRef param = ref; + Transform agg = Transform_read(param, v_96); + spvUnsafeArray local; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3, v_96); + agg = combine_monoid(param_4, param_5); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_6 = other; + Transform param_7 = agg; + agg = combine_monoid(param_6, param_7); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + 
sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Transform row = monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _379 = gl_WorkGroupID.x - 1u; + row.mat = _376.parent[_379].mat; + row.translate = _376.parent[_379].translate; + } + if (gl_LocalInvocationID.x > 0u) + { + Transform param_8 = row; + Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_8, param_9); + } + Alloc param_12; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_10 = row; + Transform param_11 = local[i_2]; + Transform m = combine_monoid(param_10, param_11); + TransformSeg transform = TransformSeg{ m.mat, m.translate }; + TransformSegRef trans_ref = TransformSegRef{ _278.conf.trans_alloc.offset + ((ix + i_2) * 24u) }; + param_12.offset = _278.conf.trans_alloc.offset; + TransformSegRef param_13 = trans_ref; + TransformSeg param_14 = transform; + TransformSeg_write(param_12, param_13, param_14, v_71); + } +} + diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv new file mode 100644 index 0000000..b739099 Binary files /dev/null and b/piet-gpu/shader/gen/transform_leaf.spv differ diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil new file mode 100644 index 0000000..978dd98 Binary files /dev/null and b/piet-gpu/shader/gen/transform_reduce.dxil differ diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl new file mode 100644 index 0000000..bd14f79 --- /dev/null +++ b/piet-gpu/shader/gen/transform_reduce.hlsl @@ -0,0 +1,140 @@ +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc 
anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +ByteAddressBuffer _49 : register(t2, space0); +ByteAddressBuffer _161 : register(t1, space0); +RWByteAddressBuffer _250 : register(u3, space0); +RWByteAddressBuffer _266 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[256]; + +Transform Transform_read(TransformRef ref) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = _49.Load((ix + 0u) * 4 + 0); + uint raw1 = _49.Load((ix + 1u) * 4 + 0); + uint raw2 = _49.Load((ix + 2u) * 4 + 0); + uint raw3 = _49.Load((ix + 3u) * 4 + 0); + uint raw4 = _49.Load((ix + 4u) * 4 + 0); + uint raw5 = _49.Load((ix + 5u) * 4 + 0); + Transform s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +TransformRef Transform_index(TransformRef ref, uint index) +{ + TransformRef _37 = { ref.offset + (index * 24u) }; + return _37; +} + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef _168 = { _161.Load(84) + (ix * 24u) }; + 
TransformRef ref = _168; + TransformRef param = ref; + Transform agg = Transform_read(param); + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3); + agg = combine_monoid(param_4, param_5); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) + { + Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Transform param_6 = agg; + Transform param_7 = other; + agg = combine_monoid(param_6, param_7); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _250.Store4(gl_WorkGroupID.x * 32 + 0, asuint(agg.mat)); + _250.Store2(gl_WorkGroupID.x * 32 + 16, asuint(agg.translate)); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl new file mode 100644 index 0000000..62da531 --- /dev/null +++ b/piet-gpu/shader/gen/transform_reduce.msl @@ -0,0 +1,153 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc path_bbox_alloc; + Alloc 
drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + Alloc draw_bbox_alloc; + Alloc drawinfo_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; + uint drawtag_offset; + uint drawdata_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct OutBuf +{ + Transform_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_49) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = v_49.scene[ix + 0u]; + uint raw1 = v_49.scene[ix + 1u]; + uint raw2 = v_49.scene[ix + 2u]; + uint raw3 = v_49.scene[ix + 3u]; + uint raw4 = v_49.scene[ix + 4u]; + uint raw5 = v_49.scene[ix + 5u]; + Transform s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index) +{ + return TransformRef{ ref.offset + (index * 24u) }; +} + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], device OutBuf& _250 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID 
[[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Transform sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef ref = TransformRef{ _161.conf.trans_offset + (ix * 24u) }; + TransformRef param = ref; + Transform agg = Transform_read(param, v_49); + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3, v_49); + agg = combine_monoid(param_4, param_5); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) + { + Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Transform param_6 = agg; + Transform param_7 = other; + agg = combine_monoid(param_6, param_7); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _250.outbuf[gl_WorkGroupID.x].mat = agg.mat; + _250.outbuf[gl_WorkGroupID.x].translate = agg.translate; + } +} + diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv new file mode 100644 index 0000000..6aa6b94 Binary files /dev/null and b/piet-gpu/shader/gen/transform_reduce.spv differ diff --git a/piet-gpu/shader/gen/transform_root.dxil b/piet-gpu/shader/gen/transform_root.dxil new file mode 100644 index 0000000..5b4f059 Binary files /dev/null and b/piet-gpu/shader/gen/transform_root.dxil differ diff --git a/piet-gpu/shader/gen/transform_root.hlsl b/piet-gpu/shader/gen/transform_root.hlsl new file mode 100644 index 0000000..d447db6 --- /dev/null +++ b/piet-gpu/shader/gen/transform_root.hlsl @@ -0,0 +1,94 @@ +struct Transform +{ + float4 mat; + float2 translate; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static 
const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; + +RWByteAddressBuffer _89 : register(u0, space0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[256]; + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +Transform monoid_identity() +{ + return _23; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + Transform _93; + _93.mat = asfloat(_89.Load4(ix * 32 + 0)); + _93.translate = asfloat(_89.Load2(ix * 32 + 16)); + Transform local[8]; + local[0].mat = _93.mat; + local[0].translate = _93.translate; + Transform param_1; + for (uint i = 1u; i < 8u; i++) + { + Transform param = local[i - 1u]; + Transform _119; + _119.mat = asfloat(_89.Load4((ix + i) * 32 + 0)); + _119.translate = asfloat(_89.Load2((ix + i) * 32 + 16)); + param_1.mat = _119.mat; + param_1.translate = _119.translate; + local[i] = combine_monoid(param, param_1); + } + Transform agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_2 = other; + Transform param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Transform row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_4 = row; + Transform param_5 = local[i_2]; + 
Transform m = combine_monoid(param_4, param_5); + uint _208 = ix + i_2; + _89.Store4(_208 * 32 + 0, asuint(m.mat)); + _89.Store2(_208 * 32 + 16, asuint(m.translate)); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_root.msl b/piet-gpu/shader/gen/transform_root.msl new file mode 100644 index 0000000..8b4b2a1 --- /dev/null +++ b/piet-gpu/shader/gen/transform_root.msl @@ -0,0 +1,129 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct DataBuf +{ + Transform_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * 
b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +static inline __attribute__((always_inline)) +Transform monoid_identity() +{ + return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) }; +} + +kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup Transform sh_scratch[256]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].mat = _89.data[ix].mat; + local[0].translate = _89.data[ix].translate; + Transform param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _113 = ix + i; + Transform param = local[i - 1u]; + param_1.mat = _89.data[_113].mat; + param_1.translate = _89.data[_113].translate; + local[i] = combine_monoid(param, param_1); + } + Transform agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_2 = other; + Transform param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Transform row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_4 = row; + Transform param_5 = local[i_2]; + Transform m = combine_monoid(param_4, param_5); + uint _208 = ix + i_2; + _89.data[_208].mat = m.mat; + _89.data[_208].translate = m.translate; + } +} + diff --git a/piet-gpu/shader/gen/transform_root.spv b/piet-gpu/shader/gen/transform_root.spv new file mode 100644 index 0000000..1578842 Binary files /dev/null and b/piet-gpu/shader/gen/transform_root.spv differ diff --git 
a/tests/shader/gen/clear.dxil b/tests/shader/gen/clear.dxil new file mode 100644 index 0000000..a79182a Binary files /dev/null and b/tests/shader/gen/clear.dxil differ diff --git a/tests/shader/gen/clear.hlsl b/tests/shader/gen/clear.hlsl new file mode 100644 index 0000000..f6a576c --- /dev/null +++ b/tests/shader/gen/clear.hlsl @@ -0,0 +1,26 @@ +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +ByteAddressBuffer _19 : register(t0); +RWByteAddressBuffer _32 : register(u1); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _19.Load(0)) + { + _32.Store(ix * 4 + 0, _19.Load(4)); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/clear.msl b/tests/shader/gen/clear.msl new file mode 100644 index 0000000..d89853b --- /dev/null +++ b/tests/shader/gen/clear.msl @@ -0,0 +1,27 @@ +#include +#include + +using namespace metal; + +struct ConfigBuf +{ + uint size; + uint value; +}; + +struct TargetBuf +{ + uint data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +kernel void main0(const device ConfigBuf& _19 [[buffer(0)]], device TargetBuf& _32 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _19.size) + { + _32.data[ix] = _19.value; + } +} + diff --git a/tests/shader/gen/clear.spv b/tests/shader/gen/clear.spv new file mode 100644 index 0000000..0e8d1d7 Binary files /dev/null and b/tests/shader/gen/clear.spv differ diff --git a/tests/shader/gen/linkedlist.dxil b/tests/shader/gen/linkedlist.dxil new file mode 100644 index 0000000..231f0f6 Binary files /dev/null and b/tests/shader/gen/linkedlist.dxil differ diff --git a/tests/shader/gen/linkedlist.hlsl 
b/tests/shader/gen/linkedlist.hlsl new file mode 100644 index 0000000..614791a --- /dev/null +++ b/tests/shader/gen/linkedlist.hlsl @@ -0,0 +1,39 @@ +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer _56 : register(u0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +void comp_main() +{ + uint rng = gl_GlobalInvocationID.x + 1u; + for (uint i = 0u; i < 100u; i++) + { + rng ^= (rng << uint(13)); + rng ^= (rng >> uint(17)); + rng ^= (rng << uint(5)); + uint bucket = rng % 65536u; + if (bucket != 0u) + { + uint _61; + _56.InterlockedAdd(0, 2u, _61); + uint alloc = _61 + 65536u; + uint _67; + _56.InterlockedExchange(bucket * 4 + 0, alloc, _67); + uint old = _67; + _56.Store(alloc * 4 + 0, old); + _56.Store((alloc + 1u) * 4 + 0, gl_GlobalInvocationID.x); + } + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/linkedlist.msl b/tests/shader/gen/linkedlist.msl new file mode 100644 index 0000000..0461d79 --- /dev/null +++ b/tests/shader/gen/linkedlist.msl @@ -0,0 +1,36 @@ +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct MemBuf +{ + uint mem[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +kernel void main0(device MemBuf& _56 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint rng = gl_GlobalInvocationID.x + 1u; + for (uint i = 0u; i < 100u; i++) + { + rng ^= (rng << uint(13)); + rng ^= (rng >> uint(17)); + rng ^= (rng << uint(5)); + uint bucket = rng % 65536u; + if (bucket != 0u) + { + uint _61 = atomic_fetch_add_explicit((device atomic_uint*)&_56.mem[0], 2u, memory_order_relaxed); + uint alloc = _61 + 65536u; + uint _67 = atomic_exchange_explicit((device atomic_uint*)&_56.mem[bucket], alloc, 
memory_order_relaxed); + uint old = _67; + _56.mem[alloc] = old; + _56.mem[alloc + 1u] = gl_GlobalInvocationID.x; + } + } +} + diff --git a/tests/shader/gen/linkedlist.spv b/tests/shader/gen/linkedlist.spv new file mode 100644 index 0000000..a723283 Binary files /dev/null and b/tests/shader/gen/linkedlist.spv differ diff --git a/tests/shader/gen/message_passing.dxil b/tests/shader/gen/message_passing.dxil new file mode 100644 index 0000000..2be73da Binary files /dev/null and b/tests/shader/gen/message_passing.dxil differ diff --git a/tests/shader/gen/message_passing.hlsl b/tests/shader/gen/message_passing.hlsl new file mode 100644 index 0000000..ba8ce5f --- /dev/null +++ b/tests/shader/gen/message_passing.hlsl @@ -0,0 +1,54 @@ +struct Element +{ + uint data; + uint flag; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +RWByteAddressBuffer data_buf : register(u0); +RWByteAddressBuffer control_buf : register(u1); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +uint permute_flag_ix(uint data_ix) +{ + return (data_ix * 419u) & 65535u; +} + +void comp_main() +{ + uint _76; + data_buf.InterlockedExchange(gl_GlobalInvocationID.x * 8 + 0, 1u, _76); + DeviceMemoryBarrier(); + uint param = gl_GlobalInvocationID.x; + uint write_flag_ix = permute_flag_ix(param); + uint _77; + data_buf.InterlockedExchange(write_flag_ix * 8 + 4, 1u, _77); + uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u; + uint param_1 = read_ix; + uint read_flag_ix = permute_flag_ix(param_1); + uint _58; + data_buf.InterlockedAdd(read_flag_ix * 8 + 4, 0, _58); + uint flag = _58; + DeviceMemoryBarrier(); + uint _62; + data_buf.InterlockedAdd(read_ix * 8 + 0, 0, _62); + uint data = _62; + if (flag > data) + { + uint _73; + control_buf.InterlockedAdd(0, 1u, _73); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + 
comp_main(); +} diff --git a/tests/shader/gen/message_passing.msl b/tests/shader/gen/message_passing.msl new file mode 100644 index 0000000..e48f48a --- /dev/null +++ b/tests/shader/gen/message_passing.msl @@ -0,0 +1,54 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +struct Element +{ + uint data; + uint flag; +}; + +struct DataBuf +{ + Element data[1]; +}; + +struct ControlBuf +{ + uint failures; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +uint permute_flag_ix(thread const uint& data_ix) +{ + return (data_ix * 419u) & 65535u; +} + +kernel void main0(device DataBuf& data_buf [[buffer(0)]], device ControlBuf& control_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + atomic_store_explicit((device atomic_uint*)&data_buf.data[gl_GlobalInvocationID.x].data, 1u, memory_order_relaxed); + threadgroup_barrier(mem_flags::mem_device); + uint param = gl_GlobalInvocationID.x; + uint write_flag_ix = permute_flag_ix(param); + atomic_store_explicit((device atomic_uint*)&data_buf.data[write_flag_ix].flag, 1u, memory_order_relaxed); + uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u; + uint param_1 = read_ix; + uint read_flag_ix = permute_flag_ix(param_1); + uint _58 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_flag_ix].flag, memory_order_relaxed); + uint flag = _58; + threadgroup_barrier(mem_flags::mem_device); + uint _62 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_ix].data, memory_order_relaxed); + uint data = _62; + if (flag > data) + { + uint _73 = atomic_fetch_add_explicit((device atomic_uint*)&control_buf.failures, 1u, memory_order_relaxed); + } +} + diff --git a/tests/shader/gen/message_passing.spv b/tests/shader/gen/message_passing.spv new file mode 100644 index 0000000..e5f56d6 Binary 
files /dev/null and b/tests/shader/gen/message_passing.spv differ diff --git a/tests/shader/gen/message_passing_vkmm.spv b/tests/shader/gen/message_passing_vkmm.spv new file mode 100644 index 0000000..8527c2b Binary files /dev/null and b/tests/shader/gen/message_passing_vkmm.spv differ diff --git a/tests/shader/gen/prefix.dxil b/tests/shader/gen/prefix.dxil new file mode 100644 index 0000000..73f1ba1 Binary files /dev/null and b/tests/shader/gen/prefix.dxil differ diff --git a/tests/shader/gen/prefix.hlsl b/tests/shader/gen/prefix.hlsl new file mode 100644 index 0000000..72cfa90 --- /dev/null +++ b/tests/shader/gen/prefix.hlsl @@ -0,0 +1,225 @@ +struct Monoid +{ + uint element; +}; + +struct State +{ + uint flag; + Monoid aggregate; + Monoid prefix; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Monoid _185 = { 0u }; + +globallycoherent RWByteAddressBuffer _43 : register(u2); +ByteAddressBuffer _67 : register(t0); +RWByteAddressBuffer _372 : register(u1); + +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint sh_part_ix; +groupshared Monoid sh_scratch[512]; +groupshared uint sh_flag; +groupshared Monoid sh_prefix; + +Monoid combine_monoid(Monoid a, Monoid b) +{ + Monoid _22 = { a.element + b.element }; + return _22; +} + +void comp_main() +{ + if (gl_LocalInvocationID.x == 0u) + { + uint _47; + _43.InterlockedAdd(0, 1u, _47); + sh_part_ix = _47; + } + GroupMemoryBarrierWithGroupSync(); + uint part_ix = sh_part_ix; + uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); + Monoid _71; + _71.element = _67.Load(ix * 4 + 0); + Monoid local[16]; + local[0].element = _71.element; + Monoid param_1; + for (uint i = 1u; i < 16u; i++) + { + Monoid param = local[i - 1u]; + Monoid _94; + _94.element = _67.Load((ix + i) * 4 + 0); + param_1.element = _94.element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[15]; + 
sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 511u) + { + _43.Store(part_ix * 12 + 8, agg.element); + if (part_ix == 0u) + { + _43.Store(12, agg.element); + } + } + DeviceMemoryBarrier(); + if (gl_LocalInvocationID.x == 511u) + { + uint flag = 1u; + if (part_ix == 0u) + { + flag = 2u; + } + _43.Store(part_ix * 12 + 4, flag); + } + Monoid exclusive = _185; + if (part_ix != 0u) + { + uint look_back_ix = part_ix - 1u; + uint their_ix = 0u; + Monoid their_prefix; + Monoid their_agg; + Monoid m; + while (true) + { + if (gl_LocalInvocationID.x == 511u) + { + sh_flag = _43.Load(look_back_ix * 12 + 4); + } + GroupMemoryBarrierWithGroupSync(); + DeviceMemoryBarrier(); + uint flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); + if (flag_1 == 2u) + { + if (gl_LocalInvocationID.x == 511u) + { + Monoid _223; + _223.element = _43.Load(look_back_ix * 12 + 12); + their_prefix.element = _223.element; + Monoid param_4 = their_prefix; + Monoid param_5 = exclusive; + exclusive = combine_monoid(param_4, param_5); + } + break; + } + else + { + if (flag_1 == 1u) + { + if (gl_LocalInvocationID.x == 511u) + { + Monoid _245; + _245.element = _43.Load(look_back_ix * 12 + 8); + their_agg.element = _245.element; + Monoid param_6 = their_agg; + Monoid param_7 = exclusive; + exclusive = combine_monoid(param_6, param_7); + } + look_back_ix--; + their_ix = 0u; + continue; + } + } + if (gl_LocalInvocationID.x == 511u) + { + Monoid _267; + _267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0); + m.element = _267.element; + if (their_ix == 0u) + { + their_agg = m; + } + else + { + 
Monoid param_8 = their_agg; + Monoid param_9 = m; + their_agg = combine_monoid(param_8, param_9); + } + their_ix++; + if (their_ix == 8192u) + { + Monoid param_10 = their_agg; + Monoid param_11 = exclusive; + exclusive = combine_monoid(param_10, param_11); + if (look_back_ix == 0u) + { + sh_flag = 2u; + } + else + { + look_back_ix--; + their_ix = 0u; + } + } + } + GroupMemoryBarrierWithGroupSync(); + flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); + if (flag_1 == 2u) + { + break; + } + } + if (gl_LocalInvocationID.x == 511u) + { + Monoid param_12 = exclusive; + Monoid param_13 = agg; + Monoid inclusive_prefix = combine_monoid(param_12, param_13); + sh_prefix = exclusive; + _43.Store(part_ix * 12 + 12, inclusive_prefix.element); + } + DeviceMemoryBarrier(); + if (gl_LocalInvocationID.x == 511u) + { + _43.Store(part_ix * 12 + 4, 2u); + } + } + GroupMemoryBarrierWithGroupSync(); + if (part_ix != 0u) + { + exclusive = sh_prefix; + } + Monoid row = exclusive; + if (gl_LocalInvocationID.x > 0u) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; + Monoid param_14 = row; + Monoid param_15 = other_1; + row = combine_monoid(param_14, param_15); + } + for (uint i_2 = 0u; i_2 < 16u; i_2++) + { + Monoid param_16 = row; + Monoid param_17 = local[i_2]; + Monoid m_1 = combine_monoid(param_16, param_17); + _372.Store((ix + i_2) * 4 + 0, m_1.element); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) /* Entry point: copies stage_input.gl_LocalInvocationID into gl_LocalInvocationID, then calls comp_main(). */ +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/prefix.msl b/tests/shader/gen/prefix.msl new file mode 100644 index 0000000..24bee60 --- /dev/null +++ b/tests/shader/gen/prefix.msl @@ -0,0 +1,264 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include /* NOTE(review): no header name is visible after these #include directives - confirm against the generated file. */ + +using namespace metal; + +template /* NOTE(review): no template parameter list is visible although T and Num are used below - confirm. */ +struct spvUnsafeArray /* Array wrapper: every operator[] overload (thread, device, constant, threadgroup) returns elements[pos] with no bounds check; storage is Num elements, or 1 when Num is 0. */ +{ + T elements[Num ?
 Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Monoid /* Single-uint value; this is the type combine_monoid() takes and returns. */ +{ + uint element; +}; + +struct Monoid_1 /* Same single-uint layout as Monoid; this is the type used inside the buffer structs below. */ +{ + uint element; +}; + +struct State /* Per-partition record: main0 writes flag 1u after storing aggregate and 2u after storing prefix. */ +{ + uint flag; + Monoid_1 aggregate; + Monoid_1 prefix; +}; + +struct StateBuf /* part_counter is atomically incremented by main0 to hand out partition indices; state[] is indexed by partition index. */ +{ + uint part_counter; + State state[1]; +}; + +struct InBuf +{ + Monoid_1 inbuf[1]; +}; + +struct OutBuf +{ + Monoid_1 outbuf[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) /* Returns a Monoid whose element is a.element + b.element. */ +{ + return Monoid{ a.element + b.element }; +} + +kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) /* One threadgroup handles one 8192-element partition (16 elements per thread): scan locally, publish aggregate/prefix and flag in _43, look back at earlier partitions for the exclusive prefix, then write running sums to _372. */ +{ + threadgroup uint sh_part_ix; + threadgroup Monoid sh_scratch[512]; + threadgroup uint sh_flag; + threadgroup Monoid sh_prefix; + if (gl_LocalInvocationID.x == 0u) /* Thread 0 takes the next partition index from part_counter and shares it through sh_part_ix. */ + { + uint _47 = atomic_fetch_add_explicit((volatile device atomic_uint*)&_43.part_counter, 1u, memory_order_relaxed); + sh_part_ix = _47; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint part_ix = sh_part_ix; + uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); + spvUnsafeArray local; /* NOTE(review): no template arguments are visible; indices 0..15 are used below - confirm. */ + local[0].element = _67.inbuf[ix].element; + Monoid param_1; + for (uint i = 1u; i < 16u; i++) /* local[i] becomes the sum of this thread's inputs 0..i. */
+ { + Monoid param = local[i - 1u]; + param_1.element = _67.inbuf[ix + i].element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[15]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) /* Nine doubling steps (offsets 1..256): agg becomes the sum of the local totals of threads 0..x. */ + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 511u) /* Thread 511 holds the partition total; it stores the aggregate (and, for partition 0, the prefix). */ + { + _43.state[part_ix].aggregate.element = agg.element; + if (part_ix == 0u) + { + _43.state[0].prefix.element = agg.element; + } + } + threadgroup_barrier(mem_flags::mem_device); + if (gl_LocalInvocationID.x == 511u) + { + uint flag = 1u; + if (part_ix == 0u) + { + flag = 2u; + } + _43.state[part_ix].flag = flag; /* Written after the barrier that follows the aggregate store: 1u, or 2u for partition 0 whose prefix was also stored. */ + } + Monoid exclusive = Monoid{ 0u }; + if (part_ix != 0u) /* Look back over earlier partitions; thread 511 accumulates their contributions into exclusive. */ + { + uint look_back_ix = part_ix - 1u; + uint their_ix = 0u; + Monoid their_prefix; + Monoid their_agg; + Monoid m; + while (true) + { + if (gl_LocalInvocationID.x == 511u) + { + sh_flag = _43.state[look_back_ix].flag; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_device); + uint flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (flag_1 == 2u) /* Flag 2u: that partition's prefix has been stored; add it and stop looking back. */ + { + if (gl_LocalInvocationID.x == 511u) + { + their_prefix.element = _43.state[look_back_ix].prefix.element; + Monoid param_4 = their_prefix; + Monoid param_5 = exclusive; + exclusive = combine_monoid(param_4, param_5); + } + break; + } + else + { + if (flag_1 == 1u) /* Flag 1u: only the aggregate has been stored; add it and step one partition further back. */ + { + if (gl_LocalInvocationID.x == 511u) + { + their_agg.element = _43.state[look_back_ix].aggregate.element; + Monoid param_6 = their_agg; + Monoid param_7 = exclusive; + exclusive = combine_monoid(param_6, param_7); + } + look_back_ix--; + their_ix = 0u; + continue; + } + } + if
(gl_LocalInvocationID.x == 511u) /* Neither flag value was read: thread 511 adds one raw input element of that partition per loop pass and folds the sum into exclusive after 8192 elements. */ + { + m.element = _67.inbuf[(look_back_ix * 8192u) + their_ix].element; + if (their_ix == 0u) + { + their_agg = m; + } + else + { + Monoid param_8 = their_agg; + Monoid param_9 = m; + their_agg = combine_monoid(param_8, param_9); + } + their_ix++; + if (their_ix == 8192u) + { + Monoid param_10 = their_agg; + Monoid param_11 = exclusive; + exclusive = combine_monoid(param_10, param_11); + if (look_back_ix == 0u) + { + sh_flag = 2u; + } + else + { + look_back_ix--; + their_ix = 0u; + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (flag_1 == 2u) + { + break; + } + } + if (gl_LocalInvocationID.x == 511u) /* Thread 511 shares the exclusive prefix via sh_prefix and stores exclusive combined with agg as this partition's prefix. */ + { + Monoid param_12 = exclusive; + Monoid param_13 = agg; + Monoid inclusive_prefix = combine_monoid(param_12, param_13); + sh_prefix = exclusive; + _43.state[part_ix].prefix.element = inclusive_prefix.element; + } + threadgroup_barrier(mem_flags::mem_device); + if (gl_LocalInvocationID.x == 511u) + { + _43.state[part_ix].flag = 2u; /* Raised to 2u only after the barrier that follows the prefix store. */ + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (part_ix != 0u) + { + exclusive = sh_prefix; + } + Monoid row = exclusive; + if (gl_LocalInvocationID.x > 0u) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; + Monoid param_14 = row; + Monoid param_15 = other_1; + row = combine_monoid(param_14, param_15); + } + for (uint i_2 = 0u; i_2 < 16u; i_2++) /* Output i_2 is row (everything before this thread's first element) combined with the thread-local running sum. */ + { + Monoid param_16 = row; + Monoid param_17 = local[i_2]; + Monoid m_1 = combine_monoid(param_16, param_17); + _372.outbuf[ix + i_2].element = m_1.element; + } +} + diff --git a/tests/shader/gen/prefix.spv b/tests/shader/gen/prefix.spv new file mode 100644 index 0000000..8e7db4a Binary files /dev/null and b/tests/shader/gen/prefix.spv differ diff --git a/tests/shader/gen/prefix_atomic.dxil b/tests/shader/gen/prefix_atomic.dxil new file mode 100644 index 0000000..45a7dd8 Binary files /dev/null and b/tests/shader/gen/prefix_atomic.dxil differ
diff --git a/tests/shader/gen/prefix_atomic.hlsl b/tests/shader/gen/prefix_atomic.hlsl new file mode 100644 index 0000000..a75448f --- /dev/null +++ b/tests/shader/gen/prefix_atomic.hlsl @@ -0,0 +1,229 @@ +struct Monoid +{ + uint element; +}; + +struct State /* Not referenced by name below; comp_main() addresses _43 by byte offset: flag at part_ix*12+4, aggregate at part_ix*12+8, prefix at part_ix*12+12. */ +{ + uint flag; + Monoid aggregate; + Monoid prefix; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Monoid _185 = { 0u }; + +globallycoherent RWByteAddressBuffer _43 : register(u2); +ByteAddressBuffer _67 : register(t0); +RWByteAddressBuffer _372 : register(u1); + +static uint3 gl_LocalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; +}; + +groupshared uint sh_part_ix; +groupshared Monoid sh_scratch[512]; +groupshared uint sh_flag; +groupshared Monoid sh_prefix; + +Monoid combine_monoid(Monoid a, Monoid b) /* Returns a Monoid whose element is a.element + b.element. */ +{ + Monoid _22 = { a.element + b.element }; + return _22; +} + +void comp_main() /* One 512-thread group handles one 8192-element partition (16 elements per thread): scan locally, publish aggregate/prefix and flag in _43, look back at earlier partitions for the exclusive prefix, then write running sums to _372. */ +{ + if (gl_LocalInvocationID.x == 0u) /* Thread 0 takes the next partition index from the counter at byte 0 of _43 and shares it through sh_part_ix. */ + { + uint _47; + _43.InterlockedAdd(0, 1u, _47); + sh_part_ix = _47; + } + GroupMemoryBarrierWithGroupSync(); + uint part_ix = sh_part_ix; + uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); + Monoid _71; + _71.element = _67.Load(ix * 4 + 0); + Monoid local[16]; + local[0].element = _71.element; + Monoid param_1; + for (uint i = 1u; i < 16u; i++) /* local[i] becomes the sum of this thread's inputs 0..i. */ + { + Monoid param = local[i - 1u]; + Monoid _94; + _94.element = _67.Load((ix + i) * 4 + 0); + param_1.element = _94.element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[15]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) /* Nine doubling steps (offsets 1..256): agg becomes the sum of the local totals of threads 0..x. */ + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 511u) /* Thread 511 holds the partition total; it stores the aggregate (and, for partition 0, the prefix). */ + { +
_43.Store(part_ix * 12 + 8, agg.element); + if (part_ix == 0u) + { + _43.Store(12, agg.element); + } + } + DeviceMemoryBarrier(); + if (gl_LocalInvocationID.x == 511u) + { + uint flag = 1u; + if (part_ix == 0u) + { + flag = 2u; + } + uint _383; + _43.InterlockedExchange(part_ix * 12 + 4, flag, _383); /* Flag is 1u after the aggregate store, or 2u for partition 0 whose prefix was also stored; the previous value in _383 is unused. */ + } + Monoid exclusive = _185; + if (part_ix != 0u) /* Look back over earlier partitions; thread 511 accumulates their contributions into exclusive. */ + { + uint look_back_ix = part_ix - 1u; + uint their_ix = 0u; + Monoid their_prefix; + Monoid their_agg; + Monoid m; + while (true) + { + if (gl_LocalInvocationID.x == 511u) + { + uint _208; + _43.InterlockedAdd(look_back_ix * 12 + 4, 0, _208); /* Adding 0 reads the current flag atomically without changing it. */ + sh_flag = _208; + } + GroupMemoryBarrierWithGroupSync(); + DeviceMemoryBarrier(); + uint flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); + if (flag_1 == 2u) /* Flag 2u: that partition's prefix has been stored; add it and stop looking back. */ + { + if (gl_LocalInvocationID.x == 511u) + { + Monoid _223; + _223.element = _43.Load(look_back_ix * 12 + 12); + their_prefix.element = _223.element; + Monoid param_4 = their_prefix; + Monoid param_5 = exclusive; + exclusive = combine_monoid(param_4, param_5); + } + break; + } + else + { + if (flag_1 == 1u) /* Flag 1u: only the aggregate has been stored; add it and step one partition further back. */ + { + if (gl_LocalInvocationID.x == 511u) + { + Monoid _245; + _245.element = _43.Load(look_back_ix * 12 + 8); + their_agg.element = _245.element; + Monoid param_6 = their_agg; + Monoid param_7 = exclusive; + exclusive = combine_monoid(param_6, param_7); + } + look_back_ix--; + their_ix = 0u; + continue; + } + } + if (gl_LocalInvocationID.x == 511u) /* Neither flag value was read: thread 511 adds one raw input element of that partition per loop pass and folds the sum into exclusive after 8192 elements. */ + { + Monoid _267; + _267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0); + m.element = _267.element; + if (their_ix == 0u) + { + their_agg = m; + } + else + { + Monoid param_8 = their_agg; + Monoid param_9 = m; + their_agg = combine_monoid(param_8, param_9); + } + their_ix++; + if (their_ix == 8192u) + { + Monoid param_10 = their_agg; + Monoid param_11 = exclusive; + exclusive = combine_monoid(param_10, param_11); + if (look_back_ix == 0u) + { + sh_flag = 2u; + } + else + { + look_back_ix--; + their_ix = 0u; + } + } + } +
GroupMemoryBarrierWithGroupSync(); + flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); + if (flag_1 == 2u) + { + break; + } + } + if (gl_LocalInvocationID.x == 511u) /* Thread 511 shares the exclusive prefix via sh_prefix and stores exclusive combined with agg as this partition's prefix. */ + { + Monoid param_12 = exclusive; + Monoid param_13 = agg; + Monoid inclusive_prefix = combine_monoid(param_12, param_13); + sh_prefix = exclusive; + _43.Store(part_ix * 12 + 12, inclusive_prefix.element); + } + DeviceMemoryBarrier(); + if (gl_LocalInvocationID.x == 511u) + { + uint _384; + _43.InterlockedExchange(part_ix * 12 + 4, 2u, _384); /* Flag is raised to 2u only after the barrier that follows the prefix store. */ + } + } + GroupMemoryBarrierWithGroupSync(); + if (part_ix != 0u) + { + exclusive = sh_prefix; + } + Monoid row = exclusive; + if (gl_LocalInvocationID.x > 0u) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; + Monoid param_14 = row; + Monoid param_15 = other_1; + row = combine_monoid(param_14, param_15); + } + for (uint i_2 = 0u; i_2 < 16u; i_2++) /* Output i_2 is row (everything before this thread's first element) combined with the thread-local running sum. */ + { + Monoid param_16 = row; + Monoid param_17 = local[i_2]; + Monoid m_1 = combine_monoid(param_16, param_17); + _372.Store((ix + i_2) * 4 + 0, m_1.element); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) /* Entry point: copies the SV_GroupThreadID input into the file-scope gl_LocalInvocationID, then calls comp_main(). */ +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/prefix_atomic.msl b/tests/shader/gen/prefix_atomic.msl new file mode 100644 index 0000000..910e842 --- /dev/null +++ b/tests/shader/gen/prefix_atomic.msl @@ -0,0 +1,265 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include /* NOTE(review): no header name is visible after these #include directives - confirm against the generated file. */ + +using namespace metal; + +template /* NOTE(review): no template parameter list is visible although T and Num are used below - confirm. */ +struct spvUnsafeArray /* Array wrapper: every operator[] overload (thread, device, constant, threadgroup) returns elements[pos] with no bounds check; storage is Num elements, or 1 when Num is 0. */ +{ + T elements[Num ?
 Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Monoid /* Single-uint value; this is the type combine_monoid() takes and returns. */ +{ + uint element; +}; + +struct Monoid_1 /* Same single-uint layout as Monoid; this is the type used inside the buffer structs below. */ +{ + uint element; +}; + +struct State /* Per-partition record: main0 writes flag 1u after storing aggregate and 2u after storing prefix. */ +{ + uint flag; + Monoid_1 aggregate; + Monoid_1 prefix; +}; + +struct StateBuf /* part_counter is atomically incremented by main0 to hand out partition indices; state[] is indexed by partition index. */ +{ + uint part_counter; + State state[1]; +}; + +struct InBuf +{ + Monoid_1 inbuf[1]; +}; + +struct OutBuf +{ + Monoid_1 outbuf[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) /* Returns a Monoid whose element is a.element + b.element. */ +{ + return Monoid{ a.element + b.element }; +} + +kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) /* One threadgroup handles one 8192-element partition (16 elements per thread): scan locally, publish aggregate/prefix and flag in _43, look back at earlier partitions for the exclusive prefix, then write running sums to _372. */ +{ + threadgroup uint sh_part_ix; + threadgroup Monoid sh_scratch[512]; + threadgroup uint sh_flag; + threadgroup Monoid sh_prefix; + if (gl_LocalInvocationID.x == 0u) /* Thread 0 takes the next partition index from part_counter and shares it through sh_part_ix. */ + { + uint _47 = atomic_fetch_add_explicit((volatile device atomic_uint*)&_43.part_counter, 1u, memory_order_relaxed); + sh_part_ix = _47; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint part_ix = sh_part_ix; + uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u); + spvUnsafeArray local; /* NOTE(review): no template arguments are visible; indices 0..15 are used below - confirm. */ + local[0].element = _67.inbuf[ix].element; + Monoid param_1; + for (uint i = 1u; i < 16u; i++) /* local[i] becomes the sum of this thread's inputs 0..i. */
+ { + Monoid param = local[i - 1u]; + param_1.element = _67.inbuf[ix + i].element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[15]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) /* Nine doubling steps (offsets 1..256): agg becomes the sum of the local totals of threads 0..x. */ + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 511u) /* Thread 511 holds the partition total; it stores the aggregate (and, for partition 0, the prefix). */ + { + _43.state[part_ix].aggregate.element = agg.element; + if (part_ix == 0u) + { + _43.state[0].prefix.element = agg.element; + } + } + threadgroup_barrier(mem_flags::mem_device); + if (gl_LocalInvocationID.x == 511u) + { + uint flag = 1u; + if (part_ix == 0u) + { + flag = 2u; + } + atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].flag, flag, memory_order_relaxed); /* Relaxed atomic store of the flag, issued after the barrier that follows the aggregate store: 1u, or 2u for partition 0. */ + } + Monoid exclusive = Monoid{ 0u }; + if (part_ix != 0u) /* Look back over earlier partitions; thread 511 accumulates their contributions into exclusive. */ + { + uint look_back_ix = part_ix - 1u; + uint their_ix = 0u; + Monoid their_prefix; + Monoid their_agg; + Monoid m; + while (true) + { + if (gl_LocalInvocationID.x == 511u) + { + uint _208 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].flag, memory_order_relaxed); + sh_flag = _208; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup_barrier(mem_flags::mem_device); + uint flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (flag_1 == 2u) /* Flag 2u: that partition's prefix has been stored; add it and stop looking back. */ + { + if (gl_LocalInvocationID.x == 511u) + { + their_prefix.element = _43.state[look_back_ix].prefix.element; + Monoid param_4 = their_prefix; + Monoid param_5 = exclusive; + exclusive = combine_monoid(param_4, param_5); + } + break; + } + else + { + if (flag_1 == 1u) /* Flag 1u: only the aggregate has been stored; add it and step one partition further back. */ + { + if (gl_LocalInvocationID.x == 511u) + { + their_agg.element = _43.state[look_back_ix].aggregate.element; + Monoid
param_6 = their_agg; + Monoid param_7 = exclusive; + exclusive = combine_monoid(param_6, param_7); + } + look_back_ix--; + their_ix = 0u; + continue; + } + } + if (gl_LocalInvocationID.x == 511u) /* Neither flag value was read: thread 511 adds one raw input element of that partition per loop pass and folds the sum into exclusive after 8192 elements. */ + { + m.element = _67.inbuf[(look_back_ix * 8192u) + their_ix].element; + if (their_ix == 0u) + { + their_agg = m; + } + else + { + Monoid param_8 = their_agg; + Monoid param_9 = m; + their_agg = combine_monoid(param_8, param_9); + } + their_ix++; + if (their_ix == 8192u) + { + Monoid param_10 = their_agg; + Monoid param_11 = exclusive; + exclusive = combine_monoid(param_10, param_11); + if (look_back_ix == 0u) + { + sh_flag = 2u; + } + else + { + look_back_ix--; + their_ix = 0u; + } + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (flag_1 == 2u) + { + break; + } + } + if (gl_LocalInvocationID.x == 511u) /* Thread 511 shares the exclusive prefix via sh_prefix and stores exclusive combined with agg as this partition's prefix. */ + { + Monoid param_12 = exclusive; + Monoid param_13 = agg; + Monoid inclusive_prefix = combine_monoid(param_12, param_13); + sh_prefix = exclusive; + _43.state[part_ix].prefix.element = inclusive_prefix.element; + } + threadgroup_barrier(mem_flags::mem_device); + if (gl_LocalInvocationID.x == 511u) + { + atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].flag, 2u, memory_order_relaxed); /* Flag is raised to 2u only after the barrier that follows the prefix store. */ + } + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (part_ix != 0u) + { + exclusive = sh_prefix; + } + Monoid row = exclusive; + if (gl_LocalInvocationID.x > 0u) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u]; + Monoid param_14 = row; + Monoid param_15 = other_1; + row = combine_monoid(param_14, param_15); + } + for (uint i_2 = 0u; i_2 < 16u; i_2++) /* Output i_2 is row (everything before this thread's first element) combined with the thread-local running sum. */ + { + Monoid param_16 = row; + Monoid param_17 = local[i_2]; + Monoid m_1 = combine_monoid(param_16, param_17); + _372.outbuf[ix + i_2].element = m_1.element; + } +} + diff --git a/tests/shader/gen/prefix_atomic.spv b/tests/shader/gen/prefix_atomic.spv new file mode 100644 index 0000000..d7dac5b Binary files
/dev/null and b/tests/shader/gen/prefix_atomic.spv differ diff --git a/tests/shader/gen/prefix_reduce.dxil b/tests/shader/gen/prefix_reduce.dxil new file mode 100644 index 0000000..0ee28e8 Binary files /dev/null and b/tests/shader/gen/prefix_reduce.dxil differ diff --git a/tests/shader/gen/prefix_reduce.hlsl b/tests/shader/gen/prefix_reduce.hlsl new file mode 100644 index 0000000..f2de539 --- /dev/null +++ b/tests/shader/gen/prefix_reduce.hlsl @@ -0,0 +1,72 @@ +struct Monoid +{ + uint element; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +ByteAddressBuffer _40 : register(t0); +RWByteAddressBuffer _127 : register(u1); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Monoid sh_scratch[512]; + +Monoid combine_monoid(Monoid a, Monoid b) /* Returns a Monoid whose element is a.element + b.element. */ +{ + Monoid _22 = { a.element + b.element }; + return _22; +} + +void comp_main() /* Reduction: each thread sums 8 consecutive inputs from _40, the group reduces those sums through sh_scratch, and thread 0 stores the group total in _127 at the slot for gl_WorkGroupID.x. */ +{ + uint ix = gl_GlobalInvocationID.x * 8u; + Monoid _44; + _44.element = _40.Load(ix * 4 + 0); + Monoid agg; + agg.element = _44.element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) + { + Monoid param = agg; + Monoid _64; + _64.element = _40.Load((ix + i) * 4 + 0); + param_1.element = _64.element; + agg = combine_monoid(param, param_1); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) /* Nine doubling steps (offsets 1..256) reading from higher-numbered threads: agg becomes the sum over threads x..511, so thread 0 ends with the group total. */ + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Monoid param_2 = agg; + Monoid param_3 = other; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _127.Store(gl_WorkGroupID.x * 4 + 0, agg.element); + } +} + +[numthreads(512, 1, 1)]
+void main(SPIRV_Cross_Input stage_input) /* Entry point: copies the three stage inputs into the file-scope gl_* variables, then calls comp_main(). */ +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/prefix_reduce.msl b/tests/shader/gen/prefix_reduce.msl new file mode 100644 index 0000000..3a3125d --- /dev/null +++ b/tests/shader/gen/prefix_reduce.msl @@ -0,0 +1,68 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include /* NOTE(review): no header name is visible after these #include directives - confirm against the generated file. */ + +using namespace metal; + +struct Monoid /* Single-uint value; this is the type combine_monoid() takes and returns. */ +{ + uint element; +}; + +struct Monoid_1 /* Same single-uint layout as Monoid; this is the type used inside the buffer structs below. */ +{ + uint element; +}; + +struct InBuf +{ + Monoid_1 inbuf[1]; +}; + +struct OutBuf +{ + Monoid_1 outbuf[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) /* Returns a Monoid whose element is a.element + b.element. */ +{ + return Monoid{ a.element + b.element }; +} + +kernel void main0(const device InBuf& _40 [[buffer(0)]], device OutBuf& _127 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) /* Reduction: each thread sums 8 consecutive inputs, the threadgroup reduces those sums through sh_scratch, and thread 0 stores the group total in outbuf[gl_WorkGroupID.x]. */ +{ + threadgroup Monoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + Monoid agg; + agg.element = _40.inbuf[ix].element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) + { + Monoid param = agg; + param_1.element = _40.inbuf[ix + i].element; + agg = combine_monoid(param, param_1); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) /* Nine doubling steps (offsets 1..256) reading from higher-numbered threads: agg becomes the sum over threads x..511, so thread 0 ends with the group total. */ + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Monoid param_2 = agg; + Monoid param_3 = other; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if
(gl_LocalInvocationID.x == 0u) + { + _127.outbuf[gl_WorkGroupID.x].element = agg.element; + } +} + diff --git a/tests/shader/gen/prefix_reduce.spv b/tests/shader/gen/prefix_reduce.spv new file mode 100644 index 0000000..b2e35fc Binary files /dev/null and b/tests/shader/gen/prefix_reduce.spv differ diff --git a/tests/shader/gen/prefix_root.dxil b/tests/shader/gen/prefix_root.dxil new file mode 100644 index 0000000..03fe2d1 Binary files /dev/null and b/tests/shader/gen/prefix_root.dxil differ diff --git a/tests/shader/gen/prefix_root.hlsl b/tests/shader/gen/prefix_root.hlsl new file mode 100644 index 0000000..adf6bf8 --- /dev/null +++ b/tests/shader/gen/prefix_root.hlsl @@ -0,0 +1,80 @@ +struct Monoid +{ + uint element; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Monoid _131 = { 0u }; + +RWByteAddressBuffer _42 : register(u0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Monoid sh_scratch[512]; + +Monoid combine_monoid(Monoid a, Monoid b) /* Returns a Monoid whose element is a.element + b.element. */ +{ + Monoid _22 = { a.element + b.element }; + return _22; +} + +void comp_main() /* In-place scan of _42: each thread reads 8 consecutive elements, the group scans the per-thread totals through sh_scratch, and each element is overwritten with its running sum within the group (thread 0 starts from _131, i.e. 0). */ +{ + uint ix = gl_GlobalInvocationID.x * 8u; + Monoid _46; + _46.element = _42.Load(ix * 4 + 0); + Monoid local[8]; + local[0].element = _46.element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) /* local[i] becomes the sum of this thread's inputs 0..i. */ + { + Monoid param = local[i - 1u]; + Monoid _71; + _71.element = _42.Load((ix + i) * 4 + 0); + param_1.element = _71.element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) /* Nine doubling steps (offsets 1..256): agg becomes the sum of the local totals of threads 0..x. */ + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } +
GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Monoid row = _131; + if (gl_LocalInvocationID.x > 0u) /* row = scanned total of the preceding thread; thread 0 keeps the zero constant. */ + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Monoid param_4 = row; + Monoid param_5 = local[i_2]; + Monoid m = combine_monoid(param_4, param_5); + _42.Store((ix + i_2) * 4 + 0, m.element); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) /* Entry point: copies the two stage inputs into the file-scope gl_* variables, then calls comp_main(). */ +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/prefix_root.msl b/tests/shader/gen/prefix_root.msl new file mode 100644 index 0000000..897a6a4 --- /dev/null +++ b/tests/shader/gen/prefix_root.msl @@ -0,0 +1,112 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include /* NOTE(review): no header name is visible after these #include directives - confirm against the generated file. */ + +using namespace metal; + +template /* NOTE(review): no template parameter list is visible although T and Num are used below - confirm. */ +struct spvUnsafeArray /* Array wrapper: every operator[] overload (thread, device, constant, threadgroup) returns elements[pos] with no bounds check; storage is Num elements, or 1 when Num is 0. */ +{ + T elements[Num ?
 Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Monoid /* Single-uint value; this is the type combine_monoid() takes and returns. */ +{ + uint element; +}; + +struct Monoid_1 /* Same single-uint layout as Monoid; this is the type used inside DataBuf below. */ +{ + uint element; +}; + +struct DataBuf +{ + Monoid_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) /* Returns a Monoid whose element is a.element + b.element. */ +{ + return Monoid{ a.element + b.element }; +} + +kernel void main0(device DataBuf& _42 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) /* In-place scan of _42.data: each thread reads 8 consecutive elements, the threadgroup scans the per-thread totals through sh_scratch, and each element is overwritten with its running sum within the group (thread 0 starts from 0). */ +{ + threadgroup Monoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; /* NOTE(review): no template arguments are visible; indices 0..7 are used below - confirm. */ + local[0].element = _42.data[ix].element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) /* local[i] becomes the sum of this thread's inputs 0..i. */ + { + Monoid param = local[i - 1u]; + param_1.element = _42.data[ix + i].element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) /* Nine doubling steps (offsets 1..256): agg becomes the sum of the local totals of threads 0..x. */ + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); +
sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Monoid row = Monoid{ 0u }; + if (gl_LocalInvocationID.x > 0u) /* row = scanned total of the preceding thread; thread 0 keeps 0. */ + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Monoid param_4 = row; + Monoid param_5 = local[i_2]; + Monoid m = combine_monoid(param_4, param_5); + _42.data[ix + i_2].element = m.element; + } +} + diff --git a/tests/shader/gen/prefix_root.spv b/tests/shader/gen/prefix_root.spv new file mode 100644 index 0000000..3e04224 Binary files /dev/null and b/tests/shader/gen/prefix_root.spv differ diff --git a/tests/shader/gen/prefix_scan.dxil b/tests/shader/gen/prefix_scan.dxil new file mode 100644 index 0000000..427f14d Binary files /dev/null and b/tests/shader/gen/prefix_scan.dxil differ diff --git a/tests/shader/gen/prefix_scan.hlsl b/tests/shader/gen/prefix_scan.hlsl new file mode 100644 index 0000000..d9e74ea --- /dev/null +++ b/tests/shader/gen/prefix_scan.hlsl @@ -0,0 +1,92 @@ +struct Monoid +{ + uint element; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Monoid _131 = { 0u }; + +RWByteAddressBuffer _42 : register(u0); +ByteAddressBuffer _141 : register(t1); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Monoid sh_scratch[512]; + +Monoid combine_monoid(Monoid a, Monoid b) /* Returns a Monoid whose element is a.element + b.element. */ +{ + Monoid _22 = { a.element + b.element }; + return _22; +} + +void comp_main() /* In-place scan of _42: each thread reads 8 consecutive elements, the group scans the per-thread totals through sh_scratch, and groups after the first start from the _141 entry at index gl_WorkGroupID.x - 1. */ +{ + uint ix = gl_GlobalInvocationID.x * 8u; + Monoid _46; + _46.element = _42.Load(ix * 4 + 0); + Monoid local[8]; + local[0].element = _46.element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) /* local[i] becomes the sum of this thread's inputs 0..i. */ + { + Monoid param = local[i - 1u]; + Monoid _71; + _71.element = _42.Load((ix + i) * 4 + 0); + param_1.element = _71.element; +
local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) /* Nine doubling steps (offsets 1..256): agg becomes the sum of the local totals of threads 0..x. */ + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; + Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Monoid row = _131; + if (gl_WorkGroupID.x > 0u) /* Groups after the first start from the value stored in _141 for the preceding group. */ + { + Monoid _146; + _146.element = _141.Load((gl_WorkGroupID.x - 1u) * 4 + 0); + row.element = _146.element; + } + if (gl_LocalInvocationID.x > 0u) /* Add the scanned total of the preceding thread in this group. */ + { + Monoid param_4 = row; + Monoid param_5 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_4, param_5); + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Monoid param_6 = row; + Monoid param_7 = local[i_2]; + Monoid m = combine_monoid(param_6, param_7); + _42.Store((ix + i_2) * 4 + 0, m.element); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) /* Entry point: copies the three stage inputs into the file-scope gl_* variables, then calls comp_main(). */ +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/tests/shader/gen/prefix_scan.msl b/tests/shader/gen/prefix_scan.msl new file mode 100644 index 0000000..5be4e65 --- /dev/null +++ b/tests/shader/gen/prefix_scan.msl @@ -0,0 +1,123 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include /* NOTE(review): no header name is visible after these #include directives - confirm against the generated file. */ + +using namespace metal; + +template /* NOTE(review): no template parameter list is visible although T and Num are used below - confirm. */ +struct spvUnsafeArray /* Array wrapper: every operator[] overload (thread, device, constant, threadgroup) returns elements[pos] with no bounds check; storage is Num elements, or 1 when Num is 0. */ +{ + T elements[Num ?
 Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Monoid /* Single-uint value; this is the type combine_monoid() takes and returns. */ +{ + uint element; +}; + +struct Monoid_1 /* Same single-uint layout as Monoid; this is the type used inside the buffer structs below. */ +{ + uint element; +}; + +struct DataBuf +{ + Monoid_1 data[1]; +}; + +struct ParentBuf +{ + Monoid_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) /* Returns a Monoid whose element is a.element + b.element. */ +{ + return Monoid{ a.element + b.element }; +} + +kernel void main0(device DataBuf& _42 [[buffer(0)]], const device ParentBuf& _141 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) /* In-place scan of _42.data: each thread reads 8 consecutive elements, the threadgroup scans the per-thread totals through sh_scratch, and groups after the first start from _141.parent[gl_WorkGroupID.x - 1]. */ +{ + threadgroup Monoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; /* NOTE(review): no template arguments are visible; indices 0..7 are used below - confirm. */ + local[0].element = _42.data[ix].element; + Monoid param_1; + for (uint i = 1u; i < 8u; i++) /* local[i] becomes the sum of this thread's inputs 0..i. */ + { + Monoid param = local[i - 1u]; + param_1.element = _42.data[ix + i].element; + local[i] = combine_monoid(param, param_1); + } + Monoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) /* Nine doubling steps (offsets 1..256): agg becomes the sum of the local totals of threads 0..x. */ + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Monoid param_2 = other; +
Monoid param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Monoid row = Monoid{ 0u }; + if (gl_WorkGroupID.x > 0u) /* Groups after the first start from the parent entry of the preceding group. */ + { + row.element = _141.parent[gl_WorkGroupID.x - 1u].element; + } + if (gl_LocalInvocationID.x > 0u) /* Add the scanned total of the preceding thread in this group. */ + { + Monoid param_4 = row; + Monoid param_5 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_4, param_5); + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Monoid param_6 = row; + Monoid param_7 = local[i_2]; + Monoid m = combine_monoid(param_6, param_7); + _42.data[ix + i_2].element = m.element; + } +} + diff --git a/tests/shader/gen/prefix_scan.spv b/tests/shader/gen/prefix_scan.spv new file mode 100644 index 0000000..6d8fe0a Binary files /dev/null and b/tests/shader/gen/prefix_scan.spv differ diff --git a/tests/shader/gen/prefix_vkmm.spv b/tests/shader/gen/prefix_vkmm.spv new file mode 100644 index 0000000..cef3965 Binary files /dev/null and b/tests/shader/gen/prefix_vkmm.spv differ