diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil index 0fb9622..50f5bad 100644 Binary files a/piet-gpu/shader/gen/backdrop.dxil and b/piet-gpu/shader/gen/backdrop.dxil differ diff --git a/piet-gpu/shader/gen/backdrop.hlsl b/piet-gpu/shader/gen/backdrop.hlsl index aba3cff..2ed8898 100644 --- a/piet-gpu/shader/gen/backdrop.hlsl +++ b/piet-gpu/shader/gen/backdrop.hlsl @@ -21,6 +21,7 @@ struct Path struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -52,8 +53,8 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -RWByteAddressBuffer _67 : register(u0, space0); -ByteAddressBuffer _166 : register(t1, space0); +RWByteAddressBuffer _59 : register(u0, space0); +ByteAddressBuffer _181 : register(t1, space0); static uint3 gl_LocalInvocationID; static uint3 gl_GlobalInvocationID; @@ -69,6 +70,13 @@ groupshared uint sh_row_width[256]; groupshared Alloc sh_row_alloc[256]; groupshared uint sh_row_count[256]; +bool check_deps(uint dep_stage) +{ + uint _65; + _59.InterlockedOr(4, 0u, _65); + return (_65 & dep_stage) == 0u; +} + bool touch_mem(Alloc alloc, uint offset) { return true; @@ -82,7 +90,7 @@ uint read_mem(Alloc alloc, uint offset) { return 0u; } - uint v = _67.Load(offset * 4 + 8); + uint v = _59.Load(offset * 4 + 12); return v; } @@ -100,8 +108,8 @@ Path Path_read(Alloc a, PathRef ref) uint raw2 = read_mem(param_4, param_5); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); - TileRef _134 = { raw2 }; - s.tiles = _134; + TileRef _146 = { raw2 }; + s.tiles = _146; return s; } @@ -120,47 +128,52 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _67.Store(offset * 4 + 8, val); + _59.Store(offset * 4 + 12, val); } void comp_main() { + uint param = 7u; + bool _154 = check_deps(param); + if (!_154) + { + return; + } uint th_ix = gl_LocalInvocationIndex; uint element_ix = gl_GlobalInvocationID.x; uint row_count = 0u; - bool mem_ok = _67.Load(4) == 0u; if (gl_LocalInvocationID.y == 0u) { - if (element_ix < _166.Load(0)) + if (element_ix < _181.Load(4)) { - PathRef _180 = { _166.Load(16) + (element_ix * 12u) }; - PathRef path_ref = _180; - Alloc _185; - _185.offset = _166.Load(16); - Alloc param; - param.offset = _185.offset; - PathRef param_1 = path_ref; - Path path = Path_read(param, param_1); + PathRef _195 = { _181.Load(20) + (element_ix * 12u) }; + PathRef path_ref = _195; + Alloc _200; + _200.offset = _181.Load(20); + Alloc param_1; + param_1.offset = _200.offset; + PathRef param_2 = path_ref; + Path path = Path_read(param_1, param_2); sh_row_width[th_ix] = path.bbox.z - path.bbox.x; row_count = path.bbox.w - path.bbox.y; - bool _210 = row_count == 1u; - bool _216; - if (_210) + bool _225 = row_count == 1u; + bool _231; + if (_225) { - _216 = path.bbox.y > 0u; + _231 = path.bbox.y > 0u; } else { - _216 = _210; + _231 = _225; } - if (_216) + if (_231) { row_count = 0u; } - uint param_2 = path.tiles.offset; - uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; - bool param_4 = mem_ok; - Alloc path_alloc = new_alloc(param_2, param_3, param_4); + uint param_3 = path.tiles.offset; + uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_5 = true; + Alloc path_alloc = new_alloc(param_3, param_4, param_5); sh_row_alloc[th_ix] = path_alloc; } sh_row_count[th_ix] = row_count; @@ -168,17 +181,17 @@ void comp_main() for (uint i = 0u; i < 8u; i++) { GroupMemoryBarrierWithGroupSync(); - bool _262 = gl_LocalInvocationID.y == 0u; - bool _269; - if (_262) + bool _276 = gl_LocalInvocationID.y == 0u; + bool _283; + if (_276) { - _269 = th_ix >= (1u << i); + _283 = th_ix >= (1u << i); } else { - _269 = _262; + _283 = _276; } - if (_269) + if (_283) { row_count += sh_row_count[th_ix - (1u << i)]; } @@ -190,7 +203,7 @@ void comp_main() } GroupMemoryBarrierWithGroupSync(); uint total_rows = sh_row_count[255]; - uint _348; + uint _360; for (uint row = th_ix; row < total_rows; row += 256u) { uint el_ix = 0u; @@ -203,32 +216,32 @@ void comp_main() } } uint width = sh_row_width[el_ix]; - if ((width > 0u) && mem_ok) + if (width > 0u) { Alloc tiles_alloc = sh_row_alloc[el_ix]; if (el_ix > 0u) { - _348 = sh_row_count[el_ix - 1u]; + _360 = sh_row_count[el_ix - 1u]; } else { - _348 = 0u; + _360 = 0u; } - uint seq_ix = row - _348; + uint seq_ix = row - _360; uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); - Alloc param_5 = tiles_alloc; - uint param_6 = tile_el_ix; - uint sum = read_mem(param_5, param_6); + Alloc param_6 = tiles_alloc; + uint param_7 = tile_el_ix; + uint sum = read_mem(param_6, param_7); for (uint x = 1u; x < width; x++) { tile_el_ix += 2u; - Alloc param_7 = tiles_alloc; - uint param_8 = tile_el_ix; - sum += read_mem(param_7, param_8); - Alloc param_9 = tiles_alloc; - uint param_10 = tile_el_ix; - uint param_11 = sum; - write_mem(param_9, param_10, param_11); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + sum += read_mem(param_8, param_9); + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + uint param_12 = sum; + write_mem(param_10, param_11, param_12); } } } diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl index 1c0a0bb..3726dff 100644 --- a/piet-gpu/shader/gen/backdrop.msl +++ b/piet-gpu/shader/gen/backdrop.msl @@ -1,7 +1,9 @@ #pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" #include #include +#include using namespace metal; @@ -30,6 +32,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -40,6 +43,7 @@ struct Alloc_1 struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -76,6 +80,13 @@ struct ConfigBuf constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); +static inline __attribute__((always_inline)) +bool check_deps(thread const uint& dep_stage, device Memory& v_59) +{ + uint _65 = atomic_fetch_or_explicit((device atomic_uint*)&v_59.mem_error, 0u, memory_order_relaxed); + return (_65 & dep_stage) == 0u; +} + static inline __attribute__((always_inline)) bool touch_mem(thread const Alloc& alloc, thread const uint& offset) { @@ -83,7 +94,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset) } static inline __attribute__((always_inline)) -uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_59) { Alloc param = alloc; uint param_1 = offset; @@ -91,23 +102,23 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor { return 0u; } - uint v = v_67.memory[offset]; + uint v = v_59.memory[offset]; return v; } static inline __attribute__((always_inline)) -Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_59) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_67); + uint raw0 = read_mem(param, param_1, v_59); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_67); + uint raw1 = read_mem(param_2, param_3, v_59); Alloc param_4 = a; uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_67); + uint raw2 = read_mem(param_4, param_5, v_59); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); s.tiles = TileRef{ raw2 }; @@ -123,7 +134,7 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const } static inline __attribute__((always_inline)) -void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_59) { Alloc param = alloc; uint param_1 = offset; @@ -131,47 +142,52 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons { return; } - v_67.memory[offset] = val; + v_59.memory[offset] = val; } -kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(device Memory& v_59 [[buffer(0)]], const device ConfigBuf& _181 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { threadgroup uint sh_row_width[256]; threadgroup Alloc sh_row_alloc[256]; threadgroup uint sh_row_count[256]; + uint param = 7u; + bool _154 = check_deps(param, v_59); + if (!_154) + { + return; + } uint th_ix = gl_LocalInvocationIndex; uint element_ix = gl_GlobalInvocationID.x; uint row_count = 0u; - bool mem_ok = v_67.mem_error == 0u; if (gl_LocalInvocationID.y == 0u) { - if (element_ix < _166.conf.n_elements) + if (element_ix < _181.conf.n_elements) { - PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) }; - Alloc param; - param.offset = _166.conf.tile_alloc.offset; - PathRef param_1 = path_ref; - Path path = Path_read(param, param_1, v_67); + PathRef path_ref = PathRef{ _181.conf.tile_alloc.offset + (element_ix * 12u) }; + Alloc param_1; + param_1.offset = _181.conf.tile_alloc.offset; + PathRef param_2 = path_ref; + Path path = Path_read(param_1, param_2, v_59); sh_row_width[th_ix] = path.bbox.z - path.bbox.x; row_count = path.bbox.w - path.bbox.y; - bool _210 = row_count == 1u; - bool _216; - if (_210) + bool _225 = row_count == 1u; + bool _231; + if (_225) { - _216 = path.bbox.y > 0u; + _231 = path.bbox.y > 0u; } else { - _216 = _210; + _231 = _225; } - if (_216) + if (_231) { row_count = 0u; } - uint param_2 = path.tiles.offset; - uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; - bool param_4 = mem_ok; - Alloc path_alloc = new_alloc(param_2, param_3, param_4); + uint param_3 = path.tiles.offset; + uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_5 = true; + Alloc path_alloc = new_alloc(param_3, param_4, param_5); sh_row_alloc[th_ix] = path_alloc; } sh_row_count[th_ix] = row_count; @@ -179,17 +195,17 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16 for (uint i = 0u; i < 8u; i++) { threadgroup_barrier(mem_flags::mem_threadgroup); - bool _262 = gl_LocalInvocationID.y == 0u; - bool _269; - if (_262) + bool _276 = gl_LocalInvocationID.y == 0u; + bool _283; + if (_276) { - _269 = th_ix >= (1u << i); + _283 = th_ix >= (1u << i); } else { - _269 = _262; + _283 = _276; } - if (_269) + if (_283) { row_count += sh_row_count[th_ix - (1u << i)]; } @@ -201,7 +217,7 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16 } threadgroup_barrier(mem_flags::mem_threadgroup); uint total_rows = sh_row_count[255]; - uint _348; + uint _360; for (uint row = th_ix; row < total_rows; row += 256u) { uint el_ix = 0u; @@ -214,32 +230,32 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16 } } uint width = sh_row_width[el_ix]; - if ((width > 0u) && mem_ok) + if (width > 0u) { Alloc tiles_alloc = sh_row_alloc[el_ix]; if (el_ix > 0u) { - _348 = sh_row_count[el_ix - 1u]; + _360 = sh_row_count[el_ix - 1u]; } else { - _348 = 0u; + _360 = 0u; } - uint seq_ix = row - _348; + uint seq_ix = row - _360; uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); - Alloc param_5 = tiles_alloc; - uint param_6 = tile_el_ix; - uint sum = read_mem(param_5, param_6, v_67); + Alloc param_6 = tiles_alloc; + uint param_7 = tile_el_ix; + uint sum = read_mem(param_6, param_7, v_59); for (uint x = 1u; x < width; x++) { tile_el_ix += 2u; - Alloc param_7 = tiles_alloc; - uint param_8 = tile_el_ix; - sum += read_mem(param_7, param_8, v_67); - Alloc param_9 = tiles_alloc; - uint param_10 = tile_el_ix; - uint param_11 = sum; - write_mem(param_9, param_10, param_11, v_67); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + sum += read_mem(param_8, param_9, v_59); + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + uint param_12 = sum; + write_mem(param_10, param_11, param_12, v_59); } } } diff --git a/piet-gpu/shader/gen/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv index 2bd17d8..b8a74ea 100644 Binary files a/piet-gpu/shader/gen/backdrop.spv and b/piet-gpu/shader/gen/backdrop.spv differ diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil index e24a6d3..06bacaf 100644 Binary files a/piet-gpu/shader/gen/backdrop_lg.dxil and b/piet-gpu/shader/gen/backdrop_lg.dxil differ diff --git a/piet-gpu/shader/gen/backdrop_lg.hlsl b/piet-gpu/shader/gen/backdrop_lg.hlsl index c506403..e547762 100644 --- a/piet-gpu/shader/gen/backdrop_lg.hlsl +++ b/piet-gpu/shader/gen/backdrop_lg.hlsl @@ -21,6 +21,7 @@ struct Path struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -52,8 +53,8 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 4u, 1u); -RWByteAddressBuffer _67 : register(u0, space0); -ByteAddressBuffer _166 : register(t1, space0); +RWByteAddressBuffer _59 : register(u0, space0); +ByteAddressBuffer _181 : register(t1, space0); static uint3 gl_LocalInvocationID; static uint3 gl_GlobalInvocationID; @@ -69,6 +70,13 @@ groupshared uint sh_row_width[256]; groupshared Alloc sh_row_alloc[256]; groupshared uint sh_row_count[256]; +bool check_deps(uint dep_stage) +{ + uint _65; + _59.InterlockedOr(4, 0u, _65); + return (_65 & dep_stage) == 0u; +} + bool touch_mem(Alloc alloc, uint offset) { return true; @@ -82,7 +90,7 @@ uint read_mem(Alloc alloc, uint offset) { return 0u; } - uint v = _67.Load(offset * 4 + 8); + uint v = _59.Load(offset * 4 + 12); return v; } @@ -100,8 +108,8 @@ Path Path_read(Alloc a, PathRef ref) uint raw2 = read_mem(param_4, param_5); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); - TileRef _134 = { raw2 }; - s.tiles = _134; + TileRef _146 = { raw2 }; + s.tiles = _146; return s; } @@ -120,47 +128,52 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _67.Store(offset * 4 + 8, val); + _59.Store(offset * 4 + 12, val); } void comp_main() { + uint param = 7u; + bool _154 = check_deps(param); + if (!_154) + { + return; + } uint th_ix = gl_LocalInvocationIndex; uint element_ix = gl_GlobalInvocationID.x; uint row_count = 0u; - bool mem_ok = _67.Load(4) == 0u; if (gl_LocalInvocationID.y == 0u) { - if (element_ix < _166.Load(0)) + if (element_ix < _181.Load(4)) { - PathRef _180 = { _166.Load(16) + (element_ix * 12u) }; - PathRef path_ref = _180; - Alloc _185; - _185.offset = _166.Load(16); - Alloc param; - param.offset = _185.offset; - PathRef param_1 = path_ref; - Path path = Path_read(param, param_1); + PathRef _195 = { _181.Load(20) + (element_ix * 12u) }; + PathRef path_ref = _195; + Alloc _200; + _200.offset = _181.Load(20); + Alloc param_1; + param_1.offset = _200.offset; + PathRef param_2 = path_ref; + Path path = Path_read(param_1, param_2); sh_row_width[th_ix] = path.bbox.z - path.bbox.x; row_count = path.bbox.w - path.bbox.y; - bool _210 = row_count == 1u; - bool _216; - if (_210) + bool _225 = row_count == 1u; + bool _231; + if (_225) { - _216 = path.bbox.y > 0u; + _231 = path.bbox.y > 0u; } else { - _216 = _210; + _231 = _225; } - if (_216) + if (_231) { row_count = 0u; } - uint param_2 = path.tiles.offset; - uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; - bool param_4 = mem_ok; - Alloc path_alloc = new_alloc(param_2, param_3, param_4); + uint param_3 = path.tiles.offset; + uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_5 = true; + Alloc path_alloc = new_alloc(param_3, param_4, param_5); sh_row_alloc[th_ix] = path_alloc; } sh_row_count[th_ix] = row_count; @@ -168,17 +181,17 @@ void comp_main() for (uint i = 0u; i < 8u; i++) { GroupMemoryBarrierWithGroupSync(); - bool _262 = gl_LocalInvocationID.y == 0u; - bool _269; - if (_262) + bool _276 = gl_LocalInvocationID.y == 0u; + bool _283; + if (_276) { - _269 = th_ix >= (1u << i); + _283 = th_ix >= (1u << i); } else { - _269 = _262; + _283 = _276; } - if (_269) + if (_283) { row_count += sh_row_count[th_ix - (1u << i)]; } @@ -190,7 +203,7 @@ void comp_main() } GroupMemoryBarrierWithGroupSync(); uint total_rows = sh_row_count[255]; - uint _348; + uint _360; for (uint row = th_ix; row < total_rows; row += 1024u) { uint el_ix = 0u; @@ -203,32 +216,32 @@ void comp_main() } } uint width = sh_row_width[el_ix]; - if ((width > 0u) && mem_ok) + if (width > 0u) { Alloc tiles_alloc = sh_row_alloc[el_ix]; if (el_ix > 0u) { - _348 = sh_row_count[el_ix - 1u]; + _360 = sh_row_count[el_ix - 1u]; } else { - _348 = 0u; + _360 = 0u; } - uint seq_ix = row - _348; + uint seq_ix = row - _360; uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); - Alloc param_5 = tiles_alloc; - uint param_6 = tile_el_ix; - uint sum = read_mem(param_5, param_6); + Alloc param_6 = tiles_alloc; + uint param_7 = tile_el_ix; + uint sum = read_mem(param_6, param_7); for (uint x = 1u; x < width; x++) { tile_el_ix += 2u; - Alloc param_7 = tiles_alloc; - uint param_8 = tile_el_ix; - sum += read_mem(param_7, param_8); - Alloc param_9 = tiles_alloc; - uint param_10 = tile_el_ix; - uint param_11 = sum; - write_mem(param_9, param_10, param_11); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + sum += read_mem(param_8, param_9); + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + uint param_12 = sum; + write_mem(param_10, param_11, param_12); } } } diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl index de43ebe..68f0905 100644 --- a/piet-gpu/shader/gen/backdrop_lg.msl +++ b/piet-gpu/shader/gen/backdrop_lg.msl @@ -1,7 +1,9 @@ #pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wunused-variable" #include #include +#include using namespace metal; @@ -30,6 +32,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -40,6 +43,7 @@ struct Alloc_1 struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -76,6 +80,13 @@ struct ConfigBuf constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 4u, 1u); +static inline __attribute__((always_inline)) +bool check_deps(thread const uint& dep_stage, device Memory& v_59) +{ + uint _65 = atomic_fetch_or_explicit((device atomic_uint*)&v_59.mem_error, 0u, memory_order_relaxed); + return (_65 & dep_stage) == 0u; +} + static inline __attribute__((always_inline)) bool touch_mem(thread const Alloc& alloc, thread const uint& offset) { @@ -83,7 +94,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset) } static inline __attribute__((always_inline)) -uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_67) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_59) { Alloc param = alloc; uint param_1 = offset; @@ -91,23 +102,23 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor { return 0u; } - uint v = v_67.memory[offset]; + uint v = v_59.memory[offset]; return v; } static inline __attribute__((always_inline)) -Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_67) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_59) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_67); + uint raw0 = read_mem(param, param_1, v_59); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_67); + uint raw1 = read_mem(param_2, param_3, v_59); Alloc param_4 = a; uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_67); + uint raw2 = read_mem(param_4, param_5, v_59); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); s.tiles = TileRef{ raw2 }; @@ -123,7 +134,7 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const } static inline __attribute__((always_inline)) -void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_67) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_59) { Alloc param = alloc; uint param_1 = offset; @@ -131,47 +142,52 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons { return; } - v_67.memory[offset] = val; + v_59.memory[offset] = val; } -kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _166 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(device Memory& v_59 [[buffer(0)]], const device ConfigBuf& _181 [[buffer(1)]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { threadgroup uint sh_row_width[256]; threadgroup Alloc sh_row_alloc[256]; threadgroup uint sh_row_count[256]; + uint param = 7u; + bool _154 = check_deps(param, v_59); + if (!_154) + { + return; + } uint th_ix = gl_LocalInvocationIndex; uint element_ix = gl_GlobalInvocationID.x; uint row_count = 0u; - bool mem_ok = v_67.mem_error == 0u; if (gl_LocalInvocationID.y == 0u) { - if (element_ix < _166.conf.n_elements) + if (element_ix < _181.conf.n_elements) { - PathRef path_ref = PathRef{ _166.conf.tile_alloc.offset + (element_ix * 12u) }; - Alloc param; - param.offset = _166.conf.tile_alloc.offset; - PathRef param_1 = path_ref; - Path path = Path_read(param, param_1, v_67); + PathRef path_ref = PathRef{ _181.conf.tile_alloc.offset + (element_ix * 12u) }; + Alloc param_1; + param_1.offset = _181.conf.tile_alloc.offset; + PathRef param_2 = path_ref; + Path path = Path_read(param_1, param_2, v_59); sh_row_width[th_ix] = path.bbox.z - path.bbox.x; row_count = path.bbox.w - path.bbox.y; - bool _210 = row_count == 1u; - bool _216; - if (_210) + bool _225 = row_count == 1u; + bool _231; + if (_225) { - _216 = path.bbox.y > 0u; + _231 = path.bbox.y > 0u; } else { - _216 = _210; + _231 = _225; } - if (_216) + if (_231) { row_count = 0u; } - uint param_2 = path.tiles.offset; - uint param_3 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; - bool param_4 = mem_ok; - Alloc path_alloc = new_alloc(param_2, param_3, param_4); + uint param_3 = path.tiles.offset; + uint param_4 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_5 = true; + Alloc path_alloc = new_alloc(param_3, param_4, param_5); sh_row_alloc[th_ix] = path_alloc; } sh_row_count[th_ix] = row_count; @@ -179,17 +195,17 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16 for (uint i = 0u; i < 8u; i++) { threadgroup_barrier(mem_flags::mem_threadgroup); - bool _262 = gl_LocalInvocationID.y == 0u; - bool _269; - if (_262) + bool _276 = gl_LocalInvocationID.y == 0u; + bool _283; + if (_276) { - _269 = th_ix >= (1u << i); + _283 = th_ix >= (1u << i); } else { - _269 = _262; + _283 = _276; } - if (_269) + if (_283) { row_count += sh_row_count[th_ix - (1u << i)]; } @@ -201,7 +217,7 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16 } threadgroup_barrier(mem_flags::mem_threadgroup); uint total_rows = sh_row_count[255]; - uint _348; + uint _360; for (uint row = th_ix; row < total_rows; row += 1024u) { uint el_ix = 0u; @@ -214,32 +230,32 @@ kernel void main0(device Memory& v_67 [[buffer(0)]], const device ConfigBuf& _16 } } uint width = sh_row_width[el_ix]; - if ((width > 0u) && mem_ok) + if (width > 0u) { Alloc tiles_alloc = sh_row_alloc[el_ix]; if (el_ix > 0u) { - _348 = sh_row_count[el_ix - 1u]; + _360 = sh_row_count[el_ix - 1u]; } else { - _348 = 0u; + _360 = 0u; } - uint seq_ix = row - _348; + uint seq_ix = row - _360; uint tile_el_ix = ((tiles_alloc.offset >> uint(2)) + 1u) + ((seq_ix * 2u) * width); - Alloc param_5 = tiles_alloc; - uint param_6 = tile_el_ix; - uint sum = read_mem(param_5, param_6, v_67); + Alloc param_6 = tiles_alloc; + uint param_7 = tile_el_ix; + uint sum = read_mem(param_6, param_7, v_59); for (uint x = 1u; x < width; x++) { tile_el_ix += 2u; - Alloc param_7 = tiles_alloc; - uint param_8 = tile_el_ix; - sum += read_mem(param_7, param_8, v_67); - Alloc param_9 = tiles_alloc; - uint param_10 = tile_el_ix; - uint param_11 = sum; - write_mem(param_9, param_10, param_11, v_67); + Alloc param_8 = tiles_alloc; + uint param_9 = tile_el_ix; + sum += read_mem(param_8, param_9, v_59); + Alloc param_10 = tiles_alloc; + uint param_11 = tile_el_ix; + uint param_12 = sum; + write_mem(param_10, param_11, param_12, v_59); } } } diff --git a/piet-gpu/shader/gen/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv index ff2b1d7..2819ec5 100644 Binary files a/piet-gpu/shader/gen/backdrop_lg.spv and b/piet-gpu/shader/gen/backdrop_lg.spv differ diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil index 6655b7f..82cfb03 100644 Binary files a/piet-gpu/shader/gen/bbox_clear.dxil and b/piet-gpu/shader/gen/bbox_clear.dxil differ diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl index 8a884d3..5d29894 100644 --- a/piet-gpu/shader/gen/bbox_clear.hlsl +++ b/piet-gpu/shader/gen/bbox_clear.hlsl @@ -5,6 +5,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -48,13 +49,13 @@ struct SPIRV_Cross_Input void comp_main() { uint ix = gl_GlobalInvocationID.x; - if (ix < _21.Load(76)) + if (ix < _21.Load(80)) { - uint out_ix = (_21.Load(40) >> uint(2)) + (6u * ix); - _45.Store(out_ix * 4 + 8, 65535u); - _45.Store((out_ix + 1u) * 4 + 8, 65535u); - _45.Store((out_ix + 2u) * 4 + 8, 0u); - _45.Store((out_ix + 3u) * 4 + 8, 0u); + uint out_ix = (_21.Load(44) >> uint(2)) + (6u * ix); + _45.Store(out_ix * 4 + 12, 65535u); + _45.Store((out_ix + 1u) * 4 + 12, 65535u); + _45.Store((out_ix + 2u) * 4 + 12, 0u); + _45.Store((out_ix + 3u) * 4 + 12, 0u); } } diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl index c278c68..289fc9a 100644 --- a/piet-gpu/shader/gen/bbox_clear.msl +++ b/piet-gpu/shader/gen/bbox_clear.msl @@ -10,6 +10,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -48,6 +49,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; diff --git a/piet-gpu/shader/gen/bbox_clear.spv b/piet-gpu/shader/gen/bbox_clear.spv index 58a270e..f1ef371 100644 Binary files a/piet-gpu/shader/gen/bbox_clear.spv and b/piet-gpu/shader/gen/bbox_clear.spv differ diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil index 3050aa8..5c89a15 100644 Binary files a/piet-gpu/shader/gen/binning.dxil and b/piet-gpu/shader/gen/binning.dxil differ diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl index 986f42b..7096371 100644 --- a/piet-gpu/shader/gen/binning.hlsl +++ b/piet-gpu/shader/gen/binning.hlsl @@ -3,22 +3,6 @@ struct Alloc uint offset; }; -struct MallocResult -{ - Alloc alloc; - bool failed; -}; - -struct BinInstanceRef -{ - uint offset; -}; - -struct BinInstance -{ - uint element_ix; -}; - struct DrawMonoid { uint path_ix; @@ -29,6 +13,7 @@ struct DrawMonoid struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -60,8 +45,8 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -RWByteAddressBuffer _81 : register(u0, space0); -ByteAddressBuffer _156 : register(t1, space0); +RWByteAddressBuffer _57 : register(u0, space0); +ByteAddressBuffer _101 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -72,39 +57,38 @@ struct SPIRV_Cross_Input }; groupshared uint bitmaps[8][256]; -groupshared bool sh_alloc_failed; groupshared uint count[8][256]; -groupshared Alloc sh_chunk_alloc[256]; +groupshared uint sh_chunk_offset[256]; DrawMonoid load_draw_monoid(uint element_ix) { - uint base = (_156.Load(44) >> uint(2)) + (4u * element_ix); - uint path_ix = _81.Load(base * 4 + 8); - uint clip_ix = _81.Load((base + 1u) * 4 + 8); - uint scene_offset = _81.Load((base + 2u) * 4 + 8); - uint info_offset = _81.Load((base + 3u) * 4 + 8); - DrawMonoid _190 = { path_ix, clip_ix, scene_offset, info_offset }; - return _190; + uint base = (_101.Load(48) >> uint(2)) + (4u * element_ix); + uint path_ix = _57.Load(base * 4 + 12); + uint clip_ix = _57.Load((base + 1u) * 4 + 12); + uint scene_offset = _57.Load((base + 2u) * 4 + 12); + uint info_offset = _57.Load((base + 3u) * 4 + 12); + DrawMonoid _136 = { path_ix, clip_ix, scene_offset, info_offset }; + return _136; } float4 load_clip_bbox(uint clip_ix) { - uint base = (_156.Load(60) >> uint(2)) + (4u * clip_ix); - float x0 = asfloat(_81.Load(base * 4 + 8)); - float y0 = asfloat(_81.Load((base + 1u) * 4 + 8)); - float x1 = asfloat(_81.Load((base + 2u) * 4 + 8)); - float y1 = asfloat(_81.Load((base + 3u) * 4 + 8)); + uint base = (_101.Load(64) >> uint(2)) + (4u * clip_ix); + float x0 = asfloat(_57.Load(base * 4 + 12)); + float y0 = asfloat(_57.Load((base + 1u) * 4 + 12)); + float x1 = asfloat(_57.Load((base + 2u) * 4 + 12)); + float y1 = asfloat(_57.Load((base + 3u) * 4 + 12)); float4 bbox = float4(x0, y0, x1, y1); return bbox; } float4 load_path_bbox(uint path_ix) { - uint base = (_156.Load(40) >> uint(2)) + (6u * path_ix); - float bbox_l = float(_81.Load(base * 4 + 8)) - 32768.0f; - float bbox_t = float(_81.Load((base + 1u) * 4 + 8)) - 32768.0f; - float bbox_r = float(_81.Load((base + 2u) * 4 + 8)) - 32768.0f; - float bbox_b = float(_81.Load((base + 3u) * 4 + 8)) - 32768.0f; + uint base = (_101.Load(44) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_57.Load(base * 4 + 12)) - 32768.0f; + float bbox_t = float(_57.Load((base + 1u) * 4 + 12)) - 32768.0f; + float bbox_r = float(_57.Load((base + 2u) * 4 + 12)) - 32768.0f; + float bbox_b = float(_57.Load((base + 3u) * 4 + 12)) - 32768.0f; float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); return bbox; } @@ -116,41 +100,25 @@ float4 bbox_intersect(float4 a, float4 b) void store_draw_bbox(uint draw_ix, float4 bbox) { - uint base = (_156.Load(64) >> uint(2)) + (4u * draw_ix); - _81.Store(base * 4 + 8, asuint(bbox.x)); - _81.Store((base + 1u) * 4 + 8, asuint(bbox.y)); - _81.Store((base + 2u) * 4 + 8, asuint(bbox.z)); - _81.Store((base + 3u) * 4 + 8, asuint(bbox.w)); + uint base = (_101.Load(68) >> uint(2)) + (4u * draw_ix); + _57.Store(base * 4 + 12, asuint(bbox.x)); + _57.Store((base + 1u) * 4 + 12, asuint(bbox.y)); + _57.Store((base + 2u) * 4 + 12, asuint(bbox.z)); + _57.Store((base + 3u) * 4 + 12, asuint(bbox.w)); } -Alloc new_alloc(uint offset, uint size, bool mem_ok) +uint malloc_stage(uint size, uint mem_size, uint stage) { - Alloc a; - a.offset = offset; - return a; -} - -MallocResult malloc(uint size) -{ - uint _87; - _81.InterlockedAdd(0, size, _87); - uint offset = _87; - uint _94; - _81.GetDimensions(_94); - _94 = (_94 - 8) / 4; - MallocResult r; - r.failed = (offset + size) > uint(int(_94) * 4); - uint param = offset; - uint param_1 = size; - bool param_2 = !r.failed; - r.alloc = new_alloc(param, param_1, param_2); - if (r.failed) + uint _65; + _57.InterlockedAdd(0, size, _65); + uint offset = _65; + if ((offset + size) > mem_size) { - uint _116; - _81.InterlockedMax(4, 1u, _116); - return r; + uint _76; + _57.InterlockedOr(4, stage, _76); + offset = 0u; } - return r; + return offset; } bool touch_mem(Alloc alloc, uint offset) @@ -166,16 +134,7 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _81.Store(offset * 4 + 8, val); -} - -void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) -{ - uint ix = ref.offset >> uint(2); - Alloc param = a; - uint param_1 = ix + 0u; - uint param_2 = s.element_ix; - write_mem(param, param_1, param_2); + _57.Store(offset * 4 + 12, val); } void comp_main() @@ -185,17 +144,12 @@ void comp_main() { bitmaps[i][gl_LocalInvocationID.x] = 0u; } - if (gl_LocalInvocationID.x == 0u) - { - sh_alloc_failed = false; - } - GroupMemoryBarrierWithGroupSync(); uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; int x0 = 0; int y0 = 0; int x1 = 0; int y1 = 0; - if (element_ix < _156.Load(0)) + if (element_ix < _101.Load(4)) { uint param = element_ix; DrawMonoid draw_monoid = load_draw_monoid(param); @@ -212,11 +166,11 @@ void comp_main() float4 param_3 = path_bbox; float4 param_4 = clip_bbox; float4 bbox = bbox_intersect(param_3, param_4); - float4 _417 = bbox; - float4 _419 = bbox; - float2 _421 = max(_417.xy, _419.zw); - bbox.z = _421.x; - bbox.w = _421.y; + float4 _354 = bbox; + float4 _356 = bbox; + float2 _358 = max(_354.xy, _356.zw); + bbox.z = _358.x; + bbox.w = _358.y; uint param_5 = element_ix; float4 param_6 = bbox; store_draw_bbox(param_5, param_6); @@ -225,8 +179,8 @@ void comp_main() x1 = int(ceil(bbox.z * 0.00390625f)); y1 = int(ceil(bbox.w * 0.00390625f)); } - uint width_in_bins = ((_156.Load(8) + 16u) - 1u) / 16u; - uint height_in_bins = ((_156.Load(12) + 16u) - 1u) / 16u; + uint width_in_bins = ((_101.Load(12) + 16u) - 1u) / 16u; + uint height_in_bins = ((_101.Load(16) + 16u) - 1u) / 16u; x0 = clamp(x0, 0, int(width_in_bins)); x1 = clamp(x1, x0, int(width_in_bins)); y0 = clamp(y0, 0, int(height_in_bins)); @@ -241,8 +195,8 @@ void comp_main() uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); while (y < y1) { - uint _523; - InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _523); + uint _460; + InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _460); x++; if (x == x1) { @@ -257,51 +211,32 @@ void comp_main() element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x]))); count[i_1][gl_LocalInvocationID.x] = element_count; } - uint param_7 = 0u; - uint param_8 = 0u; - bool param_9 = true; - Alloc chunk_alloc = new_alloc(param_7, param_8, param_9); + uint chunk_offset = 0u; if (element_count != 0u) { - uint param_10 = element_count * 4u; - MallocResult _573 = malloc(param_10); - MallocResult chunk = _573; - chunk_alloc = chunk.alloc; - sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; - if (chunk.failed) - { - sh_alloc_failed = true; - } + uint param_7 = element_count * 4u; + uint param_8 = _101.Load(0); + uint param_9 = 1u; + uint _510 = malloc_stage(param_7, param_8, param_9); + chunk_offset = _510; + sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset; } - uint out_ix = (_156.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); - Alloc _603; - _603.offset = _156.Load(20); - Alloc param_11; - param_11.offset = _603.offset; - uint param_12 = out_ix; - uint param_13 = element_count; - write_mem(param_11, param_12, param_13); - Alloc _615; - _615.offset = _156.Load(20); - Alloc param_14; - param_14.offset = _615.offset; - uint param_15 = out_ix + 1u; - uint param_16 = chunk_alloc.offset; - write_mem(param_14, param_15, param_16); + uint out_ix = (_101.Load(24) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc _532; + _532.offset = _101.Load(24); + Alloc param_10; + param_10.offset = _532.offset; + uint param_11 = out_ix; + uint param_12 = element_count; + write_mem(param_10, param_11, param_12); + Alloc _544; + _544.offset = _101.Load(24); + Alloc param_13; + param_13.offset = _544.offset; + uint param_14 = out_ix + 1u; + uint param_15 = chunk_offset; + write_mem(param_13, param_14, param_15); GroupMemoryBarrierWithGroupSync(); - bool _630; - if (!sh_alloc_failed) - { - _630 = _81.Load(4) != 0u; - } - else - { - _630 = sh_alloc_failed; - } - if (_630) - { - return; - } x = x0; y = y0; while (y < y1) @@ -315,14 +250,11 @@ void comp_main() { idx += count[my_slice - 1u][bin_ix]; } - Alloc out_alloc = sh_chunk_alloc[bin_ix]; - uint out_offset = out_alloc.offset + (idx * 4u); - BinInstanceRef _692 = { out_offset }; - BinInstance _694 = { element_ix }; - Alloc param_17 = out_alloc; - BinInstanceRef param_18 = _692; - BinInstance param_19 = _694; - BinInstance_write(param_17, param_18, param_19); + uint chunk_offset_1 = sh_chunk_offset[bin_ix]; + if (chunk_offset_1 != 0u) + { + _57.Store(((chunk_offset_1 >> uint(2)) + idx) * 4 + 12, element_ix); + } } x++; if (x == x1) diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl index 2ee5168..d3ef95c 100644 --- a/piet-gpu/shader/gen/binning.msl +++ b/piet-gpu/shader/gen/binning.msl @@ -12,22 +12,6 @@ struct Alloc uint offset; }; -struct MallocResult -{ - Alloc alloc; - bool failed; -}; - -struct BinInstanceRef -{ - uint offset; -}; - -struct BinInstance -{ - uint element_ix; -}; - struct DrawMonoid { uint path_ix; @@ -40,6 +24,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -50,6 +35,7 @@ struct Alloc_1 struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -87,36 +73,36 @@ struct ConfigBuf constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) -DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_57, const device ConfigBuf& v_101) { - uint base = (v_156.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * element_ix); - uint path_ix = v_81.memory[base]; - uint clip_ix = v_81.memory[base + 1u]; - uint scene_offset = v_81.memory[base + 2u]; - uint info_offset = v_81.memory[base + 3u]; + uint base = (v_101.conf.drawmonoid_alloc.offset >> uint(2)) + (4u * element_ix); + uint path_ix = v_57.memory[base]; + uint clip_ix = v_57.memory[base + 1u]; + uint scene_offset = v_57.memory[base + 2u]; + uint info_offset = v_57.memory[base + 3u]; return DrawMonoid{ path_ix, clip_ix, scene_offset, info_offset }; } static inline __attribute__((always_inline)) -float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_57, const device ConfigBuf& v_101) { - uint base = (v_156.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix); - float x0 = as_type(v_81.memory[base]); - float y0 = as_type(v_81.memory[base + 1u]); - float x1 = as_type(v_81.memory[base + 2u]); - float y1 = as_type(v_81.memory[base + 3u]); + uint base = (v_101.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix); + float x0 = as_type(v_57.memory[base]); + float y0 = as_type(v_57.memory[base + 1u]); + float x1 = as_type(v_57.memory[base + 2u]); + float y1 = as_type(v_57.memory[base + 3u]); float4 bbox = float4(x0, y0, x1, y1); return bbox; } static inline __attribute__((always_inline)) -float4 load_path_bbox(thread const uint& path_ix, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +float4 load_path_bbox(thread const uint& path_ix, device Memory& v_57, const device ConfigBuf& v_101) { - uint base = (v_156.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); - float bbox_l = float(v_81.memory[base]) - 32768.0; - float bbox_t = float(v_81.memory[base + 1u]) - 32768.0; - float bbox_r = float(v_81.memory[base + 2u]) - 32768.0; - float bbox_b = float(v_81.memory[base + 3u]) - 32768.0; + uint base = (v_101.conf.path_bbox_alloc.offset >> uint(2)) + (6u * path_ix); + float bbox_l = float(v_57.memory[base]) - 32768.0; + float bbox_t = float(v_57.memory[base + 1u]) - 32768.0; + float bbox_r = float(v_57.memory[base + 2u]) - 32768.0; + float bbox_b = float(v_57.memory[base + 3u]) - 32768.0; float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); return bbox; } @@ -128,40 +114,26 @@ float4 bbox_intersect(thread const float4& a, thread const float4& b) } static inline __attribute__((always_inline)) -void store_draw_bbox(thread const uint& draw_ix, thread const float4& bbox, device Memory& v_81, constant uint& v_81BufferSize, const device ConfigBuf& v_156) +void store_draw_bbox(thread const uint& draw_ix, thread const float4& bbox, device Memory& v_57, const device ConfigBuf& v_101) { - uint base = (v_156.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix); - v_81.memory[base] = as_type(bbox.x); - v_81.memory[base + 1u] = as_type(bbox.y); - v_81.memory[base + 2u] = as_type(bbox.z); - v_81.memory[base + 3u] = as_type(bbox.w); + uint base = (v_101.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix); + v_57.memory[base] = as_type(bbox.x); + v_57.memory[base + 1u] = as_type(bbox.y); + v_57.memory[base + 2u] = as_type(bbox.z); + v_57.memory[base + 3u] = as_type(bbox.w); } static inline __attribute__((always_inline)) -Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_57) { - Alloc a; - a.offset = offset; - return a; -} - -static inline __attribute__((always_inline)) -MallocResult malloc(thread const uint& size, device Memory& v_81, constant uint& v_81BufferSize) -{ - uint _87 = atomic_fetch_add_explicit((device atomic_uint*)&v_81.mem_offset, size, memory_order_relaxed); - uint offset = _87; - MallocResult r; - r.failed = (offset + size) > uint(int((v_81BufferSize - 8) / 4) * 4); - uint param = offset; - uint param_1 = size; - bool param_2 = !r.failed; - r.alloc = new_alloc(param, param_1, param_2); - if (r.failed) + uint _65 = atomic_fetch_add_explicit((device atomic_uint*)&v_57.mem_offset, size, memory_order_relaxed); + uint offset = _65; + if ((offset + size) > mem_size) { - uint _116 = atomic_fetch_max_explicit((device atomic_uint*)&v_81.mem_error, 1u, memory_order_relaxed); - return r; + uint _76 = atomic_fetch_or_explicit((device atomic_uint*)&v_57.mem_error, stage, memory_order_relaxed); + offset = 0u; } - return r; + return offset; } static inline __attribute__((always_inline)) @@ -171,7 +143,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset) } static inline __attribute__((always_inline)) -void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_81, constant uint& v_81BufferSize) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_57) { Alloc param = alloc; uint param_1 = offset; @@ -179,73 +151,56 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons { return; } - v_81.memory[offset] = val; + v_57.memory[offset] = val; } -static inline __attribute__((always_inline)) -void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_81, constant uint& v_81BufferSize) -{ - uint ix = ref.offset >> uint(2); - Alloc param = a; - uint param_1 = ix + 0u; - uint param_2 = s.element_ix; - write_mem(param, param_1, param_2, v_81, v_81BufferSize); -} - -kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_81 [[buffer(0)]], const device ConfigBuf& v_156 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(device Memory& v_57 [[buffer(0)]], const device ConfigBuf& v_101 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { threadgroup uint bitmaps[8][256]; - threadgroup short sh_alloc_failed; threadgroup uint count[8][256]; - threadgroup Alloc sh_chunk_alloc[256]; - constant uint& v_81BufferSize = spvBufferSizeConstants[0]; + threadgroup uint sh_chunk_offset[256]; uint my_partition = gl_WorkGroupID.x; for (uint i = 0u; i < 8u; i++) { bitmaps[i][gl_LocalInvocationID.x] = 0u; } - if (gl_LocalInvocationID.x == 0u) - { - sh_alloc_failed = short(false); - } - threadgroup_barrier(mem_flags::mem_threadgroup); uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; int x0 = 0; int y0 = 0; int x1 = 0; int y1 = 0; - if (element_ix < v_156.conf.n_elements) + if (element_ix < v_101.conf.n_elements) { uint param = element_ix; - DrawMonoid draw_monoid = load_draw_monoid(param, v_81, v_81BufferSize, v_156); + DrawMonoid draw_monoid = load_draw_monoid(param, v_57, v_101); uint path_ix = draw_monoid.path_ix; float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); uint clip_ix = draw_monoid.clip_ix; if (clip_ix > 0u) { uint param_1 = clip_ix - 1u; - clip_bbox = load_clip_bbox(param_1, v_81, v_81BufferSize, v_156); + clip_bbox = load_clip_bbox(param_1, v_57, v_101); } uint param_2 = path_ix; - float4 path_bbox = load_path_bbox(param_2, v_81, v_81BufferSize, v_156); + float4 path_bbox = load_path_bbox(param_2, v_57, v_101); float4 param_3 = path_bbox; float4 param_4 = clip_bbox; float4 bbox = bbox_intersect(param_3, param_4); - float4 _417 = bbox; - float4 _419 = bbox; - float2 _421 = fast::max(_417.xy, _419.zw); - bbox.z = _421.x; - bbox.w = _421.y; + float4 _354 = bbox; + float4 _356 = bbox; + float2 _358 = fast::max(_354.xy, _356.zw); + bbox.z = _358.x; + bbox.w = _358.y; uint param_5 = element_ix; float4 param_6 = bbox; - store_draw_bbox(param_5, param_6, v_81, v_81BufferSize, v_156); + store_draw_bbox(param_5, param_6, v_57, v_101); x0 = int(floor(bbox.x * 0.00390625)); y0 = int(floor(bbox.y * 0.00390625)); x1 = int(ceil(bbox.z * 0.00390625)); y1 = int(ceil(bbox.w * 0.00390625)); } - uint width_in_bins = ((v_156.conf.width_in_tiles + 16u) - 1u) / 16u; - uint height_in_bins = ((v_156.conf.height_in_tiles + 16u) - 1u) / 16u; + uint width_in_bins = ((v_101.conf.width_in_tiles + 16u) - 1u) / 16u; + uint height_in_bins = ((v_101.conf.height_in_tiles + 16u) - 1u) / 16u; x0 = clamp(x0, 0, int(width_in_bins)); x1 = clamp(x1, x0, int(width_in_bins)); y0 = clamp(y0, 0, int(height_in_bins)); @@ -260,7 +215,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); while (y < y1) { - uint _523 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); + uint _460 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); x++; if (x == x1) { @@ -275,47 +230,28 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x]))); count[i_1][gl_LocalInvocationID.x] = element_count; } - uint param_7 = 0u; - uint param_8 = 0u; - bool param_9 = true; - Alloc chunk_alloc = new_alloc(param_7, param_8, param_9); + uint chunk_offset = 0u; if (element_count != 0u) { - uint param_10 = element_count * 4u; - MallocResult _573 = malloc(param_10, v_81, v_81BufferSize); - MallocResult chunk = _573; - chunk_alloc = chunk.alloc; - sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; - if (chunk.failed) - { - sh_alloc_failed = short(true); - } + uint param_7 = element_count * 4u; + uint param_8 = v_101.conf.mem_size; + uint param_9 = 1u; + uint _510 = malloc_stage(param_7, param_8, param_9, v_57); + chunk_offset = _510; + sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset; } - uint out_ix = (v_156.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); - Alloc param_11; - param_11.offset = v_156.conf.bin_alloc.offset; - uint param_12 = out_ix; - uint param_13 = element_count; - write_mem(param_11, param_12, param_13, v_81, v_81BufferSize); - Alloc param_14; - param_14.offset = v_156.conf.bin_alloc.offset; - uint param_15 = out_ix + 1u; - uint param_16 = chunk_alloc.offset; - write_mem(param_14, param_15, param_16, v_81, v_81BufferSize); + uint out_ix = (v_101.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc param_10; + param_10.offset = v_101.conf.bin_alloc.offset; + uint param_11 = out_ix; + uint param_12 = element_count; + write_mem(param_10, param_11, param_12, v_57); + Alloc param_13; + param_13.offset = v_101.conf.bin_alloc.offset; + uint param_14 = out_ix + 1u; + uint param_15 = chunk_offset; + write_mem(param_13, param_14, param_15, v_57); threadgroup_barrier(mem_flags::mem_threadgroup); - bool _630; - if (!bool(sh_alloc_failed)) - { - _630 = v_81.mem_error != 0u; - } - else - { - _630 = bool(sh_alloc_failed); - } - if (_630) - { - return; - } x = x0; y = y0; while (y < y1) @@ -329,12 +265,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { idx += count[my_slice - 1u][bin_ix]; } - Alloc out_alloc = sh_chunk_alloc[bin_ix]; - uint out_offset = out_alloc.offset + (idx * 4u); - Alloc param_17 = out_alloc; - BinInstanceRef param_18 = BinInstanceRef{ out_offset }; - BinInstance param_19 = BinInstance{ element_ix }; - BinInstance_write(param_17, param_18, param_19, v_81, v_81BufferSize); + uint chunk_offset_1 = sh_chunk_offset[bin_ix]; + if (chunk_offset_1 != 0u) + { + v_57.memory[(chunk_offset_1 >> uint(2)) + idx] = element_ix; + } } x++; if (x == x1) diff --git a/piet-gpu/shader/gen/binning.spv b/piet-gpu/shader/gen/binning.spv index 30eacd6..1a5c2e1 100644 Binary files a/piet-gpu/shader/gen/binning.spv and b/piet-gpu/shader/gen/binning.spv differ diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil index 29a158e..d5123cb 100644 Binary files a/piet-gpu/shader/gen/clip_leaf.dxil and b/piet-gpu/shader/gen/clip_leaf.dxil differ diff --git a/piet-gpu/shader/gen/clip_leaf.hlsl b/piet-gpu/shader/gen/clip_leaf.hlsl index ed45bf1..4eb9994 100644 --- a/piet-gpu/shader/gen/clip_leaf.hlsl +++ b/piet-gpu/shader/gen/clip_leaf.hlsl @@ -17,6 +17,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -48,7 +49,7 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -static const Bic _393 = { 0u, 0u }; +static const Bic _394 = { 0u, 0u }; ByteAddressBuffer _80 : register(t1, space0); RWByteAddressBuffer _96 : register(u0, space0); @@ -71,9 +72,9 @@ groupshared float4 sh_bbox[256]; Bic load_bic(uint ix) { - uint base = (_80.Load(52) >> uint(2)) + (2u * ix); - Bic _286 = { _96.Load(base * 4 + 8), _96.Load((base + 1u) * 4 + 8) }; - return _286; + uint base = (_80.Load(56) >> uint(2)) + (2u * ix); + Bic _287 = { _96.Load(base * 4 + 12), _96.Load((base + 1u) * 4 + 12) }; + return _287; } Bic bic_combine(Bic x, Bic y) @@ -85,15 +86,15 @@ Bic bic_combine(Bic x, Bic y) ClipEl load_clip_el(uint ix) { - uint base = (_80.Load(56) >> uint(2)) + (5u * ix); - uint parent_ix = _96.Load(base * 4 + 8); - float x0 = asfloat(_96.Load((base + 1u) * 4 + 8)); - float y0 = asfloat(_96.Load((base + 2u) * 4 + 8)); - float x1 = asfloat(_96.Load((base + 3u) * 4 + 8)); - float y1 = asfloat(_96.Load((base + 4u) * 4 + 8)); + uint base = (_80.Load(60) >> uint(2)) + (5u * ix); + uint parent_ix = _96.Load(base * 4 + 12); + float x0 = asfloat(_96.Load((base + 1u) * 4 + 12)); + float y0 = asfloat(_96.Load((base + 2u) * 4 + 12)); + float x1 = asfloat(_96.Load((base + 3u) * 4 + 12)); + float y1 = asfloat(_96.Load((base + 4u) * 4 + 12)); float4 bbox = float4(x0, y0, x1, y1); - ClipEl _335 = { parent_ix, bbox }; - return _335; + ClipEl _336 = { parent_ix, bbox }; + return _336; } float4 bbox_intersect(float4 a, float4 b) @@ -103,9 +104,9 @@ float4 bbox_intersect(float4 a, float4 b) uint load_path_ix(uint ix) { - if (ix < _80.Load(80)) + if (ix < _80.Load(84)) { - return _96.Load(((_80.Load(48) >> uint(2)) + ix) * 4 + 8); + return _96.Load(((_80.Load(52) >> uint(2)) + ix) * 4 + 12); } else { @@ -115,11 +116,11 @@ uint load_path_ix(uint ix) float4 load_path_bbox(uint path_ix) { - uint base = (_80.Load(40) >> uint(2)) + (6u * path_ix); - float bbox_l = float(_96.Load(base * 4 + 8)) - 32768.0f; - float bbox_t = float(_96.Load((base + 1u) * 4 + 8)) - 32768.0f; - float bbox_r = float(_96.Load((base + 2u) * 4 + 8)) - 32768.0f; - float bbox_b = float(_96.Load((base + 3u) * 4 + 8)) - 32768.0f; + uint base = (_80.Load(44) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_96.Load(base * 4 + 12)) - 32768.0f; + float bbox_t = float(_96.Load((base + 1u) * 4 + 12)) - 32768.0f; + float bbox_r = float(_96.Load((base + 2u) * 4 + 12)) - 32768.0f; + float bbox_b = float(_96.Load((base + 3u) * 4 + 12)) - 32768.0f; float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); return bbox; } @@ -173,17 +174,17 @@ uint search_link(inout Bic bic) void store_clip_bbox(uint ix, float4 bbox) { - uint base = (_80.Load(60) >> uint(2)) + (4u * ix); - _96.Store(base * 4 + 8, asuint(bbox.x)); - _96.Store((base + 1u) * 4 + 8, asuint(bbox.y)); - _96.Store((base + 2u) * 4 + 8, asuint(bbox.z)); - _96.Store((base + 3u) * 4 + 8, asuint(bbox.w)); + uint base = (_80.Load(64) >> uint(2)) + (4u * ix); + _96.Store(base * 4 + 12, asuint(bbox.x)); + _96.Store((base + 1u) * 4 + 12, asuint(bbox.y)); + _96.Store((base + 2u) * 4 + 12, asuint(bbox.z)); + _96.Store((base + 3u) * 4 + 12, asuint(bbox.w)); } void comp_main() { uint th = gl_LocalInvocationID.x; - Bic bic = _393; + Bic bic = _394; if (th < gl_WorkGroupID.x) { uint param = th; @@ -240,8 +241,8 @@ void comp_main() uint param_6 = gl_GlobalInvocationID.x; uint inp = load_path_ix(param_6); bool is_push = int(inp) >= 0; - Bic _559 = { 1u - uint(is_push), uint(is_push) }; - bic = _559; + Bic _560 = { 1u - uint(is_push), uint(is_push) }; + bic = _560; sh_bic[th] = bic; if (is_push) { @@ -266,11 +267,11 @@ void comp_main() inbase = outbase; } GroupMemoryBarrierWithGroupSync(); - bic = _393; + bic = _394; Bic param_10 = bic; - uint _618 = search_link(param_10); + uint _619 = search_link(param_10); bic = param_10; - uint link = _618; + uint link = _619; sh_link[th] = link; GroupMemoryBarrierWithGroupSync(); uint grandparent; @@ -324,22 +325,22 @@ void comp_main() sh_bbox[th] = bbox; GroupMemoryBarrierWithGroupSync(); uint path_ix = inp; - bool _717 = !is_push; - bool _725; - if (_717) + bool _718 = !is_push; + bool _726; + if (_718) { - _725 = gl_GlobalInvocationID.x < _80.Load(80); + _726 = gl_GlobalInvocationID.x < _80.Load(84); } else { - _725 = _717; + _726 = _718; } - if (_725) + if (_726) { uint param_15 = parent; path_ix = load_path_ix(param_15); - uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (4u * (~inp)); - _96.Store(drawmonoid_out_base * 4 + 8, path_ix); + uint drawmonoid_out_base = (_80.Load(48) >> uint(2)) + (4u * (~inp)); + _96.Store(drawmonoid_out_base * 4 + 12, path_ix); if (int(grandparent) >= 0) { bbox = sh_bbox[grandparent]; diff --git a/piet-gpu/shader/gen/clip_leaf.msl b/piet-gpu/shader/gen/clip_leaf.msl index 5f5e0a7..c9456e8 100644 --- a/piet-gpu/shader/gen/clip_leaf.msl +++ b/piet-gpu/shader/gen/clip_leaf.msl @@ -24,6 +24,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -62,6 +63,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -275,9 +277,9 @@ kernel void main0(device Memory& v_96 [[buffer(0)]], const device ConfigBuf& v_8 threadgroup_barrier(mem_flags::mem_threadgroup); bic = Bic{ 0u, 0u }; Bic param_10 = bic; - uint _618 = search_link(param_10, gl_LocalInvocationID, sh_bic); + uint _619 = search_link(param_10, gl_LocalInvocationID, sh_bic); bic = param_10; - uint link = _618; + uint link = _619; sh_link[th] = link; threadgroup_barrier(mem_flags::mem_threadgroup); uint grandparent; @@ -331,17 +333,17 @@ kernel void main0(device Memory& v_96 [[buffer(0)]], const device ConfigBuf& v_8 sh_bbox[th] = bbox; threadgroup_barrier(mem_flags::mem_threadgroup); uint path_ix = inp; - bool _717 = !is_push; - bool _725; - if (_717) + bool _718 = !is_push; + bool _726; + if (_718) { - _725 = gl_GlobalInvocationID.x < v_80.conf.n_clip; + _726 = gl_GlobalInvocationID.x < v_80.conf.n_clip; } else { - _725 = _717; + _726 = _718; } - if (_725) + if (_726) { uint param_15 = parent; path_ix = load_path_ix(param_15, v_80, v_96); diff --git a/piet-gpu/shader/gen/clip_leaf.spv b/piet-gpu/shader/gen/clip_leaf.spv index beac64b..fe62632 100644 Binary files a/piet-gpu/shader/gen/clip_leaf.spv and b/piet-gpu/shader/gen/clip_leaf.spv differ diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil index 0dff71b..13ffb01 100644 Binary files a/piet-gpu/shader/gen/clip_reduce.dxil and b/piet-gpu/shader/gen/clip_reduce.dxil differ diff --git a/piet-gpu/shader/gen/clip_reduce.hlsl b/piet-gpu/shader/gen/clip_reduce.hlsl index 1276b5f..e031f84 100644 --- a/piet-gpu/shader/gen/clip_reduce.hlsl +++ b/piet-gpu/shader/gen/clip_reduce.hlsl @@ -17,6 +17,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -48,7 +49,7 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -static const Bic _267 = { 0u, 0u }; +static const Bic _268 = { 0u, 0u }; ByteAddressBuffer _64 : register(t1, space0); RWByteAddressBuffer _80 : register(u0, space0); @@ -77,39 +78,39 @@ Bic bic_combine(Bic x, Bic y) void store_bic(uint ix, Bic bic) { - uint base = (_64.Load(52) >> uint(2)) + (2u * ix); - _80.Store(base * 4 + 8, bic.a); - _80.Store((base + 1u) * 4 + 8, bic.b); + uint base = (_64.Load(56) >> uint(2)) + (2u * ix); + _80.Store(base * 4 + 12, bic.a); + _80.Store((base + 1u) * 4 + 12, bic.b); } float4 load_path_bbox(uint path_ix) { - uint base = (_64.Load(40) >> uint(2)) + (6u * path_ix); - float bbox_l = float(_80.Load(base * 4 + 8)) - 32768.0f; - float bbox_t = float(_80.Load((base + 1u) * 4 + 8)) - 32768.0f; - float bbox_r = float(_80.Load((base + 2u) * 4 + 8)) - 32768.0f; - float bbox_b = float(_80.Load((base + 3u) * 4 + 8)) - 32768.0f; + uint base = (_64.Load(44) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_80.Load(base * 4 + 12)) - 32768.0f; + float bbox_t = float(_80.Load((base + 1u) * 4 + 12)) - 32768.0f; + float bbox_r = float(_80.Load((base + 2u) * 4 + 12)) - 32768.0f; + float bbox_b = float(_80.Load((base + 3u) * 4 + 12)) - 32768.0f; float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); return bbox; } void store_clip_el(uint ix, ClipEl el) { - uint base = (_64.Load(56) >> uint(2)) + (5u * ix); - _80.Store(base * 4 + 8, el.parent_ix); - _80.Store((base + 1u) * 4 + 8, asuint(el.bbox.x)); - _80.Store((base + 2u) * 4 + 8, asuint(el.bbox.y)); - _80.Store((base + 3u) * 4 + 8, asuint(el.bbox.z)); - _80.Store((base + 4u) * 4 + 8, asuint(el.bbox.w)); + uint base = (_64.Load(60) >> uint(2)) + (5u * ix); + _80.Store(base * 4 + 12, el.parent_ix); + _80.Store((base + 1u) * 4 + 12, asuint(el.bbox.x)); + _80.Store((base + 2u) * 4 + 12, asuint(el.bbox.y)); + _80.Store((base + 3u) * 4 + 12, asuint(el.bbox.z)); + _80.Store((base + 4u) * 4 + 12, asuint(el.bbox.w)); } void comp_main() { uint th = gl_LocalInvocationID.x; - uint inp = _80.Load(((_64.Load(48) >> uint(2)) + gl_GlobalInvocationID.x) * 4 + 8); + uint inp = _80.Load(((_64.Load(52) >> uint(2)) + gl_GlobalInvocationID.x) * 4 + 12); bool is_push = int(inp) >= 0; - Bic _207 = { 1u - uint(is_push), uint(is_push) }; - Bic bic = _207; + Bic _208 = { 1u - uint(is_push), uint(is_push) }; + Bic bic = _208; sh_bic[gl_LocalInvocationID.x] = bic; for (uint i = 0u; i < 8u; i++) { @@ -132,21 +133,21 @@ void comp_main() } GroupMemoryBarrierWithGroupSync(); uint size = sh_bic[0].b; - bic = _267; + bic = _268; if ((th + 1u) < 256u) { bic = sh_bic[th + 1u]; } - bool _283; + bool _284; if (is_push) { - _283 = bic.a == 0u; + _284 = bic.a == 0u; } else { - _283 = is_push; + _284 = is_push; } - if (_283) + if (_284) { uint local_ix = (size - bic.b) - 1u; sh_parent[local_ix] = th; @@ -163,8 +164,8 @@ void comp_main() if (th < size) { uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); - ClipEl _331 = { parent_ix, bbox }; - ClipEl el = _331; + ClipEl _332 = { parent_ix, bbox }; + ClipEl el = _332; uint param_5 = gl_GlobalInvocationID.x; ClipEl param_6 = el; store_clip_el(param_5, param_6); diff --git a/piet-gpu/shader/gen/clip_reduce.msl b/piet-gpu/shader/gen/clip_reduce.msl index 26214f1..dd34e64 100644 --- a/piet-gpu/shader/gen/clip_reduce.msl +++ b/piet-gpu/shader/gen/clip_reduce.msl @@ -24,6 +24,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -62,6 +63,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -142,16 +144,16 @@ kernel void main0(device Memory& v_80 [[buffer(0)]], const device ConfigBuf& v_6 { bic = sh_bic[th + 1u]; } - bool _283; + bool _284; if (is_push) { - _283 = bic.a == 0u; + _284 = bic.a == 0u; } else { - _283 = is_push; + _284 = is_push; } - if (_283) + if (_284) { uint local_ix = (size - bic.b) - 1u; sh_parent[local_ix] = th; diff --git a/piet-gpu/shader/gen/clip_reduce.spv b/piet-gpu/shader/gen/clip_reduce.spv index ce0b9bb..40121e7 100644 Binary files a/piet-gpu/shader/gen/clip_reduce.spv and b/piet-gpu/shader/gen/clip_reduce.spv differ diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil index f71cc04..58e2da8 100644 Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl index a7f769f..673e879 100644 --- a/piet-gpu/shader/gen/coarse.hlsl +++ b/piet-gpu/shader/gen/coarse.hlsl @@ -3,12 +3,6 @@ struct Alloc uint offset; }; -struct MallocResult -{ - Alloc alloc; - bool failed; -}; - struct BinInstanceRef { uint offset; @@ -144,6 +138,7 @@ struct CmdRef struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -175,9 +170,9 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -RWByteAddressBuffer _266 : register(u0, space0); -ByteAddressBuffer _1020 : register(t1, space0); -ByteAddressBuffer _1399 : register(t2, space0); +RWByteAddressBuffer _267 : register(u0, space0); +ByteAddressBuffer _891 : register(t1, space0); +ByteAddressBuffer _1390 : register(t2, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -187,6 +182,7 @@ struct SPIRV_Cross_Input uint3 gl_LocalInvocationID : SV_GroupThreadID; }; +static bool mem_ok; groupshared uint sh_bitmaps[8][256]; groupshared Alloc sh_part_elements[256]; groupshared uint sh_part_count[256]; @@ -198,10 +194,17 @@ groupshared uint sh_tile_y0[256]; groupshared uint sh_tile_base[256]; groupshared uint sh_tile_count[256]; +bool check_deps(uint dep_stage) +{ + uint _273; + _267.InterlockedOr(4, 0u, _273); + return (_273 & dep_stage) == 0u; +} + Alloc slice_mem(Alloc a, uint offset, uint size) { - Alloc _343 = { a.offset + offset }; - return _343; + Alloc _331 = { a.offset + offset }; + return _331; } bool touch_mem(Alloc alloc, uint offset) @@ -217,11 +220,11 @@ uint read_mem(Alloc alloc, uint offset) { return 0u; } - uint v = _266.Load(offset * 4 + 8); + uint v = _267.Load(offset * 4 + 12); return v; } -Alloc new_alloc(uint offset, uint size, bool mem_ok) +Alloc new_alloc(uint offset, uint size, bool mem_ok_1) { Alloc a; a.offset = offset; @@ -230,8 +233,8 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok) BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) { - BinInstanceRef _361 = { ref.offset + (index * 4u) }; - return _361; + BinInstanceRef _340 = { ref.offset + (index * 4u) }; + return _340; } BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) @@ -259,8 +262,8 @@ Path Path_read(Alloc a, PathRef ref) uint raw2 = read_mem(param_4, param_5); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); - TileRef _424 = { raw2 }; - s.tiles = _424; + TileRef _404 = { raw2 }; + s.tiles = _404; return s; } @@ -268,14 +271,11 @@ void write_tile_alloc(uint el_ix, Alloc a) { } -Alloc read_tile_alloc(uint el_ix, bool mem_ok) +Alloc read_tile_alloc(uint el_ix, bool mem_ok_1) { - uint _907; - _266.GetDimensions(_907); - _907 = (_907 - 8) / 4; uint param = 0u; - uint param_1 = uint(int(_907) * 4); - bool param_2 = mem_ok; + uint param_1 = _891.Load(0); + bool param_2 = mem_ok_1; return new_alloc(param, param_1, param_2); } @@ -288,34 +288,25 @@ Tile Tile_read(Alloc a, TileRef ref) Alloc param_2 = a; uint param_3 = ix + 1u; uint raw1 = read_mem(param_2, param_3); - TileSegRef _449 = { raw0 }; + TileSegRef _429 = { raw0 }; Tile s; - s.tile = _449; + s.tile = _429; s.backdrop = int(raw1); return s; } -MallocResult malloc(uint size) +uint malloc_stage(uint size, uint mem_size, uint stage) { - uint _272; - _266.InterlockedAdd(0, size, _272); - uint offset = _272; - uint _279; - _266.GetDimensions(_279); - _279 = (_279 - 8) / 4; - MallocResult r; - r.failed = (offset + size) > uint(int(_279) * 4); - uint param = offset; - uint param_1 = size; - bool param_2 = !r.failed; - r.alloc = new_alloc(param, param_1, param_2); - if (r.failed) + uint _282; + _267.InterlockedAdd(0, size, _282); + uint offset = _282; + if ((offset + size) > mem_size) { - uint _301; - _266.InterlockedMax(4, 1u, _301); - return r; + uint _292; + _267.InterlockedOr(4, stage, _292); + offset = 0u; } - return r; + return offset; } void write_mem(Alloc alloc, uint offset, uint val) @@ -326,7 +317,7 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _266.Store(offset * 4 + 8, val); + _267.Store(offset * 4 + 12, val); } void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) @@ -344,37 +335,44 @@ void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) uint param_1 = ref.offset >> uint(2); uint param_2 = 11u; write_mem(param, param_1, param_2); - CmdJumpRef _900 = { ref.offset + 4u }; + CmdJumpRef _880 = { ref.offset + 4u }; Alloc param_3 = a; - CmdJumpRef param_4 = _900; + CmdJumpRef param_4 = _880; CmdJump param_5 = s; CmdJump_write(param_3, param_4, param_5); } -bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) +void alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) { if (cmd_ref.offset < cmd_limit) { - return true; + return; } uint param = 1024u; - MallocResult _928 = malloc(param); - MallocResult new_cmd = _928; - if (new_cmd.failed) + uint param_1 = _891.Load(0); + uint param_2 = 8u; + uint _915 = malloc_stage(param, param_1, param_2); + uint new_cmd = _915; + if (new_cmd == 0u) { - return false; + mem_ok = false; } - CmdJump _938 = { new_cmd.alloc.offset }; - CmdJump jump = _938; - Alloc param_1 = cmd_alloc; - CmdRef param_2 = cmd_ref; - CmdJump param_3 = jump; - Cmd_Jump_write(param_1, param_2, param_3); - cmd_alloc = new_cmd.alloc; - CmdRef _950 = { cmd_alloc.offset }; - cmd_ref = _950; - cmd_limit = (cmd_alloc.offset + 1024u) - 144u; - return true; + if (mem_ok) + { + CmdJump _926 = { new_cmd }; + CmdJump jump = _926; + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + CmdJump param_5 = jump; + Cmd_Jump_write(param_3, param_4, param_5); + } + uint param_6 = new_cmd; + uint param_7 = 1024u; + bool param_8 = true; + cmd_alloc = new_alloc(param_6, param_7, param_8); + CmdRef _940 = { new_cmd }; + cmd_ref = _940; + cmd_limit = (new_cmd + 1024u) - 144u; } void CmdFill_write(Alloc a, CmdFillRef ref, CmdFill s) @@ -396,9 +394,9 @@ void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) uint param_1 = ref.offset >> uint(2); uint param_2 = 1u; write_mem(param, param_1, param_2); - CmdFillRef _757 = { ref.offset + 4u }; + CmdFillRef _737 = { ref.offset + 4u }; Alloc param_3 = a; - CmdFillRef param_4 = _757; + CmdFillRef param_4 = _737; CmdFill param_5 = s; CmdFill_write(param_3, param_4, param_5); } @@ -430,9 +428,9 @@ void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) uint param_1 = ref.offset >> uint(2); uint param_2 = 2u; write_mem(param, param_1, param_2); - CmdStrokeRef _775 = { ref.offset + 4u }; + CmdStrokeRef _755 = { ref.offset + 4u }; Alloc param_3 = a; - CmdStrokeRef param_4 = _775; + CmdStrokeRef param_4 = _755; CmdStroke param_5 = s; CmdStroke_write(param_3, param_4, param_5); } @@ -443,30 +441,39 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) { if (tile.tile.offset != 0u) { - CmdFill _973 = { tile.tile.offset, tile.backdrop }; - CmdFill cmd_fill = _973; - Alloc param = alloc; - CmdRef param_1 = cmd_ref; - CmdFill param_2 = cmd_fill; - Cmd_Fill_write(param, param_1, param_2); + CmdFill _960 = { tile.tile.offset, tile.backdrop }; + CmdFill cmd_fill = _960; + if (mem_ok) + { + Alloc param = alloc; + CmdRef param_1 = cmd_ref; + CmdFill param_2 = cmd_fill; + Cmd_Fill_write(param, param_1, param_2); + } cmd_ref.offset += 12u; } else { - Alloc param_3 = alloc; - CmdRef param_4 = cmd_ref; - Cmd_Solid_write(param_3, param_4); + if (mem_ok) + { + Alloc param_3 = alloc; + CmdRef param_4 = cmd_ref; + Cmd_Solid_write(param_3, param_4); + } cmd_ref.offset += 4u; } } else { - CmdStroke _1003 = { tile.tile.offset, 0.5f * linewidth }; - CmdStroke cmd_stroke = _1003; - Alloc param_5 = alloc; - CmdRef param_6 = cmd_ref; - CmdStroke param_7 = cmd_stroke; - Cmd_Stroke_write(param_5, param_6, param_7); + CmdStroke _996 = { tile.tile.offset, 0.5f * linewidth }; + CmdStroke cmd_stroke = _996; + if (mem_ok) + { + Alloc param_5 = alloc; + CmdRef param_6 = cmd_ref; + CmdStroke param_7 = cmd_stroke; + Cmd_Stroke_write(param_5, param_6, param_7); + } cmd_ref.offset += 12u; } } @@ -486,9 +493,9 @@ void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) uint param_1 = ref.offset >> uint(2); uint param_2 = 5u; write_mem(param, param_1, param_2); - CmdColorRef _801 = { ref.offset + 4u }; + CmdColorRef _781 = { ref.offset + 4u }; Alloc param_3 = a; - CmdColorRef param_4 = _801; + CmdColorRef param_4 = _781; CmdColor param_5 = s; CmdColor_write(param_3, param_4, param_5); } @@ -520,9 +527,9 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) uint param_1 = ref.offset >> uint(2); uint param_2 = 6u; write_mem(param, param_1, param_2); - CmdLinGradRef _819 = { ref.offset + 4u }; + CmdLinGradRef _799 = { ref.offset + 4u }; Alloc param_3 = a; - CmdLinGradRef param_4 = _819; + CmdLinGradRef param_4 = _799; CmdLinGrad param_5 = s; CmdLinGrad_write(param_3, param_4, param_5); } @@ -582,9 +589,9 @@ void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s) uint param_1 = ref.offset >> uint(2); uint param_2 = 7u; write_mem(param, param_1, param_2); - CmdRadGradRef _837 = { ref.offset + 4u }; + CmdRadGradRef _817 = { ref.offset + 4u }; Alloc param_3 = a; - CmdRadGradRef param_4 = _837; + CmdRadGradRef param_4 = _817; CmdRadGrad param_5 = s; CmdRadGrad_write(param_3, param_4, param_5); } @@ -608,9 +615,9 @@ void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) uint param_1 = ref.offset >> uint(2); uint param_2 = 8u; write_mem(param, param_1, param_2); - CmdImageRef _855 = { ref.offset + 4u }; + CmdImageRef _835 = { ref.offset + 4u }; Alloc param_3 = a; - CmdImageRef param_4 = _855; + CmdImageRef param_4 = _835; CmdImage param_5 = s; CmdImage_write(param_3, param_4, param_5); } @@ -638,9 +645,9 @@ void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) uint param_1 = ref.offset >> uint(2); uint param_2 = 10u; write_mem(param, param_1, param_2); - CmdEndClipRef _881 = { ref.offset + 4u }; + CmdEndClipRef _861 = { ref.offset + 4u }; Alloc param_3 = a; - CmdEndClipRef param_4 = _881; + CmdEndClipRef param_4 = _861; CmdEndClip param_5 = s; CmdEndClip_write(param_3, param_4, param_5); } @@ -653,35 +660,34 @@ void Cmd_End_write(Alloc a, CmdRef ref) write_mem(param, param_1, param_2); } -void alloc_write(Alloc a, uint offset, Alloc alloc) -{ - Alloc param = a; - uint param_1 = offset >> uint(2); - uint param_2 = alloc.offset; - write_mem(param, param_1, param_2); -} - void comp_main() { - uint width_in_bins = ((_1020.Load(8) + 16u) - 1u) / 16u; + mem_ok = true; + uint param = 7u; + bool _1012 = check_deps(param); + if (!_1012) + { + return; + } + uint width_in_bins = ((_891.Load(12) + 16u) - 1u) / 16u; uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; uint partition_ix = 0u; - uint n_partitions = ((_1020.Load(0) + 256u) - 1u) / 256u; + uint n_partitions = ((_891.Load(4) + 256u) - 1u) / 256u; uint th_ix = gl_LocalInvocationID.x; uint bin_tile_x = 16u * gl_WorkGroupID.x; uint bin_tile_y = 16u * gl_WorkGroupID.y; uint tile_x = gl_LocalInvocationID.x % 16u; uint tile_y = gl_LocalInvocationID.x / 16u; - uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.Load(8)) + bin_tile_x) + tile_x; - Alloc _1085; - _1085.offset = _1020.Load(24); - Alloc param; - param.offset = _1085.offset; - uint param_1 = this_tile_ix * 1024u; - uint param_2 = 1024u; - Alloc cmd_alloc = slice_mem(param, param_1, param_2); - CmdRef _1094 = { cmd_alloc.offset }; - CmdRef cmd_ref = _1094; + uint this_tile_ix = (((bin_tile_y + tile_y) * _891.Load(12)) + bin_tile_x) + tile_x; + Alloc _1082; + _1082.offset = _891.Load(28); + Alloc param_1; + param_1.offset = _1082.offset; + uint param_2 = this_tile_ix * 1024u; + uint param_3 = 1024u; + Alloc cmd_alloc = slice_mem(param_1, param_2, param_3); + CmdRef _1091 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1091; uint cmd_limit = (cmd_ref.offset + 1024u) - 144u; uint clip_depth = 0u; uint clip_zero_depth = 0u; @@ -689,25 +695,24 @@ void comp_main() uint wr_ix = 0u; uint part_start_ix = 0u; uint ready_ix = 0u; - Alloc param_3 = cmd_alloc; - uint param_4 = 0u; - uint param_5 = 8u; - Alloc scratch_alloc = slice_mem(param_3, param_4, param_5); + Alloc param_4 = cmd_alloc; + uint param_5 = 0u; + uint param_6 = 8u; + Alloc scratch_alloc = slice_mem(param_4, param_5, param_6); cmd_ref.offset += 4u; uint render_blend_depth = 0u; uint max_blend_depth = 0u; - uint drawmonoid_start = _1020.Load(44) >> uint(2); - uint drawtag_start = _1020.Load(100) >> uint(2); - uint drawdata_start = _1020.Load(104) >> uint(2); - uint drawinfo_start = _1020.Load(68) >> uint(2); - bool mem_ok = _266.Load(4) == 0u; - Alloc param_6; - Alloc param_8; - uint _1331; + uint drawmonoid_start = _891.Load(48) >> uint(2); + uint drawtag_start = _891.Load(104) >> uint(2); + uint drawdata_start = _891.Load(108) >> uint(2); + uint drawinfo_start = _891.Load(72) >> uint(2); + Alloc param_7; + Alloc param_9; + uint _1322; uint element_ix; - Alloc param_17; + Alloc param_18; uint tile_count; - uint _1632; + uint _1622; float linewidth; CmdLinGrad cmd_lin; CmdRadGrad cmd_rad; @@ -717,40 +722,40 @@ void comp_main() { sh_bitmaps[i][th_ix] = 0u; } - bool _1383; + bool _1374; for (;;) { if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) { part_start_ix = ready_ix; uint count = 0u; - bool _1181 = th_ix < 256u; - bool _1189; - if (_1181) + bool _1174 = th_ix < 256u; + bool _1182; + if (_1174) { - _1189 = (partition_ix + th_ix) < n_partitions; + _1182 = (partition_ix + th_ix) < n_partitions; } else { - _1189 = _1181; + _1182 = _1174; } - if (_1189) + if (_1182) { - uint in_ix = (_1020.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); - Alloc _1206; - _1206.offset = _1020.Load(20); - param_6.offset = _1206.offset; - uint param_7 = in_ix; - count = read_mem(param_6, param_7); - Alloc _1217; - _1217.offset = _1020.Load(20); - param_8.offset = _1217.offset; - uint param_9 = in_ix + 1u; - uint offset = read_mem(param_8, param_9); - uint param_10 = offset; - uint param_11 = count * 4u; - bool param_12 = mem_ok; - sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12); + uint in_ix = (_891.Load(24) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + Alloc _1200; + _1200.offset = _891.Load(24); + param_7.offset = _1200.offset; + uint param_8 = in_ix; + count = read_mem(param_7, param_8); + Alloc _1211; + _1211.offset = _891.Load(24); + param_9.offset = _1211.offset; + uint param_10 = in_ix + 1u; + uint offset = read_mem(param_9, param_10); + uint param_11 = offset; + uint param_12 = count * 4u; + bool param_13 = true; + sh_part_elements[th_ix] = new_alloc(param_11, param_12, param_13); } for (uint i_1 = 0u; i_1 < 8u; i_1++) { @@ -777,7 +782,7 @@ void comp_main() partition_ix += 256u; } uint ix = rd_ix + th_ix; - if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok) + if ((ix >= wr_ix) && (ix < ready_ix)) { uint part_ix = 0u; for (uint i_2 = 0u; i_2 < 8u; i_2++) @@ -790,35 +795,35 @@ void comp_main() } if (part_ix > 0u) { - _1331 = sh_part_count[part_ix - 1u]; + _1322 = sh_part_count[part_ix - 1u]; } else { - _1331 = part_start_ix; + _1322 = part_start_ix; } - ix -= _1331; + ix -= _1322; Alloc bin_alloc = sh_part_elements[part_ix]; - BinInstanceRef _1350 = { bin_alloc.offset }; - BinInstanceRef inst_ref = _1350; - BinInstanceRef param_13 = inst_ref; - uint param_14 = ix; - Alloc param_15 = bin_alloc; - BinInstanceRef param_16 = BinInstance_index(param_13, param_14); - BinInstance inst = BinInstance_read(param_15, param_16); + BinInstanceRef _1341 = { bin_alloc.offset }; + BinInstanceRef inst_ref = _1341; + BinInstanceRef param_14 = inst_ref; + uint param_15 = ix; + Alloc param_16 = bin_alloc; + BinInstanceRef param_17 = BinInstance_index(param_14, param_15); + BinInstance inst = BinInstance_read(param_16, param_17); sh_elements[th_ix] = inst.element_ix; } GroupMemoryBarrierWithGroupSync(); wr_ix = min((rd_ix + 256u), ready_ix); - bool _1373 = (wr_ix - rd_ix) < 256u; - if (_1373) + bool _1364 = (wr_ix - rd_ix) < 256u; + if (_1364) { - _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + _1374 = (wr_ix < ready_ix) || (partition_ix < n_partitions); } else { - _1383 = _1373; + _1374 = _1364; } - if (_1383) + if (_1374) { continue; } @@ -831,7 +836,7 @@ void comp_main() if ((th_ix + rd_ix) < wr_ix) { element_ix = sh_elements[th_ix]; - tag = _1399.Load((drawtag_start + element_ix) * 4 + 0); + tag = _1390.Load((drawtag_start + element_ix) * 4 + 0); } switch (tag) { @@ -843,13 +848,13 @@ void comp_main() case 37u: { uint drawmonoid_base = drawmonoid_start + (4u * element_ix); - uint path_ix = _266.Load(drawmonoid_base * 4 + 8); - PathRef _1424 = { _1020.Load(16) + (path_ix * 12u) }; - Alloc _1427; - _1427.offset = _1020.Load(16); - param_17.offset = _1427.offset; - PathRef param_18 = _1424; - Path path = Path_read(param_17, param_18); + uint path_ix = _267.Load(drawmonoid_base * 4 + 12); + PathRef _1415 = { _891.Load(20) + (path_ix * 12u) }; + Alloc _1418; + _1418.offset = _891.Load(20); + param_18.offset = _1418.offset; + PathRef param_19 = _1415; + Path path = Path_read(param_18, param_19); uint stride = path.bbox.z - path.bbox.x; sh_tile_stride[th_ix] = stride; int dx = int(path.bbox.x) - int(bin_tile_x); @@ -864,13 +869,13 @@ void comp_main() tile_count = uint(x1 - x0) * uint(y1 - y0); uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); sh_tile_base[th_ix] = base; - uint param_19 = path.tiles.offset; - uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; - bool param_21 = mem_ok; - Alloc path_alloc = new_alloc(param_19, param_20, param_21); - uint param_22 = th_ix; - Alloc param_23 = path_alloc; - write_tile_alloc(param_22, param_23); + uint param_20 = path.tiles.offset; + uint param_21 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_22 = true; + Alloc path_alloc = new_alloc(param_20, param_21, param_22); + uint param_23 = th_ix; + Alloc param_24 = path_alloc; + write_tile_alloc(param_23, param_24); break; } default: @@ -904,62 +909,59 @@ void comp_main() } } uint element_ix_1 = sh_elements[el_ix]; - uint tag_1 = _1399.Load((drawtag_start + element_ix_1) * 4 + 0); + uint tag_1 = _1390.Load((drawtag_start + element_ix_1) * 4 + 0); if (el_ix > 0u) { - _1632 = sh_tile_count[el_ix - 1u]; + _1622 = sh_tile_count[el_ix - 1u]; } else { - _1632 = 0u; + _1622 = 0u; } - uint seq_ix = ix_1 - _1632; + uint seq_ix = ix_1 - _1622; uint width = sh_tile_width[el_ix]; uint x = sh_tile_x0[el_ix] + (seq_ix % width); uint y = sh_tile_y0[el_ix] + (seq_ix / width); bool include_tile = false; - if (mem_ok) + uint param_25 = el_ix; + bool param_26 = true; + TileRef _1670 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Alloc param_27 = read_tile_alloc(param_25, param_26); + TileRef param_28 = _1670; + Tile tile = Tile_read(param_27, param_28); + bool is_clip = (tag_1 & 1u) != 0u; + bool is_blend = false; + if (is_clip) { - uint param_24 = el_ix; - bool param_25 = mem_ok; - TileRef _1684 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; - Alloc param_26 = read_tile_alloc(param_24, param_25); - TileRef param_27 = _1684; - Tile tile = Tile_read(param_26, param_27); - bool is_clip = (tag_1 & 1u) != 0u; - bool is_blend = false; - if (is_clip) - { - uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); - uint scene_offset = _266.Load((drawmonoid_base_1 + 2u) * 4 + 8); - uint dd = drawdata_start + (scene_offset >> uint(2)); - uint blend = _1399.Load(dd * 4 + 0); - is_blend = blend != 32771u; - } - bool _1720 = tile.tile.offset != 0u; - bool _1729; - if (!_1720) - { - _1729 = (tile.backdrop == 0) == is_clip; - } - else - { - _1729 = _1720; - } - include_tile = _1729 || is_blend; + uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); + uint scene_offset = _267.Load((drawmonoid_base_1 + 2u) * 4 + 12); + uint dd = drawdata_start + (scene_offset >> uint(2)); + uint blend = _1390.Load(dd * 4 + 0); + is_blend = blend != 32771u; } + bool _1706 = tile.tile.offset != 0u; + bool _1715; + if (!_1706) + { + _1715 = (tile.backdrop == 0) == is_clip; + } + else + { + _1715 = _1706; + } + include_tile = _1715 || is_blend; if (include_tile) { uint el_slice = el_ix / 32u; uint el_mask = 1u << (el_ix & 31u); - uint _1751; - InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1751); + uint _1737; + InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1737); } } GroupMemoryBarrierWithGroupSync(); uint slice_ix = 0u; uint bitmap = sh_bitmaps[0][th_ix]; - while (mem_ok) + while (true) { if (bitmap == 0u) { @@ -977,178 +979,173 @@ void comp_main() uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap))); uint element_ix_2 = sh_elements[element_ref_ix]; bitmap &= (bitmap - 1u); - uint drawtag = _1399.Load((drawtag_start + element_ix_2) * 4 + 0); + uint drawtag = _1390.Load((drawtag_start + element_ix_2) * 4 + 0); if (clip_zero_depth == 0u) { - uint param_28 = element_ref_ix; - bool param_29 = mem_ok; - TileRef _1828 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; - Alloc param_30 = read_tile_alloc(param_28, param_29); - TileRef param_31 = _1828; - Tile tile_1 = Tile_read(param_30, param_31); + uint param_29 = element_ref_ix; + bool param_30 = true; + TileRef _1812 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_31 = read_tile_alloc(param_29, param_30); + TileRef param_32 = _1812; + Tile tile_1 = Tile_read(param_31, param_32); uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); - uint scene_offset_1 = _266.Load((drawmonoid_base_2 + 2u) * 4 + 8); - uint info_offset = _266.Load((drawmonoid_base_2 + 3u) * 4 + 8); + uint scene_offset_1 = _267.Load((drawmonoid_base_2 + 2u) * 4 + 12); + uint info_offset = _267.Load((drawmonoid_base_2 + 3u) * 4 + 12); uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2)); uint di = drawinfo_start + (info_offset >> uint(2)); switch (drawtag) { case 68u: { - linewidth = asfloat(_266.Load(di * 4 + 8)); - Alloc param_32 = cmd_alloc; - CmdRef param_33 = cmd_ref; - uint param_34 = cmd_limit; - bool _1876 = alloc_cmd(param_32, param_33, param_34); - cmd_alloc = param_32; - cmd_ref = param_33; - cmd_limit = param_34; - if (!_1876) + linewidth = asfloat(_267.Load(di * 4 + 12)); + Alloc param_33 = cmd_alloc; + CmdRef param_34 = cmd_ref; + uint param_35 = cmd_limit; + alloc_cmd(param_33, param_34, param_35); + cmd_alloc = param_33; + cmd_ref = param_34; + cmd_limit = param_35; + Alloc param_36 = cmd_alloc; + CmdRef param_37 = cmd_ref; + Tile param_38 = tile_1; + float param_39 = linewidth; + write_fill(param_36, param_37, param_38, param_39); + cmd_ref = param_37; + uint rgba = _1390.Load(dd_1 * 4 + 0); + if (mem_ok) { - break; + CmdColor _1882 = { rgba }; + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdColor param_42 = _1882; + Cmd_Color_write(param_40, param_41, param_42); } - Alloc param_35 = cmd_alloc; - CmdRef param_36 = cmd_ref; - Tile param_37 = tile_1; - float param_38 = linewidth; - write_fill(param_35, param_36, param_37, param_38); - cmd_ref = param_36; - uint rgba = _1399.Load(dd_1 * 4 + 0); - CmdColor _1899 = { rgba }; - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - CmdColor param_41 = _1899; - Cmd_Color_write(param_39, param_40, param_41); cmd_ref.offset += 8u; break; } case 276u: { - Alloc param_42 = cmd_alloc; - CmdRef param_43 = cmd_ref; - uint param_44 = cmd_limit; - bool _1917 = alloc_cmd(param_42, param_43, param_44); - cmd_alloc = param_42; - cmd_ref = param_43; - cmd_limit = param_44; - if (!_1917) + Alloc param_43 = cmd_alloc; + CmdRef param_44 = cmd_ref; + uint param_45 = cmd_limit; + alloc_cmd(param_43, param_44, param_45); + cmd_alloc = param_43; + cmd_ref = param_44; + cmd_limit = param_45; + linewidth = asfloat(_267.Load(di * 4 + 12)); + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + Tile param_48 = tile_1; + float param_49 = linewidth; + write_fill(param_46, param_47, param_48, param_49); + cmd_ref = param_47; + cmd_lin.index = _1390.Load(dd_1 * 4 + 0); + cmd_lin.line_x = asfloat(_267.Load((di + 1u) * 4 + 12)); + cmd_lin.line_y = asfloat(_267.Load((di + 2u) * 4 + 12)); + cmd_lin.line_c = asfloat(_267.Load((di + 3u) * 4 + 12)); + if (mem_ok) { - break; + Alloc param_50 = cmd_alloc; + CmdRef param_51 = cmd_ref; + CmdLinGrad param_52 = cmd_lin; + Cmd_LinGrad_write(param_50, param_51, param_52); } - linewidth = asfloat(_266.Load(di * 4 + 8)); - Alloc param_45 = cmd_alloc; - CmdRef param_46 = cmd_ref; - Tile param_47 = tile_1; - float param_48 = linewidth; - write_fill(param_45, param_46, param_47, param_48); - cmd_ref = param_46; - cmd_lin.index = _1399.Load(dd_1 * 4 + 0); - cmd_lin.line_x = asfloat(_266.Load((di + 1u) * 4 + 8)); - cmd_lin.line_y = asfloat(_266.Load((di + 2u) * 4 + 8)); - cmd_lin.line_c = asfloat(_266.Load((di + 3u) * 4 + 8)); - Alloc param_49 = cmd_alloc; - CmdRef param_50 = cmd_ref; - CmdLinGrad param_51 = cmd_lin; - Cmd_LinGrad_write(param_49, param_50, param_51); cmd_ref.offset += 20u; break; } case 732u: { - Alloc param_52 = cmd_alloc; - CmdRef param_53 = cmd_ref; - uint param_54 = cmd_limit; - bool _1981 = alloc_cmd(param_52, param_53, param_54); - cmd_alloc = param_52; - cmd_ref = param_53; - cmd_limit = param_54; - if (!_1981) + Alloc param_53 = cmd_alloc; + CmdRef param_54 = cmd_ref; + uint param_55 = cmd_limit; + alloc_cmd(param_53, param_54, param_55); + cmd_alloc = param_53; + cmd_ref = param_54; + cmd_limit = param_55; + linewidth = asfloat(_267.Load(di * 4 + 12)); + Alloc param_56 = cmd_alloc; + CmdRef param_57 = cmd_ref; + Tile param_58 = tile_1; + float param_59 = linewidth; + write_fill(param_56, param_57, param_58, param_59); + cmd_ref = param_57; + cmd_rad.index = _1390.Load(dd_1 * 4 + 0); + cmd_rad.mat = asfloat(uint4(_267.Load((di + 1u) * 4 + 12), _267.Load((di + 2u) * 4 + 12), _267.Load((di + 3u) * 4 + 12), _267.Load((di + 4u) * 4 + 12))); + cmd_rad.xlat = asfloat(uint2(_267.Load((di + 5u) * 4 + 12), _267.Load((di + 6u) * 4 + 12))); + cmd_rad.c1 = asfloat(uint2(_267.Load((di + 7u) * 4 + 12), _267.Load((di + 8u) * 4 + 12))); + cmd_rad.ra = asfloat(_267.Load((di + 9u) * 4 + 12)); + cmd_rad.roff = asfloat(_267.Load((di + 10u) * 4 + 12)); + if (mem_ok) { - break; + Alloc param_60 = cmd_alloc; + CmdRef param_61 = cmd_ref; + CmdRadGrad param_62 = cmd_rad; + Cmd_RadGrad_write(param_60, param_61, param_62); } - linewidth = asfloat(_266.Load(di * 4 + 8)); - Alloc param_55 = cmd_alloc; - CmdRef param_56 = cmd_ref; - Tile param_57 = tile_1; - float param_58 = linewidth; - write_fill(param_55, param_56, param_57, param_58); - cmd_ref = param_56; - cmd_rad.index = _1399.Load(dd_1 * 4 + 0); - cmd_rad.mat = asfloat(uint4(_266.Load((di + 1u) * 4 + 8), _266.Load((di + 2u) * 4 + 8), _266.Load((di + 3u) * 4 + 8), _266.Load((di + 4u) * 4 + 8))); - cmd_rad.xlat = asfloat(uint2(_266.Load((di + 5u) * 4 + 8), _266.Load((di + 6u) * 4 + 8))); - cmd_rad.c1 = asfloat(uint2(_266.Load((di + 7u) * 4 + 8), _266.Load((di + 8u) * 4 + 8))); - cmd_rad.ra = asfloat(_266.Load((di + 9u) * 4 + 8)); - cmd_rad.roff = asfloat(_266.Load((di + 10u) * 4 + 8)); - Alloc param_59 = cmd_alloc; - CmdRef param_60 = cmd_ref; - CmdRadGrad param_61 = cmd_rad; - Cmd_RadGrad_write(param_59, param_60, param_61); cmd_ref.offset += 48u; break; } case 72u: { - linewidth = asfloat(_266.Load(di * 4 + 8)); - Alloc param_62 = cmd_alloc; - CmdRef param_63 = cmd_ref; - uint param_64 = cmd_limit; - bool _2087 = alloc_cmd(param_62, param_63, param_64); - cmd_alloc = param_62; - cmd_ref = param_63; - cmd_limit = param_64; - if (!_2087) - { - break; - } - Alloc param_65 = cmd_alloc; - CmdRef param_66 = cmd_ref; - Tile param_67 = tile_1; - float param_68 = linewidth; - write_fill(param_65, param_66, param_67, param_68); - cmd_ref = param_66; - uint index = _1399.Load(dd_1 * 4 + 0); - uint raw1 = _1399.Load((dd_1 + 1u) * 4 + 0); + Alloc param_63 = cmd_alloc; + CmdRef param_64 = cmd_ref; + uint param_65 = cmd_limit; + alloc_cmd(param_63, param_64, param_65); + cmd_alloc = param_63; + cmd_ref = param_64; + cmd_limit = param_65; + linewidth = asfloat(_267.Load(di * 4 + 12)); + Alloc param_66 = cmd_alloc; + CmdRef param_67 = cmd_ref; + Tile param_68 = tile_1; + float param_69 = linewidth; + write_fill(param_66, param_67, param_68, param_69); + cmd_ref = param_67; + uint index = _1390.Load(dd_1 * 4 + 0); + uint raw1 = _1390.Load((dd_1 + 1u) * 4 + 0); int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); - CmdImage _2126 = { index, offset_1 }; - Alloc param_69 = cmd_alloc; - CmdRef param_70 = cmd_ref; - CmdImage param_71 = _2126; - Cmd_Image_write(param_69, param_70, param_71); + if (mem_ok) + { + CmdImage _2106 = { index, offset_1 }; + Alloc param_70 = cmd_alloc; + CmdRef param_71 = cmd_ref; + CmdImage param_72 = _2106; + Cmd_Image_write(param_70, param_71, param_72); + } cmd_ref.offset += 12u; break; } case 5u: { - bool _2140 = tile_1.tile.offset == 0u; - bool _2146; - if (_2140) + bool _2120 = tile_1.tile.offset == 0u; + bool _2126; + if (_2120) { - _2146 = tile_1.backdrop == 0; + _2126 = tile_1.backdrop == 0; } else { - _2146 = _2140; + _2126 = _2120; } - if (_2146) + if (_2126) { clip_zero_depth = clip_depth + 1u; } else { - Alloc param_72 = cmd_alloc; - CmdRef param_73 = cmd_ref; - uint param_74 = cmd_limit; - bool _2158 = alloc_cmd(param_72, param_73, param_74); - cmd_alloc = param_72; - cmd_ref = param_73; - cmd_limit = param_74; - if (!_2158) + Alloc param_73 = cmd_alloc; + CmdRef param_74 = cmd_ref; + uint param_75 = cmd_limit; + alloc_cmd(param_73, param_74, param_75); + cmd_alloc = param_73; + cmd_ref = param_74; + cmd_limit = param_75; + if (mem_ok) { - break; + Alloc param_76 = cmd_alloc; + CmdRef param_77 = cmd_ref; + Cmd_BeginClip_write(param_76, param_77); } - Alloc param_75 = cmd_alloc; - CmdRef param_76 = cmd_ref; - Cmd_BeginClip_write(param_75, param_76); cmd_ref.offset += 4u; render_blend_depth++; max_blend_depth = max(max_blend_depth, render_blend_depth); @@ -1159,29 +1156,21 @@ void comp_main() case 37u: { clip_depth--; - Alloc param_77 = cmd_alloc; - CmdRef param_78 = cmd_ref; - uint param_79 = cmd_limit; - bool _2191 = alloc_cmd(param_77, param_78, param_79); - cmd_alloc = param_77; - cmd_ref = param_78; - cmd_limit = param_79; - if (!_2191) + Alloc param_78 = cmd_alloc; + CmdRef param_79 = cmd_ref; + Tile param_80 = tile_1; + float param_81 = -1.0f; + write_fill(param_78, param_79, param_80, param_81); + cmd_ref = param_79; + uint blend_1 = _1390.Load(dd_1 * 4 + 0); + if (mem_ok) { - break; + CmdEndClip _2182 = { blend_1 }; + Alloc param_82 = cmd_alloc; + CmdRef param_83 = cmd_ref; + CmdEndClip param_84 = _2182; + Cmd_EndClip_write(param_82, param_83, param_84); } - Alloc param_80 = cmd_alloc; - CmdRef param_81 = cmd_ref; - Tile param_82 = tile_1; - float param_83 = -1.0f; - write_fill(param_80, param_81, param_82, param_83); - cmd_ref = param_81; - uint blend_1 = _1399.Load(dd_1 * 4 + 0); - CmdEndClip _2214 = { blend_1 }; - Alloc param_84 = cmd_alloc; - CmdRef param_85 = cmd_ref; - CmdEndClip param_86 = _2214; - Cmd_EndClip_write(param_84, param_85, param_86); cmd_ref.offset += 8u; render_blend_depth--; break; @@ -1216,31 +1205,34 @@ void comp_main() break; } } - bool _2263 = (bin_tile_x + tile_x) < _1020.Load(8); - bool _2272; - if (_2263) + bool _2231 = (bin_tile_x + tile_x) < _891.Load(12); + bool _2240; + if (_2231) { - _2272 = (bin_tile_y + tile_y) < _1020.Load(12); + _2240 = (bin_tile_y + tile_y) < _891.Load(16); } else { - _2272 = _2263; + _2240 = _2231; } - if (_2272) + if (_2240) { - Alloc param_87 = cmd_alloc; - CmdRef param_88 = cmd_ref; - Cmd_End_write(param_87, param_88); + if (mem_ok) + { + Alloc param_85 = cmd_alloc; + CmdRef param_86 = cmd_ref; + Cmd_End_write(param_85, param_86); + } if (max_blend_depth > 4u) { uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u; - uint param_89 = scratch_size; - MallocResult _2293 = malloc(param_89); - MallocResult scratch = _2293; - Alloc param_90 = scratch_alloc; - uint param_91 = scratch_alloc.offset; - Alloc param_92 = scratch.alloc; - alloc_write(param_90, param_91, param_92); + uint _2264; + _267.InterlockedAdd(8, scratch_size, _2264); + uint scratch = _2264; + Alloc param_87 = scratch_alloc; + uint param_88 = scratch_alloc.offset >> uint(2); + uint param_89 = scratch; + write_mem(param_87, param_88, param_89); } } } diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl index d84add1..5df99b9 100644 --- a/piet-gpu/shader/gen/coarse.msl +++ b/piet-gpu/shader/gen/coarse.msl @@ -19,12 +19,6 @@ struct Alloc uint offset; }; -struct MallocResult -{ - Alloc alloc; - bool failed; -}; - struct BinInstanceRef { uint offset; @@ -162,6 +156,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -172,6 +167,7 @@ struct Alloc_1 struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -213,6 +209,13 @@ struct SceneBuf constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); +static inline __attribute__((always_inline)) +bool check_deps(thread const uint& dep_stage, device Memory& v_267) +{ + uint _273 = atomic_fetch_or_explicit((device atomic_uint*)&v_267.mem_error, 0u, memory_order_relaxed); + return (_273 & dep_stage) == 0u; +} + static inline __attribute__((always_inline)) Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) { @@ -226,7 +229,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset) } static inline __attribute__((always_inline)) -uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_266, constant uint& v_266BufferSize) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_267) { Alloc param = alloc; uint param_1 = offset; @@ -234,7 +237,7 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor { return 0u; } - uint v = v_266.memory[offset]; + uint v = v_267.memory[offset]; return v; } @@ -253,30 +256,30 @@ BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const } static inline __attribute__((always_inline)) -BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); + uint raw0 = read_mem(param, param_1, v_267); BinInstance s; s.element_ix = raw0; return s; } static inline __attribute__((always_inline)) -Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); + uint raw0 = read_mem(param, param_1, v_267); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize); + uint raw1 = read_mem(param_2, param_3, v_267); Alloc param_4 = a; uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_266, v_266BufferSize); + uint raw2 = read_mem(param_4, param_5, v_267); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); s.tiles = TileRef{ raw2 }; @@ -289,24 +292,24 @@ void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a) } static inline __attribute__((always_inline)) -Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_266, constant uint& v_266BufferSize) +Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, const device ConfigBuf& v_891) { uint param = 0u; - uint param_1 = uint(int((v_266BufferSize - 8) / 4) * 4); + uint param_1 = v_891.conf.mem_size; bool param_2 = mem_ok; return new_alloc(param, param_1, param_2); } static inline __attribute__((always_inline)) -Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); + uint raw0 = read_mem(param, param_1, v_267); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize); + uint raw1 = read_mem(param_2, param_3, v_267); Tile s; s.tile = TileSegRef{ raw0 }; s.backdrop = int(raw1); @@ -314,26 +317,20 @@ Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& } static inline __attribute__((always_inline)) -MallocResult malloc(thread const uint& size, device Memory& v_266, constant uint& v_266BufferSize) +uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_267) { - uint _272 = atomic_fetch_add_explicit((device atomic_uint*)&v_266.mem_offset, size, memory_order_relaxed); - uint offset = _272; - MallocResult r; - r.failed = (offset + size) > uint(int((v_266BufferSize - 8) / 4) * 4); - uint param = offset; - uint param_1 = size; - bool param_2 = !r.failed; - r.alloc = new_alloc(param, param_1, param_2); - if (r.failed) + uint _282 = atomic_fetch_add_explicit((device atomic_uint*)&v_267.mem_offset, size, memory_order_relaxed); + uint offset = _282; + if ((offset + size) > mem_size) { - uint _301 = atomic_fetch_max_explicit((device atomic_uint*)&v_266.mem_error, 1u, memory_order_relaxed); - return r; + uint _292 = atomic_fetch_or_explicit((device atomic_uint*)&v_267.mem_error, stage, memory_order_relaxed); + offset = 0u; } - return r; + return offset; } static inline __attribute__((always_inline)) -void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_266, constant uint& v_266BufferSize) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_267) { Alloc param = alloc; uint param_1 = offset; @@ -341,352 +338,359 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons { return; } - v_266.memory[offset] = val; + v_267.memory[offset] = val; } static inline __attribute__((always_inline)) -void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize) +void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.new_ref; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); } static inline __attribute__((always_inline)) -void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 11u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u }; CmdJump param_5 = s; - CmdJump_write(param_3, param_4, param_5, v_266, v_266BufferSize); + CmdJump_write(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_266, constant uint& v_266BufferSize) +void alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, thread bool& mem_ok, device Memory& v_267, const device ConfigBuf& v_891) { if (cmd_ref.offset < cmd_limit) { - return true; + return; } uint param = 1024u; - MallocResult _928 = malloc(param, v_266, v_266BufferSize); - MallocResult new_cmd = _928; - if (new_cmd.failed) + uint param_1 = v_891.conf.mem_size; + uint param_2 = 8u; + uint _915 = malloc_stage(param, param_1, param_2, v_267); + uint new_cmd = _915; + if (new_cmd == 0u) { - return false; + mem_ok = false; } - CmdJump jump = CmdJump{ new_cmd.alloc.offset }; - Alloc param_1 = cmd_alloc; - CmdRef param_2 = cmd_ref; - CmdJump param_3 = jump; - Cmd_Jump_write(param_1, param_2, param_3, v_266, v_266BufferSize); - cmd_alloc = new_cmd.alloc; - cmd_ref = CmdRef{ cmd_alloc.offset }; - cmd_limit = (cmd_alloc.offset + 1024u) - 144u; - return true; + if (mem_ok) + { + CmdJump jump = CmdJump{ new_cmd }; + Alloc param_3 = cmd_alloc; + CmdRef param_4 = cmd_ref; + CmdJump param_5 = jump; + Cmd_Jump_write(param_3, param_4, param_5, v_267); + } + uint param_6 = new_cmd; + uint param_7 = 1024u; + bool param_8 = true; + cmd_alloc = new_alloc(param_6, param_7, param_8); + cmd_ref = CmdRef{ new_cmd }; + cmd_limit = (new_cmd + 1024u) - 144u; } static inline __attribute__((always_inline)) -void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize) +void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.tile_ref; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = uint(s.backdrop); - write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); + write_mem(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 1u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u }; CmdFill param_5 = s; - CmdFill_write(param_3, param_4, param_5, v_266, v_266BufferSize); + CmdFill_write(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 3u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); } static inline __attribute__((always_inline)) -void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize) +void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.tile_ref; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.half_width); - write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); + write_mem(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 2u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u }; CmdStroke param_5 = s; - CmdStroke_write(param_3, param_4, param_5, v_266, v_266BufferSize); + CmdStroke_write(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_266, constant uint& v_266BufferSize) +void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, thread bool& mem_ok, device Memory& v_267) { if (linewidth < 0.0) { if (tile.tile.offset != 0u) { CmdFill cmd_fill = CmdFill{ tile.tile.offset, tile.backdrop }; - Alloc param = alloc; - CmdRef param_1 = cmd_ref; - CmdFill param_2 = cmd_fill; - Cmd_Fill_write(param, param_1, param_2, v_266, v_266BufferSize); + if (mem_ok) + { + Alloc param = alloc; + CmdRef param_1 = cmd_ref; + CmdFill param_2 = cmd_fill; + Cmd_Fill_write(param, param_1, param_2, v_267); + } cmd_ref.offset += 12u; } else { - Alloc param_3 = alloc; - CmdRef param_4 = cmd_ref; - Cmd_Solid_write(param_3, param_4, v_266, v_266BufferSize); + if (mem_ok) + { + Alloc param_3 = alloc; + CmdRef param_4 = cmd_ref; + Cmd_Solid_write(param_3, param_4, v_267); + } cmd_ref.offset += 4u; } } else { CmdStroke cmd_stroke = CmdStroke{ tile.tile.offset, 0.5 * linewidth }; - Alloc param_5 = alloc; - CmdRef param_6 = cmd_ref; - CmdStroke param_7 = cmd_stroke; - Cmd_Stroke_write(param_5, param_6, param_7, v_266, v_266BufferSize); + if (mem_ok) + { + Alloc param_5 = alloc; + CmdRef param_6 = cmd_ref; + CmdStroke param_7 = cmd_stroke; + Cmd_Stroke_write(param_5, param_6, param_7, v_267); + } cmd_ref.offset += 12u; } } static inline __attribute__((always_inline)) -void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize) +void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.rgba_color; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); } static inline __attribute__((always_inline)) -void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 5u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u }; CmdColor param_5 = s; - CmdColor_write(param_3, param_4, param_5, v_266, v_266BufferSize); + CmdColor_write(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.index; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.line_x); - write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); + write_mem(param_3, param_4, param_5, v_267); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.line_y); - write_mem(param_6, param_7, param_8, v_266, v_266BufferSize); + write_mem(param_6, param_7, param_8, v_267); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.line_c); - write_mem(param_9, param_10, param_11, v_266, v_266BufferSize); + write_mem(param_9, param_10, param_11, v_267); } static inline __attribute__((always_inline)) -void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 6u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u }; CmdLinGrad param_5 = s; - CmdLinGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize); + CmdLinGrad_write(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.index; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.mat.x); - write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); + write_mem(param_3, param_4, param_5, v_267); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.mat.y); - write_mem(param_6, param_7, param_8, v_266, v_266BufferSize); + write_mem(param_6, param_7, param_8, v_267); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.mat.z); - write_mem(param_9, param_10, param_11, v_266, v_266BufferSize); + write_mem(param_9, param_10, param_11, v_267); Alloc param_12 = a; uint param_13 = ix + 4u; uint param_14 = as_type(s.mat.w); - write_mem(param_12, param_13, param_14, v_266, v_266BufferSize); + write_mem(param_12, param_13, param_14, v_267); Alloc param_15 = a; uint param_16 = ix + 5u; uint param_17 = as_type(s.xlat.x); - write_mem(param_15, param_16, param_17, v_266, v_266BufferSize); + write_mem(param_15, param_16, param_17, v_267); Alloc param_18 = a; uint param_19 = ix + 6u; uint param_20 = as_type(s.xlat.y); - write_mem(param_18, param_19, param_20, v_266, v_266BufferSize); + write_mem(param_18, param_19, param_20, v_267); Alloc param_21 = a; uint param_22 = ix + 7u; uint param_23 = as_type(s.c1.x); - write_mem(param_21, param_22, param_23, v_266, v_266BufferSize); + write_mem(param_21, param_22, param_23, v_267); Alloc param_24 = a; uint param_25 = ix + 8u; uint param_26 = as_type(s.c1.y); - write_mem(param_24, param_25, param_26, v_266, v_266BufferSize); + write_mem(param_24, param_25, param_26, v_267); Alloc param_27 = a; uint param_28 = ix + 9u; uint param_29 = as_type(s.ra); - write_mem(param_27, param_28, param_29, v_266, v_266BufferSize); + write_mem(param_27, param_28, param_29, v_267); Alloc param_30 = a; uint param_31 = ix + 10u; uint param_32 = as_type(s.roff); - write_mem(param_30, param_31, param_32, v_266, v_266BufferSize); + write_mem(param_30, param_31, param_32, v_267); } static inline __attribute__((always_inline)) -void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 7u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; CmdRadGradRef param_4 = CmdRadGradRef{ ref.offset + 4u }; CmdRadGrad param_5 = s; - CmdRadGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize); + CmdRadGrad_write(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize) +void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.index; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); - write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); + write_mem(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 8u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u }; CmdImage param_5 = s; - CmdImage_write(param_3, param_4, param_5, v_266, v_266BufferSize); + CmdImage_write(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 9u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); } static inline __attribute__((always_inline)) -void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize) +void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_267) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.blend; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); } static inline __attribute__((always_inline)) -void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 10u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); Alloc param_3 = a; CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u }; CmdEndClip param_5 = s; - CmdEndClip_write(param_3, param_4, param_5, v_266, v_266BufferSize); + CmdEndClip_write(param_3, param_4, param_5, v_267); } static inline __attribute__((always_inline)) -void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) +void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_267) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 0u; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); + write_mem(param, param_1, param_2, v_267); } -static inline __attribute__((always_inline)) -void alloc_write(thread const Alloc& a, thread const uint& offset, thread const Alloc& alloc, device Memory& v_266, constant uint& v_266BufferSize) -{ - Alloc param = a; - uint param_1 = offset >> uint(2); - uint param_2 = alloc.offset; - write_mem(param, param_1, param_2, v_266, v_266BufferSize); -} - -kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_266 [[buffer(0)]], const device ConfigBuf& _1020 [[buffer(1)]], const device SceneBuf& _1399 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(device Memory& v_267 [[buffer(0)]], const device ConfigBuf& v_891 [[buffer(1)]], const device SceneBuf& _1390 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { threadgroup uint sh_bitmaps[8][256]; threadgroup Alloc sh_part_elements[256]; @@ -698,22 +702,28 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M threadgroup uint sh_tile_y0[256]; threadgroup uint sh_tile_base[256]; threadgroup uint sh_tile_count[256]; - constant uint& v_266BufferSize = spvBufferSizeConstants[0]; - uint width_in_bins = ((_1020.conf.width_in_tiles + 16u) - 1u) / 16u; + bool mem_ok = true; + uint param = 7u; + bool _1012 = check_deps(param, v_267); + if (!_1012) + { + return; + } + uint width_in_bins = ((v_891.conf.width_in_tiles + 16u) - 1u) / 16u; uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; uint partition_ix = 0u; - uint n_partitions = ((_1020.conf.n_elements + 256u) - 1u) / 256u; + uint n_partitions = ((v_891.conf.n_elements + 256u) - 1u) / 256u; uint th_ix = gl_LocalInvocationID.x; uint bin_tile_x = 16u * gl_WorkGroupID.x; uint bin_tile_y = 16u * gl_WorkGroupID.y; uint tile_x = gl_LocalInvocationID.x % 16u; uint tile_y = gl_LocalInvocationID.x / 16u; - uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.conf.width_in_tiles) + bin_tile_x) + tile_x; - Alloc param; - param.offset = _1020.conf.ptcl_alloc.offset; - uint param_1 = this_tile_ix * 1024u; - uint param_2 = 1024u; - Alloc cmd_alloc = slice_mem(param, param_1, param_2); + uint this_tile_ix = (((bin_tile_y + tile_y) * v_891.conf.width_in_tiles) + bin_tile_x) + tile_x; + Alloc param_1; + param_1.offset = v_891.conf.ptcl_alloc.offset; + uint param_2 = this_tile_ix * 1024u; + uint param_3 = 1024u; + Alloc cmd_alloc = slice_mem(param_1, param_2, param_3); CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; uint cmd_limit = (cmd_ref.offset + 1024u) - 144u; uint clip_depth = 0u; @@ -722,25 +732,24 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint wr_ix = 0u; uint part_start_ix = 0u; uint ready_ix = 0u; - Alloc param_3 = cmd_alloc; - uint param_4 = 0u; - uint param_5 = 8u; - Alloc scratch_alloc = slice_mem(param_3, param_4, param_5); + Alloc param_4 = cmd_alloc; + uint param_5 = 0u; + uint param_6 = 8u; + Alloc scratch_alloc = slice_mem(param_4, param_5, param_6); cmd_ref.offset += 4u; uint render_blend_depth = 0u; uint max_blend_depth = 0u; - uint drawmonoid_start = _1020.conf.drawmonoid_alloc.offset >> uint(2); - uint drawtag_start = _1020.conf.drawtag_offset >> uint(2); - uint drawdata_start = _1020.conf.drawdata_offset >> uint(2); - uint drawinfo_start = _1020.conf.drawinfo_alloc.offset >> uint(2); - bool mem_ok = v_266.mem_error == 0u; - Alloc param_6; - Alloc param_8; - uint _1331; + uint drawmonoid_start = v_891.conf.drawmonoid_alloc.offset >> uint(2); + uint drawtag_start = v_891.conf.drawtag_offset >> uint(2); + uint drawdata_start = v_891.conf.drawdata_offset >> uint(2); + uint drawinfo_start = v_891.conf.drawinfo_alloc.offset >> uint(2); + Alloc param_7; + Alloc param_9; + uint _1322; uint element_ix; - Alloc param_17; + Alloc param_18; uint tile_count; - uint _1632; + uint _1622; float linewidth; CmdLinGrad cmd_lin; CmdRadGrad cmd_rad; @@ -750,36 +759,36 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { sh_bitmaps[i][th_ix] = 0u; } - bool _1383; + bool _1374; for (;;) { if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) { part_start_ix = ready_ix; uint count = 0u; - bool _1181 = th_ix < 256u; - bool _1189; - if (_1181) + bool _1174 = th_ix < 256u; + bool _1182; + if (_1174) { - _1189 = (partition_ix + th_ix) < n_partitions; + _1182 = (partition_ix + th_ix) < n_partitions; } else { - _1189 = _1181; + _1182 = _1174; } - if (_1189) + if (_1182) { - uint in_ix = (_1020.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); - param_6.offset = _1020.conf.bin_alloc.offset; - uint param_7 = in_ix; - count = read_mem(param_6, param_7, v_266, v_266BufferSize); - param_8.offset = _1020.conf.bin_alloc.offset; - uint param_9 = in_ix + 1u; - uint offset = read_mem(param_8, param_9, v_266, v_266BufferSize); - uint param_10 = offset; - uint param_11 = count * 4u; - bool param_12 = mem_ok; - sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12); + uint in_ix = (v_891.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + param_7.offset = v_891.conf.bin_alloc.offset; + uint param_8 = in_ix; + count = read_mem(param_7, param_8, v_267); + param_9.offset = v_891.conf.bin_alloc.offset; + uint param_10 = in_ix + 1u; + uint offset = read_mem(param_9, param_10, v_267); + uint param_11 = offset; + uint param_12 = count * 4u; + bool param_13 = true; + sh_part_elements[th_ix] = new_alloc(param_11, param_12, param_13); } for (uint i_1 = 0u; i_1 < 8u; i_1++) { @@ -806,7 +815,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M partition_ix += 256u; } uint ix = rd_ix + th_ix; - if (((ix >= wr_ix) && (ix < ready_ix)) && mem_ok) + if ((ix >= wr_ix) && (ix < ready_ix)) { uint part_ix = 0u; for (uint i_2 = 0u; i_2 < 8u; i_2++) @@ -819,34 +828,34 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } if (part_ix > 0u) { - _1331 = sh_part_count[part_ix - 1u]; + _1322 = sh_part_count[part_ix - 1u]; } else { - _1331 = part_start_ix; + _1322 = part_start_ix; } - ix -= _1331; + ix -= _1322; Alloc bin_alloc = sh_part_elements[part_ix]; BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset }; - BinInstanceRef param_13 = inst_ref; - uint param_14 = ix; - Alloc param_15 = bin_alloc; - BinInstanceRef param_16 = BinInstance_index(param_13, param_14); - BinInstance inst = BinInstance_read(param_15, param_16, v_266, v_266BufferSize); + BinInstanceRef param_14 = inst_ref; + uint param_15 = ix; + Alloc param_16 = bin_alloc; + BinInstanceRef param_17 = BinInstance_index(param_14, param_15); + BinInstance inst = BinInstance_read(param_16, param_17, v_267); sh_elements[th_ix] = inst.element_ix; } threadgroup_barrier(mem_flags::mem_threadgroup); wr_ix = min((rd_ix + 256u), ready_ix); - bool _1373 = (wr_ix - rd_ix) < 256u; - if (_1373) + bool _1364 = (wr_ix - rd_ix) < 256u; + if (_1364) { - _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + _1374 = (wr_ix < ready_ix) || (partition_ix < n_partitions); } else { - _1383 = _1373; + _1374 = _1364; } - if (_1383) + if (_1374) { continue; } @@ -859,7 +868,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M if ((th_ix + rd_ix) < wr_ix) { element_ix = sh_elements[th_ix]; - tag = _1399.scene[drawtag_start + element_ix]; + tag = _1390.scene[drawtag_start + element_ix]; } switch (tag) { @@ -871,10 +880,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M case 37u: { uint drawmonoid_base = drawmonoid_start + (4u * element_ix); - uint path_ix = v_266.memory[drawmonoid_base]; - param_17.offset = _1020.conf.tile_alloc.offset; - PathRef param_18 = PathRef{ _1020.conf.tile_alloc.offset + (path_ix * 12u) }; - Path path = Path_read(param_17, param_18, v_266, v_266BufferSize); + uint path_ix = v_267.memory[drawmonoid_base]; + param_18.offset = v_891.conf.tile_alloc.offset; + PathRef param_19 = PathRef{ v_891.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_18, param_19, v_267); uint stride = path.bbox.z - path.bbox.x; sh_tile_stride[th_ix] = stride; int dx = int(path.bbox.x) - int(bin_tile_x); @@ -889,13 +898,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M tile_count = uint(x1 - x0) * uint(y1 - y0); uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); sh_tile_base[th_ix] = base; - uint param_19 = path.tiles.offset; - uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; - bool param_21 = mem_ok; - Alloc path_alloc = new_alloc(param_19, param_20, param_21); - uint param_22 = th_ix; - Alloc param_23 = path_alloc; - write_tile_alloc(param_22, param_23); + uint param_20 = path.tiles.offset; + uint param_21 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_22 = true; + Alloc path_alloc = new_alloc(param_20, param_21, param_22); + uint param_23 = th_ix; + Alloc param_24 = path_alloc; + write_tile_alloc(param_23, param_24); break; } default: @@ -929,60 +938,57 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } } uint element_ix_1 = sh_elements[el_ix]; - uint tag_1 = _1399.scene[drawtag_start + element_ix_1]; + uint tag_1 = _1390.scene[drawtag_start + element_ix_1]; if (el_ix > 0u) { - _1632 = sh_tile_count[el_ix - 1u]; + _1622 = sh_tile_count[el_ix - 1u]; } else { - _1632 = 0u; + _1622 = 0u; } - uint seq_ix = ix_1 - _1632; + uint seq_ix = ix_1 - _1622; uint width = sh_tile_width[el_ix]; uint x = sh_tile_x0[el_ix] + (seq_ix % width); uint y = sh_tile_y0[el_ix] + (seq_ix / width); bool include_tile = false; - if (mem_ok) + uint param_25 = el_ix; + bool param_26 = true; + Alloc param_27 = read_tile_alloc(param_25, param_26, v_891); + TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Tile tile = Tile_read(param_27, param_28, v_267); + bool is_clip = (tag_1 & 1u) != 0u; + bool is_blend = false; + if (is_clip) { - uint param_24 = el_ix; - bool param_25 = mem_ok; - Alloc param_26 = read_tile_alloc(param_24, param_25, v_266, v_266BufferSize); - TileRef param_27 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; - Tile tile = Tile_read(param_26, param_27, v_266, v_266BufferSize); - bool is_clip = (tag_1 & 1u) != 0u; - bool is_blend = false; - if (is_clip) - { - uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); - uint scene_offset = v_266.memory[drawmonoid_base_1 + 2u]; - uint dd = drawdata_start + (scene_offset >> uint(2)); - uint blend = _1399.scene[dd]; - is_blend = blend != 32771u; - } - bool _1720 = tile.tile.offset != 0u; - bool _1729; - if (!_1720) - { - _1729 = (tile.backdrop == 0) == is_clip; - } - else - { - _1729 = _1720; - } - include_tile = _1729 || is_blend; + uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); + uint scene_offset = v_267.memory[drawmonoid_base_1 + 2u]; + uint dd = drawdata_start + (scene_offset >> uint(2)); + uint blend = _1390.scene[dd]; + is_blend = blend != 32771u; } + bool _1706 = tile.tile.offset != 0u; + bool _1715; + if (!_1706) + { + _1715 = (tile.backdrop == 0) == is_clip; + } + else + { + _1715 = _1706; + } + include_tile = _1715 || is_blend; if (include_tile) { uint el_slice = el_ix / 32u; uint el_mask = 1u << (el_ix & 31u); - uint _1751 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); + uint _1737 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); } } threadgroup_barrier(mem_flags::mem_threadgroup); uint slice_ix = 0u; uint bitmap = sh_bitmaps[0][th_ix]; - while (mem_ok) + while (true) { if (bitmap == 0u) { @@ -1000,175 +1006,170 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap))); uint element_ix_2 = sh_elements[element_ref_ix]; bitmap &= (bitmap - 1u); - uint drawtag = _1399.scene[drawtag_start + element_ix_2]; + uint drawtag = _1390.scene[drawtag_start + element_ix_2]; if (clip_zero_depth == 0u) { - uint param_28 = element_ref_ix; - bool param_29 = mem_ok; - Alloc param_30 = read_tile_alloc(param_28, param_29, v_266, v_266BufferSize); - TileRef param_31 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; - Tile tile_1 = Tile_read(param_30, param_31, v_266, v_266BufferSize); + uint param_29 = element_ref_ix; + bool param_30 = true; + Alloc param_31 = read_tile_alloc(param_29, param_30, v_891); + TileRef param_32 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Tile tile_1 = Tile_read(param_31, param_32, v_267); uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); - uint scene_offset_1 = v_266.memory[drawmonoid_base_2 + 2u]; - uint info_offset = v_266.memory[drawmonoid_base_2 + 3u]; + uint scene_offset_1 = v_267.memory[drawmonoid_base_2 + 2u]; + uint info_offset = v_267.memory[drawmonoid_base_2 + 3u]; uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2)); uint di = drawinfo_start + (info_offset >> uint(2)); switch (drawtag) { case 68u: { - linewidth = as_type(v_266.memory[di]); - Alloc param_32 = cmd_alloc; - CmdRef param_33 = cmd_ref; - uint param_34 = cmd_limit; - bool _1876 = alloc_cmd(param_32, param_33, param_34, v_266, v_266BufferSize); - cmd_alloc = param_32; - cmd_ref = param_33; - cmd_limit = param_34; - if (!_1876) + linewidth = as_type(v_267.memory[di]); + Alloc param_33 = cmd_alloc; + CmdRef param_34 = cmd_ref; + uint param_35 = cmd_limit; + alloc_cmd(param_33, param_34, param_35, mem_ok, v_267, v_891); + cmd_alloc = param_33; + cmd_ref = param_34; + cmd_limit = param_35; + Alloc param_36 = cmd_alloc; + CmdRef param_37 = cmd_ref; + Tile param_38 = tile_1; + float param_39 = linewidth; + write_fill(param_36, param_37, param_38, param_39, mem_ok, v_267); + cmd_ref = param_37; + uint rgba = _1390.scene[dd_1]; + if (mem_ok) { - break; + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdColor param_42 = CmdColor{ rgba }; + Cmd_Color_write(param_40, param_41, param_42, v_267); } - Alloc param_35 = cmd_alloc; - CmdRef param_36 = cmd_ref; - Tile param_37 = tile_1; - float param_38 = linewidth; - write_fill(param_35, param_36, param_37, param_38, v_266, v_266BufferSize); - cmd_ref = param_36; - uint rgba = _1399.scene[dd_1]; - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - CmdColor param_41 = CmdColor{ rgba }; - Cmd_Color_write(param_39, param_40, param_41, v_266, v_266BufferSize); cmd_ref.offset += 8u; break; } case 276u: { - Alloc param_42 = cmd_alloc; - CmdRef param_43 = cmd_ref; - uint param_44 = cmd_limit; - bool _1917 = alloc_cmd(param_42, param_43, param_44, v_266, v_266BufferSize); - cmd_alloc = param_42; - cmd_ref = param_43; - cmd_limit = param_44; - if (!_1917) + Alloc param_43 = cmd_alloc; + CmdRef param_44 = cmd_ref; + uint param_45 = cmd_limit; + alloc_cmd(param_43, param_44, param_45, mem_ok, v_267, v_891); + cmd_alloc = param_43; + cmd_ref = param_44; + cmd_limit = param_45; + linewidth = as_type(v_267.memory[di]); + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + Tile param_48 = tile_1; + float param_49 = linewidth; + write_fill(param_46, param_47, param_48, param_49, mem_ok, v_267); + cmd_ref = param_47; + cmd_lin.index = _1390.scene[dd_1]; + cmd_lin.line_x = as_type(v_267.memory[di + 1u]); + cmd_lin.line_y = as_type(v_267.memory[di + 2u]); + cmd_lin.line_c = as_type(v_267.memory[di + 3u]); + if (mem_ok) { - break; + Alloc param_50 = cmd_alloc; + CmdRef param_51 = cmd_ref; + CmdLinGrad param_52 = cmd_lin; + Cmd_LinGrad_write(param_50, param_51, param_52, v_267); } - linewidth = as_type(v_266.memory[di]); - Alloc param_45 = cmd_alloc; - CmdRef param_46 = cmd_ref; - Tile param_47 = tile_1; - float param_48 = linewidth; - write_fill(param_45, param_46, param_47, param_48, v_266, v_266BufferSize); - cmd_ref = param_46; - cmd_lin.index = _1399.scene[dd_1]; - cmd_lin.line_x = as_type(v_266.memory[di + 1u]); - cmd_lin.line_y = as_type(v_266.memory[di + 2u]); - cmd_lin.line_c = as_type(v_266.memory[di + 3u]); - Alloc param_49 = cmd_alloc; - CmdRef param_50 = cmd_ref; - CmdLinGrad param_51 = cmd_lin; - Cmd_LinGrad_write(param_49, param_50, param_51, v_266, v_266BufferSize); cmd_ref.offset += 20u; break; } case 732u: { - Alloc param_52 = cmd_alloc; - CmdRef param_53 = cmd_ref; - uint param_54 = cmd_limit; - bool _1981 = alloc_cmd(param_52, param_53, param_54, v_266, v_266BufferSize); - cmd_alloc = param_52; - cmd_ref = param_53; - cmd_limit = param_54; - if (!_1981) + Alloc param_53 = cmd_alloc; + CmdRef param_54 = cmd_ref; + uint param_55 = cmd_limit; + alloc_cmd(param_53, param_54, param_55, mem_ok, v_267, v_891); + cmd_alloc = param_53; + cmd_ref = param_54; + cmd_limit = param_55; + linewidth = as_type(v_267.memory[di]); + Alloc param_56 = cmd_alloc; + CmdRef param_57 = cmd_ref; + Tile param_58 = tile_1; + float param_59 = linewidth; + write_fill(param_56, param_57, param_58, param_59, mem_ok, v_267); + cmd_ref = param_57; + cmd_rad.index = _1390.scene[dd_1]; + cmd_rad.mat = as_type(uint4(v_267.memory[di + 1u], v_267.memory[di + 2u], v_267.memory[di + 3u], v_267.memory[di + 4u])); + cmd_rad.xlat = as_type(uint2(v_267.memory[di + 5u], v_267.memory[di + 6u])); + cmd_rad.c1 = as_type(uint2(v_267.memory[di + 7u], v_267.memory[di + 8u])); + cmd_rad.ra = as_type(v_267.memory[di + 9u]); + cmd_rad.roff = as_type(v_267.memory[di + 10u]); + if (mem_ok) { - break; + Alloc param_60 = cmd_alloc; + CmdRef param_61 = cmd_ref; + CmdRadGrad param_62 = cmd_rad; + Cmd_RadGrad_write(param_60, param_61, param_62, v_267); } - linewidth = as_type(v_266.memory[di]); - Alloc param_55 = cmd_alloc; - CmdRef param_56 = cmd_ref; - Tile param_57 = tile_1; - float param_58 = linewidth; - write_fill(param_55, param_56, param_57, param_58, v_266, v_266BufferSize); - cmd_ref = param_56; - cmd_rad.index = _1399.scene[dd_1]; - cmd_rad.mat = as_type(uint4(v_266.memory[di + 1u], v_266.memory[di + 2u], v_266.memory[di + 3u], v_266.memory[di + 4u])); - cmd_rad.xlat = as_type(uint2(v_266.memory[di + 5u], v_266.memory[di + 6u])); - cmd_rad.c1 = as_type(uint2(v_266.memory[di + 7u], v_266.memory[di + 8u])); - cmd_rad.ra = as_type(v_266.memory[di + 9u]); - cmd_rad.roff = as_type(v_266.memory[di + 10u]); - Alloc param_59 = cmd_alloc; - CmdRef param_60 = cmd_ref; - CmdRadGrad param_61 = cmd_rad; - Cmd_RadGrad_write(param_59, param_60, param_61, v_266, v_266BufferSize); cmd_ref.offset += 48u; break; } case 72u: { - linewidth = as_type(v_266.memory[di]); - Alloc param_62 = cmd_alloc; - CmdRef param_63 = cmd_ref; - uint param_64 = cmd_limit; - bool _2087 = alloc_cmd(param_62, param_63, param_64, v_266, v_266BufferSize); - cmd_alloc = param_62; - cmd_ref = param_63; - cmd_limit = param_64; - if (!_2087) - { - break; - } - Alloc param_65 = cmd_alloc; - CmdRef param_66 = cmd_ref; - Tile param_67 = tile_1; - float param_68 = linewidth; - write_fill(param_65, param_66, param_67, param_68, v_266, v_266BufferSize); - cmd_ref = param_66; - uint index = _1399.scene[dd_1]; - uint raw1 = _1399.scene[dd_1 + 1u]; + Alloc param_63 = cmd_alloc; + CmdRef param_64 = cmd_ref; + uint param_65 = cmd_limit; + alloc_cmd(param_63, param_64, param_65, mem_ok, v_267, v_891); + cmd_alloc = param_63; + cmd_ref = param_64; + cmd_limit = param_65; + linewidth = as_type(v_267.memory[di]); + Alloc param_66 = cmd_alloc; + CmdRef param_67 = cmd_ref; + Tile param_68 = tile_1; + float param_69 = linewidth; + write_fill(param_66, param_67, param_68, param_69, mem_ok, v_267); + cmd_ref = param_67; + uint index = _1390.scene[dd_1]; + uint raw1 = _1390.scene[dd_1 + 1u]; int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); - Alloc param_69 = cmd_alloc; - CmdRef param_70 = cmd_ref; - CmdImage param_71 = CmdImage{ index, offset_1 }; - Cmd_Image_write(param_69, param_70, param_71, v_266, v_266BufferSize); + if (mem_ok) + { + Alloc param_70 = cmd_alloc; + CmdRef param_71 = cmd_ref; + CmdImage param_72 = CmdImage{ index, offset_1 }; + Cmd_Image_write(param_70, param_71, param_72, v_267); + } cmd_ref.offset += 12u; break; } case 5u: { - bool _2140 = tile_1.tile.offset == 0u; - bool _2146; - if (_2140) + bool _2120 = tile_1.tile.offset == 0u; + bool _2126; + if (_2120) { - _2146 = tile_1.backdrop == 0; + _2126 = tile_1.backdrop == 0; } else { - _2146 = _2140; + _2126 = _2120; } - if (_2146) + if (_2126) { clip_zero_depth = clip_depth + 1u; } else { - Alloc param_72 = cmd_alloc; - CmdRef param_73 = cmd_ref; - uint param_74 = cmd_limit; - bool _2158 = alloc_cmd(param_72, param_73, param_74, v_266, v_266BufferSize); - cmd_alloc = param_72; - cmd_ref = param_73; - cmd_limit = param_74; - if (!_2158) + Alloc param_73 = cmd_alloc; + CmdRef param_74 = cmd_ref; + uint param_75 = cmd_limit; + alloc_cmd(param_73, param_74, param_75, mem_ok, v_267, v_891); + cmd_alloc = param_73; + cmd_ref = param_74; + cmd_limit = param_75; + if (mem_ok) { - break; + Alloc param_76 = cmd_alloc; + CmdRef param_77 = cmd_ref; + Cmd_BeginClip_write(param_76, param_77, v_267); } - Alloc param_75 = cmd_alloc; - CmdRef param_76 = cmd_ref; - Cmd_BeginClip_write(param_75, param_76, v_266, v_266BufferSize); cmd_ref.offset += 4u; render_blend_depth++; max_blend_depth = max(max_blend_depth, render_blend_depth); @@ -1179,28 +1180,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M case 37u: { clip_depth--; - Alloc param_77 = cmd_alloc; - CmdRef param_78 = cmd_ref; - uint param_79 = cmd_limit; - bool _2191 = alloc_cmd(param_77, param_78, param_79, v_266, v_266BufferSize); - cmd_alloc = param_77; - cmd_ref = param_78; - cmd_limit = param_79; - if (!_2191) + Alloc param_78 = cmd_alloc; + CmdRef param_79 = cmd_ref; + Tile param_80 = tile_1; + float param_81 = -1.0; + write_fill(param_78, param_79, param_80, param_81, mem_ok, v_267); + cmd_ref = param_79; + uint blend_1 = _1390.scene[dd_1]; + if (mem_ok) { - break; + Alloc param_82 = cmd_alloc; + CmdRef param_83 = cmd_ref; + CmdEndClip param_84 = CmdEndClip{ blend_1 }; + Cmd_EndClip_write(param_82, param_83, param_84, v_267); } - Alloc param_80 = cmd_alloc; - CmdRef param_81 = cmd_ref; - Tile param_82 = tile_1; - float param_83 = -1.0; - write_fill(param_80, param_81, param_82, param_83, v_266, v_266BufferSize); - cmd_ref = param_81; - uint blend_1 = _1399.scene[dd_1]; - Alloc param_84 = cmd_alloc; - CmdRef param_85 = cmd_ref; - CmdEndClip param_86 = CmdEndClip{ blend_1 }; - Cmd_EndClip_write(param_84, param_85, param_86, v_266, v_266BufferSize); cmd_ref.offset += 8u; render_blend_depth--; break; @@ -1235,31 +1228,33 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M break; } } - bool _2263 = (bin_tile_x + tile_x) < _1020.conf.width_in_tiles; - bool _2272; - if (_2263) + bool _2231 = (bin_tile_x + tile_x) < v_891.conf.width_in_tiles; + bool _2240; + if (_2231) { - _2272 = (bin_tile_y + tile_y) < _1020.conf.height_in_tiles; + _2240 = (bin_tile_y + tile_y) < v_891.conf.height_in_tiles; } else { - _2272 = _2263; + _2240 = _2231; } - if (_2272) + if (_2240) { - Alloc param_87 = cmd_alloc; - CmdRef param_88 = cmd_ref; - Cmd_End_write(param_87, param_88, v_266, v_266BufferSize); + if (mem_ok) + { + Alloc param_85 = cmd_alloc; + CmdRef param_86 = cmd_ref; + Cmd_End_write(param_85, param_86, v_267); + } if (max_blend_depth > 4u) { uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u; - uint param_89 = scratch_size; - MallocResult _2293 = malloc(param_89, v_266, v_266BufferSize); - MallocResult scratch = _2293; - Alloc param_90 = scratch_alloc; - uint param_91 = scratch_alloc.offset; - Alloc param_92 = scratch.alloc; - alloc_write(param_90, param_91, param_92, v_266, v_266BufferSize); + uint _2264 = atomic_fetch_add_explicit((device atomic_uint*)&v_267.blend_offset, scratch_size, memory_order_relaxed); + uint scratch = _2264; + Alloc param_87 = scratch_alloc; + uint param_88 = scratch_alloc.offset >> uint(2); + uint param_89 = scratch; + write_mem(param_87, param_88, param_89, v_267); } } } diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv index fe5eeee..2417cf8 100644 Binary files a/piet-gpu/shader/gen/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil index 200f169..97b006a 100644 Binary files a/piet-gpu/shader/gen/draw_leaf.dxil and b/piet-gpu/shader/gen/draw_leaf.dxil differ diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl index 734d21e..789c9b3 100644 --- a/piet-gpu/shader/gen/draw_leaf.hlsl +++ b/piet-gpu/shader/gen/draw_leaf.hlsl @@ -13,6 +13,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -88,7 +89,7 @@ DrawMonoid draw_monoid_identity() void comp_main() { uint ix = gl_GlobalInvocationID.x * 8u; - uint drawtag_base = _93.Load(100) >> uint(2); + uint drawtag_base = _93.Load(104) >> uint(2); uint tag_word = _103.Load((drawtag_base + ix) * 4 + 0); uint param = tag_word; DrawMonoid agg = map_tag(param); @@ -137,11 +138,11 @@ void comp_main() DrawMonoid param_7 = sh_scratch[gl_LocalInvocationID.x - 1u]; row = combine_draw_monoid(param_6, param_7); } - uint drawdata_base = _93.Load(104) >> uint(2); - uint drawinfo_base = _93.Load(68) >> uint(2); + uint drawdata_base = _93.Load(108) >> uint(2); + uint drawinfo_base = _93.Load(72) >> uint(2); uint out_ix = gl_GlobalInvocationID.x * 8u; - uint out_base = (_93.Load(44) >> uint(2)) + (out_ix * 4u); - uint clip_out_base = _93.Load(48) >> uint(2); + uint out_base = (_93.Load(48) >> uint(2)) + (out_ix * 4u); + uint clip_out_base = _93.Load(52) >> uint(2); float4 mat; float2 translate; float2 p0; @@ -155,31 +156,31 @@ void comp_main() DrawMonoid param_9 = local[i_2 - 1u]; m = combine_draw_monoid(param_8, param_9); } - _285.Store((out_base + (i_2 * 4u)) * 4 + 8, m.path_ix); - _285.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 8, m.clip_ix); - _285.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 8, m.scene_offset); - _285.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 8, m.info_offset); + _285.Store((out_base + (i_2 * 4u)) * 4 + 12, m.path_ix); + _285.Store(((out_base + (i_2 * 4u)) + 1u) * 4 + 12, m.clip_ix); + _285.Store(((out_base + (i_2 * 4u)) + 2u) * 4 + 12, m.scene_offset); + _285.Store(((out_base + (i_2 * 4u)) + 3u) * 4 + 12, m.info_offset); uint dd = drawdata_base + (m.scene_offset >> uint(2)); uint di = drawinfo_base + (m.info_offset >> uint(2)); tag_word = _103.Load(((drawtag_base + ix) + i_2) * 4 + 0); if (((((tag_word == 68u) || (tag_word == 276u)) || (tag_word == 732u)) || (tag_word == 72u)) || (tag_word == 5u)) { - uint bbox_offset = (_93.Load(40) >> uint(2)) + (6u * m.path_ix); - float bbox_l = float(_285.Load(bbox_offset * 4 + 8)) - 32768.0f; - float bbox_t = float(_285.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f; - float bbox_r = float(_285.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f; - float bbox_b = float(_285.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f; + uint bbox_offset = (_93.Load(44) >> uint(2)) + (6u * m.path_ix); + float bbox_l = float(_285.Load(bbox_offset * 4 + 12)) - 32768.0f; + float bbox_t = float(_285.Load((bbox_offset + 1u) * 4 + 12)) - 32768.0f; + float bbox_r = float(_285.Load((bbox_offset + 2u) * 4 + 12)) - 32768.0f; + float bbox_b = float(_285.Load((bbox_offset + 3u) * 4 + 12)) - 32768.0f; float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); - float linewidth = asfloat(_285.Load((bbox_offset + 4u) * 4 + 8)); + float linewidth = asfloat(_285.Load((bbox_offset + 4u) * 4 + 12)); uint fill_mode = uint(linewidth >= 0.0f); if (((linewidth >= 0.0f) || (tag_word == 276u)) || (tag_word == 732u)) { - uint trans_ix = _285.Load((bbox_offset + 5u) * 4 + 8); - uint t = (_93.Load(36) >> uint(2)) + (6u * trans_ix); - mat = asfloat(uint4(_285.Load(t * 4 + 8), _285.Load((t + 1u) * 4 + 8), _285.Load((t + 2u) * 4 + 8), _285.Load((t + 3u) * 4 + 8))); + uint trans_ix = _285.Load((bbox_offset + 5u) * 4 + 12); + uint t = (_93.Load(40) >> uint(2)) + (6u * trans_ix); + mat = asfloat(uint4(_285.Load(t * 4 + 12), _285.Load((t + 1u) * 4 + 12), _285.Load((t + 2u) * 4 + 12), _285.Load((t + 3u) * 4 + 12))); if ((tag_word == 276u) || (tag_word == 732u)) { - translate = asfloat(uint2(_285.Load((t + 4u) * 4 + 8), _285.Load((t + 5u) * 4 + 8))); + translate = asfloat(uint2(_285.Load((t + 4u) * 4 + 12), _285.Load((t + 5u) * 4 + 12))); } } if (linewidth >= 0.0f) @@ -191,12 +192,12 @@ void comp_main() case 68u: case 72u: { - _285.Store(di * 4 + 8, asuint(linewidth)); + _285.Store(di * 4 + 12, asuint(linewidth)); break; } case 276u: { - _285.Store(di * 4 + 8, asuint(linewidth)); + _285.Store(di * 4 + 12, asuint(linewidth)); p0 = asfloat(uint2(_103.Load((dd + 1u) * 4 + 0), _103.Load((dd + 2u) * 4 + 0))); p1 = asfloat(uint2(_103.Load((dd + 3u) * 4 + 0), _103.Load((dd + 4u) * 4 + 0))); p0 = ((mat.xy * p0.x) + (mat.zw * p0.y)) + translate; @@ -206,9 +207,9 @@ void comp_main() float line_x = dxy.x * scale; float line_y = dxy.y * scale; float line_c = -((p0.x * line_x) + (p0.y * line_y)); - _285.Store((di + 1u) * 4 + 8, asuint(line_x)); - _285.Store((di + 2u) * 4 + 8, asuint(line_y)); - _285.Store((di + 3u) * 4 + 8, asuint(line_c)); + _285.Store((di + 1u) * 4 + 12, asuint(line_x)); + _285.Store((di + 2u) * 4 + 12, asuint(line_y)); + _285.Store((di + 3u) * 4 + 12, asuint(line_c)); break; } case 732u: @@ -227,17 +228,17 @@ void comp_main() float2 c1 = center1 * rainv; float ra = rr * rainv; float roff = rr - 1.0f; - _285.Store(di * 4 + 8, asuint(linewidth)); - _285.Store((di + 1u) * 4 + 8, asuint(inv_mat.x)); - _285.Store((di + 2u) * 4 + 8, asuint(inv_mat.y)); - _285.Store((di + 3u) * 4 + 8, asuint(inv_mat.z)); - _285.Store((di + 4u) * 4 + 8, asuint(inv_mat.w)); - _285.Store((di + 5u) * 4 + 8, asuint(inv_tr.x)); - _285.Store((di + 6u) * 4 + 8, asuint(inv_tr.y)); - _285.Store((di + 7u) * 4 + 8, asuint(c1.x)); - _285.Store((di + 8u) * 4 + 8, asuint(c1.y)); - _285.Store((di + 9u) * 4 + 8, asuint(ra)); - _285.Store((di + 10u) * 4 + 8, asuint(roff)); + _285.Store(di * 4 + 12, asuint(linewidth)); + _285.Store((di + 1u) * 4 + 12, asuint(inv_mat.x)); + _285.Store((di + 2u) * 4 + 12, asuint(inv_mat.y)); + _285.Store((di + 3u) * 4 + 12, asuint(inv_mat.z)); + _285.Store((di + 4u) * 4 + 12, asuint(inv_mat.w)); + _285.Store((di + 5u) * 4 + 12, asuint(inv_tr.x)); + _285.Store((di + 6u) * 4 + 12, asuint(inv_tr.y)); + _285.Store((di + 7u) * 4 + 12, asuint(c1.x)); + _285.Store((di + 8u) * 4 + 12, asuint(c1.y)); + _285.Store((di + 9u) * 4 + 12, asuint(ra)); + _285.Store((di + 10u) * 4 + 12, asuint(roff)); break; } case 5u: @@ -253,7 +254,7 @@ void comp_main() { path_ix = m.path_ix; } - _285.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix); + _285.Store((clip_out_base + m.clip_ix) * 4 + 12, path_ix); } } } diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl index c11e21b..2ec1911 100644 --- a/piet-gpu/shader/gen/draw_leaf.msl +++ b/piet-gpu/shader/gen/draw_leaf.msl @@ -59,6 +59,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -115,6 +116,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv index 58dde43..f9feedc 100644 Binary files a/piet-gpu/shader/gen/draw_leaf.spv and b/piet-gpu/shader/gen/draw_leaf.spv differ diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil index be69aad..5516a9a 100644 Binary files a/piet-gpu/shader/gen/draw_reduce.dxil and b/piet-gpu/shader/gen/draw_reduce.dxil differ diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl index 8311155..1a8f2b1 100644 --- a/piet-gpu/shader/gen/draw_reduce.hlsl +++ b/piet-gpu/shader/gen/draw_reduce.hlsl @@ -13,6 +13,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -81,7 +82,7 @@ DrawMonoid combine_draw_monoid(DrawMonoid a, DrawMonoid b) void comp_main() { uint ix = gl_GlobalInvocationID.x * 8u; - uint drawtag_base = _87.Load(100) >> uint(2); + uint drawtag_base = _87.Load(104) >> uint(2); uint tag_word = _97.Load((drawtag_base + ix) * 4 + 0); uint param = tag_word; DrawMonoid agg = map_tag(param); diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl index 759267c..b2510e3 100644 --- a/piet-gpu/shader/gen/draw_reduce.msl +++ b/piet-gpu/shader/gen/draw_reduce.msl @@ -20,6 +20,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -76,6 +77,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv index d6c6fb7..2992677 100644 Binary files a/piet-gpu/shader/gen/draw_reduce.spv and b/piet-gpu/shader/gen/draw_reduce.spv differ diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil index c48d59d..33873b3 100644 Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl index 0a6c022..2e1f937 100644 --- a/piet-gpu/shader/gen/kernel4.hlsl +++ b/piet-gpu/shader/gen/kernel4.hlsl @@ -130,6 +130,7 @@ struct TileSeg struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -163,9 +164,10 @@ static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); RWByteAddressBuffer _297 : register(u0, space0); ByteAddressBuffer _1681 : register(t1, space0); -RWTexture2D image_atlas : register(u3, space0); -RWTexture2D gradients : register(u4, space0); -RWTexture2D image : register(u2, space0); +RWByteAddressBuffer _2506 : register(u2, space0); +RWTexture2D image_atlas : register(u4, space0); +RWTexture2D gradients : register(u5, space0); +RWTexture2D image : register(u3, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -206,7 +208,7 @@ uint read_mem(Alloc alloc, uint offset) { return 0u; } - uint v = _297.Load(offset * 4 + 8); + uint v = _297.Load(offset * 4 + 12); return v; } @@ -989,9 +991,9 @@ CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) void comp_main() { - uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x; + uint tile_ix = (gl_WorkGroupID.y * _1681.Load(12)) + gl_WorkGroupID.x; Alloc _1696; - _1696.offset = _1681.Load(24); + _1696.offset = _1681.Load(28); Alloc param; param.offset = _1696.offset; uint param_1 = tile_ix * 1024u; @@ -999,7 +1001,7 @@ void comp_main() Alloc cmd_alloc = slice_mem(param, param_1, param_2); CmdRef _1705 = { cmd_alloc.offset }; CmdRef cmd_ref = _1705; - uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8); + uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 12); cmd_ref.offset += 4u; uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); float2 xy = float2(xy_uint); @@ -1009,14 +1011,13 @@ void comp_main() rgba[i] = 0.0f.xxxx; } uint clip_depth = 0u; - bool mem_ok = _297.Load(4) == 0u; float df[8]; TileSegRef tile_seg_ref; float area[8]; uint blend_stack[4][8]; uint base_ix_1; uint bg_rgba; - while (mem_ok) + while (true) { Alloc param_3 = cmd_alloc; CmdRef param_4 = cmd_ref; @@ -1036,13 +1037,13 @@ void comp_main() { df[k] = 1000000000.0f; } - TileSegRef _1810 = { stroke.tile_ref }; - tile_seg_ref = _1810; + TileSegRef _1805 = { stroke.tile_ref }; + tile_seg_ref = _1805; do { uint param_7 = tile_seg_ref.offset; uint param_8 = 24u; - bool param_9 = mem_ok; + bool param_9 = true; Alloc param_10 = new_alloc(param_7, param_8, param_9); TileSegRef param_11 = tile_seg_ref; TileSeg seg = TileSeg_read(param_10, param_11); @@ -1073,13 +1074,13 @@ void comp_main() { area[k_3] = float(fill.backdrop); } - TileSegRef _1930 = { fill.tile_ref }; - tile_seg_ref = _1930; + TileSegRef _1924 = { fill.tile_ref }; + tile_seg_ref = _1924; do { uint param_15 = tile_seg_ref.offset; uint param_16 = 24u; - bool param_17 = mem_ok; + bool param_17 = true; Alloc param_18 = new_alloc(param_15, param_16, param_17); TileSegRef param_19 = tile_seg_ref; TileSeg seg_1 = TileSeg_read(param_18, param_19); @@ -1163,10 +1164,10 @@ void comp_main() int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba = gradients[int2(x, int(lin.index))]; float3 param_29 = fg_rgba.xyz; - float3 _2264 = fromsRGB(param_29); - fg_rgba.x = _2264.x; - fg_rgba.y = _2264.y; - fg_rgba.z = _2264.z; + float3 _2257 = fromsRGB(param_29); + fg_rgba.x = _2257.x; + fg_rgba.y = _2257.y; + fg_rgba.z = _2257.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1; } @@ -1189,10 +1190,10 @@ void comp_main() int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))]; float3 param_33 = fg_rgba_1.xyz; - float3 _2374 = fromsRGB(param_33); - fg_rgba_1.x = _2374.x; - fg_rgba_1.y = _2374.y; - fg_rgba_1.z = _2374.z; + float3 _2367 = fromsRGB(param_33); + fg_rgba_1.x = _2367.x; + fg_rgba_1.y = _2367.y; + fg_rgba_1.z = _2367.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2; } @@ -1206,9 +1207,9 @@ void comp_main() CmdImage fill_img = Cmd_Image_read(param_34, param_35); uint2 param_36 = xy_uint; CmdImage param_37 = fill_img; - float4 _2417[8]; - fillImage(_2417, param_36, param_37); - float4 img[8] = _2417; + float4 _2410[8]; + fillImage(_2410, param_36, param_37); + float4 img[8] = _2410; for (uint k_11 = 0u; k_11 < 8u; k_11++) { float4 fg_k_3 = img[k_11] * area[k_11]; @@ -1224,8 +1225,8 @@ void comp_main() for (uint k_12 = 0u; k_12 < 8u; k_12++) { float4 param_38 = float4(rgba[k_12]); - uint _2479 = packsRGB(param_38); - blend_stack[clip_depth][k_12] = _2479; + uint _2472 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2472; rgba[k_12] = 0.0f.xxxx; } } @@ -1235,8 +1236,8 @@ void comp_main() for (uint k_13 = 0u; k_13 < 8u; k_13++) { float4 param_39 = float4(rgba[k_13]); - uint _2522 = packsRGB(param_39); - _297.Store((base_ix + k_13) * 4 + 8, _2522); + uint _2519 = packsRGB(param_39); + _2506.Store((base_ix + k_13) * 4 + 0, _2519); rgba[k_13] = 0.0f.xxxx; } } @@ -1262,7 +1263,7 @@ void comp_main() } else { - bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8); + bg_rgba = _2506.Load((base_ix_1 + k_14) * 4 + 0); } uint param_42 = bg_rgba; float4 bg = unpacksRGB(param_42); @@ -1279,8 +1280,8 @@ void comp_main() { Alloc param_46 = cmd_alloc; CmdRef param_47 = cmd_ref; - CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref }; - cmd_ref = _2621; + CmdRef _2618 = { Cmd_Jump_read(param_46, param_47).new_ref }; + cmd_ref = _2618; cmd_alloc.offset = cmd_ref.offset; break; } diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl index f60ea81..1cf8cb3 100644 --- a/piet-gpu/shader/gen/kernel4.msl +++ b/piet-gpu/shader/gen/kernel4.msl @@ -178,6 +178,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -188,6 +189,7 @@ struct Alloc_1 struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -222,6 +224,11 @@ struct ConfigBuf Config conf; }; +struct BlendBuf +{ + uint blend_mem[1]; +}; + constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u); static inline __attribute__((always_inline)) @@ -1047,7 +1054,7 @@ CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Me return CmdJump_read(param, param_1, v_297); } -kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], device BlendBuf& _2506 [[buffer(2)]], texture2d image [[texture(3)]], texture2d image_atlas [[texture(4)]], texture2d gradients [[texture(5)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x; Alloc param; @@ -1066,14 +1073,13 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 rgba[i] = float4(0.0); } uint clip_depth = 0u; - bool mem_ok = v_297.mem_error == 0u; spvUnsafeArray df; TileSegRef tile_seg_ref; spvUnsafeArray area; spvUnsafeArray, 4> blend_stack; uint base_ix_1; uint bg_rgba; - while (mem_ok) + while (true) { Alloc param_3 = cmd_alloc; CmdRef param_4 = cmd_ref; @@ -1098,7 +1104,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 { uint param_7 = tile_seg_ref.offset; uint param_8 = 24u; - bool param_9 = mem_ok; + bool param_9 = true; Alloc param_10 = new_alloc(param_7, param_8, param_9); TileSegRef param_11 = tile_seg_ref; TileSeg seg = TileSeg_read(param_10, param_11, v_297); @@ -1134,7 +1140,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 { uint param_15 = tile_seg_ref.offset; uint param_16 = 24u; - bool param_17 = mem_ok; + bool param_17 = true; Alloc param_18 = new_alloc(param_15, param_16, param_17); TileSegRef param_19 = tile_seg_ref; TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297); @@ -1218,10 +1224,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); float3 param_29 = fg_rgba.xyz; - float3 _2264 = fromsRGB(param_29); - fg_rgba.x = _2264.x; - fg_rgba.y = _2264.y; - fg_rgba.z = _2264.z; + float3 _2257 = fromsRGB(param_29); + fg_rgba.x = _2257.x; + fg_rgba.y = _2257.y; + fg_rgba.z = _2257.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1; } @@ -1244,10 +1250,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); float3 param_33 = fg_rgba_1.xyz; - float3 _2374 = fromsRGB(param_33); - fg_rgba_1.x = _2374.x; - fg_rgba_1.y = _2374.y; - fg_rgba_1.z = _2374.z; + float3 _2367 = fromsRGB(param_33); + fg_rgba_1.x = _2367.x; + fg_rgba_1.y = _2367.y; + fg_rgba_1.z = _2367.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2; } @@ -1278,8 +1284,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 for (uint k_12 = 0u; k_12 < 8u; k_12++) { float4 param_38 = float4(rgba[k_12]); - uint _2479 = packsRGB(param_38); - blend_stack[clip_depth][k_12] = _2479; + uint _2472 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2472; rgba[k_12] = float4(0.0); } } @@ -1289,8 +1295,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 for (uint k_13 = 0u; k_13 < 8u; k_13++) { float4 param_39 = float4(rgba[k_13]); - uint _2522 = packsRGB(param_39); - v_297.memory[base_ix + k_13] = _2522; + uint _2519 = packsRGB(param_39); + _2506.blend_mem[base_ix + k_13] = _2519; rgba[k_13] = float4(0.0); } } @@ -1316,7 +1322,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } else { - bg_rgba = v_297.memory[base_ix_1 + k_14]; + bg_rgba = _2506.blend_mem[base_ix_1 + k_14]; } uint param_42 = bg_rgba; float4 bg = unpacksRGB(param_42); diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv index c388941..f9198c3 100644 Binary files a/piet-gpu/shader/gen/kernel4.spv and b/piet-gpu/shader/gen/kernel4.spv differ diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil index 7390167..bacd925 100644 Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl index ffada37..392d1f3 100644 --- a/piet-gpu/shader/gen/kernel4_gray.hlsl +++ b/piet-gpu/shader/gen/kernel4_gray.hlsl @@ -130,6 +130,7 @@ struct TileSeg struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -163,9 +164,10 @@ static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); RWByteAddressBuffer _297 : register(u0, space0); ByteAddressBuffer _1681 : register(t1, space0); -RWTexture2D image_atlas : register(u3, space0); -RWTexture2D gradients : register(u4, space0); -RWTexture2D image : register(u2, space0); +RWByteAddressBuffer _2506 : register(u2, space0); +RWTexture2D image_atlas : register(u4, space0); +RWTexture2D gradients : register(u5, space0); +RWTexture2D image : register(u3, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -206,7 +208,7 @@ uint read_mem(Alloc alloc, uint offset) { return 0u; } - uint v = _297.Load(offset * 4 + 8); + uint v = _297.Load(offset * 4 + 12); return v; } @@ -989,9 +991,9 @@ CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) void comp_main() { - uint tile_ix = (gl_WorkGroupID.y * _1681.Load(8)) + gl_WorkGroupID.x; + uint tile_ix = (gl_WorkGroupID.y * _1681.Load(12)) + gl_WorkGroupID.x; Alloc _1696; - _1696.offset = _1681.Load(24); + _1696.offset = _1681.Load(28); Alloc param; param.offset = _1696.offset; uint param_1 = tile_ix * 1024u; @@ -999,7 +1001,7 @@ void comp_main() Alloc cmd_alloc = slice_mem(param, param_1, param_2); CmdRef _1705 = { cmd_alloc.offset }; CmdRef cmd_ref = _1705; - uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8); + uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 12); cmd_ref.offset += 4u; uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); float2 xy = float2(xy_uint); @@ -1009,14 +1011,13 @@ void comp_main() rgba[i] = 0.0f.xxxx; } uint clip_depth = 0u; - bool mem_ok = _297.Load(4) == 0u; float df[8]; TileSegRef tile_seg_ref; float area[8]; uint blend_stack[4][8]; uint base_ix_1; uint bg_rgba; - while (mem_ok) + while (true) { Alloc param_3 = cmd_alloc; CmdRef param_4 = cmd_ref; @@ -1036,13 +1037,13 @@ void comp_main() { df[k] = 1000000000.0f; } - TileSegRef _1810 = { stroke.tile_ref }; - tile_seg_ref = _1810; + TileSegRef _1805 = { stroke.tile_ref }; + tile_seg_ref = _1805; do { uint param_7 = tile_seg_ref.offset; uint param_8 = 24u; - bool param_9 = mem_ok; + bool param_9 = true; Alloc param_10 = new_alloc(param_7, param_8, param_9); TileSegRef param_11 = tile_seg_ref; TileSeg seg = TileSeg_read(param_10, param_11); @@ -1073,13 +1074,13 @@ void comp_main() { area[k_3] = float(fill.backdrop); } - TileSegRef _1930 = { fill.tile_ref }; - tile_seg_ref = _1930; + TileSegRef _1924 = { fill.tile_ref }; + tile_seg_ref = _1924; do { uint param_15 = tile_seg_ref.offset; uint param_16 = 24u; - bool param_17 = mem_ok; + bool param_17 = true; Alloc param_18 = new_alloc(param_15, param_16, param_17); TileSegRef param_19 = tile_seg_ref; TileSeg seg_1 = TileSeg_read(param_18, param_19); @@ -1163,10 +1164,10 @@ void comp_main() int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba = gradients[int2(x, int(lin.index))]; float3 param_29 = fg_rgba.xyz; - float3 _2264 = fromsRGB(param_29); - fg_rgba.x = _2264.x; - fg_rgba.y = _2264.y; - fg_rgba.z = _2264.z; + float3 _2257 = fromsRGB(param_29); + fg_rgba.x = _2257.x; + fg_rgba.y = _2257.y; + fg_rgba.z = _2257.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1; } @@ -1189,10 +1190,10 @@ void comp_main() int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))]; float3 param_33 = fg_rgba_1.xyz; - float3 _2374 = fromsRGB(param_33); - fg_rgba_1.x = _2374.x; - fg_rgba_1.y = _2374.y; - fg_rgba_1.z = _2374.z; + float3 _2367 = fromsRGB(param_33); + fg_rgba_1.x = _2367.x; + fg_rgba_1.y = _2367.y; + fg_rgba_1.z = _2367.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2; } @@ -1206,9 +1207,9 @@ void comp_main() CmdImage fill_img = Cmd_Image_read(param_34, param_35); uint2 param_36 = xy_uint; CmdImage param_37 = fill_img; - float4 _2417[8]; - fillImage(_2417, param_36, param_37); - float4 img[8] = _2417; + float4 _2410[8]; + fillImage(_2410, param_36, param_37); + float4 img[8] = _2410; for (uint k_11 = 0u; k_11 < 8u; k_11++) { float4 fg_k_3 = img[k_11] * area[k_11]; @@ -1224,8 +1225,8 @@ void comp_main() for (uint k_12 = 0u; k_12 < 8u; k_12++) { float4 param_38 = float4(rgba[k_12]); - uint _2479 = packsRGB(param_38); - blend_stack[clip_depth][k_12] = _2479; + uint _2472 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2472; rgba[k_12] = 0.0f.xxxx; } } @@ -1235,8 +1236,8 @@ void comp_main() for (uint k_13 = 0u; k_13 < 8u; k_13++) { float4 param_39 = float4(rgba[k_13]); - uint _2522 = packsRGB(param_39); - _297.Store((base_ix + k_13) * 4 + 8, _2522); + uint _2519 = packsRGB(param_39); + _2506.Store((base_ix + k_13) * 4 + 0, _2519); rgba[k_13] = 0.0f.xxxx; } } @@ -1262,7 +1263,7 @@ void comp_main() } else { - bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8); + bg_rgba = _2506.Load((base_ix_1 + k_14) * 4 + 0); } uint param_42 = bg_rgba; float4 bg = unpacksRGB(param_42); @@ -1279,8 +1280,8 @@ void comp_main() { Alloc param_46 = cmd_alloc; CmdRef param_47 = cmd_ref; - CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref }; - cmd_ref = _2621; + CmdRef _2618 = { Cmd_Jump_read(param_46, param_47).new_ref }; + cmd_ref = _2618; cmd_alloc.offset = cmd_ref.offset; break; } diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl index e174713..45e7a0e 100644 --- a/piet-gpu/shader/gen/kernel4_gray.msl +++ b/piet-gpu/shader/gen/kernel4_gray.msl @@ -178,6 +178,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -188,6 +189,7 @@ struct Alloc_1 struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -222,6 +224,11 @@ struct ConfigBuf Config conf; }; +struct BlendBuf +{ + uint blend_mem[1]; +}; + constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 4u, 1u); static inline __attribute__((always_inline)) @@ -1047,7 +1054,7 @@ CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Me return CmdJump_read(param, param_1, v_297); } -kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1681 [[buffer(1)]], device BlendBuf& _2506 [[buffer(2)]], texture2d image [[texture(3)]], texture2d image_atlas [[texture(4)]], texture2d gradients [[texture(5)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { uint tile_ix = (gl_WorkGroupID.y * _1681.conf.width_in_tiles) + gl_WorkGroupID.x; Alloc param; @@ -1066,14 +1073,13 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 rgba[i] = float4(0.0); } uint clip_depth = 0u; - bool mem_ok = v_297.mem_error == 0u; spvUnsafeArray df; TileSegRef tile_seg_ref; spvUnsafeArray area; spvUnsafeArray, 4> blend_stack; uint base_ix_1; uint bg_rgba; - while (mem_ok) + while (true) { Alloc param_3 = cmd_alloc; CmdRef param_4 = cmd_ref; @@ -1098,7 +1104,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 { uint param_7 = tile_seg_ref.offset; uint param_8 = 24u; - bool param_9 = mem_ok; + bool param_9 = true; Alloc param_10 = new_alloc(param_7, param_8, param_9); TileSegRef param_11 = tile_seg_ref; TileSeg seg = TileSeg_read(param_10, param_11, v_297); @@ -1134,7 +1140,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 { uint param_15 = tile_seg_ref.offset; uint param_16 = 24u; - bool param_17 = mem_ok; + bool param_17 = true; Alloc param_18 = new_alloc(param_15, param_16, param_17); TileSegRef param_19 = tile_seg_ref; TileSeg seg_1 = TileSeg_read(param_18, param_19, v_297); @@ -1218,10 +1224,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); float3 param_29 = fg_rgba.xyz; - float3 _2264 = fromsRGB(param_29); - fg_rgba.x = _2264.x; - fg_rgba.y = _2264.y; - fg_rgba.z = _2264.z; + float3 _2257 = fromsRGB(param_29); + fg_rgba.x = _2257.x; + fg_rgba.y = _2257.y; + fg_rgba.z = _2257.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1; } @@ -1244,10 +1250,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); float3 param_33 = fg_rgba_1.xyz; - float3 _2374 = fromsRGB(param_33); - fg_rgba_1.x = _2374.x; - fg_rgba_1.y = _2374.y; - fg_rgba_1.z = _2374.z; + float3 _2367 = fromsRGB(param_33); + fg_rgba_1.x = _2367.x; + fg_rgba_1.y = _2367.y; + fg_rgba_1.z = _2367.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2; } @@ -1278,8 +1284,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 for (uint k_12 = 0u; k_12 < 8u; k_12++) { float4 param_38 = float4(rgba[k_12]); - uint _2479 = packsRGB(param_38); - blend_stack[clip_depth][k_12] = _2479; + uint _2472 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2472; rgba[k_12] = float4(0.0); } } @@ -1289,8 +1295,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 for (uint k_13 = 0u; k_13 < 8u; k_13++) { float4 param_39 = float4(rgba[k_13]); - uint _2522 = packsRGB(param_39); - v_297.memory[base_ix + k_13] = _2522; + uint _2519 = packsRGB(param_39); + _2506.blend_mem[base_ix + k_13] = _2519; rgba[k_13] = float4(0.0); } } @@ -1316,7 +1322,7 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } else { - bg_rgba = v_297.memory[base_ix_1 + k_14]; + bg_rgba = _2506.blend_mem[base_ix_1 + k_14]; } uint param_42 = bg_rgba; float4 bg = unpacksRGB(param_42); diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv index 17c7531..df86875 100644 Binary files a/piet-gpu/shader/gen/kernel4_gray.spv and b/piet-gpu/shader/gen/kernel4_gray.spv differ diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil index 9fd593c..2842f0d 100644 Binary files a/piet-gpu/shader/gen/path_coarse.dxil and b/piet-gpu/shader/gen/path_coarse.dxil differ diff --git a/piet-gpu/shader/gen/path_coarse.hlsl b/piet-gpu/shader/gen/path_coarse.hlsl index 93ee8f0..106fdfc 100644 --- a/piet-gpu/shader/gen/path_coarse.hlsl +++ b/piet-gpu/shader/gen/path_coarse.hlsl @@ -3,12 +3,6 @@ struct Alloc uint offset; }; -struct MallocResult -{ - Alloc alloc; - bool failed; -}; - struct PathCubicRef { uint offset; @@ -74,6 +68,7 @@ struct SubdivResult struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -105,10 +100,10 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(32u, 1u, 1u); -static const PathSegTag _721 = { 0u, 0u }; +static const PathSegTag _722 = { 0u, 0u }; -RWByteAddressBuffer _136 : register(u0, space0); -ByteAddressBuffer _710 : register(t1, space0); +RWByteAddressBuffer _143 : register(u0, space0); +ByteAddressBuffer _711 : register(t1, space0); static uint3 gl_GlobalInvocationID; struct SPIRV_Cross_Input @@ -116,6 +111,15 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; +static bool mem_ok; + +bool check_deps(uint dep_stage) +{ + uint _149; + _143.InterlockedOr(4, 0u, _149); + return (_149 & dep_stage) == 0u; +} + bool touch_mem(Alloc alloc, uint offset) { return true; @@ -129,7 +133,7 @@ uint read_mem(Alloc alloc, uint offset) { return 0u; } - uint v = _136.Load(offset * 4 + 8); + uint v = _143.Load(offset * 4 + 12); return v; } @@ -138,8 +142,8 @@ PathSegTag PathSeg_tag(Alloc a, PathSegRef ref) Alloc param = a; uint param_1 = ref.offset >> uint(2); uint tag_and_flags = read_mem(param, param_1); - PathSegTag _367 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; - return _367; + PathSegTag _362 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _362; } PathCubic PathCubic_read(Alloc a, PathCubicRef ref) @@ -194,9 +198,9 @@ PathCubic PathCubic_read(Alloc a, PathCubicRef ref) PathCubic PathSeg_Cubic_read(Alloc a, PathSegRef ref) { - PathCubicRef _373 = { ref.offset + 4u }; + PathCubicRef _368 = { ref.offset + 4u }; Alloc param = a; - PathCubicRef param_1 = _373; + PathCubicRef param_1 = _368; return PathCubic_read(param, param_1); } @@ -240,8 +244,8 @@ SubdivResult estimate_subdiv(float2 p0, float2 p1, float2 p2, float sqrt_tol) val = (sqrt_tol * da) / approx_parabola_integral(param_2); } } - SubdivResult _695 = { val, a0, a2 }; - return _695; + SubdivResult _690 = { val, a0, a2 }; + return _690; } uint fill_mode_from_flags(uint flags) @@ -263,12 +267,12 @@ Path Path_read(Alloc a, PathRef ref) uint raw2 = read_mem(param_4, param_5); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); - TileRef _427 = { raw2 }; - s.tiles = _427; + TileRef _422 = { raw2 }; + s.tiles = _422; return s; } -Alloc new_alloc(uint offset, uint size, bool mem_ok) +Alloc new_alloc(uint offset, uint size, bool mem_ok_1) { Alloc a; a.offset = offset; @@ -286,33 +290,24 @@ float2 eval_quad(float2 p0, float2 p1, float2 p2, float t) return (p0 * (mt * mt)) + (((p1 * (mt * 2.0f)) + (p2 * t)) * t); } -MallocResult malloc(uint size) +uint malloc_stage(uint size, uint mem_size, uint stage) { - uint _142; - _136.InterlockedAdd(0, size, _142); - uint offset = _142; - uint _149; - _136.GetDimensions(_149); - _149 = (_149 - 8) / 4; - MallocResult r; - r.failed = (offset + size) > uint(int(_149) * 4); - uint param = offset; - uint param_1 = size; - bool param_2 = !r.failed; - r.alloc = new_alloc(param, param_1, param_2); - if (r.failed) + uint _158; + _143.InterlockedAdd(0, size, _158); + uint offset = _158; + if ((offset + size) > mem_size) { - uint _171; - _136.InterlockedMax(4, 1u, _171); - return r; + uint _168; + _143.InterlockedOr(4, stage, _168); + offset = 0u; } - return r; + return offset; } TileRef Tile_index(TileRef ref, uint index) { - TileRef _385 = { ref.offset + (index * 8u) }; - return _385; + TileRef _380 = { ref.offset + (index * 8u) }; + return _380; } void write_mem(Alloc alloc, uint offset, uint val) @@ -323,7 +318,7 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _136.Store(offset * 4 + 8, val); + _143.Store(offset * 4 + 12, val); } void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) @@ -357,30 +352,36 @@ void TileSeg_write(Alloc a, TileSegRef ref, TileSeg s) void comp_main() { - uint element_ix = gl_GlobalInvocationID.x; - PathSegRef _718 = { _710.Load(28) + (element_ix * 52u) }; - PathSegRef ref = _718; - PathSegTag tag = _721; - if (element_ix < _710.Load(4)) + mem_ok = true; + uint param = 7u; + bool _694 = check_deps(param); + if (!_694) { - Alloc _731; - _731.offset = _710.Load(28); - Alloc param; - param.offset = _731.offset; - PathSegRef param_1 = ref; - tag = PathSeg_tag(param, param_1); + return; + } + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef _719 = { _711.Load(32) + (element_ix * 52u) }; + PathSegRef ref = _719; + PathSegTag tag = _722; + if (element_ix < _711.Load(8)) + { + Alloc _732; + _732.offset = _711.Load(32); + Alloc param_1; + param_1.offset = _732.offset; + PathSegRef param_2 = ref; + tag = PathSeg_tag(param_1, param_2); } - bool mem_ok = _136.Load(4) == 0u; switch (tag.tag) { case 1u: { - Alloc _748; - _748.offset = _710.Load(28); - Alloc param_2; - param_2.offset = _748.offset; - PathSegRef param_3 = ref; - PathCubic cubic = PathSeg_Cubic_read(param_2, param_3); + Alloc _745; + _745.offset = _711.Load(32); + Alloc param_3; + param_3.offset = _745.offset; + PathSegRef param_4 = ref; + PathCubic cubic = PathSeg_Cubic_read(param_3, param_4); float2 err_v = (((cubic.p2 - cubic.p1) * 3.0f) + cubic.p0) - cubic.p3; float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875f, 0.16666667163372039794921875f))), 1u); @@ -392,43 +393,43 @@ void comp_main() for (uint i = 0u; i < n_quads; i++) { float t = float(i + 1u) * _step; - float2 param_4 = cubic.p0; - float2 param_5 = cubic.p1; - float2 param_6 = cubic.p2; - float2 param_7 = cubic.p3; - float param_8 = t; - float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8); - float2 param_9 = cubic.p0; - float2 param_10 = cubic.p1; - float2 param_11 = cubic.p2; - float2 param_12 = cubic.p3; - float param_13 = t - (0.5f * _step); - float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13); + float2 param_5 = cubic.p0; + float2 param_6 = cubic.p1; + float2 param_7 = cubic.p2; + float2 param_8 = cubic.p3; + float param_9 = t; + float2 qp2 = eval_cubic(param_5, param_6, param_7, param_8, param_9); + float2 param_10 = cubic.p0; + float2 param_11 = cubic.p1; + float2 param_12 = cubic.p2; + float2 param_13 = cubic.p3; + float param_14 = t - (0.5f * _step); + float2 qp1 = eval_cubic(param_10, param_11, param_12, param_13, param_14); qp1 = (qp1 * 2.0f) - ((qp0 + qp2) * 0.5f); - float2 param_14 = qp0; - float2 param_15 = qp1; - float2 param_16 = qp2; - float param_17 = 0.4743416607379913330078125f; - SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17); + float2 param_15 = qp0; + float2 param_16 = qp1; + float2 param_17 = qp2; + float param_18 = 0.4743416607379913330078125f; + SubdivResult params = estimate_subdiv(param_15, param_16, param_17, param_18); keep_params[i] = params; val += params.val; qp0 = qp2; } uint n = max(uint(ceil((val * 0.5f) / 0.4743416607379913330078125f)), 1u); - uint param_18 = tag.flags; - bool is_stroke = fill_mode_from_flags(param_18) == 1u; + uint param_19 = tag.flags; + bool is_stroke = fill_mode_from_flags(param_19) == 1u; uint path_ix = cubic.path_ix; - PathRef _904 = { _710.Load(16) + (path_ix * 12u) }; - Alloc _907; - _907.offset = _710.Load(16); - Alloc param_19; - param_19.offset = _907.offset; - PathRef param_20 = _904; - Path path = Path_read(param_19, param_20); - uint param_21 = path.tiles.offset; - uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; - bool param_23 = mem_ok; - Alloc path_alloc = new_alloc(param_21, param_22, param_23); + PathRef _901 = { _711.Load(20) + (path_ix * 12u) }; + Alloc _904; + _904.offset = _711.Load(20); + Alloc param_20; + param_20.offset = _904.offset; + PathRef param_21 = _901; + Path path = Path_read(param_20, param_21); + uint param_22 = path.tiles.offset; + uint param_23 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_24 = true; + Alloc path_alloc = new_alloc(param_22, param_23, param_24); int4 bbox = int4(path.bbox); float2 p0 = cubic.p0; qp0 = cubic.p0; @@ -436,44 +437,44 @@ void comp_main() int n_out = 1; float val_sum = 0.0f; float2 p1; - float _1147; + float _1143; TileSeg tile_seg; for (uint i_1 = 0u; i_1 < n_quads; i_1++) { float t_1 = float(i_1 + 1u) * _step; - float2 param_24 = cubic.p0; - float2 param_25 = cubic.p1; - float2 param_26 = cubic.p2; - float2 param_27 = cubic.p3; - float param_28 = t_1; - float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28); - float2 param_29 = cubic.p0; - float2 param_30 = cubic.p1; - float2 param_31 = cubic.p2; - float2 param_32 = cubic.p3; - float param_33 = t_1 - (0.5f * _step); - float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33); + float2 param_25 = cubic.p0; + float2 param_26 = cubic.p1; + float2 param_27 = cubic.p2; + float2 param_28 = cubic.p3; + float param_29 = t_1; + float2 qp2_1 = eval_cubic(param_25, param_26, param_27, param_28, param_29); + float2 param_30 = cubic.p0; + float2 param_31 = cubic.p1; + float2 param_32 = cubic.p2; + float2 param_33 = cubic.p3; + float param_34 = t_1 - (0.5f * _step); + float2 qp1_1 = eval_cubic(param_30, param_31, param_32, param_33, param_34); qp1_1 = (qp1_1 * 2.0f) - ((qp0 + qp2_1) * 0.5f); SubdivResult params_1 = keep_params[i_1]; - float param_34 = params_1.a0; - float u0 = approx_parabola_inv_integral(param_34); - float param_35 = params_1.a2; - float u2 = approx_parabola_inv_integral(param_35); + float param_35 = params_1.a0; + float u0 = approx_parabola_inv_integral(param_35); + float param_36 = params_1.a2; + float u2 = approx_parabola_inv_integral(param_36); float uscale = 1.0f / (u2 - u0); float target = float(n_out) * v_step; for (;;) { - bool _1040 = uint(n_out) == n; - bool _1050; - if (!_1040) + bool _1036 = uint(n_out) == n; + bool _1046; + if (!_1036) { - _1050 = target < (val_sum + params_1.val); + _1046 = target < (val_sum + params_1.val); } else { - _1050 = _1040; + _1046 = _1036; } - if (_1050) + if (_1046) { if (uint(n_out) == n) { @@ -483,14 +484,14 @@ void comp_main() { float u = (target - val_sum) / params_1.val; float a = lerp(params_1.a0, params_1.a2, u); - float param_36 = a; - float au = approx_parabola_inv_integral(param_36); + float param_37 = a; + float au = approx_parabola_inv_integral(param_37); float t_2 = (au - u0) * uscale; - float2 param_37 = qp0; - float2 param_38 = qp1_1; - float2 param_39 = qp2_1; - float param_40 = t_2; - p1 = eval_quad(param_37, param_38, param_39, param_40); + float2 param_38 = qp0; + float2 param_39 = qp1_1; + float2 param_40 = qp2_1; + float param_41 = t_2; + p1 = eval_quad(param_38, param_39, param_40, param_41); } float xmin = min(p0.x, p1.x) - cubic.stroke.x; float xmax = max(p0.x, p1.x) + cubic.stroke.x; @@ -500,13 +501,13 @@ void comp_main() float dy = p1.y - p0.y; if (abs(dy) < 9.999999717180685365747194737196e-10f) { - _1147 = 1000000000.0f; + _1143 = 1000000000.0f; } else { - _1147 = dx / dy; + _1143 = dx / dy; } - float invslope = _1147; + float invslope = _1143; float c = (cubic.stroke.x + (abs(invslope) * (8.0f + cubic.stroke.y))) * 0.0625f; float b = invslope; float a_1 = (p0.x - ((p0.y - 8.0f) * b)) * 0.0625f; @@ -522,14 +523,20 @@ void comp_main() int stride = bbox.z - bbox.x; int base = ((y0 - bbox.y) * stride) - bbox.x; uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); - uint param_41 = n_tile_alloc * 24u; - MallocResult _1263 = malloc(param_41); - MallocResult tile_alloc = _1263; - if (tile_alloc.failed || (!mem_ok)) + uint malloc_size = n_tile_alloc * 24u; + uint param_42 = malloc_size; + uint param_43 = _711.Load(0); + uint param_44 = 4u; + uint _1265 = malloc_stage(param_42, param_43, param_44); + uint tile_offset = _1265; + if (tile_offset == 0u) { - return; + mem_ok = false; } - uint tile_offset = tile_alloc.alloc.offset; + uint param_45 = tile_offset; + uint param_46 = malloc_size; + bool param_47 = true; + Alloc tile_alloc = new_alloc(param_45, param_46, param_47); int xray = int(floor(p0.x * 0.0625f)); int last_xray = int(floor(p1.x * 0.0625f)); if (p0.y > p1.y) @@ -542,39 +549,34 @@ void comp_main() { float tile_y0 = float(y * 16); int xbackdrop = max((xray + 1), bbox.x); - bool _1319 = !is_stroke; - bool _1329; - if (_1319) + bool _1322 = !is_stroke; + bool _1332; + if (_1322) { - _1329 = min(p0.y, p1.y) < tile_y0; + _1332 = min(p0.y, p1.y) < tile_y0; } else { - _1329 = _1319; + _1332 = _1322; } - bool _1336; - if (_1329) + bool _1339; + if (_1332) { - _1336 = xbackdrop < bbox.z; + _1339 = xbackdrop < bbox.z; } else { - _1336 = _1329; + _1339 = _1332; } - if (_1336) + if (_1339) { int backdrop = (p1.y < p0.y) ? 1 : (-1); - TileRef param_42 = path.tiles; - uint param_43 = uint(base + xbackdrop); - TileRef tile_ref = Tile_index(param_42, param_43); + TileRef param_48 = path.tiles; + uint param_49 = uint(base + xbackdrop); + TileRef tile_ref = Tile_index(param_48, param_49); uint tile_el = tile_ref.offset >> uint(2); - Alloc param_44 = path_alloc; - uint param_45 = tile_el + 1u; - if (touch_mem(param_44, param_45)) - { - uint _1374; - _136.InterlockedAdd((tile_el + 1u) * 4 + 8, uint(backdrop), _1374); - } + uint _1369; + _143.InterlockedAdd((tile_el + 1u) * 4 + 12, uint(backdrop), _1369); } int next_xray = last_xray; if (y < (y1 - 1)) @@ -592,20 +594,15 @@ void comp_main() for (int x = xx0; x < xx1; x++) { float tile_x0 = float(x * 16); - TileRef _1454 = { path.tiles.offset }; - TileRef param_46 = _1454; - uint param_47 = uint(base + x); - TileRef tile_ref_1 = Tile_index(param_46, param_47); + TileRef _1449 = { path.tiles.offset }; + TileRef param_50 = _1449; + uint param_51 = uint(base + x); + TileRef tile_ref_1 = Tile_index(param_50, param_51); uint tile_el_1 = tile_ref_1.offset >> uint(2); uint old = 0u; - Alloc param_48 = path_alloc; - uint param_49 = tile_el_1; - if (touch_mem(param_48, param_49)) - { - uint _1477; - _136.InterlockedExchange(tile_el_1 * 4 + 8, tile_offset, _1477); - old = _1477; - } + uint _1465; + _143.InterlockedExchange(tile_el_1 * 4 + 12, tile_offset, _1465); + old = _1465; tile_seg.origin = p0; tile_seg._vector = p1 - p0; float y_edge = 0.0f; @@ -636,11 +633,14 @@ void comp_main() } tile_seg.y_edge = y_edge; tile_seg.next.offset = old; - TileSegRef _1559 = { tile_offset }; - Alloc param_50 = tile_alloc.alloc; - TileSegRef param_51 = _1559; - TileSeg param_52 = tile_seg; - TileSeg_write(param_50, param_51, param_52); + if (mem_ok) + { + TileSegRef _1550 = { tile_offset }; + Alloc param_52 = tile_alloc; + TileSegRef param_53 = _1550; + TileSeg param_54 = tile_seg; + TileSeg_write(param_52, param_53, param_54); + } tile_offset += 24u; } xc += b; diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl index 26aa33a..4f59b3f 100644 --- a/piet-gpu/shader/gen/path_coarse.msl +++ b/piet-gpu/shader/gen/path_coarse.msl @@ -51,12 +51,6 @@ struct Alloc uint offset; }; -struct MallocResult -{ - Alloc alloc; - bool failed; -}; - struct PathCubicRef { uint offset; @@ -124,6 +118,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -134,6 +129,7 @@ struct Alloc_1 struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -170,6 +166,13 @@ struct ConfigBuf constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(32u, 1u, 1u); +static inline __attribute__((always_inline)) +bool check_deps(thread const uint& dep_stage, device Memory& v_143) +{ + uint _149 = atomic_fetch_or_explicit((device atomic_uint*)&v_143.mem_error, 0u, memory_order_relaxed); + return (_149 & dep_stage) == 0u; +} + static inline __attribute__((always_inline)) bool touch_mem(thread const Alloc& alloc, thread const uint& offset) { @@ -177,7 +180,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset) } static inline __attribute__((always_inline)) -uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_136, constant uint& v_136BufferSize) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_143) { Alloc param = alloc; uint param_1 = offset; @@ -185,59 +188,59 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor { return 0u; } - uint v = v_136.memory[offset]; + uint v = v_143.memory[offset]; return v; } static inline __attribute__((always_inline)) -PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +PathSegTag PathSeg_tag(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_143) { Alloc param = a; uint param_1 = ref.offset >> uint(2); - uint tag_and_flags = read_mem(param, param_1, v_136, v_136BufferSize); + uint tag_and_flags = read_mem(param, param_1, v_143); return PathSegTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; } static inline __attribute__((always_inline)) -PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, device Memory& v_143) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize); + uint raw0 = read_mem(param, param_1, v_143); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize); + uint raw1 = read_mem(param_2, param_3, v_143); Alloc param_4 = a; uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize); + uint raw2 = read_mem(param_4, param_5, v_143); Alloc param_6 = a; uint param_7 = ix + 3u; - uint raw3 = read_mem(param_6, param_7, v_136, v_136BufferSize); + uint raw3 = read_mem(param_6, param_7, v_143); Alloc param_8 = a; uint param_9 = ix + 4u; - uint raw4 = read_mem(param_8, param_9, v_136, v_136BufferSize); + uint raw4 = read_mem(param_8, param_9, v_143); Alloc param_10 = a; uint param_11 = ix + 5u; - uint raw5 = read_mem(param_10, param_11, v_136, v_136BufferSize); + uint raw5 = read_mem(param_10, param_11, v_143); Alloc param_12 = a; uint param_13 = ix + 6u; - uint raw6 = read_mem(param_12, param_13, v_136, v_136BufferSize); + uint raw6 = read_mem(param_12, param_13, v_143); Alloc param_14 = a; uint param_15 = ix + 7u; - uint raw7 = read_mem(param_14, param_15, v_136, v_136BufferSize); + uint raw7 = read_mem(param_14, param_15, v_143); Alloc param_16 = a; uint param_17 = ix + 8u; - uint raw8 = read_mem(param_16, param_17, v_136, v_136BufferSize); + uint raw8 = read_mem(param_16, param_17, v_143); Alloc param_18 = a; uint param_19 = ix + 9u; - uint raw9 = read_mem(param_18, param_19, v_136, v_136BufferSize); + uint raw9 = read_mem(param_18, param_19, v_143); Alloc param_20 = a; uint param_21 = ix + 10u; - uint raw10 = read_mem(param_20, param_21, v_136, v_136BufferSize); + uint raw10 = read_mem(param_20, param_21, v_143); Alloc param_22 = a; uint param_23 = ix + 11u; - uint raw11 = read_mem(param_22, param_23, v_136, v_136BufferSize); + uint raw11 = read_mem(param_22, param_23, v_143); PathCubic s; s.p0 = float2(as_type(raw0), as_type(raw1)); s.p1 = float2(as_type(raw2), as_type(raw3)); @@ -250,11 +253,11 @@ PathCubic PathCubic_read(thread const Alloc& a, thread const PathCubicRef& ref, } static inline __attribute__((always_inline)) -PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +PathCubic PathSeg_Cubic_read(thread const Alloc& a, thread const PathSegRef& ref, device Memory& v_143) { Alloc param = a; PathCubicRef param_1 = PathCubicRef{ ref.offset + 4u }; - return PathCubic_read(param, param_1, v_136, v_136BufferSize); + return PathCubic_read(param, param_1, v_143); } static inline __attribute__((always_inline)) @@ -310,18 +313,18 @@ uint fill_mode_from_flags(thread const uint& flags) } static inline __attribute__((always_inline)) -Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_136, constant uint& v_136BufferSize) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_143) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_136, v_136BufferSize); + uint raw0 = read_mem(param, param_1, v_143); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_136, v_136BufferSize); + uint raw1 = read_mem(param_2, param_3, v_143); Alloc param_4 = a; uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_136, v_136BufferSize); + uint raw2 = read_mem(param_4, param_5, v_143); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); s.tiles = TileRef{ raw2 }; @@ -350,22 +353,16 @@ float2 eval_quad(thread const float2& p0, thread const float2& p1, thread const } static inline __attribute__((always_inline)) -MallocResult malloc(thread const uint& size, device Memory& v_136, constant uint& v_136BufferSize) +uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_143) { - uint _142 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.mem_offset, size, memory_order_relaxed); - uint offset = _142; - MallocResult r; - r.failed = (offset + size) > uint(int((v_136BufferSize - 8) / 4) * 4); - uint param = offset; - uint param_1 = size; - bool param_2 = !r.failed; - r.alloc = new_alloc(param, param_1, param_2); - if (r.failed) + uint _158 = atomic_fetch_add_explicit((device atomic_uint*)&v_143.mem_offset, size, memory_order_relaxed); + uint offset = _158; + if ((offset + size) > mem_size) { - uint _171 = atomic_fetch_max_explicit((device atomic_uint*)&v_136.mem_error, 1u, memory_order_relaxed); - return r; + uint _168 = atomic_fetch_or_explicit((device atomic_uint*)&v_143.mem_error, stage, memory_order_relaxed); + offset = 0u; } - return r; + return offset; } static inline __attribute__((always_inline)) @@ -375,7 +372,7 @@ TileRef Tile_index(thread const TileRef& ref, thread const uint& index) } static inline __attribute__((always_inline)) -void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_136, constant uint& v_136BufferSize) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_143) { Alloc param = alloc; uint param_1 = offset; @@ -383,61 +380,66 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons { return; } - v_136.memory[offset] = val; + v_143.memory[offset] = val; } static inline __attribute__((always_inline)) -void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_136, constant uint& v_136BufferSize) +void TileSeg_write(thread const Alloc& a, thread const TileSegRef& ref, thread const TileSeg& s, device Memory& v_143) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = as_type(s.origin.x); - write_mem(param, param_1, param_2, v_136, v_136BufferSize); + write_mem(param, param_1, param_2, v_143); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.origin.y); - write_mem(param_3, param_4, param_5, v_136, v_136BufferSize); + write_mem(param_3, param_4, param_5, v_143); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.vector.x); - write_mem(param_6, param_7, param_8, v_136, v_136BufferSize); + write_mem(param_6, param_7, param_8, v_143); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.vector.y); - write_mem(param_9, param_10, param_11, v_136, v_136BufferSize); + write_mem(param_9, param_10, param_11, v_143); Alloc param_12 = a; uint param_13 = ix + 4u; uint param_14 = as_type(s.y_edge); - write_mem(param_12, param_13, param_14, v_136, v_136BufferSize); + write_mem(param_12, param_13, param_14, v_143); Alloc param_15 = a; uint param_16 = ix + 5u; uint param_17 = s.next.offset; - write_mem(param_15, param_16, param_17, v_136, v_136BufferSize); + write_mem(param_15, param_16, param_17, v_143); } -kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_136 [[buffer(0)]], const device ConfigBuf& _710 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +kernel void main0(device Memory& v_143 [[buffer(0)]], const device ConfigBuf& _711 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) { - constant uint& v_136BufferSize = spvBufferSizeConstants[0]; - uint element_ix = gl_GlobalInvocationID.x; - PathSegRef ref = PathSegRef{ _710.conf.pathseg_alloc.offset + (element_ix * 52u) }; - PathSegTag tag = PathSegTag{ 0u, 0u }; - if (element_ix < _710.conf.n_pathseg) + bool mem_ok = true; + uint param = 7u; + bool _694 = check_deps(param, v_143); + if (!_694) { - Alloc param; - param.offset = _710.conf.pathseg_alloc.offset; - PathSegRef param_1 = ref; - tag = PathSeg_tag(param, param_1, v_136, v_136BufferSize); + return; + } + uint element_ix = gl_GlobalInvocationID.x; + PathSegRef ref = PathSegRef{ _711.conf.pathseg_alloc.offset + (element_ix * 52u) }; + PathSegTag tag = PathSegTag{ 0u, 0u }; + if (element_ix < _711.conf.n_pathseg) + { + Alloc param_1; + param_1.offset = _711.conf.pathseg_alloc.offset; + PathSegRef param_2 = ref; + tag = PathSeg_tag(param_1, param_2, v_143); } - bool mem_ok = v_136.mem_error == 0u; switch (tag.tag) { case 1u: { - Alloc param_2; - param_2.offset = _710.conf.pathseg_alloc.offset; - PathSegRef param_3 = ref; - PathCubic cubic = PathSeg_Cubic_read(param_2, param_3, v_136, v_136BufferSize); + Alloc param_3; + param_3.offset = _711.conf.pathseg_alloc.offset; + PathSegRef param_4 = ref; + PathCubic cubic = PathSeg_Cubic_read(param_3, param_4, v_143); float2 err_v = (((cubic.p2 - cubic.p1) * 3.0) + cubic.p0) - cubic.p3; float err = (err_v.x * err_v.x) + (err_v.y * err_v.y); uint n_quads = max(uint(ceil(pow(err * 3.7037036418914794921875, 0.16666667163372039794921875))), 1u); @@ -449,40 +451,40 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M for (uint i = 0u; i < n_quads; i++) { float t = float(i + 1u) * _step; - float2 param_4 = cubic.p0; - float2 param_5 = cubic.p1; - float2 param_6 = cubic.p2; - float2 param_7 = cubic.p3; - float param_8 = t; - float2 qp2 = eval_cubic(param_4, param_5, param_6, param_7, param_8); - float2 param_9 = cubic.p0; - float2 param_10 = cubic.p1; - float2 param_11 = cubic.p2; - float2 param_12 = cubic.p3; - float param_13 = t - (0.5 * _step); - float2 qp1 = eval_cubic(param_9, param_10, param_11, param_12, param_13); + float2 param_5 = cubic.p0; + float2 param_6 = cubic.p1; + float2 param_7 = cubic.p2; + float2 param_8 = cubic.p3; + float param_9 = t; + float2 qp2 = eval_cubic(param_5, param_6, param_7, param_8, param_9); + float2 param_10 = cubic.p0; + float2 param_11 = cubic.p1; + float2 param_12 = cubic.p2; + float2 param_13 = cubic.p3; + float param_14 = t - (0.5 * _step); + float2 qp1 = eval_cubic(param_10, param_11, param_12, param_13, param_14); qp1 = (qp1 * 2.0) - ((qp0 + qp2) * 0.5); - float2 param_14 = qp0; - float2 param_15 = qp1; - float2 param_16 = qp2; - float param_17 = 0.4743416607379913330078125; - SubdivResult params = estimate_subdiv(param_14, param_15, param_16, param_17); + float2 param_15 = qp0; + float2 param_16 = qp1; + float2 param_17 = qp2; + float param_18 = 0.4743416607379913330078125; + SubdivResult params = estimate_subdiv(param_15, param_16, param_17, param_18); keep_params[i] = params; val += params.val; qp0 = qp2; } uint n = max(uint(ceil((val * 0.5) / 0.4743416607379913330078125)), 1u); - uint param_18 = tag.flags; - bool is_stroke = fill_mode_from_flags(param_18) == 1u; + uint param_19 = tag.flags; + bool is_stroke = fill_mode_from_flags(param_19) == 1u; uint path_ix = cubic.path_ix; - Alloc param_19; - param_19.offset = _710.conf.tile_alloc.offset; - PathRef param_20 = PathRef{ _710.conf.tile_alloc.offset + (path_ix * 12u) }; - Path path = Path_read(param_19, param_20, v_136, v_136BufferSize); - uint param_21 = path.tiles.offset; - uint param_22 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; - bool param_23 = mem_ok; - Alloc path_alloc = new_alloc(param_21, param_22, param_23); + Alloc param_20; + param_20.offset = _711.conf.tile_alloc.offset; + PathRef param_21 = PathRef{ _711.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_20, param_21, v_143); + uint param_22 = path.tiles.offset; + uint param_23 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_24 = true; + Alloc path_alloc = new_alloc(param_22, param_23, param_24); int4 bbox = int4(path.bbox); float2 p0 = cubic.p0; qp0 = cubic.p0; @@ -490,44 +492,44 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M int n_out = 1; float val_sum = 0.0; float2 p1; - float _1147; + float _1143; TileSeg tile_seg; for (uint i_1 = 0u; i_1 < n_quads; i_1++) { float t_1 = float(i_1 + 1u) * _step; - float2 param_24 = cubic.p0; - float2 param_25 = cubic.p1; - float2 param_26 = cubic.p2; - float2 param_27 = cubic.p3; - float param_28 = t_1; - float2 qp2_1 = eval_cubic(param_24, param_25, param_26, param_27, param_28); - float2 param_29 = cubic.p0; - float2 param_30 = cubic.p1; - float2 param_31 = cubic.p2; - float2 param_32 = cubic.p3; - float param_33 = t_1 - (0.5 * _step); - float2 qp1_1 = eval_cubic(param_29, param_30, param_31, param_32, param_33); + float2 param_25 = cubic.p0; + float2 param_26 = cubic.p1; + float2 param_27 = cubic.p2; + float2 param_28 = cubic.p3; + float param_29 = t_1; + float2 qp2_1 = eval_cubic(param_25, param_26, param_27, param_28, param_29); + float2 param_30 = cubic.p0; + float2 param_31 = cubic.p1; + float2 param_32 = cubic.p2; + float2 param_33 = cubic.p3; + float param_34 = t_1 - (0.5 * _step); + float2 qp1_1 = eval_cubic(param_30, param_31, param_32, param_33, param_34); qp1_1 = (qp1_1 * 2.0) - ((qp0 + qp2_1) * 0.5); SubdivResult params_1 = keep_params[i_1]; - float param_34 = params_1.a0; - float u0 = approx_parabola_inv_integral(param_34); - float param_35 = params_1.a2; - float u2 = approx_parabola_inv_integral(param_35); + float param_35 = params_1.a0; + float u0 = approx_parabola_inv_integral(param_35); + float param_36 = params_1.a2; + float u2 = approx_parabola_inv_integral(param_36); float uscale = 1.0 / (u2 - u0); float target = float(n_out) * v_step; for (;;) { - bool _1040 = uint(n_out) == n; - bool _1050; - if (!_1040) + bool _1036 = uint(n_out) == n; + bool _1046; + if (!_1036) { - _1050 = target < (val_sum + params_1.val); + _1046 = target < (val_sum + params_1.val); } else { - _1050 = _1040; + _1046 = _1036; } - if (_1050) + if (_1046) { if (uint(n_out) == n) { @@ -537,14 +539,14 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { float u = (target - val_sum) / params_1.val; float a = mix(params_1.a0, params_1.a2, u); - float param_36 = a; - float au = approx_parabola_inv_integral(param_36); + float param_37 = a; + float au = approx_parabola_inv_integral(param_37); float t_2 = (au - u0) * uscale; - float2 param_37 = qp0; - float2 param_38 = qp1_1; - float2 param_39 = qp2_1; - float param_40 = t_2; - p1 = eval_quad(param_37, param_38, param_39, param_40); + float2 param_38 = qp0; + float2 param_39 = qp1_1; + float2 param_40 = qp2_1; + float param_41 = t_2; + p1 = eval_quad(param_38, param_39, param_40, param_41); } float xmin = fast::min(p0.x, p1.x) - cubic.stroke.x; float xmax = fast::max(p0.x, p1.x) + cubic.stroke.x; @@ -554,13 +556,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M float dy = p1.y - p0.y; if (abs(dy) < 9.999999717180685365747194737196e-10) { - _1147 = 1000000000.0; + _1143 = 1000000000.0; } else { - _1147 = dx / dy; + _1143 = dx / dy; } - float invslope = _1147; + float invslope = _1143; float c = (cubic.stroke.x + (abs(invslope) * (8.0 + cubic.stroke.y))) * 0.0625; float b = invslope; float a_1 = (p0.x - ((p0.y - 8.0) * b)) * 0.0625; @@ -576,14 +578,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M int stride = bbox.z - bbox.x; int base = ((y0 - bbox.y) * stride) - bbox.x; uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); - uint param_41 = n_tile_alloc * 24u; - MallocResult _1263 = malloc(param_41, v_136, v_136BufferSize); - MallocResult tile_alloc = _1263; - if (tile_alloc.failed || (!mem_ok)) + uint malloc_size = n_tile_alloc * 24u; + uint param_42 = malloc_size; + uint param_43 = _711.conf.mem_size; + uint param_44 = 4u; + uint _1265 = malloc_stage(param_42, param_43, param_44, v_143); + uint tile_offset = _1265; + if (tile_offset == 0u) { - return; + mem_ok = false; } - uint tile_offset = tile_alloc.alloc.offset; + uint param_45 = tile_offset; + uint param_46 = malloc_size; + bool param_47 = true; + Alloc tile_alloc = new_alloc(param_45, param_46, param_47); int xray = int(floor(p0.x * 0.0625)); int last_xray = int(floor(p1.x * 0.0625)); if (p0.y > p1.y) @@ -596,38 +604,33 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { float tile_y0 = float(y * 16); int xbackdrop = max((xray + 1), bbox.x); - bool _1319 = !is_stroke; - bool _1329; - if (_1319) + bool _1322 = !is_stroke; + bool _1332; + if (_1322) { - _1329 = fast::min(p0.y, p1.y) < tile_y0; + _1332 = fast::min(p0.y, p1.y) < tile_y0; } else { - _1329 = _1319; + _1332 = _1322; } - bool _1336; - if (_1329) + bool _1339; + if (_1332) { - _1336 = xbackdrop < bbox.z; + _1339 = xbackdrop < bbox.z; } else { - _1336 = _1329; + _1339 = _1332; } - if (_1336) + if (_1339) { int backdrop = (p1.y < p0.y) ? 1 : (-1); - TileRef param_42 = path.tiles; - uint param_43 = uint(base + xbackdrop); - TileRef tile_ref = Tile_index(param_42, param_43); + TileRef param_48 = path.tiles; + uint param_49 = uint(base + xbackdrop); + TileRef tile_ref = Tile_index(param_48, param_49); uint tile_el = tile_ref.offset >> uint(2); - Alloc param_44 = path_alloc; - uint param_45 = tile_el + 1u; - if (touch_mem(param_44, param_45)) - { - uint _1374 = atomic_fetch_add_explicit((device atomic_uint*)&v_136.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed); - } + uint _1369 = atomic_fetch_add_explicit((device atomic_uint*)&v_143.memory[tile_el + 1u], uint(backdrop), memory_order_relaxed); } int next_xray = last_xray; if (y < (y1 - 1)) @@ -645,18 +648,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M for (int x = xx0; x < xx1; x++) { float tile_x0 = float(x * 16); - TileRef param_46 = TileRef{ path.tiles.offset }; - uint param_47 = uint(base + x); - TileRef tile_ref_1 = Tile_index(param_46, param_47); + TileRef param_50 = TileRef{ path.tiles.offset }; + uint param_51 = uint(base + x); + TileRef tile_ref_1 = Tile_index(param_50, param_51); uint tile_el_1 = tile_ref_1.offset >> uint(2); uint old = 0u; - Alloc param_48 = path_alloc; - uint param_49 = tile_el_1; - if (touch_mem(param_48, param_49)) - { - uint _1477 = atomic_exchange_explicit((device atomic_uint*)&v_136.memory[tile_el_1], tile_offset, memory_order_relaxed); - old = _1477; - } + uint _1465 = atomic_exchange_explicit((device atomic_uint*)&v_143.memory[tile_el_1], tile_offset, memory_order_relaxed); + old = _1465; tile_seg.origin = p0; tile_seg.vector = p1 - p0; float y_edge = 0.0; @@ -687,10 +685,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } tile_seg.y_edge = y_edge; tile_seg.next.offset = old; - Alloc param_50 = tile_alloc.alloc; - TileSegRef param_51 = TileSegRef{ tile_offset }; - TileSeg param_52 = tile_seg; - TileSeg_write(param_50, param_51, param_52, v_136, v_136BufferSize); + if (mem_ok) + { + Alloc param_52 = tile_alloc; + TileSegRef param_53 = TileSegRef{ tile_offset }; + TileSeg param_54 = tile_seg; + TileSeg_write(param_52, param_53, param_54, v_143); + } tile_offset += 24u; } xc += b; diff --git a/piet-gpu/shader/gen/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv index 5e6beda..bd32fc2 100644 Binary files a/piet-gpu/shader/gen/path_coarse.spv and b/piet-gpu/shader/gen/path_coarse.spv differ diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil index 6130712..c498755 100644 Binary files a/piet-gpu/shader/gen/pathseg.dxil and b/piet-gpu/shader/gen/pathseg.dxil differ diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl index 578417f..4e9a0ae 100644 --- a/piet-gpu/shader/gen/pathseg.hlsl +++ b/piet-gpu/shader/gen/pathseg.hlsl @@ -52,6 +52,7 @@ struct Monoid struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -164,7 +165,7 @@ uint read_mem(Alloc alloc, uint offset) { return 0u; } - uint v = _111.Load(offset * 4 + 8); + uint v = _111.Load(offset * 4 + 12); return v; } @@ -203,7 +204,7 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _111.Store(offset * 4 + 8, val); + _111.Store(offset * 4 + 12, val); } void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s) @@ -365,7 +366,7 @@ uint round_up(float x) void comp_main() { uint ix = gl_GlobalInvocationID.x * 4u; - uint tag_word = _574.Load(((_639.Load(92) >> uint(2)) + (ix >> uint(2))) * 4 + 0); + uint tag_word = _574.Load(((_639.Load(96) >> uint(2)) + (ix >> uint(2))) * 4 + 0); uint param = tag_word; TagMonoid local_tm = reduce_tag(param); sh_tag[gl_LocalInvocationID.x] = local_tm; @@ -404,14 +405,14 @@ void comp_main() TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; tm = combine_tag_monoid(param_3, param_4); } - uint ps_ix = (_639.Load(96) >> uint(2)) + tm.pathseg_offset; - uint lw_ix = (_639.Load(88) >> uint(2)) + tm.linewidth_ix; + uint ps_ix = (_639.Load(100) >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.Load(92) >> uint(2)) + tm.linewidth_ix; uint save_path_ix = tm.path_ix; uint trans_ix = tm.trans_ix; - TransformSegRef _771 = { _639.Load(36) + (trans_ix * 24u) }; + TransformSegRef _771 = { _639.Load(40) + (trans_ix * 24u) }; TransformSegRef trans_ref = _771; - PathSegRef _781 = { _639.Load(28) + (tm.pathseg_ix * 52u) }; - PathSegRef ps_ref = _781; + PathSegRef _780 = { _639.Load(32) + (tm.pathseg_ix * 52u) }; + PathSegRef ps_ref = _780; float linewidth[4]; uint save_trans_ix[4]; float2 p0; @@ -464,9 +465,9 @@ void comp_main() } } } - Alloc _877; - _877.offset = _639.Load(36); - param_13.offset = _877.offset; + Alloc _876; + _876.offset = _639.Load(40); + param_13.offset = _876.offset; TransformSegRef param_14 = trans_ref; TransformSeg transform = TransformSeg_read(param_13, param_14); p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; @@ -475,25 +476,25 @@ void comp_main() if (seg_type >= 2u) { p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; - float4 _947 = bbox; - float2 _950 = min(_947.xy, p2); - bbox.x = _950.x; - bbox.y = _950.y; - float4 _955 = bbox; - float2 _958 = max(_955.zw, p2); - bbox.z = _958.x; - bbox.w = _958.y; + float4 _946 = bbox; + float2 _949 = min(_946.xy, p2); + bbox.x = _949.x; + bbox.y = _949.y; + float4 _954 = bbox; + float2 _957 = max(_954.zw, p2); + bbox.z = _957.x; + bbox.w = _957.y; if (seg_type == 3u) { p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; - float4 _983 = bbox; - float2 _986 = min(_983.xy, p3); - bbox.x = _986.x; - bbox.y = _986.y; - float4 _991 = bbox; - float2 _994 = max(_991.zw, p3); - bbox.z = _994.x; - bbox.w = _994.y; + float4 _982 = bbox; + float2 _985 = min(_982.xy, p3); + bbox.x = _985.x; + bbox.y = _985.y; + float4 _990 = bbox; + float2 _993 = max(_990.zw, p3); + bbox.z = _993.x; + bbox.w = _993.y; } else { @@ -524,9 +525,9 @@ void comp_main() cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; cubic.stroke = stroke; uint fill_mode = uint(linewidth[i_1] >= 0.0f); - Alloc _1089; - _1089.offset = _639.Load(28); - param_15.offset = _1089.offset; + Alloc _1088; + _1088.offset = _639.Load(32); + param_15.offset = _1088.offset; PathSegRef param_16 = ps_ref; uint param_17 = fill_mode; PathCubic param_18 = cubic; @@ -571,7 +572,7 @@ void comp_main() } GroupMemoryBarrierWithGroupSync(); uint path_ix = save_path_ix; - uint bbox_out_ix = (_639.Load(40) >> uint(2)) + (path_ix * 6u); + uint bbox_out_ix = (_639.Load(44) >> uint(2)) + (path_ix * 6u); Monoid row = monoid_identity(); if (gl_LocalInvocationID.x > 0u) { @@ -583,24 +584,24 @@ void comp_main() Monoid param_24 = local[i_4]; Monoid m = combine_monoid(param_23, param_24); bool do_atomic = false; - bool _1264 = i_4 == 3u; - bool _1270; - if (_1264) + bool _1263 = i_4 == 3u; + bool _1269; + if (_1263) { - _1270 = gl_LocalInvocationID.x == 255u; + _1269 = gl_LocalInvocationID.x == 255u; } else { - _1270 = _1264; + _1269 = _1263; } - if (_1270) + if (_1269) { do_atomic = true; } if ((m.flags & 1u) != 0u) { - _111.Store((bbox_out_ix + 4u) * 4 + 8, asuint(linewidth[i_4])); - _111.Store((bbox_out_ix + 5u) * 4 + 8, save_trans_ix[i_4]); + _111.Store((bbox_out_ix + 4u) * 4 + 12, asuint(linewidth[i_4])); + _111.Store((bbox_out_ix + 5u) * 4 + 12, save_trans_ix[i_4]); if ((m.flags & 2u) == 0u) { do_atomic = true; @@ -608,43 +609,43 @@ void comp_main() else { float param_25 = m.bbox.x; - _111.Store(bbox_out_ix * 4 + 8, round_down(param_25)); + _111.Store(bbox_out_ix * 4 + 12, round_down(param_25)); float param_26 = m.bbox.y; - _111.Store((bbox_out_ix + 1u) * 4 + 8, round_down(param_26)); + _111.Store((bbox_out_ix + 1u) * 4 + 12, round_down(param_26)); float param_27 = m.bbox.z; - _111.Store((bbox_out_ix + 2u) * 4 + 8, round_up(param_27)); + _111.Store((bbox_out_ix + 2u) * 4 + 12, round_up(param_27)); float param_28 = m.bbox.w; - _111.Store((bbox_out_ix + 3u) * 4 + 8, round_up(param_28)); + _111.Store((bbox_out_ix + 3u) * 4 + 12, round_up(param_28)); bbox_out_ix += 6u; do_atomic = false; } } if (do_atomic) { - bool _1335 = m.bbox.z > m.bbox.x; - bool _1344; - if (!_1335) + bool _1334 = m.bbox.z > m.bbox.x; + bool _1343; + if (!_1334) { - _1344 = m.bbox.w > m.bbox.y; + _1343 = m.bbox.w > m.bbox.y; } else { - _1344 = _1335; + _1343 = _1334; } - if (_1344) + if (_1343) { float param_29 = m.bbox.x; - uint _1353; - _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1353); + uint _1352; + _111.InterlockedMin(bbox_out_ix * 4 + 12, round_down(param_29), _1352); float param_30 = m.bbox.y; - uint _1361; - _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1361); + uint _1360; + _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 12, round_down(param_30), _1360); float param_31 = m.bbox.z; - uint _1369; - _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1369); + uint _1368; + _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 12, round_up(param_31), _1368); float param_32 = m.bbox.w; - uint _1377; - _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1377); + uint _1376; + _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 12, round_up(param_32), _1376); } bbox_out_ix += 6u; } diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl index 9f6328e..5aea66d 100644 --- a/piet-gpu/shader/gen/pathseg.msl +++ b/piet-gpu/shader/gen/pathseg.msl @@ -102,6 +102,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -117,6 +118,7 @@ struct Alloc_1 struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -545,25 +547,25 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 if (seg_type >= 2u) { p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; - float4 _947 = bbox; - float2 _950 = fast::min(_947.xy, p2); - bbox.x = _950.x; - bbox.y = _950.y; - float4 _955 = bbox; - float2 _958 = fast::max(_955.zw, p2); - bbox.z = _958.x; - bbox.w = _958.y; + float4 _946 = bbox; + float2 _949 = fast::min(_946.xy, p2); + bbox.x = _949.x; + bbox.y = _949.y; + float4 _954 = bbox; + float2 _957 = fast::max(_954.zw, p2); + bbox.z = _957.x; + bbox.w = _957.y; if (seg_type == 3u) { p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; - float4 _983 = bbox; - float2 _986 = fast::min(_983.xy, p3); - bbox.x = _986.x; - bbox.y = _986.y; - float4 _991 = bbox; - float2 _994 = fast::max(_991.zw, p3); - bbox.z = _994.x; - bbox.w = _994.y; + float4 _982 = bbox; + float2 _985 = fast::min(_982.xy, p3); + bbox.x = _985.x; + bbox.y = _985.y; + float4 _990 = bbox; + float2 _993 = fast::max(_990.zw, p3); + bbox.z = _993.x; + bbox.w = _993.y; } else { @@ -651,17 +653,17 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 Monoid param_24 = local[i_4]; Monoid m = combine_monoid(param_23, param_24); bool do_atomic = false; - bool _1264 = i_4 == 3u; - bool _1270; - if (_1264) + bool _1263 = i_4 == 3u; + bool _1269; + if (_1263) { - _1270 = gl_LocalInvocationID.x == 255u; + _1269 = gl_LocalInvocationID.x == 255u; } else { - _1270 = _1264; + _1269 = _1263; } - if (_1270) + if (_1269) { do_atomic = true; } @@ -689,26 +691,26 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 } if (do_atomic) { - bool _1335 = m.bbox.z > m.bbox.x; - bool _1344; - if (!_1335) + bool _1334 = m.bbox.z > m.bbox.x; + bool _1343; + if (!_1334) { - _1344 = m.bbox.w > m.bbox.y; + _1343 = m.bbox.w > m.bbox.y; } else { - _1344 = _1335; + _1343 = _1334; } - if (_1344) + if (_1343) { float param_29 = m.bbox.x; - uint _1353 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); + uint _1352 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); float param_30 = m.bbox.y; - uint _1361 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); + uint _1360 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); float param_31 = m.bbox.z; - uint _1369 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); + uint _1368 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); float param_32 = m.bbox.w; - uint _1377 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); + uint _1376 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); } bbox_out_ix += 6u; } diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv index 4e2e9d5..2fb04e5 100644 Binary files a/piet-gpu/shader/gen/pathseg.spv and b/piet-gpu/shader/gen/pathseg.spv differ diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil index 4c2bd23..692ac5f 100644 Binary files a/piet-gpu/shader/gen/pathtag_reduce.dxil and b/piet-gpu/shader/gen/pathtag_reduce.dxil differ diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl index 5f7d125..6e9dee1 100644 --- a/piet-gpu/shader/gen/pathtag_reduce.hlsl +++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl @@ -14,6 +14,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -92,7 +93,7 @@ TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) void comp_main() { uint ix = gl_GlobalInvocationID.x * 2u; - uint scene_ix = (_139.Load(92) >> uint(2)) + ix; + uint scene_ix = (_139.Load(96) >> uint(2)) + ix; uint tag_word = _151.Load(scene_ix * 4 + 0); uint param = tag_word; TagMonoid agg = reduce_tag(param); diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl index 91e0cca..c6266ad 100644 --- a/piet-gpu/shader/gen/pathtag_reduce.msl +++ b/piet-gpu/shader/gen/pathtag_reduce.msl @@ -21,6 +21,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -78,6 +79,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv index f1d8679..829addc 100644 Binary files a/piet-gpu/shader/gen/pathtag_reduce.spv and b/piet-gpu/shader/gen/pathtag_reduce.spv differ diff --git a/piet-gpu/shader/gen/tile_alloc.dxil b/piet-gpu/shader/gen/tile_alloc.dxil index 7759910..35a1c2b 100644 Binary files a/piet-gpu/shader/gen/tile_alloc.dxil and b/piet-gpu/shader/gen/tile_alloc.dxil differ diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl index 73e0a8e..aed9001 100644 --- a/piet-gpu/shader/gen/tile_alloc.hlsl +++ b/piet-gpu/shader/gen/tile_alloc.hlsl @@ -3,12 +3,6 @@ struct Alloc uint offset; }; -struct MallocResult -{ - Alloc alloc; - bool failed; -}; - struct PathRef { uint offset; @@ -27,6 +21,7 @@ struct Path struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -58,9 +53,9 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -RWByteAddressBuffer _70 : register(u0, space0); -ByteAddressBuffer _181 : register(t1, space0); -ByteAddressBuffer _257 : register(t2, space0); +RWByteAddressBuffer _53 : register(u0, space0); +ByteAddressBuffer _148 : register(t1, space0); +ByteAddressBuffer _232 : register(t2, space0); static uint3 gl_LocalInvocationID; static uint3 gl_GlobalInvocationID; @@ -71,53 +66,38 @@ struct SPIRV_Cross_Input }; groupshared uint sh_tile_count[256]; -groupshared MallocResult sh_tile_alloc; +groupshared uint sh_tile_offset; + +bool check_deps(uint dep_stage) +{ + uint _60; + _53.InterlockedOr(4, 0u, _60); + return (_60 & dep_stage) == 0u; +} float4 load_draw_bbox(uint draw_ix) { - uint base = (_181.Load(64) >> uint(2)) + (4u * draw_ix); - float x0 = asfloat(_70.Load(base * 4 + 8)); - float y0 = asfloat(_70.Load((base + 1u) * 4 + 8)); - float x1 = asfloat(_70.Load((base + 2u) * 4 + 8)); - float y1 = asfloat(_70.Load((base + 3u) * 4 + 8)); + uint base = (_148.Load(68) >> uint(2)) + (4u * draw_ix); + float x0 = asfloat(_53.Load(base * 4 + 12)); + float y0 = asfloat(_53.Load((base + 1u) * 4 + 12)); + float x1 = asfloat(_53.Load((base + 2u) * 4 + 12)); + float y1 = asfloat(_53.Load((base + 3u) * 4 + 12)); float4 bbox = float4(x0, y0, x1, y1); return bbox; } -Alloc new_alloc(uint offset, uint size, bool mem_ok) +uint malloc_stage(uint size, uint mem_size, uint stage) { - Alloc a; - a.offset = offset; - return a; -} - -MallocResult malloc(uint size) -{ - uint _76; - _70.InterlockedAdd(0, size, _76); - uint offset = _76; - uint _83; - _70.GetDimensions(_83); - _83 = (_83 - 8) / 4; - MallocResult r; - r.failed = (offset + size) > uint(int(_83) * 4); - uint param = offset; - uint param_1 = size; - bool param_2 = !r.failed; - r.alloc = new_alloc(param, param_1, param_2); - if (r.failed) + uint _70; + _53.InterlockedAdd(0, size, _70); + uint offset = _70; + if ((offset + size) > mem_size) { - uint _105; - _70.InterlockedMax(4, 1u, _105); - return r; + uint _80; + _53.InterlockedOr(4, stage, _80); + offset = 0u; } - return r; -} - -Alloc slice_mem(Alloc a, uint offset, uint size) -{ - Alloc _131 = { a.offset + offset }; - return _131; + return offset; } bool touch_mem(Alloc alloc, uint offset) @@ -133,7 +113,7 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _70.Store(offset * 4 + 8, val); + _53.Store(offset * 4 + 12, val); } void Path_write(Alloc a, PathRef ref, Path s) @@ -155,15 +135,21 @@ void Path_write(Alloc a, PathRef ref, Path s) void comp_main() { + uint param = 1u; + bool _192 = check_deps(param); + if (!_192) + { + return; + } uint th_ix = gl_LocalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x; - PathRef _241 = { _181.Load(16) + (element_ix * 12u) }; - PathRef path_ref = _241; - uint drawtag_base = _181.Load(100) >> uint(2); + PathRef _216 = { _148.Load(20) + (element_ix * 12u) }; + PathRef path_ref = _216; + uint drawtag_base = _148.Load(104) >> uint(2); uint drawtag = 0u; - if (element_ix < _181.Load(0)) + if (element_ix < _148.Load(4)) { - drawtag = _257.Load((drawtag_base + element_ix) * 4 + 0); + drawtag = _232.Load((drawtag_base + element_ix) * 4 + 0); } int x0 = 0; int y0 = 0; @@ -171,17 +157,17 @@ void comp_main() int y1 = 0; if ((drawtag != 0u) && (drawtag != 37u)) { - uint param = element_ix; - float4 bbox = load_draw_bbox(param); + uint param_1 = element_ix; + float4 bbox = load_draw_bbox(param_1); x0 = int(floor(bbox.x * 0.0625f)); y0 = int(floor(bbox.y * 0.0625f)); x1 = int(ceil(bbox.z * 0.0625f)); y1 = int(ceil(bbox.w * 0.0625f)); } - x0 = clamp(x0, 0, int(_181.Load(8))); - y0 = clamp(y0, 0, int(_181.Load(12))); - x1 = clamp(x1, 0, int(_181.Load(8))); - y1 = clamp(y1, 0, int(_181.Load(12))); + x0 = clamp(x0, 0, int(_148.Load(12))); + y0 = clamp(y0, 0, int(_148.Load(16))); + x1 = clamp(x1, 0, int(_148.Load(12))); + y1 = clamp(y1, 0, int(_148.Load(16))); Path path; path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1)); uint tile_count = uint((x1 - x0) * (y1 - y0)); @@ -199,59 +185,45 @@ void comp_main() } if (th_ix == 255u) { - uint param_1 = total_tile_count * 8u; - MallocResult _392 = malloc(param_1); - sh_tile_alloc = _392; + uint param_2 = total_tile_count * 8u; + uint param_3 = _148.Load(0); + uint param_4 = 2u; + uint _370 = malloc_stage(param_2, param_3, param_4); + sh_tile_offset = _370; } GroupMemoryBarrierWithGroupSync(); - MallocResult alloc_start = sh_tile_alloc; - bool _403; - if (!alloc_start.failed) - { - _403 = _70.Load(4) != 0u; - } - else - { - _403 = alloc_start.failed; - } - if (_403) + uint offset_start = sh_tile_offset; + if (offset_start == 0u) { return; } - if (element_ix < _181.Load(0)) + if (element_ix < _148.Load(4)) { - uint _416; + uint _387; if (th_ix > 0u) { - _416 = sh_tile_count[th_ix - 1u]; + _387 = sh_tile_count[th_ix - 1u]; } else { - _416 = 0u; + _387 = 0u; } - uint tile_subix = _416; - Alloc param_2 = alloc_start.alloc; - uint param_3 = 8u * tile_subix; - uint param_4 = 8u * tile_count; - Alloc tiles_alloc = slice_mem(param_2, param_3, param_4); - TileRef _438 = { tiles_alloc.offset }; - path.tiles = _438; - Alloc _444; - _444.offset = _181.Load(16); + uint tile_subix = _387; + TileRef _400 = { offset_start + (8u * tile_subix) }; + path.tiles = _400; + Alloc _406; + _406.offset = _148.Load(20); Alloc param_5; - param_5.offset = _444.offset; + param_5.offset = _406.offset; PathRef param_6 = path_ref; Path param_7 = path; Path_write(param_5, param_6, param_7); } uint total_count = sh_tile_count[255] * 2u; - uint start_ix = alloc_start.alloc.offset >> uint(2); + uint start_ix = offset_start >> uint(2); for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) { - Alloc param_8 = alloc_start.alloc; - uint param_9 = start_ix + i_1; - uint param_10 = 0u; - write_mem(param_8, param_9, param_10); + _53.Store((start_ix + i_1) * 4 + 12, 0u); } } diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl index 961be50..e02138a 100644 --- a/piet-gpu/shader/gen/tile_alloc.msl +++ b/piet-gpu/shader/gen/tile_alloc.msl @@ -12,12 +12,6 @@ struct Alloc uint offset; }; -struct MallocResult -{ - Alloc alloc; - bool failed; -}; - struct PathRef { uint offset; @@ -38,6 +32,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -48,6 +43,7 @@ struct Alloc_1 struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -90,48 +86,35 @@ struct SceneBuf constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) -float4 load_draw_bbox(thread const uint& draw_ix, device Memory& v_70, constant uint& v_70BufferSize, const device ConfigBuf& v_181) +bool check_deps(thread const uint& dep_stage, device Memory& v_53) { - uint base = (v_181.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix); - float x0 = as_type(v_70.memory[base]); - float y0 = as_type(v_70.memory[base + 1u]); - float x1 = as_type(v_70.memory[base + 2u]); - float y1 = as_type(v_70.memory[base + 3u]); + uint _60 = atomic_fetch_or_explicit((device atomic_uint*)&v_53.mem_error, 0u, memory_order_relaxed); + return (_60 & dep_stage) == 0u; +} + +static inline __attribute__((always_inline)) +float4 load_draw_bbox(thread const uint& draw_ix, device Memory& v_53, const device ConfigBuf& v_148) +{ + uint base = (v_148.conf.draw_bbox_alloc.offset >> uint(2)) + (4u * draw_ix); + float x0 = as_type(v_53.memory[base]); + float y0 = as_type(v_53.memory[base + 1u]); + float x1 = as_type(v_53.memory[base + 2u]); + float y1 = as_type(v_53.memory[base + 3u]); float4 bbox = float4(x0, y0, x1, y1); return bbox; } static inline __attribute__((always_inline)) -Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const bool& mem_ok) +uint malloc_stage(thread const uint& size, thread const uint& mem_size, thread const uint& stage, device Memory& v_53) { - Alloc a; - a.offset = offset; - return a; -} - -static inline __attribute__((always_inline)) -MallocResult malloc(thread const uint& size, device Memory& v_70, constant uint& v_70BufferSize) -{ - uint _76 = atomic_fetch_add_explicit((device atomic_uint*)&v_70.mem_offset, size, memory_order_relaxed); - uint offset = _76; - MallocResult r; - r.failed = (offset + size) > uint(int((v_70BufferSize - 8) / 4) * 4); - uint param = offset; - uint param_1 = size; - bool param_2 = !r.failed; - r.alloc = new_alloc(param, param_1, param_2); - if (r.failed) + uint _70 = atomic_fetch_add_explicit((device atomic_uint*)&v_53.mem_offset, size, memory_order_relaxed); + uint offset = _70; + if ((offset + size) > mem_size) { - uint _105 = atomic_fetch_max_explicit((device atomic_uint*)&v_70.mem_error, 1u, memory_order_relaxed); - return r; + uint _80 = atomic_fetch_or_explicit((device atomic_uint*)&v_53.mem_error, stage, memory_order_relaxed); + offset = 0u; } - return r; -} - -static inline __attribute__((always_inline)) -Alloc slice_mem(thread const Alloc& a, thread const uint& offset, thread const uint& size) -{ - return Alloc{ a.offset + offset }; + return offset; } static inline __attribute__((always_inline)) @@ -141,7 +124,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset) } static inline __attribute__((always_inline)) -void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_70, constant uint& v_70BufferSize) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_53) { Alloc param = alloc; uint param_1 = offset; @@ -149,40 +132,45 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons { return; } - v_70.memory[offset] = val; + v_53.memory[offset] = val; } static inline __attribute__((always_inline)) -void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_70, constant uint& v_70BufferSize) +void Path_write(thread const Alloc& a, thread const PathRef& ref, thread const Path& s, device Memory& v_53) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.bbox.x | (s.bbox.y << uint(16)); - write_mem(param, param_1, param_2, v_70, v_70BufferSize); + write_mem(param, param_1, param_2, v_53); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = s.bbox.z | (s.bbox.w << uint(16)); - write_mem(param_3, param_4, param_5, v_70, v_70BufferSize); + write_mem(param_3, param_4, param_5, v_53); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = s.tiles.offset; - write_mem(param_6, param_7, param_8, v_70, v_70BufferSize); + write_mem(param_6, param_7, param_8, v_53); } -kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_70 [[buffer(0)]], const device ConfigBuf& v_181 [[buffer(1)]], const device SceneBuf& _257 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +kernel void main0(device Memory& v_53 [[buffer(0)]], const device ConfigBuf& v_148 [[buffer(1)]], const device SceneBuf& _232 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) { threadgroup uint sh_tile_count[256]; - threadgroup MallocResult sh_tile_alloc; - constant uint& v_70BufferSize = spvBufferSizeConstants[0]; + threadgroup uint sh_tile_offset; + uint param = 1u; + bool _192 = check_deps(param, v_53); + if (!_192) + { + return; + } uint th_ix = gl_LocalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x; - PathRef path_ref = PathRef{ v_181.conf.tile_alloc.offset + (element_ix * 12u) }; - uint drawtag_base = v_181.conf.drawtag_offset >> uint(2); + PathRef path_ref = PathRef{ v_148.conf.tile_alloc.offset + (element_ix * 12u) }; + uint drawtag_base = v_148.conf.drawtag_offset >> uint(2); uint drawtag = 0u; - if (element_ix < v_181.conf.n_elements) + if (element_ix < v_148.conf.n_elements) { - drawtag = _257.scene[drawtag_base + element_ix]; + drawtag = _232.scene[drawtag_base + element_ix]; } int x0 = 0; int y0 = 0; @@ -190,17 +178,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M int y1 = 0; if ((drawtag != 0u) && (drawtag != 37u)) { - uint param = element_ix; - float4 bbox = load_draw_bbox(param, v_70, v_70BufferSize, v_181); + uint param_1 = element_ix; + float4 bbox = load_draw_bbox(param_1, v_53, v_148); x0 = int(floor(bbox.x * 0.0625)); y0 = int(floor(bbox.y * 0.0625)); x1 = int(ceil(bbox.z * 0.0625)); y1 = int(ceil(bbox.w * 0.0625)); } - x0 = clamp(x0, 0, int(v_181.conf.width_in_tiles)); - y0 = clamp(y0, 0, int(v_181.conf.height_in_tiles)); - x1 = clamp(x1, 0, int(v_181.conf.width_in_tiles)); - y1 = clamp(y1, 0, int(v_181.conf.height_in_tiles)); + x0 = clamp(x0, 0, int(v_148.conf.width_in_tiles)); + y0 = clamp(y0, 0, int(v_148.conf.height_in_tiles)); + x1 = clamp(x1, 0, int(v_148.conf.width_in_tiles)); + y1 = clamp(y1, 0, int(v_148.conf.height_in_tiles)); Path path; path.bbox = uint4(uint(x0), uint(y0), uint(x1), uint(y1)); uint tile_count = uint((x1 - x0) * (y1 - y0)); @@ -218,56 +206,42 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } if (th_ix == 255u) { - uint param_1 = total_tile_count * 8u; - MallocResult _392 = malloc(param_1, v_70, v_70BufferSize); - sh_tile_alloc = _392; + uint param_2 = total_tile_count * 8u; + uint param_3 = v_148.conf.mem_size; + uint param_4 = 2u; + uint _370 = malloc_stage(param_2, param_3, param_4, v_53); + sh_tile_offset = _370; } threadgroup_barrier(mem_flags::mem_threadgroup); - MallocResult alloc_start = sh_tile_alloc; - bool _403; - if (!alloc_start.failed) - { - _403 = v_70.mem_error != 0u; - } - else - { - _403 = alloc_start.failed; - } - if (_403) + uint offset_start = sh_tile_offset; + if (offset_start == 0u) { return; } - if (element_ix < v_181.conf.n_elements) + if (element_ix < v_148.conf.n_elements) { - uint _416; + uint _387; if (th_ix > 0u) { - _416 = sh_tile_count[th_ix - 1u]; + _387 = sh_tile_count[th_ix - 1u]; } else { - _416 = 0u; + _387 = 0u; } - uint tile_subix = _416; - Alloc param_2 = alloc_start.alloc; - uint param_3 = 8u * tile_subix; - uint param_4 = 8u * tile_count; - Alloc tiles_alloc = slice_mem(param_2, param_3, param_4); - path.tiles = TileRef{ tiles_alloc.offset }; + uint tile_subix = _387; + path.tiles = TileRef{ offset_start + (8u * tile_subix) }; Alloc param_5; - param_5.offset = v_181.conf.tile_alloc.offset; + param_5.offset = v_148.conf.tile_alloc.offset; PathRef param_6 = path_ref; Path param_7 = path; - Path_write(param_5, param_6, param_7, v_70, v_70BufferSize); + Path_write(param_5, param_6, param_7, v_53); } uint total_count = sh_tile_count[255] * 2u; - uint start_ix = alloc_start.alloc.offset >> uint(2); + uint start_ix = offset_start >> uint(2); for (uint i_1 = th_ix; i_1 < total_count; i_1 += 256u) { - Alloc param_8 = alloc_start.alloc; - uint param_9 = start_ix + i_1; - uint param_10 = 0u; - write_mem(param_8, param_9, param_10, v_70, v_70BufferSize); + v_53.memory[start_ix + i_1] = 0u; } } diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv index dbc02a8..25a362c 100644 Binary files a/piet-gpu/shader/gen/tile_alloc.spv and b/piet-gpu/shader/gen/tile_alloc.spv differ diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil index f9f31e6..9427186 100644 Binary files a/piet-gpu/shader/gen/transform_leaf.dxil and b/piet-gpu/shader/gen/transform_leaf.dxil differ diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl index 8a3b3d5..d3347a6 100644 --- a/piet-gpu/shader/gen/transform_leaf.hlsl +++ b/piet-gpu/shader/gen/transform_leaf.hlsl @@ -27,6 +27,7 @@ struct TransformSeg struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -58,12 +59,12 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; +static const Transform _225 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; RWByteAddressBuffer _71 : register(u0, space0); -ByteAddressBuffer _96 : register(t2, space0); -ByteAddressBuffer _278 : register(t1, space0); -ByteAddressBuffer _376 : register(t3, space0); +ByteAddressBuffer _97 : register(t2, space0); +ByteAddressBuffer _279 : register(t1, space0); +ByteAddressBuffer _377 : register(t3, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -80,12 +81,12 @@ groupshared Transform sh_scratch[256]; Transform Transform_read(TransformRef ref) { uint ix = ref.offset >> uint(2); - uint raw0 = _96.Load((ix + 0u) * 4 + 0); - uint raw1 = _96.Load((ix + 1u) * 4 + 0); - uint raw2 = _96.Load((ix + 2u) * 4 + 0); - uint raw3 = _96.Load((ix + 3u) * 4 + 0); - uint raw4 = _96.Load((ix + 4u) * 4 + 0); - uint raw5 = _96.Load((ix + 5u) * 4 + 0); + uint raw0 = _97.Load((ix + 0u) * 4 + 0); + uint raw1 = _97.Load((ix + 1u) * 4 + 0); + uint raw2 = _97.Load((ix + 2u) * 4 + 0); + uint raw3 = _97.Load((ix + 3u) * 4 + 0); + uint raw4 = _97.Load((ix + 4u) * 4 + 0); + uint raw5 = _97.Load((ix + 5u) * 4 + 0); Transform s; s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); s.translate = float2(asfloat(raw4), asfloat(raw5)); @@ -108,7 +109,7 @@ Transform combine_monoid(Transform a, Transform b) Transform monoid_identity() { - return _224; + return _225; } bool touch_mem(Alloc alloc, uint offset) @@ -124,7 +125,7 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _71.Store(offset * 4 + 8, val); + _71.Store(offset * 4 + 12, val); } void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) @@ -159,8 +160,8 @@ void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) void comp_main() { uint ix = gl_GlobalInvocationID.x * 8u; - TransformRef _285 = { _278.Load(84) + (ix * 24u) }; - TransformRef ref = _285; + TransformRef _286 = { _279.Load(88) + (ix * 24u) }; + TransformRef ref = _286; TransformRef param = ref; Transform agg = Transform_read(param); Transform local[8]; @@ -193,11 +194,11 @@ void comp_main() Transform row = monoid_identity(); if (gl_WorkGroupID.x > 0u) { - Transform _382; - _382.mat = asfloat(_376.Load4((gl_WorkGroupID.x - 1u) * 32 + 0)); - _382.translate = asfloat(_376.Load2((gl_WorkGroupID.x - 1u) * 32 + 16)); - row.mat = _382.mat; - row.translate = _382.translate; + Transform _383; + _383.mat = asfloat(_377.Load4((gl_WorkGroupID.x - 1u) * 32 + 0)); + _383.translate = asfloat(_377.Load2((gl_WorkGroupID.x - 1u) * 32 + 16)); + row.mat = _383.mat; + row.translate = _383.translate; } if (gl_LocalInvocationID.x > 0u) { @@ -211,13 +212,13 @@ void comp_main() Transform param_10 = row; Transform param_11 = local[i_2]; Transform m = combine_monoid(param_10, param_11); - TransformSeg _422 = { m.mat, m.translate }; - TransformSeg transform = _422; - TransformSegRef _432 = { _278.Load(36) + ((ix + i_2) * 24u) }; - TransformSegRef trans_ref = _432; - Alloc _436; - _436.offset = _278.Load(36); - param_12.offset = _436.offset; + TransformSeg _423 = { m.mat, m.translate }; + TransformSeg transform = _423; + TransformSegRef _433 = { _279.Load(40) + ((ix + i_2) * 24u) }; + TransformSegRef trans_ref = _433; + Alloc _437; + _437.offset = _279.Load(40); + param_12.offset = _437.offset; TransformSegRef param_13 = trans_ref; TransformSeg param_14 = transform; TransformSeg_write(param_12, param_13, param_14); diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl index fe45438..01fefd1 100644 --- a/piet-gpu/shader/gen/transform_leaf.msl +++ b/piet-gpu/shader/gen/transform_leaf.msl @@ -75,6 +75,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; @@ -90,6 +91,7 @@ struct Alloc_1 struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -139,15 +141,15 @@ struct ParentBuf constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) -Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96) +Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_97) { uint ix = ref.offset >> uint(2); - uint raw0 = v_96.scene[ix + 0u]; - uint raw1 = v_96.scene[ix + 1u]; - uint raw2 = v_96.scene[ix + 2u]; - uint raw3 = v_96.scene[ix + 3u]; - uint raw4 = v_96.scene[ix + 4u]; - uint raw5 = v_96.scene[ix + 5u]; + uint raw0 = v_97.scene[ix + 0u]; + uint raw1 = v_97.scene[ix + 1u]; + uint raw2 = v_97.scene[ix + 2u]; + uint raw3 = v_97.scene[ix + 3u]; + uint raw4 = v_97.scene[ix + 4u]; + uint raw5 = v_97.scene[ix + 5u]; Transform s; s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); s.translate = float2(as_type(raw4), as_type(raw5)); @@ -223,13 +225,13 @@ void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref write_mem(param_15, param_16, param_17, v_71); } -kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _376 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _279 [[buffer(1)]], const device SceneBuf& v_97 [[buffer(2)]], const device ParentBuf& _377 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { threadgroup Transform sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; - TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) }; + TransformRef ref = TransformRef{ _279.conf.trans_offset + (ix * 24u) }; TransformRef param = ref; - Transform agg = Transform_read(param, v_96); + Transform agg = Transform_read(param, v_97); spvUnsafeArray local; local[0] = agg; for (uint i = 1u; i < 8u; i++) @@ -238,7 +240,7 @@ kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _27 uint param_2 = i; TransformRef param_3 = Transform_index(param_1, param_2); Transform param_4 = agg; - Transform param_5 = Transform_read(param_3, v_96); + Transform param_5 = Transform_read(param_3, v_97); agg = combine_monoid(param_4, param_5); local[i] = agg; } @@ -260,9 +262,9 @@ kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _27 Transform row = monoid_identity(); if (gl_WorkGroupID.x > 0u) { - uint _379 = gl_WorkGroupID.x - 1u; - row.mat = _376.parent[_379].mat; - row.translate = _376.parent[_379].translate; + uint _380 = gl_WorkGroupID.x - 1u; + row.mat = _377.parent[_380].mat; + row.translate = _377.parent[_380].translate; } if (gl_LocalInvocationID.x > 0u) { @@ -277,8 +279,8 @@ kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _27 Transform param_11 = local[i_2]; Transform m = combine_monoid(param_10, param_11); TransformSeg transform = TransformSeg{ m.mat, m.translate }; - TransformSegRef trans_ref = TransformSegRef{ _278.conf.trans_alloc.offset + ((ix + i_2) * 24u) }; - param_12.offset = _278.conf.trans_alloc.offset; + TransformSegRef trans_ref = TransformSegRef{ _279.conf.trans_alloc.offset + ((ix + i_2) * 24u) }; + param_12.offset = _279.conf.trans_alloc.offset; TransformSegRef param_13 = trans_ref; TransformSeg param_14 = transform; TransformSeg_write(param_12, param_13, param_14, v_71); diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv index b739099..a0081bf 100644 Binary files a/piet-gpu/shader/gen/transform_leaf.spv and b/piet-gpu/shader/gen/transform_leaf.spv differ diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil index 978dd98..6986f8f 100644 Binary files a/piet-gpu/shader/gen/transform_reduce.dxil and b/piet-gpu/shader/gen/transform_reduce.dxil differ diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl index bd14f79..90ea55f 100644 --- a/piet-gpu/shader/gen/transform_reduce.hlsl +++ b/piet-gpu/shader/gen/transform_reduce.hlsl @@ -16,6 +16,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -96,7 +97,7 @@ Transform combine_monoid(Transform a, Transform b) void comp_main() { uint ix = gl_GlobalInvocationID.x * 8u; - TransformRef _168 = { _161.Load(84) + (ix * 24u) }; + TransformRef _168 = { _161.Load(88) + (ix * 24u) }; TransformRef ref = _168; TransformRef param = ref; Transform agg = Transform_read(param); diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl index 62da531..6ae57e7 100644 --- a/piet-gpu/shader/gen/transform_reduce.msl +++ b/piet-gpu/shader/gen/transform_reduce.msl @@ -28,6 +28,7 @@ struct Alloc struct Config { + uint mem_size; uint n_elements; uint n_pathseg; uint width_in_tiles; @@ -78,6 +79,7 @@ struct Memory { uint mem_offset; uint mem_error; + uint blend_offset; uint memory[1]; }; diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv index 6aa6b94..fc8e58a 100644 Binary files a/piet-gpu/shader/gen/transform_reduce.spv and b/piet-gpu/shader/gen/transform_reduce.spv differ