diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp index e4bbfec..873fc41 100644 --- a/piet-gpu/shader/elements.comp +++ b/piet-gpu/shader/elements.comp @@ -248,9 +248,10 @@ void main() { // The flag load is done only in the last thread. However, because the // translation of memoryBarrierBuffer to Metal requires uniform control // flow, we broadcast it to all threads. - barrier(); memoryBarrierBuffer(); + barrier(); uint flag = sh_flag; + barrier(); if (flag == FLAG_PREFIX_READY) { if (gl_LocalInvocationID.x == WG_SIZE - 1) { @@ -293,6 +294,7 @@ void main() { } barrier(); flag = sh_flag; + barrier(); if (flag == FLAG_PREFIX_READY) { break; } diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv index 60517b0..37cc051 100644 Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index 2e8f752..322a047 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/tests/shader/gen/prefix.dxil b/tests/shader/gen/prefix.dxil index 34f3d6a..73f1ba1 100644 Binary files a/tests/shader/gen/prefix.dxil and b/tests/shader/gen/prefix.dxil differ diff --git a/tests/shader/gen/prefix.hlsl b/tests/shader/gen/prefix.hlsl index 3af5a96..72cfa90 100644 --- a/tests/shader/gen/prefix.hlsl +++ b/tests/shader/gen/prefix.hlsl @@ -109,6 +109,7 @@ void comp_main() GroupMemoryBarrierWithGroupSync(); DeviceMemoryBarrier(); uint flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); if (flag_1 == 2u) { if (gl_LocalInvocationID.x == 511u) @@ -174,6 +175,7 @@ void comp_main() } GroupMemoryBarrierWithGroupSync(); flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); if (flag_1 == 2u) { break; diff --git a/tests/shader/gen/prefix.msl b/tests/shader/gen/prefix.msl index 8e402a9..24bee60 100644 --- a/tests/shader/gen/prefix.msl +++ b/tests/shader/gen/prefix.msl @@ -160,6 +160,7 @@ kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[b threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_device); uint flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); if (flag_1 == 2u) { if (gl_LocalInvocationID.x == 511u) @@ -219,6 +220,7 @@ kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[b } threadgroup_barrier(mem_flags::mem_threadgroup); flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); if (flag_1 == 2u) { break; diff --git a/tests/shader/gen/prefix.spv b/tests/shader/gen/prefix.spv index d2c1aad..8e7db4a 100644 Binary files a/tests/shader/gen/prefix.spv and b/tests/shader/gen/prefix.spv differ diff --git a/tests/shader/gen/prefix_atomic.dxil b/tests/shader/gen/prefix_atomic.dxil index 68f47e5..45a7dd8 100644 Binary files a/tests/shader/gen/prefix_atomic.dxil and b/tests/shader/gen/prefix_atomic.dxil differ diff --git a/tests/shader/gen/prefix_atomic.hlsl b/tests/shader/gen/prefix_atomic.hlsl index 10f7081..a75448f 100644 --- a/tests/shader/gen/prefix_atomic.hlsl +++ b/tests/shader/gen/prefix_atomic.hlsl @@ -112,6 +112,7 @@ void comp_main() GroupMemoryBarrierWithGroupSync(); DeviceMemoryBarrier(); uint flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); if (flag_1 == 2u) { if (gl_LocalInvocationID.x == 511u) @@ -177,6 +178,7 @@ void comp_main() } GroupMemoryBarrierWithGroupSync(); flag_1 = sh_flag; + GroupMemoryBarrierWithGroupSync(); if (flag_1 == 2u) { break; diff --git a/tests/shader/gen/prefix_atomic.msl b/tests/shader/gen/prefix_atomic.msl index 6d7d155..910e842 100644 --- a/tests/shader/gen/prefix_atomic.msl +++ b/tests/shader/gen/prefix_atomic.msl @@ -161,6 +161,7 @@ kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[b threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_device); uint flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); if (flag_1 == 2u) { if (gl_LocalInvocationID.x == 511u) @@ -220,6 +221,7 @@ kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[b } threadgroup_barrier(mem_flags::mem_threadgroup); flag_1 = sh_flag; + threadgroup_barrier(mem_flags::mem_threadgroup); if (flag_1 == 2u) { break; diff --git a/tests/shader/gen/prefix_atomic.spv b/tests/shader/gen/prefix_atomic.spv index acca545..d7dac5b 100644 Binary files a/tests/shader/gen/prefix_atomic.spv and b/tests/shader/gen/prefix_atomic.spv differ diff --git a/tests/shader/gen/prefix_vkmm.spv b/tests/shader/gen/prefix_vkmm.spv index 0b8d475..cef3965 100644 Binary files a/tests/shader/gen/prefix_vkmm.spv and b/tests/shader/gen/prefix_vkmm.spv differ diff --git a/tests/shader/prefix.comp b/tests/shader/prefix.comp index a6a0d57..b41d1b1 100644 --- a/tests/shader/prefix.comp +++ b/tests/shader/prefix.comp @@ -144,6 +144,7 @@ void main() { memoryBarrierBuffer(); #endif uint flag = sh_flag; + barrier(); if (flag == FLAG_PREFIX_READY) { if (gl_LocalInvocationID.x == WG_SIZE - 1) { @@ -185,6 +186,7 @@ void main() { } barrier(); flag = sh_flag; + barrier(); if (flag == FLAG_PREFIX_READY) { break; }