#pragma clang diagnostic ignored "-Wmissing-prototypes" #pragma clang diagnostic ignored "-Wunused-variable" #include #include #include using namespace metal; struct Element { uint data; uint flag; }; struct DataBuf { Element data[1]; }; struct ControlBuf { uint failures; }; constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) uint permute_flag_ix(thread const uint& data_ix) { return (data_ix * 419u) & 65535u; } kernel void main0(device DataBuf& data_buf [[buffer(0)]], device ControlBuf& control_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) { atomic_store_explicit((device atomic_uint*)&data_buf.data[gl_GlobalInvocationID.x].data, 1u, memory_order_relaxed); threadgroup_barrier(mem_flags::mem_device); uint param = gl_GlobalInvocationID.x; uint write_flag_ix = permute_flag_ix(param); atomic_store_explicit((device atomic_uint*)&data_buf.data[write_flag_ix].flag, 1u, memory_order_relaxed); uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u; uint param_1 = read_ix; uint read_flag_ix = permute_flag_ix(param_1); uint _58 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_flag_ix].flag, memory_order_relaxed); uint flag = _58; threadgroup_barrier(mem_flags::mem_device); uint _62 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_ix].data, memory_order_relaxed); uint data = _62; if (flag > data) { uint _73 = atomic_fetch_add_explicit((device atomic_uint*)&control_buf.failures, 1u, memory_order_relaxed); } }