Merge pull request #127 from linebender/atomic

Increase testing of atomics
Raph Levien 2021-11-14 07:10:39 -08:00 committed by GitHub
commit 76a6f1fec8
26 changed files with 1299 additions and 15 deletions

tests/README.md Normal file
@@ -0,0 +1,47 @@
# piet-gpu-tests
This subdirectory contains a curated set of tests for GPU issues likely to affect piet-gpu compatibility or performance. To run, cd to the tests directory and do `cargo run --release`. There are a number of additional options, including:
* `--dx12` Prefer DX12 backend on Windows.
* `--size {s,m,l}` Size of test to run.
* `--n_iter n` Number of iterations.
* `--verbose` Verbose output.
As usual, run `cargo run -- -h` for the current list.
Below is a description of individual tests.
## clear buffers
This is as simple as it says: it uses a compute shader to clear buffers. It's run first as a warmup, and is a simple test of raw memory bandwidth (reported as 4-byte elements/s).
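As an illustration, such a clear shader can be very small. The following is a hypothetical sketch, not the repository's actual clear.comp (which is not part of this diff); the two-binding interface is assumed from the `BindType::BufReadOnly`/`BindType::Buffer` pipeline in clear.rs below.

```glsl
#version 450
layout(local_size_x = 256) in;
// Assumed interface: binding 0 holds the element count, binding 1 is the
// buffer to clear; one invocation zeroes one 4-byte element.
layout(set = 0, binding = 0) readonly buffer ConfigBuf {
    uint n_elements;
};
layout(set = 0, binding = 1) buffer TargetBuf {
    uint[] data;
};
void main() {
    uint ix = gl_GlobalInvocationID.x;
    if (ix < n_elements) {
        data[ix] = 0;
    }
}
```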
## Prefix sum tests
There are several variations of the prefix sum test: first the [decoupled look-back] variant, then a more conservative tree reduction version. The decoupled look-back implementation exercises advanced atomic features and depends on their correctness, including atomic coherence and correct scope of memory barriers.
None of the decoupled look-back tests are expected to pass on Metal, as that backend lacks the appropriate barrier; the spirv-cross translation silently turns the GLSL barrier into a weaker one. All tests are expected to pass on both Vulkan and DX12.
The compatibility variant does all manipulation of the state buffer using non-atomic operations, with the buffer marked "volatile" and barriers to ensure acquire/release ordering.
The atomic variant is similar, but uses atomicLoad and atomicStore (from the [memory scope semantics] extension to GLSL).
Finally, the vkmm (Vulkan memory model) variant uses explicit acquire and release semantics on the atomics instead of barriers, and only runs when the device reports that the memory model extension is available.
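The three variants differ only in how the flag is published (reading mirrors this, with acquire in place of release). The following fragment is distilled from the prefix.comp changes later in this diff:

```glsl
// Publishing the flag in each configuration (fragment, not a full shader).
#ifdef VKMM
    // Vulkan memory model: release semantics directly on the atomic store.
    atomicStore(state[part_ix].flag, flag, gl_ScopeDevice,
                gl_StorageSemanticsBuffer, gl_SemanticsRelease);
#elif defined(ATOMIC)
    // Relaxed atomic store; a preceding barrier supplies release ordering.
    memoryBarrierBuffer();
    atomicStore(state[part_ix].flag, flag, gl_ScopeDevice, 0, 0);
#else
    // Compatibility: plain store to the volatile buffer, ordered by the barrier.
    memoryBarrierBuffer();
    state[part_ix].flag = flag;
#endif
```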
The tree reduction version of this test does not rely on advanced atomics and can be considered a baseline for both correctness and performance. The current implementation lacks configuration settings to handle odd-size buffers. On well-tuned hardware, the decoupled look-back implementation is expected to be 1.5x faster.
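Both versions are built around a workgroup-level scan. Below is a self-contained sketch of that building block, a Hillis-Steele scan over shared memory; it is illustrative only, as the actual prefix_reduce.comp and prefix_scan.comp are not part of this diff:

```glsl
#version 450
#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
layout(local_size_x = WG_SIZE) in;
layout(set = 0, binding = 0) buffer DataBuf {
    uint[] data;
};
shared uint sh_scratch[WG_SIZE];
void main() {
    uint agg = data[gl_GlobalInvocationID.x];
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i)) {
            // Combine with the partial sum 2^i slots to the left.
            agg += sh_scratch[gl_LocalInvocationID.x - (1u << i)];
        }
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }
    // agg now holds the inclusive prefix of this workgroup's elements.
    data[gl_GlobalInvocationID.x] = agg;
}
```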
Note that the workgroup sizes and sequential iteration count parameters are hard-coded (and tuned for a desktop card I had handy). A useful future extension of this test suite would be iteration over several combinations of those parameters. (The main reason this is not done yet is that it would put a lot of strain on the shader build pipeline, and at the moment hand-editing the ninja file is adequate).
## Atomic tests
Decoupled look-back relies on the atomic message passing idiom; these tests exercise that in isolation.
The message passing tests basically do a bunch of the basic message passing operations in parallel, and the "special sauce" is that the memory locations for both flags and data are permuted. That seems to do a much better job of finding violations than existing versions of the test.
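The core of an invocation, abridged from message_passing.comp later in this diff (under VKMM the ACQUIRE/RELEASE macros expand to real acquire/release semantics; otherwise they are relaxed and explicit memoryBarrierBuffer() calls, elided here, supply the ordering):

```glsl
// Each invocation is both a producer and a consumer, at permuted locations.
atomicStore(data_buf.data[gl_GlobalInvocationID.x].data, 1u, gl_ScopeDevice, 0, 0);
uint write_flag_ix = permute_flag_ix(gl_GlobalInvocationID.x);
atomicStore(data_buf.data[write_flag_ix].flag, 1u, gl_ScopeDevice, RELEASE);
uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u;
uint flag = atomicLoad(data_buf.data[permute_flag_ix(read_ix)].flag, gl_ScopeDevice, ACQUIRE);
uint data = atomicLoad(data_buf.data[read_ix].data, gl_ScopeDevice, 0, 0);
if (flag > data) {
    // The flag was observed but the data was not: a message passing violation.
    atomicAdd(control_buf.failures, 1u);
}
```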
The linked list test is mostly a bandwidth test of atomicExchange, and is a simplified version of what the coarse path rasterizer does in piet-gpu to build per-tile lists of path segments. Verification of the resulting lists is also a pretty good test of device-scoped modification order (not that this is likely to fail).
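The list push itself, from linkedlist.comp later in this diff:

```glsl
// mem[0] is a bump allocator; mem[1..N_BUCKETS) hold the list heads.
uint alloc = atomicAdd(mem[0], 2) + N_BUCKETS;
// Atomically splice the new node in as the new head of its bucket's list.
uint old = atomicExchange(mem[bucket], alloc);
mem[alloc] = old;                          // next pointer
mem[alloc + 1] = gl_GlobalInvocationID.x;  // payload: the writing invocation
```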
## More tests
I'll be adding more tests specific to piet-gpu. I'm also open to tests being added here; feel free to file an issue.
[decoupled look-back]: https://raphlinus.github.io/gpu/2020/04/30/prefix-sum.html
[memory scope semantics]: https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_memory_scope_semantics.txt

tests/shader/build.ninja
@@ -32,6 +32,16 @@ build gen/prefix.hlsl: hlsl gen/prefix.spv
build gen/prefix.dxil: dxil gen/prefix.hlsl
build gen/prefix.msl: msl gen/prefix.spv
+build gen/prefix_atomic.spv: glsl prefix.comp
+  flags = -DATOMIC
+build gen/prefix_atomic.hlsl: hlsl gen/prefix_atomic.spv
+build gen/prefix_atomic.dxil: dxil gen/prefix_atomic.hlsl
+build gen/prefix_atomic.msl: msl gen/prefix_atomic.spv
+build gen/prefix_vkmm.spv: glsl prefix.comp
+  flags = -DATOMIC -DVKMM
+# Vulkan memory model doesn't translate
build gen/prefix_reduce.spv: glsl prefix_reduce.comp
build gen/prefix_reduce.hlsl: hlsl gen/prefix_reduce.spv
build gen/prefix_reduce.dxil: dxil gen/prefix_reduce.hlsl
@@ -47,3 +57,16 @@ build gen/prefix_scan.spv: glsl prefix_scan.comp
build gen/prefix_scan.hlsl: hlsl gen/prefix_scan.spv
build gen/prefix_scan.dxil: dxil gen/prefix_scan.hlsl
build gen/prefix_scan.msl: msl gen/prefix_scan.spv
+build gen/message_passing.spv: glsl message_passing.comp
+build gen/message_passing.hlsl: hlsl gen/message_passing.spv
+build gen/message_passing.dxil: dxil gen/message_passing.hlsl
+build gen/message_passing.msl: msl gen/message_passing.spv
+build gen/message_passing_vkmm.spv: glsl message_passing.comp
+  flags = -DVKMM
+build gen/linkedlist.spv: glsl linkedlist.comp
+build gen/linkedlist.hlsl: hlsl gen/linkedlist.spv
+build gen/linkedlist.dxil: dxil gen/linkedlist.hlsl
+build gen/linkedlist.msl: msl gen/linkedlist.spv

Binary file not shown.

tests/shader/gen/linkedlist.hlsl Normal file
@@ -0,0 +1,39 @@
static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
RWByteAddressBuffer _56 : register(u0);
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
void comp_main()
{
uint rng = gl_GlobalInvocationID.x + 1u;
for (uint i = 0u; i < 100u; i++)
{
rng ^= (rng << uint(13));
rng ^= (rng >> uint(17));
rng ^= (rng << uint(5));
uint bucket = rng % 65536u;
if (bucket != 0u)
{
uint _61;
_56.InterlockedAdd(0, 2u, _61);
uint alloc = _61 + 65536u;
uint _67;
_56.InterlockedExchange(bucket * 4 + 0, alloc, _67);
uint old = _67;
_56.Store(alloc * 4 + 0, old);
_56.Store((alloc + 1u) * 4 + 0, gl_GlobalInvocationID.x);
}
}
}
[numthreads(256, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
comp_main();
}

tests/shader/gen/linkedlist.msl Normal file
@@ -0,0 +1,36 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
struct MemBuf
{
uint mem[1];
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
kernel void main0(device MemBuf& _56 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
uint rng = gl_GlobalInvocationID.x + 1u;
for (uint i = 0u; i < 100u; i++)
{
rng ^= (rng << uint(13));
rng ^= (rng >> uint(17));
rng ^= (rng << uint(5));
uint bucket = rng % 65536u;
if (bucket != 0u)
{
uint _61 = atomic_fetch_add_explicit((device atomic_uint*)&_56.mem[0], 2u, memory_order_relaxed);
uint alloc = _61 + 65536u;
uint _67 = atomic_exchange_explicit((device atomic_uint*)&_56.mem[bucket], alloc, memory_order_relaxed);
uint old = _67;
_56.mem[alloc] = old;
_56.mem[alloc + 1u] = gl_GlobalInvocationID.x;
}
}
}

Binary file not shown.

Binary file not shown.

tests/shader/gen/message_passing.hlsl Normal file
@@ -0,0 +1,54 @@
struct Element
{
uint data;
uint flag;
};
static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
RWByteAddressBuffer data_buf : register(u0);
RWByteAddressBuffer control_buf : register(u1);
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
uint permute_flag_ix(uint data_ix)
{
return (data_ix * 419u) & 65535u;
}
void comp_main()
{
uint _76;
data_buf.InterlockedExchange(gl_GlobalInvocationID.x * 8 + 0, 1u, _76);
DeviceMemoryBarrier();
uint param = gl_GlobalInvocationID.x;
uint write_flag_ix = permute_flag_ix(param);
uint _77;
data_buf.InterlockedExchange(write_flag_ix * 8 + 4, 1u, _77);
uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u;
uint param_1 = read_ix;
uint read_flag_ix = permute_flag_ix(param_1);
uint _58;
data_buf.InterlockedAdd(read_flag_ix * 8 + 4, 0, _58);
uint flag = _58;
DeviceMemoryBarrier();
uint _62;
data_buf.InterlockedAdd(read_ix * 8 + 0, 0, _62);
uint data = _62;
if (flag > data)
{
uint _73;
control_buf.InterlockedAdd(0, 1u, _73);
}
}
[numthreads(256, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
comp_main();
}

tests/shader/gen/message_passing.msl Normal file
@@ -0,0 +1,54 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
struct Element
{
uint data;
uint flag;
};
struct DataBuf
{
Element data[1];
};
struct ControlBuf
{
uint failures;
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
static inline __attribute__((always_inline))
uint permute_flag_ix(thread const uint& data_ix)
{
return (data_ix * 419u) & 65535u;
}
kernel void main0(device DataBuf& data_buf [[buffer(0)]], device ControlBuf& control_buf [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
atomic_store_explicit((device atomic_uint*)&data_buf.data[gl_GlobalInvocationID.x].data, 1u, memory_order_relaxed);
threadgroup_barrier(mem_flags::mem_device);
uint param = gl_GlobalInvocationID.x;
uint write_flag_ix = permute_flag_ix(param);
atomic_store_explicit((device atomic_uint*)&data_buf.data[write_flag_ix].flag, 1u, memory_order_relaxed);
uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u;
uint param_1 = read_ix;
uint read_flag_ix = permute_flag_ix(param_1);
uint _58 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_flag_ix].flag, memory_order_relaxed);
uint flag = _58;
threadgroup_barrier(mem_flags::mem_device);
uint _62 = atomic_load_explicit((device atomic_uint*)&data_buf.data[read_ix].data, memory_order_relaxed);
uint data = _62;
if (flag > data)
{
uint _73 = atomic_fetch_add_explicit((device atomic_uint*)&control_buf.failures, 1u, memory_order_relaxed);
}
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

tests/shader/gen/prefix_atomic.hlsl Normal file
@@ -0,0 +1,227 @@
struct Monoid
{
uint element;
};
struct State
{
uint flag;
Monoid aggregate;
Monoid prefix;
};
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const Monoid _185 = { 0u };
globallycoherent RWByteAddressBuffer _43 : register(u2);
ByteAddressBuffer _67 : register(t0);
RWByteAddressBuffer _372 : register(u1);
static uint3 gl_LocalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_LocalInvocationID : SV_GroupThreadID;
};
groupshared uint sh_part_ix;
groupshared Monoid sh_scratch[512];
groupshared uint sh_flag;
groupshared Monoid sh_prefix;
Monoid combine_monoid(Monoid a, Monoid b)
{
Monoid _22 = { a.element + b.element };
return _22;
}
void comp_main()
{
if (gl_LocalInvocationID.x == 0u)
{
uint _47;
_43.InterlockedAdd(0, 1u, _47);
sh_part_ix = _47;
}
GroupMemoryBarrierWithGroupSync();
uint part_ix = sh_part_ix;
uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u);
Monoid _71;
_71.element = _67.Load(ix * 4 + 0);
Monoid local[16];
local[0].element = _71.element;
Monoid param_1;
for (uint i = 1u; i < 16u; i++)
{
Monoid param = local[i - 1u];
Monoid _94;
_94.element = _67.Load((ix + i) * 4 + 0);
param_1.element = _94.element;
local[i] = combine_monoid(param, param_1);
}
Monoid agg = local[15];
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Monoid param_2 = other;
Monoid param_3 = agg;
agg = combine_monoid(param_2, param_3);
}
GroupMemoryBarrierWithGroupSync();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
if (gl_LocalInvocationID.x == 511u)
{
_43.Store(part_ix * 12 + 8, agg.element);
if (part_ix == 0u)
{
_43.Store(12, agg.element);
}
}
DeviceMemoryBarrier();
if (gl_LocalInvocationID.x == 511u)
{
uint flag = 1u;
if (part_ix == 0u)
{
flag = 2u;
}
uint _383;
_43.InterlockedExchange(part_ix * 12 + 4, flag, _383);
}
Monoid exclusive = _185;
if (part_ix != 0u)
{
uint look_back_ix = part_ix - 1u;
uint their_ix = 0u;
Monoid their_prefix;
Monoid their_agg;
Monoid m;
while (true)
{
if (gl_LocalInvocationID.x == 511u)
{
uint _208;
_43.InterlockedAdd(look_back_ix * 12 + 4, 0, _208);
sh_flag = _208;
}
GroupMemoryBarrierWithGroupSync();
DeviceMemoryBarrier();
uint flag_1 = sh_flag;
if (flag_1 == 2u)
{
if (gl_LocalInvocationID.x == 511u)
{
Monoid _223;
_223.element = _43.Load(look_back_ix * 12 + 12);
their_prefix.element = _223.element;
Monoid param_4 = their_prefix;
Monoid param_5 = exclusive;
exclusive = combine_monoid(param_4, param_5);
}
break;
}
else
{
if (flag_1 == 1u)
{
if (gl_LocalInvocationID.x == 511u)
{
Monoid _245;
_245.element = _43.Load(look_back_ix * 12 + 8);
their_agg.element = _245.element;
Monoid param_6 = their_agg;
Monoid param_7 = exclusive;
exclusive = combine_monoid(param_6, param_7);
}
look_back_ix--;
their_ix = 0u;
continue;
}
}
if (gl_LocalInvocationID.x == 511u)
{
Monoid _267;
_267.element = _67.Load(((look_back_ix * 8192u) + their_ix) * 4 + 0);
m.element = _267.element;
if (their_ix == 0u)
{
their_agg = m;
}
else
{
Monoid param_8 = their_agg;
Monoid param_9 = m;
their_agg = combine_monoid(param_8, param_9);
}
their_ix++;
if (their_ix == 8192u)
{
Monoid param_10 = their_agg;
Monoid param_11 = exclusive;
exclusive = combine_monoid(param_10, param_11);
if (look_back_ix == 0u)
{
sh_flag = 2u;
}
else
{
look_back_ix--;
their_ix = 0u;
}
}
}
GroupMemoryBarrierWithGroupSync();
flag_1 = sh_flag;
if (flag_1 == 2u)
{
break;
}
}
if (gl_LocalInvocationID.x == 511u)
{
Monoid param_12 = exclusive;
Monoid param_13 = agg;
Monoid inclusive_prefix = combine_monoid(param_12, param_13);
sh_prefix = exclusive;
_43.Store(part_ix * 12 + 12, inclusive_prefix.element);
}
DeviceMemoryBarrier();
if (gl_LocalInvocationID.x == 511u)
{
uint _384;
_43.InterlockedExchange(part_ix * 12 + 4, 2u, _384);
}
}
GroupMemoryBarrierWithGroupSync();
if (part_ix != 0u)
{
exclusive = sh_prefix;
}
Monoid row = exclusive;
if (gl_LocalInvocationID.x > 0u)
{
Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u];
Monoid param_14 = row;
Monoid param_15 = other_1;
row = combine_monoid(param_14, param_15);
}
for (uint i_2 = 0u; i_2 < 16u; i_2++)
{
Monoid param_16 = row;
Monoid param_17 = local[i_2];
Monoid m_1 = combine_monoid(param_16, param_17);
_372.Store((ix + i_2) * 4 + 0, m_1.element);
}
}
[numthreads(512, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
comp_main();
}

tests/shader/gen/prefix_atomic.msl Normal file
@@ -0,0 +1,263 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wmissing-braces"
#pragma clang diagnostic ignored "-Wunused-variable"
#include <metal_stdlib>
#include <simd/simd.h>
#include <metal_atomic>
using namespace metal;
template<typename T, size_t Num>
struct spvUnsafeArray
{
T elements[Num ? Num : 1];
thread T& operator [] (size_t pos) thread
{
return elements[pos];
}
constexpr const thread T& operator [] (size_t pos) const thread
{
return elements[pos];
}
device T& operator [] (size_t pos) device
{
return elements[pos];
}
constexpr const device T& operator [] (size_t pos) const device
{
return elements[pos];
}
constexpr const constant T& operator [] (size_t pos) const constant
{
return elements[pos];
}
threadgroup T& operator [] (size_t pos) threadgroup
{
return elements[pos];
}
constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
{
return elements[pos];
}
};
struct Monoid
{
uint element;
};
struct Monoid_1
{
uint element;
};
struct State
{
uint flag;
Monoid_1 aggregate;
Monoid_1 prefix;
};
struct StateBuf
{
uint part_counter;
State state[1];
};
struct InBuf
{
Monoid_1 inbuf[1];
};
struct OutBuf
{
Monoid_1 outbuf[1];
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
static inline __attribute__((always_inline))
Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b)
{
return Monoid{ a.element + b.element };
}
kernel void main0(const device InBuf& _67 [[buffer(0)]], device OutBuf& _372 [[buffer(1)]], volatile device StateBuf& _43 [[buffer(2)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
threadgroup uint sh_part_ix;
threadgroup Monoid sh_scratch[512];
threadgroup uint sh_flag;
threadgroup Monoid sh_prefix;
if (gl_LocalInvocationID.x == 0u)
{
uint _47 = atomic_fetch_add_explicit((volatile device atomic_uint*)&_43.part_counter, 1u, memory_order_relaxed);
sh_part_ix = _47;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
uint part_ix = sh_part_ix;
uint ix = (part_ix * 8192u) + (gl_LocalInvocationID.x * 16u);
spvUnsafeArray<Monoid, 16> local;
local[0].element = _67.inbuf[ix].element;
Monoid param_1;
for (uint i = 1u; i < 16u; i++)
{
Monoid param = local[i - 1u];
param_1.element = _67.inbuf[ix + i].element;
local[i] = combine_monoid(param, param_1);
}
Monoid agg = local[15];
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Monoid param_2 = other;
Monoid param_3 = agg;
agg = combine_monoid(param_2, param_3);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sh_scratch[gl_LocalInvocationID.x] = agg;
}
if (gl_LocalInvocationID.x == 511u)
{
_43.state[part_ix].aggregate.element = agg.element;
if (part_ix == 0u)
{
_43.state[0].prefix.element = agg.element;
}
}
threadgroup_barrier(mem_flags::mem_device);
if (gl_LocalInvocationID.x == 511u)
{
uint flag = 1u;
if (part_ix == 0u)
{
flag = 2u;
}
atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].flag, flag, memory_order_relaxed);
}
Monoid exclusive = Monoid{ 0u };
if (part_ix != 0u)
{
uint look_back_ix = part_ix - 1u;
uint their_ix = 0u;
Monoid their_prefix;
Monoid their_agg;
Monoid m;
while (true)
{
if (gl_LocalInvocationID.x == 511u)
{
uint _208 = atomic_load_explicit((volatile device atomic_uint*)&_43.state[look_back_ix].flag, memory_order_relaxed);
sh_flag = _208;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup_barrier(mem_flags::mem_device);
uint flag_1 = sh_flag;
if (flag_1 == 2u)
{
if (gl_LocalInvocationID.x == 511u)
{
their_prefix.element = _43.state[look_back_ix].prefix.element;
Monoid param_4 = their_prefix;
Monoid param_5 = exclusive;
exclusive = combine_monoid(param_4, param_5);
}
break;
}
else
{
if (flag_1 == 1u)
{
if (gl_LocalInvocationID.x == 511u)
{
their_agg.element = _43.state[look_back_ix].aggregate.element;
Monoid param_6 = their_agg;
Monoid param_7 = exclusive;
exclusive = combine_monoid(param_6, param_7);
}
look_back_ix--;
their_ix = 0u;
continue;
}
}
if (gl_LocalInvocationID.x == 511u)
{
m.element = _67.inbuf[(look_back_ix * 8192u) + their_ix].element;
if (their_ix == 0u)
{
their_agg = m;
}
else
{
Monoid param_8 = their_agg;
Monoid param_9 = m;
their_agg = combine_monoid(param_8, param_9);
}
their_ix++;
if (their_ix == 8192u)
{
Monoid param_10 = their_agg;
Monoid param_11 = exclusive;
exclusive = combine_monoid(param_10, param_11);
if (look_back_ix == 0u)
{
sh_flag = 2u;
}
else
{
look_back_ix--;
their_ix = 0u;
}
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
flag_1 = sh_flag;
if (flag_1 == 2u)
{
break;
}
}
if (gl_LocalInvocationID.x == 511u)
{
Monoid param_12 = exclusive;
Monoid param_13 = agg;
Monoid inclusive_prefix = combine_monoid(param_12, param_13);
sh_prefix = exclusive;
_43.state[part_ix].prefix.element = inclusive_prefix.element;
}
threadgroup_barrier(mem_flags::mem_device);
if (gl_LocalInvocationID.x == 511u)
{
atomic_store_explicit((volatile device atomic_uint*)&_43.state[part_ix].flag, 2u, memory_order_relaxed);
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (part_ix != 0u)
{
exclusive = sh_prefix;
}
Monoid row = exclusive;
if (gl_LocalInvocationID.x > 0u)
{
Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - 1u];
Monoid param_14 = row;
Monoid param_15 = other_1;
row = combine_monoid(param_14, param_15);
}
for (uint i_2 = 0u; i_2 < 16u; i_2++)
{
Monoid param_16 = row;
Monoid param_17 = local[i_2];
Monoid m_1 = combine_monoid(param_16, param_17);
_372.outbuf[ix + i_2].element = m_1.element;
}
}

Binary file not shown.

Binary file not shown.

tests/shader/linkedlist.comp Normal file
@@ -0,0 +1,31 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Linked list building.
#version 450
#define N_BUCKETS 65536
#define N_ITER 100
layout(local_size_x = 256, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer MemBuf {
uint[] mem;
};
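// Buffer layout: mem[0] is a bump-allocation counter, mem[1..N_BUCKETS) are
// per-bucket list heads, and (next, value) node pairs are allocated above.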
void main() {
uint rng = gl_GlobalInvocationID.x + 1;
for (uint i = 0; i < N_ITER; i++) {
// xorshift32
rng ^= rng << 13;
rng ^= rng >> 17;
rng ^= rng << 5;
uint bucket = rng % N_BUCKETS;
if (bucket != 0) {
uint alloc = atomicAdd(mem[0], 2) + N_BUCKETS;
uint old = atomicExchange(mem[bucket], alloc);
mem[alloc] = old;
mem[alloc + 1] = gl_GlobalInvocationID.x;
}
}
}

tests/shader/message_passing.comp Normal file
@@ -0,0 +1,60 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Our version of the message passing atomic litmus test.
#version 450
#extension GL_KHR_memory_scope_semantics : enable
#ifdef VKMM
#pragma use_vulkan_memory_model
#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire
#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease
#else
#define ACQUIRE 0, 0
#define RELEASE 0, 0
#endif
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
struct Element
{
uint data;
uint flag;
};
layout(binding = 0) buffer DataBuf
{
Element data[];
} data_buf;
layout(binding = 1) buffer ControlBuf
{
uint failures;
} control_buf;
uint permute_flag_ix(uint data_ix)
{
return (data_ix * 419u) & 65535u;
}
void main()
{
atomicStore(data_buf.data[gl_GlobalInvocationID.x].data, 1u, gl_ScopeDevice, 0, 0);
#ifndef VKMM
memoryBarrierBuffer();
#endif
uint write_flag_ix = permute_flag_ix(gl_GlobalInvocationID.x);
atomicStore(data_buf.data[write_flag_ix].flag, 1u, gl_ScopeDevice, RELEASE);
uint read_ix = (gl_GlobalInvocationID.x * 4099u) & 65535u;
uint read_flag_ix = permute_flag_ix(read_ix);
uint flag = atomicLoad(data_buf.data[read_flag_ix].flag, gl_ScopeDevice, ACQUIRE);
#ifndef VKMM
memoryBarrierBuffer();
#endif
uint data = atomicLoad(data_buf.data[read_ix].data, gl_ScopeDevice, 0, 0);
if (flag > data)
{
atomicAdd(control_buf.failures, 1u);
}
}

tests/shader/prefix.comp
@@ -1,9 +1,26 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// A prefix sum.
+//
+// This test builds in three configurations. The default is a
+// compatibility mode, essentially plain GLSL. With ATOMIC set, the
+// flag loads and stores are atomic operations, but ordering still
+// relies on barriers. With both ATOMIC and VKMM set, it uses
+// acquire/release semantics instead of barriers.
#version 450
+#extension GL_KHR_memory_scope_semantics : enable
+#ifdef VKMM
+#pragma use_vulkan_memory_model
+#define ACQUIRE gl_StorageSemanticsBuffer, gl_SemanticsAcquire
+#define RELEASE gl_StorageSemanticsBuffer, gl_SemanticsRelease
+#else
+#define ACQUIRE 0, 0
+#define RELEASE 0, 0
+#endif
#define N_ROWS 16
#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
@@ -24,9 +41,9 @@ layout(set = 0, binding = 1) buffer OutBuf {
};
// These correspond to X, A, P respectively in the prefix sum paper.
-#define FLAG_NOT_READY 0
-#define FLAG_AGGREGATE_READY 1
-#define FLAG_PREFIX_READY 2
+#define FLAG_NOT_READY 0u
+#define FLAG_AGGREGATE_READY 1u
+#define FLAG_PREFIX_READY 2u
struct State {
uint flag;
@@ -34,6 +51,7 @@ struct State {
Monoid prefix;
};
+// Perhaps this should be "nonprivate" with VKMM
layout(set = 0, binding = 2) volatile buffer StateBuf {
uint part_counter;
State[] state;
@@ -87,13 +105,19 @@ void main() {
}
}
// Write flag with release semantics; this is done portably with a barrier.
+#ifndef VKMM
memoryBarrierBuffer();
+#endif
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
uint flag = FLAG_AGGREGATE_READY;
if (part_ix == 0) {
flag = FLAG_PREFIX_READY;
}
+#ifdef ATOMIC
+atomicStore(state[part_ix].flag, flag, gl_ScopeDevice, RELEASE);
+#else
state[part_ix].flag = flag;
+#endif
}
Monoid exclusive = Monoid(0);
@@ -106,13 +130,19 @@ void main() {
while (true) {
// Read flag with acquire semantics.
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+#ifdef ATOMIC
+sh_flag = atomicLoad(state[look_back_ix].flag, gl_ScopeDevice, ACQUIRE);
+#else
sh_flag = state[look_back_ix].flag;
+#endif
}
// The flag load is done only in the last thread. However, because the
// translation of memoryBarrierBuffer to Metal requires uniform control
// flow, we broadcast it to all threads.
barrier();
+#ifndef VKMM
memoryBarrierBuffer();
+#endif
uint flag = sh_flag;
if (flag == FLAG_PREFIX_READY) {
@@ -165,9 +195,15 @@ void main() {
sh_prefix = exclusive;
state[part_ix].prefix = inclusive_prefix;
}
+#ifndef VKMM
memoryBarrierBuffer();
+#endif
if (gl_LocalInvocationID.x == WG_SIZE - 1) {
+#ifdef ATOMIC
+atomicStore(state[part_ix].flag, FLAG_PREFIX_READY, gl_ScopeDevice, RELEASE);
+#else
state[part_ix].flag = FLAG_PREFIX_READY;
+#endif
}
}
barrier();

tests/src/clear.rs
@@ -74,7 +74,7 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
impl ClearCode {
pub unsafe fn new(runner: &mut Runner) -> ClearCode {
-let code = include_shader!(&runner.session, "../shader/gen/Clear");
+let code = include_shader!(&runner.session, "../shader/gen/clear");
let pipeline = runner
.session
.create_compute_pipeline(code, &[BindType::BufReadOnly, BindType::Buffer])

tests/src/linkedlist.rs Normal file
@@ -0,0 +1,185 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
use piet_gpu_hal::{include_shader, BackendType, BindType, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::clear::{ClearBinding, ClearCode, ClearStage};
use crate::runner::{Commands, Runner};
use crate::test_result::TestResult;
use crate::Config;
const WG_SIZE: u64 = 256;
const N_BUCKETS: u64 = 65536;
struct LinkedListCode {
pipeline: Pipeline,
clear_code: Option<ClearCode>,
}
struct LinkedListStage {
clear_stage: Option<ClearStage>,
}
struct LinkedListBinding {
descriptor_set: DescriptorSet,
clear_binding: Option<ClearBinding>,
}
pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut result = TestResult::new("linked list");
let mem_buf = runner.buf_down(1024 * N_BUCKETS);
let code = LinkedListCode::new(runner);
let stage = LinkedListStage::new(runner, &code, N_BUCKETS);
let binding = stage.bind(runner, &code, &mem_buf.dev_buf);
let n_iter = config.n_iter;
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
// Might clear only buckets to save time.
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
commands.write_timestamp(1);
if i == 0 {
commands.cmd_buf.memory_barrier();
commands.download(&mem_buf);
}
total_elapsed += runner.submit(commands);
if i == 0 {
let mut dst: Vec<u32> = Default::default();
mem_buf.read(&mut dst);
if !verify(&dst) {
result.fail("incorrect data");
}
}
}
result.timing(total_elapsed, N_BUCKETS * 100 * n_iter);
result
}
impl LinkedListCode {
unsafe fn new(runner: &mut Runner) -> LinkedListCode {
let code = include_shader!(&runner.session, "../shader/gen/linkedlist");
let pipeline = runner
.session
.create_compute_pipeline(code, &[BindType::Buffer])
.unwrap();
let clear_code = if runner.backend_type() != BackendType::Vulkan {
Some(ClearCode::new(runner))
} else {
None
};
LinkedListCode {
pipeline,
clear_code,
}
}
}
impl LinkedListStage {
unsafe fn new(runner: &mut Runner, code: &LinkedListCode, n_buckets: u64) -> LinkedListStage {
let clear_stage = if code.clear_code.is_some() {
Some(ClearStage::new(runner, n_buckets))
} else {
None
};
LinkedListStage { clear_stage }
}
unsafe fn bind(
&self,
runner: &mut Runner,
code: &LinkedListCode,
mem_buf: &Buffer,
) -> LinkedListBinding {
let descriptor_set = runner
.session
.create_simple_descriptor_set(&code.pipeline, &[mem_buf])
.unwrap();
let clear_binding = if let Some(stage) = &self.clear_stage {
Some(stage.bind(runner, &code.clear_code.as_ref().unwrap(), mem_buf))
} else {
None
};
LinkedListBinding {
descriptor_set,
clear_binding,
}
}
unsafe fn record(
&self,
commands: &mut Commands,
code: &LinkedListCode,
bindings: &LinkedListBinding,
out_buf: &Buffer,
) {
if let Some(stage) = &self.clear_stage {
stage.record(
commands,
code.clear_code.as_ref().unwrap(),
bindings.clear_binding.as_ref().unwrap(),
);
} else {
commands.cmd_buf.clear_buffer(out_buf, None);
}
commands.cmd_buf.memory_barrier();
let n_workgroups = N_BUCKETS / WG_SIZE;
commands.cmd_buf.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
}
}
fn verify(data: &[u32]) -> bool {
let mut expected = (0..N_BUCKETS).map(|_| Vec::new()).collect::<Vec<_>>();
for ix in 0..N_BUCKETS {
let mut rng = ix as u32 + 1;
for _ in 0..100 {
// xorshift32
rng ^= rng.wrapping_shl(13);
rng ^= rng.wrapping_shr(17);
rng ^= rng.wrapping_shl(5);
let bucket = rng % N_BUCKETS as u32;
if bucket != 0 {
expected[bucket as usize].push(ix as u32);
}
}
}
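// Walk each bucket's list (bucket 0 holds the allocation counter, so it is
// skipped) and compare its contents, as a multiset, with the expected values.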
let mut actual = Vec::new();
for (i, expected) in expected.iter_mut().enumerate().skip(1) {
actual.clear();
let mut ptr = i;
loop {
let next = data[ptr] as usize;
if next == 0 {
break;
}
let val = data[next + 1];
actual.push(val);
ptr = next;
}
actual.sort();
expected.sort();
if actual != *expected {
return false;
}
}
true
}

tests/src/main.rs
@@ -18,6 +18,8 @@
mod clear;
mod config;
+mod linkedlist;
+mod message_passing;
mod prefix;
mod prefix_tree;
mod runner;
@@ -27,8 +29,9 @@ use clap::{App, Arg};
use piet_gpu_hal::InstanceFlags;
use crate::config::Config;
-use crate::runner::Runner;
-use crate::test_result::{ReportStyle, TestResult};
+pub use crate::runner::Runner;
+use crate::test_result::ReportStyle;
+pub use crate::test_result::TestResult;
fn main() {
let matches = App::new("piet-gpu-tests")
@@ -86,8 +89,39 @@ fn main() {
}
report(&clear::run_clear_test(&mut runner, &config));
if config.groups.matches("prefix") {
-report(&prefix::run_prefix_test(&mut runner, &config));
+report(&prefix::run_prefix_test(
+&mut runner,
+&config,
+prefix::Variant::Compatibility,
+));
+report(&prefix::run_prefix_test(
+&mut runner,
+&config,
+prefix::Variant::Atomic,
+));
+if runner.session.gpu_info().has_memory_model {
+report(&prefix::run_prefix_test(
+&mut runner,
+&config,
+prefix::Variant::Vkmm,
+));
+}
report(&prefix_tree::run_prefix_test(&mut runner, &config));
}
+if config.groups.matches("atomic") {
+report(&message_passing::run_message_passing_test(
+&mut runner,
+&config,
+message_passing::Variant::Atomic,
+));
+if runner.session.gpu_info().has_memory_model {
+report(&message_passing::run_message_passing_test(
+&mut runner,
+&config,
+message_passing::Variant::Vkmm,
+));
+}
+report(&linkedlist::run_linkedlist_test(&mut runner, &config));
+}
}
}

tests/src/message_passing.rs Normal file
@@ -0,0 +1,180 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
use piet_gpu_hal::{include_shader, BackendType, BindType, BufferUsage, DescriptorSet, ShaderCode};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::clear::{ClearBinding, ClearCode, ClearStage};
use crate::config::Config;
use crate::runner::{Commands, Runner};
use crate::test_result::TestResult;
const N_ELEMENTS: u64 = 65536;
/// The shader code for the message passing example.
struct MessagePassingCode {
pipeline: Pipeline,
clear_code: Option<ClearCode>,
}
/// The stage resources for the message passing example.
struct MessagePassingStage {
data_buf: Buffer,
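// Stages to clear the data buffer and the output buffer; the binding for
// the second (out_buf) clear is created later, in bind().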
clear_stages: Option<(ClearStage, ClearBinding, ClearStage)>,
}
/// The binding for the message passing example.
struct MessagePassingBinding {
descriptor_set: DescriptorSet,
clear_binding: Option<ClearBinding>,
}
#[derive(Debug)]
pub enum Variant {
Atomic,
Vkmm,
}
pub unsafe fn run_message_passing_test(
runner: &mut Runner,
config: &Config,
variant: Variant,
) -> TestResult {
let mut result = TestResult::new(format!("message passing litmus, {:?}", variant));
let out_buf = runner.buf_down(4);
let code = MessagePassingCode::new(runner, variant);
let stage = MessagePassingStage::new(runner, &code);
let binding = stage.bind(runner, &code, &out_buf.dev_buf);
let n_iter = config.n_iter;
let mut total_elapsed = 0.0;
let mut failures = 0;
for _ in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &out_buf.dev_buf);
commands.write_timestamp(1);
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
total_elapsed += runner.submit(commands);
let mut dst: Vec<u32> = Default::default();
out_buf.read(&mut dst);
failures += dst[0];
}
if failures > 0 {
result.fail(format!("{} failures", failures));
}
result.timing(total_elapsed, N_ELEMENTS * n_iter);
result
}
impl MessagePassingCode {
unsafe fn new(runner: &mut Runner, variant: Variant) -> MessagePassingCode {
let code = match variant {
Variant::Atomic => include_shader!(&runner.session, "../shader/gen/message_passing"),
Variant::Vkmm => {
ShaderCode::Spv(include_bytes!("../shader/gen/message_passing_vkmm.spv"))
}
};
let pipeline = runner
.session
.create_compute_pipeline(code, &[BindType::Buffer, BindType::Buffer])
.unwrap();
// Currently, DX12 and Metal backends don't support buffer clearing, so use a
// compute shader as a workaround.
let clear_code = if runner.backend_type() != BackendType::Vulkan {
Some(ClearCode::new(runner))
} else {
None
};
MessagePassingCode {
pipeline,
clear_code,
}
}
}
impl MessagePassingStage {
unsafe fn new(runner: &mut Runner, code: &MessagePassingCode) -> MessagePassingStage {
let data_buf_size = 8 * N_ELEMENTS;
let data_buf = runner
.session
.create_buffer(data_buf_size, BufferUsage::STORAGE | BufferUsage::COPY_DST)
.unwrap();
let clear_stages = if let Some(clear_code) = &code.clear_code {
let stage0 = ClearStage::new(runner, N_ELEMENTS * 2);
let binding0 = stage0.bind(runner, clear_code, &data_buf);
let stage1 = ClearStage::new(runner, 1);
Some((stage0, binding0, stage1))
} else {
None
};
MessagePassingStage {
data_buf,
clear_stages,
}
}
unsafe fn bind(
&self,
runner: &mut Runner,
code: &MessagePassingCode,
out_buf: &Buffer,
) -> MessagePassingBinding {
let descriptor_set = runner
.session
.create_simple_descriptor_set(&code.pipeline, &[&self.data_buf, out_buf])
.unwrap();
let clear_binding = if let Some(clear_code) = &code.clear_code {
Some(
self.clear_stages
.as_ref()
.unwrap()
.2
.bind(runner, clear_code, out_buf),
)
} else {
None
};
MessagePassingBinding {
descriptor_set,
clear_binding,
}
}
unsafe fn record(
&self,
commands: &mut Commands,
code: &MessagePassingCode,
bindings: &MessagePassingBinding,
out_buf: &Buffer,
) {
if let Some((stage0, binding0, stage1)) = &self.clear_stages {
let code = code.clear_code.as_ref().unwrap();
stage0.record(commands, code, binding0);
stage1.record(commands, code, bindings.clear_binding.as_ref().unwrap());
} else {
commands.cmd_buf.clear_buffer(&self.data_buf, None);
commands.cmd_buf.clear_buffer(out_buf, None);
}
commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(256, 1, 1),
(256, 1, 1),
);
}
}

tests/src/prefix.rs
@@ -14,7 +14,7 @@
//
// Also licensed under MIT license, at your choice.
-use piet_gpu_hal::{include_shader, BackendType, BindType, BufferUsage, DescriptorSet};
+use piet_gpu_hal::{include_shader, BackendType, BindType, BufferUsage, DescriptorSet, ShaderCode};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::clear::{ClearBinding, ClearCode, ClearStage};
@@ -51,8 +51,19 @@ struct PrefixBinding {
descriptor_set: DescriptorSet,
}
-pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResult {
-let mut result = TestResult::new("prefix sum, decoupled look-back");
+#[derive(Debug)]
+pub enum Variant {
+Compatibility,
+Atomic,
+Vkmm,
+}
+pub unsafe fn run_prefix_test(
+runner: &mut Runner,
+config: &Config,
+variant: Variant,
+) -> TestResult {
+let mut result = TestResult::new(format!("prefix sum, decoupled look-back, {:?}", variant));
/*
// We're good if we're using DXC.
if runner.backend_type() == BackendType::Dx12 {
@@ -67,7 +78,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResult
.create_buffer_init(&data, BufferUsage::STORAGE)
.unwrap();
let out_buf = runner.buf_down(data_buf.size());
-let code = PrefixCode::new(runner);
+let code = PrefixCode::new(runner, variant);
let stage = PrefixStage::new(runner, &code, n_elements);
let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf);
let n_iter = config.n_iter;
@@ -95,8 +106,12 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResult
}
impl PrefixCode {
-unsafe fn new(runner: &mut Runner) -> PrefixCode {
-let code = include_shader!(&runner.session, "../shader/gen/prefix");
+unsafe fn new(runner: &mut Runner, variant: Variant) -> PrefixCode {
+let code = match variant {
+Variant::Compatibility => include_shader!(&runner.session, "../shader/gen/prefix"),
+Variant::Atomic => include_shader!(&runner.session, "../shader/gen/prefix_atomic"),
+Variant::Vkmm => ShaderCode::Spv(include_bytes!("../shader/gen/prefix_vkmm.spv")),
+};
let pipeline = runner
.session
.create_compute_pipeline(

tests/src/test_result.rs
@@ -38,9 +38,9 @@ pub enum ReportStyle {
}
impl TestResult {
-pub fn new(name: &str) -> TestResult {
+pub fn new(name: impl Into<String>) -> TestResult {
TestResult {
-name: name.to_string(),
+name: name.into(),
total_time: 0.0,
n_elements: 0,
status: Status::Pass,