Add workaround for buffer clearing

Add a clear stage and associated tests, and also use it on non-Vulkan
backends to clear the state buffer.

While this is a workaround that will go away once we implement the actual
clear command, it is also a nice demo of how the new "stage" structure
composes.
This commit is contained in:
Raph Levien 2021-11-10 14:56:00 -08:00
parent 94949a6906
commit fbfd4ee81b
9 changed files with 261 additions and 5 deletions

View file

@ -14,6 +14,10 @@ rule hlsl
rule msl
command = $spirv_cross --msl $in --output $out
build gen/clear.spv: glsl clear.comp
build gen/clear.hlsl: hlsl gen/clear.spv
build gen/clear.msl: msl gen/clear.spv
build gen/prefix.spv: glsl prefix.comp
build gen/prefix.hlsl: hlsl gen/prefix.spv
build gen/prefix.msl: msl gen/prefix.spv

26
tests/shader/clear.comp Normal file
View file

@ -0,0 +1,26 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// Clear a buffer: write `value` into each of the first `size` words.
#version 450

layout(local_size_x = 256) in;

// This should probably be uniform rather than readonly,
// but we haven't done the binding work yet.
layout(binding = 0) readonly buffer ConfigBuf {
    // size is in uint (4 byte) units
    uint size;
    // the word stored into every element of the target buffer
    uint value;
};

// The buffer to be cleared.
layout(binding = 1) buffer TargetBuf {
    uint[] data;
};

void main() {
    uint ix = gl_GlobalInvocationID.x;
    // Guard the tail: the last workgroup may extend past `size`.
    if (ix < size) {
        data[ix] = value;
    }
}

View file

@ -0,0 +1,26 @@
// Generated from clear.comp by spirv-cross (see the build rules); do not
// edit by hand — regenerate via the gen/clear.hlsl build target instead.
// _19 is the readonly ConfigBuf (size at offset 0, value at offset 4);
// _32 is the TargetBuf being cleared.
static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);

ByteAddressBuffer _19 : register(t0);
RWByteAddressBuffer _32 : register(u1);

static uint3 gl_GlobalInvocationID;

struct SPIRV_Cross_Input
{
    uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};

void comp_main()
{
    uint ix = gl_GlobalInvocationID.x;
    // ix < size (ConfigBuf word 0): guard against the partial last workgroup.
    if (ix < _19.Load(0))
    {
        // data[ix] = value (ConfigBuf word 1); offsets are in bytes.
        _32.Store(ix * 4 + 0, _19.Load(4));
    }
}

[numthreads(256, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
    gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
    comp_main();
}

View file

@ -0,0 +1,27 @@
// Generated from clear.comp by spirv-cross (see the build rules); do not
// edit by hand — regenerate via the gen/clear.msl build target instead.
#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

// Mirrors ConfigBuf in clear.comp: element count and fill value.
struct ConfigBuf
{
    uint size;
    uint value;
};

// Mirrors TargetBuf; the [1] is spirv-cross's encoding of a runtime array.
struct TargetBuf
{
    uint data[1];
};

constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);

kernel void main0(const device ConfigBuf& _19 [[buffer(0)]], device TargetBuf& _32 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
    uint ix = gl_GlobalInvocationID.x;
    // Guard against the partial last workgroup running past `size`.
    if (ix < _19.size)
    {
        _32.data[ix] = _19.value;
    }
}

BIN
tests/shader/gen/clear.spv Normal file

Binary file not shown.

146
tests/src/clear.rs Normal file
View file

@ -0,0 +1,146 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
//! Utilities (and a benchmark) for clearing buffers with compute shaders.
use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::config::Config;
use crate::runner::{Commands, Runner};
use crate::test_result::TestResult;
// Workgroup size of the clear shader; must match local_size_x in clear.comp.
const WG_SIZE: u64 = 256;

/// The shader code for clearing buffers.
///
/// Created once per session and reusable across any number of stages.
pub struct ClearCode {
    // Compute pipeline built from the clear shader.
    pipeline: Pipeline,
}

/// The stage resources for clearing buffers.
pub struct ClearStage {
    // Number of u32 words to clear.
    n_elements: u64,
    // Holds (size, value) as two u32s, matching ConfigBuf in clear.comp.
    config_buf: Buffer,
}

/// The binding for clearing buffers.
pub struct ClearBinding {
    // Binds the config buffer and the target buffer to the pipeline.
    descriptor_set: DescriptorSet,
}
/// Run the buffer-clear test and benchmark.
///
/// Records `n_iter` timed clear dispatches; on the first iteration the
/// buffer is also read back and every word checked against the fill value.
pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult {
    let mut result = TestResult::new("clear buffers");
    // Buffer size in u32 words; this will be configurable.
    let n_elements: u64 = config.size.choose(1 << 12, 1 << 20, 1 << 24);
    let out_buf = runner.buf_down(n_elements * 4);
    let code = ClearCode::new(runner);
    let stage = ClearStage::new_with_value(runner, n_elements, 0x42);
    let binding = stage.bind(runner, &code, &out_buf.dev_buf);
    // Iteration count will also be configurable of course.
    let n_iter = 1000;
    let mut total_elapsed = 0.0;
    for iteration in 0..n_iter {
        let is_first = iteration == 0;
        let mut commands = runner.commands();
        commands.write_timestamp(0);
        stage.record(&mut commands, &code, &binding);
        commands.write_timestamp(1);
        if is_first {
            // Only the first iteration downloads the result for checking.
            commands.cmd_buf.memory_barrier();
            commands.download(&out_buf);
        }
        total_elapsed += runner.submit(commands);
        if is_first {
            let mut host_data: Vec<u32> = Default::default();
            out_buf.read(&mut host_data);
            if let Some(failure) = verify(&host_data) {
                result.fail(format!("failure at {}", failure));
            }
        }
    }
    result.timing(total_elapsed, n_elements * n_iter);
    result
}
impl ClearCode {
    /// Compile the clear shader into a compute pipeline.
    ///
    /// Binding 0 is the read-only config buffer, binding 1 the buffer
    /// to be cleared (see clear.comp).
    pub unsafe fn new(runner: &mut Runner) -> ClearCode {
        // The build rules generate lowercase gen/clear.{spv,hlsl,msl};
        // a capitalized "Clear" path would fail on case-sensitive
        // filesystems, so the lowercase name is used here.
        let code = include_shader!(&runner.session, "../shader/gen/clear");
        let pipeline = runner
            .session
            .create_compute_pipeline(
                code,
                &[BindType::BufReadOnly, BindType::Buffer],
            )
            .unwrap();
        ClearCode { pipeline }
    }
}
impl ClearStage {
    /// Create a stage that clears `n_elements` u32 words to zero.
    pub unsafe fn new(runner: &mut Runner, n_elements: u64) -> ClearStage {
        Self::new_with_value(runner, n_elements, 0)
    }

    /// Create a stage that clears `n_elements` u32 words to `value`.
    ///
    /// Uploads a two-word config buffer (size, value) matching the
    /// ConfigBuf layout in clear.comp.
    pub unsafe fn new_with_value(runner: &mut Runner, n_elements: u64, value: u32) -> ClearStage {
        // NOTE(review): the cast truncates if n_elements exceeds u32::MAX.
        let config = [n_elements as u32, value];
        let config_buf = runner
            .session
            .create_buffer_init(&config, BufferUsage::STORAGE)
            .unwrap();
        ClearStage {
            n_elements,
            config_buf,
        }
    }

    /// Create a descriptor set binding this stage's config buffer and the
    /// target buffer `out_buf` to the clear pipeline.
    pub unsafe fn bind(
        &self,
        runner: &mut Runner,
        code: &ClearCode,
        out_buf: &Buffer,
    ) -> ClearBinding {
        let descriptor_set = runner
            .session
            .create_simple_descriptor_set(&code.pipeline, &[&self.config_buf, out_buf])
            .unwrap();
        ClearBinding { descriptor_set }
    }

    /// Record the clear dispatch into `commands`.
    ///
    /// The caller is responsible for any memory barriers needed before
    /// or after the dispatch.
    pub unsafe fn record(
        &self,
        commands: &mut Commands,
        code: &ClearCode,
        bindings: &ClearBinding,
    ) {
        // Round up so a partial final workgroup covers the tail.
        let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
        // An issue: for clearing large buffers (>16M), we need to check the
        // number of workgroups against the (dynamically detected) limit, and
        // potentially issue multiple dispatches.
        commands.cmd_buf.dispatch(
            &code.pipeline,
            &bindings.descriptor_set,
            (n_workgroups as u32, 1, 1),
            (WG_SIZE as u32, 1, 1),
        );
        // One thing that's missing here is registering the buffers so
        // they can be safely dropped by Rust code before the execution
        // of the command buffer completes.
    }
}
// Verify that the data is cleared.
fn verify(data: &[u32]) -> Option<usize> {
data.iter().position(|val| *val != 0x42)
}

View file

@ -16,6 +16,7 @@
//! Tests for piet-gpu shaders and GPU capabilities.
mod clear;
mod config;
mod prefix;
mod prefix_tree;
@ -79,6 +80,7 @@ fn main() {
flags |= InstanceFlags::DX12;
}
let mut runner = Runner::new(flags);
report(&clear::run_clear_test(&mut runner, &config));
if config.groups.matches("prefix") {
report(&prefix::run_prefix_test(&mut runner, &config));
report(&prefix_tree::run_prefix_test(&mut runner, &config));

View file

@ -17,6 +17,7 @@
use piet_gpu_hal::{include_shader, BackendType, BindType, BufferUsage, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::clear::{ClearBinding, ClearCode, ClearStage};
use crate::config::Config;
use crate::runner::{Commands, Runner};
use crate::test_result::TestResult;
@ -30,6 +31,7 @@ const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
/// A code struct can be created once and reused any number of times.
struct PrefixCode {
pipeline: Pipeline,
clear_code: Option<ClearCode>,
}
/// The stage resources for the prefix sum example.
@ -41,6 +43,7 @@ struct PrefixStage {
// treat it as a capacity.
n_elements: u64,
state_buf: Buffer,
clear_stage: Option<(ClearStage, ClearBinding)>,
}
/// The binding for the prefix sum example.
@ -63,7 +66,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
.unwrap();
let out_buf = runner.buf_down(data_buf.size());
let code = PrefixCode::new(runner);
let stage = PrefixStage::new(runner, n_elements);
let stage = PrefixStage::new(runner, &code, n_elements);
let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf);
// Also will be configurable of course.
let n_iter = 1000;
@ -100,21 +103,39 @@ impl PrefixCode {
&[BindType::BufReadOnly, BindType::Buffer, BindType::Buffer],
)
.unwrap();
PrefixCode { pipeline }
// Currently, DX12 and Metal backends don't support buffer clearing, so use a
// compute shader as a workaround.
let clear_code = if runner.backend_type() != BackendType::Vulkan {
Some(ClearCode::new(runner))
} else {
None
};
PrefixCode {
pipeline,
clear_code,
}
}
}
impl PrefixStage {
unsafe fn new(runner: &mut Runner, n_elements: u64) -> PrefixStage {
unsafe fn new(runner: &mut Runner, code: &PrefixCode, n_elements: u64) -> PrefixStage {
let n_workgroups = (n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
let state_buf_size = 4 + 12 * n_workgroups;
let state_buf = runner
.session
.create_buffer(state_buf_size, BufferUsage::STORAGE | BufferUsage::COPY_DST)
.unwrap();
let clear_stage = if let Some(clear_code) = &code.clear_code {
let stage = ClearStage::new(runner, state_buf_size / 4);
let binding = stage.bind(runner, clear_code, &state_buf);
Some((stage, binding))
} else {
None
};
PrefixStage {
n_elements,
state_buf,
clear_stage,
}
}
@ -134,7 +155,11 @@ impl PrefixStage {
unsafe fn record(&self, commands: &mut Commands, code: &PrefixCode, bindings: &PrefixBinding) {
let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
commands.cmd_buf.clear_buffer(&self.state_buf, None);
if let Some((stage, binding)) = &self.clear_stage {
stage.record(commands, code.clear_code.as_ref().unwrap(), binding);
} else {
commands.cmd_buf.clear_buffer(&self.state_buf, None);
}
commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch(
&code.pipeline,

View file

@ -85,7 +85,7 @@ impl Runner {
let submitted = self.session.run_cmd_buf(cmd_buf, &[], &[]).unwrap();
self.cmd_buf_pool.extend(submitted.wait().unwrap());
let timestamps = self.session.fetch_query_pool(&query_pool).unwrap();
timestamps[0]
timestamps.get(0).copied().unwrap_or_default()
}
#[allow(unused)]