Add workaround for buffer clearing

Add a clear stage and associated tests, and also use it on non-Vulkan
backends to clear the state buffer.

While that's a workaround and will go away when we implement the actual
clear command, it's also a nice demo of how the new "stage" structure
composes.
This commit is contained in:
Raph Levien 2021-11-10 14:56:00 -08:00
parent 94949a6906
commit fbfd4ee81b
9 changed files with 261 additions and 5 deletions

View file

@ -14,6 +14,10 @@ rule hlsl
rule msl rule msl
command = $spirv_cross --msl $in --output $out command = $spirv_cross --msl $in --output $out
build gen/clear.spv: glsl clear.comp
build gen/clear.hlsl: hlsl gen/clear.spv
build gen/clear.msl: msl gen/clear.spv
build gen/prefix.spv: glsl prefix.comp build gen/prefix.spv: glsl prefix.comp
build gen/prefix.hlsl: hlsl gen/prefix.spv build gen/prefix.hlsl: hlsl gen/prefix.spv
build gen/prefix.msl: msl gen/prefix.spv build gen/prefix.msl: msl gen/prefix.spv

26
tests/shader/clear.comp Normal file
View file

@ -0,0 +1,26 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Clear a buffer.
#version 450
layout(local_size_x = 256) in;
// This should probably be uniform rather than readonly,
// but we haven't done the binding work yet.
layout(binding = 0) readonly buffer ConfigBuf {
// size is in uint (4 byte) units
uint size;
uint value;
};
layout(binding = 1) buffer TargetBuf {
uint[] data;
};
void main() {
uint ix = gl_GlobalInvocationID.x;
if (ix < size) {
data[ix] = value;
}
}

View file

@ -0,0 +1,26 @@
static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u);
ByteAddressBuffer _19 : register(t0);
RWByteAddressBuffer _32 : register(u1);
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
void comp_main()
{
uint ix = gl_GlobalInvocationID.x;
if (ix < _19.Load(0))
{
_32.Store(ix * 4 + 0, _19.Load(4));
}
}
[numthreads(256, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
comp_main();
}

View file

@ -0,0 +1,27 @@
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct ConfigBuf
{
uint size;
uint value;
};
struct TargetBuf
{
uint data[1];
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u);
kernel void main0(const device ConfigBuf& _19 [[buffer(0)]], device TargetBuf& _32 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]])
{
uint ix = gl_GlobalInvocationID.x;
if (ix < _19.size)
{
_32.data[ix] = _19.value;
}
}

BIN
tests/shader/gen/clear.spv Normal file

Binary file not shown.

146
tests/src/clear.rs Normal file
View file

@ -0,0 +1,146 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
//! Utilities (and a benchmark) for clearing buffers with compute shaders.
use piet_gpu_hal::{include_shader, BindType, BufferUsage, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline};
use crate::config::Config;
use crate::runner::{Commands, Runner};
use crate::test_result::TestResult;
const WG_SIZE: u64 = 256;
/// The shader code for clearing buffers.
pub struct ClearCode {
pipeline: Pipeline,
}
/// The stage resources for clearing buffers.
pub struct ClearStage {
n_elements: u64,
config_buf: Buffer,
}
/// The binding for clearing buffers.
pub struct ClearBinding {
descriptor_set: DescriptorSet,
}
pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut result = TestResult::new("clear buffers");
// This will be configurable.
let n_elements: u64 = config.size.choose(1 << 12, 1 << 20, 1 << 24);
let out_buf = runner.buf_down(n_elements * 4);
let code = ClearCode::new(runner);
let stage = ClearStage::new_with_value(runner, n_elements, 0x42);
let binding = stage.bind(runner, &code, &out_buf.dev_buf);
// Also will be configurable of course.
let n_iter = 1000;
let mut total_elapsed = 0.0;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1);
if i == 0 {
commands.cmd_buf.memory_barrier();
commands.download(&out_buf);
}
total_elapsed += runner.submit(commands);
if i == 0 {
let mut dst: Vec<u32> = Default::default();
out_buf.read(&mut dst);
if let Some(failure) = verify(&dst) {
result.fail(format!("failure at {}", failure));
}
}
}
result.timing(total_elapsed, n_elements * n_iter);
result
}
impl ClearCode {
pub unsafe fn new(runner: &mut Runner) -> ClearCode {
let code = include_shader!(&runner.session, "../shader/gen/Clear");
let pipeline = runner
.session
.create_compute_pipeline(
code,
&[BindType::BufReadOnly, BindType::Buffer],
)
.unwrap();
ClearCode { pipeline }
}
}
impl ClearStage {
pub unsafe fn new(runner: &mut Runner, n_elements: u64) -> ClearStage {
Self::new_with_value(runner, n_elements, 0)
}
pub unsafe fn new_with_value(runner: &mut Runner, n_elements: u64, value: u32) -> ClearStage {
let config = [n_elements as u32, value];
let config_buf = runner
.session
.create_buffer_init(&config, BufferUsage::STORAGE)
.unwrap();
ClearStage {
n_elements,
config_buf,
}
}
pub unsafe fn bind(
&self,
runner: &mut Runner,
code: &ClearCode,
out_buf: &Buffer,
) -> ClearBinding {
let descriptor_set = runner
.session
.create_simple_descriptor_set(&code.pipeline, &[&self.config_buf, out_buf])
.unwrap();
ClearBinding { descriptor_set }
}
pub unsafe fn record(
&self,
commands: &mut Commands,
code: &ClearCode,
bindings: &ClearBinding,
) {
let n_workgroups = (self.n_elements + WG_SIZE - 1) / WG_SIZE;
// An issue: for clearing large buffers (>16M), we need to check the
// number of workgroups against the (dynamically detected) limit, and
// potentially issue multiple dispatches.
commands.cmd_buf.dispatch(
&code.pipeline,
&bindings.descriptor_set,
(n_workgroups as u32, 1, 1),
(WG_SIZE as u32, 1, 1),
);
// One thing that's missing here is registering the buffers so
// they can be safely dropped by Rust code before the execution
// of the command buffer completes.
}
}
// Verify that the data is cleared.
fn verify(data: &[u32]) -> Option<usize> {
data.iter().position(|val| *val != 0x42)
}

View file

@ -16,6 +16,7 @@
//! Tests for piet-gpu shaders and GPU capabilities. //! Tests for piet-gpu shaders and GPU capabilities.
mod clear;
mod config; mod config;
mod prefix; mod prefix;
mod prefix_tree; mod prefix_tree;
@ -79,6 +80,7 @@ fn main() {
flags |= InstanceFlags::DX12; flags |= InstanceFlags::DX12;
} }
let mut runner = Runner::new(flags); let mut runner = Runner::new(flags);
report(&clear::run_clear_test(&mut runner, &config));
if config.groups.matches("prefix") { if config.groups.matches("prefix") {
report(&prefix::run_prefix_test(&mut runner, &config)); report(&prefix::run_prefix_test(&mut runner, &config));
report(&prefix_tree::run_prefix_test(&mut runner, &config)); report(&prefix_tree::run_prefix_test(&mut runner, &config));

View file

@ -17,6 +17,7 @@
use piet_gpu_hal::{include_shader, BackendType, BindType, BufferUsage, DescriptorSet}; use piet_gpu_hal::{include_shader, BackendType, BindType, BufferUsage, DescriptorSet};
use piet_gpu_hal::{Buffer, Pipeline}; use piet_gpu_hal::{Buffer, Pipeline};
use crate::clear::{ClearBinding, ClearCode, ClearStage};
use crate::config::Config; use crate::config::Config;
use crate::runner::{Commands, Runner}; use crate::runner::{Commands, Runner};
use crate::test_result::TestResult; use crate::test_result::TestResult;
@ -30,6 +31,7 @@ const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
/// A code struct can be created once and reused any number of times. /// A code struct can be created once and reused any number of times.
struct PrefixCode { struct PrefixCode {
pipeline: Pipeline, pipeline: Pipeline,
clear_code: Option<ClearCode>,
} }
/// The stage resources for the prefix sum example. /// The stage resources for the prefix sum example.
@ -41,6 +43,7 @@ struct PrefixStage {
// treat it as a capacity. // treat it as a capacity.
n_elements: u64, n_elements: u64,
state_buf: Buffer, state_buf: Buffer,
clear_stage: Option<(ClearStage, ClearBinding)>,
} }
/// The binding for the prefix sum example. /// The binding for the prefix sum example.
@ -63,7 +66,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
.unwrap(); .unwrap();
let out_buf = runner.buf_down(data_buf.size()); let out_buf = runner.buf_down(data_buf.size());
let code = PrefixCode::new(runner); let code = PrefixCode::new(runner);
let stage = PrefixStage::new(runner, n_elements); let stage = PrefixStage::new(runner, &code, n_elements);
let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf); let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf);
// Also will be configurable of course. // Also will be configurable of course.
let n_iter = 1000; let n_iter = 1000;
@ -100,21 +103,39 @@ impl PrefixCode {
&[BindType::BufReadOnly, BindType::Buffer, BindType::Buffer], &[BindType::BufReadOnly, BindType::Buffer, BindType::Buffer],
) )
.unwrap(); .unwrap();
PrefixCode { pipeline } // Currently, DX12 and Metal backends don't support buffer clearing, so use a
// compute shader as a workaround.
let clear_code = if runner.backend_type() != BackendType::Vulkan {
Some(ClearCode::new(runner))
} else {
None
};
PrefixCode {
pipeline,
clear_code,
}
} }
} }
impl PrefixStage { impl PrefixStage {
unsafe fn new(runner: &mut Runner, n_elements: u64) -> PrefixStage { unsafe fn new(runner: &mut Runner, code: &PrefixCode, n_elements: u64) -> PrefixStage {
let n_workgroups = (n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG; let n_workgroups = (n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
let state_buf_size = 4 + 12 * n_workgroups; let state_buf_size = 4 + 12 * n_workgroups;
let state_buf = runner let state_buf = runner
.session .session
.create_buffer(state_buf_size, BufferUsage::STORAGE | BufferUsage::COPY_DST) .create_buffer(state_buf_size, BufferUsage::STORAGE | BufferUsage::COPY_DST)
.unwrap(); .unwrap();
let clear_stage = if let Some(clear_code) = &code.clear_code {
let stage = ClearStage::new(runner, state_buf_size / 4);
let binding = stage.bind(runner, clear_code, &state_buf);
Some((stage, binding))
} else {
None
};
PrefixStage { PrefixStage {
n_elements, n_elements,
state_buf, state_buf,
clear_stage,
} }
} }
@ -134,7 +155,11 @@ impl PrefixStage {
unsafe fn record(&self, commands: &mut Commands, code: &PrefixCode, bindings: &PrefixBinding) { unsafe fn record(&self, commands: &mut Commands, code: &PrefixCode, bindings: &PrefixBinding) {
let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG; let n_workgroups = (self.n_elements + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
if let Some((stage, binding)) = &self.clear_stage {
stage.record(commands, code.clear_code.as_ref().unwrap(), binding);
} else {
commands.cmd_buf.clear_buffer(&self.state_buf, None); commands.cmd_buf.clear_buffer(&self.state_buf, None);
}
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.cmd_buf.dispatch( commands.cmd_buf.dispatch(
&code.pipeline, &code.pipeline,

View file

@ -85,7 +85,7 @@ impl Runner {
let submitted = self.session.run_cmd_buf(cmd_buf, &[], &[]).unwrap(); let submitted = self.session.run_cmd_buf(cmd_buf, &[], &[]).unwrap();
self.cmd_buf_pool.extend(submitted.wait().unwrap()); self.cmd_buf_pool.extend(submitted.wait().unwrap());
let timestamps = self.session.fetch_query_pool(&query_pool).unwrap(); let timestamps = self.session.fetch_query_pool(&query_pool).unwrap();
timestamps[0] timestamps.get(0).copied().unwrap_or_default()
} }
#[allow(unused)] #[allow(unused)]