Add a tree-reduction prefix sum test alongside the existing prefix test.

Summary of the changes carried by this patch:

  * tests/shader/build.ninja: the glsl rule now passes $flags to the
    validator, and build edges are added for prefix_reduce, prefix_root
    (prefix_scan.comp compiled with -DROOT) and prefix_scan.
    NOTE(review): prefix_scan.comp is referenced here but is not added by
    this patch; confirm it is provided elsewhere.
  * tests/shader/prefix_reduce.comp: new compute shader. Each invocation
    combines N_ROWS consecutive inputs, the workgroup combines those
    partial results through shared memory, and invocation 0 writes one
    aggregate per workgroup.
  * tests/src/main.rs: runs prefix_tree::run_prefix_test after the
    existing prefix test.
  * tests/src/prefix.rs: n_iter lowered from 5000 to 1000; the signature
    of PrefixStage::bind is reformatted only.
  * tests/src/prefix_tree.rs: new test. It records one reduce dispatch per
    temporary buffer, a single root dispatch, then one scan dispatch per
    temporary buffer in reverse order, and verifies the result on the
    first iteration.
  * tests/src/runner.rs: the device buffer of BufDown is additionally
    created with COPY_DST, which the new test needs because it copies the
    input into that buffer before each run.

diff --git a/tests/shader/build.ninja b/tests/shader/build.ninja
index c5ecb06..93a0b66 100644
--- a/tests/shader/build.ninja
+++ b/tests/shader/build.ninja
@@ -6,7 +6,7 @@ glslang_validator = glslangValidator
 spirv_cross = spirv-cross
 
 rule glsl
-  command = $glslang_validator -V -o $out $in
+  command = $glslang_validator $flags -V -o $out $in
 
 rule hlsl
   command = $spirv_cross --hlsl $in --output $out
@@ -17,3 +17,16 @@ rule msl
 build gen/prefix.spv: glsl prefix.comp
 build gen/prefix.hlsl: hlsl gen/prefix.spv
 build gen/prefix.msl: msl gen/prefix.spv
+
+build gen/prefix_reduce.spv: glsl prefix_reduce.comp
+build gen/prefix_reduce.hlsl: hlsl gen/prefix_reduce.spv
+build gen/prefix_reduce.msl: msl gen/prefix_reduce.spv
+
+build gen/prefix_root.spv: glsl prefix_scan.comp
+  flags = -DROOT
+build gen/prefix_root.hlsl: hlsl gen/prefix_root.spv
+build gen/prefix_root.msl: msl gen/prefix_root.spv
+
+build gen/prefix_scan.spv: glsl prefix_scan.comp
+build gen/prefix_scan.hlsl: hlsl gen/prefix_scan.spv
+build gen/prefix_scan.msl: msl gen/prefix_scan.spv
diff --git a/tests/shader/prefix_reduce.comp b/tests/shader/prefix_reduce.comp
new file mode 100644
index 0000000..378da88
--- /dev/null
+++ b/tests/shader/prefix_reduce.comp
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+// The reduction phase for prefix sum implemented as a tree reduction.
+
+#version 450
+
+#define N_ROWS 8
+#define LG_WG_SIZE 9
+#define WG_SIZE (1 << LG_WG_SIZE)
+#define PARTITION_SIZE (WG_SIZE * N_ROWS)
+
+layout(local_size_x = WG_SIZE, local_size_y = 1) in;
+
+struct Monoid {
+    uint element;
+};
+
+layout(set = 0, binding = 0) readonly buffer InBuf {
+    Monoid[] inbuf;
+};
+
+layout(set = 0, binding = 1) buffer OutBuf {
+    Monoid[] outbuf;
+};
+
+shared Monoid sh_scratch[WG_SIZE];
+
+Monoid combine_monoid(Monoid a, Monoid b) {
+    return Monoid(a.element + b.element);
+}
+
+void main() {
+    uint ix = gl_GlobalInvocationID.x * N_ROWS;
+    // TODO: gate buffer read
+    Monoid agg = inbuf[ix];
+    for (uint i = 1; i < N_ROWS; i++) {
+        agg = combine_monoid(agg, inbuf[ix + i]);
+    }
+    sh_scratch[gl_LocalInvocationID.x] = agg;
+    for (uint i = 0; i < LG_WG_SIZE; i++) {
+        barrier();
+        // We could make this predicate tighter, but would it help?
+        if (gl_LocalInvocationID.x + (1 << i) < WG_SIZE) {
+            Monoid other = sh_scratch[gl_LocalInvocationID.x + (1 << i)];
+            agg = combine_monoid(agg, other);
+        }
+        barrier();
+        sh_scratch[gl_LocalInvocationID.x] = agg;
+    }
+    if (gl_LocalInvocationID.x == 0) {
+        outbuf[gl_WorkGroupID.x] = agg;
+    }
+}
diff --git a/tests/src/main.rs b/tests/src/main.rs
index 2fe2a3d..85d8a66 100644
--- a/tests/src/main.rs
+++ b/tests/src/main.rs
@@ -17,6 +17,7 @@
 //! Tests for piet-gpu shaders and GPU capabilities.
 
 mod prefix;
+mod prefix_tree;
 mod runner;
 
 use runner::Runner;
@@ -25,5 +26,6 @@ fn main() {
     unsafe {
         let mut runner = Runner::new();
         prefix::run_prefix_test(&mut runner);
+        prefix_tree::run_prefix_test(&mut runner);
     }
 }
diff --git a/tests/src/prefix.rs b/tests/src/prefix.rs
index 2a52f75..f95470a 100644
--- a/tests/src/prefix.rs
+++ b/tests/src/prefix.rs
@@ -59,7 +59,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner) {
     let stage = PrefixStage::new(runner, n_elements);
     let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf);
     // Also will be configurable of course.
-    let n_iter = 5000;
+    let n_iter = 1000;
     let mut total_elapsed = 0.0;
     for i in 0..n_iter {
         let mut commands = runner.commands();
@@ -110,7 +110,13 @@ impl PrefixStage {
         }
     }
 
-    unsafe fn bind(&self, runner: &mut Runner, code: &PrefixCode, in_buf: &Buffer, out_buf: &Buffer) -> PrefixBinding {
+    unsafe fn bind(
+        &self,
+        runner: &mut Runner,
+        code: &PrefixCode,
+        in_buf: &Buffer,
+        out_buf: &Buffer,
+    ) -> PrefixBinding {
         let descriptor_set = runner
             .session
             .create_simple_descriptor_set(&code.pipeline, &[in_buf, out_buf, &self.state_buf])
diff --git a/tests/src/prefix_tree.rs b/tests/src/prefix_tree.rs
new file mode 100644
index 0000000..7b9743a
--- /dev/null
+++ b/tests/src/prefix_tree.rs
@@ -0,0 +1,210 @@
+// Copyright 2021 The piet-gpu authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Also licensed under MIT license, at your choice.
+
+use piet_gpu_hal::{include_shader, BufferUsage, DescriptorSet};
+use piet_gpu_hal::{Buffer, Pipeline};
+
+use crate::runner::{Commands, Runner};
+
+const WG_SIZE: u64 = 512;
+const N_ROWS: u64 = 8;
+const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
+
+struct PrefixTreeCode {
+    reduce_pipeline: Pipeline,
+    scan_pipeline: Pipeline,
+    root_pipeline: Pipeline,
+}
+
+struct PrefixTreeStage {
+    sizes: Vec<u64>,
+    tmp_bufs: Vec<Buffer>,
+}
+
+struct PrefixTreeBinding {
+    // All but the first and last can be moved to stage.
+    descriptor_sets: Vec<DescriptorSet>,
+}
+
+pub unsafe fn run_prefix_test(runner: &mut Runner) {
+    // This will be configurable. Note though that the current code is
+    // prone to reading and writing past the end of buffers if this is
+    // not a power of the number of elements processed in a workgroup.
+    let n_elements: u64 = 1 << 24;
+    let data: Vec<u32> = (0..n_elements as u32).collect();
+    let data_buf = runner
+        .session
+        .create_buffer_init(&data, BufferUsage::STORAGE)
+        .unwrap();
+    let out_buf = runner.buf_down(data_buf.size());
+    let code = PrefixTreeCode::new(runner);
+    let stage = PrefixTreeStage::new(runner, n_elements);
+    let binding = stage.bind(runner, &code, &out_buf.dev_buf);
+    // Also will be configurable of course.
+    let n_iter = 1000;
+    let mut total_elapsed = 0.0;
+    for i in 0..n_iter {
+        let mut commands = runner.commands();
+        commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
+        commands.cmd_buf.memory_barrier();
+        commands.write_timestamp(0);
+        stage.record(&mut commands, &code, &binding);
+        commands.write_timestamp(1);
+        if i == 0 {
+            commands.cmd_buf.memory_barrier();
+            commands.download(&out_buf);
+        }
+        total_elapsed += runner.submit(commands);
+        if i == 0 {
+            let mut dst: Vec<u32> = Default::default();
+            out_buf.read(&mut dst);
+            println!("failures: {:?}", verify(&dst));
+        }
+    }
+    let throughput = (n_elements * n_iter) as f64 / total_elapsed;
+    println!(
+        "total {:?}ms, throughput = {}G el/s",
+        total_elapsed * 1e3,
+        throughput * 1e-9
+    );
+}
+
+impl PrefixTreeCode {
+    unsafe fn new(runner: &mut Runner) -> PrefixTreeCode {
+        let reduce_code = include_shader!(&runner.session, "../shader/gen/prefix_reduce");
+        let reduce_pipeline = runner
+            .session
+            .create_simple_compute_pipeline(reduce_code, 2)
+            .unwrap();
+        let scan_code = include_shader!(&runner.session, "../shader/gen/prefix_scan");
+        let scan_pipeline = runner
+            .session
+            .create_simple_compute_pipeline(scan_code, 2)
+            .unwrap();
+        let root_code = include_shader!(&runner.session, "../shader/gen/prefix_root");
+        let root_pipeline = runner
+            .session
+            .create_simple_compute_pipeline(root_code, 1)
+            .unwrap();
+        PrefixTreeCode {
+            reduce_pipeline,
+            scan_pipeline,
+            root_pipeline,
+        }
+    }
+}
+
+impl PrefixTreeStage {
+    unsafe fn new(runner: &mut Runner, n_elements: u64) -> PrefixTreeStage {
+        let mut size = n_elements;
+        let mut sizes = vec![size];
+        let mut tmp_bufs = Vec::new();
+        while size > ELEMENTS_PER_WG {
+            size = (size + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
+            sizes.push(size);
+            let buf = runner
+                .session
+                .create_buffer(4 * size, BufferUsage::STORAGE)
+                .unwrap();
+            tmp_bufs.push(buf);
+        }
+        PrefixTreeStage { sizes, tmp_bufs }
+    }
+
+    unsafe fn bind(
+        &self,
+        runner: &mut Runner,
+        code: &PrefixTreeCode,
+        data_buf: &Buffer,
+    ) -> PrefixTreeBinding {
+        let mut descriptor_sets = Vec::with_capacity(2 * self.tmp_bufs.len() + 1);
+        for i in 0..self.tmp_bufs.len() {
+            let buf0 = if i == 0 {
+                data_buf
+            } else {
+                &self.tmp_bufs[i - 1]
+            };
+            let buf1 = &self.tmp_bufs[i];
+            let descriptor_set = runner
+                .session
+                .create_simple_descriptor_set(&code.reduce_pipeline, &[buf0, buf1])
+                .unwrap();
+            descriptor_sets.push(descriptor_set);
+        }
+        let buf0 = self.tmp_bufs.last().unwrap_or(data_buf);
+        let descriptor_set = runner
+            .session
+            .create_simple_descriptor_set(&code.root_pipeline, &[buf0])
+            .unwrap();
+        descriptor_sets.push(descriptor_set);
+        for i in (0..self.tmp_bufs.len()).rev() {
+            let buf0 = if i == 0 {
+                data_buf
+            } else {
+                &self.tmp_bufs[i - 1]
+            };
+            let buf1 = &self.tmp_bufs[i];
+            let descriptor_set = runner
+                .session
+                .create_simple_descriptor_set(&code.scan_pipeline, &[buf0, buf1])
+                .unwrap();
+            descriptor_sets.push(descriptor_set);
+        }
+        PrefixTreeBinding { descriptor_sets }
+    }
+
+    unsafe fn record(
+        &self,
+        commands: &mut Commands,
+        code: &PrefixTreeCode,
+        bindings: &PrefixTreeBinding,
+    ) {
+        let n = self.tmp_bufs.len();
+        for i in 0..n {
+            let n_workgroups = self.sizes[i + 1];
+            commands.cmd_buf.dispatch(
+                &code.reduce_pipeline,
+                &bindings.descriptor_sets[i],
+                (n_workgroups as u32, 1, 1),
+                (WG_SIZE as u32, 1, 1),
+            );
+            commands.cmd_buf.memory_barrier();
+        }
+        commands.cmd_buf.dispatch(
+            &code.root_pipeline,
+            &bindings.descriptor_sets[n],
+            (1, 1, 1),
+            (WG_SIZE as u32, 1, 1),
+        );
+        for i in (0..n).rev() {
+            commands.cmd_buf.memory_barrier();
+            let n_workgroups = self.sizes[i + 1];
+            commands.cmd_buf.dispatch(
+                &code.scan_pipeline,
+                &bindings.descriptor_sets[2 * n - i],
+                (n_workgroups as u32, 1, 1),
+                (WG_SIZE as u32, 1, 1),
+            );
+        }
+    }
+}
+
+// Verify that the data is OEIS A000217
+fn verify(data: &[u32]) -> Option<usize> {
+    data.iter()
+        .enumerate()
+        .position(|(i, val)| ((i * (i + 1)) / 2) as u32 != *val)
+}
diff --git a/tests/src/runner.rs b/tests/src/runner.rs
index be42b30..ce89961 100644
--- a/tests/src/runner.rs
+++ b/tests/src/runner.rs
@@ -102,9 +102,14 @@ impl Runner {
             .session
             .create_buffer(size, BufferUsage::MAP_READ | BufferUsage::COPY_DST)
             .unwrap();
+        // Note: the COPY_DST isn't needed in all use cases, but I don't think
+        // making this tighter would help.
         let dev_buf = self
             .session
-            .create_buffer(size, BufferUsage::COPY_SRC | BufferUsage::STORAGE)
+            .create_buffer(
+                size,
+                BufferUsage::COPY_SRC | BufferUsage::COPY_DST | BufferUsage::STORAGE,
+            )
             .unwrap();
         BufDown { stage_buf, dev_buf }
     }