mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-25 18:56:35 +11:00
Add tree reduction prefix sum test
Do a tree reduction in addition to the existing decoupled look-back, to explore the tradeoff between performance and compatibility.
This commit is contained in:
parent
33d7b25a92
commit
4ed339d434
6 changed files with 293 additions and 4 deletions
|
@ -6,7 +6,7 @@ glslang_validator = glslangValidator
|
||||||
spirv_cross = spirv-cross
|
spirv_cross = spirv-cross
|
||||||
|
|
||||||
rule glsl
|
rule glsl
|
||||||
command = $glslang_validator -V -o $out $in
|
command = $glslang_validator $flags -V -o $out $in
|
||||||
|
|
||||||
rule hlsl
|
rule hlsl
|
||||||
command = $spirv_cross --hlsl $in --output $out
|
command = $spirv_cross --hlsl $in --output $out
|
||||||
|
@ -17,3 +17,16 @@ rule msl
|
||||||
build gen/prefix.spv: glsl prefix.comp
|
build gen/prefix.spv: glsl prefix.comp
|
||||||
build gen/prefix.hlsl: hlsl gen/prefix.spv
|
build gen/prefix.hlsl: hlsl gen/prefix.spv
|
||||||
build gen/prefix.msl: msl gen/prefix.spv
|
build gen/prefix.msl: msl gen/prefix.spv
|
||||||
|
|
||||||
|
build gen/prefix_reduce.spv: glsl prefix_reduce.comp
|
||||||
|
build gen/prefix_reduce.hlsl: hlsl gen/prefix_reduce.spv
|
||||||
|
build gen/prefix_reduce.msl: msl gen/prefix_reduce.spv
|
||||||
|
|
||||||
|
build gen/prefix_root.spv: glsl prefix_scan.comp
|
||||||
|
flags = -DROOT
|
||||||
|
build gen/prefix_root.hlsl: hlsl gen/prefix_root.spv
|
||||||
|
build gen/prefix_root.msl: msl gen/prefix_root.spv
|
||||||
|
|
||||||
|
build gen/prefix_scan.spv: glsl prefix_scan.comp
|
||||||
|
build gen/prefix_scan.hlsl: hlsl gen/prefix_scan.spv
|
||||||
|
build gen/prefix_scan.msl: msl gen/prefix_scan.spv
|
||||||
|
|
53
tests/shader/prefix_reduce.comp
Normal file
53
tests/shader/prefix_reduce.comp
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
||||||
|
|
||||||
|
// The reduction phase for prefix sum implemented as a tree reduction.
|
||||||
|
|
||||||
|
#version 450
|
||||||
|
|
||||||
|
#define N_ROWS 8
|
||||||
|
#define LG_WG_SIZE 9
|
||||||
|
#define WG_SIZE (1 << LG_WG_SIZE)
|
||||||
|
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
|
||||||
|
|
||||||
|
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
|
||||||
|
|
||||||
|
struct Monoid {
|
||||||
|
uint element;
|
||||||
|
};
|
||||||
|
|
||||||
|
layout(set = 0, binding = 0) readonly buffer InBuf {
|
||||||
|
Monoid[] inbuf;
|
||||||
|
};
|
||||||
|
|
||||||
|
layout(set = 0, binding = 1) buffer OutBuf {
|
||||||
|
Monoid[] outbuf;
|
||||||
|
};
|
||||||
|
|
||||||
|
shared Monoid sh_scratch[WG_SIZE];
|
||||||
|
|
||||||
|
Monoid combine_monoid(Monoid a, Monoid b) {
|
||||||
|
return Monoid(a.element + b.element);
|
||||||
|
}
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
uint ix = gl_GlobalInvocationID.x * N_ROWS;
|
||||||
|
// TODO: gate buffer read
|
||||||
|
Monoid agg = inbuf[ix];
|
||||||
|
for (uint i = 1; i < N_ROWS; i++) {
|
||||||
|
agg = combine_monoid(agg, inbuf[ix + i]);
|
||||||
|
}
|
||||||
|
sh_scratch[gl_LocalInvocationID.x] = agg;
|
||||||
|
for (uint i = 0; i < LG_WG_SIZE; i++) {
|
||||||
|
barrier();
|
||||||
|
// We could make this predicate tighter, but would it help?
|
||||||
|
if (gl_LocalInvocationID.x + (1 << i) < WG_SIZE) {
|
||||||
|
Monoid other = sh_scratch[gl_LocalInvocationID.x + (1 << i)];
|
||||||
|
agg = combine_monoid(agg, other);
|
||||||
|
}
|
||||||
|
barrier();
|
||||||
|
sh_scratch[gl_LocalInvocationID.x] = agg;
|
||||||
|
}
|
||||||
|
if (gl_LocalInvocationID.x == 0) {
|
||||||
|
outbuf[gl_WorkGroupID.x] = agg;
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,6 +17,7 @@
|
||||||
//! Tests for piet-gpu shaders and GPU capabilities.
|
//! Tests for piet-gpu shaders and GPU capabilities.
|
||||||
|
|
||||||
mod prefix;
|
mod prefix;
|
||||||
|
mod prefix_tree;
|
||||||
mod runner;
|
mod runner;
|
||||||
|
|
||||||
use runner::Runner;
|
use runner::Runner;
|
||||||
|
@ -25,5 +26,6 @@ fn main() {
|
||||||
unsafe {
|
unsafe {
|
||||||
let mut runner = Runner::new();
|
let mut runner = Runner::new();
|
||||||
prefix::run_prefix_test(&mut runner);
|
prefix::run_prefix_test(&mut runner);
|
||||||
|
prefix_tree::run_prefix_test(&mut runner);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -59,7 +59,7 @@ pub unsafe fn run_prefix_test(runner: &mut Runner) {
|
||||||
let stage = PrefixStage::new(runner, n_elements);
|
let stage = PrefixStage::new(runner, n_elements);
|
||||||
let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf);
|
let binding = stage.bind(runner, &code, &data_buf, &out_buf.dev_buf);
|
||||||
// Also will be configurable of course.
|
// Also will be configurable of course.
|
||||||
let n_iter = 5000;
|
let n_iter = 1000;
|
||||||
let mut total_elapsed = 0.0;
|
let mut total_elapsed = 0.0;
|
||||||
for i in 0..n_iter {
|
for i in 0..n_iter {
|
||||||
let mut commands = runner.commands();
|
let mut commands = runner.commands();
|
||||||
|
@ -110,7 +110,13 @@ impl PrefixStage {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe fn bind(&self, runner: &mut Runner, code: &PrefixCode, in_buf: &Buffer, out_buf: &Buffer) -> PrefixBinding {
|
unsafe fn bind(
|
||||||
|
&self,
|
||||||
|
runner: &mut Runner,
|
||||||
|
code: &PrefixCode,
|
||||||
|
in_buf: &Buffer,
|
||||||
|
out_buf: &Buffer,
|
||||||
|
) -> PrefixBinding {
|
||||||
let descriptor_set = runner
|
let descriptor_set = runner
|
||||||
.session
|
.session
|
||||||
.create_simple_descriptor_set(&code.pipeline, &[in_buf, out_buf, &self.state_buf])
|
.create_simple_descriptor_set(&code.pipeline, &[in_buf, out_buf, &self.state_buf])
|
||||||
|
|
210
tests/src/prefix_tree.rs
Normal file
210
tests/src/prefix_tree.rs
Normal file
|
@ -0,0 +1,210 @@
|
||||||
|
// Copyright 2021 The piet-gpu authors.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// https://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
//
|
||||||
|
// Also licensed under MIT license, at your choice.
|
||||||
|
|
||||||
|
use piet_gpu_hal::{include_shader, BufferUsage, DescriptorSet};
|
||||||
|
use piet_gpu_hal::{Buffer, Pipeline};
|
||||||
|
|
||||||
|
use crate::runner::{Commands, Runner};
|
||||||
|
|
||||||
|
const WG_SIZE: u64 = 512;
|
||||||
|
const N_ROWS: u64 = 8;
|
||||||
|
const ELEMENTS_PER_WG: u64 = WG_SIZE * N_ROWS;
|
||||||
|
|
||||||
|
struct PrefixTreeCode {
|
||||||
|
reduce_pipeline: Pipeline,
|
||||||
|
scan_pipeline: Pipeline,
|
||||||
|
root_pipeline: Pipeline,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct PrefixTreeStage {
|
||||||
|
sizes: Vec<u64>,
|
||||||
|
tmp_bufs: Vec<Buffer>,
|
||||||
|
}
|
||||||
|
|
||||||
|
struct PrefixTreeBinding {
|
||||||
|
// All but the first and last can be moved to stage.
|
||||||
|
descriptor_sets: Vec<DescriptorSet>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub unsafe fn run_prefix_test(runner: &mut Runner) {
|
||||||
|
// This will be configurable. Note though that the current code is
|
||||||
|
// prone to reading and writing past the end of buffers if this is
|
||||||
|
// not a power of the number of elements processed in a workgroup.
|
||||||
|
let n_elements: u64 = 1 << 24;
|
||||||
|
let data: Vec<u32> = (0..n_elements as u32).collect();
|
||||||
|
let data_buf = runner
|
||||||
|
.session
|
||||||
|
.create_buffer_init(&data, BufferUsage::STORAGE)
|
||||||
|
.unwrap();
|
||||||
|
let out_buf = runner.buf_down(data_buf.size());
|
||||||
|
let code = PrefixTreeCode::new(runner);
|
||||||
|
let stage = PrefixTreeStage::new(runner, n_elements);
|
||||||
|
let binding = stage.bind(runner, &code, &out_buf.dev_buf);
|
||||||
|
// Also will be configurable of course.
|
||||||
|
let n_iter = 1000;
|
||||||
|
let mut total_elapsed = 0.0;
|
||||||
|
for i in 0..n_iter {
|
||||||
|
let mut commands = runner.commands();
|
||||||
|
commands.cmd_buf.copy_buffer(&data_buf, &out_buf.dev_buf);
|
||||||
|
commands.cmd_buf.memory_barrier();
|
||||||
|
commands.write_timestamp(0);
|
||||||
|
stage.record(&mut commands, &code, &binding);
|
||||||
|
commands.write_timestamp(1);
|
||||||
|
if i == 0 {
|
||||||
|
commands.cmd_buf.memory_barrier();
|
||||||
|
commands.download(&out_buf);
|
||||||
|
}
|
||||||
|
total_elapsed += runner.submit(commands);
|
||||||
|
if i == 0 {
|
||||||
|
let mut dst: Vec<u32> = Default::default();
|
||||||
|
out_buf.read(&mut dst);
|
||||||
|
println!("failures: {:?}", verify(&dst));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let throughput = (n_elements * n_iter) as f64 / total_elapsed;
|
||||||
|
println!(
|
||||||
|
"total {:?}ms, throughput = {}G el/s",
|
||||||
|
total_elapsed * 1e3,
|
||||||
|
throughput * 1e-9
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PrefixTreeCode {
|
||||||
|
unsafe fn new(runner: &mut Runner) -> PrefixTreeCode {
|
||||||
|
let reduce_code = include_shader!(&runner.session, "../shader/gen/prefix_reduce");
|
||||||
|
let reduce_pipeline = runner
|
||||||
|
.session
|
||||||
|
.create_simple_compute_pipeline(reduce_code, 2)
|
||||||
|
.unwrap();
|
||||||
|
let scan_code = include_shader!(&runner.session, "../shader/gen/prefix_scan");
|
||||||
|
let scan_pipeline = runner
|
||||||
|
.session
|
||||||
|
.create_simple_compute_pipeline(scan_code, 2)
|
||||||
|
.unwrap();
|
||||||
|
let root_code = include_shader!(&runner.session, "../shader/gen/prefix_root");
|
||||||
|
let root_pipeline = runner
|
||||||
|
.session
|
||||||
|
.create_simple_compute_pipeline(root_code, 1)
|
||||||
|
.unwrap();
|
||||||
|
PrefixTreeCode {
|
||||||
|
reduce_pipeline,
|
||||||
|
scan_pipeline,
|
||||||
|
root_pipeline,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PrefixTreeStage {
|
||||||
|
unsafe fn new(runner: &mut Runner, n_elements: u64) -> PrefixTreeStage {
|
||||||
|
let mut size = n_elements;
|
||||||
|
let mut sizes = vec![size];
|
||||||
|
let mut tmp_bufs = Vec::new();
|
||||||
|
while size > ELEMENTS_PER_WG {
|
||||||
|
size = (size + ELEMENTS_PER_WG - 1) / ELEMENTS_PER_WG;
|
||||||
|
sizes.push(size);
|
||||||
|
let buf = runner
|
||||||
|
.session
|
||||||
|
.create_buffer(4 * size, BufferUsage::STORAGE)
|
||||||
|
.unwrap();
|
||||||
|
tmp_bufs.push(buf);
|
||||||
|
}
|
||||||
|
PrefixTreeStage { sizes, tmp_bufs }
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe fn bind(
|
||||||
|
&self,
|
||||||
|
runner: &mut Runner,
|
||||||
|
code: &PrefixTreeCode,
|
||||||
|
data_buf: &Buffer,
|
||||||
|
) -> PrefixTreeBinding {
|
||||||
|
let mut descriptor_sets = Vec::with_capacity(2 * self.tmp_bufs.len() + 1);
|
||||||
|
for i in 0..self.tmp_bufs.len() {
|
||||||
|
let buf0 = if i == 0 {
|
||||||
|
data_buf
|
||||||
|
} else {
|
||||||
|
&self.tmp_bufs[i - 1]
|
||||||
|
};
|
||||||
|
let buf1 = &self.tmp_bufs[i];
|
||||||
|
let descriptor_set = runner
|
||||||
|
.session
|
||||||
|
.create_simple_descriptor_set(&code.reduce_pipeline, &[buf0, buf1])
|
||||||
|
.unwrap();
|
||||||
|
descriptor_sets.push(descriptor_set);
|
||||||
|
}
|
||||||
|
let buf0 = self.tmp_bufs.last().unwrap_or(data_buf);
|
||||||
|
let descriptor_set = runner
|
||||||
|
.session
|
||||||
|
.create_simple_descriptor_set(&code.root_pipeline, &[buf0])
|
||||||
|
.unwrap();
|
||||||
|
descriptor_sets.push(descriptor_set);
|
||||||
|
for i in (0..self.tmp_bufs.len()).rev() {
|
||||||
|
let buf0 = if i == 0 {
|
||||||
|
data_buf
|
||||||
|
} else {
|
||||||
|
&self.tmp_bufs[i - 1]
|
||||||
|
};
|
||||||
|
let buf1 = &self.tmp_bufs[i];
|
||||||
|
let descriptor_set = runner
|
||||||
|
.session
|
||||||
|
.create_simple_descriptor_set(&code.scan_pipeline, &[buf0, buf1])
|
||||||
|
.unwrap();
|
||||||
|
descriptor_sets.push(descriptor_set);
|
||||||
|
}
|
||||||
|
PrefixTreeBinding { descriptor_sets }
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe fn record(
|
||||||
|
&self,
|
||||||
|
commands: &mut Commands,
|
||||||
|
code: &PrefixTreeCode,
|
||||||
|
bindings: &PrefixTreeBinding,
|
||||||
|
) {
|
||||||
|
let n = self.tmp_bufs.len();
|
||||||
|
for i in 0..n {
|
||||||
|
let n_workgroups = self.sizes[i + 1];
|
||||||
|
commands.cmd_buf.dispatch(
|
||||||
|
&code.reduce_pipeline,
|
||||||
|
&bindings.descriptor_sets[i],
|
||||||
|
(n_workgroups as u32, 1, 1),
|
||||||
|
(WG_SIZE as u32, 1, 1),
|
||||||
|
);
|
||||||
|
commands.cmd_buf.memory_barrier();
|
||||||
|
}
|
||||||
|
commands.cmd_buf.dispatch(
|
||||||
|
&code.root_pipeline,
|
||||||
|
&bindings.descriptor_sets[n],
|
||||||
|
(1, 1, 1),
|
||||||
|
(WG_SIZE as u32, 1, 1),
|
||||||
|
);
|
||||||
|
for i in (0..n).rev() {
|
||||||
|
commands.cmd_buf.memory_barrier();
|
||||||
|
let n_workgroups = self.sizes[i + 1];
|
||||||
|
commands.cmd_buf.dispatch(
|
||||||
|
&code.scan_pipeline,
|
||||||
|
&bindings.descriptor_sets[2 * n - i],
|
||||||
|
(n_workgroups as u32, 1, 1),
|
||||||
|
(WG_SIZE as u32, 1, 1),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that the data is OEIS A000217
|
||||||
|
fn verify(data: &[u32]) -> Option<usize> {
|
||||||
|
data.iter()
|
||||||
|
.enumerate()
|
||||||
|
.position(|(i, val)| ((i * (i + 1)) / 2) as u32 != *val)
|
||||||
|
}
|
|
@ -102,9 +102,14 @@ impl Runner {
|
||||||
.session
|
.session
|
||||||
.create_buffer(size, BufferUsage::MAP_READ | BufferUsage::COPY_DST)
|
.create_buffer(size, BufferUsage::MAP_READ | BufferUsage::COPY_DST)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
// Note: the COPY_DST isn't needed in all use cases, but I don't think
|
||||||
|
// making this tighter would help.
|
||||||
let dev_buf = self
|
let dev_buf = self
|
||||||
.session
|
.session
|
||||||
.create_buffer(size, BufferUsage::COPY_SRC | BufferUsage::STORAGE)
|
.create_buffer(
|
||||||
|
size,
|
||||||
|
BufferUsage::COPY_SRC | BufferUsage::COPY_DST | BufferUsage::STORAGE,
|
||||||
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
BufDown { stage_buf, dev_buf }
|
BufDown { stage_buf, dev_buf }
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue