From 7d7c86c44b9562856d2c6bf646800cee2feec774 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Fri, 28 May 2021 15:17:36 -0700 Subject: [PATCH] API changes and cleanup Add workgroup size to dispatch call (needed by metal). Change all fence references to mutable for consistency. Move backend traits to a separate file (move them out of the toplevel namespace in preparation for the hub types going there, to make the public API nicer). Add a method and macro for automatically choosing shader code, and change collatz example to generate all 3 kinds on build. --- piet-gpu-hal/examples/collatz.rs | 7 +- piet-gpu-hal/examples/dx12_toy.rs | 5 +- piet-gpu-hal/examples/metal_toy.rs | 25 -- piet-gpu-hal/examples/shader/build.ninja | 13 +- piet-gpu-hal/examples/shader/gen/collatz.hlsl | 62 ++++ piet-gpu-hal/examples/shader/gen/collatz.msl | 48 ++++ .../examples/shader/{ => gen}/collatz.spv | Bin 1616 -> 1616 bytes piet-gpu-hal/src/backend.rs | 265 ++++++++++++++++++ piet-gpu-hal/src/dx12.rs | 16 +- piet-gpu-hal/src/hub.rs | 7 +- piet-gpu-hal/src/lib.rs | 252 +---------------- piet-gpu-hal/src/macros.rs | 18 +- piet-gpu-hal/src/metal.rs | 70 ++--- piet-gpu-hal/src/mux.rs | 42 ++- piet-gpu-hal/src/vulkan.rs | 33 ++- piet-gpu/bin/android.rs | 2 +- piet-gpu/src/lib.rs | 7 + 17 files changed, 524 insertions(+), 348 deletions(-) delete mode 100644 piet-gpu-hal/examples/metal_toy.rs create mode 100644 piet-gpu-hal/examples/shader/gen/collatz.hlsl create mode 100644 piet-gpu-hal/examples/shader/gen/collatz.msl rename piet-gpu-hal/examples/shader/{ => gen}/collatz.spv (91%) create mode 100644 piet-gpu-hal/src/backend.rs diff --git a/piet-gpu-hal/examples/collatz.rs b/piet-gpu-hal/examples/collatz.rs index 4220148..d31bb4c 100644 --- a/piet-gpu-hal/examples/collatz.rs +++ b/piet-gpu-hal/examples/collatz.rs @@ -1,6 +1,7 @@ use piet_gpu_hal::hub; -use piet_gpu_hal::mux::{Instance, ShaderCode}; +use piet_gpu_hal::mux::Instance; use piet_gpu_hal::BufferUsage; +use piet_gpu_hal::include_shader; fn main() { let (instance, _) = Instance::new(None).unwrap(); @@ -10,7 +11,7 @@ fn main() { let usage = BufferUsage::MAP_READ | BufferUsage::STORAGE; let src = (0..256).map(|x| x + 1).collect::>(); let buffer = session.create_buffer_init(&src, usage).unwrap(); - let code = ShaderCode::Msl(include_str!("./shader/collatz.msl")); + let code = include_shader!(&session, "./shader/gen/collatz"); let pipeline = session.create_simple_compute_pipeline(code, 1).unwrap(); let descriptor_set = session .create_simple_descriptor_set(&pipeline, &[&buffer]) @@ -20,7 +21,7 @@ fn main() { cmd_buf.begin(); cmd_buf.reset_query_pool(&query_pool); cmd_buf.write_timestamp(&query_pool, 0); - cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1)); + cmd_buf.dispatch(&pipeline, &descriptor_set, (256, 1, 1), (1, 1, 1)); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.host_barrier(); cmd_buf.finish(); diff --git a/piet-gpu-hal/examples/dx12_toy.rs b/piet-gpu-hal/examples/dx12_toy.rs index 2ad9dfe..d8df466 100644 --- a/piet-gpu-hal/examples/dx12_toy.rs +++ b/piet-gpu-hal/examples/dx12_toy.rs @@ -2,7 +2,8 @@ //! This will probably go away when it's fully implemented and we can //! just use the hub. -use piet_gpu_hal::{dx12, BufferUsage, CmdBuf, Device, Error}; +use piet_gpu_hal::{dx12, BufferUsage, Error}; +use piet_gpu_hal::backend::{CmdBuf, Device}; const SHADER_CODE: &str = r#"RWByteAddressBuffer _53 : register(u0, space0); @@ -78,7 +79,7 @@ fn toy() -> Result<(), Error> { cmd_buf.copy_buffer(&buf, &dev_buf); cmd_buf.memory_barrier(); cmd_buf.write_timestamp(&query_pool, 0); - cmd_buf.dispatch(&pipeline, &ds, (1, 1, 1)); + cmd_buf.dispatch(&pipeline, &ds, (1, 1, 1), (256, 1, 1)); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); cmd_buf.copy_buffer(&dev_buf, &buf); diff --git a/piet-gpu-hal/examples/metal_toy.rs b/piet-gpu-hal/examples/metal_toy.rs deleted file mode 100644 index a20a5af..0000000 --- a/piet-gpu-hal/examples/metal_toy.rs +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2021 The piet-gpu authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// Also licensed under MIT license, at your choice. - -//! An example to exercise the Metal backend. Once that becomes -//! functional, this file will go away. - -use piet_gpu_hal::metal; - -fn main() { - let instance = metal::MetalInstance; - println!("hello metal"); -} diff --git a/piet-gpu-hal/examples/shader/build.ninja b/piet-gpu-hal/examples/shader/build.ninja index 848637a..f1c6328 100644 --- a/piet-gpu-hal/examples/shader/build.ninja +++ b/piet-gpu-hal/examples/shader/build.ninja @@ -1,10 +1,19 @@ # Build file for shaders. -# You must have glslangValidator in your path, or patch here. +# You must have Vulkan tools in your path, or patch here. glslang_validator = glslangValidator +spirv_cross = spirv-cross rule glsl command = $glslang_validator -V -o $out $in -build collatz.spv: glsl collatz.comp +rule hlsl + command = $spirv_cross --hlsl $in --output $out + +rule msl + command = $spirv_cross --msl $in --output $out + +build gen/collatz.spv: glsl collatz.comp +build gen/collatz.hlsl: hlsl gen/collatz.spv +build gen/collatz.msl: msl gen/collatz.spv diff --git a/piet-gpu-hal/examples/shader/gen/collatz.hlsl b/piet-gpu-hal/examples/shader/gen/collatz.hlsl new file mode 100644 index 0000000..d5fcd56 --- /dev/null +++ b/piet-gpu-hal/examples/shader/gen/collatz.hlsl @@ -0,0 +1,62 @@ +static const uint3 gl_WorkGroupSize = uint3(1u, 1u, 1u); + +RWByteAddressBuffer _57 : register(u0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +float mod(float x, float y) +{ + return x - y * floor(x / y); +} + +float2 mod(float2 x, float2 y) +{ + return x - y * floor(x / y); +} + +float3 mod(float3 x, float3 y) +{ + return x - y * floor(x / y); +} + +float4 mod(float4 x, float4 y) +{ + return x - y * floor(x / y); +} + +uint collatz_iterations(inout uint n) +{ + uint i = 0u; + while (n != 1u) + { + if (mod(float(n), 2.0f) == 0.0f) + { + n /= 2u; + } + else + { + n = (3u * n) + 1u; + } + i++; + } + return i; +} + +void comp_main() +{ + uint index = gl_GlobalInvocationID.x; + uint param = _57.Load(index * 4 + 0); + uint _65 = collatz_iterations(param); + _57.Store(index * 4 + 0, _65); +} + +[numthreads(1, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu-hal/examples/shader/gen/collatz.msl b/piet-gpu-hal/examples/shader/gen/collatz.msl new file mode 100644 index 0000000..c2592c0 --- /dev/null +++ b/piet-gpu-hal/examples/shader/gen/collatz.msl @@ -0,0 +1,48 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct PrimeIndices +{ + uint indices[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(1u); + +// Implementation of the GLSL mod() function, which is slightly different than Metal fmod() +template +inline Tx mod(Tx x, Ty y) +{ + return x - y * floor(x / y); +} + +static inline __attribute__((always_inline)) +uint collatz_iterations(thread uint& n) +{ + uint i = 0u; + while (n != 1u) + { + if (mod(float(n), 2.0) == 0.0) + { + n /= 2u; + } + else + { + n = (3u * n) + 1u; + } + i++; + } + return i; +} + +kernel void main0(device PrimeIndices& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint index = gl_GlobalInvocationID.x; + uint param = _57.indices[index]; + uint _65 = collatz_iterations(param); + _57.indices[index] = _65; +} + diff --git a/piet-gpu-hal/examples/shader/collatz.spv b/piet-gpu-hal/examples/shader/gen/collatz.spv similarity index 91% rename from piet-gpu-hal/examples/shader/collatz.spv rename to piet-gpu-hal/examples/shader/gen/collatz.spv index 21e4e92c3929f253532ddcf6594980eaeee9c755..886797e6937b1918712237eaba6a9ff12daa1c67 100644 GIT binary patch delta 18 Zcmcb>bAgAGnMs+Qfq{{MYa^#S8vr9>0|Nj6 delta 18 Zcmcb>bAgAGnMs+Qfq{{MV; + type Fence; + type Semaphore; + type PipelineBuilder: PipelineBuilder; + type DescriptorSetBuilder: DescriptorSetBuilder; + type Sampler; + type ShaderSource: ?Sized; + + /// Query the GPU info. + /// + /// This method may be expensive, so the hub should call it once and retain + /// the info. + fn query_gpu_info(&self) -> GpuInfo; + + fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result; + + /// Destroy a buffer. + /// + /// The same safety requirements hold as in Vulkan: the buffer cannot be used + /// after this call, and all commands referencing this buffer must have completed. + /// + /// Maybe doesn't need result return? + unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error>; + + unsafe fn create_image2d(&self, width: u32, height: u32) -> Result; + + /// Destroy an image. + /// + /// The same safety requirements hold as in Vulkan: the image cannot be used + /// after this call, and all commands referencing this image must have completed. + /// + /// Use this only with images we created, not for swapchain images. + /// + /// Maybe doesn't need result return? + unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error>; + + /// Start building a pipeline. + /// + /// A pipeline is a bit of shader IR plus a signature for what kinds of resources + /// it expects. + unsafe fn pipeline_builder(&self) -> Self::PipelineBuilder; + + /// Start building a descriptor set. + /// + /// A descriptor set is a binding of resources for a given pipeline. + unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder; + + /// Create a simple compute pipeline that operates on buffers and storage images. + /// + /// This is provided as a convenience but will probably go away, as the functionality + /// is subsumed by the builder. + unsafe fn create_simple_compute_pipeline( + &self, + code: &Self::ShaderSource, + n_buffers: u32, + n_images: u32, + ) -> Result { + let mut builder = self.pipeline_builder(); + builder.add_buffers(n_buffers); + builder.add_images(n_images); + builder.create_compute_pipeline(self, code) + } + + /// Create a descriptor set for a given pipeline, binding buffers and images. + /// + /// This is provided as a convenience but will probably go away, as the functionality + /// is subsumed by the builder. + unsafe fn create_descriptor_set( + &self, + pipeline: &Self::Pipeline, + bufs: &[&Self::Buffer], + images: &[&Self::Image], + ) -> Result { + let mut builder = self.descriptor_set_builder(); + builder.add_buffers(bufs); + builder.add_images(images); + builder.build(self, pipeline) + } + + fn create_cmd_buf(&self) -> Result; + + fn create_query_pool(&self, n_queries: u32) -> Result; + + /// Get results from query pool, destroying it in the process. + /// + /// The returned vector is one less than the number of queries; the first is used as + /// a baseline. + /// + /// # Safety + /// All submitted commands that refer to this query pool must have completed. + unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result, Error>; + + unsafe fn run_cmd_bufs( + &self, + cmd_buf: &[&Self::CmdBuf], + wait_semaphores: &[&Self::Semaphore], + signal_semaphores: &[&Self::Semaphore], + fence: Option<&mut Self::Fence>, + ) -> Result<(), Error>; + + /// Copy data from the buffer to memory. + /// + /// Discussion question: add offset? + /// + /// # Safety + /// + /// The buffer must be valid to access. The destination memory must be valid to + /// write to. The ranges must not overlap. The offset + size must be within + /// the buffer's allocation, and size within the destination. + unsafe fn read_buffer( + &self, + buffer: &Self::Buffer, + dst: *mut u8, + offset: u64, + size: u64, + ) -> Result<(), Error>; + + /// Copy data from memory to the buffer. + /// + /// # Safety + /// + /// The buffer must be valid to access. The source memory must be valid to + /// read from. The ranges must not overlap. The offset + size must be within + /// the buffer's allocation, and size within the source. + unsafe fn write_buffer( + &self, + buffer: &Self::Buffer, + contents: *const u8, + offset: u64, + size: u64, + ) -> Result<(), Error>; + + unsafe fn create_semaphore(&self) -> Result; + unsafe fn create_fence(&self, signaled: bool) -> Result; + unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error>; + unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result; + + unsafe fn create_sampler(&self, params: SamplerParams) -> Result; +} + +pub trait CmdBuf { + unsafe fn begin(&mut self); + + unsafe fn finish(&mut self); + + unsafe fn dispatch( + &mut self, + pipeline: &D::Pipeline, + descriptor_set: &D::DescriptorSet, + workgroup_count: (u32, u32, u32), + workgroup_size: (u32, u32, u32), + ); + + /// Insert an execution and memory barrier. + /// + /// Compute kernels (and other actions) after this barrier may read from buffers + /// that were written before this barrier. + unsafe fn memory_barrier(&mut self); + + /// Insert a barrier for host access to buffers. + /// + /// The host may read buffers written before this barrier, after the fence for + /// the command buffer is signaled. + /// + /// See http://themaister.net/blog/2019/08/14/yet-another-blog-explaining-vulkan-synchronization/ + /// ("Host memory reads") for an explanation of this barrier. + unsafe fn host_barrier(&mut self); + + unsafe fn image_barrier( + &mut self, + image: &D::Image, + src_layout: ImageLayout, + dst_layout: ImageLayout, + ); + + /// Clear the buffer. + /// + /// This is readily supported in Vulkan, but for portability it is remarkably + /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute + /// kernel, or organize the code not to need it. + unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option); + + unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer); + + unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer); + + unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image); + + // low portability, dx12 doesn't support it natively + unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image); + + /// Reset the query pool. + /// + /// The query pool must be reset before each use, to avoid validation errors. + /// This is annoying, and we could tweak the API to make it implicit, doing + /// the reset before the first timestamp write. + unsafe fn reset_query_pool(&mut self, pool: &D::QueryPool); + + unsafe fn write_timestamp(&mut self, pool: &D::QueryPool, query: u32); + + /// Prepare the timestamps for reading. This isn't required on Vulkan but + /// is required on (at least) DX12. + unsafe fn finish_timestamps(&mut self, _pool: &D::QueryPool) {} +} + +/// A builder for pipelines with more complex layouts. +pub trait PipelineBuilder { + /// Add buffers to the pipeline. Each has its own binding. + fn add_buffers(&mut self, n_buffers: u32); + /// Add storage images to the pipeline. Each has its own binding. + fn add_images(&mut self, n_images: u32); + /// Add a binding with a variable-size array of textures. + fn add_textures(&mut self, max_textures: u32); + unsafe fn create_compute_pipeline( + self, + device: &D, + code: &D::ShaderSource, + ) -> Result; +} + +/// A builder for descriptor sets with more complex layouts. +/// +/// Note: the order needs to match the pipeline building, and it also needs to +/// be buffers, then images, then textures. +pub trait DescriptorSetBuilder { + fn add_buffers(&mut self, buffers: &[&D::Buffer]); + /// Add an array of storage images. + /// + /// The images need to be in `ImageLayout::General` layout. + fn add_images(&mut self, images: &[&D::Image]); + /// Add an array of textures. + /// + /// The images need to be in `ImageLayout::ShaderRead` layout. + /// + /// The same sampler is used for all textures, which is not very sophisticated; + /// we should have a way to vary the sampler. + fn add_textures(&mut self, images: &[&D::Image]); + unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result; +} diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs index cf9a0e3..67b751d 100644 --- a/piet-gpu-hal/src/dx12.rs +++ b/piet-gpu-hal/src/dx12.rs @@ -224,7 +224,7 @@ impl Dx12Instance { } } -impl crate::Device for Dx12Device { +impl crate::backend::Device for Dx12Device { type Buffer = Buffer; type Image = Image; @@ -413,7 +413,7 @@ impl crate::Device for Dx12Device { Ok(()) } - unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result { + unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result { let fence_val = fence.fence.get_value(); Ok(fence_val == fence.val.get()) } @@ -451,7 +451,7 @@ impl Dx12Device { } } -impl crate::CmdBuf for CmdBuf { +impl crate::backend::CmdBuf for CmdBuf { unsafe fn begin(&mut self) {} unsafe fn finish(&mut self) { @@ -468,7 +468,8 @@ impl crate::CmdBuf for CmdBuf { &mut self, pipeline: &Pipeline, descriptor_set: &DescriptorSet, - size: (u32, u32, u32), + workgroup_count: (u32, u32, u32), + _workgroup_size: (u32, u32, u32), ) { self.c.set_pipeline_state(&pipeline.pipeline_state); self.c @@ -478,7 +479,8 @@ impl crate::CmdBuf for CmdBuf { 0, descriptor_set.0.get_gpu_descriptor_handle_at_offset(0), ); - self.c.dispatch(size.0, size.1, size.2); + self.c + .dispatch(workgroup_count.0, workgroup_count.1, workgroup_count.2); } unsafe fn memory_barrier(&mut self) { @@ -554,7 +556,7 @@ impl crate::CmdBuf for CmdBuf { } } -impl crate::PipelineBuilder for PipelineBuilder { +impl crate::backend::PipelineBuilder for PipelineBuilder { fn add_buffers(&mut self, n_buffers: u32) { if n_buffers != 0 { self.ranges.push(d3d12::D3D12_DESCRIPTOR_RANGE { @@ -630,7 +632,7 @@ impl crate::PipelineBuilder for PipelineBuilder { } } -impl crate::DescriptorSetBuilder for DescriptorSetBuilder { +impl crate::backend::DescriptorSetBuilder for DescriptorSetBuilder { fn add_buffers(&mut self, buffers: &[&Buffer]) { // Note: we could get rid of the clone here (which is an AddRef) // and store a raw pointer, as it's a safety precondition that diff --git a/piet-gpu-hal/src/hub.rs b/piet-gpu-hal/src/hub.rs index a53ef2f..8822b8a 100644 --- a/piet-gpu-hal/src/hub.rs +++ b/piet-gpu-hal/src/hub.rs @@ -123,7 +123,7 @@ impl Session { unsafe { let mut i = 0; while i < pending.len() { - if let Ok(true) = self.0.device.get_fence_status(&pending[i].fence) { + if let Ok(true) = self.0.device.get_fence_status(&mut pending[i].fence) { let mut item = pending.swap_remove(i); // TODO: wait is superfluous, can just reset let _ = self.0.device.wait_and_reset(vec![&mut item.fence]); @@ -295,6 +295,11 @@ impl Session { pub fn gpu_info(&self) -> &GpuInfo { &self.0.gpu_info } + + /// Choose shader code from the available choices. + pub fn choose_shader<'a>(&self, spv: &'a [u8], hlsl: &'a str, msl: &'a str) -> ShaderCode<'a> { + self.0.device.choose_shader(spv, hlsl, msl) + } } impl CmdBuf { diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs index 6a66db6..b5bc8b5 100644 --- a/piet-gpu-hal/src/lib.rs +++ b/piet-gpu-hal/src/lib.rs @@ -4,6 +4,7 @@ /// In time, it may go away and be replaced by either gfx-hal or wgpu. use bitflags::bitflags; +pub mod backend; pub mod hub; #[macro_use] @@ -26,9 +27,13 @@ mux_cfg! { #[cfg(target_os = "macos")] pub mod metal; -/// This isn't great but is expedient. +/// The common error type for the crate. +/// +/// This keeps things imple and can be expanded later. pub type Error = Box; +pub use crate::backend::CmdBuf; + #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum ImageLayout { Undefined, @@ -92,248 +97,3 @@ pub struct SubgroupSize { min: u32, max: u32, } - -pub trait Device: Sized { - type Buffer: 'static; - type Image; - type Pipeline; - type DescriptorSet; - type QueryPool; - type CmdBuf: CmdBuf; - type Fence; - type Semaphore; - type PipelineBuilder: PipelineBuilder; - type DescriptorSetBuilder: DescriptorSetBuilder; - type Sampler; - type ShaderSource: ?Sized; - - /// Query the GPU info. - /// - /// This method may be expensive, so the hub should call it once and retain - /// the info. - fn query_gpu_info(&self) -> GpuInfo; - - fn create_buffer(&self, size: u64, usage: BufferUsage) -> Result; - - /// Destroy a buffer. - /// - /// The same safety requirements hold as in Vulkan: the buffer cannot be used - /// after this call, and all commands referencing this buffer must have completed. - /// - /// Maybe doesn't need result return? - unsafe fn destroy_buffer(&self, buffer: &Self::Buffer) -> Result<(), Error>; - - unsafe fn create_image2d(&self, width: u32, height: u32) -> Result; - - /// Destroy an image. - /// - /// The same safety requirements hold as in Vulkan: the image cannot be used - /// after this call, and all commands referencing this image must have completed. - /// - /// Use this only with images we created, not for swapchain images. - /// - /// Maybe doesn't need result return? - unsafe fn destroy_image(&self, image: &Self::Image) -> Result<(), Error>; - - /// Start building a pipeline. - /// - /// A pipeline is a bit of shader IR plus a signature for what kinds of resources - /// it expects. - unsafe fn pipeline_builder(&self) -> Self::PipelineBuilder; - - /// Start building a descriptor set. - /// - /// A descriptor set is a binding of resources for a given pipeline. - unsafe fn descriptor_set_builder(&self) -> Self::DescriptorSetBuilder; - - /// Create a simple compute pipeline that operates on buffers and storage images. - /// - /// This is provided as a convenience but will probably go away, as the functionality - /// is subsumed by the builder. - unsafe fn create_simple_compute_pipeline( - &self, - code: &Self::ShaderSource, - n_buffers: u32, - n_images: u32, - ) -> Result { - let mut builder = self.pipeline_builder(); - builder.add_buffers(n_buffers); - builder.add_images(n_images); - builder.create_compute_pipeline(self, code) - } - - /// Create a descriptor set for a given pipeline, binding buffers and images. - /// - /// This is provided as a convenience but will probably go away, as the functionality - /// is subsumed by the builder. - unsafe fn create_descriptor_set( - &self, - pipeline: &Self::Pipeline, - bufs: &[&Self::Buffer], - images: &[&Self::Image], - ) -> Result { - let mut builder = self.descriptor_set_builder(); - builder.add_buffers(bufs); - builder.add_images(images); - builder.build(self, pipeline) - } - - fn create_cmd_buf(&self) -> Result; - - fn create_query_pool(&self, n_queries: u32) -> Result; - - /// Get results from query pool, destroying it in the process. - /// - /// The returned vector is one less than the number of queries; the first is used as - /// a baseline. - /// - /// # Safety - /// All submitted commands that refer to this query pool must have completed. - unsafe fn fetch_query_pool(&self, pool: &Self::QueryPool) -> Result, Error>; - - unsafe fn run_cmd_bufs( - &self, - cmd_buf: &[&Self::CmdBuf], - wait_semaphores: &[&Self::Semaphore], - signal_semaphores: &[&Self::Semaphore], - fence: Option<&mut Self::Fence>, - ) -> Result<(), Error>; - - /// Copy data from the buffer to memory. - /// - /// Discussion question: add offset? - /// - /// # Safety - /// - /// The buffer must be valid to access. The destination memory must be valid to - /// write to. The ranges must not overlap. The offset + size must be within - /// the buffer's allocation, and size within the destination. - unsafe fn read_buffer( - &self, - buffer: &Self::Buffer, - dst: *mut u8, - offset: u64, - size: u64, - ) -> Result<(), Error>; - - /// Copy data from memory to the buffer. - /// - /// # Safety - /// - /// The buffer must be valid to access. The source memory must be valid to - /// read from. The ranges must not overlap. The offset + size must be within - /// the buffer's allocation, and size within the source. - unsafe fn write_buffer( - &self, - buffer: &Self::Buffer, - contents: *const u8, - offset: u64, - size: u64, - ) -> Result<(), Error>; - - unsafe fn create_semaphore(&self) -> Result; - unsafe fn create_fence(&self, signaled: bool) -> Result; - unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error>; - unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result; - - unsafe fn create_sampler(&self, params: SamplerParams) -> Result; -} - -pub trait CmdBuf { - unsafe fn begin(&mut self); - - unsafe fn finish(&mut self); - - unsafe fn dispatch( - &mut self, - pipeline: &D::Pipeline, - descriptor_set: &D::DescriptorSet, - size: (u32, u32, u32), - ); - - /// Insert an execution and memory barrier. - /// - /// Compute kernels (and other actions) after this barrier may read from buffers - /// that were written before this barrier. - unsafe fn memory_barrier(&mut self); - - /// Insert a barrier for host access to buffers. - /// - /// The host may read buffers written before this barrier, after the fence for - /// the command buffer is signaled. - /// - /// See http://themaister.net/blog/2019/08/14/yet-another-blog-explaining-vulkan-synchronization/ - /// ("Host memory reads") for an explanation of this barrier. - unsafe fn host_barrier(&mut self); - - unsafe fn image_barrier( - &mut self, - image: &D::Image, - src_layout: ImageLayout, - dst_layout: ImageLayout, - ); - - /// Clear the buffer. - /// - /// This is readily supported in Vulkan, but for portability it is remarkably - /// tricky (unimplemented in gfx-hal right now). Possibly best to write a compute - /// kernel, or organize the code not to need it. - unsafe fn clear_buffer(&self, buffer: &D::Buffer, size: Option); - - unsafe fn copy_buffer(&self, src: &D::Buffer, dst: &D::Buffer); - - unsafe fn copy_image_to_buffer(&self, src: &D::Image, dst: &D::Buffer); - - unsafe fn copy_buffer_to_image(&self, src: &D::Buffer, dst: &D::Image); - - // low portability, dx12 doesn't support it natively - unsafe fn blit_image(&self, src: &D::Image, dst: &D::Image); - - /// Reset the query pool. - /// - /// The query pool must be reset before each use, to avoid validation errors. - /// This is annoying, and we could tweak the API to make it implicit, doing - /// the reset before the first timestamp write. - unsafe fn reset_query_pool(&mut self, pool: &D::QueryPool); - - unsafe fn write_timestamp(&mut self, pool: &D::QueryPool, query: u32); - - /// Prepare the timestamps for reading. This isn't required on Vulkan but - /// is required on (at least) DX12. - unsafe fn finish_timestamps(&mut self, pool: &D::QueryPool) {} -} - -/// A builder for pipelines with more complex layouts. -pub trait PipelineBuilder { - /// Add buffers to the pipeline. Each has its own binding. - fn add_buffers(&mut self, n_buffers: u32); - /// Add storage images to the pipeline. Each has its own binding. - fn add_images(&mut self, n_images: u32); - /// Add a binding with a variable-size array of textures. - fn add_textures(&mut self, max_textures: u32); - unsafe fn create_compute_pipeline( - self, - device: &D, - code: &D::ShaderSource, - ) -> Result; -} - -/// A builder for descriptor sets with more complex layouts. -/// -/// Note: the order needs to match the pipeline building, and it also needs to -/// be buffers, then images, then textures. -pub trait DescriptorSetBuilder { - fn add_buffers(&mut self, buffers: &[&D::Buffer]); - /// Add an array of storage images. - /// - /// The images need to be in `ImageLayout::General` layout. - fn add_images(&mut self, images: &[&D::Image]); - /// Add an array of textures. - /// - /// The images need to be in `ImageLayout::ShaderRead` layout. - /// - /// The same sampler is used for all textures, which is not very sophisticated; - /// we should have a way to vary the sampler. - fn add_textures(&mut self, images: &[&D::Image]); - unsafe fn build(self, device: &D, pipeline: &D::Pipeline) -> Result; -} diff --git a/piet-gpu-hal/src/macros.rs b/piet-gpu-hal/src/macros.rs index 1810a3c..3456342 100644 --- a/piet-gpu-hal/src/macros.rs +++ b/piet-gpu-hal/src/macros.rs @@ -117,9 +117,9 @@ macro_rules! mux_device_enum { $crate::mux_enum! { $(#[$outer])* pub enum $assoc_type { - Vk(<$crate::vulkan::VkDevice as $crate::Device>::$assoc_type), - Dx12(<$crate::dx12::Dx12Device as $crate::Device>::$assoc_type), - Mtl(<$crate::metal::MtlDevice as $crate::Device>::$assoc_type), + Vk(<$crate::vulkan::VkDevice as $crate::backend::Device>::$assoc_type), + Dx12(<$crate::dx12::Dx12Device as $crate::backend::Device>::$assoc_type), + Mtl(<$crate::metal::MtlDevice as $crate::backend::Device>::$assoc_type), } } } @@ -154,3 +154,15 @@ macro_rules! mux_match { } }; } + +/// A convenience macro for selecting a shader from included files. +#[macro_export] +macro_rules! include_shader { + ( $device:expr, $path_base:expr) => { + $device.choose_shader( + include_bytes!(concat!($path_base, ".spv")), + include_str!(concat!($path_base, ".hlsl")), + include_str!(concat!($path_base, ".msl")), + ) + }; +} diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs index 0b3f588..fcf4637 100644 --- a/piet-gpu-hal/src/metal.rs +++ b/piet-gpu-hal/src/metal.rs @@ -102,7 +102,11 @@ impl MtlInstance { has_memory_model: false, use_staging_buffers: use_staging_buffers, }; - Ok(MtlDevice { device, cmd_queue, gpu_info }) + Ok(MtlDevice { + device, + cmd_queue, + gpu_info, + }) } else { Err("can't create system default Metal device".into()) } @@ -119,7 +123,7 @@ impl MtlInstance { } } -impl crate::Device for MtlDevice { +impl crate::backend::Device for MtlDevice { type Buffer = Buffer; type Image = Image; @@ -282,9 +286,13 @@ impl crate::Device for MtlDevice { Ok(()) } - unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result { - // fence need to be mutable here :/ - todo!() + unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result { + match fence { + Fence::Idle => Ok(true), + Fence::CmdBufPending(cmd_buf) => { + Ok(cmd_buf.status() == metal::MTLCommandBufferStatus::Completed) + } + } } unsafe fn create_sampler(&self, params: crate::SamplerParams) -> Result { @@ -292,18 +300,17 @@ impl crate::Device for MtlDevice { } } -impl crate::CmdBuf for CmdBuf { - unsafe fn begin(&mut self) { - } +impl crate::backend::CmdBuf for CmdBuf { + unsafe fn begin(&mut self) {} - unsafe fn finish(&mut self) { - } + unsafe fn finish(&mut self) {} unsafe fn dispatch( &mut self, pipeline: &Pipeline, descriptor_set: &DescriptorSet, - size: (u32, u32, u32), + workgroup_count: (u32, u32, u32), + workgroup_size: (u32, u32, u32), ) { let encoder = self.cmd_buf.new_compute_command_encoder(); encoder.set_compute_pipeline_state(&pipeline.0); @@ -313,19 +320,17 @@ impl crate::CmdBuf for CmdBuf { ix += 1; } // TODO: set images - let work_group_count = metal::MTLSize { - width: size.0 as u64, - height: size.1 as u64, - depth: size.2 as u64, + let workgroup_count = metal::MTLSize { + width: workgroup_count.0 as u64, + height: workgroup_count.1 as u64, + depth: workgroup_count.2 as u64, }; - // TODO: we need to pass this in explicitly. In gfx-hal, this is parsed from - // the spv before translation. - let work_group_size = metal::MTLSize { - width: 1, - height: 1, - depth: 1, + let workgroup_size = metal::MTLSize { + width: workgroup_size.0 as u64, + height: workgroup_size.1 as u64, + depth: workgroup_size.2 as u64, }; - encoder.dispatch_thread_groups(work_group_count, work_group_size); + encoder.dispatch_thread_groups(workgroup_count, workgroup_size); encoder.end_encoding(); } @@ -334,8 +339,7 @@ impl crate::CmdBuf for CmdBuf { // Metal's own tracking. } - unsafe fn host_barrier(&mut self) { - } + unsafe fn host_barrier(&mut self) {} unsafe fn image_barrier( &mut self, @@ -366,9 +370,7 @@ impl crate::CmdBuf for CmdBuf { todo!() } - unsafe fn reset_query_pool(&mut self, pool: &QueryPool) { - - } + unsafe fn reset_query_pool(&mut self, pool: &QueryPool) {} unsafe fn write_timestamp(&mut self, pool: &QueryPool, query: u32) { // TODO @@ -377,17 +379,15 @@ impl crate::CmdBuf for CmdBuf { } } -impl crate::PipelineBuilder for PipelineBuilder { +impl crate::backend::PipelineBuilder for PipelineBuilder { fn add_buffers(&mut self, _n_buffers: u32) { // My understanding is that Metal infers the pipeline layout from // the source. } - fn add_images(&mut self, _n_images: u32) { - } + fn add_images(&mut self, _n_images: u32) {} - fn add_textures(&mut self, _max_textures: u32) { - } + fn add_textures(&mut self, _max_textures: u32) {} unsafe fn create_compute_pipeline( self, @@ -399,12 +399,14 @@ impl crate::PipelineBuilder for PipelineBuilder { let library = device.device.new_library_with_source(code, &options)?; // This seems to be the default name from spirv-cross, but we may need to tweak. let function = library.get_function("main0", None)?; - let pipeline = device.device.new_compute_pipeline_state_with_function(&function)?; + let pipeline = device + .device + .new_compute_pipeline_state_with_function(&function)?; Ok(Pipeline(pipeline)) } } -impl crate::DescriptorSetBuilder for DescriptorSetBuilder { +impl crate::backend::DescriptorSetBuilder for DescriptorSetBuilder { fn add_buffers(&mut self, buffers: &[&Buffer]) { self.0.buffers.extend(buffers.iter().copied().cloned()); } diff --git a/piet-gpu-hal/src/mux.rs b/piet-gpu-hal/src/mux.rs index e2c0988..7a83ae9 100644 --- a/piet-gpu-hal/src/mux.rs +++ b/piet-gpu-hal/src/mux.rs @@ -30,10 +30,10 @@ mux_cfg! { #[cfg(mtl)] use crate::metal; } -use crate::CmdBuf as CmdBufTrait; -use crate::DescriptorSetBuilder as DescriptorSetBuilderTrait; -use crate::Device as DeviceTrait; -use crate::PipelineBuilder as PipelineBuilderTrait; +use crate::backend::CmdBuf as CmdBufTrait; +use crate::backend::DescriptorSetBuilder as DescriptorSetBuilderTrait; +use crate::backend::Device as DeviceTrait; +use crate::backend::PipelineBuilder as PipelineBuilderTrait; use crate::{BufferUsage, Error, GpuInfo, ImageLayout}; mux_enum! { @@ -255,11 +255,11 @@ impl Device { } } - pub unsafe fn get_fence_status(&self, fence: &Fence) -> Result { + pub unsafe fn get_fence_status(&self, fence: &mut Fence) -> Result { mux_match! { self; - Device::Vk(d) => d.get_fence_status(fence.vk()), - Device::Dx12(d) => d.get_fence_status(fence.dx12()), - Device::Mtl(d) => d.get_fence_status(fence.mtl()), + Device::Vk(d) => d.get_fence_status(fence.vk_mut()), + Device::Dx12(d) => d.get_fence_status(fence.dx12_mut()), + Device::Mtl(d) => d.get_fence_status(fence.mtl_mut()), } } @@ -400,6 +400,15 @@ impl Device { Device::Mtl(d) => d.write_buffer(buffer.mtl(), contents, offset, size), } } + + /// Choose shader code from the available choices. + pub fn choose_shader<'a>(&self, _spv: &'a [u8], _hlsl: &'a str, _msl: &'a str) -> ShaderCode<'a> { + mux_match! { self; + Device::Vk(_d) => ShaderCode::Spv(_spv), + Device::Dx12(_d) => ShaderCode::Hlsl(_hlsl), + Device::Mtl(_d) => ShaderCode::Msl(_msl), + } + } } impl PipelineBuilder { @@ -578,16 +587,25 @@ impl CmdBuf { } } + /// Dispatch a compute shader. + /// + /// Note that both the number of workgroups (`workgroup_count`) and the number of + /// threads in a workgroup (`workgroup_size`) are given. The latter is needed on + /// Metal, while it's baked into the shader on Vulkan and DX12. + /// + /// Perhaps we'll have a mechanism to plumb the latter value to configure the size + /// of a workgroup using specialization constants in the future. pub unsafe fn dispatch( &mut self, pipeline: &Pipeline, descriptor_set: &DescriptorSet, - size: (u32, u32, u32), + workgroup_count: (u32, u32, u32), + workgroup_size: (u32, u32, u32), ) { mux_match! { self; - CmdBuf::Vk(c) => c.dispatch(pipeline.vk(), descriptor_set.vk(), size), - CmdBuf::Dx12(c) => c.dispatch(pipeline.dx12(), descriptor_set.dx12(), size), - CmdBuf::Mtl(c) => c.dispatch(pipeline.mtl(), descriptor_set.mtl(), size), + CmdBuf::Vk(c) => c.dispatch(pipeline.vk(), descriptor_set.vk(), workgroup_count, workgroup_size), + CmdBuf::Dx12(c) => c.dispatch(pipeline.dx12(), descriptor_set.dx12(), workgroup_count, workgroup_size), + CmdBuf::Mtl(c) => c.dispatch(pipeline.mtl(), descriptor_set.mtl(), workgroup_count, workgroup_size), } } diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs index 8cdddcd..1084505 100644 --- a/piet-gpu-hal/src/vulkan.rs +++ b/piet-gpu-hal/src/vulkan.rs @@ -13,8 +13,10 @@ use ash::{vk, Device, Entry, Instance}; use smallvec::SmallVec; use crate::{ - BufferUsage, Device as DeviceTrait, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, + BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, }; +use crate::backend::Device as DeviceTrait; + pub struct VkInstance { /// Retain the dynamic lib. @@ -455,7 +457,7 @@ impl VkInstance { } } -impl crate::Device for VkDevice { +impl crate::backend::Device for VkDevice { type Buffer = Buffer; type Image = Image; type CmdBuf = CmdBuf; @@ -621,16 +623,13 @@ impl crate::Device for VkDevice { unsafe fn wait_and_reset(&self, fences: Vec<&mut Self::Fence>) -> Result<(), Error> { let device = &self.device.device; - let fences = fences - .iter() - .map(|f| **f) - .collect::>(); + let fences = fences.iter().map(|f| **f).collect::>(); device.wait_for_fences(&fences, true, !0)?; device.reset_fences(&fences)?; Ok(()) } - unsafe fn get_fence_status(&self, fence: &Self::Fence) -> Result { + unsafe fn get_fence_status(&self, fence: &mut Self::Fence) -> Result { let device = &self.device.device; Ok(device.get_fence_status(*fence)?) } @@ -843,7 +842,8 @@ impl crate::CmdBuf for CmdBuf { &mut self, pipeline: &Pipeline, descriptor_set: &DescriptorSet, - size: (u32, u32, u32), + workgroup_count: (u32, u32, u32), + _workgroup_size: (u32, u32, u32), ) { let device = &self.device.device; device.cmd_bind_pipeline( @@ -859,7 +859,12 @@ impl crate::CmdBuf for CmdBuf { &[descriptor_set.descriptor_set], &[], ); - device.cmd_dispatch(self.cmd_buf, size.0, size.1, size.2); + device.cmd_dispatch( + self.cmd_buf, + workgroup_count.0, + workgroup_count.1, + workgroup_count.2, + ); } /// Insert a pipeline barrier for all memory accesses. @@ -1047,7 +1052,7 @@ impl crate::CmdBuf for CmdBuf { } } -impl crate::PipelineBuilder for PipelineBuilder { +impl crate::backend::PipelineBuilder for PipelineBuilder { fn add_buffers(&mut self, n_buffers: u32) { let start = self.bindings.len() as u32; for i in 0..n_buffers { @@ -1153,7 +1158,7 @@ impl crate::PipelineBuilder for PipelineBuilder { } } -impl crate::DescriptorSetBuilder for DescriptorSetBuilder { +impl crate::backend::DescriptorSetBuilder for DescriptorSetBuilder { fn add_buffers(&mut self, buffers: &[&Buffer]) { self.buffers.extend(buffers.iter().map(|b| b.buffer)); } @@ -1307,7 +1312,11 @@ impl VkSwapchain { image_idx: usize, semaphores: &[&vk::Semaphore], ) -> Result { - let semaphores = semaphores.iter().copied().copied().collect::>(); + let semaphores = semaphores + .iter() + .copied() + .copied() + .collect::>(); Ok(self.swapchain_fn.queue_present( self.present_queue, &vk::PresentInfoKHR::builder() diff --git a/piet-gpu/bin/android.rs b/piet-gpu/bin/android.rs index 63ee91a..ce37503 100644 --- a/piet-gpu/bin/android.rs +++ b/piet-gpu/bin/android.rs @@ -12,7 +12,7 @@ use ndk::native_window::NativeWindow; use ndk_glue::Event; use piet_gpu_hal::hub; -use piet_gpu_hal::mux::{QueryPool, Instance, Surface, Swapchain}; +use piet_gpu_hal::mux::{Instance, QueryPool, Surface, Swapchain}; use piet_gpu_hal::{CmdBuf, Error, ImageLayout}; use piet_gpu::{render_scene, PietGpuRenderContext, Renderer}; diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 2b8ed0f..b815655 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -403,6 +403,7 @@ impl Renderer { &self.el_pipeline, &self.el_ds, (((self.n_elements + 127) / 128) as u32, 1, 1), + (128, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); @@ -410,6 +411,7 @@ impl Renderer { &self.tile_pipeline, &self.tile_ds, (((self.n_paths + 255) / 256) as u32, 1, 1), + (256, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 2); cmd_buf.memory_barrier(); @@ -417,6 +419,7 @@ impl Renderer { &self.path_pipeline, &self.path_ds, (((self.n_pathseg + 31) / 32) as u32, 1, 1), + (32, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 3); cmd_buf.memory_barrier(); @@ -424,6 +427,7 @@ impl Renderer { &self.backdrop_pipeline, &self.backdrop_ds, (((self.n_paths + 255) / 256) as u32, 1, 1), + (256, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 4); // Note: this barrier is not needed as an actual dependency between @@ -434,6 +438,7 @@ impl Renderer { &self.bin_pipeline, &self.bin_ds, (((self.n_paths + 255) / 256) as u32, 1, 1), + (256, 1, 1), ); cmd_buf.write_timestamp(&query_pool, 5); cmd_buf.memory_barrier(); @@ -441,6 +446,7 @@ impl Renderer { &self.coarse_pipeline, &self.coarse_ds, ((WIDTH as u32 + 255) / 256, (HEIGHT as u32 + 255) / 256, 1), + (256, 256, 1), ); cmd_buf.write_timestamp(&query_pool, 6); cmd_buf.memory_barrier(); @@ -448,6 +454,7 @@ impl Renderer { &self.k4_pipeline, &self.k4_ds, ((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1), + (8, 4, 1), ); cmd_buf.write_timestamp(&query_pool, 7); cmd_buf.memory_barrier();