diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs
index 857fe3c..bcef409 100644
--- a/piet-gpu-hal/src/dx12.rs
+++ b/piet-gpu-hal/src/dx12.rs
@@ -14,7 +14,7 @@
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
 use smallvec::SmallVec;
 
-use crate::{BufferUsage, Error, GpuInfo, ImageLayout};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, WorkgroupLimits};
 
 use self::wrappers::{CommandAllocator, CommandQueue, Device, Factory4, Resource, ShaderByteCode};
@@ -177,6 +177,10 @@ impl Dx12Instance {
             has_descriptor_indexing: false,
             has_subgroups: false,
             subgroup_size: None,
+            workgroup_limits: WorkgroupLimits {
+                max_size: [1024, 1024, 64],
+                max_invocations: 1024,
+            },
             has_memory_model: false,
             use_staging_buffers,
         };
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 0739b13..2dd0eff 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -99,6 +99,8 @@ pub struct GpuInfo {
     /// required in Vulkan 1.1), and we should have finer grained
     /// queries for shuffles, etc.
     pub has_subgroups: bool,
+    /// Limits on workgroup size for compute shaders.
+    pub workgroup_limits: WorkgroupLimits,
     /// Info about subgroup size control, if available.
     pub subgroup_size: Option<SubgroupSize>,
     /// The GPU supports a real, grown-ass memory model.
@@ -114,6 +116,16 @@
 /// available.
 #[derive(Clone, Debug)]
 pub struct SubgroupSize {
-    min: u32,
-    max: u32,
+    pub min: u32,
+    pub max: u32,
+}
+
+/// The range of workgroup sizes supported by a back-end.
+#[derive(Clone, Debug)]
+pub struct WorkgroupLimits {
+    /// The maximum size each workgroup dimension can be.
+    pub max_size: [u32; 3],
+    /// The maximum total number of invocations a workgroup can have, that is, the product of the
+    /// sizes in each dimension.
+    pub max_invocations: u32,
 }
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index 9b0c2b2..69141c2 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -29,7 +29,7 @@
 use metal::{CGFloat, MTLFeatureSet};
 
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
-use crate::{BufferUsage, Error, GpuInfo};
+use crate::{BufferUsage, Error, GpuInfo, WorkgroupLimits};
 
 use util::*;
@@ -164,6 +164,10 @@ impl MtlInstance {
             has_descriptor_indexing: false,
             has_subgroups: false,
             subgroup_size: None,
+            workgroup_limits: WorkgroupLimits {
+                max_size: [512, 512, 512],
+                max_invocations: 512,
+            },
             has_memory_model: false,
             use_staging_buffers,
         };
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 3eee69a..9111900 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -12,9 +12,7 @@
 use ash::{vk, Device, Entry, Instance};
 
 use smallvec::SmallVec;
 
-use crate::{
-    BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
-};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, WorkgroupLimits};
 
 use crate::backend::Device as DeviceTrait;
@@ -357,10 +355,17 @@ impl VkInstance {
         // TODO: finer grained query of specific subgroup info.
         let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0);
+
+        let workgroup_limits = WorkgroupLimits {
+            max_invocations: props.limits.max_compute_work_group_invocations,
+            max_size: props.limits.max_compute_work_group_size,
+        };
+
         let gpu_info = GpuInfo {
             has_descriptor_indexing,
             has_subgroups,
             subgroup_size,
+            workgroup_limits,
             has_memory_model,
             use_staging_buffers,
         };
 
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index 99cbf4f..247bbdf 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -21,11 +21,16 @@
 #define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
 #define BACKDROP_WG (1 << LG_BACKDROP_WG)
 
+#ifndef BACKDROP_DIST_FACTOR
 // Some paths (those covering a large area) can generate a lot of backdrop tiles; BACKDROP_DIST_FACTOR defines how many
 // additional threads we should spawn for parallel row processing. The additional threads do not participate in the
 // earlier stages (calculating the tile counts) but do work in the final prefix sum stage, which has a lot more
 // parallelism.
-#define BACKDROP_DIST_FACTOR 4
+
+// This feature is opt-in: this variant is compiled with the default below, while a second variant is compiled with a
+// larger BACKDROP_DIST_FACTOR, used on GPUs that support a larger workgroup size to improve performance.
+#define BACKDROP_DIST_FACTOR 1
+#endif
 
 layout(local_size_x = BACKDROP_WG, local_size_y = BACKDROP_DIST_FACTOR) in;
 
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
index 7941654..6458f4b 100644
Binary files a/piet-gpu/shader/backdrop.spv and b/piet-gpu/shader/backdrop.spv differ
diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/backdrop_lg.spv
new file mode 100644
index 0000000..7941654
Binary files /dev/null and b/piet-gpu/shader/backdrop_lg.spv differ
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 22c9c78..b73da2e 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -18,6 +18,9 @@ build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h
 
 build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
 
+build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+  flags = -DBACKDROP_DIST_FACTOR=4
+
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
 
 build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index ef70c9c..971b517 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -311,8 +311,13 @@ impl Renderer {
         let path_ds = session
             .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
 
-        let backdrop_alloc_code = ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"));
-        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?;
+        let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
+            ShaderCode::Spv(include_bytes!("../shader/backdrop_lg.spv"))
+        } else {
+            println!("using small workgroup backdrop kernel");
+            ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"))
+        };
+        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_code, 2)?;
 
         let backdrop_ds = session
             .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;
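
Note on the `>= 1024` threshold in `Renderer::new`: the backdrop kernel's workgroup shape is `local_size = (BACKDROP_WG, BACKDROP_DIST_FACTOR, 1)`, and assuming `LG_WG_FACTOR = 1` (the factor lives in setup.h, not shown in this patch), the large variant needs `256 * 4 = 1024` invocations. A minimal sketch of that fit check against the new `WorkgroupLimits` — `backdrop_wg` and `can_use_large_backdrop` are hypothetical helpers, not part of the patch, and this sketch additionally validates the per-dimension `max_size` limits where the patch itself only compares `max_invocations`:

```rust
use piet_gpu_hal::WorkgroupLimits;

/// Workgroup width of the backdrop kernel: BACKDROP_WG = 1 << (7 + LG_WG_FACTOR).
fn backdrop_wg(lg_wg_factor: u32) -> u32 {
    1 << (7 + lg_wg_factor)
}

/// Whether the large-workgroup variant (backdrop_lg.spv, BACKDROP_DIST_FACTOR = 4)
/// fits within the device limits.
fn can_use_large_backdrop(limits: &WorkgroupLimits) -> bool {
    // local_size = (BACKDROP_WG, BACKDROP_DIST_FACTOR, 1); with the assumed
    // LG_WG_FACTOR = 1 this is 256 * 4 * 1 = 1024 invocations, hence the
    // `>= 1024` test in Renderer::new.
    let local_size = [backdrop_wg(1), 4, 1];
    let invocations: u32 = local_size.iter().product();
    invocations <= limits.max_invocations
        && local_size
            .iter()
            .zip(limits.max_size.iter())
            .all(|(dim, max)| dim <= max)
}
```

On the Vulkan back-end these limits come straight from `max_compute_work_group_invocations` and `max_compute_work_group_size`; the DX12 and Metal back-ends report conservative fixed values, which is why DX12 (1024 invocations) selects the large variant while Metal (512) falls back to the small one.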