diff --git a/piet-gpu-hal/src/dx12.rs b/piet-gpu-hal/src/dx12.rs
index 857fe3c..bcef409 100644
--- a/piet-gpu-hal/src/dx12.rs
+++ b/piet-gpu-hal/src/dx12.rs
@@ -14,7 +14,7 @@
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
 use smallvec::SmallVec;
 
-use crate::{BufferUsage, Error, GpuInfo, ImageLayout};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, WorkgroupLimits};
 
 use self::wrappers::{CommandAllocator, CommandQueue, Device, Factory4, Resource, ShaderByteCode};
@@ -177,6 +177,10 @@ impl Dx12Instance {
             has_descriptor_indexing: false,
             has_subgroups: false,
             subgroup_size: None,
+            workgroup_limits: WorkgroupLimits {
+                max_size: [1024, 1024, 64],
+                max_invocations: 1024,
+            },
             has_memory_model: false,
             use_staging_buffers,
         };
diff --git a/piet-gpu-hal/src/lib.rs b/piet-gpu-hal/src/lib.rs
index 0739b13..2dd0eff 100644
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@@ -99,6 +99,8 @@ pub struct GpuInfo {
     /// required in Vulkan 1.1), and we should have finer grained
     /// queries for shuffles, etc.
     pub has_subgroups: bool,
+    /// Limits on workgroup size for compute shaders.
+    pub workgroup_limits: WorkgroupLimits,
     /// Info about subgroup size control, if available.
     pub subgroup_size: Option<SubgroupSize>,
     /// The GPU supports a real, grown-ass memory model.
@@ -114,6 +116,16 @@
 /// available.
 #[derive(Clone, Debug)]
 pub struct SubgroupSize {
-    min: u32,
-    max: u32,
+    pub min: u32,
+    pub max: u32,
+}
+
+/// The range of workgroup sizes supported by a back-end.
+#[derive(Clone, Debug)]
+pub struct WorkgroupLimits {
+    /// The maximum size each workgroup dimension can be.
+    pub max_size: [u32; 3],
+    /// The maximum total number of invocations a workgroup can have, that is, the product of the
+    /// sizes in each dimension.
+    pub max_invocations: u32,
 }
diff --git a/piet-gpu-hal/src/metal.rs b/piet-gpu-hal/src/metal.rs
index 9b0c2b2..69141c2 100644
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@@ -29,7 +29,7 @@
 use metal::{CGFloat, MTLFeatureSet};
 
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
-use crate::{BufferUsage, Error, GpuInfo};
+use crate::{BufferUsage, Error, GpuInfo, WorkgroupLimits};
 
 use util::*;
@@ -164,6 +164,10 @@ impl MtlInstance {
             has_descriptor_indexing: false,
             has_subgroups: false,
             subgroup_size: None,
+            workgroup_limits: WorkgroupLimits {
+                max_size: [512, 512, 512],
+                max_invocations: 512,
+            },
             has_memory_model: false,
             use_staging_buffers,
         };
diff --git a/piet-gpu-hal/src/vulkan.rs b/piet-gpu-hal/src/vulkan.rs
index 3eee69a..9111900 100644
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@@ -12,9 +12,7 @@
 use ash::{vk, Device, Entry, Instance};
 
 use smallvec::SmallVec;
 
-use crate::{
-    BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
-};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, WorkgroupLimits};
 
 use crate::backend::Device as DeviceTrait;
@@ -357,10 +355,17 @@ impl VkInstance {
         // TODO: finer grained query of specific subgroup info.
         let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0);
+
+        let workgroup_limits = WorkgroupLimits {
+            max_invocations: props.limits.max_compute_work_group_invocations,
+            max_size: props.limits.max_compute_work_group_size,
+        };
+
         let gpu_info = GpuInfo {
             has_descriptor_indexing,
             has_subgroups,
             subgroup_size,
+            workgroup_limits,
             has_memory_model,
             use_staging_buffers,
         };
 
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index 99cbf4f..247bbdf 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -21,11 +21,16 @@
 #define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
 #define BACKDROP_WG (1 << LG_BACKDROP_WG)
 
+#ifndef BACKDROP_DIST_FACTOR
 // Some paths (those covering a large area) can generate a lot of backdrop tiles; BACKDROP_DIST_FACTOR defines how many
 // additional threads we should spawn for parallel row processing. The additional threads do not participate in the
 // earlier stages (calculating the tile counts) but do work in the final prefix sum stage, which has a lot more
 // parallelism.
-#define BACKDROP_DIST_FACTOR 4
+
+// This feature is opt-in: this variant is compiled with the default below, while a second variant is compiled with a
+// larger BACKDROP_DIST_FACTOR, used on GPUs that support a larger workgroup size to improve performance.
+#define BACKDROP_DIST_FACTOR 1
+#endif
 
 layout(local_size_x = BACKDROP_WG, local_size_y = BACKDROP_DIST_FACTOR) in;
 
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
index 7941654..6458f4b 100644
Binary files a/piet-gpu/shader/backdrop.spv and b/piet-gpu/shader/backdrop.spv differ
diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/backdrop_lg.spv
new file mode 100644
index 0000000..7941654
Binary files /dev/null and b/piet-gpu/shader/backdrop_lg.spv differ
diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja
index 22c9c78..b73da2e 100644
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@@ -18,6 +18,9 @@ build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setup.h
 
 build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
 
+build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+  flags = -DBACKDROP_DIST_FACTOR=4
+
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
 
 build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index ef70c9c..971b517 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -311,8 +311,13 @@ impl Renderer {
         let path_ds = session
             .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
 
-        let backdrop_alloc_code = ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"));
-        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?;
+        let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
+            ShaderCode::Spv(include_bytes!("../shader/backdrop_lg.spv"))
+        } else {
+            println!("using small workgroup backdrop kernel");
+            ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"))
+        };
+        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_code, 2)?;
 
         let backdrop_ds = session
             .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;
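
Note on the `>= 1024` threshold in `Renderer::new`: the backdrop kernel's workgroup shape is `local_size = (BACKDROP_WG, BACKDROP_DIST_FACTOR, 1)`, and assuming `LG_WG_FACTOR = 1` (the factor lives in setup.h, not shown in this patch), the large variant needs `256 * 4 = 1024` invocations. A minimal sketch of that fit check against the new `WorkgroupLimits` — `backdrop_wg` and `can_use_large_backdrop` are hypothetical helpers, not part of the patch, and this sketch additionally validates the per-dimension `max_size` limits where the patch itself only compares `max_invocations`:

```rust
use piet_gpu_hal::WorkgroupLimits;

/// Workgroup width of the backdrop kernel: BACKDROP_WG = 1 << (7 + LG_WG_FACTOR).
fn backdrop_wg(lg_wg_factor: u32) -> u32 {
    1 << (7 + lg_wg_factor)
}

/// Whether the large-workgroup variant (backdrop_lg.spv, BACKDROP_DIST_FACTOR = 4)
/// fits within the device limits.
fn can_use_large_backdrop(limits: &WorkgroupLimits) -> bool {
    // local_size = (BACKDROP_WG, BACKDROP_DIST_FACTOR, 1); with the assumed
    // LG_WG_FACTOR = 1 this is 256 * 4 * 1 = 1024 invocations, hence the
    // `>= 1024` test in Renderer::new.
    let local_size = [backdrop_wg(1), 4, 1];
    let invocations: u32 = local_size.iter().product();
    invocations <= limits.max_invocations
        && local_size
            .iter()
            .zip(limits.max_size.iter())
            .all(|(dim, max)| dim <= max)
}
```

On the Vulkan back-end these limits come straight from `max_compute_work_group_invocations` and `max_compute_work_group_size`; the DX12 and Metal back-ends report conservative fixed values, which is why DX12 (1024 invocations) selects the large variant while Metal (512) falls back to the small one.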