Merge pull request #99 from ishitatsuyuki/bd

2025-01-10 12:41:30 +11:00 · 2021-06-12 15:29:34 +09:00 · 2021-06-12 15:29:34 +09:00 · 090c99e277
parent 8b65942f65 e610c71f13
commit 090c99e277
9 changed files with 91 additions and 40 deletions
--- a/piet-gpu-hal/src/dx12.rs
+++ b/piet-gpu-hal/src/dx12.rs
@ -14,7 +14,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};

 use smallvec::SmallVec;

-use crate::{BufferUsage, Error, GpuInfo, ImageLayout};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, WorkgroupLimits};

 use self::wrappers::{CommandAllocator, CommandQueue, Device, Factory4, Resource, ShaderByteCode};

@ -177,6 +177,10 @@ impl Dx12Instance {
                has_descriptor_indexing: false,
                has_subgroups: false,
                subgroup_size: None,
+                workgroup_limits: WorkgroupLimits {
+                    max_size: [1024, 1024, 64],
+                    max_invocations: 1024,
+                },
                has_memory_model: false,
                use_staging_buffers,
            };
--- a/piet-gpu-hal/src/lib.rs
+++ b/piet-gpu-hal/src/lib.rs
@ -99,6 +99,8 @@ pub struct GpuInfo {
    /// required in Vulkan 1.1), and we should have finer grained
    /// queries for shuffles, etc.
    pub has_subgroups: bool,
+    /// Limits on workgroup size for compute shaders.
+    pub workgroup_limits: WorkgroupLimits,
    /// Info about subgroup size control, if available.
    pub subgroup_size: Option<SubgroupSize>,
    /// The GPU supports a real, grown-ass memory model.
@ -114,6 +116,16 @@ pub struct GpuInfo {
 /// available.
 #[derive(Clone, Debug)]
 pub struct SubgroupSize {
-    min: u32,
-    max: u32,
+    pub min: u32,
+    pub max: u32,
+}
+
+/// The range of workgroup sizes supported by a back-end.
+#[derive(Clone, Debug)]
+pub struct WorkgroupLimits {
+    /// The maximum size on each workgroup dimension can be.
+    pub max_size: [u32; 3],
+    /// The maximum overall invocations a workgroup can have. That is, the product of sizes in each
+    /// dimension.
+    pub max_invocations: u32,
 }
--- a/piet-gpu-hal/src/metal.rs
+++ b/piet-gpu-hal/src/metal.rs
@ -29,7 +29,7 @@ use metal::{CGFloat, MTLFeatureSet};

 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};

-use crate::{BufferUsage, Error, GpuInfo};
+use crate::{BufferUsage, Error, GpuInfo, WorkgroupLimits};

 use util::*;

@ -164,6 +164,13 @@ impl MtlInstance {
                has_descriptor_indexing: false,
                has_subgroups: false,
                subgroup_size: None,
+                // The workgroup limits are taken from the minimum of a desktop installation;
+                // we don't support iOS right now, but in case of testing on those devices it might
+                // need to change these (or just queried properly).
+                workgroup_limits: WorkgroupLimits {
+                    max_size: [1024, 1024, 64],
+                    max_invocations: 1024,
+                },
                has_memory_model: false,
                use_staging_buffers,
            };
--- a/piet-gpu-hal/src/vulkan.rs
+++ b/piet-gpu-hal/src/vulkan.rs
@ -12,9 +12,7 @@ use ash::{vk, Device, Entry, Instance};

 use smallvec::SmallVec;

-use crate::{
-    BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
-};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, WorkgroupLimits};
 use crate::backend::Device as DeviceTrait;


@ -357,10 +355,17 @@ impl VkInstance {

        // TODO: finer grained query of specific subgroup info.
        let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0);
+
+        let workgroup_limits = WorkgroupLimits {
+            max_invocations: props.limits.max_compute_work_group_invocations,
+            max_size: props.limits.max_compute_work_group_size,
+        };
+
        let gpu_info = GpuInfo {
            has_descriptor_indexing,
            has_subgroups,
            subgroup_size,
+            workgroup_limits,
            has_memory_model,
            use_staging_buffers,
        };
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@ -2,9 +2,10 @@

 // Propagation of tile backdrop for filling.
 //
-// Each thread reads one path element and calculates the number of spanned tiles
+// Each thread reads one path element and calculates the row and column counts of spanned tiles
 // based on the bounding box.
-// In a further compaction step, the workgroup loops over the corresponding tile rows per element in parallel.
+// The row count then goes through a prefix sum to redistribute and load-balance the work across the workgroup.
+// In the following step, the workgroup loops over the corresponding tile rows per element in parallel.
 // For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel,
 // and propagated from the left to the right (prefix summed).
 //
@ -20,8 +21,18 @@

 #define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
 #define BACKDROP_WG (1 << LG_BACKDROP_WG)
+#ifndef BACKDROP_DIST_FACTOR
+// Some paths (those covering a large area) can generate a lot of backdrop tiles; BACKDROP_DIST_FACTOR defines how much
+// additional threads should we spawn for parallel row processing. The additional threads does not participate in the
+// earlier stages (calculating the tile counts) but does work in the final prefix sum stage which has a lot more
+// parallelism.

-layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
+// This feature is opt-in: one variant is compiled with the following default, while the other variant is compiled with
+// a larger BACKDROP_DIST_FACTOR, which is used on GPUs supporting a larger workgroup size to improve performance.
+#define BACKDROP_DIST_FACTOR 1
+#endif
+
+layout(local_size_x = BACKDROP_WG, local_size_y = BACKDROP_DIST_FACTOR) in;

 layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
@ -35,54 +46,58 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
 shared uint sh_row_width[BACKDROP_WG];

 void main() {
-    uint th_ix = gl_LocalInvocationID.x;
+    uint th_ix = gl_LocalInvocationIndex;
    uint element_ix = gl_GlobalInvocationID.x;
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);

    // Work assignment: 1 thread : 1 path element
    uint row_count = 0;
    bool mem_ok = mem_error == NO_ERROR;
-    if (element_ix < conf.n_elements) {
-        AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
-        switch (tag.tag) {
-        case Annotated_Image:
-        case Annotated_BeginClip:
-        case Annotated_Color:
-            if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) {
-                break;
+    if (gl_LocalInvocationID.y == 0) {
+        if (element_ix < conf.n_elements) {
+            AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
+            switch (tag.tag) {
+                case Annotated_Image:
+                case Annotated_BeginClip:
+                case Annotated_Color:
+                if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) {
+                    break;
+                }
+                // Fall through.
+                PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
+                Path path = Path_read(conf.tile_alloc, path_ref);
+                sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+                row_count = path.bbox.w - path.bbox.y;
+                // Paths that don't cross tile top edges don't have backdrops.
+                // Don't apply the optimization to paths that may cross the y = 0
+                // top edge, but clipped to 1 row.
+                if (row_count == 1 && path.bbox.y > 0) {
+                    // Note: this can probably be expanded to width = 2 as
+                    // long as it doesn't cross the left edge.
+                    row_count = 0;
+                }
+                Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+                sh_row_alloc[th_ix] = path_alloc;
            }
-            // Fall through.
-            PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
-            Path path = Path_read(conf.tile_alloc, path_ref);
-            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-            row_count = path.bbox.w - path.bbox.y;
-            // Paths that don't cross tile top edges don't have backdrops.
-            // Don't apply the optimization to paths that may cross the y = 0
-            // top edge, but clipped to 1 row.
-            if (row_count == 1 && path.bbox.y > 0) {
-                // Note: this can probably be expanded to width = 2 as
-                // long as it doesn't cross the left edge.
-                row_count = 0;
-            }
-            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
-            sh_row_alloc[th_ix] = path_alloc;
        }
+        sh_row_count[th_ix] = row_count;
    }

-    sh_row_count[th_ix] = row_count;
    // Prefix sum of sh_row_count
    for (uint i = 0; i < LG_BACKDROP_WG; i++) {
        barrier();
-        if (th_ix >= (1 << i)) {
+        if (gl_LocalInvocationID.y == 0 && th_ix >= (1 << i)) {
            row_count += sh_row_count[th_ix - (1 << i)];
        }
        barrier();
-        sh_row_count[th_ix] = row_count;
+        if (gl_LocalInvocationID.y == 0) {
+            sh_row_count[th_ix] = row_count;
+        }
    }
    barrier();
    // Work assignment: 1 thread : 1 path element row
    uint total_rows = sh_row_count[BACKDROP_WG - 1];
-    for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) {
+    for (uint row = th_ix; row < total_rows; row += BACKDROP_WG * BACKDROP_DIST_FACTOR) {
        // Binary search to find element
        uint el_ix = 0;
        for (uint i = 0; i < LG_BACKDROP_WG; i++) {
--- a/piet-gpu/shader/backdrop.spv
+++ b/piet-gpu/shader/backdrop.spv
--- a/piet-gpu/shader/backdrop_lg.spv
+++ b/piet-gpu/shader/backdrop_lg.spv
--- a/piet-gpu/shader/build.ninja
+++ b/piet-gpu/shader/build.ninja
@ -18,6 +18,9 @@ build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setu

 build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h

+build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+  flags = -DBACKDROP_DIST_FACTOR=4
+
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h

 build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@ -311,8 +311,13 @@ impl Renderer {
        let path_ds = session
            .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;

-        let backdrop_alloc_code = ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"));
-        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?;
+        let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
+            ShaderCode::Spv(include_bytes!("../shader/backdrop_lg.spv"))
+        } else {
+            println!("using small workgroup backdrop kernel");
+            ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"))
+        };
+        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_code, 2)?;
        let backdrop_ds = session
            .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;