Merge pull request #99 from ishitatsuyuki/bd

Tatsuyuki Ishi, 2021-06-12 15:29:34 +09:00 (committed by GitHub)
commit 090c99e277
9 changed files with 91 additions and 40 deletions


@@ -14,7 +14,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 use smallvec::SmallVec;
 
-use crate::{BufferUsage, Error, GpuInfo, ImageLayout};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, WorkgroupLimits};
 
 use self::wrappers::{CommandAllocator, CommandQueue, Device, Factory4, Resource, ShaderByteCode};
@@ -177,6 +177,10 @@ impl Dx12Instance {
             has_descriptor_indexing: false,
             has_subgroups: false,
             subgroup_size: None,
+            workgroup_limits: WorkgroupLimits {
+                max_size: [1024, 1024, 64],
+                max_invocations: 1024,
+            },
             has_memory_model: false,
             use_staging_buffers,
         };
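
The hard-coded DX12 values are not arbitrary: Direct3D 12 fixes compute thread-group dimensions at 1024 x 1024 x 64 and caps the total at 1024 threads per group, so the backend can safely report them as constants. A minimal sketch of the same values as named constants (the constant names are illustrative, not from this diff):

// D3D12 compute thread-group limits (fixed by the API, not device-dependent).
const DX12_MAX_WORKGROUP_SIZE: [u32; 3] = [1024, 1024, 64];
const DX12_MAX_WORKGROUP_INVOCATIONS: u32 = 1024;

fn dx12_workgroup_limits() -> WorkgroupLimits {
    WorkgroupLimits {
        max_size: DX12_MAX_WORKGROUP_SIZE,
        max_invocations: DX12_MAX_WORKGROUP_INVOCATIONS,
    }
}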


@@ -99,6 +99,8 @@ pub struct GpuInfo {
     /// required in Vulkan 1.1), and we should have finer grained
     /// queries for shuffles, etc.
     pub has_subgroups: bool,
+    /// Limits on workgroup size for compute shaders.
+    pub workgroup_limits: WorkgroupLimits,
     /// Info about subgroup size control, if available.
     pub subgroup_size: Option<SubgroupSize>,
     /// The GPU supports a real, grown-ass memory model.
@@ -114,6 +116,16 @@ pub struct GpuInfo {
 /// available.
 #[derive(Clone, Debug)]
 pub struct SubgroupSize {
-    min: u32,
-    max: u32,
+    pub min: u32,
+    pub max: u32,
+}
+
+/// The range of workgroup sizes supported by a back-end.
+#[derive(Clone, Debug)]
+pub struct WorkgroupLimits {
+    /// The maximum size each workgroup dimension can be.
+    pub max_size: [u32; 3],
+    /// The maximum total invocations a workgroup can have, i.e. the product of
+    /// the sizes in each dimension.
+    pub max_invocations: u32,
 }
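
A caller can use the new struct to check a compute dispatch before building a pipeline. A minimal sketch, assuming only the WorkgroupLimits type above (validate_workgroup is a hypothetical helper, not part of this diff):

// Returns true if `size` fits both the per-dimension and the total-invocation limits.
fn validate_workgroup(limits: &WorkgroupLimits, size: [u32; 3]) -> bool {
    let fits_dims = size.iter().zip(limits.max_size.iter()).all(|(s, m)| s <= m);
    let fits_total = size[0] * size[1] * size[2] <= limits.max_invocations;
    fits_dims && fits_total
}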


@@ -29,7 +29,7 @@ use metal::{CGFloat, MTLFeatureSet};
 use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
 
-use crate::{BufferUsage, Error, GpuInfo};
+use crate::{BufferUsage, Error, GpuInfo, WorkgroupLimits};
 
 use util::*;
@@ -164,6 +164,13 @@ impl MtlInstance {
             has_descriptor_indexing: false,
             has_subgroups: false,
             subgroup_size: None,
+            // The workgroup limits are the minimum guaranteed on a desktop installation;
+            // we don't support iOS right now, but testing on those devices might require
+            // changing these values (or querying them properly).
+            workgroup_limits: WorkgroupLimits {
+                max_size: [1024, 1024, 64],
+                max_invocations: 1024,
+            },
             has_memory_model: false,
             use_staging_buffers,
         };
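
As the comment notes, these values could be queried from the device instead of hard-coded. A rough sketch of what that might look like, assuming metal-rs exposes MTLDevice's maxThreadsPerThreadgroup property as max_threads_per_threadgroup() (an assumption, not verified against this codebase):

fn query_metal_workgroup_limits(device: &metal::Device) -> WorkgroupLimits {
    // Assumed binding for MTLDevice.maxThreadsPerThreadgroup; returns an MTLSize.
    let size = device.max_threads_per_threadgroup();
    WorkgroupLimits {
        max_size: [size.width as u32, size.height as u32, size.depth as u32],
        // Metal's total-per-threadgroup limit is a pipeline-state property
        // (maxTotalThreadsPerThreadgroup), so the conservative 1024 is kept here.
        max_invocations: 1024,
    }
}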


@@ -12,9 +12,7 @@ use ash::{vk, Device, Entry, Instance};
 use smallvec::SmallVec;
 
-use crate::{
-    BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
-};
+use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, WorkgroupLimits};
 
 use crate::backend::Device as DeviceTrait;
@@ -357,10 +355,17 @@ impl VkInstance {
         // TODO: finer grained query of specific subgroup info.
         let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0);
 
+        let workgroup_limits = WorkgroupLimits {
+            max_invocations: props.limits.max_compute_work_group_invocations,
+            max_size: props.limits.max_compute_work_group_size,
+        };
+
         let gpu_info = GpuInfo {
             has_descriptor_indexing,
             has_subgroups,
             subgroup_size,
+            workgroup_limits,
             has_memory_model,
             use_staging_buffers,
         };


@@ -2,9 +2,10 @@
 // Propagation of tile backdrop for filling.
 //
-// Each thread reads one path element and calculates the number of spanned tiles
+// Each thread reads one path element and calculates the row and column counts of spanned tiles
 // based on the bounding box.
-// In a further compaction step, the workgroup loops over the corresponding tile rows per element in parallel.
+// The row count then goes through a prefix sum to redistribute and load-balance the work across the workgroup.
+// In the following step, the workgroup loops over the corresponding tile rows per element in parallel.
 // For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel,
 // and propagated from the left to the right (prefix summed).
 //
@@ -20,8 +21,18 @@
 #define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
 #define BACKDROP_WG (1 << LG_BACKDROP_WG)
+
+#ifndef BACKDROP_DIST_FACTOR
+// Some paths (those covering a large area) can generate a lot of backdrop tiles; BACKDROP_DIST_FACTOR defines how many
+// additional threads we should spawn for parallel row processing. The additional threads do not participate in the
+// earlier stages (calculating the tile counts) but do work in the final prefix sum stage, which has much more
+// parallelism.
+// This feature is opt-in: one variant is compiled with the following default, while the other variant is compiled with
+// a larger BACKDROP_DIST_FACTOR, which is used on GPUs supporting a larger workgroup size to improve performance.
+#define BACKDROP_DIST_FACTOR 1
+#endif
 
-layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
+layout(local_size_x = BACKDROP_WG, local_size_y = BACKDROP_DIST_FACTOR) in;
 
 layout(set = 0, binding = 1) readonly buffer ConfigBuf {
     Config conf;
@@ -35,54 +46,58 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
 shared uint sh_row_width[BACKDROP_WG];
 
 void main() {
-    uint th_ix = gl_LocalInvocationID.x;
+    uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
     AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
 
     // Work assignment: 1 thread : 1 path element
     uint row_count = 0;
     bool mem_ok = mem_error == NO_ERROR;
-    if (element_ix < conf.n_elements) {
-        AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
-        switch (tag.tag) {
-        case Annotated_Image:
-        case Annotated_BeginClip:
-        case Annotated_Color:
-            if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) {
-                break;
+    if (gl_LocalInvocationID.y == 0) {
+        if (element_ix < conf.n_elements) {
+            AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
+            switch (tag.tag) {
+            case Annotated_Image:
+            case Annotated_BeginClip:
+            case Annotated_Color:
+                if (fill_mode_from_flags(tag.flags) != MODE_NONZERO) {
+                    break;
+                }
+                // Fall through.
+                PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
+                Path path = Path_read(conf.tile_alloc, path_ref);
+                sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
+                row_count = path.bbox.w - path.bbox.y;
+                // Paths that don't cross tile top edges don't have backdrops.
+                // Don't apply the optimization to paths that may cross the y = 0
+                // top edge, but clipped to 1 row.
+                if (row_count == 1 && path.bbox.y > 0) {
+                    // Note: this can probably be expanded to width = 2 as
+                    // long as it doesn't cross the left edge.
+                    row_count = 0;
+                }
+                Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+                sh_row_alloc[th_ix] = path_alloc;
             }
-            // Fall through.
-            PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
-            Path path = Path_read(conf.tile_alloc, path_ref);
-            sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
-            row_count = path.bbox.w - path.bbox.y;
-            // Paths that don't cross tile top edges don't have backdrops.
-            // Don't apply the optimization to paths that may cross the y = 0
-            // top edge, but clipped to 1 row.
-            if (row_count == 1 && path.bbox.y > 0) {
-                // Note: this can probably be expanded to width = 2 as
-                // long as it doesn't cross the left edge.
-                row_count = 0;
-            }
-            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
-            sh_row_alloc[th_ix] = path_alloc;
         }
+        sh_row_count[th_ix] = row_count;
     }
-    sh_row_count[th_ix] = row_count;
 
     // Prefix sum of sh_row_count
     for (uint i = 0; i < LG_BACKDROP_WG; i++) {
         barrier();
-        if (th_ix >= (1 << i)) {
+        if (gl_LocalInvocationID.y == 0 && th_ix >= (1 << i)) {
             row_count += sh_row_count[th_ix - (1 << i)];
         }
         barrier();
-        sh_row_count[th_ix] = row_count;
+        if (gl_LocalInvocationID.y == 0) {
+            sh_row_count[th_ix] = row_count;
+        }
     }
     barrier();
     // Work assignment: 1 thread : 1 path element row
     uint total_rows = sh_row_count[BACKDROP_WG - 1];
-    for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) {
+    for (uint row = th_ix; row < total_rows; row += BACKDROP_WG * BACKDROP_DIST_FACTOR) {
         // Binary search to find element
         uint el_ix = 0;
         for (uint i = 0; i < LG_BACKDROP_WG; i++) {
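
For reference, the new row loop strides by the full 2D workgroup, so all BACKDROP_WG x BACKDROP_DIST_FACTOR threads share the rows produced by the prefix sum. A small Rust model of that assignment (values assume LG_WG_FACTOR = 0, i.e. BACKDROP_WG = 128; the real factor is a build-time setting not shown in this diff):

const BACKDROP_WG: u32 = 1 << 7;     // assumes LG_WG_FACTOR = 0
const BACKDROP_DIST_FACTOR: u32 = 4; // the large variant from build.ninja below

// Mirrors: for (uint row = th_ix; row < total_rows; row += BACKDROP_WG * BACKDROP_DIST_FACTOR)
fn rows_for_thread(th_ix: u32, total_rows: u32) -> Vec<u32> {
    (th_ix..total_rows)
        .step_by((BACKDROP_WG * BACKDROP_DIST_FACTOR) as usize)
        .collect()
}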

Binary file not shown.

Binary file not shown.


@@ -18,6 +18,9 @@ build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setu
 build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
 
+build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
+  flags = -DBACKDROP_DIST_FACTOR=4
+
 build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
 
 build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h


@@ -311,8 +311,13 @@ impl Renderer {
         let path_ds = session
             .create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
 
-        let backdrop_alloc_code = ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"));
-        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?;
+        let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
+            ShaderCode::Spv(include_bytes!("../shader/backdrop_lg.spv"))
+        } else {
+            println!("using small workgroup backdrop kernel");
+            ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"))
+        };
+        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_code, 2)?;
         let backdrop_ds = session
             .create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;
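
The 1024 cutoff matches the invocation count of the large shader variant, BACKDROP_WG x BACKDROP_DIST_FACTOR with the factor of 4 from build.ninja. A minimal sketch of that arithmetic, assuming LG_WG_FACTOR = 1 (so BACKDROP_WG = 256), a value not stated in this diff:

const LG_WG_FACTOR: u32 = 1;                      // assumed value, not from the diff
const BACKDROP_WG: u32 = 1 << (7 + LG_WG_FACTOR); // 256 under that assumption
const BACKDROP_DIST_FACTOR_LG: u32 = 4;           // from the backdrop_lg.spv build rule

// The large variant needs BACKDROP_WG * BACKDROP_DIST_FACTOR_LG = 1024 invocations
// per workgroup, so the device must report at least that many.
fn use_large_backdrop(limits: &WorkgroupLimits) -> bool {
    limits.max_invocations >= BACKDROP_WG * BACKDROP_DIST_FACTOR_LG
}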