mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 20:51:29 +11:00
Merge pull request #99 from ishitatsuyuki/bd
This commit is contained in:
commit
090c99e277
|
@ -14,7 +14,7 @@ use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
|
||||||
|
|
||||||
use smallvec::SmallVec;
|
use smallvec::SmallVec;
|
||||||
|
|
||||||
use crate::{BufferUsage, Error, GpuInfo, ImageLayout};
|
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, WorkgroupLimits};
|
||||||
|
|
||||||
use self::wrappers::{CommandAllocator, CommandQueue, Device, Factory4, Resource, ShaderByteCode};
|
use self::wrappers::{CommandAllocator, CommandQueue, Device, Factory4, Resource, ShaderByteCode};
|
||||||
|
|
||||||
|
@ -177,6 +177,10 @@ impl Dx12Instance {
|
||||||
has_descriptor_indexing: false,
|
has_descriptor_indexing: false,
|
||||||
has_subgroups: false,
|
has_subgroups: false,
|
||||||
subgroup_size: None,
|
subgroup_size: None,
|
||||||
|
workgroup_limits: WorkgroupLimits {
|
||||||
|
max_size: [1024, 1024, 64],
|
||||||
|
max_invocations: 1024,
|
||||||
|
},
|
||||||
has_memory_model: false,
|
has_memory_model: false,
|
||||||
use_staging_buffers,
|
use_staging_buffers,
|
||||||
};
|
};
|
||||||
|
|
|
@ -99,6 +99,8 @@ pub struct GpuInfo {
|
||||||
/// required in Vulkan 1.1), and we should have finer grained
|
/// required in Vulkan 1.1), and we should have finer grained
|
||||||
/// queries for shuffles, etc.
|
/// queries for shuffles, etc.
|
||||||
pub has_subgroups: bool,
|
pub has_subgroups: bool,
|
||||||
|
/// Limits on workgroup size for compute shaders.
|
||||||
|
pub workgroup_limits: WorkgroupLimits,
|
||||||
/// Info about subgroup size control, if available.
|
/// Info about subgroup size control, if available.
|
||||||
pub subgroup_size: Option<SubgroupSize>,
|
pub subgroup_size: Option<SubgroupSize>,
|
||||||
/// The GPU supports a real, grown-ass memory model.
|
/// The GPU supports a real, grown-ass memory model.
|
||||||
|
@ -114,6 +116,16 @@ pub struct GpuInfo {
|
||||||
/// available.
|
/// available.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct SubgroupSize {
|
pub struct SubgroupSize {
|
||||||
min: u32,
|
pub min: u32,
|
||||||
max: u32,
|
pub max: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The range of workgroup sizes supported by a back-end.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct WorkgroupLimits {
|
||||||
|
/// The maximum size on each workgroup dimension can be.
|
||||||
|
pub max_size: [u32; 3],
|
||||||
|
/// The maximum overall invocations a workgroup can have. That is, the product of sizes in each
|
||||||
|
/// dimension.
|
||||||
|
pub max_invocations: u32,
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,7 +29,7 @@ use metal::{CGFloat, MTLFeatureSet};
|
||||||
|
|
||||||
use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
|
use raw_window_handle::{HasRawWindowHandle, RawWindowHandle};
|
||||||
|
|
||||||
use crate::{BufferUsage, Error, GpuInfo};
|
use crate::{BufferUsage, Error, GpuInfo, WorkgroupLimits};
|
||||||
|
|
||||||
use util::*;
|
use util::*;
|
||||||
|
|
||||||
|
@ -164,6 +164,13 @@ impl MtlInstance {
|
||||||
has_descriptor_indexing: false,
|
has_descriptor_indexing: false,
|
||||||
has_subgroups: false,
|
has_subgroups: false,
|
||||||
subgroup_size: None,
|
subgroup_size: None,
|
||||||
|
// The workgroup limits are taken from the minimum of a desktop installation;
|
||||||
|
// we don't support iOS right now, but in case of testing on those devices it might
|
||||||
|
// need to change these (or just queried properly).
|
||||||
|
workgroup_limits: WorkgroupLimits {
|
||||||
|
max_size: [1024, 1024, 64],
|
||||||
|
max_invocations: 1024,
|
||||||
|
},
|
||||||
has_memory_model: false,
|
has_memory_model: false,
|
||||||
use_staging_buffers,
|
use_staging_buffers,
|
||||||
};
|
};
|
||||||
|
|
|
@ -12,9 +12,7 @@ use ash::{vk, Device, Entry, Instance};
|
||||||
|
|
||||||
use smallvec::SmallVec;
|
use smallvec::SmallVec;
|
||||||
|
|
||||||
use crate::{
|
use crate::{BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize, WorkgroupLimits};
|
||||||
BufferUsage, Error, GpuInfo, ImageLayout, SamplerParams, SubgroupSize,
|
|
||||||
};
|
|
||||||
use crate::backend::Device as DeviceTrait;
|
use crate::backend::Device as DeviceTrait;
|
||||||
|
|
||||||
|
|
||||||
|
@ -357,10 +355,17 @@ impl VkInstance {
|
||||||
|
|
||||||
// TODO: finer grained query of specific subgroup info.
|
// TODO: finer grained query of specific subgroup info.
|
||||||
let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0);
|
let has_subgroups = self.vk_version >= vk::make_version(1, 1, 0);
|
||||||
|
|
||||||
|
let workgroup_limits = WorkgroupLimits {
|
||||||
|
max_invocations: props.limits.max_compute_work_group_invocations,
|
||||||
|
max_size: props.limits.max_compute_work_group_size,
|
||||||
|
};
|
||||||
|
|
||||||
let gpu_info = GpuInfo {
|
let gpu_info = GpuInfo {
|
||||||
has_descriptor_indexing,
|
has_descriptor_indexing,
|
||||||
has_subgroups,
|
has_subgroups,
|
||||||
subgroup_size,
|
subgroup_size,
|
||||||
|
workgroup_limits,
|
||||||
has_memory_model,
|
has_memory_model,
|
||||||
use_staging_buffers,
|
use_staging_buffers,
|
||||||
};
|
};
|
||||||
|
|
|
@ -2,9 +2,10 @@
|
||||||
|
|
||||||
// Propagation of tile backdrop for filling.
|
// Propagation of tile backdrop for filling.
|
||||||
//
|
//
|
||||||
// Each thread reads one path element and calculates the number of spanned tiles
|
// Each thread reads one path element and calculates the row and column counts of spanned tiles
|
||||||
// based on the bounding box.
|
// based on the bounding box.
|
||||||
// In a further compaction step, the workgroup loops over the corresponding tile rows per element in parallel.
|
// The row count then goes through a prefix sum to redistribute and load-balance the work across the workgroup.
|
||||||
|
// In the following step, the workgroup loops over the corresponding tile rows per element in parallel.
|
||||||
// For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel,
|
// For each row the per tile backdrop will be read, as calculated in the previous coarse path segment kernel,
|
||||||
// and propagated from the left to the right (prefix summed).
|
// and propagated from the left to the right (prefix summed).
|
||||||
//
|
//
|
||||||
|
@ -20,8 +21,18 @@
|
||||||
|
|
||||||
#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
|
#define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
|
||||||
#define BACKDROP_WG (1 << LG_BACKDROP_WG)
|
#define BACKDROP_WG (1 << LG_BACKDROP_WG)
|
||||||
|
#ifndef BACKDROP_DIST_FACTOR
|
||||||
|
// Some paths (those covering a large area) can generate a lot of backdrop tiles; BACKDROP_DIST_FACTOR defines how much
|
||||||
|
// additional threads should we spawn for parallel row processing. The additional threads does not participate in the
|
||||||
|
// earlier stages (calculating the tile counts) but does work in the final prefix sum stage which has a lot more
|
||||||
|
// parallelism.
|
||||||
|
|
||||||
layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
|
// This feature is opt-in: one variant is compiled with the following default, while the other variant is compiled with
|
||||||
|
// a larger BACKDROP_DIST_FACTOR, which is used on GPUs supporting a larger workgroup size to improve performance.
|
||||||
|
#define BACKDROP_DIST_FACTOR 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
layout(local_size_x = BACKDROP_WG, local_size_y = BACKDROP_DIST_FACTOR) in;
|
||||||
|
|
||||||
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
|
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
|
||||||
Config conf;
|
Config conf;
|
||||||
|
@ -35,13 +46,14 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
|
||||||
shared uint sh_row_width[BACKDROP_WG];
|
shared uint sh_row_width[BACKDROP_WG];
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
uint th_ix = gl_LocalInvocationID.x;
|
uint th_ix = gl_LocalInvocationIndex;
|
||||||
uint element_ix = gl_GlobalInvocationID.x;
|
uint element_ix = gl_GlobalInvocationID.x;
|
||||||
AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
|
AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
|
||||||
|
|
||||||
// Work assignment: 1 thread : 1 path element
|
// Work assignment: 1 thread : 1 path element
|
||||||
uint row_count = 0;
|
uint row_count = 0;
|
||||||
bool mem_ok = mem_error == NO_ERROR;
|
bool mem_ok = mem_error == NO_ERROR;
|
||||||
|
if (gl_LocalInvocationID.y == 0) {
|
||||||
if (element_ix < conf.n_elements) {
|
if (element_ix < conf.n_elements) {
|
||||||
AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
|
AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);
|
||||||
switch (tag.tag) {
|
switch (tag.tag) {
|
||||||
|
@ -68,21 +80,24 @@ void main() {
|
||||||
sh_row_alloc[th_ix] = path_alloc;
|
sh_row_alloc[th_ix] = path_alloc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sh_row_count[th_ix] = row_count;
|
sh_row_count[th_ix] = row_count;
|
||||||
|
}
|
||||||
|
|
||||||
// Prefix sum of sh_row_count
|
// Prefix sum of sh_row_count
|
||||||
for (uint i = 0; i < LG_BACKDROP_WG; i++) {
|
for (uint i = 0; i < LG_BACKDROP_WG; i++) {
|
||||||
barrier();
|
barrier();
|
||||||
if (th_ix >= (1 << i)) {
|
if (gl_LocalInvocationID.y == 0 && th_ix >= (1 << i)) {
|
||||||
row_count += sh_row_count[th_ix - (1 << i)];
|
row_count += sh_row_count[th_ix - (1 << i)];
|
||||||
}
|
}
|
||||||
barrier();
|
barrier();
|
||||||
|
if (gl_LocalInvocationID.y == 0) {
|
||||||
sh_row_count[th_ix] = row_count;
|
sh_row_count[th_ix] = row_count;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
barrier();
|
barrier();
|
||||||
// Work assignment: 1 thread : 1 path element row
|
// Work assignment: 1 thread : 1 path element row
|
||||||
uint total_rows = sh_row_count[BACKDROP_WG - 1];
|
uint total_rows = sh_row_count[BACKDROP_WG - 1];
|
||||||
for (uint row = th_ix; row < total_rows; row += BACKDROP_WG) {
|
for (uint row = th_ix; row < total_rows; row += BACKDROP_WG * BACKDROP_DIST_FACTOR) {
|
||||||
// Binary search to find element
|
// Binary search to find element
|
||||||
uint el_ix = 0;
|
uint el_ix = 0;
|
||||||
for (uint i = 0; i < LG_BACKDROP_WG; i++) {
|
for (uint i = 0; i < LG_BACKDROP_WG; i++) {
|
||||||
|
|
Binary file not shown.
BIN
piet-gpu/shader/backdrop_lg.spv
Normal file
BIN
piet-gpu/shader/backdrop_lg.spv
Normal file
Binary file not shown.
|
@ -18,6 +18,9 @@ build path_coarse.spv: glsl path_coarse.comp | annotated.h pathseg.h tile.h setu
|
||||||
|
|
||||||
build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
|
build backdrop.spv: glsl backdrop.comp | annotated.h tile.h setup.h
|
||||||
|
|
||||||
|
build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
|
||||||
|
flags = -DBACKDROP_DIST_FACTOR=4
|
||||||
|
|
||||||
build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
|
build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
|
||||||
|
|
||||||
build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
|
build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
|
||||||
|
|
|
@ -311,8 +311,13 @@ impl Renderer {
|
||||||
let path_ds = session
|
let path_ds = session
|
||||||
.create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
|
.create_simple_descriptor_set(&path_pipeline, &[&memory_buf_dev, &config_buf])?;
|
||||||
|
|
||||||
let backdrop_alloc_code = ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"));
|
let backdrop_code = if session.gpu_info().workgroup_limits.max_invocations >= 1024 {
|
||||||
let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?;
|
ShaderCode::Spv(include_bytes!("../shader/backdrop_lg.spv"))
|
||||||
|
} else {
|
||||||
|
println!("using small workgroup backdrop kernel");
|
||||||
|
ShaderCode::Spv(include_bytes!("../shader/backdrop.spv"))
|
||||||
|
};
|
||||||
|
let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_code, 2)?;
|
||||||
let backdrop_ds = session
|
let backdrop_ds = session
|
||||||
.create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;
|
.create_simple_descriptor_set(&backdrop_pipeline, &[&memory_buf_dev, &config_buf])?;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue