Merge pull request #219 from linebender/squeeze

Reduce resource requirements of shaders
This commit is contained in:
Raph Levien 2022-11-29 19:02:15 -08:00 committed by GitHub
commit 571822248c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
21 changed files with 99 additions and 79 deletions

View file

@ -9,7 +9,7 @@ struct Tile {
#import config
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage, read_write> tiles: array<Tile>;

View file

@ -6,7 +6,7 @@
#import tile
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> paths: array<Path>;

View file

@ -3,7 +3,7 @@
#import config
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
struct PathBbox {
x0: i32,

View file

@ -8,7 +8,7 @@
#import bump
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> draw_monoids: array<DrawMonoid>;
@ -46,9 +46,11 @@ let SY = 0.00390625;
let WG_SIZE = 256u;
let N_SLICE = 8u;
//let N_SLICE = WG_SIZE / 32u;
let N_SUBSLICE = 4u;
var<workgroup> sh_bitmaps: array<array<atomic<u32>, N_TILE>, N_SLICE>;
var<workgroup> sh_count: array<array<u32, N_TILE>, N_SLICE>;
// store count values packed two u16's to a u32
var<workgroup> sh_count: array<array<u32, N_TILE>, N_SUBSLICE>;
var<workgroup> sh_chunk_offset: array<u32, N_TILE>;
@compute @workgroup_size(256)
@ -115,9 +117,13 @@ fn main(
workgroupBarrier();
// Allocate output segments
var element_count = 0u;
for (var i = 0u; i < N_SLICE; i += 1u) {
element_count += countOneBits(atomicLoad(&sh_bitmaps[i][local_id.x]));
sh_count[i][local_id.x] = element_count;
for (var i = 0u; i < N_SUBSLICE; i += 1u) {
element_count += countOneBits(atomicLoad(&sh_bitmaps[i * 2u][local_id.x]));
let element_count_lo = element_count;
element_count += countOneBits(atomicLoad(&sh_bitmaps[i * 2u + 1u][local_id.x]));
let element_count_hi = element_count;
let element_count_packed = element_count_lo | (element_count_hi << 16u);
sh_count[i][local_id.x] = element_count_packed;
}
// element_count is the number of draw objects covering this thread's bin
let chunk_offset = atomicAdd(&bump.binning, element_count);
@ -136,9 +142,11 @@ fn main(
if (out_mask & my_mask) != 0u {
var idx = countOneBits(out_mask & (my_mask - 1u));
if my_slice > 0u {
idx += sh_count[my_slice - 1u][bin_ix];
let count_ix = my_slice - 1u;
let count_packed = sh_count[count_ix / 2u][bin_ix];
idx += (count_packed >> (16u * (count_ix & 1u))) & 0xffffu;
}
let offset = sh_chunk_offset[bin_ix];
let offset = config.bin_data_start + sh_chunk_offset[bin_ix];
bin_data[offset + idx] = element_ix;
}
x += 1;

View file

@ -6,7 +6,7 @@
#import drawtag
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> clip_inp: array<ClipInp>;

View file

@ -5,7 +5,7 @@
#import clip
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> clip_inp: array<ClipInp>;

View file

@ -9,7 +9,7 @@
#import tile
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;
@ -27,7 +27,7 @@ struct BinHeader {
var<storage> bin_headers: array<BinHeader>;
@group(0) @binding(4)
var<storage> bin_data: array<u32>;
var<storage> info_bin_data: array<u32>;
@group(0) @binding(5)
var<storage> paths: array<Path>;
@ -36,12 +36,9 @@ var<storage> paths: array<Path>;
var<storage> tiles: array<Tile>;
@group(0) @binding(7)
var<storage> info: array<u32>;
@group(0) @binding(8)
var<storage, read_write> bump: BumpAllocators;
@group(0) @binding(9)
@group(0) @binding(8)
var<storage, read_write> ptcl: array<u32>;
@ -208,8 +205,8 @@ fn main(
}
}
ix -= select(part_start_ix, sh_part_count[part_ix - 1u], part_ix > 0u);
let offset = sh_part_offsets[part_ix];
sh_drawobj_ix[local_id.x] = bin_data[offset + ix];
let offset = config.bin_data_start + sh_part_offsets[part_ix];
sh_drawobj_ix[local_id.x] = info_bin_data[offset + ix];
}
wr_ix = min(rd_ix + N_TILE, ready_ix);
if wr_ix - rd_ix >= N_TILE || (wr_ix >= ready_ix && partition_ix >= n_partitions) {
@ -326,14 +323,14 @@ fn main(
switch drawtag {
// DRAWTAG_FILL_COLOR
case 0x44u: {
let linewidth = bitcast<f32>(info[di]);
let linewidth = bitcast<f32>(info_bin_data[di]);
write_path(tile, linewidth);
let rgba_color = scene[dd];
write_color(CmdColor(rgba_color));
}
// DRAWTAG_FILL_LIN_GRADIENT
case 0x114u: {
let linewidth = bitcast<f32>(info[di]);
let linewidth = bitcast<f32>(info_bin_data[di]);
write_path(tile, linewidth);
let index = scene[dd];
let info_offset = di + 1u;
@ -341,7 +338,7 @@ fn main(
}
// DRAWTAG_FILL_RAD_GRADIENT
case 0x2dcu: {
let linewidth = bitcast<f32>(info[di]);
let linewidth = bitcast<f32>(info_bin_data[di]);
write_path(tile, linewidth);
let index = scene[dd];
let info_offset = di + 1u;

View file

@ -8,7 +8,7 @@
#import bbox
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;

View file

@ -4,7 +4,7 @@
#import drawtag
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;

View file

@ -13,7 +13,7 @@ struct Tile {
#import config
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> tiles: array<Tile>;

View file

@ -4,7 +4,7 @@
#import pathtag
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;

View file

@ -10,7 +10,7 @@
#import bump
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;

View file

@ -16,7 +16,7 @@
#import cubic
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;

View file

@ -4,7 +4,7 @@
#import pathtag
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;

View file

@ -4,7 +4,7 @@
#import pathtag
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;

View file

@ -11,6 +11,10 @@ struct Config {
n_path: u32,
n_clip: u32,
// To reduce the number of bindings, info and bin data are combined
// into one buffer.
bin_data_start: u32,
// offsets within scene buffer (in u32 units)
// Note: this is a difference from piet-gpu, which is in bytes
pathtag_base: u32,

View file

@ -8,7 +8,7 @@
#import tile
@group(0) @binding(0)
var<storage> config: Config;
var<uniform> config: Config;
@group(0) @binding(1)
var<storage> scene: array<u32>;

View file

@ -86,6 +86,7 @@ pub enum ExternalResource<'a> {
pub enum Command {
Upload(BufProxy, Vec<u8>),
UploadUniform(BufProxy, Vec<u8>),
UploadImage(ImageProxy, Vec<u8>),
// Discussion question: third argument is vec of resources?
// Maybe use tricks to make more ergonomic?
@ -107,6 +108,8 @@ pub enum BindType {
Buffer,
/// A storage buffer with read only access.
BufReadOnly,
/// A small storage buffer to be used as uniforms.
Uniform,
/// A storage image.
Image(ImageFormat),
/// A storage image with read only access.
@ -158,6 +161,16 @@ impl Engine {
},
count: None,
},
BindType::Uniform => wgpu::BindGroupLayoutEntry {
binding: i as u32,
visibility: wgpu::ShaderStages::COMPUTE,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Uniform,
has_dynamic_offset: false,
min_binding_size: None,
},
count: None,
},
BindType::Image(format) | BindType::ImageRead(format) => {
wgpu::BindGroupLayoutEntry {
binding: i as u32,
@ -229,6 +242,14 @@ impl Engine {
});
bind_map.insert_buf(buf_proxy.id, buf);
}
Command::UploadUniform(buf_proxy, bytes) => {
let buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: None,
contents: &bytes,
usage: wgpu::BufferUsages::UNIFORM,
});
bind_map.insert_buf(buf_proxy.id, buf);
}
Command::UploadImage(image_proxy, bytes) => {
let buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: None,
@ -308,7 +329,7 @@ impl Engine {
}
Command::Clear(proxy, offset, size) => {
let buffer = bind_map.get_or_create(*proxy, device)?;
encoder.clear_buffer(buffer, *offset, *size)
encoder.clear_buffer(buffer, *offset, *size);
}
}
}
@ -329,6 +350,13 @@ impl Recording {
buf_proxy
}
pub fn upload_uniform(&mut self, data: impl Into<Vec<u8>>) -> BufProxy {
let data = data.into();
let buf_proxy = BufProxy::new(data.len() as u64);
self.push(Command::UploadUniform(buf_proxy, data));
buf_proxy
}
pub fn upload_image(
&mut self,
width: u32,

View file

@ -33,6 +33,7 @@ struct Config {
n_drawobj: u32,
n_path: u32,
n_clip: u32,
bin_data_start: u32,
pathtag_base: u32,
pathdata_base: u32,
drawtag_base: u32,
@ -85,7 +86,7 @@ fn render(scene: &Scene, shaders: &Shaders) -> (Recording, BufProxy) {
..Default::default()
};
let scene_buf = recording.upload(scene);
let config_buf = recording.upload(bytemuck::bytes_of(&config).to_owned());
let config_buf = recording.upload_uniform(bytemuck::bytes_of(&config));
let reduced_buf = BufProxy::new(pathtag_wgs as u64 * TAG_MONOID_SIZE);
// TODO: really only need pathtag_wgs - 1
@ -209,6 +210,7 @@ pub fn render_full(
// TODO: calculate for real when we do rectangles
let n_drawobj = n_path;
let n_clip = data.n_clip;
let bin_data_start = n_drawobj * MAX_DRAWINFO_SIZE as u32;
let new_width = next_multiple_of(width, 16);
let new_height = next_multiple_of(height, 16);
@ -222,6 +224,7 @@ pub fn render_full(
n_drawobj,
n_path,
n_clip,
bin_data_start,
pathtag_base,
pathdata_base,
drawtag_base,
@ -231,7 +234,7 @@ pub fn render_full(
};
// println!("{:?}", config);
let scene_buf = ResourceProxy::Buf(recording.upload(scene));
let config_buf = ResourceProxy::Buf(recording.upload(bytemuck::bytes_of(&config).to_owned()));
let config_buf = ResourceProxy::Buf(recording.upload_uniform(bytemuck::bytes_of(&config)));
let pathtag_wgs = pathtag_padded / (4 * shaders::PATHTAG_REDUCE_WG as usize);
let reduced_buf = ResourceProxy::new_buf(pathtag_wgs as u64 * TAG_MONOID_FULL_SIZE);
@ -278,7 +281,7 @@ pub fn render_full(
[config_buf, scene_buf, draw_reduced_buf],
);
let draw_monoid_buf = ResourceProxy::new_buf(n_drawobj as u64 * DRAWMONOID_SIZE);
let info_buf = ResourceProxy::new_buf(n_drawobj as u64 * MAX_DRAWINFO_SIZE);
let info_bin_data_buf = ResourceProxy::new_buf(1 << 20);
let clip_inp_buf = ResourceProxy::new_buf(data.n_clip as u64 * CLIP_INP_SIZE);
recording.dispatch(
shaders.draw_leaf,
@ -289,7 +292,7 @@ pub fn render_full(
draw_reduced_buf,
path_bbox_buf,
draw_monoid_buf,
info_buf,
info_bin_data_buf,
clip_inp_buf,
],
);
@ -329,7 +332,6 @@ pub fn render_full(
}
let draw_bbox_buf = ResourceProxy::new_buf(n_path as u64 * DRAW_BBOX_SIZE);
let bump_buf = BufProxy::new(BUMP_SIZE);
let bin_data_buf = ResourceProxy::new_buf(1 << 20);
let width_in_bins = (config.width_in_tiles + 15) / 16;
let height_in_bins = (config.height_in_tiles + 15) / 16;
let n_bins = width_in_bins * height_in_bins;
@ -346,7 +348,7 @@ pub fn render_full(
clip_bbox_buf,
draw_bbox_buf,
bump_buf,
bin_data_buf,
info_bin_data_buf,
bin_header_buf,
],
);
@ -395,10 +397,9 @@ pub fn render_full(
scene_buf,
draw_monoid_buf,
bin_header_buf,
bin_data_buf,
info_bin_data_buf,
path_buf,
tile_buf,
info_buf,
bump_buf,
ptcl_buf,
],
@ -414,7 +415,7 @@ pub fn render_full(
ResourceProxy::Image(out_image),
ptcl_buf,
gradient_image,
info_buf,
info_bin_data_buf,
],
);
(recording, ResourceProxy::Image(out_image))

View file

@ -71,17 +71,13 @@ pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Err
let pathtag_reduce = engine.add_shader(
device,
preprocess::preprocess(shader!("pathtag_reduce"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
],
&[BindType::Uniform, BindType::BufReadOnly, BindType::Buffer],
)?;
let pathtag_scan = engine.add_shader(
device,
preprocess::preprocess(shader!("pathtag_scan"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
@ -94,7 +90,7 @@ pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Err
device,
preprocess::preprocess(shader!("path_coarse"), &path_coarse_config, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
@ -104,13 +100,13 @@ pub fn init_shaders(device: &Device, engine: &mut Engine) -> Result<Shaders, Err
let backdrop = engine.add_shader(
device,
preprocess::preprocess(shader!("backdrop"), &empty, &imports).into(),
&[BindType::BufReadOnly, BindType::Buffer],
&[BindType::Uniform, BindType::Buffer],
)?;
let fine = engine.add_shader(
device,
preprocess::preprocess(shader!("fine"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
@ -136,17 +132,13 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
let pathtag_reduce = engine.add_shader(
device,
preprocess::preprocess(shader!("pathtag_reduce"), &full_config, &imports).into(),
&[
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
],
&[BindType::Uniform, BindType::BufReadOnly, BindType::Buffer],
)?;
let pathtag_scan = engine.add_shader(
device,
preprocess::preprocess(shader!("pathtag_scan"), &full_config, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
@ -155,13 +147,13 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
let bbox_clear = engine.add_shader(
device,
preprocess::preprocess(shader!("bbox_clear"), &empty, &imports).into(),
&[BindType::BufReadOnly, BindType::Buffer],
&[BindType::Uniform, BindType::Buffer],
)?;
let pathseg = engine.add_shader(
device,
preprocess::preprocess(shader!("pathseg"), &full_config, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
@ -171,17 +163,13 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
let draw_reduce = engine.add_shader(
device,
preprocess::preprocess(shader!("draw_reduce"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
],
&[BindType::Uniform, BindType::BufReadOnly, BindType::Buffer],
)?;
let draw_leaf = engine.add_shader(
device,
preprocess::preprocess(shader!("draw_leaf"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::BufReadOnly,
@ -194,7 +182,7 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
device,
preprocess::preprocess(shader!("clip_reduce"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
@ -205,7 +193,7 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
device,
preprocess::preprocess(shader!("clip_leaf"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::BufReadOnly,
@ -218,7 +206,7 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
device,
preprocess::preprocess(shader!("binning"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::BufReadOnly,
@ -232,7 +220,7 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
device,
preprocess::preprocess(shader!("tile_alloc"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
@ -245,7 +233,7 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
device,
preprocess::preprocess(shader!("path_coarse_full"), &full_config, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::BufReadOnly,
@ -258,18 +246,13 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
let backdrop = engine.add_shader(
device,
preprocess::preprocess(shader!("backdrop_dyn"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
],
&[BindType::Uniform, BindType::BufReadOnly, BindType::Buffer],
)?;
let coarse = engine.add_shader(
device,
preprocess::preprocess(shader!("coarse"), &empty, &imports).into(),
&[
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::BufReadOnly,
@ -284,7 +267,7 @@ pub fn full_shaders(device: &Device, engine: &mut Engine) -> Result<FullShaders,
device,
preprocess::preprocess(shader!("fine"), &full_config, &imports).into(),
&[
BindType::BufReadOnly,
BindType::Uniform,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Image(ImageFormat::Rgba8),

View file

@ -34,7 +34,6 @@ impl RenderContext {
let adapter = instance.request_adapter(&Default::default()).await.unwrap();
let features = adapter.features();
let mut limits = Limits::default();
limits.max_storage_buffers_per_shader_stage = 16;
let (device, queue) = adapter
.request_device(
&wgpu::DeviceDescriptor {