mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
Start writing tiles
This is the first checkpoint where it actually runs a pipeline end to end, though it's far from accurate.
This commit is contained in:
parent
06cad48dca
commit
3a6428238b
|
@ -41,7 +41,7 @@ fn main() -> Result<(), Error> {
|
||||||
|
|
||||||
let fence = device.create_fence(false)?;
|
let fence = device.create_fence(false)?;
|
||||||
let mut cmd_buf = device.create_cmd_buf()?;
|
let mut cmd_buf = device.create_cmd_buf()?;
|
||||||
let query_pool = device.create_query_pool(4)?;
|
let query_pool = device.create_query_pool(5)?;
|
||||||
|
|
||||||
let mut ctx = PietGpuRenderContext::new();
|
let mut ctx = PietGpuRenderContext::new();
|
||||||
render_scene(&mut ctx);
|
render_scene(&mut ctx);
|
||||||
|
@ -62,10 +62,11 @@ fn main() -> Result<(), Error> {
|
||||||
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
|
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
|
||||||
println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
|
println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
|
||||||
println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
|
println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
|
||||||
|
println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
let mut data: Vec<u32> = Default::default();
|
let mut data: Vec<u32> = Default::default();
|
||||||
device.read_buffer(&renderer.bin_buf, &mut data).unwrap();
|
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
|
||||||
piet_gpu::dump_k1_data(&data);
|
piet_gpu::dump_k1_data(&data);
|
||||||
|
|
||||||
let mut data: Vec<u32> = Default::default();
|
let mut data: Vec<u32> = Default::default();
|
||||||
|
|
|
@ -46,6 +46,17 @@ shared uint sh_bitmaps[N_SLICE][N_TILE];
|
||||||
#define SX (1.0 / float(TILE_WIDTH_PX))
|
#define SX (1.0 / float(TILE_WIDTH_PX))
|
||||||
#define SY (1.0 / float(TILE_HEIGHT_PX))
|
#define SY (1.0 / float(TILE_HEIGHT_PX))
|
||||||
|
|
||||||
|
// Perhaps cmd_limit should be a global? This is a style question.
|
||||||
|
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
|
||||||
|
if (cmd_ref.offset > cmd_limit) {
|
||||||
|
uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
|
||||||
|
CmdJump jump = CmdJump(new_cmd);
|
||||||
|
Cmd_Jump_write(cmd_ref, jump);
|
||||||
|
cmd_ref = CmdRef(new_cmd);
|
||||||
|
cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
// Could use either linear or 2d layouts for both dispatch and
|
// Could use either linear or 2d layouts for both dispatch and
|
||||||
// invocations within the workgroup. We'll use variables to abstract.
|
// invocations within the workgroup. We'll use variables to abstract.
|
||||||
|
@ -53,6 +64,13 @@ void main() {
|
||||||
// Top left coordinates of this bin.
|
// Top left coordinates of this bin.
|
||||||
vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
|
vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
|
||||||
uint th_ix = gl_LocalInvocationID.x;
|
uint th_ix = gl_LocalInvocationID.x;
|
||||||
|
|
||||||
|
uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
|
||||||
|
uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
|
||||||
|
uint tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
|
||||||
|
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
|
||||||
|
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
|
||||||
|
|
||||||
uint wr_ix = 0;
|
uint wr_ix = 0;
|
||||||
uint rd_ix = 0;
|
uint rd_ix = 0;
|
||||||
uint first_el;
|
uint first_el;
|
||||||
|
@ -172,6 +190,7 @@ void main() {
|
||||||
y++;
|
y++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
barrier();
|
||||||
|
|
||||||
// Output elements for this tile, based on bitmaps.
|
// Output elements for this tile, based on bitmaps.
|
||||||
uint slice_ix = 0;
|
uint slice_ix = 0;
|
||||||
|
@ -193,13 +212,25 @@ void main() {
|
||||||
// At this point, we read the element again from global memory.
|
// At this point, we read the element again from global memory.
|
||||||
// If that turns out to be expensive, maybe we can pack it into
|
// If that turns out to be expensive, maybe we can pack it into
|
||||||
// shared memory (or perhaps just the tag).
|
// shared memory (or perhaps just the tag).
|
||||||
probe += 1;
|
ref = AnnotatedRef(element_ix * Annotated_size);
|
||||||
|
tag = Annotated_tag(ref);
|
||||||
|
|
||||||
|
switch (tag) {
|
||||||
|
case Annotated_Fill:
|
||||||
|
case Annotated_Stroke:
|
||||||
|
// Note: we take advantage of the fact that fills and strokes
|
||||||
|
// have compatible layout.
|
||||||
|
AnnoFill fill = Annotated_Fill_read(ref);
|
||||||
|
alloc_cmd(cmd_ref, cmd_limit);
|
||||||
|
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
// clear LSB
|
// clear LSB
|
||||||
bitmap &= bitmap - 1;
|
bitmap &= bitmap - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
rd_ix += N_TILE;
|
rd_ix += N_TILE;
|
||||||
|
break;
|
||||||
} while (wr_ix > rd_ix);
|
} while (wr_ix > rd_ix);
|
||||||
ptcl[bin_ix * N_TILE + th_ix] = probe;
|
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
|
@ -39,7 +39,8 @@ void main() {
|
||||||
uvec2 xy_uint = gl_GlobalInvocationID.xy;
|
uvec2 xy_uint = gl_GlobalInvocationID.xy;
|
||||||
vec2 xy = vec2(xy_uint);
|
vec2 xy = vec2(xy_uint);
|
||||||
vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT);
|
vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT);
|
||||||
vec3 rgb = uv.xyy;
|
//vec3 rgb = uv.xyy;
|
||||||
|
vec3 rgb = vec3(0.75);
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
uint tag = Cmd_tag(cmd_ref);
|
uint tag = Cmd_tag(cmd_ref);
|
||||||
|
|
Binary file not shown.
|
@ -30,7 +30,7 @@ const PTCL_INITIAL_ALLOC: usize = 1024;
|
||||||
|
|
||||||
const K2_PER_TILE_SIZE: usize = 8;
|
const K2_PER_TILE_SIZE: usize = 8;
|
||||||
|
|
||||||
const N_CIRCLES: usize = 1;
|
const N_CIRCLES: usize = 0;
|
||||||
|
|
||||||
const N_WG: u32 = 16;
|
const N_WG: u32 = 16;
|
||||||
|
|
||||||
|
@ -47,11 +47,13 @@ pub fn render_scene(rc: &mut impl RenderContext) {
|
||||||
rc.fill(circle, &color);
|
rc.fill(circle, &color);
|
||||||
}
|
}
|
||||||
let mut path = BezPath::new();
|
let mut path = BezPath::new();
|
||||||
|
/*
|
||||||
path.move_to((100.0, 1150.0));
|
path.move_to((100.0, 1150.0));
|
||||||
path.line_to((200.0, 1200.0));
|
path.line_to((200.0, 1200.0));
|
||||||
path.line_to((150.0, 1250.0));
|
path.line_to((150.0, 1250.0));
|
||||||
path.close_path();
|
path.close_path();
|
||||||
rc.fill(path, &Color::rgb8(128, 0, 128));
|
rc.fill(path, &Color::rgb8(128, 0, 128));
|
||||||
|
*/
|
||||||
rc.stroke(
|
rc.stroke(
|
||||||
Line::new((100.0, 100.0), (200.0, 150.0)),
|
Line::new((100.0, 100.0), (200.0, 150.0)),
|
||||||
&Color::WHITE,
|
&Color::WHITE,
|
||||||
|
@ -134,29 +136,9 @@ pub struct Renderer<D: Device> {
|
||||||
coarse_alloc_buf_host: D::Buffer,
|
coarse_alloc_buf_host: D::Buffer,
|
||||||
coarse_alloc_buf_dev: D::Buffer,
|
coarse_alloc_buf_dev: D::Buffer,
|
||||||
|
|
||||||
/*
|
|
||||||
k1_alloc_buf_host: D::Buffer,
|
|
||||||
k1_alloc_buf_dev: D::Buffer,
|
|
||||||
k2s_alloc_buf_host: D::Buffer,
|
|
||||||
k2s_alloc_buf_dev: D::Buffer,
|
|
||||||
k2f_alloc_buf_host: D::Buffer,
|
|
||||||
k2f_alloc_buf_dev: D::Buffer,
|
|
||||||
k3_alloc_buf_host: D::Buffer,
|
|
||||||
k3_alloc_buf_dev: D::Buffer,
|
|
||||||
tilegroup_buf: D::Buffer,
|
|
||||||
ptcl_buf: D::Buffer,
|
|
||||||
|
|
||||||
k1_pipeline: D::Pipeline,
|
|
||||||
k1_ds: D::DescriptorSet,
|
|
||||||
k2s_pipeline: D::Pipeline,
|
|
||||||
k2s_ds: D::DescriptorSet,
|
|
||||||
k2f_pipeline: D::Pipeline,
|
|
||||||
k2f_ds: D::DescriptorSet,
|
|
||||||
k3_pipeline: D::Pipeline,
|
|
||||||
k3_ds: D::DescriptorSet,
|
|
||||||
k4_pipeline: D::Pipeline,
|
k4_pipeline: D::Pipeline,
|
||||||
k4_ds: D::DescriptorSet,
|
k4_ds: D::DescriptorSet,
|
||||||
*/
|
|
||||||
n_elements: usize,
|
n_elements: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -213,10 +195,10 @@ impl<D: Device> Renderer<D> {
|
||||||
let coarse_alloc_buf_host = device.create_buffer(4, host)?;
|
let coarse_alloc_buf_host = device.create_buffer(4, host)?;
|
||||||
let coarse_alloc_buf_dev = device.create_buffer(4, dev)?;
|
let coarse_alloc_buf_dev = device.create_buffer(4, dev)?;
|
||||||
|
|
||||||
let coarse_alloc_start = 256 * 64 * N_WG;
|
let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
|
||||||
device
|
device
|
||||||
.write_buffer(&coarse_alloc_buf_host, &[
|
.write_buffer(&coarse_alloc_buf_host, &[
|
||||||
coarse_alloc_start,
|
coarse_alloc_start as u32,
|
||||||
])
|
])
|
||||||
?;
|
?;
|
||||||
let coarse_code = include_bytes!("../shader/coarse.spv");
|
let coarse_code = include_bytes!("../shader/coarse.spv");
|
||||||
|
@ -227,72 +209,11 @@ impl<D: Device> Renderer<D> {
|
||||||
&[],
|
&[],
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
/*
|
// These will probably be combined with the ptcl buf, as they're all written by the
|
||||||
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
|
// same kernel now.
|
||||||
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
|
|
||||||
let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||||
let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
|
||||||
|
|
||||||
let k1_alloc_buf_host = device.create_buffer(4, host)?;
|
|
||||||
let k1_alloc_buf_dev = device.create_buffer(4, dev)?;
|
|
||||||
let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
|
|
||||||
device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?;
|
|
||||||
let k1_code = include_bytes!("../shader/kernel1.spv");
|
|
||||||
let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 3, 0)?;
|
|
||||||
let k1_ds = device.create_descriptor_set(
|
|
||||||
&k1_pipeline,
|
|
||||||
&[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
|
|
||||||
&[],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let k2s_alloc_buf_host = device.create_buffer(4, host)?;
|
|
||||||
let k2s_alloc_buf_dev = device.create_buffer(4, dev)?;
|
|
||||||
let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
|
|
||||||
device.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])?;
|
|
||||||
let k2s_code = include_bytes!("../shader/kernel2s.spv");
|
|
||||||
let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4, 0)?;
|
|
||||||
let k2s_ds = device.create_descriptor_set(
|
|
||||||
&k2s_pipeline,
|
|
||||||
&[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
|
|
||||||
&[],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let k2f_alloc_buf_host = device.create_buffer(4, host)?;
|
|
||||||
let k2f_alloc_buf_dev = device.create_buffer(4, dev)?;
|
|
||||||
let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
|
|
||||||
device.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])?;
|
|
||||||
let k2f_code = include_bytes!("../shader/kernel2f.spv");
|
|
||||||
let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?;
|
|
||||||
let k2f_ds = device.create_descriptor_set(
|
|
||||||
&k2f_pipeline,
|
|
||||||
&[
|
|
||||||
&scene_dev,
|
|
||||||
&tilegroup_buf,
|
|
||||||
&fill_seg_buf,
|
|
||||||
&k2f_alloc_buf_dev,
|
|
||||||
],
|
|
||||||
&[],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let k3_alloc_buf_host = device.create_buffer(4, host)?;
|
|
||||||
let k3_alloc_buf_dev = device.create_buffer(4, dev)?;
|
|
||||||
let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
|
|
||||||
device.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])?;
|
|
||||||
let k3_code = include_bytes!("../shader/kernel3.spv");
|
|
||||||
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?;
|
|
||||||
let k3_ds = device.create_descriptor_set(
|
|
||||||
&k3_pipeline,
|
|
||||||
&[
|
|
||||||
&scene_dev,
|
|
||||||
&tilegroup_buf,
|
|
||||||
&segment_buf,
|
|
||||||
&fill_seg_buf,
|
|
||||||
&ptcl_buf,
|
|
||||||
&k3_alloc_buf_dev,
|
|
||||||
],
|
|
||||||
&[],
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let k4_code = include_bytes!("../shader/kernel4.spv");
|
let k4_code = include_bytes!("../shader/kernel4.spv");
|
||||||
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
|
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
|
||||||
let k4_ds = device.create_descriptor_set(
|
let k4_ds = device.create_descriptor_set(
|
||||||
|
@ -300,7 +221,6 @@ impl<D: Device> Renderer<D> {
|
||||||
&[&ptcl_buf, &segment_buf, &fill_seg_buf],
|
&[&ptcl_buf, &segment_buf, &fill_seg_buf],
|
||||||
&[&image_dev],
|
&[&image_dev],
|
||||||
)?;
|
)?;
|
||||||
*/
|
|
||||||
|
|
||||||
Ok(Renderer {
|
Ok(Renderer {
|
||||||
scene_buf,
|
scene_buf,
|
||||||
|
@ -312,6 +232,8 @@ impl<D: Device> Renderer<D> {
|
||||||
bin_ds,
|
bin_ds,
|
||||||
coarse_pipeline,
|
coarse_pipeline,
|
||||||
coarse_ds,
|
coarse_ds,
|
||||||
|
k4_pipeline,
|
||||||
|
k4_ds,
|
||||||
state_buf,
|
state_buf,
|
||||||
anno_buf,
|
anno_buf,
|
||||||
bin_buf,
|
bin_buf,
|
||||||
|
@ -339,7 +261,7 @@ impl<D: Device> Renderer<D> {
|
||||||
cmd_buf.dispatch(
|
cmd_buf.dispatch(
|
||||||
&self.el_pipeline,
|
&self.el_pipeline,
|
||||||
&self.el_ds,
|
&self.el_ds,
|
||||||
((self.n_elements / 128) as u32, 1, 1),
|
(((self.n_elements + 127) / 128) as u32, 1, 1),
|
||||||
);
|
);
|
||||||
cmd_buf.write_timestamp(&query_pool, 1);
|
cmd_buf.write_timestamp(&query_pool, 1);
|
||||||
cmd_buf.memory_barrier();
|
cmd_buf.memory_barrier();
|
||||||
|
@ -357,6 +279,13 @@ impl<D: Device> Renderer<D> {
|
||||||
);
|
);
|
||||||
cmd_buf.write_timestamp(&query_pool, 3);
|
cmd_buf.write_timestamp(&query_pool, 3);
|
||||||
cmd_buf.memory_barrier();
|
cmd_buf.memory_barrier();
|
||||||
|
cmd_buf.dispatch(
|
||||||
|
&self.k4_pipeline,
|
||||||
|
&self.k4_ds,
|
||||||
|
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
|
||||||
|
);
|
||||||
|
cmd_buf.write_timestamp(&query_pool, 4);
|
||||||
|
cmd_buf.memory_barrier();
|
||||||
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
|
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue