Experimenting with sort-middle

Starting a prototype that explores the sort-middle approach. This
commit has a prefix sum pass computing state per element.
This commit is contained in:
Raph Levien 2020-05-11 20:01:06 -07:00
parent 8d01aba237
commit 9a8854ffab
14 changed files with 762 additions and 268 deletions

View file

@ -3,5 +3,6 @@ pub mod fill_seg;
pub mod ptcl;
pub mod scene;
pub mod segment;
pub mod state;
pub mod test;
pub mod tilegroup;

View file

@ -5,6 +5,7 @@ fn main() {
.expect("provide a module name");
match mod_name.as_str() {
"scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
"state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
"tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
"segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
"fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),

View file

@ -4,6 +4,8 @@ pub use self::scene::{
Bbox, PietCircle, PietFill, PietItem, PietStrokeLine, PietStrokePolyLine, Point, SimpleGroup,
};
pub use self::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform};
piet_gpu! {
#[rust_encode]
mod scene {
@ -51,5 +53,46 @@ piet_gpu! {
Fill(PietFill),
Poly(PietStrokePolyLine),
}
// New approach follows (above to be deleted)
// A straight line segment from p0 to p1; each point is an [x, y] pair.
struct LineSeg {
p0: [f32; 2],
p1: [f32; 2],
}
// A quadratic Bezier segment: start point p0, control point p1, end point p2.
struct QuadSeg {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
}
// A cubic Bezier segment: start point p0, control points p1 and p2, end point p3.
struct CubicSeg {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
}
// Fill command carrying the fill color as a packed 32-bit value.
struct Fill {
rgba_color: u32,
}
// Stroke command carrying the stroke color as a packed 32-bit value.
struct Stroke {
rgba_color: u32,
}
// Sets the line width used for strokes.
struct SetLineWidth {
width: f32,
}
// An affine transform: `mat` holds the four entries of the 2x2 linear part and
// `translate` the offset. NOTE(review): elements.comp applies it as
// x' = mat[0]*x + mat[2]*y + translate[0], y' = mat[1]*x + mat[3]*y + translate[1].
struct Transform {
mat: [f32; 4],
translate: [f32; 2],
}
// One entry of the flat element stream: a tag followed by the payload of the
// selected variant.
enum Element {
Nop,
// The segments need a flag to indicate fill/stroke
Line(LineSeg),
Quad(QuadSeg),
Cubic(CubicSeg),
Stroke(Stroke),
Fill(Fill),
SetLineWidth(SetLineWidth),
Transform(Transform),
}
}
}

View file

@ -0,0 +1,14 @@
use piet_gpu_derive::piet_gpu;
// GPU-side type definitions for the per-element state written by the element
// processing shader.
piet_gpu! {
#[gpu_write]
mod state {
// Twelve 32-bit fields in total (4 + 2 + 4 + 1 + 1), i.e. 48 bytes per record.
struct State {
// Four entries of the 2x2 linear part of a transform.
mat: [f32; 4],
// Translation part of the transform.
translate: [f32; 2],
// Bounding box as four floats.
// NOTE(review): presumably (x0, y0, x1, y1) — confirm against the shader.
bbox: [f32; 4],
// Line width.
linewidth: f32,
// Bit flags; the bit assignments are defined by the shader that writes them.
flags: u32,
}
}
}

View file

@ -5,7 +5,7 @@ use std::path::Path;
use piet_gpu_hal::vulkan::VkInstance;
use piet_gpu_hal::{CmdBuf, Device, Error, MemFlags};
use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT};
use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
#[allow(unused)]
fn dump_scene(buf: &[u8]) {
@ -16,6 +16,24 @@ fn dump_scene(buf: &[u8]) {
}
}
#[allow(unused)]
/// Prints every complete 48-byte state record found in `buf` to stdout, one
/// record per line.
///
/// Each record is decoded as twelve little-endian 32-bit words: words 0..=10
/// are printed as `f32` (six values in brackets, four as a box, then one more
/// value) and word 11 is printed as an unsigned 32-bit flags value.
/// Trailing bytes that do not make up a whole record are ignored.
fn dump_state(buf: &[u8]) {
    /// Size in bytes of one encoded record.
    const STATE_SIZE: usize = 48;
    /// Number of leading `f32` words in a record; the flags word follows them.
    const N_FLOATS: usize = 11;

    for (i, record) in buf.chunks_exact(STATE_SIZE).enumerate() {
        // Extracts the k-th 32-bit word of the current record as raw bytes.
        let word = |k: usize| {
            let mut bytes = [0u8; 4];
            bytes.copy_from_slice(&record[k * 4..k * 4 + 4]);
            bytes
        };
        let floats: Vec<f32> = (0..N_FLOATS)
            .map(|k| f32::from_le_bytes(word(k)))
            .collect();
        // The flags field occupies a full 32-bit word; decode all of it rather
        // than only its lowest byte so that higher flag bits are not dropped.
        let flags = u32::from_le_bytes(word(N_FLOATS));
        println!(
            "{}: [{} {} {} {} {} {}] ({}, {})-({} {}) {} {}",
            i,
            floats[0],
            floats[1],
            floats[2],
            floats[3],
            floats[4],
            floats[5],
            floats[6],
            floats[7],
            floats[8],
            floats[9],
            floats[10],
            flags
        );
    }
}
fn main() -> Result<(), Error> {
let (instance, _) = VkInstance::new(None)?;
unsafe {
@ -23,7 +41,7 @@ fn main() -> Result<(), Error> {
let fence = device.create_fence(false)?;
let mut cmd_buf = device.create_cmd_buf()?;
let query_pool = device.create_query_pool(6)?;
let query_pool = device.create_query_pool(2)?;
let mut ctx = PietGpuRenderContext::new();
render_scene(&mut ctx);
@ -31,7 +49,8 @@ fn main() -> Result<(), Error> {
//dump_scene(&scene);
let renderer = Renderer::new(&device, scene)?;
let image_buf = device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
let image_buf =
device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
cmd_buf.begin();
renderer.record(&mut cmd_buf, &query_pool);
@ -40,28 +59,12 @@ fn main() -> Result<(), Error> {
device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?;
device.wait_and_reset(&[fence])?;
let timestamps = device.reap_query_pool(&query_pool).unwrap();
println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
println!(
"Kernel 2s time: {:.3}ms",
(timestamps[1] - timestamps[0]) * 1e3
);
println!(
"Kernel 2f time: {:.3}ms",
(timestamps[2] - timestamps[1]) * 1e3
);
println!(
"Kernel 3 time: {:.3}ms",
(timestamps[3] - timestamps[2]) * 1e3
);
println!(
"Render time: {:.3}ms",
(timestamps[4] - timestamps[3]) * 1e3
);
println!("Element kernel time: {:.3}ms", timestamps[0] * 1e3);
/*
let mut k1_data: Vec<u32> = Default::default();
device.read_buffer(&segment_buf, &mut k1_data).unwrap();
dump_k1_data(&k1_data);
let mut data: Vec<u8> = Default::default();
device.read_buffer(&renderer.state_buf, &mut data).unwrap();
dump_state(&data);
*/
let mut img_data: Vec<u8> = Default::default();

View file

@ -1,7 +1,7 @@
use piet_gpu_hal::vulkan::VkInstance;
use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout};
use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT};
use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
use winit::{
event::{Event, WindowEvent},
@ -69,7 +69,8 @@ fn main() -> Result<(), Error> {
device.wait_and_reset(&[frame_fences[frame_idx]]).unwrap();
let timestamps = device.reap_query_pool(query_pool).unwrap();
window.set_title(&format!("k1: {:.3}ms, k2s: {:.3}ms, k2f: {:.3}ms, k3: {:.3}ms, k4: {:.3}ms",
window.set_title(&format!(
"k1: {:.3}ms, k2s: {:.3}ms, k2f: {:.3}ms, k3: {:.3}ms, k4: {:.3}ms",
timestamps[0] * 1e3,
(timestamps[1] - timestamps[0]) * 1e3,
(timestamps[2] - timestamps[1]) * 1e3,
@ -93,11 +94,7 @@ fn main() -> Result<(), Error> {
ImageLayout::BlitDst,
);
cmd_buf.blit_image(&renderer.image_dev, &swap_image);
cmd_buf.image_barrier(
&swap_image,
ImageLayout::BlitDst,
ImageLayout::Present,
);
cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
cmd_buf.finish();
device

View file

@ -18,3 +18,6 @@ build kernel2f.spv: glsl kernel2f.comp | scene.h tilegroup.h fill_seg.h setup.h
build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h ptcl.h setup.h
build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h
build elements.spv: glsl elements.comp | scene.h state.h

View file

@ -0,0 +1,173 @@
#version 450
#extension GL_GOOGLE_include_directive : enable
// Number of consecutive elements processed by each invocation.
#define N_ROWS 4
// Invocations per workgroup, and its base-2 logarithm (the number of steps
// of the workgroup-wide scan).
#define WG_SIZE 32
#define LG_WG_SIZE 5
// Number of elements covered by one workgroup.
#define TILE_SIZE (WG_SIZE * N_ROWS)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
// Input: the encoded element stream, addressed as 32-bit words.
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
// This will be used for inter-workgroup aggregates
layout(set = 0, binding = 1) buffer StateBuf {
uint[] state;
};
#include "scene.h"
#include "state.h"
// Bits of State.flags.
// The state's linewidth field carries a value set by a SetLineWidth element.
#define FLAG_SET_LINEWIDTH 1
// Set for Fill and Stroke elements: the bounding box is not carried over into
// states combined after this one.
#define FLAG_RESET_BBOX 2
// Set on a combined state when a bounding-box reset took effect somewhere
// inside the span it summarizes, i.e. its bbox already excludes everything
// before that reset and must not be merged with the bbox of an earlier span.
#define FLAG_SET_BBOX 4

// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate)
//
// Combines the state `a` of a span of elements with the state `b` of the span
// that immediately follows it, and returns the state of the concatenation.
//
// Flag semantics of the result:
//   FLAG_SET_LINEWIDTH - sticky: set if either input has it.
//   FLAG_RESET_BBOX    - taken from `b` only: it describes the *last* element
//                        of a span, so it must not leak from `a` into later
//                        states (otherwise every state after the first
//                        Fill/Stroke would keep discarding its bbox).
//   FLAG_SET_BBOX      - sticky, and additionally set when `a` ended with a
//                        reset, recording that the reset has been applied.
State combine_state(State a, State b) {
    State c;
    // Bounding box of b mapped through a's transform (axis-aligned bound of
    // the transformed box).
    c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;

    bool a_resets = (a.flags & FLAG_RESET_BBOX) != 0;
    bool b_was_reset = (b.flags & FLAG_SET_BBOX) != 0;
    // a's bbox only contributes when no reset separates it from b's contents:
    // neither at the end of a, nor somewhere inside b.
    if (!a_resets && !b_was_reset) {
        if (b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
            // b contributes no area: keep a's box unchanged.
            c.bbox = a.bbox;
        } else if (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y) {
            // Both are non-empty: take the union.
            c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
            c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
        }
    }

    // It would be more concise to cast to matrix types; ah well.
    c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
    c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
    c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
    c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;

    // The most recent explicit line width wins.
    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;

    // Inherit only the sticky bits from a; FLAG_RESET_BBOX comes from b alone.
    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
    if (a_resets) {
        c.flags |= FLAG_SET_BBOX;
    }
    return c;
}
// Maps a single scene element to the State it contributes on its own:
// path segments contribute the bounding box of their points, Fill/Stroke
// request a bbox reset, SetLineWidth and Transform contribute their payload,
// and anything else (including Nop) yields the identity state.
State map_element(ElementRef ref) {
    // Start from the identity state: identity matrix, zero translation,
    // empty bbox, no line width, no flags.
    State s;
    s.mat = vec4(1.0, 0.0, 0.0, 1.0);
    s.translate = vec2(0.0, 0.0);
    s.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    s.linewidth = 0.0;
    s.flags = 0;

    // TODO: it would *probably* be more efficient to make the memory read patterns less
    // divergent, though it would be more wasted memory.
    uint tag = Element_tag(ref);
    if (tag == Element_Line) {
        LineSeg line = Element_Line_read(ref);
        s.bbox.xy = min(line.p0, line.p1);
        s.bbox.zw = max(line.p0, line.p1);
    } else if (tag == Element_Quad) {
        QuadSeg quad = Element_Quad_read(ref);
        s.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
        s.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
    } else if (tag == Element_Cubic) {
        CubicSeg cubic = Element_Cubic_read(ref);
        s.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
        s.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
    } else if (tag == Element_Fill || tag == Element_Stroke) {
        s.flags = FLAG_RESET_BBOX;
    } else if (tag == Element_SetLineWidth) {
        SetLineWidth lw = Element_SetLineWidth_read(ref);
        s.linewidth = lw.width;
        s.flags = FLAG_SET_LINEWIDTH;
    } else if (tag == Element_Transform) {
        Transform t = Element_Transform_read(ref);
        s.mat = t.mat;
        s.translate = t.translate;
    }
    return s;
}
// We should be able to use an array of structs but the NV shader compiler
// doesn't seem to like it :/
//shared State sh_state[WG_SIZE];
// Workgroup-shared copy of one State per invocation, stored as one array per
// field (mat, translate, bbox, linewidth, flags).
shared vec4 sh_mat[WG_SIZE];
shared vec2 sh_translate[WG_SIZE];
shared vec4 sh_bbox[WG_SIZE];
shared float sh_width[WG_SIZE];
shared uint sh_flags[WG_SIZE];
// Computes, for every element handled by this workgroup, the combined state
// of all elements of the workgroup's tile up to and including that element,
// and writes it to the state buffer.
//
// Steps: (1) each invocation folds its N_ROWS consecutive elements,
// (2) the per-invocation totals are scanned across the workgroup through
// shared memory, (3) each invocation prepends the total of all preceding
// invocations to its own partial results and writes them out.
// No state is carried across workgroups yet (see the TODOs below).
void main() {
// Running (inclusive) states of this invocation's own elements.
State th_state[N_ROWS];
// this becomes an atomic counter
uint tile_ix = gl_WorkGroupID.x;
// Index of the first element handled by this invocation.
uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
ElementRef ref = ElementRef(ix * Element_size);
th_state[0] = map_element(ref);
for (uint i = 1; i < N_ROWS; i++) {
// discussion question: would it be faster to load using more coherent patterns
// into thread memory? This is kinda strided.
th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
}
// Total over this invocation's elements; published to shared memory as the
// input of the workgroup scan.
State agg = th_state[N_ROWS - 1];
sh_mat[gl_LocalInvocationID.x] = agg.mat;
sh_translate[gl_LocalInvocationID.x] = agg.translate;
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
sh_flags[gl_LocalInvocationID.x] = agg.flags;
// Inclusive scan over the workgroup: in step i every invocation with index
// >= 2^i combines the value 2^i slots to its left in front of its own.
for (uint i = 0; i < LG_WG_SIZE; i++) {
// Wait until the values written in the previous step (or above) are in place.
barrier();
if (gl_LocalInvocationID.x >= (1 << i)) {
State other;
uint ix = gl_LocalInvocationID.x - (1 << i);
other.mat = sh_mat[ix];
other.translate = sh_translate[ix];
other.bbox = sh_bbox[ix];
other.linewidth = sh_width[ix];
other.flags = sh_flags[ix];
agg = combine_state(other, agg);
}
// Wait until every invocation has finished reading before overwriting.
barrier();
sh_mat[gl_LocalInvocationID.x] = agg.mat;
sh_translate[gl_LocalInvocationID.x] = agg.translate;
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
sh_flags[gl_LocalInvocationID.x] = agg.flags;
}
// TODO: if last invocation in wg, publish agg.
barrier();
// Identity state; stands in for the (not yet implemented) prefix contributed
// by preceding workgroups.
State exclusive;
exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
exclusive.translate = vec2(0.0, 0.0);
exclusive.linewidth = 0.0;
exclusive.flags = 0;
// TODO: do decoupled look-back
// `row` is the combined state of everything before this invocation's first
// element: the scanned total of the invocation to the left, if there is one.
State row = exclusive;
if (gl_LocalInvocationID.x > 0) {
uint ix = gl_LocalInvocationID.x - 1;
State other;
other.mat = sh_mat[ix];
other.translate = sh_translate[ix];
other.bbox = sh_bbox[ix];
other.linewidth = sh_width[ix];
other.flags = sh_flags[ix];
row = combine_state(row, other);
}
for (uint i = 0; i < N_ROWS; i++) {
State this_state = combine_state(row, th_state[i]);
// We write the state now for development purposes, but the
// actual goal is to write transformed and annotated elements.
State_write(StateRef((ix + i) * State_size), this_state);
}
}

Binary file not shown.

View file

@ -32,6 +32,38 @@ struct PietItemRef {
uint offset;
};
// Typed references to objects encoded in the scene buffer. Each one wraps a
// byte offset; the matching *_read functions convert it to a word index.
struct LineSegRef {
uint offset;
};
struct QuadSegRef {
uint offset;
};
struct CubicSegRef {
uint offset;
};
struct FillRef {
uint offset;
};
struct StrokeRef {
uint offset;
};
struct SetLineWidthRef {
uint offset;
};
struct TransformRef {
uint offset;
};
struct ElementRef {
uint offset;
};
struct Bbox {
ivec4 bbox;
};
@ -128,6 +160,97 @@ PietItemRef PietItem_index(PietItemRef ref, uint index) {
return PietItemRef(ref.offset + index * PietItem_size);
}
// Decoded forms of the scene objects. For each type, <Type>_size is the
// encoded size in bytes and <Type>_index returns a reference to the
// index-th object of an array starting at ref.

// Line segment from p0 to p1.
struct LineSeg {
vec2 p0;
vec2 p1;
};
#define LineSeg_size 16
LineSegRef LineSeg_index(LineSegRef ref, uint index) {
return LineSegRef(ref.offset + index * LineSeg_size);
}
// Quadratic segment with three points.
struct QuadSeg {
vec2 p0;
vec2 p1;
vec2 p2;
};
#define QuadSeg_size 24
QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) {
return QuadSegRef(ref.offset + index * QuadSeg_size);
}
// Cubic segment with four points.
struct CubicSeg {
vec2 p0;
vec2 p1;
vec2 p2;
vec2 p3;
};
#define CubicSeg_size 32
CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) {
return CubicSegRef(ref.offset + index * CubicSeg_size);
}
// Fill command with its packed color.
struct Fill {
uint rgba_color;
};
#define Fill_size 4
FillRef Fill_index(FillRef ref, uint index) {
return FillRef(ref.offset + index * Fill_size);
}
// Stroke command with its packed color.
struct Stroke {
uint rgba_color;
};
#define Stroke_size 4
StrokeRef Stroke_index(StrokeRef ref, uint index) {
return StrokeRef(ref.offset + index * Stroke_size);
}
// Line width setting.
struct SetLineWidth {
float width;
};
#define SetLineWidth_size 4
SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) {
return SetLineWidthRef(ref.offset + index * SetLineWidth_size);
}
// Transform: four matrix entries followed by a translation.
struct Transform {
vec4 mat;
vec2 translate;
};
#define Transform_size 24
TransformRef Transform_index(TransformRef ref, uint index) {
return TransformRef(ref.offset + index * Transform_size);
}
// Tag values stored in the first word of an encoded Element.
#define Element_Nop 0
#define Element_Line 1
#define Element_Quad 2
#define Element_Cubic 3
#define Element_Stroke 4
#define Element_Fill 5
#define Element_SetLineWidth 6
#define Element_Transform 7
// Every Element occupies the same number of bytes regardless of its variant.
// NOTE(review): 36 matches a 4-byte tag plus the largest payload
// (CubicSeg_size = 32) — confirm against the generator.
#define Element_size 36
ElementRef Element_index(ElementRef ref, uint index) {
return ElementRef(ref.offset + index * Element_size);
}
Bbox Bbox_read(BboxRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
@ -236,3 +359,118 @@ PietStrokePolyLine PietItem_Poly_read(PietItemRef ref) {
return PietStrokePolyLine_read(PietStrokePolyLineRef(ref.offset + 4));
}
// Readers for the scene object types. Each converts the byte offset of the
// reference to a word index (offset >> 2), loads consecutive words from the
// `scene` array and reinterprets them as the fields of the struct.

// Reads a LineSeg: four floats (p0.x, p0.y, p1.x, p1.y).
LineSeg LineSeg_read(LineSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
uint raw1 = scene[ix + 1];
uint raw2 = scene[ix + 2];
uint raw3 = scene[ix + 3];
LineSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}
// Reads a QuadSeg: six floats, two per point.
QuadSeg QuadSeg_read(QuadSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
uint raw1 = scene[ix + 1];
uint raw2 = scene[ix + 2];
uint raw3 = scene[ix + 3];
uint raw4 = scene[ix + 4];
uint raw5 = scene[ix + 5];
QuadSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
return s;
}
// Reads a CubicSeg: eight floats, two per point.
CubicSeg CubicSeg_read(CubicSegRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
uint raw1 = scene[ix + 1];
uint raw2 = scene[ix + 2];
uint raw3 = scene[ix + 3];
uint raw4 = scene[ix + 4];
uint raw5 = scene[ix + 5];
uint raw6 = scene[ix + 6];
uint raw7 = scene[ix + 7];
CubicSeg s;
s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.p2 = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.p3 = vec2(uintBitsToFloat(raw6), uintBitsToFloat(raw7));
return s;
}
// Reads a Fill: one word holding the color.
Fill Fill_read(FillRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
Fill s;
s.rgba_color = raw0;
return s;
}
// Reads a Stroke: one word holding the color.
Stroke Stroke_read(StrokeRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
Stroke s;
s.rgba_color = raw0;
return s;
}
// Reads a SetLineWidth: one float.
SetLineWidth SetLineWidth_read(SetLineWidthRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
SetLineWidth s;
s.width = uintBitsToFloat(raw0);
return s;
}
// Reads a Transform: four matrix floats followed by two translation floats.
Transform Transform_read(TransformRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
uint raw1 = scene[ix + 1];
uint raw2 = scene[ix + 2];
uint raw3 = scene[ix + 3];
uint raw4 = scene[ix + 4];
uint raw5 = scene[ix + 5];
Transform s;
s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
return s;
}
// Returns the tag word of an element (one of the Element_* constants is
// expected); it is the first word at the element's offset.
uint Element_tag(ElementRef ref) {
return scene[ref.offset >> 2];
}
// Variant readers: the payload starts 4 bytes after the element's offset,
// directly behind the tag word. The caller is responsible for checking the
// tag first; these functions do not verify it.
LineSeg Element_Line_read(ElementRef ref) {
return LineSeg_read(LineSegRef(ref.offset + 4));
}
QuadSeg Element_Quad_read(ElementRef ref) {
return QuadSeg_read(QuadSegRef(ref.offset + 4));
}
CubicSeg Element_Cubic_read(ElementRef ref) {
return CubicSeg_read(CubicSegRef(ref.offset + 4));
}
Stroke Element_Stroke_read(ElementRef ref) {
return Stroke_read(StrokeRef(ref.offset + 4));
}
Fill Element_Fill_read(ElementRef ref) {
return Fill_read(FillRef(ref.offset + 4));
}
SetLineWidth Element_SetLineWidth_read(ElementRef ref) {
return SetLineWidth_read(SetLineWidthRef(ref.offset + 4));
}
Transform Element_Transform_read(ElementRef ref) {
return Transform_read(TransformRef(ref.offset + 4));
}

59
piet-gpu/shader/state.h Normal file
View file

@ -0,0 +1,59 @@
// Code auto-generated by piet-gpu-derive
// Reference to a State stored in the `state` buffer, as a byte offset.
struct StateRef {
uint offset;
};
// Decoded State record.
struct State {
vec4 mat;
vec2 translate;
vec4 bbox;
float linewidth;
uint flags;
};
// Encoded size in bytes: 12 words of 4 bytes each.
#define State_size 48
// Returns a reference to the index-th State of an array starting at ref.
StateRef State_index(StateRef ref, uint index) {
return StateRef(ref.offset + index * State_size);
}
// Loads the 12 consecutive words at ref from the `state` array and decodes
// them in field order: mat (4 floats), translate (2), bbox (4), linewidth (1),
// flags (1 uint, taken as-is).
State State_read(StateRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = state[ix + 0];
uint raw1 = state[ix + 1];
uint raw2 = state[ix + 2];
uint raw3 = state[ix + 3];
uint raw4 = state[ix + 4];
uint raw5 = state[ix + 5];
uint raw6 = state[ix + 6];
uint raw7 = state[ix + 7];
uint raw8 = state[ix + 8];
uint raw9 = state[ix + 9];
uint raw10 = state[ix + 10];
uint raw11 = state[ix + 11];
State s;
s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
s.linewidth = uintBitsToFloat(raw10);
s.flags = raw11;
return s;
}
// Stores s into the 12 consecutive words at ref in the `state` array, using
// the same field order as State_read: mat, translate, bbox, linewidth, flags.
void State_write(StateRef ref, State s) {
uint ix = ref.offset >> 2;
state[ix + 0] = floatBitsToUint(s.mat.x);
state[ix + 1] = floatBitsToUint(s.mat.y);
state[ix + 2] = floatBitsToUint(s.mat.z);
state[ix + 3] = floatBitsToUint(s.mat.w);
state[ix + 4] = floatBitsToUint(s.translate.x);
state[ix + 5] = floatBitsToUint(s.translate.y);
state[ix + 6] = floatBitsToUint(s.bbox.x);
state[ix + 7] = floatBitsToUint(s.bbox.y);
state[ix + 8] = floatBitsToUint(s.bbox.z);
state[ix + 9] = floatBitsToUint(s.bbox.w);
state[ix + 10] = floatBitsToUint(s.linewidth);
state[ix + 11] = s.flags;
}

View file

@ -1,5 +1,5 @@
mod render_ctx;
mod pico_svg;
mod render_ctx;
pub use render_ctx::PietGpuRenderContext;
@ -8,6 +8,8 @@ use rand::{Rng, RngCore};
use piet::kurbo::{BezPath, Circle, Line, Point, Vec2};
use piet::{Color, RenderContext};
use piet_gpu_types::encoder::Encode;
use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout, MemFlags};
use pico_svg::PicoSvg;
@ -110,6 +112,12 @@ pub struct Renderer<D: Device> {
scene_buf: D::Buffer,
scene_dev: D::Buffer,
pub state_buf: D::Buffer,
el_pipeline: D::Pipeline,
el_ds: D::DescriptorSet,
/*
k1_alloc_buf_host: D::Buffer,
k1_alloc_buf_dev: D::Buffer,
k2s_alloc_buf_host: D::Buffer,
@ -131,6 +139,8 @@ pub struct Renderer<D: Device> {
k3_ds: D::DescriptorSet,
k4_pipeline: D::Pipeline,
k4_ds: D::DescriptorSet,
*/
n_elements: usize,
}
impl<D: Device> Renderer<D> {
@ -146,175 +156,123 @@ impl<D: Device> Renderer<D> {
.unwrap();
device.write_buffer(&scene_buf, &scene)?;
let state_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
let el_code = include_bytes!("../shader/elements.spv");
let el_pipeline = device.create_simple_compute_pipeline(el_code, 2, 0)?;
let el_ds = device.create_descriptor_set(
&el_pipeline,
&[&scene_dev, &state_buf],
&[],
)?;
let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
println!("scene: {} elements", n_elements);
/*
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
let k1_alloc_buf_host = device.create_buffer(4, host)?;
let k1_alloc_buf_dev = device.create_buffer(4, dev)?;
let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?;
let k1_code = include_bytes!("../shader/kernel1.spv");
let k1_pipeline = device
.create_simple_compute_pipeline(k1_code, 3, 0)?;
let k1_ds = device
.create_descriptor_set(
&k1_pipeline,
&[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
&[],
)?;
let k1_pipeline = device.create_simple_compute_pipeline(k1_code, 3, 0)?;
let k1_ds = device.create_descriptor_set(
&k1_pipeline,
&[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
&[],
)?;
let k2s_alloc_buf_host = device.create_buffer(4, host)?;
let k2s_alloc_buf_dev = device.create_buffer(4, dev)?;
let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device
.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])
?;
device.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])?;
let k2s_code = include_bytes!("../shader/kernel2s.spv");
let k2s_pipeline = device
.create_simple_compute_pipeline(k2s_code, 4, 0)
?;
let k2s_ds = device
.create_descriptor_set(
&k2s_pipeline,
&[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
&[],
)
?;
let k2s_pipeline = device.create_simple_compute_pipeline(k2s_code, 4, 0)?;
let k2s_ds = device.create_descriptor_set(
&k2s_pipeline,
&[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
&[],
)?;
let k2f_alloc_buf_host = device.create_buffer(4, host)?;
let k2f_alloc_buf_dev = device.create_buffer(4, dev)?;
let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device
.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])
?;
device.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])?;
let k2f_code = include_bytes!("../shader/kernel2f.spv");
let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?;
let k2f_ds = device
.create_descriptor_set(
&k2f_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&fill_seg_buf,
&k2f_alloc_buf_dev,
],
&[],
)
?;
let k2f_ds = device.create_descriptor_set(
&k2f_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&fill_seg_buf,
&k2f_alloc_buf_dev,
],
&[],
)?;
let k3_alloc_buf_host = device.create_buffer(4, host)?;
let k3_alloc_buf_dev = device.create_buffer(4, dev)?;
let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
device
.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])
?;
device.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])?;
let k3_code = include_bytes!("../shader/kernel3.spv");
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?;
let k3_ds = device
.create_descriptor_set(
&k3_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&segment_buf,
&fill_seg_buf,
&ptcl_buf,
&k3_alloc_buf_dev,
],
&[],
)
?;
let k3_ds = device.create_descriptor_set(
&k3_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&segment_buf,
&fill_seg_buf,
&ptcl_buf,
&k3_alloc_buf_dev,
],
&[],
)?;
let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
let k4_ds = device
.create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &fill_seg_buf], &[&image_dev])
?;
let k4_ds = device.create_descriptor_set(
&k4_pipeline,
&[&ptcl_buf, &segment_buf, &fill_seg_buf],
&[&image_dev],
)?;
*/
Ok(Renderer {
scene_buf,
scene_dev,
image_dev,
k1_alloc_buf_host,
k1_alloc_buf_dev,
k2s_alloc_buf_host,
k2s_alloc_buf_dev,
k2f_alloc_buf_host,
k2f_alloc_buf_dev,
k3_alloc_buf_host,
k3_alloc_buf_dev,
tilegroup_buf,
ptcl_buf,
k1_pipeline,
k1_ds,
k2s_pipeline,
k2s_ds,
k2f_pipeline,
k2f_ds,
k3_pipeline,
k3_ds,
k4_pipeline,
k4_ds,
el_pipeline,
el_ds,
state_buf,
n_elements,
})
}
pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf<D>, query_pool: &D::QueryPool) {
cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
// Note: we could use one alloc buf and reuse it. But we'll stick with
// multiple ones for clarity.
cmd_buf.copy_buffer(&self.k1_alloc_buf_host, &self.k1_alloc_buf_dev);
cmd_buf.copy_buffer(&self.k2s_alloc_buf_host, &self.k2s_alloc_buf_dev);
cmd_buf.copy_buffer(&self.k2f_alloc_buf_host, &self.k2f_alloc_buf_dev);
cmd_buf.copy_buffer(&self.k3_alloc_buf_host, &self.k3_alloc_buf_dev);
// Note: these clears aren't necessary, and are here to make inspection
// of the buffers cleaner. Can likely be removed.
cmd_buf.clear_buffer(&self.tilegroup_buf);
cmd_buf.clear_buffer(&self.ptcl_buf);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::Undefined, ImageLayout::General);
cmd_buf.image_barrier(
&self.image_dev,
ImageLayout::Undefined,
ImageLayout::General,
);
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.dispatch(
&self.k1_pipeline,
&self.k1_ds,
((WIDTH / 512) as u32, (HEIGHT / 512) as u32, 1),
&self.el_pipeline,
&self.el_ds,
((self.n_elements / 128) as u32, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k2s_pipeline,
&self.k2s_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 2);
// Note: this barrier is not necessary (k2f does not depend on
// k2s output), but I'm keeping it here to increase transparency
// of performance.
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k2f_pipeline,
&self.k2f_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 2),
);
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k3_pipeline,
&self.k3_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 3),
);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k4_pipeline,
&self.k4_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}
}

View file

@ -41,10 +41,14 @@ impl PicoSvg {
let path = Affine::scale(scale) * bp;
if let Some(fill_color) = el.attribute("fill") {
let color = parse_color(fill_color);
items.push(Item::Fill(FillItem { color, path: path.clone() }));
items.push(Item::Fill(FillItem {
color,
path: path.clone(),
}));
}
if let Some(stroke_color) = el.attribute("stroke") {
let width = f64::from_str(el.attribute("stroke-width").ok_or("missing width")?)?;
let width =
f64::from_str(el.attribute("stroke-width").ok_or("missing width")?)?;
let color = parse_color(stroke_color);
items.push(Item::Stroke(StrokeItem { width, color, path }));
}

View file

@ -2,7 +2,11 @@ use std::borrow::Cow;
use piet_gpu_types::encoder::{Encode, Encoder, Ref};
use piet_gpu_types::scene;
use piet_gpu_types::scene::{Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup};
use piet_gpu_types::scene::{
Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup,
};
use piet_gpu_types::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke};
use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
@ -27,10 +31,10 @@ pub struct PietGpuText;
pub struct PietGpuRenderContext {
encoder: Encoder,
bboxes: Vec<Bbox>,
items: Vec<PietItem>,
elements: Vec<Element>,
// Will probably need direct access to hal Device to create images etc.
inner_text: PietGpuText,
stroke_width: f32,
}
#[derive(Clone)]
@ -43,47 +47,22 @@ const TOLERANCE: f64 = 0.25;
impl PietGpuRenderContext {
pub fn new() -> PietGpuRenderContext {
let mut encoder = Encoder::new();
let _reserve_root = encoder.alloc_chunk(PietItem::fixed_size() as u32);
let bboxes = Vec::new();
let items = Vec::new();
let encoder = Encoder::new();
let elements = Vec::new();
let inner_text = PietGpuText;
let stroke_width = 0.0;
PietGpuRenderContext {
encoder,
bboxes,
items,
elements,
inner_text,
stroke_width,
}
}
pub fn get_scene_buf(&mut self) -> &[u8] {
let n_items = self.bboxes.len() as u32;
let bboxes = self.bboxes.encode(&mut self.encoder).transmute();
let items = self.items.encode(&mut self.encoder).transmute();
let offset = scene::Point { xy: [0.0, 0.0] };
let simple_group = SimpleGroup {
n_items,
bboxes,
items,
offset,
};
let root_item = PietItem::Group(simple_group);
root_item.encode_to(&mut self.encoder.buf_mut()[0..PietItem::fixed_size()]);
self.elements.encode(&mut self.encoder);
self.encoder.buf()
}
fn push_item(&mut self, item: PietItem, bbox: Rect) {
let scene_bbox = Bbox {
bbox: [
bbox.x0.floor() as i16,
bbox.y0.floor() as i16,
bbox.x1.ceil() as i16,
bbox.y1.ceil() as i16,
],
};
self.items.push(item);
self.bboxes.push(scene_bbox);
}
}
impl RenderContext for PietGpuRenderContext {
@ -107,20 +86,19 @@ impl RenderContext for PietGpuRenderContext {
fn clear(&mut self, _color: Color) {}
fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>, width: f64) {
let bbox = shape.bounding_box();
let brush = brush.make_brush(self, || bbox).into_owned();
let width = width as f32;
if self.stroke_width != width {
self.elements
.push(Element::SetLineWidth(SetLineWidth { width }));
self.stroke_width = width;
}
let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
let path = shape.to_bez_path(TOLERANCE);
let (n_points, points) = flatten_shape(&mut self.encoder, path);
self.encode_path(path);
match brush {
PietGpuBrush::Solid(rgba_color) => {
let poly_line = PietStrokePolyLine {
rgba_color,
width: width as f32,
n_points,
points,
};
let bbox = bbox.inset(-0.5 * width);
self.push_item(PietItem::Poly(poly_line), bbox);
let stroke = Stroke { rgba_color };
self.elements.push(Element::Stroke(stroke));
}
_ => (),
}
@ -136,35 +114,13 @@ impl RenderContext for PietGpuRenderContext {
}
fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
let bbox = shape.bounding_box();
let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
if let Some(circle) = shape.as_circle() {
match brush {
PietGpuBrush::Solid(rgba_color) => {
let piet_circle = PietCircle {
rgba_color,
center: to_scene_point(circle.center),
radius: circle.radius as f32,
};
let bbox = circle.bounding_box();
self.push_item(PietItem::Circle(piet_circle), bbox);
}
_ => {}
}
return;
}
let path = shape.to_bez_path(TOLERANCE);
let (n_points, points) = flatten_shape(&mut self.encoder, path);
self.encode_path(path);
match brush {
PietGpuBrush::Solid(rgba_color) => {
let fill = PietFill {
flags: 0,
rgba_color,
n_points,
points,
};
self.push_item(PietItem::Fill(fill), bbox);
let fill = Fill { rgba_color };
self.elements.push(Element::Fill(fill));
}
_ => (),
}
@ -241,45 +197,96 @@ impl RenderContext for PietGpuRenderContext {
}
}
fn flatten_shape(
encoder: &mut Encoder,
path: impl Iterator<Item = PathEl>,
) -> (u32, Ref<scene::Point>) {
let mut points = Vec::new();
let mut start_pt = None;
let mut last_pt = None;
piet::kurbo::flatten(path, TOLERANCE, |el| {
match el {
PathEl::MoveTo(p) => {
let scene_pt = to_scene_point(p);
start_pt = Some(clone_scene_pt(&scene_pt));
if !points.is_empty() {
points.push(scene::Point {
xy: [std::f32::NAN, std::f32::NAN],
});
impl PietGpuRenderContext {
fn encode_path(&mut self, path: impl Iterator<Item = PathEl>) {
let flatten = false;
if flatten {
let mut start_pt = None;
let mut last_pt = None;
piet::kurbo::flatten(path, TOLERANCE, |el| {
match el {
PathEl::MoveTo(p) => {
let scene_pt = to_f32_2(p);
last_pt = Some(scene_pt);
}
PathEl::LineTo(p) => {
let scene_pt = to_f32_2(p);
let seg = LineSeg {
p0: last_pt.unwrap(),
p1: scene_pt,
};
self.elements.push(Element::Line(seg));
last_pt = Some(scene_pt);
}
PathEl::ClosePath => {
if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
let seg = LineSeg {
p0: last,
p1: start,
};
self.elements.push(Element::Line(seg));
}
}
_ => (),
}
last_pt = Some(clone_scene_pt(&scene_pt));
points.push(scene_pt);
}
PathEl::LineTo(p) => {
let scene_pt = to_scene_point(p);
last_pt = Some(clone_scene_pt(&scene_pt));
points.push(scene_pt);
}
PathEl::ClosePath => {
if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
if start.xy != last.xy {
points.push(start);
//println!("{:?}", el);
});
} else {
let mut start_pt = None;
let mut last_pt = None;
for el in path {
match el {
PathEl::MoveTo(p) => {
let scene_pt = to_f32_2(p);
last_pt = Some(scene_pt);
}
PathEl::LineTo(p) => {
let scene_pt = to_f32_2(p);
let seg = LineSeg {
p0: last_pt.unwrap(),
p1: scene_pt,
};
self.elements.push(Element::Line(seg));
last_pt = Some(scene_pt);
}
PathEl::QuadTo(p1, p2) => {
let scene_p1 = to_f32_2(p1);
let scene_p2 = to_f32_2(p2);
let seg = QuadSeg {
p0: last_pt.unwrap(),
p1: scene_p1,
p2: scene_p2,
};
self.elements.push(Element::Quad(seg));
last_pt = Some(scene_p2);
}
PathEl::CurveTo(p1, p2, p3) => {
let scene_p1 = to_f32_2(p1);
let scene_p2 = to_f32_2(p2);
let scene_p3 = to_f32_2(p3);
let seg = CubicSeg {
p0: last_pt.unwrap(),
p1: scene_p1,
p2: scene_p2,
p3: scene_p3,
};
self.elements.push(Element::Cubic(seg));
last_pt = Some(scene_p3);
}
PathEl::ClosePath => {
if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
let seg = LineSeg {
p0: last,
p1: start,
};
self.elements.push(Element::Line(seg));
}
}
}
//println!("{:?}", el);
}
_ => (),
}
//println!("{:?}", el);
});
let n_points = points.len() as u32;
let points_ref = points.encode(encoder).transmute();
(n_points, points_ref)
}
}
impl Text for PietGpuText {
@ -360,13 +367,6 @@ impl IntoBrush<PietGpuRenderContext> for PietGpuBrush {
}
}
fn to_scene_point(point: Point) -> scene::Point {
scene::Point {
xy: [point.x as f32, point.y as f32],
}
}
// TODO: allow #[derive(Clone)] in piet-gpu-derive.
fn clone_scene_pt(p: &scene::Point) -> scene::Point {
scene::Point { xy: p.xy }
fn to_f32_2(point: Point) -> [f32; 2] {
[point.x as f32, point.y as f32]
}