Merge pull request #19 from linebender/sort_middle

Bring sort_middle branch to master
This commit is contained in:
Raph Levien 2020-06-11 16:16:10 -07:00 committed by GitHub
commit 73df5534a1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
41 changed files with 2673 additions and 1310 deletions

72
Cargo.lock generated
View file

@ -26,6 +26,15 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "000444226fcff248f2bc4c7625be32c63caccfecc2723a2b9f78a7487a49c407"
[[package]]
name = "ansi_term"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
dependencies = [
"winapi 0.3.8",
]
[[package]]
name = "approx"
version = "0.3.2"
@ -59,6 +68,17 @@ dependencies = [
"raw-window-handle",
]
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"libc",
"winapi 0.3.8",
]
[[package]]
name = "autocfg"
version = "1.0.0"
@ -106,6 +126,21 @@ version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
[[package]]
name = "clap"
version = "2.33.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129"
dependencies = [
"ansi_term",
"atty",
"bitflags",
"strsim",
"textwrap",
"unicode-width",
"vec_map",
]
[[package]]
name = "cloudabi"
version = "0.0.3"
@ -259,6 +294,15 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f36b5f248235f45773d4944f555f83ea61fe07b18b561ccf99d7483d7381e54d"
[[package]]
name = "hermit-abi"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71"
dependencies = [
"libc",
]
[[package]]
name = "inflate"
version = "0.4.5"
@ -525,6 +569,7 @@ dependencies = [
name = "piet-gpu"
version = "0.1.0"
dependencies = [
"clap",
"piet",
"piet-gpu-hal",
"piet-gpu-types",
@ -758,6 +803,12 @@ dependencies = [
"byteorder",
]
[[package]]
name = "strsim"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "syn"
version = "1.0.17"
@ -769,6 +820,21 @@ dependencies = [
"unicode-xid 0.2.0",
]
[[package]]
name = "textwrap"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
dependencies = [
"unicode-width",
]
[[package]]
name = "unicode-width"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479"
[[package]]
name = "unicode-xid"
version = "0.1.0"
@ -781,6 +847,12 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
[[package]]
name = "vec_map"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "void"
version = "1.0.2"

View file

@ -0,0 +1,53 @@
use piet_gpu_derive::piet_gpu;
// Annotated elements: fixed-size tagged records (see `Annotated` below).
// The binning shader indexes them as `element_ix * Annotated_size` in its
// `annotated` buffer.
piet_gpu! {
// NOTE(review): presumably this attribute is what makes the generator emit
// the GLSL `*_write` functions — confirm in piet-gpu-derive.
#[gpu_write]
mod annotated {
// Line segment that is part of a fill outline.
struct AnnoFillLineSeg {
p0: [f32; 2],
p1: [f32; 2],
// A note: the layout of this struct is shared with
// AnnoStrokeLineSeg. In that case, we actually write
// [0.0, 0.0] as the stroke field, to minimize divergence.
}
// Line segment that is part of a stroke. The binning shader reads both
// line variants through this struct.
struct AnnoStrokeLineSeg {
p0: [f32; 2],
p1: [f32; 2],
// halfwidth in both x and y for binning
stroke: [f32; 2],
}
// Quadratic segment: three control points plus the same `stroke` field
// as the line segment.
struct AnnoQuadSeg {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
stroke: [f32; 2],
}
// Cubic segment: four control points plus `stroke`. This is the largest
// payload (40 bytes), so it determines the size of `Annotated`.
struct AnnoCubicSeg {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
stroke: [f32; 2],
}
// Fill marker. `rgba_color` and `bbox` sit at the same offsets as in
// `AnnoStroke`; the binning shader relies on that to read either variant
// as an `AnnoFill`.
struct AnnoFill {
rgba_color: u32,
// Binning reads this as (x0, y0, x1, y1); `bbox[2]` becomes the right edge.
bbox: [f32; 4],
}
// Stroke marker: same prefix as `AnnoFill`, followed by the line width.
struct AnnoStroke {
rgba_color: u32,
bbox: [f32; 4],
// For the nonuniform scale case, this needs to be a 2x2 matrix.
// That's expected to be uncommon, so we could special-case it.
linewidth: f32,
}
// Tagged union over all annotated element kinds. Variant order defines
// the numeric tag (Nop = 0 ... Fill = 6) used by the generated GLSL.
enum Annotated {
Nop,
FillLine(AnnoFillLineSeg),
StrokeLine(AnnoStrokeLineSeg),
Quad(AnnoQuadSeg),
Cubic(AnnoCubicSeg),
Stroke(AnnoStroke),
Fill(AnnoFill),
}
}
}

View file

@ -0,0 +1,22 @@
use piet_gpu_derive::piet_gpu;
// The output of the binning stage, organized as a linked list of chunks.
// Data layout for the `bins` buffer written by the binning shader.
piet_gpu! {
#[gpu_write]
mod bins {
// One (element, bin) incidence: the element at `element_ix` touches the bin.
struct BinInstance {
// Index of the element in the annotated element buffer.
element_ix: u32,
// Right edge of the bounding box of the associated fill
// element; used in backdrop computation.
right_edge: f32,
}
// Header of one chunk in a bin's linked list.
struct BinChunk {
// First chunk can have n = 0, subsequent ones not.
n: u32,
// Byte offset of the following chunk.
// NOTE(review): 0 presumably terminates the list — confirm with the consumer.
next: Ref<BinChunk>,
// Instances follow
}
}
}

View file

@ -1,37 +0,0 @@
use piet_gpu_derive::piet_gpu;
// Structures representing segments for fill items.
// There is some cut'n'paste here from stroke segments, which can be
// traced to the fact that buffers in GLSL are basically global.
// Maybe there's a way to address that, but in the meantime living
// with the duplication is easiest.
piet_gpu! {
#[gpu_write]
mod fill_seg {
struct FillTileHeader {
n: u32,
items: Ref<FillItemHeader>,
}
struct FillItemHeader {
backdrop: i32,
segments: Ref<FillSegChunk>,
}
// TODO: strongly consider using f16. If so, these would be
// relative to the tile. We're doing f32 for now to minimize
// divergence from piet-metal originals.
struct FillSegment {
start: [f32; 2],
end: [f32; 2],
}
struct FillSegChunk {
n: u32,
next: Ref<FillSegChunk>,
// Segments follow (could represent this as a variable sized array).
}
}
}

View file

@ -1,7 +1,10 @@
// Structures used only internally probably don't need to be pub.
pub mod annotated;
pub mod bins;
pub mod encoder;
pub mod fill_seg;
pub mod ptcl;
pub mod scene;
pub mod segment;
pub mod state;
pub mod test;
pub mod tilegroup;

View file

@ -5,9 +5,10 @@ fn main() {
.expect("provide a module name");
match mod_name.as_str() {
"scene" => print!("{}", piet_gpu_types::scene::gen_gpu_scene()),
"state" => print!("{}", piet_gpu_types::state::gen_gpu_state()),
"annotated" => print!("{}", piet_gpu_types::annotated::gen_gpu_annotated()),
"bins" => print!("{}", piet_gpu_types::bins::gen_gpu_bins()),
"tilegroup" => print!("{}", piet_gpu_types::tilegroup::gen_gpu_tilegroup()),
"segment" => print!("{}", piet_gpu_types::segment::gen_gpu_segment()),
"fill_seg" => print!("{}", piet_gpu_types::fill_seg::gen_gpu_fill_seg()),
"ptcl" => print!("{}", piet_gpu_types::ptcl::gen_gpu_ptcl()),
"test" => print!("{}", piet_gpu_types::test::gen_gpu_test()),
_ => println!("Oops, unknown module name"),

View file

@ -13,14 +13,13 @@ piet_gpu! {
end: [f32; 2],
}
struct CmdStroke {
// Should be Ref<SegChunk> if we had cross-module references.
seg_ref: u32,
// Consider a specialization to one segment.
seg_ref: Ref<SegChunk>,
half_width: f32,
rgba_color: u32,
}
struct CmdFill {
// Should be Ref<FillSegChunk> if we had cross-module references.
seg_ref: u32,
seg_ref: Ref<SegChunk>,
backdrop: i32,
rgba_color: u32,
}
@ -51,5 +50,24 @@ piet_gpu! {
Jump(CmdJump),
Bail,
}
// TODO: strongly consider using f16. If so, these would be
// relative to the tile. We're doing f32 for now to minimize
// divergence from piet-metal originals.
struct Segment {
start: [f32; 2],
end: [f32; 2],
// This is used for fills only, but we're including it in
// the general structure for simplicity.
y_edge: f32,
}
struct SegChunk {
n: u32,
next: Ref<SegChunk>,
// Actually a reference to a variable-sized slice.
segs: Ref<Segment>,
}
}
}

View file

@ -4,6 +4,8 @@ pub use self::scene::{
Bbox, PietCircle, PietFill, PietItem, PietStrokeLine, PietStrokePolyLine, Point, SimpleGroup,
};
pub use self::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke, Transform};
piet_gpu! {
#[rust_encode]
mod scene {
@ -51,5 +53,53 @@ piet_gpu! {
Fill(PietFill),
Poly(PietStrokePolyLine),
}
// New approach follows (above to be deleted)
struct LineSeg {
p0: [f32; 2],
p1: [f32; 2],
}
struct QuadSeg {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
}
struct CubicSeg {
p0: [f32; 2],
p1: [f32; 2],
p2: [f32; 2],
p3: [f32; 2],
}
struct Fill {
rgba_color: u32,
}
struct Stroke {
rgba_color: u32,
}
struct SetLineWidth {
width: f32,
}
struct Transform {
mat: [f32; 4],
translate: [f32; 2],
}
enum Element {
Nop,
// Another approach to encoding would be to use a single
// variant but have a bool for fill/stroke. This could be
// packed into the tag, so the on-the-wire representation
// would be very similar to what's here.
StrokeLine(LineSeg),
FillLine(LineSeg),
// Note: we'll need to handle the stroke/fill distinction
// for these as well, when we do flattening on the GPU.
Quad(QuadSeg),
Cubic(CubicSeg),
Stroke(Stroke),
Fill(Fill),
SetLineWidth(SetLineWidth),
Transform(Transform),
}
}
}

View file

@ -1,32 +0,0 @@
use piet_gpu_derive::piet_gpu;
// Structures representing segments for stroke/fill items.
piet_gpu! {
#[gpu_write]
mod segment {
struct TileHeader {
n: u32,
items: Ref<ItemHeader>,
}
// Note: this is only suitable for strokes, fills require backdrop.
struct ItemHeader {
segments: Ref<SegChunk>,
}
// TODO: strongly consider using f16. If so, these would be
// relative to the tile. We're doing f32 for now to minimize
// divergence from piet-metal originals.
struct Segment {
start: [f32; 2],
end: [f32; 2],
}
struct SegChunk {
n: u32,
next: Ref<SegChunk>,
// Segments follow (could represent this as a variable sized array).
}
}
}

View file

@ -0,0 +1,14 @@
use piet_gpu_derive::piet_gpu;
// Per-element state record: 12 32-bit words (48 bytes), which is the record
// size `dump_state` in the CLI assumes.
piet_gpu! {
#[gpu_write]
mod state {
struct State {
// Same shape as `scene::Transform`: a 2x2 matrix plus a translation.
// NOTE(review): presumably the accumulated transform — confirm in the element shader.
mat: [f32; 4],
translate: [f32; 2],
// NOTE(review): presumably (x0, y0, x1, y1), like the annotated bboxes — confirm.
bbox: [f32; 4],
linewidth: f32,
// NOTE(review): bit meanings are not visible here; see the element shader.
flags: u32,
}
}
}

View file

@ -26,3 +26,4 @@ png = "0.16.2"
rand = "0.7.3"
roxmltree = "0.11"
winit = "0.22"
clap = "2.33"

View file

@ -2,10 +2,12 @@ use std::fs::File;
use std::io::BufWriter;
use std::path::Path;
use clap::{Arg, App};
use piet_gpu_hal::vulkan::VkInstance;
use piet_gpu_hal::{CmdBuf, Device, Error, MemFlags};
use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT};
use piet_gpu::{render_scene, render_svg, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
#[allow(unused)]
fn dump_scene(buf: &[u8]) {
@ -16,22 +18,179 @@ fn dump_scene(buf: &[u8]) {
}
}
/// Print the contents of a state buffer, one line per 48-byte `State` record.
///
/// Each record is eleven little-endian `f32` words (matrix, translation,
/// bbox, line width) followed by a little-endian `u32` flags word. Trailing
/// bytes that do not make up a whole record are ignored.
#[allow(unused)]
fn dump_state(buf: &[u8]) {
    // Size in bytes of one `State` record (12 words).
    const STATE_SIZE: usize = 48;
    for (i, record) in buf.chunks_exact(STATE_SIZE).enumerate() {
        // Raw bytes of the k-th 32-bit word of this record.
        let word = |k: usize| -> [u8; 4] {
            let mut bytes = [0u8; 4];
            bytes.copy_from_slice(&record[k * 4..k * 4 + 4]);
            bytes
        };
        let f = |k: usize| f32::from_le_bytes(word(k));
        // `flags` is a full u32; print the whole word, not only its low byte.
        let flags = u32::from_le_bytes(word(11));
        println!(
            "{}: [{} {} {} {} {} {}] ({}, {})-({} {}) {} {}",
            i,
            f(0),
            f(1),
            f(2),
            f(3),
            f(4),
            f(5),
            f(6),
            f(7),
            f(8),
            f(9),
            f(10),
            flags
        );
    }
}
/// Diagnostic dump of the binning stage output.
///
/// For each of the 256 bins, the 16 per-partition chunk lists are merged in
/// order of the first item of their current chunk, printing every chunk as it
/// is consumed. A chunk is laid out as `[n, next, item0, item1, ...]`, with
/// `next` a byte offset and `0` ending the list. A list whose current chunk is
/// empty is never selected.
#[allow(unused)]
fn trace_merge(buf: &[u32]) {
    // Sort key of a list head: the first item of a non-empty chunk, or `!0`
    // when the list is exhausted or its current chunk holds no items.
    let head_key = |head: &Option<usize>| -> u32 {
        match *head {
            Some(offset) if buf[offset / 4] != 0 => buf[offset / 4 + 2],
            _ => !0,
        }
    };
    for bin in 0..256 {
        println!("bin {}:", bin);
        // Byte offset of the first chunk of each partition for this bin.
        let mut heads: Vec<Option<usize>> =
            (0..16).map(|part| Some((bin * 16 + part) * 64)).collect();
        loop {
            let lowest = heads.iter().map(&head_key).min().unwrap();
            if lowest == !0 {
                break;
            }
            // The first list whose key equals the minimum wins ties.
            let selected = heads
                .iter()
                .position(|head| head_key(head) == lowest)
                .unwrap();
            let offset = heads[selected].unwrap();
            println!("selected {}, start {:x}", selected, offset);
            let base = offset / 4;
            let n = buf[base] as usize;
            for j in 0..n {
                println!("{:x}", buf[base + 2 + j])
            }
            heads[selected] = match buf[base + 1] {
                0 => None,
                next => Some(next as usize),
            };
        }
    }
}
/// Print every chunk of the segment chunk list starting at byte offset
/// `seg_chunk`.
///
/// A chunk header is three words: `n`, `next` (byte offset, 0 ends the list)
/// and `segs` (byte offset of the segment array). Each segment is five `f32`
/// words: start x/y, end x/y and `y_edge`.
#[allow(unused)]
fn trace_seg_chunks(buf: &[u32], mut seg_chunk: usize) {
    loop {
        let n = buf[seg_chunk / 4] as usize;
        let segs = buf[seg_chunk / 4 + 2] as usize;
        println!(" chunk @{:x}: n={}, segs @{:x}", seg_chunk, n, segs);
        for i in 0..n {
            let seg = segs / 4 + i * 5;
            let x0 = f32::from_bits(buf[seg]);
            let y0 = f32::from_bits(buf[seg + 1]);
            let x1 = f32::from_bits(buf[seg + 2]);
            let y1 = f32::from_bits(buf[seg + 3]);
            let y_edge = f32::from_bits(buf[seg + 4]);
            println!(" ({:.3}, {:.3}) - ({:.3}, {:.3}) | {:.3}", x0, y0, x1, y1, y_edge);
        }
        seg_chunk = buf[seg_chunk / 4 + 1] as usize;
        if seg_chunk == 0 {
            break;
        }
    }
}

/// Interpret the output of the coarse raster stage, for diagnostic purposes.
///
/// Walks the per-tile command list of a 128x96 tile grid, where each tile's
/// list starts at `tile_ix * 1024` bytes and each command occupies 20 bytes.
/// Tag 0 ends a list, tag 3 is a fill, tag 4 is a stroke and tag 8 jumps to
/// the byte offset stored in the following word. Any other tag is printed
/// as-is. For fills and strokes the whole segment chunk chain is printed.
#[allow(unused)]
fn trace_ptcl(buf: &[u32]) {
    for y in 0..96 {
        for x in 0..128 {
            let tile_ix = y * 128 + x;
            println!("tile {} @({}, {})", tile_ix, x, y);
            let mut tile_offset = tile_ix * 1024;
            loop {
                let cmd = tile_offset / 4;
                let tag = buf[cmd];
                match tag {
                    0 => break,
                    3 => {
                        // `backdrop` is declared as i32 in CmdFill.
                        let backdrop = buf[cmd + 2] as i32;
                        let rgba_color = buf[cmd + 3];
                        println!(" {:x}: fill {:x} {}", tile_offset, rgba_color, backdrop);
                        trace_seg_chunks(buf, buf[cmd + 1] as usize);
                    }
                    4 => {
                        let line_width = f32::from_bits(buf[cmd + 2]);
                        let rgba_color = buf[cmd + 3];
                        println!(" {:x}: stroke {:x} {}", tile_offset, rgba_color, line_width);
                        trace_seg_chunks(buf, buf[cmd + 1] as usize);
                    }
                    _ => {
                        println!("{:x}: {}", tile_offset, tag);
                    }
                }
                if tag == 8 {
                    tile_offset = buf[cmd + 1] as usize;
                } else {
                    tile_offset += 20;
                }
            }
        }
    }
}
fn main() -> Result<(), Error> {
let matches = App::new("piet-gpu test")
.arg(Arg::with_name("INPUT")
.index(1))
.arg(Arg::with_name("flip")
.short("f")
.long("flip"))
.arg(Arg::with_name("scale")
.short("s")
.long("scale")
.takes_value(true))
.get_matches();
let (instance, _) = VkInstance::new(None)?;
unsafe {
let device = instance.device(None)?;
let fence = device.create_fence(false)?;
let mut cmd_buf = device.create_cmd_buf()?;
let query_pool = device.create_query_pool(6)?;
let query_pool = device.create_query_pool(5)?;
let mut ctx = PietGpuRenderContext::new();
render_scene(&mut ctx);
if let Some(input) = matches.value_of("INPUT") {
let mut scale = matches.value_of("scale")
.map(|scale| scale.parse().unwrap())
.unwrap_or(8.0);
if matches.is_present("flip") {
scale = -scale;
}
render_svg(&mut ctx, input, scale);
} else {
render_scene(&mut ctx);
}
let scene = ctx.get_scene_buf();
//dump_scene(&scene);
let renderer = Renderer::new(&device, scene)?;
let image_buf = device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
let image_buf =
device.create_buffer((WIDTH * HEIGHT * 4) as u64, MemFlags::host_coherent())?;
cmd_buf.begin();
renderer.record(&mut cmd_buf, &query_pool);
@ -39,29 +198,17 @@ fn main() -> Result<(), Error> {
cmd_buf.finish();
device.run_cmd_buf(&cmd_buf, &[], &[], Some(&fence))?;
device.wait_and_reset(&[fence])?;
let timestamps = device.reap_query_pool(&query_pool).unwrap();
println!("Kernel 1 time: {:.3}ms", timestamps[0] * 1e3);
println!(
"Kernel 2s time: {:.3}ms",
(timestamps[1] - timestamps[0]) * 1e3
);
println!(
"Kernel 2f time: {:.3}ms",
(timestamps[2] - timestamps[1]) * 1e3
);
println!(
"Kernel 3 time: {:.3}ms",
(timestamps[3] - timestamps[2]) * 1e3
);
println!(
"Render time: {:.3}ms",
(timestamps[4] - timestamps[3]) * 1e3
);
let ts = device.reap_query_pool(&query_pool).unwrap();
println!("Element kernel time: {:.3}ms", ts[0] * 1e3);
println!("Binning kernel time: {:.3}ms", (ts[1] - ts[0]) * 1e3);
println!("Coarse kernel time: {:.3}ms", (ts[2] - ts[1]) * 1e3);
println!("Render kernel time: {:.3}ms", (ts[3] - ts[2]) * 1e3);
/*
let mut k1_data: Vec<u32> = Default::default();
device.read_buffer(&segment_buf, &mut k1_data).unwrap();
dump_k1_data(&k1_data);
let mut data: Vec<u32> = Default::default();
device.read_buffer(&renderer.ptcl_buf, &mut data).unwrap();
piet_gpu::dump_k1_data(&data);
//trace_ptcl(&data);
*/
let mut img_data: Vec<u8> = Default::default();

View file

@ -1,7 +1,7 @@
use piet_gpu_hal::vulkan::VkInstance;
use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout};
use piet_gpu::{PietGpuRenderContext, Renderer, render_scene, WIDTH, HEIGHT};
use piet_gpu::{render_scene, PietGpuRenderContext, Renderer, HEIGHT, WIDTH};
use winit::{
event::{Event, WindowEvent},
@ -37,7 +37,7 @@ fn main() -> Result<(), Error> {
.map(|_| device.create_cmd_buf())
.collect::<Result<Vec<_>, Error>>()?;
let query_pools = (0..NUM_FRAMES)
.map(|_| device.create_query_pool(6))
.map(|_| device.create_query_pool(5))
.collect::<Result<Vec<_>, Error>>()?;
let mut ctx = PietGpuRenderContext::new();
@ -69,12 +69,12 @@ fn main() -> Result<(), Error> {
device.wait_and_reset(&[frame_fences[frame_idx]]).unwrap();
let timestamps = device.reap_query_pool(query_pool).unwrap();
window.set_title(&format!("k1: {:.3}ms, k2s: {:.3}ms, k2f: {:.3}ms, k3: {:.3}ms, k4: {:.3}ms",
window.set_title(&format!(
"e: {:.3}ms, b: {:.3}ms, c: {:.3}ms, f: {:.3}ms",
timestamps[0] * 1e3,
(timestamps[1] - timestamps[0]) * 1e3,
(timestamps[2] - timestamps[1]) * 1e3,
(timestamps[3] - timestamps[2]) * 1e3,
(timestamps[4] - timestamps[3]) * 1e3,
));
}
@ -93,11 +93,7 @@ fn main() -> Result<(), Error> {
ImageLayout::BlitDst,
);
cmd_buf.blit_image(&renderer.image_dev, &swap_image);
cmd_buf.image_barrier(
&swap_image,
ImageLayout::BlitDst,
ImageLayout::Present,
);
cmd_buf.image_barrier(&swap_image, ImageLayout::BlitDst, ImageLayout::Present);
cmd_buf.finish();
device

335
piet-gpu/shader/annotated.h Normal file
View file

@ -0,0 +1,335 @@
// Code auto-generated by piet-gpu-derive
// Typed references into the `annotated` buffer. Each wraps a byte offset;
// the accessors below turn it into a word index with `offset >> 2`.
struct AnnoFillLineSegRef {
uint offset;
};
struct AnnoStrokeLineSegRef {
uint offset;
};
struct AnnoQuadSegRef {
uint offset;
};
struct AnnoCubicSegRef {
uint offset;
};
struct AnnoFillRef {
uint offset;
};
struct AnnoStrokeRef {
uint offset;
};
// Reference to a whole tagged element (tag word followed by payload).
struct AnnotatedRef {
uint offset;
};
// Fill line segment: two endpoints.
struct AnnoFillLineSeg {
vec2 p0;
vec2 p1;
};
// Size in bytes: 4 floats.
#define AnnoFillLineSeg_size 16
// Returns a reference to the element `index` records past `ref`.
AnnoFillLineSegRef AnnoFillLineSeg_index(AnnoFillLineSegRef ref, uint index) {
return AnnoFillLineSegRef(ref.offset + index * AnnoFillLineSeg_size);
}
// Stroke line segment: two endpoints plus a `stroke` vector.
struct AnnoStrokeLineSeg {
vec2 p0;
vec2 p1;
vec2 stroke;
};
// Size in bytes: 6 floats.
#define AnnoStrokeLineSeg_size 24
// Returns a reference to the element `index` records past `ref`.
AnnoStrokeLineSegRef AnnoStrokeLineSeg_index(AnnoStrokeLineSegRef ref, uint index) {
return AnnoStrokeLineSegRef(ref.offset + index * AnnoStrokeLineSeg_size);
}
// Quadratic segment: three control points plus a `stroke` vector.
struct AnnoQuadSeg {
vec2 p0;
vec2 p1;
vec2 p2;
vec2 stroke;
};
// Size in bytes: 8 floats.
#define AnnoQuadSeg_size 32
// Returns a reference to the element `index` records past `ref`.
AnnoQuadSegRef AnnoQuadSeg_index(AnnoQuadSegRef ref, uint index) {
return AnnoQuadSegRef(ref.offset + index * AnnoQuadSeg_size);
}
// Cubic segment: four control points plus a `stroke` vector.
struct AnnoCubicSeg {
vec2 p0;
vec2 p1;
vec2 p2;
vec2 p3;
vec2 stroke;
};
// Size in bytes: 10 floats. This is the largest Annotated payload.
#define AnnoCubicSeg_size 40
// Returns a reference to the element `index` records past `ref`.
AnnoCubicSegRef AnnoCubicSeg_index(AnnoCubicSegRef ref, uint index) {
return AnnoCubicSegRef(ref.offset + index * AnnoCubicSeg_size);
}
// Fill marker: a color word followed by a bounding box.
struct AnnoFill {
uint rgba_color;
vec4 bbox;
};
// Size in bytes: 1 uint + 4 floats, tightly packed.
#define AnnoFill_size 20
// Returns a reference to the element `index` records past `ref`.
AnnoFillRef AnnoFill_index(AnnoFillRef ref, uint index) {
return AnnoFillRef(ref.offset + index * AnnoFill_size);
}
// Stroke marker: the same leading fields as AnnoFill, then the line width.
struct AnnoStroke {
uint rgba_color;
vec4 bbox;
float linewidth;
};
// Size in bytes: 1 uint + 5 floats, tightly packed.
#define AnnoStroke_size 24
// Returns a reference to the element `index` records past `ref`.
AnnoStrokeRef AnnoStroke_index(AnnoStrokeRef ref, uint index) {
return AnnoStrokeRef(ref.offset + index * AnnoStroke_size);
}
// Tag values stored in the first word of every Annotated element.
#define Annotated_Nop 0
#define Annotated_FillLine 1
#define Annotated_StrokeLine 2
#define Annotated_Quad 3
#define Annotated_Cubic 4
#define Annotated_Stroke 5
#define Annotated_Fill 6
// Size in bytes of one element: 4-byte tag + largest payload (AnnoCubicSeg, 40).
#define Annotated_size 44
// Returns a reference to the element `index` records past `ref`.
AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
return AnnotatedRef(ref.offset + index * Annotated_size);
}
// Loads an AnnoFillLineSeg from the `annotated` buffer at byte offset ref.offset.
AnnoFillLineSeg AnnoFillLineSeg_read(AnnoFillLineSegRef ref) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    AnnoFillLineSeg seg;
    seg.p0 = uintBitsToFloat(uvec2(annotated[base + 0], annotated[base + 1]));
    seg.p1 = uintBitsToFloat(uvec2(annotated[base + 2], annotated[base + 3]));
    return seg;
}
// Stores `s` into the `annotated` buffer at byte offset ref.offset.
void AnnoFillLineSeg_write(AnnoFillLineSegRef ref, AnnoFillLineSeg s) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    uvec2 p0 = floatBitsToUint(s.p0);
    uvec2 p1 = floatBitsToUint(s.p1);
    annotated[base + 0] = p0.x;
    annotated[base + 1] = p0.y;
    annotated[base + 2] = p1.x;
    annotated[base + 3] = p1.y;
}
// Loads an AnnoStrokeLineSeg (6 words) from the `annotated` buffer.
AnnoStrokeLineSeg AnnoStrokeLineSeg_read(AnnoStrokeLineSegRef ref) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    AnnoStrokeLineSeg seg;
    seg.p0 = uintBitsToFloat(uvec2(annotated[base + 0], annotated[base + 1]));
    seg.p1 = uintBitsToFloat(uvec2(annotated[base + 2], annotated[base + 3]));
    seg.stroke = uintBitsToFloat(uvec2(annotated[base + 4], annotated[base + 5]));
    return seg;
}
// Stores `s` (6 words) into the `annotated` buffer at byte offset ref.offset.
void AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef ref, AnnoStrokeLineSeg s) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    uvec2 p0 = floatBitsToUint(s.p0);
    uvec2 p1 = floatBitsToUint(s.p1);
    uvec2 stroke = floatBitsToUint(s.stroke);
    annotated[base + 0] = p0.x;
    annotated[base + 1] = p0.y;
    annotated[base + 2] = p1.x;
    annotated[base + 3] = p1.y;
    annotated[base + 4] = stroke.x;
    annotated[base + 5] = stroke.y;
}
// Loads an AnnoQuadSeg (8 words) from the `annotated` buffer.
AnnoQuadSeg AnnoQuadSeg_read(AnnoQuadSegRef ref) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    AnnoQuadSeg seg;
    seg.p0 = uintBitsToFloat(uvec2(annotated[base + 0], annotated[base + 1]));
    seg.p1 = uintBitsToFloat(uvec2(annotated[base + 2], annotated[base + 3]));
    seg.p2 = uintBitsToFloat(uvec2(annotated[base + 4], annotated[base + 5]));
    seg.stroke = uintBitsToFloat(uvec2(annotated[base + 6], annotated[base + 7]));
    return seg;
}
// Stores `s` (8 words) into the `annotated` buffer at byte offset ref.offset.
void AnnoQuadSeg_write(AnnoQuadSegRef ref, AnnoQuadSeg s) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    uvec2 p0 = floatBitsToUint(s.p0);
    uvec2 p1 = floatBitsToUint(s.p1);
    uvec2 p2 = floatBitsToUint(s.p2);
    uvec2 stroke = floatBitsToUint(s.stroke);
    annotated[base + 0] = p0.x;
    annotated[base + 1] = p0.y;
    annotated[base + 2] = p1.x;
    annotated[base + 3] = p1.y;
    annotated[base + 4] = p2.x;
    annotated[base + 5] = p2.y;
    annotated[base + 6] = stroke.x;
    annotated[base + 7] = stroke.y;
}
// Loads an AnnoCubicSeg (10 words) from the `annotated` buffer.
AnnoCubicSeg AnnoCubicSeg_read(AnnoCubicSegRef ref) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    AnnoCubicSeg seg;
    seg.p0 = uintBitsToFloat(uvec2(annotated[base + 0], annotated[base + 1]));
    seg.p1 = uintBitsToFloat(uvec2(annotated[base + 2], annotated[base + 3]));
    seg.p2 = uintBitsToFloat(uvec2(annotated[base + 4], annotated[base + 5]));
    seg.p3 = uintBitsToFloat(uvec2(annotated[base + 6], annotated[base + 7]));
    seg.stroke = uintBitsToFloat(uvec2(annotated[base + 8], annotated[base + 9]));
    return seg;
}
// Stores `s` (10 words) into the `annotated` buffer at byte offset ref.offset.
void AnnoCubicSeg_write(AnnoCubicSegRef ref, AnnoCubicSeg s) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    uvec2 p0 = floatBitsToUint(s.p0);
    uvec2 p1 = floatBitsToUint(s.p1);
    uvec2 p2 = floatBitsToUint(s.p2);
    uvec2 p3 = floatBitsToUint(s.p3);
    uvec2 stroke = floatBitsToUint(s.stroke);
    annotated[base + 0] = p0.x;
    annotated[base + 1] = p0.y;
    annotated[base + 2] = p1.x;
    annotated[base + 3] = p1.y;
    annotated[base + 4] = p2.x;
    annotated[base + 5] = p2.y;
    annotated[base + 6] = p3.x;
    annotated[base + 7] = p3.y;
    annotated[base + 8] = stroke.x;
    annotated[base + 9] = stroke.y;
}
// Loads an AnnoFill (color word + 4-float bbox) from the `annotated` buffer.
AnnoFill AnnoFill_read(AnnoFillRef ref) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    AnnoFill fill;
    fill.rgba_color = annotated[base + 0];
    fill.bbox = uintBitsToFloat(uvec4(
        annotated[base + 1],
        annotated[base + 2],
        annotated[base + 3],
        annotated[base + 4]));
    return fill;
}
// Stores `s` (color word + 4-float bbox) into the `annotated` buffer.
void AnnoFill_write(AnnoFillRef ref, AnnoFill s) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    uvec4 bbox = floatBitsToUint(s.bbox);
    annotated[base + 0] = s.rgba_color;
    annotated[base + 1] = bbox.x;
    annotated[base + 2] = bbox.y;
    annotated[base + 3] = bbox.z;
    annotated[base + 4] = bbox.w;
}
// Loads an AnnoStroke (color word, 4-float bbox, line width) from the
// `annotated` buffer.
AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    AnnoStroke stroke;
    stroke.rgba_color = annotated[base + 0];
    stroke.bbox = uintBitsToFloat(uvec4(
        annotated[base + 1],
        annotated[base + 2],
        annotated[base + 3],
        annotated[base + 4]));
    stroke.linewidth = uintBitsToFloat(annotated[base + 5]);
    return stroke;
}
// Stores `s` (color word, 4-float bbox, line width) into the `annotated` buffer.
void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) {
    // Byte offset -> 32-bit word index.
    uint base = ref.offset >> 2;
    uvec4 bbox = floatBitsToUint(s.bbox);
    annotated[base + 0] = s.rgba_color;
    annotated[base + 1] = bbox.x;
    annotated[base + 2] = bbox.y;
    annotated[base + 3] = bbox.z;
    annotated[base + 4] = bbox.w;
    annotated[base + 5] = floatBitsToUint(s.linewidth);
}
// Returns the tag word (one of the Annotated_* constants) of the element at `ref`.
uint Annotated_tag(AnnotatedRef ref) {
    uint tag_ix = ref.offset >> 2;
    return annotated[tag_ix];
}
// Reads the FillLine payload, which starts 4 bytes past the tag word.
AnnoFillLineSeg Annotated_FillLine_read(AnnotatedRef ref) {
    AnnoFillLineSegRef payload = AnnoFillLineSegRef(ref.offset + 4);
    return AnnoFillLineSeg_read(payload);
}
// Reads the StrokeLine payload, which starts 4 bytes past the tag word.
AnnoStrokeLineSeg Annotated_StrokeLine_read(AnnotatedRef ref) {
    AnnoStrokeLineSegRef payload = AnnoStrokeLineSegRef(ref.offset + 4);
    return AnnoStrokeLineSeg_read(payload);
}
// Reads the Quad payload, which starts 4 bytes past the tag word.
AnnoQuadSeg Annotated_Quad_read(AnnotatedRef ref) {
    AnnoQuadSegRef payload = AnnoQuadSegRef(ref.offset + 4);
    return AnnoQuadSeg_read(payload);
}
// Reads the Cubic payload, which starts 4 bytes past the tag word.
AnnoCubicSeg Annotated_Cubic_read(AnnotatedRef ref) {
    AnnoCubicSegRef payload = AnnoCubicSegRef(ref.offset + 4);
    return AnnoCubicSeg_read(payload);
}
// Reads the Stroke payload, which starts 4 bytes past the tag word.
AnnoStroke Annotated_Stroke_read(AnnotatedRef ref) {
    AnnoStrokeRef payload = AnnoStrokeRef(ref.offset + 4);
    return AnnoStroke_read(payload);
}
// Reads the Fill payload, which starts 4 bytes past the tag word.
AnnoFill Annotated_Fill_read(AnnotatedRef ref) {
    AnnoFillRef payload = AnnoFillRef(ref.offset + 4);
    return AnnoFill_read(payload);
}
// Marks the element at `ref` as a Nop; only the tag word is written.
void Annotated_Nop_write(AnnotatedRef ref) {
    uint tag_ix = ref.offset >> 2;
    annotated[tag_ix] = Annotated_Nop;
}
// Writes a FillLine element: the tag word, then the payload 4 bytes after it.
void Annotated_FillLine_write(AnnotatedRef ref, AnnoFillLineSeg s) {
    uint tag_ix = ref.offset >> 2;
    annotated[tag_ix] = Annotated_FillLine;
    AnnoFillLineSegRef payload = AnnoFillLineSegRef(ref.offset + 4);
    AnnoFillLineSeg_write(payload, s);
}
// Writes a StrokeLine element: the tag word, then the payload 4 bytes after it.
void Annotated_StrokeLine_write(AnnotatedRef ref, AnnoStrokeLineSeg s) {
    uint tag_ix = ref.offset >> 2;
    annotated[tag_ix] = Annotated_StrokeLine;
    AnnoStrokeLineSegRef payload = AnnoStrokeLineSegRef(ref.offset + 4);
    AnnoStrokeLineSeg_write(payload, s);
}
// Writes a Quad element: the tag word, then the payload 4 bytes after it.
void Annotated_Quad_write(AnnotatedRef ref, AnnoQuadSeg s) {
    uint tag_ix = ref.offset >> 2;
    annotated[tag_ix] = Annotated_Quad;
    AnnoQuadSegRef payload = AnnoQuadSegRef(ref.offset + 4);
    AnnoQuadSeg_write(payload, s);
}
// Writes a Cubic element: the tag word, then the payload 4 bytes after it.
void Annotated_Cubic_write(AnnotatedRef ref, AnnoCubicSeg s) {
    uint tag_ix = ref.offset >> 2;
    annotated[tag_ix] = Annotated_Cubic;
    AnnoCubicSegRef payload = AnnoCubicSegRef(ref.offset + 4);
    AnnoCubicSeg_write(payload, s);
}
// Writes a Stroke element: the tag word, then the payload 4 bytes after it.
void Annotated_Stroke_write(AnnotatedRef ref, AnnoStroke s) {
    uint tag_ix = ref.offset >> 2;
    annotated[tag_ix] = Annotated_Stroke;
    AnnoStrokeRef payload = AnnoStrokeRef(ref.offset + 4);
    AnnoStroke_write(payload, s);
}
// Writes a Fill element: the tag word, then the payload 4 bytes after it.
void Annotated_Fill_write(AnnotatedRef ref, AnnoFill s) {
    uint tag_ix = ref.offset >> 2;
    annotated[tag_ix] = Annotated_Fill;
    AnnoFillRef payload = AnnoFillRef(ref.offset + 4);
    AnnoFill_write(payload, s);
}

View file

@ -0,0 +1,193 @@
// The binning stage of the pipeline.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
// This is for scanning forward for right_edge data.
layout(set = 0, binding = 1) buffer StateBuf {
uint[] state;
};
layout(set = 0, binding = 2) buffer AllocBuf {
uint n_elements;
// Will be incremented atomically to claim tiles
uint tile_ix;
uint alloc;
};
layout(set = 0, binding = 3) buffer BinsBuf {
uint[] bins;
};
#include "annotated.h"
#include "state.h"
#include "bins.h"
// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
#define TSY (1.0 / float(TILE_HEIGHT_PX))
// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
#define INFINITY (1.0 / 0.0)
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared uint sh_chunk_start[N_TILE];
shared float sh_right_edge[N_TILE];
// Byte stride between consecutive partitions' records in the state buffer.
// NOTE(review): presumably an 8-byte header followed by two State records —
// confirm against the element shader that writes this buffer.
#define StateBuf_stride (8 + 2 * State_size)
// Returns the index into `state[]` (in 32-bit words) of the right-edge value
// for partition `partition_ix`: word 2 of that partition's record.
uint state_right_edge_index(uint partition_ix) {
    uint words_per_partition = StateBuf_stride / 4;
    return partition_ix * words_per_partition + 2;
}
// Binning entry point. One workgroup handles one partition of N_TILE
// elements, with one invocation per element. Each invocation computes the
// bbox of bins its element covers and records that coverage in shared
// bitmaps. The same invocation index then doubles as a bin index to count
// and allocate output space, and finally each element writes a BinInstance
// into every bin it covers.
void main() {
// Note: chunk_n is assigned here but not read anywhere in this function.
uint chunk_n = 0;
uint my_n_elements = n_elements;
uint my_partition = gl_WorkGroupID.x;
// Clear this invocation's column of the coverage bitmaps before any
// invocation starts setting bits.
for (uint i = 0; i < N_SLICE; i++) {
bitmaps[i][gl_LocalInvocationID.x] = 0;
}
barrier();
// Read inputs and determine coverage of bins
uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
// Invocations past the end of the input behave as Nop elements.
uint tag = Annotated_Nop;
if (element_ix < my_n_elements) {
tag = Annotated_tag(ref);
}
// Bin-space bbox [x0, x1) x [y0, y1); empty unless a case below fills it in.
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
// INFINITY means "right edge not known yet".
float my_right_edge = INFINITY;
bool crosses_edge = false;
switch (tag) {
case Annotated_FillLine:
case Annotated_StrokeLine:
// Both line variants are read through the stroke layout.
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
x0 = int(floor((min(line.p0.x, line.p1.x) - line.stroke.x) * SX));
y0 = int(floor((min(line.p0.y, line.p1.y) - line.stroke.y) * SY));
x1 = int(ceil((max(line.p0.x, line.p1.x) + line.stroke.x) * SX));
y1 = int(ceil((max(line.p0.y, line.p1.y) + line.stroke.y) * SY));
// A fill line whose endpoints fall in different tile rows; its bbox is
// extended to the fill's right edge further down.
crosses_edge = tag == Annotated_FillLine && ceil(line.p0.y * TSY) != ceil(line.p1.y * TSY);
break;
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
// have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
x0 = int(floor(fill.bbox.x * SX));
y0 = int(floor(fill.bbox.y * SY));
x1 = int(ceil(fill.bbox.z * SX));
y1 = int(ceil(fill.bbox.w * SY));
// It probably makes more sense to track x1, to avoid having to redo
// the rounding to tile coords.
my_right_edge = fill.bbox.z;
break;
}
// If the last element in this partition is a fill edge, then we need to do a
// look-forward to find the right edge of its corresponding fill. That data is
// recorded in aggregates computed in the element processing pass.
if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_FillLine) {
uint aggregate_ix = (my_partition + 1) * ELEMENT_BINNING_RATIO;
// This is sequential but the expectation is that the amount of
// look-forward is small (performance may degrade in the case
// of massively complex paths).
// NOTE(review): the loop has no upper bound on aggregate_ix; it relies on
// a finite right edge appearing in some later aggregate.
do {
my_right_edge = uintBitsToFloat(state[state_right_edge_index(aggregate_ix)]);
aggregate_ix++;
} while (isinf(my_right_edge));
}
// Now propagate right_edge backward, from fill to segment.
// Each round, an invocation that still has no right edge takes the value
// held by the invocation (1 << i) positions after it.
for (uint i = 0; i < LG_N_TILE; i++) {
// Note: we could try to cut down on write bandwidth here if the value hasn't
// changed, but not sure it's worth the complexity to track.
sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
barrier();
if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
}
// Second barrier: all reads finish before the next round overwrites
// sh_right_edge.
barrier();
}
if (crosses_edge) {
x1 = int(ceil(my_right_edge * SX));
}
// At this point, we run an iterator over the coverage area,
// trying to keep divergence low.
// Right now, it's just a bbox, but we'll get finer with
// segments.
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
// An empty x range must also empty the y range; otherwise the loops below
// would never reach `x == x1` and would not terminate.
if (x0 == x1) y1 = y0;
int x = x0, y = y0;
// Each element owns one bit: bit (local id & 31) of slice (local id / 32).
uint my_slice = gl_LocalInvocationID.x / 32;
uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
while (y < y1) {
atomicOr(bitmaps[my_slice][y * N_TILE_X + x], my_mask);
x++;
if (x == x1) {
x = x0;
y++;
}
}
barrier();
// Allocate output segments.
// From here until the final loop, gl_LocalInvocationID.x is used as a bin
// index. count[i][bin] holds the running total of elements in slices 0..i.
uint element_count = 0;
for (uint i = 0; i < N_SLICE; i++) {
element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
count[i][gl_LocalInvocationID.x] = element_count;
}
// element_count is number of elements covering bin for this invocation.
uint chunk_start = 0;
if (element_count != 0) {
// TODO: aggregate atomic adds (subgroup is probably fastest)
chunk_start = atomicAdd(alloc, element_count * BinInstance_size);
sh_chunk_start[gl_LocalInvocationID.x] = chunk_start;
}
// Note: it might be more efficient for reading to do this in the
// other order (each bin is a contiguous sequence of partitions)
// Two-word header per (partition, bin): instance count, then the byte
// offset of the instances.
uint out_ix = (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
bins[out_ix] = element_count;
bins[out_ix + 1] = chunk_start;
barrier();
// Use similar strategy as Laine & Karras paper; loop over bbox of bins
// touched by this element
x = x0;
y = y0;
while (y < y1) {
uint bin_ix = y * N_TILE_X + x;
uint out_mask = bitmaps[my_slice][bin_ix];
if ((out_mask & my_mask) != 0) {
// Slot within the bin's output: elements with lower bits in this slice,
// plus all elements in earlier slices.
uint idx = bitCount(out_mask & (my_mask - 1));
if (my_slice > 0) {
idx += count[my_slice - 1][bin_ix];
}
uint out_offset = sh_chunk_start[bin_ix] + idx * BinInstance_size;
BinInstance_write(BinInstanceRef(out_offset), BinInstance(element_ix, my_right_edge));
}
x++;
if (x == x1) {
x = x0;
y++;
}
}
}

BIN
piet-gpu/shader/binning.spv Normal file

Binary file not shown.

64
piet-gpu/shader/bins.h Normal file
View file

@ -0,0 +1,64 @@
// Code auto-generated by piet-gpu-derive
// Typed reference to a BinInstance record; `offset` is a byte offset into
// the `bins` buffer (the accessors convert it to a word index with >> 2).
struct BinInstanceRef {
uint offset;
};
// Typed reference to a BinChunk record; `offset` is a byte offset into the
// `bins` buffer.
struct BinChunkRef {
uint offset;
};
// One binned element as stored in the `bins` buffer.
struct BinInstance {
// Index of the element (not a byte offset).
uint element_ix;
// Right edge associated with the element, stored as a float.
// NOTE(review): presumably the right edge of the enclosing fill; confirm
// against the binning stage.
float right_edge;
};
// Serialized size of a BinInstance in bytes (two 32-bit words).
#define BinInstance_size 8
// Returns a reference to the `index`-th BinInstance of an array of
// BinInstance records starting at `ref`.
BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
    uint byte_offset = ref.offset + index * BinInstance_size;
    return BinInstanceRef(byte_offset);
}
// Header of a chunk in the `bins` buffer: a count plus a link to another chunk.
struct BinChunk {
// Count stored in the chunk header.
// NOTE(review): presumably the number of BinInstance records in the chunk; confirm.
uint n;
// Reference to the next chunk.
BinChunkRef next;
};
// Serialized size of a BinChunk in bytes (two 32-bit words).
#define BinChunk_size 8
// Returns a reference to the `index`-th BinChunk of an array of BinChunk
// records starting at `ref`.
BinChunkRef BinChunk_index(BinChunkRef ref, uint index) {
    uint byte_offset = ref.offset + index * BinChunk_size;
    return BinChunkRef(byte_offset);
}
// Loads the BinInstance stored at byte offset `ref.offset` of `bins`.
BinInstance BinInstance_read(BinInstanceRef ref) {
    // `bins` is an array of 32-bit words, so convert bytes to a word index.
    uint word = ref.offset >> 2;
    uint element_word = bins[word];
    uint edge_word = bins[word + 1];
    return BinInstance(element_word, uintBitsToFloat(edge_word));
}
// Stores `s` at byte offset `ref.offset` of `bins`.
void BinInstance_write(BinInstanceRef ref, BinInstance s) {
    // `bins` is an array of 32-bit words, so convert bytes to a word index.
    uint word = ref.offset >> 2;
    bins[word] = s.element_ix;
    // The float is stored bit-for-bit in the second word.
    bins[word + 1] = floatBitsToUint(s.right_edge);
}
// Loads the BinChunk header stored at byte offset `ref.offset` of `bins`.
BinChunk BinChunk_read(BinChunkRef ref) {
    uint word = ref.offset >> 2;
    uint n_word = bins[word];
    uint next_word = bins[word + 1];
    return BinChunk(n_word, BinChunkRef(next_word));
}
// Stores the BinChunk header `s` at byte offset `ref.offset` of `bins`.
void BinChunk_write(BinChunkRef ref, BinChunk s) {
    uint word = ref.offset >> 2;
    bins[word] = s.n;
    bins[word + 1] = s.next.offset;
}

View file

@ -9,12 +9,11 @@ rule glsl
build image.spv: glsl image.comp | scene.h
build kernel1.spv: glsl kernel1.comp | scene.h tilegroup.h setup.h
build kernel2s.spv: glsl kernel2s.comp | scene.h tilegroup.h segment.h setup.h
build elements.spv: glsl elements.comp | scene.h state.h annotated.h
build kernel2f.spv: glsl kernel2f.comp | scene.h tilegroup.h fill_seg.h setup.h
build binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h
build kernel3.spv: glsl kernel3.comp | scene.h tilegroup.h segment.h fill_seg.h ptcl.h setup.h
build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
build kernel4.spv: glsl kernel4.comp | ptcl.h segment.h fill_seg.h setup.h
build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h

526
piet-gpu/shader/coarse.comp Normal file
View file

@ -0,0 +1,526 @@
// The coarse rasterizer stage of the pipeline.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "setup.h"
// One invocation per tile of the bin; main() derives tile_x/tile_y from
// gl_LocalInvocationID.x.
layout(local_size_x = N_TILE, local_size_y = 1) in;
// Annotated elements, addressed as element_ix * Annotated_size in main().
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
// Binning output: per (partition, bin) a count and chunk start, plus the
// BinInstance records those starts point at.
layout(set = 0, binding = 1) buffer BinsBuf {
uint[] bins;
};
// n_elements sizes the partition loop; alloc is bumped with atomicAdd to
// hand out byte ranges for commands, segment chunks and segments.
layout(set = 0, binding = 2) buffer AllocBuf {
uint n_elements;
uint alloc;
};
// Output storage.
// NOTE(review): presumably the per-tile command lists written through the
// accessors in ptcl.h; confirm against that header.
layout(set = 0, binding = 3) buffer PtclBuf {
uint[] ptcl;
};
#include "annotated.h"
#include "bins.h"
#include "ptcl.h"
// Up to N_PART_READ partitions' (count, chunk start) pairs are loaded per
// refill of the shared partition table.
#define LG_N_PART_READ 8
#define N_PART_READ (1 << LG_N_PART_READ)
// Element index and right edge of each element in the current batch.
shared uint sh_elements[N_TILE];
shared float sh_right_edge[N_TILE];
// Number of elements in the partition; prefix sum.
shared uint sh_part_count[N_PART_READ];
// Byte offset of each loaded partition's first BinInstance.
shared uint sh_part_elements[N_PART_READ];
// sh_bitmaps[slice][tile]: bit k set when batch element slice * 32 + k
// covers the tile. sh_backdrop uses the same layout for backdrop
// contributions.
shared uint sh_bitmaps[N_SLICE][N_TILE];
shared uint sh_backdrop[N_SLICE][N_TILE];
// Per-element bit: backdrop sign (1 is +1, 0 is -1).
shared uint sh_bd_sign[N_SLICE];
// Per-element bit: set when the element is a line segment.
shared uint sh_is_segment[N_SLICE];
// Shared state for parallel segment output stage
// Count of total number of segments in each tile, then
// inclusive prefix sum of same.
shared uint sh_seg_count[N_TILE];
// Base byte offset of the segment storage allocated for the current batch.
shared uint sh_seg_alloc;
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
// Makes sure there is room for another command at `cmd_ref`.
//
// When `cmd_ref` has moved past `cmd_limit`, a fresh block of
// PTCL_INITIAL_ALLOC bytes is taken from `alloc`, a jump to it is written at
// the current position, and `cmd_ref` / `cmd_limit` are redirected to the new
// block. The limit is kept 2 * Cmd_size short of the end of the block.
//
// Perhaps cmd_limit should be a global? This is a style question.
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset <= cmd_limit) {
        // Still inside the current block; nothing to do.
        return;
    }
    uint fresh_block = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
    Cmd_Jump_write(cmd_ref, CmdJump(fresh_block));
    cmd_ref = CmdRef(fresh_block);
    cmd_limit = fresh_block + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
}
// Number of SegChunk headers reserved from `alloc` per atomic operation.
#define CHUNK_ALLOC_SLAB 16
// Per-invocation slab state: headers left in the current slab, and the byte
// offset of the next unused header.
uint alloc_chunk_remaining;
uint alloc_chunk_offset;
// Returns a reference to an unused SegChunk header, reserving a new slab of
// CHUNK_ALLOC_SLAB headers from `alloc` when the current one is exhausted.
SegChunkRef alloc_seg_chunk() {
    if (alloc_chunk_remaining == 0) {
        alloc_chunk_offset = atomicAdd(alloc, CHUNK_ALLOC_SLAB * SegChunk_size);
        alloc_chunk_remaining = CHUNK_ALLOC_SLAB;
    }
    SegChunkRef result = SegChunkRef(alloc_chunk_offset);
    alloc_chunk_remaining -= 1;
    alloc_chunk_offset += SegChunk_size;
    return result;
}
// Accumulate delta to backdrop.
//
// Each bit for which bd_bitmap is 1 and bd_sign is 1 counts as +1, and each
// bit for which bd_bitmap is 1 and bd_sign is 0 counts as -1.
int count_backdrop(uint bd_bitmap, uint bd_sign) {
    // Selected bits whose sign bit is set contribute +1 each ...
    int positive = bitCount(bd_bitmap & bd_sign);
    // ... and selected bits whose sign bit is clear contribute -1 each.
    int negative = bitCount(bd_bitmap & ~bd_sign);
    return positive - negative;
}
// Coarse rasterizer entry point.
//
// Each workgroup handles one bin (bin_ix comes from gl_WorkGroupID) and each
// invocation is associated with one tile of that bin. The outer loop works in
// batches of up to N_TILE elements:
//   1. merge BinInstance records from the input partitions into sh_elements;
//   2. one thread per element: compute tile coverage and backdrop bitmaps;
//   3. threads cooperatively write a Segment for every (line element, tile) hit;
//   4. one thread per tile: walk the non-segment elements in order and emit
//      commands, attaching the accumulated segment chunks and backdrop.
// All barrier() calls are reached under conditions built from values that
// are computed identically by every invocation (rd_ix, wr_ix, ready_ix,
// partition_ix).
void main() {
// Could use either linear or 2d layouts for both dispatch and
// invocations within the workgroup. We'll use variables to abstract.
uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
uint partition_ix = 0;
uint n_partitions = (n_elements + N_TILE - 1) / N_TILE;
// Top left coordinates of this bin.
vec2 xy0 = vec2(N_TILE_X * TILE_WIDTH_PX * gl_WorkGroupID.x, N_TILE_Y * TILE_HEIGHT_PX * gl_WorkGroupID.y);
uint th_ix = gl_LocalInvocationID.x;
uint tile_x = N_TILE_X * gl_WorkGroupID.x + gl_LocalInvocationID.x % N_TILE_X;
uint tile_y = N_TILE_Y * gl_WorkGroupID.y + gl_LocalInvocationID.x / N_TILE_X;
uint this_tile_ix = tile_y * WIDTH_IN_TILES + tile_x;
// Each tile starts with a command block of PTCL_INITIAL_ALLOC bytes at a
// fixed position; alloc_cmd() chains further blocks on demand.
CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
// Allocation and management of segment output
SegChunkRef first_seg_chunk = SegChunkRef(0);
SegChunkRef last_chunk_ref = SegChunkRef(0);
uint last_chunk_n = 0;
SegmentRef last_chunk_segs = SegmentRef(0);
alloc_chunk_remaining = 0;
// I'm sure we can figure out how to do this with at least one fewer register...
// Items up to rd_ix have been read from sh_elements
uint rd_ix = 0;
// Items up to wr_ix have been written into sh_elements
uint wr_ix = 0;
// Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
uint part_start_ix = 0;
uint ready_ix = 0;
if (th_ix < N_SLICE) {
sh_bd_sign[th_ix] = 0;
}
// Backdrop accumulated for this thread's tile; it is carried across batches
// and reset when a fill element is processed.
int backdrop = 0;
while (true) {
// Reset the per-batch bitmaps.
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
sh_backdrop[i][th_ix] = 0;
}
if (th_ix < N_SLICE) {
sh_is_segment[th_ix] = 0;
}
// parallel read of input partitions
do {
// Refill the partition table once everything described by the previous
// table has been copied into sh_elements.
if (ready_ix == wr_ix && partition_ix < n_partitions) {
part_start_ix = ready_ix;
uint count = 0;
if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
// Two words per (partition, bin): element count, then chunk start.
uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
count = bins[in_ix];
sh_part_elements[th_ix] = bins[in_ix + 1];
}
// prefix sum of counts
for (uint i = 0; i < LG_N_PART_READ; i++) {
if (th_ix < N_PART_READ) {
sh_part_count[th_ix] = count;
}
barrier();
if (th_ix < N_PART_READ) {
if (th_ix >= (1 << i)) {
count += sh_part_count[th_ix - (1 << i)];
}
}
barrier();
}
// Store the inclusive sums offset by part_start_ix, so entries are
// directly comparable with rd_ix / wr_ix / ready_ix.
if (th_ix < N_PART_READ) {
sh_part_count[th_ix] = part_start_ix + count;
}
barrier();
ready_ix = sh_part_count[N_PART_READ - 1];
partition_ix += N_PART_READ;
}
// use binary search to find element to read
uint ix = rd_ix + th_ix;
if (ix >= wr_ix && ix < ready_ix) {
uint part_ix = 0;
for (uint i = 0; i < LG_N_PART_READ; i++) {
uint probe = part_ix + ((N_PART_READ / 2) >> i);
if (ix >= sh_part_count[probe - 1]) {
part_ix = probe;
}
}
// Convert ix into an index relative to the start of the chosen partition.
ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
BinInstanceRef inst_ref = BinInstanceRef(sh_part_elements[part_ix]);
BinInstance inst = BinInstance_read(BinInstance_index(inst_ref, ix));
sh_elements[th_ix] = inst.element_ix;
sh_right_edge[th_ix] = inst.right_edge;
}
barrier();
wr_ix = min(rd_ix + N_TILE, ready_ix);
// Repeat until the batch is full or there is no more input.
} while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));
// We've done the merge and filled the buffer.
// Read one element, compute coverage.
uint tag = Annotated_Nop;
AnnotatedRef ref;
float right_edge = 0.0;
if (th_ix + rd_ix < wr_ix) {
uint element_ix = sh_elements[th_ix];
right_edge = sh_right_edge[th_ix];
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
}
// Setup for coverage algorithm.
// For scanline y (in tiles) the covered span is [t - c, t + c] with
// t = a + b * y, clamped to the element's bounding box.
float a, b, c;
// Bounding box of element in pixel coordinates.
float xmin, xmax, ymin, ymax;
// Bit position of this thread's element within the per-batch bitmaps.
uint my_slice = th_ix / 32;
uint my_mask = 1 << (th_ix & 31);
switch (tag) {
case Annotated_FillLine:
case Annotated_StrokeLine:
AnnoStrokeLineSeg line = Annotated_StrokeLine_read(ref);
xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
ymax = max(line.p0.y, line.p1.y) + line.stroke.y;
float dx = line.p1.x - line.p0.x;
float dy = line.p1.y - line.p0.y;
if (tag == Annotated_FillLine) {
// Set bit for backdrop sign calculation, 1 is +1, 0 is -1.
if (dy < 0) {
atomicOr(sh_bd_sign[my_slice], my_mask);
} else {
atomicAnd(sh_bd_sign[my_slice], ~my_mask);
}
}
atomicOr(sh_is_segment[my_slice], my_mask);
// Set up for per-scanline coverage formula, below.
// Near-horizontal lines get a large fixed inverse slope instead of dx / dy.
float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
b = invslope; // Note: assumes square tiles, otherwise scale.
a = (line.p0.x - xy0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX) - xy0.y) * b) * SX;
break;
case Annotated_Fill:
case Annotated_Stroke:
// Note: we take advantage of the fact that fills and strokes
// have compatible layout.
AnnoFill fill = Annotated_Fill_read(ref);
xmin = fill.bbox.x;
xmax = fill.bbox.z;
ymin = fill.bbox.y;
ymax = fill.bbox.w;
// Just let the clamping to xmin and xmax determine the bounds.
a = 0.0;
b = 0.0;
c = 1e9;
break;
default:
// NOTE(review): xmin, xmax, a, b and c are left unset on this path and are
// still read below; only ymin == ymax is relied on to make the scanline
// loop empty. Confirm that is intended.
ymin = 0;
ymax = 0;
break;
}
// Draw the coverage area into the bitmasks. This uses an algorithm
// that computes the coverage of a span for given scanline.
// Compute bounding box in tiles and clip to this bin.
int x0 = int(floor((xmin - xy0.x) * SX));
int x1 = int(ceil((xmax - xy0.x) * SX));
int xr = int(ceil((right_edge - xy0.x) * SX));
int y0 = int(floor((ymin - xy0.y) * SY));
int y1 = int(ceil((ymax - xy0.y) * SY));
x0 = clamp(x0, 0, N_TILE_X);
x1 = clamp(x1, x0, N_TILE_X);
xr = clamp(xr, 0, N_TILE_X);
y0 = clamp(y0, 0, N_TILE_Y);
y1 = clamp(y1, y0, N_TILE_Y);
float t = a + b * float(y0);
for (uint y = y0; y < y1; y++) {
uint xx0 = clamp(int(floor(t - c)), x0, x1);
uint xx1 = clamp(int(ceil(t + c)), x0, x1);
for (uint x = xx0; x < xx1; x++) {
atomicOr(sh_bitmaps[my_slice][y * N_TILE_X + x], my_mask);
}
if (tag == Annotated_FillLine && ymin <= xy0.y + float(y * TILE_HEIGHT_PX)) {
// Assign backdrop to all tiles to the right of the ray crossing the
// top edge of this tile, up to the right edge of the fill bbox.
float xray = t - 0.5 * b;
xx0 = max(int(ceil(xray)), 0);
for (uint x = xx0; x < xr; x++) {
atomicOr(sh_backdrop[my_slice][y * N_TILE_X + x], my_mask);
}
}
t += b;
}
barrier();
// We've computed coverage and other info for each element in the input, now for
// the output stage. We'll do segments first using a more parallel algorithm.
uint seg_count = 0;
for (uint i = 0; i < N_SLICE; i++) {
seg_count += bitCount(sh_bitmaps[i][th_ix] & sh_is_segment[i]);
}
sh_seg_count[th_ix] = seg_count;
// Prefix sum of sh_seg_count
for (uint i = 0; i < LG_N_TILE; i++) {
barrier();
if (th_ix >= (1 << i)) {
seg_count += sh_seg_count[th_ix - (1 << i)];
}
barrier();
sh_seg_count[th_ix] = seg_count;
}
// The last thread holds the total; it reserves storage for every segment
// of this batch with a single atomic.
if (th_ix == N_TILE - 1) {
sh_seg_alloc = atomicAdd(alloc, seg_count * Segment_size);
}
barrier();
uint total_seg_count = sh_seg_count[N_TILE - 1];
uint seg_alloc = sh_seg_alloc;
// Output buffer is allocated as segments for each tile laid end-to-end.
for (uint ix = th_ix; ix < total_seg_count; ix += N_TILE) {
// Find the work item; this thread is now not bound to an element or tile.
// First find the tile (by binary search)
uint tile_ix = 0;
for (uint i = 0; i < LG_N_TILE; i++) {
uint probe = tile_ix + ((N_TILE / 2) >> i);
if (ix >= sh_seg_count[probe - 1]) {
tile_ix = probe;
}
}
// Now, sh_seg_count[tile_ix - 1] <= ix < sh_seg_count[tile_ix].
// (considering sh_seg_count[-1] == 0)
// Index of segment within tile's segments
uint seq_ix = ix;
// Maybe consider a sentinel value to avoid the conditional?
if (tile_ix > 0) {
seq_ix -= sh_seg_count[tile_ix - 1];
}
// Find the segment. This is done by linear scan through the bitmaps of the
// tile, accelerated by bit counting. Binary search might help, maybe not.
uint slice_ix = 0;
uint seq_bits;
while (true) {
seq_bits = sh_bitmaps[slice_ix][tile_ix] & sh_is_segment[slice_ix];
uint this_count = bitCount(seq_bits);
if (this_count > seq_ix) {
break;
}
seq_ix -= this_count;
slice_ix++;
}
// Now find position of nth bit set (n = seq_ix) in seq_bits; binary search
uint bit_ix = 0;
for (int i = 0; i < 5; i++) {
uint probe = bit_ix + (16 >> i);
if (seq_ix >= bitCount(seq_bits & ((1 << probe) - 1))) {
bit_ix = probe;
}
}
// NOTE(review): out_offset is computed here but never read afterwards.
uint out_offset = seg_alloc + Segment_size * ix + SegChunk_size;
uint rd_el_ix = slice_ix * 32 + bit_ix;
uint element_ix = sh_elements[rd_el_ix];
ref = AnnotatedRef(element_ix * Annotated_size);
AnnoFillLineSeg line = Annotated_FillLine_read(ref);
float y_edge = 0.0;
// This is basically the same logic as piet-metal, but should be made numerically robust.
if (Annotated_tag(ref) == Annotated_FillLine) {
vec2 tile_xy = xy0 + vec2((tile_ix % N_TILE_X) * TILE_WIDTH_PX, (tile_ix / N_TILE_X) * TILE_HEIGHT_PX);
// y coordinate where the line meets the tile's left edge.
y_edge = mix(line.p0.y, line.p1.y, (tile_xy.x - line.p0.x) / (line.p1.x - line.p0.x));
if (min(line.p0.x, line.p1.x) < tile_xy.x && y_edge >= tile_xy.y && y_edge < tile_xy.y + TILE_HEIGHT_PX) {
// The line crosses the left edge inside this tile: move the endpoint that
// lies to the left onto the edge.
if (line.p0.x > line.p1.x) {
line.p1 = vec2(tile_xy.x, y_edge);
} else {
line.p0 = vec2(tile_xy.x, y_edge);
}
} else {
// No crossing of the left edge within this tile.
y_edge = 1e9;
}
}
Segment seg = Segment(line.p0, line.p1, y_edge);
Segment_write(SegmentRef(seg_alloc + Segment_size * ix), seg);
}
// Output non-segment elements for this tile. The thread does a sequential walk
// through the non-segment elements, and for segments, count and backdrop are
// aggregated using bit counting.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
uint bd_bitmap = sh_backdrop[0][th_ix];
uint bd_sign = sh_bd_sign[0];
uint is_segment = sh_is_segment[0];
// Index (within this batch's segment allocation) of this tile's first segment.
uint seg_start = th_ix == 0 ? 0 : sh_seg_count[th_ix - 1];
seg_count = 0;
while (true) {
uint nonseg_bitmap = bitmap & ~is_segment;
if (nonseg_bitmap == 0) {
// Only segments remain in this slice: fold them in and advance.
backdrop += count_backdrop(bd_bitmap, bd_sign);
seg_count += bitCount(bitmap & is_segment);
slice_ix++;
if (slice_ix == N_SLICE) {
break;
}
bitmap = sh_bitmaps[slice_ix][th_ix];
bd_bitmap = sh_backdrop[slice_ix][th_ix];
bd_sign = sh_bd_sign[slice_ix];
is_segment = sh_is_segment[slice_ix];
nonseg_bitmap = bitmap & ~is_segment;
if (nonseg_bitmap == 0) {
continue;
}
}
uint element_ref_ix = slice_ix * 32 + findLSB(nonseg_bitmap);
uint element_ix = sh_elements[element_ref_ix];
// Bits up to and including the lsb
uint bd_mask = (nonseg_bitmap - 1) ^ nonseg_bitmap;
backdrop += count_backdrop(bd_bitmap & bd_mask, bd_sign);
seg_count += bitCount(bitmap & bd_mask & is_segment);
// Clear bits that have been consumed.
bd_bitmap &= ~bd_mask;
bitmap &= ~bd_mask;
// At this point, we read the element again from global memory.
// If that turns out to be expensive, maybe we can pack it into
// shared memory (or perhaps just the tag).
ref = AnnotatedRef(element_ix * Annotated_size);
tag = Annotated_tag(ref);
switch (tag) {
case Annotated_Fill:
if (last_chunk_n > 0 || seg_count > 0) {
SegChunkRef chunk_ref = SegChunkRef(0);
if (seg_count > 0) {
// Segments from this batch form the final chunk of the list.
chunk_ref = alloc_seg_chunk();
SegChunk chunk;
chunk.n = seg_count;
chunk.next = SegChunkRef(0);
uint seg_offset = seg_alloc + seg_start * Segment_size;
chunk.segs = SegmentRef(seg_offset);
SegChunk_write(chunk_ref, chunk);
}
if (last_chunk_n > 0) {
// Complete the chunk deferred from an earlier batch, now that its
// successor is known.
SegChunk chunk;
chunk.n = last_chunk_n;
chunk.next = chunk_ref;
chunk.segs = last_chunk_segs;
SegChunk_write(last_chunk_ref, chunk);
} else {
first_seg_chunk = chunk_ref;
}
AnnoFill fill = Annotated_Fill_read(ref);
CmdFill cmd_fill;
cmd_fill.seg_ref = first_seg_chunk;
cmd_fill.backdrop = backdrop;
cmd_fill.rgba_color = fill.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Fill_write(cmd_ref, cmd_fill);
cmd_ref.offset += Cmd_size;
last_chunk_n = 0;
} else if (backdrop != 0) {
// No segments touch the tile but the backdrop is nonzero: solid color.
AnnoFill fill = Annotated_Fill_read(ref);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
cmd_ref.offset += Cmd_size;
}
seg_start += seg_count;
seg_count = 0;
backdrop = 0;
break;
case Annotated_Stroke:
// TODO: reduce divergence & code duplication? Much of the
// fill and stroke processing is in common.
if (last_chunk_n > 0 || seg_count > 0) {
SegChunkRef chunk_ref = SegChunkRef(0);
if (seg_count > 0) {
chunk_ref = alloc_seg_chunk();
SegChunk chunk;
chunk.n = seg_count;
chunk.next = SegChunkRef(0);
uint seg_offset = seg_alloc + seg_start * Segment_size;
chunk.segs = SegmentRef(seg_offset);
SegChunk_write(chunk_ref, chunk);
}
if (last_chunk_n > 0) {
SegChunk chunk;
chunk.n = last_chunk_n;
chunk.next = chunk_ref;
chunk.segs = last_chunk_segs;
SegChunk_write(last_chunk_ref, chunk);
} else {
first_seg_chunk = chunk_ref;
}
AnnoStroke stroke = Annotated_Stroke_read(ref);
CmdStroke cmd_stroke;
cmd_stroke.seg_ref = first_seg_chunk;
cmd_stroke.half_width = 0.5 * stroke.linewidth;
cmd_stroke.rgba_color = stroke.rgba_color;
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd_stroke);
cmd_ref.offset += Cmd_size;
last_chunk_n = 0;
}
seg_start += seg_count;
seg_count = 0;
break;
default:
// This shouldn't happen, but just in case.
seg_start++;
break;
}
}
// Segments seen after the last fill/stroke of this batch belong to an
// element in a later batch. Reserve their chunk now, but defer writing it
// until its successor is known.
if (seg_count > 0) {
SegChunkRef chunk_ref = alloc_seg_chunk();
if (last_chunk_n > 0) {
SegChunk_write(last_chunk_ref, SegChunk(last_chunk_n, chunk_ref, last_chunk_segs));
} else {
first_seg_chunk = chunk_ref;
}
// TODO: free two registers by writing count and segments ref now,
// as opposed to deferring SegChunk write until all fields are known.
last_chunk_ref = chunk_ref;
last_chunk_n = seg_count;
uint seg_offset = seg_alloc + seg_start * Segment_size;
last_chunk_segs = SegmentRef(seg_offset);
}
// Every thread must be done with the shared bitmaps before the next batch
// clears them.
barrier();
rd_ix += N_TILE;
if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
}
// Terminate this tile's command list.
Cmd_End_write(cmd_ref);
}

BIN
piet-gpu/shader/coarse.spv Normal file

Binary file not shown.

View file

@ -0,0 +1,328 @@
// The element processing stage, first in the pipeline.
//
// This stage is primarily about applying transforms and computing bounding
// boxes. It is organized as a scan over the input elements, producing
// annotated output elements.
#version 450
#extension GL_GOOGLE_include_directive : enable
// Each invocation processes N_ROWS consecutive elements, so a workgroup
// covers PARTITION_SIZE elements. LG_WG_SIZE is log2(WG_SIZE) and bounds the
// number of steps in the workgroup scan.
#define N_ROWS 4
#define WG_SIZE 32
#define LG_WG_SIZE 5
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
// Input scene elements, addressed as ix * Element_size.
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
// It would be better to use the Vulkan memory model than
// "volatile" but shooting for compatibility here rather
// than doing things right.
// Layout, as used by the helpers and main(): word 0 is the partition
// counter; then for each partition a flag word, a right-edge word, the
// aggregate State and the prefix State.
layout(set = 0, binding = 1) volatile buffer StateBuf {
uint[] state;
};
// The annotated results are stored here.
layout(set = 0, binding = 2) buffer AnnotatedBuf {
uint[] annotated;
};
#include "scene.h"
#include "state.h"
#include "annotated.h"
// Bytes per partition in StateBuf: two 32-bit words (flag, right edge) plus
// two State records (aggregate, prefix).
#define StateBuf_stride (8 + 2 * State_size)
// Reference to the aggregate State slot of partition `partition_ix`.
StateRef state_aggregate_ref(uint partition_ix) {
    // 12 bytes skip the counter word and the first partition's two header words.
    uint byte_offset = 12 + partition_ix * StateBuf_stride;
    return StateRef(byte_offset);
}
// Reference to the prefix State slot of partition `partition_ix`, which
// directly follows that partition's aggregate slot.
StateRef state_prefix_ref(uint partition_ix) {
    uint byte_offset = 12 + partition_ix * StateBuf_stride + State_size;
    return StateRef(byte_offset);
}
// Word index (not byte offset) into `state` of the flag of partition
// `partition_ix`. Word 0 is the partition counter, hence the leading 1.
uint state_flag_index(uint partition_ix) {
    uint words_per_partition = StateBuf_stride / 4;
    return 1 + partition_ix * words_per_partition;
}
// Values of the per-partition flag word in StateBuf.
// These correspond to X, A, P respectively in the prefix sum paper.
#define FLAG_NOT_READY 0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY 2
// Bits of State.flags (a separate namespace from the partition flags above).
// SET_LINEWIDTH: the state carries an explicit line width.
#define FLAG_SET_LINEWIDTH 1
// SET_BBOX: produced by combine_state when the left operand had RESET_BBOX.
#define FLAG_SET_BBOX 2
// RESET_BBOX: set by map_element for fill and stroke elements.
#define FLAG_RESET_BBOX 4
// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate)
// Combines two states, `a` preceding `b`, into the state that results from
// applying both in order.
State combine_state(State a, State b) {
State c;
// Start from b's bounding box mapped through a's transform. Each output
// bound takes the min (or max) of the two candidate products per input axis,
// then adds a's translation.
c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
// a does not reset the bbox and b's bbox is empty on both axes: keep a's.
c.bbox = a.bbox;
} else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
(a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
{
// Neither side starts a new bbox and a's bbox has extent on at least one
// axis: take the union of a's bbox and the transformed b bbox.
c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
}
// Otherwise the transformed b bbox computed above is kept as is.
// It would be more concise to cast to matrix types; ah well.
// 2x2 matrix product a.mat * b.mat, with (x, y) as the first column and
// (z, w) as the second.
c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
// b's translation mapped through a's matrix, plus a's translation.
c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
// The later explicit line width wins; otherwise a's is inherited.
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
// a's SET_* bits carry over and all of b's flags are kept.
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
// A RESET_BBOX (4) in a becomes SET_BBOX (2) in the combined state.
c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
return c;
}
// Maps a single scene element to the State it contributes to the scan.
//
// The result starts as the neutral state (zero bbox, identity matrix, zero
// translation, line width 1.0, no flags) and is then adjusted by element tag:
// path segments set the bbox to the bounds of their control points, fill and
// stroke set FLAG_RESET_BBOX, SetLineWidth sets the width and
// FLAG_SET_LINEWIDTH, Transform sets the matrix and translation.
//
// `is_fill` is overwritten: true exactly when the element is a Fill.
State map_element(ElementRef ref, inout bool is_fill) {
    // TODO: it would *probably* be more efficient to make the memory read patterns less
    // divergent, though it would be more wasted memory.
    uint tag = Element_tag(ref);

    State result;
    result.bbox = vec4(0.0);
    result.mat = vec4(1.0, 0.0, 0.0, 1.0);
    result.translate = vec2(0.0);
    result.linewidth = 1.0; // TODO should be 0.0
    result.flags = 0;

    is_fill = (tag == Element_Fill);

    if (tag == Element_FillLine || tag == Element_StrokeLine) {
        LineSeg line = Element_FillLine_read(ref);
        result.bbox.xy = min(line.p0, line.p1);
        result.bbox.zw = max(line.p0, line.p1);
    } else if (tag == Element_Quad) {
        QuadSeg quad = Element_Quad_read(ref);
        result.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
        result.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
    } else if (tag == Element_Cubic) {
        CubicSeg cubic = Element_Cubic_read(ref);
        result.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
        result.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
    } else if (tag == Element_Fill || tag == Element_Stroke) {
        // Fill and stroke share the same state contribution.
        result.flags = FLAG_RESET_BBOX;
    } else if (tag == Element_SetLineWidth) {
        SetLineWidth lw = Element_SetLineWidth_read(ref);
        result.linewidth = lw.width;
        result.flags = FLAG_SET_LINEWIDTH;
    } else if (tag == Element_Transform) {
        Transform t = Element_Transform_read(ref);
        result.mat = t.mat;
        result.translate = t.translate;
    }
    return result;
}
// Get the bounding box of a circle transformed by the matrix into an ellipse.
vec2 get_linewidth(State st) {
    // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
    // Per-axis extent of the transformed unit circle: the lengths of the
    // matrix rows (mat.xz and mat.yw).
    vec2 axis_scale = vec2(length(st.mat.xz), length(st.mat.yw));
    return 0.5 * st.linewidth * axis_scale;
}
// We should be able to use an array of structs but the NV shader compiler
// doesn't seem to like it :/
//shared State sh_state[WG_SIZE];
// The fields of State, split into one shared array per field; entry i holds
// invocation i's running aggregate during the workgroup scan in main().
shared vec4 sh_mat[WG_SIZE];
shared vec2 sh_translate[WG_SIZE];
shared vec4 sh_bbox[WG_SIZE];
shared float sh_width[WG_SIZE];
shared uint sh_flags[WG_SIZE];
// Smallest element index of a fill element in this partition, or ~0 if none.
shared uint sh_min_fill;
// Partition index claimed by this workgroup from the counter in state[0].
shared uint sh_tile_ix;
// Exclusive prefix for this partition, published by the look-back thread.
shared State sh_prefix;
// Element processing entry point.
//
// Each workgroup claims one partition of PARTITION_SIZE elements through an
// atomic counter, reduces the per-element states (N_ROWS sequentially per
// invocation, then a workgroup scan), publishes the partition aggregate,
// obtains the exclusive prefix by decoupled look-back over earlier
// partitions, and finally writes one annotated element per input element.
//
// It also records, per partition, the right edge (bbox.z) of the first fill
// element in the partition, or +infinity when the partition has no fill.
void main() {
    State th_state[N_ROWS];
    // Determine partition to process by atomic counter (described in Section
    // 4.4 of prefix sum paper).
    if (gl_LocalInvocationID.x == 0) {
        sh_tile_ix = atomicAdd(state[0], 1);
        sh_min_fill = ~0;
    }
    barrier();
    uint tile_ix = sh_tile_ix;

    uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
    ElementRef ref = ElementRef(ix * Element_size);

    // Initialised because it is passed as an `inout` argument, which copies
    // the value in before map_element overwrites it.
    bool is_fill = false;
    uint my_min_fill = ~0;
    th_state[0] = map_element(ref, is_fill);
    if (is_fill) my_min_fill = ix;
    for (uint i = 1; i < N_ROWS; i++) {
        // discussion question: would it be faster to load using more coherent patterns
        // into thread memory? This is kinda strided.
        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i), is_fill));
        if (is_fill && my_min_fill == ~0) {
            my_min_fill = ix + i;
        }
    }
    atomicMin(sh_min_fill, my_min_fill);

    // Inclusive scan of the per-invocation aggregates across the workgroup.
    State agg = th_state[N_ROWS - 1];
    sh_mat[gl_LocalInvocationID.x] = agg.mat;
    sh_translate[gl_LocalInvocationID.x] = agg.translate;
    sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
    sh_width[gl_LocalInvocationID.x] = agg.linewidth;
    sh_flags[gl_LocalInvocationID.x] = agg.flags;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1 << i)) {
            State other;
            uint ix = gl_LocalInvocationID.x - (1 << i);
            other.mat = sh_mat[ix];
            other.translate = sh_translate[ix];
            other.bbox = sh_bbox[ix];
            other.linewidth = sh_width[ix];
            other.flags = sh_flags[ix];
            agg = combine_state(other, agg);
        }
        barrier();
        sh_mat[gl_LocalInvocationID.x] = agg.mat;
        sh_translate[gl_LocalInvocationID.x] = agg.translate;
        sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
        sh_width[gl_LocalInvocationID.x] = agg.linewidth;
        sh_flags[gl_LocalInvocationID.x] = agg.flags;
    }

    // Neutral state; stays in effect for partition 0, which has no predecessor.
    State exclusive;
    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
    exclusive.translate = vec2(0.0, 0.0);
    exclusive.linewidth = 1.0; //TODO should be 0.0
    exclusive.flags = 0;

    // Publish aggregate for this partition
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        // Note: with memory model, we'd want to generate the atomic store version of this.
        State_write(state_aggregate_ref(tile_ix), agg);
        uint flag = FLAG_AGGREGATE_READY;
        memoryBarrierBuffer();
        if (tile_ix == 0) {
            // The first partition's aggregate is also its inclusive prefix.
            State_write(state_prefix_ref(tile_ix), agg);
            flag = FLAG_PREFIX_READY;
        }
        state[state_flag_index(tile_ix)] = flag;
        if (tile_ix != 0) {
            // step 4 of paper: decoupled lookback
            uint look_back_ix = tile_ix - 1;
            while (true) {
                flag = state[state_flag_index(look_back_ix)];
                if (flag == FLAG_PREFIX_READY) {
                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
                    exclusive = combine_state(their_prefix, exclusive);
                    break;
                } else if (flag == FLAG_AGGREGATE_READY) {
                    State their_agg = State_read(state_aggregate_ref(look_back_ix));
                    exclusive = combine_state(their_agg, exclusive);
                    look_back_ix--;
                }
                // else spin
            }

            // step 5 of paper: compute inclusive prefix
            State inclusive_prefix = combine_state(exclusive, agg);
            sh_prefix = exclusive;
            State_write(state_prefix_ref(tile_ix), inclusive_prefix);
            memoryBarrierBuffer();
            flag = FLAG_PREFIX_READY;
            state[state_flag_index(tile_ix)] = flag;
        }
    }
    barrier();
    my_min_fill = sh_min_fill;
    if (tile_ix != 0) {
        exclusive = sh_prefix;
    }

    // `row` is the state in effect just before this invocation's first element.
    State row = exclusive;
    if (gl_LocalInvocationID.x > 0) {
        uint ix = gl_LocalInvocationID.x - 1;
        State other;
        other.mat = sh_mat[ix];
        other.translate = sh_translate[ix];
        other.bbox = sh_bbox[ix];
        other.linewidth = sh_width[ix];
        other.flags = sh_flags[ix];
        row = combine_state(row, other);
    }
    if (my_min_fill == ~0 && gl_LocalInvocationID.x == 0) {
        // No fill in this partition.
        state[state_flag_index(tile_ix) + 1] = 0x7f800000; // infinity
    }
    for (uint i = 0; i < N_ROWS; i++) {
        State st = combine_state(row, th_state[i]);
        if (my_min_fill == ix + i) {
            // First fill of the partition: record its right edge.
            state[state_flag_index(tile_ix) + 1] = floatBitsToUint(st.bbox.z);
        }
        // We write the state now for development purposes, but the
        // actual goal is to write transformed and annotated elements.
        //State_write(StateRef((ix + i) * State_size), st);

        // Here we read again from the original scene. There may be
        // gains to be had from stashing in shared memory or possibly
        // registers (though register pressure is an issue).
        ElementRef this_ref = Element_index(ref, i);
        AnnotatedRef out_ref = AnnotatedRef((ix + i) * Annotated_size);
        uint tag = Element_tag(this_ref);
        switch (tag) {
        case Element_FillLine:
        case Element_StrokeLine:
            LineSeg line = Element_StrokeLine_read(this_ref);
            AnnoStrokeLineSeg anno_line;
            anno_line.p0 = st.mat.xy * line.p0.x + st.mat.zw * line.p0.y + st.translate;
            anno_line.p1 = st.mat.xy * line.p1.x + st.mat.zw * line.p1.y + st.translate;
            if (tag == Element_StrokeLine) {
                anno_line.stroke = get_linewidth(st);
            } else {
                anno_line.stroke = vec2(0.0);
            }
            // We do encoding a bit by hand to minimize divergence. Another approach
            // would be to have a fill/stroke bool.
            uint out_tag = tag == Element_FillLine ? Annotated_FillLine : Annotated_StrokeLine;
            annotated[out_ref.offset >> 2] = out_tag;
            AnnoStrokeLineSeg_write(AnnoStrokeLineSegRef(out_ref.offset + 4), anno_line);
            break;
        case Element_Stroke:
            Stroke stroke = Element_Stroke_read(this_ref);
            AnnoStroke anno_stroke;
            anno_stroke.rgba_color = stroke.rgba_color;
            vec2 lw = get_linewidth(st);
            anno_stroke.bbox = st.bbox + vec4(-lw, lw);
            // Scale the width by the square root of the absolute determinant.
            // A transform that mirrors has a negative determinant, and sqrt()
            // of a negative value is undefined.
            anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
            Annotated_Stroke_write(out_ref, anno_stroke);
            break;
        case Element_Fill:
            Fill fill = Element_Fill_read(this_ref);
            AnnoFill anno_fill;
            anno_fill.rgba_color = fill.rgba_color;
            anno_fill.bbox = st.bbox;
            Annotated_Fill_write(out_ref, anno_fill);
            break;
        default:
            Annotated_Nop_write(out_ref);
            break;
        }
    }
}

Binary file not shown.

View file

@ -1,130 +0,0 @@
// Code auto-generated by piet-gpu-derive
// Typed references into the `fill_seg` buffer. In each, `offset` is a byte
// offset (the accessors convert it to a word index with >> 2).
struct FillTileHeaderRef {
uint offset;
};
struct FillItemHeaderRef {
uint offset;
};
struct FillSegmentRef {
uint offset;
};
struct FillSegChunkRef {
uint offset;
};
// Per-tile header: a count and a reference to item headers.
// NOTE(review): presumably `n` is the number of items at `items`; confirm.
struct FillTileHeader {
uint n;
FillItemHeaderRef items;
};
// Serialized size in bytes (two 32-bit words).
#define FillTileHeader_size 8
// Returns a reference to the `index`-th FillTileHeader of an array starting
// at `ref`.
FillTileHeaderRef FillTileHeader_index(FillTileHeaderRef ref, uint index) {
    uint byte_offset = ref.offset + index * FillTileHeader_size;
    return FillTileHeaderRef(byte_offset);
}
// Per-item header: a signed backdrop value and a reference to the item's
// first segment chunk.
struct FillItemHeader {
int backdrop;
FillSegChunkRef segments;
};
// Serialized size in bytes (two 32-bit words).
#define FillItemHeader_size 8
// Returns a reference to the `index`-th FillItemHeader of an array starting
// at `ref`.
FillItemHeaderRef FillItemHeader_index(FillItemHeaderRef ref, uint index) {
    uint byte_offset = ref.offset + index * FillItemHeader_size;
    return FillItemHeaderRef(byte_offset);
}
// A line segment given by its two endpoints.
struct FillSegment {
vec2 start;
vec2 end;
};
// Serialized size in bytes (four 32-bit floats).
#define FillSegment_size 16
// Returns a reference to the `index`-th FillSegment of an array starting at
// `ref`.
FillSegmentRef FillSegment_index(FillSegmentRef ref, uint index) {
    uint byte_offset = ref.offset + index * FillSegment_size;
    return FillSegmentRef(byte_offset);
}
// Header of a segment chunk: a count and a link to the next chunk.
// NOTE(review): presumably `n` counts the FillSegment records that follow
// the header; confirm against the code that walks these chunks.
struct FillSegChunk {
uint n;
FillSegChunkRef next;
};
// Serialized size in bytes (two 32-bit words).
#define FillSegChunk_size 8
// Returns a reference to the `index`-th FillSegChunk of an array starting at
// `ref`.
FillSegChunkRef FillSegChunk_index(FillSegChunkRef ref, uint index) {
    uint byte_offset = ref.offset + index * FillSegChunk_size;
    return FillSegChunkRef(byte_offset);
}
// Loads the FillTileHeader stored at byte offset `ref.offset` of `fill_seg`.
FillTileHeader FillTileHeader_read(FillTileHeaderRef ref) {
    uint word = ref.offset >> 2;
    uint n_word = fill_seg[word];
    uint items_word = fill_seg[word + 1];
    return FillTileHeader(n_word, FillItemHeaderRef(items_word));
}
// Stores `s` at byte offset `ref.offset` of `fill_seg`.
void FillTileHeader_write(FillTileHeaderRef ref, FillTileHeader s) {
    uint word = ref.offset >> 2;
    fill_seg[word] = s.n;
    fill_seg[word + 1] = s.items.offset;
}
// Loads the FillItemHeader stored at byte offset `ref.offset` of `fill_seg`.
FillItemHeader FillItemHeader_read(FillItemHeaderRef ref) {
    uint word = ref.offset >> 2;
    uint backdrop_word = fill_seg[word];
    uint segments_word = fill_seg[word + 1];
    // The backdrop is stored as the unsigned reinterpretation of a signed int.
    return FillItemHeader(int(backdrop_word), FillSegChunkRef(segments_word));
}
// Stores `s` at byte offset `ref.offset` of `fill_seg`.
void FillItemHeader_write(FillItemHeaderRef ref, FillItemHeader s) {
    uint word = ref.offset >> 2;
    fill_seg[word] = uint(s.backdrop);
    fill_seg[word + 1] = s.segments.offset;
}
// Loads the FillSegment stored at byte offset `ref.offset` of `fill_seg`.
FillSegment FillSegment_read(FillSegmentRef ref) {
    uint word = ref.offset >> 2;
    // Four consecutive words hold start.x, start.y, end.x, end.y as float bits.
    vec2 start = vec2(uintBitsToFloat(fill_seg[word]), uintBitsToFloat(fill_seg[word + 1]));
    vec2 end = vec2(uintBitsToFloat(fill_seg[word + 2]), uintBitsToFloat(fill_seg[word + 3]));
    return FillSegment(start, end);
}
// Encodes `s` into `fill_seg` at byte offset `ref.offset`.
void FillSegment_write(FillSegmentRef ref, FillSegment s) {
    // The buffer is addressed in 32-bit words, the reference in bytes.
    uint base = ref.offset >> 2;
    // Store the raw bit patterns of the floats: start.xy then end.xy.
    uvec2 start_bits = floatBitsToUint(s.start);
    uvec2 end_bits = floatBitsToUint(s.end);
    fill_seg[base] = start_bits.x;
    fill_seg[base + 1] = start_bits.y;
    fill_seg[base + 2] = end_bits.x;
    fill_seg[base + 3] = end_bits.y;
}
// Decodes the FillSegChunk header stored at byte offset `ref.offset` of `fill_seg`.
FillSegChunk FillSegChunk_read(FillSegChunkRef ref) {
    // The buffer is addressed in 32-bit words, the reference in bytes.
    uint base = ref.offset >> 2;
    uint seg_count = fill_seg[base];
    uint next_offset = fill_seg[base + 1];
    return FillSegChunk(seg_count, FillSegChunkRef(next_offset));
}
// Encodes the chunk header `s` into `fill_seg` at byte offset `ref.offset`.
void FillSegChunk_write(FillSegChunkRef ref, FillSegChunk s) {
    // The buffer is addressed in 32-bit words, the reference in bytes.
    uint base = ref.offset >> 2;
    fill_seg[base] = s.n;
    fill_seg[base + 1] = s.next.offset;
}

View file

@ -1,161 +0,0 @@
// This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph
// and outputs "instances" (references to item + translation) for each item
// that intersects the tilegroup.
//
// This implementation is simplistic and leaves a lot of performance on the
// table. A fancier implementation would use threadgroup shared memory or
// subgroups (or possibly both) to parallelize the reading of the input and
// the computation of tilegroup intersection.
//
// In addition, there are some features currently missing, such as support
// for clipping.
#version 450
#extension GL_GOOGLE_include_directive : enable
// It's possible we should lay this out with x and do our own math.
layout(local_size_x = 1, local_size_y = 32) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
layout(set = 0, binding = 2) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "setup.h"
// Capacity of the explicit traversal stack declared in main().
#define MAX_STACK 8
// One saved position in the scene-graph traversal performed by main().
struct StackElement {
// Group item whose children are being iterated.
PietItemRef group;
// Index of the next child of `group` to visit.
uint index;
// Translation accumulated from the root down to this group.
vec2 offset;
};
// Kernel 1 entry point. Each invocation handles the tilegroup addressed by
// gl_GlobalInvocationID.xy: it walks the scene graph iteratively (explicit
// stack, no recursion) and, for every non-group item whose translated
// bounding box overlaps the tilegroup, appends an Instance to the tilegroup's
// main list. Poly items are additionally appended to the stroke list and Fill
// items to the fill list. All three lists start in a fixed-size initial region
// and spill into space bump-allocated from `alloc` when that region is full.
void main() {
StackElement stack[MAX_STACK];
uint stack_ix = 0;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
// Last offset at which an entry may be written while still leaving room for
// one more record (a Jump or the final End) in the current region.
uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
// State for stroke references.
// Layout of the stroke region: one uint total count, then a Chunk header,
// then the Instance records of that chunk.
TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START);
ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4);
InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size);
uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_STROKE_ALLOC - Instance_size;
// stroke_chunk_n counts instances in the current chunk; stroke_n counts
// instances in all previously closed chunks.
uint stroke_chunk_n = 0;
uint stroke_n = 0;
// State for fill references. All this is a bit cut'n'paste, but making a
// proper abstraction isn't easy.
TileGroupRef fill_start = TileGroupRef(tg_ref.offset + TILEGROUP_FILL_START);
ChunkRef fill_chunk_start = ChunkRef(fill_start.offset + 4);
InstanceRef fill_ref = InstanceRef(fill_chunk_start.offset + Chunk_size);
uint fill_limit = fill_start.offset + TILEGROUP_INITIAL_FILL_ALLOC - Instance_size;
uint fill_chunk_n = 0;
uint fill_n = 0;
// Top-left corner of this tilegroup in pixels.
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
// The item at scene offset 0 is read as the root group.
PietItemRef root = PietItemRef(0);
SimpleGroup group = PietItem_Group_read(root);
// `tos` is the current traversal position (top of stack, kept in a local).
StackElement tos = StackElement(root, 0, group.offset.xy);
while (true) {
if (tos.index < group.n_items) {
Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index));
vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy;
// Strict overlap test between the translated bbox and the tilegroup rect.
bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
&& max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
bool is_group = false;
// `tag` is only assigned, and only read, when `hit` is true.
uint tag;
if (hit) {
PietItemRef item_ref = PietItem_index(group.items, tos.index);
tag = PietItem_tag(item_ref);
is_group = tag == PietItem_Group;
}
if (hit && !is_group) {
PietItemRef item_ref = PietItem_index(group.items, tos.index);
Instance ins = Instance(item_ref.offset, tos.offset);
if (tg_ref.offset > tg_limit) {
// Allocation exceeded; do atomic bump alloc.
// A Jump record written at the old position links to the new region.
uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC);
Jump jump = Jump(TileGroupRef(new_tg));
TileGroup_Jump_write(tg_ref, jump);
tg_ref = TileGroupRef(new_tg);
tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
}
TileGroup_Instance_write(tg_ref, ins);
tg_ref.offset += TileGroup_size;
if (tag == PietItem_Poly) {
if (stroke_ref.offset > stroke_limit) {
// Current stroke chunk is full: close it with its count and a link
// to a freshly allocated chunk, then continue writing there.
uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC);
Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke)));
stroke_chunk_start = ChunkRef(new_stroke);
stroke_ref = InstanceRef(new_stroke + Chunk_size);
stroke_n += stroke_chunk_n;
stroke_chunk_n = 0;
stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size;
}
Instance_write(stroke_ref, ins);
stroke_chunk_n++;
stroke_ref.offset += Instance_size;
} else if (tag == PietItem_Fill) {
if (fill_ref.offset > fill_limit) {
// Same chunk-spill scheme as for strokes above.
uint new_fill = atomicAdd(alloc, TILEGROUP_FILL_ALLOC);
Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(new_fill)));
fill_chunk_start = ChunkRef(new_fill);
fill_ref = InstanceRef(new_fill + Chunk_size);
fill_n += fill_chunk_n;
fill_chunk_n = 0;
fill_limit = new_fill + TILEGROUP_FILL_ALLOC - Instance_size;
}
Instance_write(fill_ref, ins);
fill_chunk_n++;
fill_ref.offset += Instance_size;
}
}
if (is_group) {
// Descend into the child group. The parent position is pushed only if it
// still has siblings left to visit, so exhausted parents cost no stack.
PietItemRef item_ref = PietItem_index(group.items, tos.index);
tos.index++;
if (tos.index < group.n_items) {
// NOTE(review): no check against MAX_STACK is visible here; nesting with
// more than MAX_STACK pending parents would index past `stack` - confirm
// that scene depth is bounded by the encoder.
stack[stack_ix++] = tos;
}
group = PietItem_Group_read(item_ref);
tos = StackElement(item_ref, 0, tos.offset + group.offset.xy);
} else {
tos.index++;
}
} else {
// processed all items in this group; pop the stack
if (stack_ix == 0) {
break;
}
tos = stack[--stack_ix];
// The group header is re-read because only the reference is kept on the stack.
group = PietItem_Group_read(tos.group);
}
}
TileGroup_End_write(tg_ref);
// Close the last stroke chunk (if any instance was written) and store the
// total stroke count in the first word of the stroke region.
stroke_n += stroke_chunk_n;
if (stroke_n > 0) {
Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0)));
}
tilegroup[stroke_start.offset >> 2] = stroke_n;
// Same finalization for the fill list.
fill_n += fill_chunk_n;
if (fill_n > 0) {
Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(0)));
}
tilegroup[fill_start.offset >> 2] = fill_n;
}

Binary file not shown.

View file

@ -1,167 +0,0 @@
// This is "kernel 2" (fill) in a 4-kernel pipeline. It processes the fill
// (polyline) items in the scene and generates a list of segments for each, for
// each tile.
#version 450
#extension GL_GOOGLE_include_directive : enable
layout(local_size_x = 32) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
layout(set = 0, binding = 2) buffer FillSegBuf {
uint[] fill_seg;
};
layout(set = 0, binding = 3) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "fill_seg.h"
#include "setup.h"
// Makes sure one more FillSegment can be written into the current chunk.
//
// chunk_n_segs    - segments already written to the current chunk; reset to 0
//                   when a continuation chunk is started.
// seg_chunk_ref   - header position of the current chunk; moved to freshly
//                   allocated space when the current space is exhausted.
// first_seg_chunk - set to the chunk position when the item's first chunk starts.
// seg_limit       - highest offset at which a segment may still start.
void alloc_chunk(inout uint chunk_n_segs, inout FillSegChunkRef seg_chunk_ref,
    inout FillSegChunkRef first_seg_chunk, inout uint seg_limit)
{
    if (chunk_n_segs != 0) {
        // A chunk is already open: spill into a new one only if the next
        // segment would start beyond the limit.
        uint next_seg_offset = seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs;
        if (next_seg_offset > seg_limit) {
            uint fresh = atomicAdd(alloc, SEG_CHUNK_ALLOC);
            seg_limit = fresh + SEG_CHUNK_ALLOC - FillSegment_size;
            // Close the full chunk, linking it to its continuation.
            FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(fresh)));
            seg_chunk_ref.offset = fresh;
            chunk_n_segs = 0;
        }
        return;
    }
    // First segment of the item: require 40 bytes of headroom before reusing
    // the current allocation, otherwise grab a new one.
    if (seg_chunk_ref.offset + 40 > seg_limit) {
        seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
        seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - FillSegment_size;
    }
    first_seg_chunk = seg_chunk_ref;
}
// Kernel 2 (fill) entry point. Each invocation handles one tile: it iterates
// the fill instances recorded for the enclosing tilegroup and, for each one,
// writes a FillItemHeader (backdrop + first segment chunk) and the list of
// outline segments that touch the tile.
void main() {
    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
    // Top-left corner of this tile in pixels.
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef fill_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_FILL_START);
    // The first word of the fill region holds the total instance count.
    uint fill_n = tilegroup[fill_start.offset >> 2];
    FillTileHeaderRef tile_header_ref = FillTileHeaderRef(tile_ix * FillTileHeader_size);
    if (fill_n > 0) {
        ChunkRef chunk_ref = ChunkRef(fill_start.offset + 4);
        Chunk chunk = Chunk_read(chunk_ref);
        InstanceRef fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
        // One FillItemHeader per fill instance, allocated contiguously.
        FillItemHeaderRef item_header = FillItemHeaderRef(atomicAdd(alloc, fill_n * FillItemHeader_size));
        FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, item_header));
        // Segment allocation state. It must persist across items so that the
        // chunks of consecutive items pack into the same allocation; it must
        // therefore NOT be redeclared inside the loop (a shadowing declaration
        // would leave the offset uninitialized when alloc_chunk reads it).
        FillSegChunkRef seg_chunk_ref = FillSegChunkRef(0);
        uint seg_limit = 0;
        // Iterate through items; chunk.chunk_n holds the count remaining in
        // the current instance chunk.
        while (true) {
            if (chunk.chunk_n == 0) {
                chunk_ref = chunk.next;
                if (chunk_ref.offset == 0) {
                    break;
                }
                chunk = Chunk_read(chunk_ref);
                fill_ref = InstanceRef(chunk_ref.offset + Chunk_size);
            }
            Instance ins = Instance_read(fill_ref);
            // NOTE(review): ins.offset is not applied to the points below -
            // confirm fills are meant to ignore the instance translation.
            PietFill fill = PietItem_Fill_read(PietItemRef(ins.item_ref));
            // Process the fill polyline item.
            uint max_n_segs = fill.n_points - 1;
            uint chunk_n_segs = 0;
            int backdrop = 0;
            // Stays 0 when no segment of this item touches the tile.
            FillSegChunkRef first_seg_chunk = FillSegChunkRef(0);
            vec2 start = Point_read(fill.points).xy;
            for (uint j = 0; j < max_n_segs; j++) {
                fill.points.offset += Point_size;
                vec2 end = Point_read(fill.points).xy;
                // Process one segment.
                // TODO: I think this would go more smoothly (and be easier to
                // make numerically robust) if it were based on clipping the line
                // to the tile box. See:
                // https://tavianator.com/fast-branchless-raybounding-box-intersections/
                vec2 xymin = min(start, end);
                vec2 xymax = max(start, end);
                // Implicit line equation a * x + b * y + c = 0 through start and end.
                float a = end.y - start.y;
                float b = start.x - end.x;
                float c = -(a * start.x + b * start.y);
                vec2 xy1 = xy0 + vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
                float ytop = max(xy0.y, xymin.y);
                float ybot = min(xy1.y, xymax.y);
                // Side of the line on which each corner of the clipped tile rect lies.
                float s00 = sign(b * ytop + a * xy0.x + c);
                float s01 = sign(b * ytop + a * xy1.x + c);
                float s10 = sign(b * ybot + a * xy0.x + c);
                float s11 = sign(b * ybot + a * xy1.x + c);
                float sTopLeft = sign(b * xy0.y + a * xy0.x + c);
                // The segment crosses the tile's top edge line on the matching
                // side of the top-left corner: adjust the backdrop.
                if (sTopLeft == sign(a) && xymin.y <= xy0.y && xymax.y > xy0.y) {
                    backdrop -= int(s00);
                }
                // This is adapted from piet-metal but could be improved.
                if (max(xymin.x, xy0.x) < min(xymax.x, xy1.x)
                    && ytop < ybot
                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
                {
                    // avoid overwriting `end` so that it can be used as start
                    vec2 enc_end = end;
                    if (xymin.x < xy0.x) {
                        // The conditions above guarantee xymin.x < xy0.x < xymax.x,
                        // so b is nonzero here.
                        float yEdge = mix(start.y, end.y, (start.x - xy0.x) / b);
                        if (yEdge >= xy0.y && yEdge < xy1.y) {
                            // This is encoded the same as a general fill segment, but could be
                            // special-cased, either here or in rendering. (It was special-cased
                            // in piet-metal).
                            FillSegment edge_seg;
                            if (b > 0.0) {
                                enc_end = vec2(xy0.x, yEdge);
                                edge_seg.start = enc_end;
                                edge_seg.end = vec2(xy0.x, xy1.y);
                            } else {
                                start = vec2(xy0.x, yEdge);
                                edge_seg.start = vec2(xy0.x, xy1.y);
                                edge_seg.end = start;
                            }
                            alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
                            FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), edge_seg);
                            chunk_n_segs++;
                        }
                    }
                    alloc_chunk(chunk_n_segs, seg_chunk_ref, first_seg_chunk, seg_limit);
                    FillSegment seg = FillSegment(start, enc_end);
                    FillSegment_write(FillSegmentRef(seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * chunk_n_segs), seg);
                    chunk_n_segs++;
                }
                start = end;
            }
            FillItemHeader_write(item_header, FillItemHeader(backdrop, first_seg_chunk));
            if (chunk_n_segs != 0) {
                // Close the item's last chunk and advance past it so the next
                // item continues in the remaining space of this allocation.
                FillSegChunk_write(seg_chunk_ref, FillSegChunk(chunk_n_segs, FillSegChunkRef(0)));
                seg_chunk_ref.offset += FillSegChunk_size + FillSegment_size * chunk_n_segs;
            }
            fill_ref.offset += Instance_size;
            chunk.chunk_n--;
            item_header.offset += FillItemHeader_size;
        }
    } else {
        // As an optimization, we could just write 0 for the size.
        FillTileHeader_write(tile_header_ref, FillTileHeader(fill_n, FillItemHeaderRef(0)));
    }
}

Binary file not shown.

View file

@ -1,137 +0,0 @@
// This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke
// (polyline) items in the scene and generates a list of segments for each, for
// each tile.
#version 450
#extension GL_GOOGLE_include_directive : enable
layout(local_size_x = 32) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
layout(set = 0, binding = 2) buffer SegmentBuf {
uint[] segment;
};
layout(set = 0, binding = 3) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "segment.h"
#include "setup.h"
// Kernel 2 (strokes) entry point. Each invocation handles one tile: it
// iterates the stroke instances recorded for the enclosing tilegroup and, for
// each one, writes an ItemHeader pointing at the list of polyline segments
// whose half-width-padded extent touches the tile (or 0 if there are none).
void main() {
    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
    // Top-left corner of this tile in pixels.
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START);
    // The first word of the stroke region holds the total instance count.
    uint stroke_n = tilegroup[stroke_start.offset >> 2];
    TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size);
    if (stroke_n > 0) {
        ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4);
        Chunk chunk = Chunk_read(chunk_ref);
        InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
        // One ItemHeader per stroke instance, allocated contiguously.
        ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size));
        TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header));
        // Segment allocation state. It must persist across items so that the
        // chunks of consecutive items pack into the same allocation; it must
        // therefore NOT be redeclared inside the loop (a shadowing declaration
        // would leave the offset uninitialized when it is first compared).
        SegChunkRef seg_chunk_ref = SegChunkRef(0);
        uint seg_limit = 0;
        // Iterate through items; chunk.chunk_n holds the count remaining in
        // the current instance chunk.
        while (true) {
            if (chunk.chunk_n == 0) {
                chunk_ref = chunk.next;
                if (chunk_ref.offset == 0) {
                    break;
                }
                chunk = Chunk_read(chunk_ref);
                stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
            }
            Instance ins = Instance_read(stroke_ref);
            // NOTE(review): ins.offset is not applied to the points below -
            // confirm strokes are meant to ignore the instance translation.
            PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref));
            // Process the stroke polyline item.
            uint max_n_segs = poly.n_points - 1;
            uint chunk_n_segs = 0;
            vec2 start = Point_read(poly.points).xy;
            for (uint j = 0; j < max_n_segs; j++) {
                poly.points.offset += Point_size;
                vec2 end = Point_read(poly.points).xy;
                // Process one segment.
                // This logic just tests for collision. What we probably want to do
                // is a clipping algorithm like Liang-Barsky, and then store coords
                // relative to the tile in f16. See also:
                // https://tavianator.com/fast-branchless-raybounding-box-intersections/
                // Also note that when we go to the fancy version, we want to compute
                // the (horizontal projection of) the bounding box of the intersection
                // once per tilegroup, so we can assign work to individual tiles.
                // Implicit line equation a * x + b * y + c = 0 through start and end.
                float a = end.y - start.y;
                float b = start.x - end.x;
                float c = -(a * start.x + b * start.y);
                float half_width = 0.5 * poly.width;
                // Tile boundaries padded by half-width.
                float xmin = xy0.x - half_width;
                float ymin = xy0.y - half_width;
                float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width;
                float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width;
                // Side of the line on which each corner of the padded tile lies.
                float s00 = sign(b * ymin + a * xmin + c);
                float s01 = sign(b * ymin + a * xmax + c);
                float s10 = sign(b * ymax + a * xmin + c);
                float s11 = sign(b * ymax + a * xmax + c);
                // If bounding boxes intersect and not all four corners are on the same side, hit.
                // Also note: this is designed to be false on NAN input.
                if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax)
                    && max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax)
                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
                {
                    // Allocate a chunk if needed.
                    if (chunk_n_segs == 0) {
                        // First segment of the item: require 40 bytes of headroom
                        // before reusing the current allocation.
                        if (seg_chunk_ref.offset + 40 > seg_limit) {
                            seg_chunk_ref.offset = atomicAdd(alloc, SEG_CHUNK_ALLOC);
                            seg_limit = seg_chunk_ref.offset + SEG_CHUNK_ALLOC - Segment_size;
                        }
                        ItemHeader_write(item_header, ItemHeader(seg_chunk_ref));
                    } else if (seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs > seg_limit) {
                        // Current chunk is full: close it with a link to a new one.
                        uint new_chunk_ref = atomicAdd(alloc, SEG_CHUNK_ALLOC);
                        seg_limit = new_chunk_ref + SEG_CHUNK_ALLOC - Segment_size;
                        SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(new_chunk_ref)));
                        seg_chunk_ref.offset = new_chunk_ref;
                        chunk_n_segs = 0;
                    }
                    Segment seg = Segment(start, end);
                    Segment_write(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * chunk_n_segs), seg);
                    chunk_n_segs++;
                }
                start = end;
            }
            if (chunk_n_segs == 0) {
                // No segment of this item touches the tile.
                ItemHeader_write(item_header, ItemHeader(SegChunkRef(0)));
            } else {
                // Close the item's last chunk and advance past it so the next
                // item continues in the remaining space of this allocation.
                SegChunk_write(seg_chunk_ref, SegChunk(chunk_n_segs, SegChunkRef(0)));
                seg_chunk_ref.offset += SegChunk_size + Segment_size * chunk_n_segs;
            }
            stroke_ref.offset += Instance_size;
            chunk.chunk_n--;
            item_header.offset += ItemHeader_size;
        }
    } else {
        // As an optimization, we could just write 0 for the size.
        TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0)));
    }
}

Binary file not shown.

View file

@ -1,135 +0,0 @@
// This is "kernel 3" in a 4-kernel pipeline. It walks the active items
// for the tilegroup and produces a per-tile command list for each tile.
#version 450
#extension GL_GOOGLE_include_directive : enable
layout(local_size_x = 32, local_size_y = 1) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
// TODO: this should have a `readonly` qualifier, but then inclusion
// of ptcl.h would fail because of the writers.
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
// Used readonly
layout(set = 0, binding = 2) buffer SegmentBuf {
uint[] segment;
};
// Used readonly
layout(set = 0, binding = 3) buffer FillSegmentBuf {
uint[] fill_seg;
};
layout(set = 0, binding = 4) buffer PtclBuf {
uint[] ptcl;
};
layout(set = 0, binding = 5) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "segment.h"
#include "fill_seg.h"
#include "ptcl.h"
#include "setup.h"
// Makes sure a command can be written at `cmd_ref`. When the current block is
// exhausted, a new block is bump-allocated from `alloc`, a Jump command to it
// is written at the old position, and `cmd_ref` / `cmd_limit` are moved to the
// new block.
void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset <= cmd_limit) {
        // Still room in the current block.
        return;
    }
    uint fresh_block = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
    Cmd_Jump_write(cmd_ref, CmdJump(fresh_block));
    cmd_ref = CmdRef(fresh_block);
    // Keep two command slots in reserve at the end of the block.
    cmd_limit = fresh_block + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
}
// Kernel 3 entry point. Each invocation handles one tile: it walks the
// instance list of the enclosing tilegroup and emits a per-tile command list
// (circle / stroke / fill / solid commands) terminated by an End command.
void main() {
uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+ (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
// Top-left corner of this tile in pixels.
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
// Each tile starts with a fixed-size command block; alloc_cmd() chains
// further blocks when it fills up.
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
// Per-tile headers; their `items` references are advanced once per
// Poly / Fill instance encountered in the loop below.
TileHeader stroke_th = TileHeader_read(TileHeaderRef(tile_ix * TileHeader_size));
FillTileHeader fill_th = FillTileHeader_read(FillTileHeaderRef(tile_ix * FillTileHeader_size));
while (true) {
uint tg_tag = TileGroup_tag(tg_ref);
if (tg_tag == TileGroup_End) {
break;
}
if (tg_tag == TileGroup_Jump) {
// Follow the link to the continuation of the instance list.
tg_ref = TileGroup_Jump_read(tg_ref).new_ref;
continue;
}
// Assume tg_tag is `Instance`, though there will be more cases.
Instance ins = TileGroup_Instance_read(tg_ref);
PietItemRef item_ref = PietItemRef(ins.item_ref);
uint item_tag = PietItem_tag(item_ref);
switch (item_tag) {
case PietItem_Circle:
PietCircle circle = PietItem_Circle_read(item_ref);
vec2 center = ins.offset + circle.center.xy;
float r = circle.radius;
// Emit the circle only if its bounding square overlaps the tile.
if (max(center.x - r, xy0.x) < min(center.x + r, xy0.x + float(TILE_WIDTH_PX))
&& max(center.y - r, xy0.y) < min(center.y + r, xy0.y + float(TILE_HEIGHT_PX)))
{
CmdCircle cmd = CmdCircle(center, r, circle.rgba_color);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Circle_write(cmd_ref, cmd);
cmd_ref.offset += Cmd_size;
}
break;
case PietItem_Poly:
// The header is consumed for every Poly instance, even when it has no
// segments in this tile, to stay in step with the instance list.
ItemHeader stroke_item = ItemHeader_read(stroke_th.items);
stroke_th.items.offset += ItemHeader_size;
if (stroke_item.segments.offset != 0) {
PietStrokePolyLine poly = PietItem_Poly_read(item_ref);
CmdStroke cmd = CmdStroke(
stroke_item.segments.offset,
0.5 * poly.width,
poly.rgba_color
);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Stroke_write(cmd_ref, cmd);
cmd_ref.offset += Cmd_size;
}
break;
case PietItem_Fill:
// Consumed for every Fill instance, as for strokes above.
FillItemHeader fill_item = FillItemHeader_read(fill_th.items);
fill_th.items.offset += FillItemHeader_size;
// No segments but a nonzero backdrop means the tile is solidly covered;
// that case is handled by the else-if branch below.
if (fill_item.segments.offset != 0) {
PietFill fill = PietItem_Fill_read(item_ref);
CmdFill cmd = CmdFill(
fill_item.segments.offset,
fill_item.backdrop,
fill.rgba_color
);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Fill_write(cmd_ref, cmd);
cmd_ref.offset += Cmd_size;
} else if (fill_item.backdrop != 0) {
// TODO: truncate existing cmd list if alpha is opaque
PietFill fill = PietItem_Fill_read(item_ref);
alloc_cmd(cmd_ref, cmd_limit);
Cmd_Solid_write(cmd_ref, CmdSolid(fill.rgba_color));
cmd_ref.offset += Cmd_size;
}
break;
}
tg_ref.offset += TileGroup_size;
}
Cmd_End_write(cmd_ref);
}

Binary file not shown.

View file

@ -6,29 +6,20 @@
#version 450
#extension GL_GOOGLE_include_directive : enable
#extension GL_KHR_shader_subgroup_basic : enable
layout(local_size_x = 16, local_size_y = 16) in;
#define CHUNK 8
#define CHUNK_DY (16 / CHUNK)
layout(local_size_x = 16, local_size_y = 2) in;
// Same concern that this should be readonly as in kernel 3.
layout(set = 0, binding = 0) buffer PtclBuf {
uint[] ptcl;
};
// Used readonly
layout(set = 0, binding = 1) buffer SegmentBuf {
uint[] segment;
};
// Used readonly
layout(set = 0, binding = 2) buffer FillSegBuf {
uint[] fill_seg;
};
layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
layout(rgba8, set = 0, binding = 1) uniform writeonly image2D image;
#include "ptcl.h"
#include "segment.h"
#include "fill_seg.h"
#include "setup.h"
@ -36,10 +27,14 @@ void main() {
uint tile_ix = gl_WorkGroupID.y * WIDTH_IN_TILES + gl_WorkGroupID.x;
CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
uvec2 xy_uint = gl_GlobalInvocationID.xy;
uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
vec2 xy = vec2(xy_uint);
vec2 uv = xy * vec2(1.0 / IMAGE_WIDTH, 1.0 / IMAGE_HEIGHT);
vec3 rgb = uv.xyy;
//vec3 rgb = uv.xyy;
vec3 rgb[CHUNK];
for (uint i = 0; i < CHUNK; i++) {
rgb[i] = vec3(0.5);
}
while (true) {
uint tag = Cmd_tag(cmd_ref);
@ -49,65 +44,85 @@ void main() {
switch (tag) {
case Cmd_Circle:
CmdCircle circle = Cmd_Circle_read(cmd_ref);
float r = length(xy + vec2(0.5, 0.5) - circle.center.xy);
float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
vec4 fg_rgba = unpackUnorm4x8(circle.rgba_color).wzyx;
// TODO: sRGB
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
for (uint i = 0; i < CHUNK; i++) {
float dy = float(i * CHUNK_DY);
float r = length(vec2(xy.x, xy.y + dy) + vec2(0.5, 0.5) - circle.center.xy);
float alpha = clamp(0.5 + circle.radius - r, 0.0, 1.0);
// TODO: sRGB
rgb[i] = mix(rgb[i], fg_rgba.rgb, alpha * fg_rgba.a);
}
break;
case Cmd_Stroke:
CmdStroke stroke = Cmd_Stroke_read(cmd_ref);
float df = 1e9;
SegChunkRef seg_chunk_ref = SegChunkRef(stroke.seg_ref);
float df[CHUNK];
for (uint k = 0; k < CHUNK; k++) df[k] = 1e9;
SegChunkRef seg_chunk_ref = stroke.seg_ref;
do {
SegChunk seg_chunk = SegChunk_read(seg_chunk_ref);
SegmentRef segs = seg_chunk.segs;
for (int i = 0; i < seg_chunk.n; i++) {
Segment seg = Segment_read(SegmentRef(seg_chunk_ref.offset + SegChunk_size + Segment_size * i));
Segment seg = Segment_read(Segment_index(segs, i));
vec2 line_vec = seg.end - seg.start;
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df = min(df, length(line_vec * t - dpos));
for (uint k = 0; k < CHUNK; k++) {
vec2 dpos = xy + vec2(0.5, 0.5) - seg.start;
dpos.y += float(k * CHUNK_DY);
float t = clamp(dot(line_vec, dpos) / dot(line_vec, line_vec), 0.0, 1.0);
df[k] = min(df[k], length(line_vec * t - dpos));
}
}
seg_chunk_ref = seg_chunk.next;
} while (seg_chunk_ref.offset != 0);
fg_rgba = unpackUnorm4x8(stroke.rgba_color).wzyx;
alpha = clamp(stroke.half_width + 0.5 - df, 0.0, 1.0);
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
for (uint k = 0; k < CHUNK; k++) {
float alpha = clamp(stroke.half_width + 0.5 - df[k], 0.0, 1.0);
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
}
break;
case Cmd_Fill:
CmdFill fill = Cmd_Fill_read(cmd_ref);
// Probably better to store as float, but conversion is no doubt cheap.
float area = float(fill.backdrop);
FillSegChunkRef fill_seg_chunk_ref = FillSegChunkRef(fill.seg_ref);
float area[CHUNK];
for (uint k = 0; k < CHUNK; k++) area[k] = float(fill.backdrop);
SegChunkRef fill_seg_chunk_ref = fill.seg_ref;
do {
FillSegChunk seg_chunk = FillSegChunk_read(fill_seg_chunk_ref);
SegChunk seg_chunk = SegChunk_read(fill_seg_chunk_ref);
SegmentRef segs = seg_chunk.segs;
for (int i = 0; i < seg_chunk.n; i++) {
FillSegment seg = FillSegment_read(FillSegmentRef(fill_seg_chunk_ref.offset + FillSegChunk_size + FillSegment_size * i));
vec2 start = seg.start - xy;
vec2 end = seg.end - xy;
vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
if (window.x != window.y) {
vec2 t = (window - start.y) / (end.y - start.y);
vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0);
float c = max(b, 0.0);
float d = max(xmin, 0.0);
float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area += a * (window.x - window.y);
Segment seg = Segment_read(Segment_index(segs, i));
for (uint k = 0; k < CHUNK; k++) {
vec2 my_xy = vec2(xy.x, xy.y + float(k * CHUNK_DY));
vec2 start = seg.start - my_xy;
vec2 end = seg.end - my_xy;
vec2 window = clamp(vec2(start.y, end.y), 0.0, 1.0);
if (window.x != window.y) {
vec2 t = (window - start.y) / (end.y - start.y);
vec2 xs = vec2(mix(start.x, end.x, t.x), mix(start.x, end.x, t.y));
float xmin = min(min(xs.x, xs.y), 1.0) - 1e-6;
float xmax = max(xs.x, xs.y);
float b = min(xmax, 1.0);
float c = max(b, 0.0);
float d = max(xmin, 0.0);
float a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
area[k] += a * (window.x - window.y);
}
area[k] += sign(end.x - start.x) * clamp(my_xy.y - seg.y_edge + 1.0, 0.0, 1.0);
}
}
fill_seg_chunk_ref = seg_chunk.next;
} while (fill_seg_chunk_ref.offset != 0);
fg_rgba = unpackUnorm4x8(fill.rgba_color).wzyx;
alpha = min(abs(area), 1.0);
rgb = mix(rgb, fg_rgba.rgb, alpha * fg_rgba.a);
for (uint k = 0; k < CHUNK; k++) {
float alpha = min(abs(area[k]), 1.0);
rgb[k] = mix(rgb[k], fg_rgba.rgb, alpha * fg_rgba.a);
}
break;
case Cmd_Solid:
CmdSolid solid = Cmd_Solid_read(cmd_ref);
fg_rgba = unpackUnorm4x8(solid.rgba_color).wzyx;
rgb = mix(rgb, fg_rgba.rgb, fg_rgba.a);
for (uint k = 0; k < CHUNK; k++) {
rgb[k] = mix(rgb[k], fg_rgba.rgb, fg_rgba.a);
}
break;
case Cmd_Jump:
cmd_ref = CmdRef(Cmd_Jump_read(cmd_ref).new_ref);
@ -116,5 +131,8 @@ void main() {
cmd_ref.offset += Cmd_size;
}
imageStore(image, ivec2(xy_uint), vec4(rgb, 1.0));
// TODO: sRGB
for (uint i = 0; i < CHUNK; i++) {
imageStore(image, ivec2(xy_uint.x, xy_uint.y + CHUNK_DY * i), vec4(rgb[i], 1.0));
}
}

Binary file not shown.

View file

@ -36,6 +36,14 @@ struct CmdRef {
uint offset;
};
// Typed reference to a Segment: a byte offset into the `ptcl` buffer.
struct SegmentRef {
uint offset;
};
// Typed reference to a SegChunk: a byte offset into the `ptcl` buffer.
struct SegChunkRef {
uint offset;
};
struct CmdCircle {
vec2 center;
float radius;
@ -60,7 +68,7 @@ CmdLineRef CmdLine_index(CmdLineRef ref, uint index) {
}
struct CmdStroke {
uint seg_ref;
SegChunkRef seg_ref;
float half_width;
uint rgba_color;
};
@ -72,7 +80,7 @@ CmdStrokeRef CmdStroke_index(CmdStrokeRef ref, uint index) {
}
struct CmdFill {
uint seg_ref;
SegChunkRef seg_ref;
int backdrop;
uint rgba_color;
};
@ -141,6 +149,30 @@ CmdRef Cmd_index(CmdRef ref, uint index) {
return CmdRef(ref.offset + index * Cmd_size);
}
// A line segment stored in the per-tile command list buffer.
struct Segment {
    vec2 start;
    vec2 end;
    // Extra y coordinate carried with the segment; kernel 4 reads it when
    // accumulating fill area.
    float y_edge;
};

// Encoded size of one Segment, in bytes (five 32-bit floats).
#define Segment_size 20

// Returns a reference `index` Segment records past `ref`.
SegmentRef Segment_index(SegmentRef ref, uint index) {
    uint byte_offset = ref.offset + index * Segment_size;
    return SegmentRef(byte_offset);
}
// Header of one chunk in a linked list of segments.
struct SegChunk {
    // Number of segments in this chunk.
    uint n;
    // Next chunk in the list; kernel 4 stops when this offset is 0.
    SegChunkRef next;
    // Location of this chunk's first segment.
    SegmentRef segs;
};

// Encoded size of one SegChunk header, in bytes (three 32-bit words).
#define SegChunk_size 12

// Returns a reference `index` SegChunk headers past `ref`.
SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
    uint byte_offset = ref.offset + index * SegChunk_size;
    return SegChunkRef(byte_offset);
}
CmdCircle CmdCircle_read(CmdCircleRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = ptcl[ix + 0];
@ -188,7 +220,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
CmdStroke s;
s.seg_ref = raw0;
s.seg_ref = SegChunkRef(raw0);
s.half_width = uintBitsToFloat(raw1);
s.rgba_color = raw2;
return s;
@ -196,7 +228,7 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.seg_ref;
ptcl[ix + 0] = s.seg_ref.offset;
ptcl[ix + 1] = floatBitsToUint(s.half_width);
ptcl[ix + 2] = s.rgba_color;
}
@ -207,7 +239,7 @@ CmdFill CmdFill_read(CmdFillRef ref) {
uint raw1 = ptcl[ix + 1];
uint raw2 = ptcl[ix + 2];
CmdFill s;
s.seg_ref = raw0;
s.seg_ref = SegChunkRef(raw0);
s.backdrop = int(raw1);
s.rgba_color = raw2;
return s;
@ -215,7 +247,7 @@ CmdFill CmdFill_read(CmdFillRef ref) {
void CmdFill_write(CmdFillRef ref, CmdFill s) {
uint ix = ref.offset >> 2;
ptcl[ix + 0] = s.seg_ref;
ptcl[ix + 0] = s.seg_ref.offset;
ptcl[ix + 1] = uint(s.backdrop);
ptcl[ix + 2] = s.rgba_color;
}
@ -362,3 +394,45 @@ void Cmd_Bail_write(CmdRef ref) {
ptcl[ref.offset >> 2] = Cmd_Bail;
}
// Decodes the Segment stored at byte offset `ref.offset` of `ptcl`.
Segment Segment_read(SegmentRef ref) {
    // The buffer is addressed in 32-bit words, the reference in bytes.
    uint base = ref.offset >> 2;
    // Floats are stored as raw bit patterns: start.xy, end.xy, y_edge.
    vec2 seg_start = uintBitsToFloat(uvec2(ptcl[base], ptcl[base + 1]));
    vec2 seg_end = uintBitsToFloat(uvec2(ptcl[base + 2], ptcl[base + 3]));
    float y_edge = uintBitsToFloat(ptcl[base + 4]);
    return Segment(seg_start, seg_end, y_edge);
}
// Encodes `s` into `ptcl` at byte offset `ref.offset`.
void Segment_write(SegmentRef ref, Segment s) {
    // The buffer is addressed in 32-bit words, the reference in bytes.
    uint base = ref.offset >> 2;
    // Store the raw bit patterns of the floats: start.xy, end.xy, y_edge.
    uvec2 start_bits = floatBitsToUint(s.start);
    uvec2 end_bits = floatBitsToUint(s.end);
    ptcl[base] = start_bits.x;
    ptcl[base + 1] = start_bits.y;
    ptcl[base + 2] = end_bits.x;
    ptcl[base + 3] = end_bits.y;
    ptcl[base + 4] = floatBitsToUint(s.y_edge);
}
// Decodes the SegChunk header stored at byte offset `ref.offset` of `ptcl`.
SegChunk SegChunk_read(SegChunkRef ref) {
    // The buffer is addressed in 32-bit words, the reference in bytes.
    uint base = ref.offset >> 2;
    uint seg_count = ptcl[base];
    uint next_offset = ptcl[base + 1];
    uint segs_offset = ptcl[base + 2];
    return SegChunk(seg_count, SegChunkRef(next_offset), SegmentRef(segs_offset));
}
// Encodes the chunk header `s` into `ptcl` at byte offset `ref.offset`.
void SegChunk_write(SegChunkRef ref, SegChunk s) {
    // The buffer is addressed in 32-bit words, the reference in bytes.
    uint base = ref.offset >> 2;
    ptcl[base] = s.n;
    ptcl[base + 1] = s.next.offset;
    ptcl[base + 2] = s.segs.offset;
}

View file

@ -32,6 +32,38 @@ struct PietItemRef {
uint offset;
};
// Typed references into the `scene` buffer. Each wraps a byte offset so that
// references to different record types cannot be mixed up.

// Reference to a LineSeg record.
struct LineSegRef {
uint offset;
};
// Reference to a QuadSeg record.
struct QuadSegRef {
uint offset;
};
// Reference to a CubicSeg record.
struct CubicSegRef {
uint offset;
};
// Reference to a Fill record.
struct FillRef {
uint offset;
};
// Reference to a Stroke record.
struct StrokeRef {
uint offset;
};
// Reference to a SetLineWidth record.
struct SetLineWidthRef {
uint offset;
};
// Reference to a Transform record.
struct TransformRef {
uint offset;
};
// Reference to a tagged Element record.
struct ElementRef {
uint offset;
};
struct Bbox {
ivec4 bbox;
};
@ -128,6 +160,98 @@ PietItemRef PietItem_index(PietItemRef ref, uint index) {
return PietItemRef(ref.offset + index * PietItem_size);
}
// A straight path segment given by its two endpoints.
struct LineSeg {
    vec2 p0;
    vec2 p1;
};

// Encoded size of one LineSeg, in bytes (four 32-bit floats).
#define LineSeg_size 16

// Returns a reference `index` LineSeg records past `ref`.
LineSegRef LineSeg_index(LineSegRef ref, uint index) {
    uint byte_offset = ref.offset + index * LineSeg_size;
    return LineSegRef(byte_offset);
}
// A path segment given by three points (presumably the control points of a
// quadratic Bezier - confirm against the encoder).
struct QuadSeg {
    vec2 p0;
    vec2 p1;
    vec2 p2;
};

// Encoded size of one QuadSeg, in bytes (six 32-bit floats).
#define QuadSeg_size 24

// Returns a reference `index` QuadSeg records past `ref`.
QuadSegRef QuadSeg_index(QuadSegRef ref, uint index) {
    uint byte_offset = ref.offset + index * QuadSeg_size;
    return QuadSegRef(byte_offset);
}
// A path segment given by four points (presumably the control points of a
// cubic Bezier - confirm against the encoder).
struct CubicSeg {
    vec2 p0;
    vec2 p1;
    vec2 p2;
    vec2 p3;
};

// Encoded size of one CubicSeg, in bytes (eight 32-bit floats).
#define CubicSeg_size 32

// Returns a reference `index` CubicSeg records past `ref`.
CubicSegRef CubicSeg_index(CubicSegRef ref, uint index) {
    uint byte_offset = ref.offset + index * CubicSeg_size;
    return CubicSegRef(byte_offset);
}
// Fill element payload: a single packed 32-bit color.
struct Fill {
    uint rgba_color;
};

// Encoded size of one Fill, in bytes (one 32-bit word).
#define Fill_size 4

// Returns a reference `index` Fill records past `ref`.
FillRef Fill_index(FillRef ref, uint index) {
    uint byte_offset = ref.offset + index * Fill_size;
    return FillRef(byte_offset);
}
// Stroke element payload: a single packed 32-bit color.
struct Stroke {
    uint rgba_color;
};

// Encoded size of one Stroke, in bytes (one 32-bit word).
#define Stroke_size 4

// Returns a reference `index` Stroke records past `ref`.
StrokeRef Stroke_index(StrokeRef ref, uint index) {
    uint byte_offset = ref.offset + index * Stroke_size;
    return StrokeRef(byte_offset);
}
// SetLineWidth element payload: the line width as a float.
struct SetLineWidth {
    float width;
};

// Encoded size of one SetLineWidth, in bytes (one 32-bit float).
#define SetLineWidth_size 4

// Returns a reference `index` SetLineWidth records past `ref`.
SetLineWidthRef SetLineWidth_index(SetLineWidthRef ref, uint index) {
    uint byte_offset = ref.offset + index * SetLineWidth_size;
    return SetLineWidthRef(byte_offset);
}
// Transform element payload.
struct Transform {
    // Four matrix coefficients (presumably a 2x2 linear part - confirm the
    // element order against the encoder).
    vec4 mat;
    // Translation component.
    vec2 translate;
};

// Encoded size of one Transform, in bytes (six 32-bit floats).
#define Transform_size 24

// Returns a reference `index` Transform records past `ref`.
TransformRef Transform_index(TransformRef ref, uint index) {
    uint byte_offset = ref.offset + index * Transform_size;
    return TransformRef(byte_offset);
}
// Tag values identifying the variant stored in an Element record.
#define Element_Nop 0
#define Element_StrokeLine 1
#define Element_FillLine 2
#define Element_Quad 3
#define Element_Cubic 4
#define Element_Stroke 5
#define Element_Fill 6
#define Element_SetLineWidth 7
#define Element_Transform 8

// Encoded size of one Element, in bytes (presumably a 4-byte tag plus the
// largest 32-byte payload - confirm against the generator).
#define Element_size 36

// Returns a reference `index` Element records past `ref`.
ElementRef Element_index(ElementRef ref, uint index) {
    uint byte_offset = ref.offset + index * Element_size;
    return ElementRef(byte_offset);
}
Bbox Bbox_read(BboxRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = scene[ix + 0];
@ -236,3 +360,122 @@ PietStrokePolyLine PietItem_Poly_read(PietItemRef ref) {
return PietStrokePolyLine_read(PietStrokePolyLineRef(ref.offset + 4));
}
// Decodes the LineSeg stored at `ref` in the scene buffer.
LineSeg LineSeg_read(LineSegRef ref) {
    // The buffer is addressed in 32-bit words, so convert the offset.
    uint base = ref.offset >> 2;
    LineSeg seg;
    seg.p0 = uintBitsToFloat(uvec2(scene[base + 0], scene[base + 1]));
    seg.p1 = uintBitsToFloat(uvec2(scene[base + 2], scene[base + 3]));
    return seg;
}
// Decodes the QuadSeg stored at `ref` in the scene buffer.
QuadSeg QuadSeg_read(QuadSegRef ref) {
    // The buffer is addressed in 32-bit words, so convert the offset.
    uint base = ref.offset >> 2;
    QuadSeg seg;
    seg.p0 = uintBitsToFloat(uvec2(scene[base + 0], scene[base + 1]));
    seg.p1 = uintBitsToFloat(uvec2(scene[base + 2], scene[base + 3]));
    seg.p2 = uintBitsToFloat(uvec2(scene[base + 4], scene[base + 5]));
    return seg;
}
// Decodes the CubicSeg stored at `ref` in the scene buffer.
CubicSeg CubicSeg_read(CubicSegRef ref) {
    // The buffer is addressed in 32-bit words, so convert the offset.
    uint base = ref.offset >> 2;
    CubicSeg seg;
    seg.p0 = uintBitsToFloat(uvec2(scene[base + 0], scene[base + 1]));
    seg.p1 = uintBitsToFloat(uvec2(scene[base + 2], scene[base + 3]));
    seg.p2 = uintBitsToFloat(uvec2(scene[base + 4], scene[base + 5]));
    seg.p3 = uintBitsToFloat(uvec2(scene[base + 6], scene[base + 7]));
    return seg;
}
// Decodes the Fill stored at `ref` in the scene buffer.
Fill Fill_read(FillRef ref) {
    uint base = ref.offset >> 2;
    Fill fill;
    fill.rgba_color = scene[base + 0];
    return fill;
}
// Decodes the Stroke stored at `ref` in the scene buffer.
Stroke Stroke_read(StrokeRef ref) {
    uint base = ref.offset >> 2;
    Stroke stroke;
    stroke.rgba_color = scene[base + 0];
    return stroke;
}
// Decodes the SetLineWidth stored at `ref` in the scene buffer.
SetLineWidth SetLineWidth_read(SetLineWidthRef ref) {
    uint base = ref.offset >> 2;
    SetLineWidth lw;
    // The width is stored as the raw bit pattern of a float.
    lw.width = uintBitsToFloat(scene[base + 0]);
    return lw;
}
// Decodes the Transform stored at `ref` in the scene buffer.
Transform Transform_read(TransformRef ref) {
    // The buffer is addressed in 32-bit words, so convert the offset.
    uint base = ref.offset >> 2;
    Transform t;
    t.mat = uintBitsToFloat(uvec4(scene[base + 0], scene[base + 1], scene[base + 2], scene[base + 3]));
    t.translate = uintBitsToFloat(uvec2(scene[base + 4], scene[base + 5]));
    return t;
}
// Returns the tag word of the Element at `ref` (one of the Element_* values).
uint Element_tag(ElementRef ref) {
    uint tag_ix = ref.offset >> 2;
    return scene[tag_ix];
}
// Reads the LineSeg payload of a StrokeLine element; the payload starts
// 4 bytes past the element's start.
LineSeg Element_StrokeLine_read(ElementRef ref) {
    LineSegRef payload = LineSegRef(ref.offset + 4);
    return LineSeg_read(payload);
}
// Reads the LineSeg payload of a FillLine element; the payload starts
// 4 bytes past the element's start.
LineSeg Element_FillLine_read(ElementRef ref) {
    LineSegRef payload = LineSegRef(ref.offset + 4);
    return LineSeg_read(payload);
}
// Reads the QuadSeg payload of a Quad element; the payload starts
// 4 bytes past the element's start.
QuadSeg Element_Quad_read(ElementRef ref) {
    QuadSegRef payload = QuadSegRef(ref.offset + 4);
    return QuadSeg_read(payload);
}
// Reads the CubicSeg payload of a Cubic element; the payload starts
// 4 bytes past the element's start.
CubicSeg Element_Cubic_read(ElementRef ref) {
    CubicSegRef payload = CubicSegRef(ref.offset + 4);
    return CubicSeg_read(payload);
}
// Reads the Stroke payload of a Stroke element; the payload starts
// 4 bytes past the element's start.
Stroke Element_Stroke_read(ElementRef ref) {
    StrokeRef payload = StrokeRef(ref.offset + 4);
    return Stroke_read(payload);
}
// Reads the Fill payload of a Fill element; the payload starts
// 4 bytes past the element's start.
Fill Element_Fill_read(ElementRef ref) {
    FillRef payload = FillRef(ref.offset + 4);
    return Fill_read(payload);
}
// Reads the SetLineWidth payload of a SetLineWidth element; the payload
// starts 4 bytes past the element's start.
SetLineWidth Element_SetLineWidth_read(ElementRef ref) {
    SetLineWidthRef payload = SetLineWidthRef(ref.offset + 4);
    return SetLineWidth_read(payload);
}
// Reads the Transform payload of a Transform element; the payload starts
// 4 bytes past the element's start.
Transform Element_Transform_read(ElementRef ref) {
    TransformRef payload = TransformRef(ref.offset + 4);
    return Transform_read(payload);
}

View file

@ -1,126 +0,0 @@
// Code auto-generated by piet-gpu-derive
struct TileHeaderRef {
uint offset;
};
struct ItemHeaderRef {
uint offset;
};
struct SegmentRef {
uint offset;
};
struct SegChunkRef {
uint offset;
};
struct TileHeader {
uint n;
ItemHeaderRef items;
};
#define TileHeader_size 8
TileHeaderRef TileHeader_index(TileHeaderRef ref, uint index) {
return TileHeaderRef(ref.offset + index * TileHeader_size);
}
struct ItemHeader {
SegChunkRef segments;
};
#define ItemHeader_size 4
ItemHeaderRef ItemHeader_index(ItemHeaderRef ref, uint index) {
return ItemHeaderRef(ref.offset + index * ItemHeader_size);
}
struct Segment {
vec2 start;
vec2 end;
};
#define Segment_size 16
SegmentRef Segment_index(SegmentRef ref, uint index) {
return SegmentRef(ref.offset + index * Segment_size);
}
struct SegChunk {
uint n;
SegChunkRef next;
};
#define SegChunk_size 8
SegChunkRef SegChunk_index(SegChunkRef ref, uint index) {
return SegChunkRef(ref.offset + index * SegChunk_size);
}
TileHeader TileHeader_read(TileHeaderRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
uint raw1 = segment[ix + 1];
TileHeader s;
s.n = raw0;
s.items = ItemHeaderRef(raw1);
return s;
}
void TileHeader_write(TileHeaderRef ref, TileHeader s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = s.n;
segment[ix + 1] = s.items.offset;
}
ItemHeader ItemHeader_read(ItemHeaderRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
ItemHeader s;
s.segments = SegChunkRef(raw0);
return s;
}
void ItemHeader_write(ItemHeaderRef ref, ItemHeader s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = s.segments.offset;
}
Segment Segment_read(SegmentRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
uint raw1 = segment[ix + 1];
uint raw2 = segment[ix + 2];
uint raw3 = segment[ix + 3];
Segment s;
s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
return s;
}
void Segment_write(SegmentRef ref, Segment s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = floatBitsToUint(s.start.x);
segment[ix + 1] = floatBitsToUint(s.start.y);
segment[ix + 2] = floatBitsToUint(s.end.x);
segment[ix + 3] = floatBitsToUint(s.end.y);
}
SegChunk SegChunk_read(SegChunkRef ref) {
uint ix = ref.offset >> 2;
uint raw0 = segment[ix + 0];
uint raw1 = segment[ix + 1];
SegChunk s;
s.n = raw0;
s.next = SegChunkRef(raw1);
return s;
}
void SegChunk_write(SegChunkRef ref, SegChunk s) {
uint ix = ref.offset >> 2;
segment[ix + 0] = s.n;
segment[ix + 1] = s.next.offset;
}

View file

@ -39,4 +39,26 @@
// Maximum number of segments in a SegChunk
#define SEG_CHUNK_N 32
#define SEG_CHUNK_ALLOC 512
#define SEG_CHUNK_ALLOC 512
// Stuff for new algorithm follows; some of the above should get
// deleted.
// These should probably be renamed and/or reworked. In the binning
// kernel, they represent the number of bins. Also, the workgroup size
// of that kernel is equal to the number of bins, but should probably
// be more flexible (it's 512 in the K&L paper).
#define N_TILE_X 16
#define N_TILE_Y 16
#define N_TILE (N_TILE_X * N_TILE_Y)
#define LG_N_TILE 8
#define N_SLICE (N_TILE / 32)
// Number of workgroups for binning kernel
#define N_WG 16
// This is the ratio of the number of elements in a binning workgroup
// over the number of elements in a partition workgroup.
#define ELEMENT_BINNING_RATIO 2
#define BIN_INITIAL_ALLOC 64
#define BIN_ALLOC 256

59
piet-gpu/shader/state.h Normal file
View file

@ -0,0 +1,59 @@
// Code auto-generated by piet-gpu-derive
// Typed reference to a State record in the `state` buffer. State_read and
// State_write convert `offset` to a word index with `offset >> 2`.
struct StateRef {
uint offset;
};
// One State record as stored in the `state` buffer.
struct State {
    vec4 mat;
    vec2 translate;
    vec4 bbox;
    float linewidth;
    uint flags;
};

// Encoded size of a State (twelve 32-bit words).
#define State_size 48

// Returns a reference to the `index`-th State counted from `ref`.
StateRef State_index(StateRef ref, uint index) {
    uint byte_delta = index * State_size;
    return StateRef(ref.offset + byte_delta);
}
// Decodes the State stored at `ref` in the `state` buffer.
State State_read(StateRef ref) {
    // The buffer is addressed in 32-bit words, so convert the offset.
    uint base = ref.offset >> 2;
    State st;
    st.mat = uintBitsToFloat(uvec4(state[base + 0], state[base + 1], state[base + 2], state[base + 3]));
    st.translate = uintBitsToFloat(uvec2(state[base + 4], state[base + 5]));
    st.bbox = uintBitsToFloat(uvec4(state[base + 6], state[base + 7], state[base + 8], state[base + 9]));
    st.linewidth = uintBitsToFloat(state[base + 10]);
    // Flags are stored as a plain uint, no bit reinterpretation needed.
    st.flags = state[base + 11];
    return st;
}
// Encodes `s` into the `state` buffer at `ref`, using the same word layout
// that State_read decodes.
void State_write(StateRef ref, State s) {
    uint base = ref.offset >> 2;
    // Reinterpret each float group as raw bits once, then store word by word.
    uvec4 mat_bits = floatBitsToUint(s.mat);
    uvec2 translate_bits = floatBitsToUint(s.translate);
    uvec4 bbox_bits = floatBitsToUint(s.bbox);
    state[base + 0] = mat_bits.x;
    state[base + 1] = mat_bits.y;
    state[base + 2] = mat_bits.z;
    state[base + 3] = mat_bits.w;
    state[base + 4] = translate_bits.x;
    state[base + 5] = translate_bits.y;
    state[base + 6] = bbox_bits.x;
    state[base + 7] = bbox_bits.y;
    state[base + 8] = bbox_bits.z;
    state[base + 9] = bbox_bits.w;
    state[base + 10] = floatBitsToUint(s.linewidth);
    state[base + 11] = s.flags;
}

View file

@ -1,5 +1,5 @@
mod render_ctx;
mod pico_svg;
mod render_ctx;
pub use render_ctx::PietGpuRenderContext;
@ -8,6 +8,8 @@ use rand::{Rng, RngCore};
use piet::kurbo::{BezPath, Circle, Line, Point, Vec2};
use piet::{Color, RenderContext};
use piet_gpu_types::encoder::Encode;
use piet_gpu_hal::{CmdBuf, Device, Error, ImageLayout, MemFlags};
use pico_svg::PicoSvg;
@ -28,7 +30,20 @@ const PTCL_INITIAL_ALLOC: usize = 1024;
const K2_PER_TILE_SIZE: usize = 8;
const N_CIRCLES: usize = 1;
const N_CIRCLES: usize = 0;
const N_WG: u32 = 16;
/// Reads the SVG file at `filename`, parses it with the given `scale`, and
/// renders it into `rc`.
///
/// Prints how long parsing and rendering each took.
///
/// # Panics
///
/// Panics if the file cannot be read or the SVG fails to parse.
pub fn render_svg(rc: &mut impl RenderContext, filename: &str, scale: f64) {
    let xml_str = std::fs::read_to_string(filename).unwrap();

    let parse_timer = std::time::Instant::now();
    let svg = PicoSvg::load(&xml_str, scale).unwrap();
    let parse_elapsed = parse_timer.elapsed();
    println!("parsing time: {:?}", parse_elapsed);

    let render_timer = std::time::Instant::now();
    svg.render(rc);
    let render_elapsed = render_timer.elapsed();
    println!("flattening and encoding time: {:?}", render_elapsed);
}
pub fn render_scene(rc: &mut impl RenderContext) {
let mut rng = rand::thread_rng();
@ -42,12 +57,14 @@ pub fn render_scene(rc: &mut impl RenderContext) {
let circle = Circle::new(center, radius);
rc.fill(circle, &color);
}
/*
let mut path = BezPath::new();
path.move_to((100.0, 1150.0));
path.line_to((200.0, 1200.0));
path.line_to((150.0, 1250.0));
path.close_path();
rc.fill(path, &Color::rgb8(128, 0, 128));
*/
rc.stroke(
Line::new((100.0, 100.0), (200.0, 150.0)),
&Color::WHITE,
@ -59,7 +76,7 @@ pub fn render_scene(rc: &mut impl RenderContext) {
#[allow(unused)]
fn render_cardioid(rc: &mut impl RenderContext) {
let n = 91;
let n = 601;
let dth = std::f64::consts::PI * 2.0 / (n as f64);
let center = Point::new(1024.0, 768.0);
let r = 750.0;
@ -67,7 +84,7 @@ fn render_cardioid(rc: &mut impl RenderContext) {
for i in 1..n {
let p0 = center + Vec2::from_angle(i as f64 * dth) * r;
let p1 = center + Vec2::from_angle(((i * 2) % n) as f64 * dth) * r;
rc.fill(&Circle::new(p0, 8.0), &Color::WHITE);
//rc.fill(&Circle::new(p0, 8.0), &Color::WHITE);
path.move_to(p0);
path.line_to(p1);
//rc.stroke(Line::new(p0, p1), &Color::BLACK, 2.0);
@ -96,10 +113,10 @@ fn dump_scene(buf: &[u8]) {
}
#[allow(unused)]
fn dump_k1_data(k1_buf: &[u32]) {
pub fn dump_k1_data(k1_buf: &[u32]) {
for i in 0..k1_buf.len() {
if k1_buf[i] != 0 {
println!("{:4x}: {:8x}", i, k1_buf[i]);
println!("{:4x}: {:8x}", i * 4, k1_buf[i]);
}
}
}
@ -110,27 +127,30 @@ pub struct Renderer<D: Device> {
scene_buf: D::Buffer,
scene_dev: D::Buffer,
k1_alloc_buf_host: D::Buffer,
k1_alloc_buf_dev: D::Buffer,
k2s_alloc_buf_host: D::Buffer,
k2s_alloc_buf_dev: D::Buffer,
k2f_alloc_buf_host: D::Buffer,
k2f_alloc_buf_dev: D::Buffer,
k3_alloc_buf_host: D::Buffer,
k3_alloc_buf_dev: D::Buffer,
tilegroup_buf: D::Buffer,
ptcl_buf: D::Buffer,
pub state_buf: D::Buffer,
pub anno_buf: D::Buffer,
pub bin_buf: D::Buffer,
pub ptcl_buf: D::Buffer,
el_pipeline: D::Pipeline,
el_ds: D::DescriptorSet,
bin_pipeline: D::Pipeline,
bin_ds: D::DescriptorSet,
bin_alloc_buf_host: D::Buffer,
bin_alloc_buf_dev: D::Buffer,
coarse_pipeline: D::Pipeline,
coarse_ds: D::DescriptorSet,
coarse_alloc_buf_host: D::Buffer,
coarse_alloc_buf_dev: D::Buffer,
k1_pipeline: D::Pipeline,
k1_ds: D::DescriptorSet,
k2s_pipeline: D::Pipeline,
k2s_ds: D::DescriptorSet,
k2f_pipeline: D::Pipeline,
k2f_ds: D::DescriptorSet,
k3_pipeline: D::Pipeline,
k3_ds: D::DescriptorSet,
k4_pipeline: D::Pipeline,
k4_ds: D::DescriptorSet,
n_elements: usize,
}
impl<D: Device> Renderer<D> {
@ -138,6 +158,9 @@ impl<D: Device> Renderer<D> {
let host = MemFlags::host_coherent();
let dev = MemFlags::device_local();
let n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
println!("scene: {} elements", n_elements);
let scene_buf = device
.create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)
.unwrap();
@ -146,174 +169,121 @@ impl<D: Device> Renderer<D> {
.unwrap();
device.write_buffer(&scene_buf, &scene)?;
let tilegroup_buf = device.create_buffer(4 * 1024 * 1024, dev)?;
let state_buf = device.create_buffer(1 * 1024 * 1024, dev)?;
let anno_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let bin_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let ptcl_buf = device.create_buffer(48 * 1024 * 1024, dev)?;
let segment_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let fill_seg_buf = device.create_buffer(64 * 1024 * 1024, dev)?;
let image_dev = device.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
let k1_alloc_buf_host = device.create_buffer(4, host)?;
let k1_alloc_buf_dev = device.create_buffer(4, dev)?;
let k1_alloc_start = WIDTH_IN_TILEGROUPS * HEIGHT_IN_TILEGROUPS * TILEGROUP_STRIDE;
device.write_buffer(&k1_alloc_buf_host, &[k1_alloc_start as u32])?;
let k1_code = include_bytes!("../shader/kernel1.spv");
let k1_pipeline = device
.create_simple_compute_pipeline(k1_code, 3, 0)?;
let k1_ds = device
.create_descriptor_set(
&k1_pipeline,
&[&scene_dev, &tilegroup_buf, &k1_alloc_buf_dev],
&[],
)?;
let el_code = include_bytes!("../shader/elements.spv");
let el_pipeline = device.create_simple_compute_pipeline(el_code, 3, 0)?;
let el_ds = device.create_descriptor_set(
&el_pipeline,
&[&scene_dev, &state_buf, &anno_buf],
&[],
)?;
let k2s_alloc_buf_host = device.create_buffer(4, host)?;
let k2s_alloc_buf_dev = device.create_buffer(4, dev)?;
let k2s_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device
.write_buffer(&k2s_alloc_buf_host, &[k2s_alloc_start as u32])
?;
let k2s_code = include_bytes!("../shader/kernel2s.spv");
let k2s_pipeline = device
.create_simple_compute_pipeline(k2s_code, 4, 0)
?;
let k2s_ds = device
.create_descriptor_set(
&k2s_pipeline,
&[&scene_dev, &tilegroup_buf, &segment_buf, &k2s_alloc_buf_dev],
&[],
)
?;
let bin_alloc_buf_host = device.create_buffer(12, host)?;
let bin_alloc_buf_dev = device.create_buffer(12, dev)?;
let k2f_alloc_buf_host = device.create_buffer(4, host)?;
let k2f_alloc_buf_dev = device.create_buffer(4, dev)?;
let k2f_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * K2_PER_TILE_SIZE;
device
.write_buffer(&k2f_alloc_buf_host, &[k2f_alloc_start as u32])
?;
let k2f_code = include_bytes!("../shader/kernel2f.spv");
let k2f_pipeline = device.create_simple_compute_pipeline(k2f_code, 4, 0)?;
let k2f_ds = device
.create_descriptor_set(
&k2f_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&fill_seg_buf,
&k2f_alloc_buf_dev,
],
&[],
)
?;
// TODO: constants
let bin_alloc_start = ((n_elements + 255) & !255) * 8;
device.write_buffer(
&bin_alloc_buf_host,
&[n_elements as u32, 0, bin_alloc_start as u32],
)?;
let bin_code = include_bytes!("../shader/binning.spv");
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
let bin_ds = device.create_descriptor_set(
&bin_pipeline,
&[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
&[],
)?;
let k3_alloc_buf_host = device.create_buffer(4, host)?;
let k3_alloc_buf_dev = device.create_buffer(4, dev)?;
let k3_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
device
.write_buffer(&k3_alloc_buf_host, &[k3_alloc_start as u32])
?;
let k3_code = include_bytes!("../shader/kernel3.spv");
let k3_pipeline = device.create_simple_compute_pipeline(k3_code, 6, 0)?;
let k3_ds = device
.create_descriptor_set(
&k3_pipeline,
&[
&scene_dev,
&tilegroup_buf,
&segment_buf,
&fill_seg_buf,
&ptcl_buf,
&k3_alloc_buf_dev,
],
&[],
)
?;
let coarse_alloc_buf_host = device.create_buffer(8, host)?;
let coarse_alloc_buf_dev = device.create_buffer(8, dev)?;
let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
device.write_buffer(
&coarse_alloc_buf_host,
&[n_elements as u32, coarse_alloc_start as u32],
)?;
let coarse_code = include_bytes!("../shader/coarse.spv");
let coarse_pipeline = device.create_simple_compute_pipeline(coarse_code, 4, 0)?;
let coarse_ds = device.create_descriptor_set(
&coarse_pipeline,
&[&anno_buf, &bin_buf, &coarse_alloc_buf_dev, &ptcl_buf],
&[],
)?;
let k4_code = include_bytes!("../shader/kernel4.spv");
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 3, 1)?;
let k4_ds = device
.create_descriptor_set(&k4_pipeline, &[&ptcl_buf, &segment_buf, &fill_seg_buf], &[&image_dev])
?;
let k4_pipeline = device.create_simple_compute_pipeline(k4_code, 1, 1)?;
let k4_ds = device.create_descriptor_set(&k4_pipeline, &[&ptcl_buf], &[&image_dev])?;
Ok(Renderer {
scene_buf,
scene_dev,
image_dev,
k1_alloc_buf_host,
k1_alloc_buf_dev,
k2s_alloc_buf_host,
k2s_alloc_buf_dev,
k2f_alloc_buf_host,
k2f_alloc_buf_dev,
k3_alloc_buf_host,
k3_alloc_buf_dev,
tilegroup_buf,
ptcl_buf,
k1_pipeline,
k1_ds,
k2s_pipeline,
k2s_ds,
k2f_pipeline,
k2f_ds,
k3_pipeline,
k3_ds,
el_pipeline,
el_ds,
bin_pipeline,
bin_ds,
coarse_pipeline,
coarse_ds,
k4_pipeline,
k4_ds,
state_buf,
anno_buf,
bin_buf,
ptcl_buf,
bin_alloc_buf_host,
bin_alloc_buf_dev,
coarse_alloc_buf_host,
coarse_alloc_buf_dev,
n_elements,
})
}
pub unsafe fn record(&self, cmd_buf: &mut impl CmdBuf<D>, query_pool: &D::QueryPool) {
cmd_buf.copy_buffer(&self.scene_buf, &self.scene_dev);
// Note: we could use one alloc buf and reuse it. But we'll stick with
// multiple ones for clarity.
cmd_buf.copy_buffer(&self.k1_alloc_buf_host, &self.k1_alloc_buf_dev);
cmd_buf.copy_buffer(&self.k2s_alloc_buf_host, &self.k2s_alloc_buf_dev);
cmd_buf.copy_buffer(&self.k2f_alloc_buf_host, &self.k2f_alloc_buf_dev);
cmd_buf.copy_buffer(&self.k3_alloc_buf_host, &self.k3_alloc_buf_dev);
// Note: these clears aren't necessary, and are here to make inspection
// of the buffers cleaner. Can likely be removed.
cmd_buf.clear_buffer(&self.tilegroup_buf);
cmd_buf.clear_buffer(&self.ptcl_buf);
cmd_buf.copy_buffer(&self.bin_alloc_buf_host, &self.bin_alloc_buf_dev);
cmd_buf.copy_buffer(&self.coarse_alloc_buf_host, &self.coarse_alloc_buf_dev);
cmd_buf.clear_buffer(&self.state_buf);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::Undefined, ImageLayout::General);
cmd_buf.image_barrier(
&self.image_dev,
ImageLayout::Undefined,
ImageLayout::General,
);
cmd_buf.reset_query_pool(&query_pool);
cmd_buf.write_timestamp(&query_pool, 0);
cmd_buf.dispatch(
&self.k1_pipeline,
&self.k1_ds,
((WIDTH / 512) as u32, (HEIGHT / 512) as u32, 1),
&self.el_pipeline,
&self.el_ds,
(((self.n_elements + 127) / 128) as u32, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 1);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k2s_pipeline,
&self.k2s_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 1),
&self.bin_pipeline,
&self.bin_ds,
(((self.n_elements + 255) / 256) as u32, 1, 1),
);
cmd_buf.write_timestamp(&query_pool, 2);
// Note: this barrier is not necessary (k2f does not depend on
// k2s output), but I'm keeping it here to increase transparency
// of performance.
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k2f_pipeline,
&self.k2f_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 2),
&self.coarse_pipeline,
&self.coarse_ds,
(WIDTH as u32 / 256, HEIGHT as u32 / 256, 1),
);
cmd_buf.write_timestamp(&query_pool, 3);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k3_pipeline,
&self.k3_ds,
((WIDTH / 512) as u32, (HEIGHT / 16) as u32, 3),
);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&self.k4_pipeline,
&self.k4_ds,
((WIDTH / TILE_W) as u32, (HEIGHT / TILE_H) as u32, 1),
);
cmd_buf.write_timestamp(&query_pool, 5);
cmd_buf.write_timestamp(&query_pool, 4);
cmd_buf.memory_barrier();
cmd_buf.image_barrier(&self.image_dev, ImageLayout::General, ImageLayout::BlitSrc);
}

View file

@ -2,7 +2,7 @@
use std::str::FromStr;
use roxmltree::Document;
use roxmltree::{Document, Node};
use piet::kurbo::{Affine, BezPath};
@ -28,27 +28,19 @@ pub struct FillItem {
path: BezPath,
}
/// State carried through the recursive walk of an SVG document tree.
struct Parser<'a> {
/// Scale applied to path geometry and stroke widths; a negative value
/// selects a different transform in `rec_parse`.
scale: f64,
/// Destination list that fill and stroke items are appended to.
items: &'a mut Vec<Item>,
}
impl PicoSvg {
pub fn load(xml_string: &str, scale: f64) -> Result<PicoSvg, Box<dyn std::error::Error>> {
let doc = Document::parse(xml_string)?;
let root = doc.root_element();
let g = root.first_element_child().ok_or("no root element")?;
let mut items = Vec::new();
for el in g.children() {
if el.is_element() {
let d = el.attribute("d").ok_or("missing 'd' attribute")?;
let bp = BezPath::from_svg(d)?;
let path = Affine::scale(scale) * bp;
if let Some(fill_color) = el.attribute("fill") {
let color = parse_color(fill_color);
items.push(Item::Fill(FillItem { color, path: path.clone() }));
}
if let Some(stroke_color) = el.attribute("stroke") {
let width = f64::from_str(el.attribute("stroke-width").ok_or("missing width")?)?;
let color = parse_color(stroke_color);
items.push(Item::Stroke(StrokeItem { width, color, path }));
}
}
let mut parser = Parser::new(&mut items, scale);
for node in root.children() {
parser.rec_parse(node)?;
}
Ok(PicoSvg { items })
}
@ -58,6 +50,7 @@ impl PicoSvg {
match item {
Item::Fill(fill_item) => {
rc.fill(&fill_item.path, &fill_item.color);
//rc.stroke(&fill_item.path, &fill_item.color, 1.0);
}
Item::Stroke(stroke_item) => {
rc.stroke(&stroke_item.path, &stroke_item.color, stroke_item.width);
@ -67,6 +60,59 @@ impl PicoSvg {
}
}
impl<'a> Parser<'a> {
fn new(items: &'a mut Vec<Item>, scale: f64) -> Parser<'a> {
Parser { scale, items }
}
fn rec_parse(&mut self, node: Node) -> Result<(), Box<dyn std::error::Error>> {
let transform = if self.scale >= 0.0 {
Affine::scale(self.scale)
} else {
Affine::new([-self.scale, 0.0, 0.0, self.scale, 0.0, 1536.0])
};
if node.is_element() {
match node.tag_name().name() {
"g" => {
for child in node.children() {
self.rec_parse(child)?;
}
}
"path" => {
let d = node.attribute("d").ok_or("missing 'd' attribute")?;
let bp = BezPath::from_svg(d)?;
let path = transform * bp;
// TODO: default fill color is black, but this is overridden in tiger to this logic.
if let Some(fill_color) = node.attribute("fill") {
if fill_color != "none" {
let color = parse_color(fill_color);
let color = modify_opacity(color, "fill-opacity", node);
self.items.push(Item::Fill(FillItem {
color,
path: path.clone(),
}));
}
}
if let Some(stroke_color) = node.attribute("stroke") {
if stroke_color != "none" {
let width = self.scale.abs()
* f64::from_str(
node.attribute("stroke-width").ok_or("missing width")?,
)?;
let color = parse_color(stroke_color);
let color = modify_opacity(color, "stroke-opacity", node);
self.items
.push(Item::Stroke(StrokeItem { width, color, path }));
}
}
}
_ => (),
}
}
Ok(())
}
}
fn parse_color(color: &str) -> Color {
if color.as_bytes()[0] == b'#' {
let mut hex = u32::from_str_radix(&color[1..], 16).unwrap();
@ -74,7 +120,27 @@ fn parse_color(color: &str) -> Color {
hex = (hex >> 8) * 0x110000 + ((hex >> 4) & 0xf) * 0x1100 + (hex & 0xf) * 0x11;
}
Color::from_rgba32_u32((hex << 8) + 0xff)
} else if color.starts_with("rgb(") {
let mut iter = color[4..color.len() - 1].split(',');
let r = u8::from_str(iter.next().unwrap()).unwrap();
let g = u8::from_str(iter.next().unwrap()).unwrap();
let b = u8::from_str(iter.next().unwrap()).unwrap();
Color::rgb8(r, g, b)
} else {
Color::from_rgba32_u32(0xff00ff80)
}
}
/// Returns `color` with its alpha replaced by the opacity attribute named
/// `attr_name` on `node`, or `color` unchanged when the attribute is absent.
///
/// The attribute may be a plain number or a percentage ending in `%`. A value
/// that fails to parse is treated as fully opaque.
fn modify_opacity(color: Color, attr_name: &str, node: Node) -> Color {
    let opacity = match node.attribute(attr_name) {
        Some(text) => text,
        None => return color,
    };
    let alpha = match opacity.strip_suffix('%') {
        // Percentages are converted to the 0..1 range.
        Some(percent) => percent.parse().unwrap_or(100.0) * 0.01,
        None => opacity.parse().unwrap_or(1.0),
    };
    color.with_alpha(alpha)
}

View file

@ -2,7 +2,11 @@ use std::borrow::Cow;
use piet_gpu_types::encoder::{Encode, Encoder, Ref};
use piet_gpu_types::scene;
use piet_gpu_types::scene::{Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup};
use piet_gpu_types::scene::{
Bbox, PietCircle, PietFill, PietItem, PietStrokePolyLine, SimpleGroup,
};
use piet_gpu_types::scene::{CubicSeg, Element, Fill, LineSeg, QuadSeg, SetLineWidth, Stroke};
use piet::kurbo::{Affine, PathEl, Point, Rect, Shape};
@ -27,10 +31,10 @@ pub struct PietGpuText;
pub struct PietGpuRenderContext {
encoder: Encoder,
bboxes: Vec<Bbox>,
items: Vec<PietItem>,
elements: Vec<Element>,
// Will probably need direct access to hal Device to create images etc.
inner_text: PietGpuText,
stroke_width: f32,
}
#[derive(Clone)]
@ -43,47 +47,22 @@ const TOLERANCE: f64 = 0.25;
impl PietGpuRenderContext {
pub fn new() -> PietGpuRenderContext {
let mut encoder = Encoder::new();
let _reserve_root = encoder.alloc_chunk(PietItem::fixed_size() as u32);
let bboxes = Vec::new();
let items = Vec::new();
let encoder = Encoder::new();
let elements = Vec::new();
let inner_text = PietGpuText;
let stroke_width = 0.0;
PietGpuRenderContext {
encoder,
bboxes,
items,
elements,
inner_text,
stroke_width,
}
}
pub fn get_scene_buf(&mut self) -> &[u8] {
let n_items = self.bboxes.len() as u32;
let bboxes = self.bboxes.encode(&mut self.encoder).transmute();
let items = self.items.encode(&mut self.encoder).transmute();
let offset = scene::Point { xy: [0.0, 0.0] };
let simple_group = SimpleGroup {
n_items,
bboxes,
items,
offset,
};
let root_item = PietItem::Group(simple_group);
root_item.encode_to(&mut self.encoder.buf_mut()[0..PietItem::fixed_size()]);
self.elements.encode(&mut self.encoder);
self.encoder.buf()
}
fn push_item(&mut self, item: PietItem, bbox: Rect) {
let scene_bbox = Bbox {
bbox: [
bbox.x0.floor() as i16,
bbox.y0.floor() as i16,
bbox.x1.ceil() as i16,
bbox.y1.ceil() as i16,
],
};
self.items.push(item);
self.bboxes.push(scene_bbox);
}
}
impl RenderContext for PietGpuRenderContext {
@ -107,20 +86,19 @@ impl RenderContext for PietGpuRenderContext {
fn clear(&mut self, _color: Color) {}
fn stroke(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>, width: f64) {
let bbox = shape.bounding_box();
let brush = brush.make_brush(self, || bbox).into_owned();
let width = width as f32;
if self.stroke_width != width {
self.elements
.push(Element::SetLineWidth(SetLineWidth { width }));
self.stroke_width = width;
}
let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
let path = shape.to_bez_path(TOLERANCE);
let (n_points, points) = flatten_shape(&mut self.encoder, path);
self.encode_path(path, false);
match brush {
PietGpuBrush::Solid(rgba_color) => {
let poly_line = PietStrokePolyLine {
rgba_color,
width: width as f32,
n_points,
points,
};
let bbox = bbox.inset(-0.5 * width);
self.push_item(PietItem::Poly(poly_line), bbox);
let stroke = Stroke { rgba_color };
self.elements.push(Element::Stroke(stroke));
}
_ => (),
}
@ -136,35 +114,13 @@ impl RenderContext for PietGpuRenderContext {
}
fn fill(&mut self, shape: impl Shape, brush: &impl IntoBrush<Self>) {
let bbox = shape.bounding_box();
let brush = brush.make_brush(self, || shape.bounding_box()).into_owned();
if let Some(circle) = shape.as_circle() {
match brush {
PietGpuBrush::Solid(rgba_color) => {
let piet_circle = PietCircle {
rgba_color,
center: to_scene_point(circle.center),
radius: circle.radius as f32,
};
let bbox = circle.bounding_box();
self.push_item(PietItem::Circle(piet_circle), bbox);
}
_ => {}
}
return;
}
let path = shape.to_bez_path(TOLERANCE);
let (n_points, points) = flatten_shape(&mut self.encoder, path);
self.encode_path(path, true);
match brush {
PietGpuBrush::Solid(rgba_color) => {
let fill = PietFill {
flags: 0,
rgba_color,
n_points,
points,
};
self.push_item(PietItem::Fill(fill), bbox);
let fill = Fill { rgba_color };
self.elements.push(Element::Fill(fill));
}
_ => (),
}
@ -241,45 +197,110 @@ impl RenderContext for PietGpuRenderContext {
}
}
fn flatten_shape(
encoder: &mut Encoder,
path: impl Iterator<Item = PathEl>,
) -> (u32, Ref<scene::Point>) {
let mut points = Vec::new();
let mut start_pt = None;
let mut last_pt = None;
piet::kurbo::flatten(path, TOLERANCE, |el| {
match el {
PathEl::MoveTo(p) => {
let scene_pt = to_scene_point(p);
start_pt = Some(clone_scene_pt(&scene_pt));
if !points.is_empty() {
points.push(scene::Point {
xy: [std::f32::NAN, std::f32::NAN],
});
impl PietGpuRenderContext {
/// Appends `seg` to the element list as a fill line when `is_fill` is true,
/// otherwise as a stroke line.
fn encode_line_seg(&mut self, seg: LineSeg, is_fill: bool) {
    let element = if is_fill {
        Element::FillLine(seg)
    } else {
        Element::StrokeLine(seg)
    };
    self.elements.push(element);
}
fn encode_path(&mut self, path: impl Iterator<Item = PathEl>, is_fill: bool) {
let flatten = true;
if flatten {
let mut start_pt = None;
let mut last_pt = None;
piet::kurbo::flatten(path, TOLERANCE, |el| {
match el {
PathEl::MoveTo(p) => {
let scene_pt = to_f32_2(p);
start_pt = Some(scene_pt);
last_pt = Some(scene_pt);
}
PathEl::LineTo(p) => {
let scene_pt = to_f32_2(p);
let seg = LineSeg {
p0: last_pt.unwrap(),
p1: scene_pt,
};
self.encode_line_seg(seg, is_fill);
last_pt = Some(scene_pt);
}
PathEl::ClosePath => {
if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
if last != start {
let seg = LineSeg {
p0: last,
p1: start,
};
self.encode_line_seg(seg, is_fill);
}
}
}
_ => (),
}
last_pt = Some(clone_scene_pt(&scene_pt));
points.push(scene_pt);
}
PathEl::LineTo(p) => {
let scene_pt = to_scene_point(p);
last_pt = Some(clone_scene_pt(&scene_pt));
points.push(scene_pt);
}
PathEl::ClosePath => {
if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
if start.xy != last.xy {
points.push(start);
//println!("{:?}", el);
});
} else {
let mut start_pt = None;
let mut last_pt = None;
for el in path {
match el {
PathEl::MoveTo(p) => {
let scene_pt = to_f32_2(p);
start_pt = Some(scene_pt);
last_pt = Some(scene_pt);
}
PathEl::LineTo(p) => {
let scene_pt = to_f32_2(p);
let seg = LineSeg {
p0: last_pt.unwrap(),
p1: scene_pt,
};
self.encode_line_seg(seg, is_fill);
last_pt = Some(scene_pt);
}
PathEl::QuadTo(p1, p2) => {
let scene_p1 = to_f32_2(p1);
let scene_p2 = to_f32_2(p2);
let seg = QuadSeg {
p0: last_pt.unwrap(),
p1: scene_p1,
p2: scene_p2,
};
self.elements.push(Element::Quad(seg));
last_pt = Some(scene_p2);
}
PathEl::CurveTo(p1, p2, p3) => {
let scene_p1 = to_f32_2(p1);
let scene_p2 = to_f32_2(p2);
let scene_p3 = to_f32_2(p3);
let seg = CubicSeg {
p0: last_pt.unwrap(),
p1: scene_p1,
p2: scene_p2,
p3: scene_p3,
};
self.elements.push(Element::Cubic(seg));
last_pt = Some(scene_p3);
}
PathEl::ClosePath => {
if let (Some(start), Some(last)) = (start_pt.take(), last_pt.take()) {
if last != start {
let seg = LineSeg {
p0: last,
p1: start,
};
self.encode_line_seg(seg, is_fill);
}
}
}
}
//println!("{:?}", el);
}
_ => (),
}
//println!("{:?}", el);
});
let n_points = points.len() as u32;
let points_ref = points.encode(encoder).transmute();
(n_points, points_ref)
}
}
impl Text for PietGpuText {
@ -360,13 +381,6 @@ impl IntoBrush<PietGpuRenderContext> for PietGpuBrush {
}
}
fn to_scene_point(point: Point) -> scene::Point {
scene::Point {
xy: [point.x as f32, point.y as f32],
}
}
// TODO: allow #[derive(Clone)] in piet-gpu-derive.
fn clone_scene_pt(p: &scene::Point) -> scene::Point {
scene::Point { xy: p.xy }
fn to_f32_2(point: Point) -> [f32; 2] {
[point.x as f32, point.y as f32]
}