mirror of
https://github.com/italicsjenga/vello.git
synced 2025-01-10 12:41:30 +11:00
Start implementing fills
This should get the "right_edge" value for each segment plumbed through to the binning phase. It also needs to be plumbed to coarse raster and wired up there. Consider this a WIP, because none of this logic has been tested yet.
This commit is contained in:
parent
0ed759814b
commit
03da52cff8
|
@ -8,6 +8,7 @@ piet_gpu! {
|
||||||
translate: [f32; 2],
|
translate: [f32; 2],
|
||||||
bbox: [f32; 4],
|
bbox: [f32; 4],
|
||||||
linewidth: f32,
|
linewidth: f32,
|
||||||
|
right_edge: f32,
|
||||||
flags: u32,
|
flags: u32,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,24 +11,33 @@ layout(set = 0, binding = 0) buffer AnnotatedBuf {
|
||||||
uint[] annotated;
|
uint[] annotated;
|
||||||
};
|
};
|
||||||
|
|
||||||
layout(set = 0, binding = 1) buffer AllocBuf {
|
// This is for scanning forward for right_edge data.
|
||||||
|
layout(set = 0, binding = 1) buffer StateBuf {
|
||||||
|
uint[] state;
|
||||||
|
};
|
||||||
|
|
||||||
|
layout(set = 0, binding = 2) buffer AllocBuf {
|
||||||
uint n_elements;
|
uint n_elements;
|
||||||
// Will be incremented atomically to claim tiles
|
// Will be incremented atomically to claim tiles
|
||||||
uint tile_ix;
|
uint tile_ix;
|
||||||
uint alloc;
|
uint alloc;
|
||||||
};
|
};
|
||||||
|
|
||||||
layout(set = 0, binding = 2) buffer BinsBuf {
|
layout(set = 0, binding = 3) buffer BinsBuf {
|
||||||
uint[] bins;
|
uint[] bins;
|
||||||
};
|
};
|
||||||
|
|
||||||
#include "annotated.h"
|
#include "annotated.h"
|
||||||
|
#include "state.h"
|
||||||
#include "bins.h"
|
#include "bins.h"
|
||||||
|
|
||||||
// scale factors useful for converting coordinates to bins
|
// scale factors useful for converting coordinates to bins
|
||||||
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
|
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
|
||||||
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
|
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
|
||||||
|
|
||||||
|
// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
|
||||||
|
#define INFINITY (1.0 / 0.0)
|
||||||
|
|
||||||
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
|
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
|
||||||
shared uint bitmaps[N_SLICE][N_TILE];
|
shared uint bitmaps[N_SLICE][N_TILE];
|
||||||
shared uint count[N_SLICE][N_TILE];
|
shared uint count[N_SLICE][N_TILE];
|
||||||
|
@ -37,6 +46,14 @@ shared uint sh_chunk_start[N_TILE];
|
||||||
shared uint sh_chunk_end[N_TILE];
|
shared uint sh_chunk_end[N_TILE];
|
||||||
shared uint sh_chunk_jump[N_TILE];
|
shared uint sh_chunk_jump[N_TILE];
|
||||||
|
|
||||||
|
shared float sh_right_edge[N_TILE];
|
||||||
|
|
||||||
|
#define StateBuf_stride (4 + 2 * State_size)
|
||||||
|
|
||||||
|
// Builds the StateRef for the aggregate of partition `partition_ix`: a fixed
// offset of 8 plus `partition_ix` strides of StateBuf_stride (4 + 2 * State_size).
// NOTE(review): the constant 8 and the stride presumably mirror the state buffer
// layout written by the element processing pass -- confirm against that shader.
StateRef state_aggregate_ref(uint partition_ix) {
|
||||||
|
return StateRef(8 + partition_ix * StateBuf_stride);
|
||||||
|
}
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
|
BinChunkRef chunk_ref = BinChunkRef((gl_LocalInvocationID.x * N_WG + gl_WorkGroupID.x) * BIN_INITIAL_ALLOC);
|
||||||
uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC;
|
uint wr_limit = chunk_ref.offset + BIN_INITIAL_ALLOC;
|
||||||
|
@ -65,6 +82,7 @@ void main() {
|
||||||
tag = Annotated_tag(ref);
|
tag = Annotated_tag(ref);
|
||||||
}
|
}
|
||||||
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
|
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
|
||||||
|
float my_right_edge = INFINITY;
|
||||||
switch (tag) {
|
switch (tag) {
|
||||||
case Annotated_Line:
|
case Annotated_Line:
|
||||||
AnnoLineSeg line = Annotated_Line_read(ref);
|
AnnoLineSeg line = Annotated_Line_read(ref);
|
||||||
|
@ -82,8 +100,37 @@ void main() {
|
||||||
y0 = int(floor(fill.bbox.y * SY));
|
y0 = int(floor(fill.bbox.y * SY));
|
||||||
x1 = int(ceil(fill.bbox.z * SX));
|
x1 = int(ceil(fill.bbox.z * SX));
|
||||||
y1 = int(ceil(fill.bbox.w * SY));
|
y1 = int(ceil(fill.bbox.w * SY));
|
||||||
|
my_right_edge = x1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the last element in this partition is a fill edge, then we need to do a
|
||||||
|
// look-forward to find the right edge of its corresponding fill. That data is
|
||||||
|
// recorded in aggregates computed in the element processing pass.
|
||||||
|
if (gl_LocalInvocationID.x == N_TILE - 1 && tag == Annotated_Line) {
|
||||||
|
uint aggregate_ix = (my_tile + 1) * ELEMENT_BINNING_RATIO;
|
||||||
|
// This is sequential but the expectation is that the amount of
|
||||||
|
// look-forward is small (performance may degrade in the case
|
||||||
|
// of massively complex paths).
|
||||||
|
do {
|
||||||
|
StateRef agg_ref = state_aggregate_ref(aggregate_ix);
|
||||||
|
my_right_edge = State_read(agg_ref).right_edge;
|
||||||
|
aggregate_ix++;
|
||||||
|
} while (isinf(my_right_edge));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now propagate right_edge backward, from fill to segment.
|
||||||
|
for (uint i = 0; i < LG_N_TILE; i++) {
|
||||||
|
// Note: we could try to cut down on write bandwidth here if the value hasn't
|
||||||
|
// changed, but not sure it's worth the complexity to track.
|
||||||
|
sh_right_edge[gl_LocalInvocationID.x] = my_right_edge;
|
||||||
|
barrier();
|
||||||
|
if (gl_LocalInvocationID.x + (1 << i) < N_TILE && isinf(my_right_edge)) {
|
||||||
|
my_right_edge = sh_right_edge[gl_LocalInvocationID.x + (1 << i)];
|
||||||
|
}
|
||||||
|
barrier();
|
||||||
|
}
|
||||||
|
|
||||||
// At this point, we run an iterator over the coverage area,
|
// At this point, we run an iterator over the coverage area,
|
||||||
// trying to keep divergence low.
|
// trying to keep divergence low.
|
||||||
// Right now, it's just a bbox, but we'll get finer with
|
// Right now, it's just a bbox, but we'll get finer with
|
||||||
|
@ -141,6 +188,7 @@ void main() {
|
||||||
chunk_n = element_count - chunk_n;
|
chunk_n = element_count - chunk_n;
|
||||||
} else {
|
} else {
|
||||||
chunk_end = ~0;
|
chunk_end = ~0;
|
||||||
|
chunk_new_start = ~0;
|
||||||
chunk_n = element_count;
|
chunk_n = element_count;
|
||||||
}
|
}
|
||||||
sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset;
|
sh_chunk_start[gl_LocalInvocationID.x] = instance_ref.offset;
|
||||||
|
|
Binary file not shown.
|
@ -10,7 +10,7 @@
|
||||||
#define N_ROWS 4
|
#define N_ROWS 4
|
||||||
#define WG_SIZE 32
|
#define WG_SIZE 32
|
||||||
#define LG_WG_SIZE 5
|
#define LG_WG_SIZE 5
|
||||||
#define TILE_SIZE (WG_SIZE * N_ROWS)
|
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
|
||||||
|
|
||||||
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
|
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
|
||||||
|
|
||||||
|
@ -81,6 +81,7 @@ State combine_state(State a, State b) {
|
||||||
c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
|
c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
|
||||||
c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
|
c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
|
||||||
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
|
c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
|
||||||
|
c.right_edge = (a.flags & FLAG_SET_BBOX) != 0 ? a.right_edge : (a.flags & FLAG_RESET_BBOX) != 0 ? a.bbox.z : c.right_edge;
|
||||||
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
|
c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX)) | b.flags;
|
||||||
c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
|
c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
|
||||||
return c;
|
return c;
|
||||||
|
@ -143,6 +144,7 @@ shared vec4 sh_mat[WG_SIZE];
|
||||||
shared vec2 sh_translate[WG_SIZE];
|
shared vec2 sh_translate[WG_SIZE];
|
||||||
shared vec4 sh_bbox[WG_SIZE];
|
shared vec4 sh_bbox[WG_SIZE];
|
||||||
shared float sh_width[WG_SIZE];
|
shared float sh_width[WG_SIZE];
|
||||||
|
shared float sh_right_edge[WG_SIZE];
|
||||||
shared uint sh_flags[WG_SIZE];
|
shared uint sh_flags[WG_SIZE];
|
||||||
|
|
||||||
shared uint sh_tile_ix;
|
shared uint sh_tile_ix;
|
||||||
|
@ -158,7 +160,7 @@ void main() {
|
||||||
barrier();
|
barrier();
|
||||||
uint tile_ix = sh_tile_ix;
|
uint tile_ix = sh_tile_ix;
|
||||||
|
|
||||||
uint ix = tile_ix * TILE_SIZE + gl_LocalInvocationID.x * N_ROWS;
|
uint ix = tile_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
|
||||||
ElementRef ref = ElementRef(ix * Element_size);
|
ElementRef ref = ElementRef(ix * Element_size);
|
||||||
|
|
||||||
th_state[0] = map_element(ref);
|
th_state[0] = map_element(ref);
|
||||||
|
@ -172,6 +174,7 @@ void main() {
|
||||||
sh_translate[gl_LocalInvocationID.x] = agg.translate;
|
sh_translate[gl_LocalInvocationID.x] = agg.translate;
|
||||||
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
|
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
|
||||||
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
|
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
|
||||||
|
sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
|
||||||
sh_flags[gl_LocalInvocationID.x] = agg.flags;
|
sh_flags[gl_LocalInvocationID.x] = agg.flags;
|
||||||
for (uint i = 0; i < LG_WG_SIZE; i++) {
|
for (uint i = 0; i < LG_WG_SIZE; i++) {
|
||||||
barrier();
|
barrier();
|
||||||
|
@ -190,6 +193,7 @@ void main() {
|
||||||
sh_translate[gl_LocalInvocationID.x] = agg.translate;
|
sh_translate[gl_LocalInvocationID.x] = agg.translate;
|
||||||
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
|
sh_bbox[gl_LocalInvocationID.x] = agg.bbox;
|
||||||
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
|
sh_width[gl_LocalInvocationID.x] = agg.linewidth;
|
||||||
|
sh_right_edge[gl_LocalInvocationID.x] = agg.right_edge;
|
||||||
sh_flags[gl_LocalInvocationID.x] = agg.flags;
|
sh_flags[gl_LocalInvocationID.x] = agg.flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -198,6 +202,7 @@ void main() {
|
||||||
exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
|
exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
|
||||||
exclusive.translate = vec2(0.0, 0.0);
|
exclusive.translate = vec2(0.0, 0.0);
|
||||||
exclusive.linewidth = 1.0; //TODO should be 0.0
|
exclusive.linewidth = 1.0; //TODO should be 0.0
|
||||||
|
exclusive.right_edge = 0.0;
|
||||||
exclusive.flags = 0;
|
exclusive.flags = 0;
|
||||||
|
|
||||||
// Publish aggregate for this partition
|
// Publish aggregate for this partition
|
||||||
|
@ -250,6 +255,7 @@ void main() {
|
||||||
other.translate = sh_translate[ix];
|
other.translate = sh_translate[ix];
|
||||||
other.bbox = sh_bbox[ix];
|
other.bbox = sh_bbox[ix];
|
||||||
other.linewidth = sh_width[ix];
|
other.linewidth = sh_width[ix];
|
||||||
|
other.right_edge = sh_right_edge[ix];
|
||||||
other.flags = sh_flags[ix];
|
other.flags = sh_flags[ix];
|
||||||
row = combine_state(row, other);
|
row = combine_state(row, other);
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
|
@ -51,9 +51,14 @@
|
||||||
#define N_TILE_X 16
|
#define N_TILE_X 16
|
||||||
#define N_TILE_Y 16
|
#define N_TILE_Y 16
|
||||||
#define N_TILE (N_TILE_X * N_TILE_Y)
|
#define N_TILE (N_TILE_X * N_TILE_Y)
|
||||||
|
#define LG_N_TILE 8
|
||||||
#define N_SLICE (N_TILE / 32)
|
#define N_SLICE (N_TILE / 32)
|
||||||
// Number of workgroups for binning kernel
|
// Number of workgroups for binning kernel
|
||||||
#define N_WG 16
|
#define N_WG 16
|
||||||
|
|
||||||
|
// This is the ratio of the number of elements in a binning workgroup
|
||||||
|
// over the number of elements in a partition workgroup.
|
||||||
|
#define ELEMENT_BINNING_RATIO 4
|
||||||
|
|
||||||
#define BIN_INITIAL_ALLOC 64
|
#define BIN_INITIAL_ALLOC 64
|
||||||
#define BIN_ALLOC 256
|
#define BIN_ALLOC 256
|
||||||
|
|
|
@ -9,10 +9,11 @@ struct State {
|
||||||
vec2 translate;
|
vec2 translate;
|
||||||
vec4 bbox;
|
vec4 bbox;
|
||||||
float linewidth;
|
float linewidth;
|
||||||
|
float right_edge;
|
||||||
uint flags;
|
uint flags;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define State_size 48
|
#define State_size 52
|
||||||
|
|
||||||
StateRef State_index(StateRef ref, uint index) {
|
StateRef State_index(StateRef ref, uint index) {
|
||||||
return StateRef(ref.offset + index * State_size);
|
return StateRef(ref.offset + index * State_size);
|
||||||
|
@ -32,12 +33,14 @@ State State_read(StateRef ref) {
|
||||||
uint raw9 = state[ix + 9];
|
uint raw9 = state[ix + 9];
|
||||||
uint raw10 = state[ix + 10];
|
uint raw10 = state[ix + 10];
|
||||||
uint raw11 = state[ix + 11];
|
uint raw11 = state[ix + 11];
|
||||||
|
uint raw12 = state[ix + 12];
|
||||||
State s;
|
State s;
|
||||||
s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
|
s.mat = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
|
||||||
s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
|
s.translate = vec2(uintBitsToFloat(raw4), uintBitsToFloat(raw5));
|
||||||
s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
|
s.bbox = vec4(uintBitsToFloat(raw6), uintBitsToFloat(raw7), uintBitsToFloat(raw8), uintBitsToFloat(raw9));
|
||||||
s.linewidth = uintBitsToFloat(raw10);
|
s.linewidth = uintBitsToFloat(raw10);
|
||||||
s.flags = raw11;
|
s.right_edge = uintBitsToFloat(raw11);
|
||||||
|
s.flags = raw12;
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -54,6 +57,7 @@ void State_write(StateRef ref, State s) {
|
||||||
state[ix + 8] = floatBitsToUint(s.bbox.z);
|
state[ix + 8] = floatBitsToUint(s.bbox.z);
|
||||||
state[ix + 9] = floatBitsToUint(s.bbox.w);
|
state[ix + 9] = floatBitsToUint(s.bbox.w);
|
||||||
state[ix + 10] = floatBitsToUint(s.linewidth);
|
state[ix + 10] = floatBitsToUint(s.linewidth);
|
||||||
state[ix + 11] = s.flags;
|
state[ix + 11] = floatBitsToUint(s.right_edge);
|
||||||
|
state[ix + 12] = s.flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -185,10 +185,10 @@ impl<D: Device> Renderer<D> {
|
||||||
])
|
])
|
||||||
?;
|
?;
|
||||||
let bin_code = include_bytes!("../shader/binning.spv");
|
let bin_code = include_bytes!("../shader/binning.spv");
|
||||||
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 3, 0)?;
|
let bin_pipeline = device.create_simple_compute_pipeline(bin_code, 4, 0)?;
|
||||||
let bin_ds = device.create_descriptor_set(
|
let bin_ds = device.create_descriptor_set(
|
||||||
&bin_pipeline,
|
&bin_pipeline,
|
||||||
&[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
|
&[&anno_buf, &state_buf, &bin_alloc_buf_dev, &bin_buf],
|
||||||
&[],
|
&[],
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue