vello/piet-gpu/shader/kernel2s.comp
Raph Levien cb06b1bc3d Implement stroked polylines
This version seems to work but the allocation of segments has low
utilization. Probably best to allocate in chunks rather than try to
make them contiguous.
2020-04-28 18:45:59 -07:00

128 lines
5.4 KiB
Plaintext

// This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke
// (polyline) items in the scene and generates a list of segments for each, for
// each tile.
#version 450
#extension GL_GOOGLE_include_directive : enable
layout(local_size_x = 32) in;
layout(set = 0, binding = 0) readonly buffer SceneBuf {
uint[] scene;
};
layout(set = 0, binding = 1) buffer TilegroupBuf {
uint[] tilegroup;
};
layout(set = 0, binding = 2) buffer SegmentBuf {
uint[] segment;
};
layout(set = 0, binding = 3) buffer AllocBuf {
uint alloc;
};
#include "scene.h"
#include "tilegroup.h"
#include "segment.h"
#include "setup.h"
void main() {
uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
+ (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START);
uint stroke_n = tilegroup[stroke_start.offset >> 2];
TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size);
if (stroke_n > 0) {
ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4);
Chunk chunk = Chunk_read(chunk_ref);
InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size));
TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header));
SegmentRef seg_ref = SegmentRef(0);
uint seg_limit = 0;
// Iterate through items; stroke_n holds count remaining.
while (true) {
if (chunk.chunk_n == 0) {
chunk_ref = chunk.next;
chunk = Chunk_read(chunk_ref);
stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
}
Instance ins = Instance_read(stroke_ref);
PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref));
// Process the stroke polyline item.
uint max_n_segs = poly.n_points - 1;
uint reserve = max_n_segs * Segment_size;
if (seg_ref.offset + reserve > seg_limit) {
// This is a heuristic to balance atomic bandwidth and utilization.
// The output always gets a contiguous allocation. We might use
// all, some, or none of the capacity.
uint capacity_bytes = stroke_n > 1 ? reserve * 2 + 128 : reserve;
seg_ref.offset = atomicAdd(alloc, capacity_bytes);
seg_limit = seg_ref.offset + capacity_bytes;
}
uint n_segs = 0;
vec2 start = Point_read(poly.points).xy;
for (uint j = 0; j < max_n_segs; j++) {
poly.points.offset += Point_size;
vec2 end = Point_read(poly.points).xy;
// Process one segment.
// This logic just tests for collision. What we probably want to do
// is a clipping algorithm like Liang-Barsky, and then store coords
// relative to the tile in f16. See also:
// https://tavianator.com/fast-branchless-raybounding-box-intersections/
// Also note that when we go to the fancy version, we want to compute
// the (horizontal projection of) the bounding box of the intersection
// once per tilegroup, so we can assign work to individual tiles.
float a = end.y - start.y;
float b = start.x - end.x;
float c = -(a * start.x + b * start.y);
float half_width = 0.5 * poly.width;
// Tile boundaries padded by half-width.
float xmin = xy0.x - half_width;
float ymin = xy0.y - half_width;
float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width;
float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width;
float s00 = sign(b * ymin + a * xmin + c);
float s01 = sign(b * ymin + a * xmax + c);
float s10 = sign(b * ymax + a * xmin + c);
float s11 = sign(b * ymax + a * xmax + c);
// If bounding boxes intersect and not all four corners are on the same side, hit.
// Also note: this is designed to be false on NAN input.
if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax)
&& max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax)
&& s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
{
Segment seg = Segment(start, end);
Segment_write(Segment_index(seg_ref, n_segs), seg);
n_segs++;
}
start = end;
}
ItemHeader_write(item_header, ItemHeader(n_segs, seg_ref));
if (--stroke_n == 0) {
break;
}
seg_ref.offset += n_segs * Segment_size;
stroke_ref.offset += Instance_size;
chunk.chunk_n--;
item_header.offset += ItemHeader_size;
}
} else {
// As an optimization, we could just write 0 for the size.
TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0)));
}
}