// vello/piet-gpu/shader/kernel2s.comp

// This is "kernel 2" (strokes) in a 4-kernel pipeline. It processes the stroke
// (polyline) items in the scene and, for each tile, generates a list of
// segments for each item.

#version 450
#extension GL_GOOGLE_include_directive : enable

// Workgroup size: 32 invocations along x. main() derives one tile index from
// each invocation's global (x, y) id.
layout(local_size_x = 32) in;

// Scene description, read-only in this kernel.
// NOTE(review): presumably accessed through the readers generated in scene.h
// (e.g. PietItem_Poly_read, Point_read) -- confirm against that header.
layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
};

// Per-tilegroup data. main() reads the stroke item count directly from this
// array (byte offset >> 2 gives the word index); a chunk list starts 4 bytes
// after that count.
layout(set = 0, binding = 1) buffer TilegroupBuf {
    uint[] tilegroup;
};

// Output storage for this kernel.
// NOTE(review): presumably the target of TileHeader_write, ItemHeader_write
// and Segment_write from segment.h -- confirm against that header.
layout(set = 0, binding = 2) buffer SegmentBuf {
    uint[] segment;
};

// Single allocation counter. main() advances it with atomicAdd to reserve byte
// ranges for item headers and for segment storage; the value returned by
// atomicAdd is used as the start offset of the reserved range.
layout(set = 0, binding = 3) buffer AllocBuf {
    uint alloc;
};

#include "scene.h"
#include "tilegroup.h"
#include "segment.h"

#include "setup.h"

// Kernel entry point; each invocation handles one tile.
//
// For the tile addressed by gl_GlobalInvocationID.xy this:
//   1. reads the number of stroke items recorded for the tile's tilegroup,
//   2. reserves one ItemHeader per stroke item and writes the TileHeader that
//      points at them,
//   3. walks the tilegroup's chunked list of stroke instances and, for each
//      polyline, writes the segments that pass a conservative hit test against
//      the tile (padded by half the stroke width), then records the segment
//      count and location in the item's ItemHeader.
//
// Tiles whose tilegroup has no strokes just get a TileHeader with a zero count.
void main() {
    uint tile_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILES + gl_GlobalInvocationID.x;
    // The tile row is used directly as the tilegroup row; only x is divided
    // down by the tilegroup width.
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS
        + (gl_GlobalInvocationID.x / TILEGROUP_WIDTH_TILES);
    // Top-left corner of this tile, in pixels.
    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILE_WIDTH_PX, TILE_HEIGHT_PX);
    TileGroupRef stroke_start = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE + TILEGROUP_STROKE_START);
    // Offsets are in bytes; the buffer is an array of 32-bit words.
    uint stroke_n = tilegroup[stroke_start.offset >> 2];

    TileHeaderRef tile_header_ref = TileHeaderRef(tile_ix * TileHeader_size);
    if (stroke_n > 0) {
        // The first chunk header follows the 4-byte stroke count; instances
        // follow each chunk header.
        ChunkRef chunk_ref = ChunkRef(stroke_start.offset + 4);
        Chunk chunk = Chunk_read(chunk_ref);
        InstanceRef stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
        // Reserve one contiguous run of item headers for this tile.
        ItemHeaderRef item_header = ItemHeaderRef(atomicAdd(alloc, stroke_n * ItemHeader_size));
        TileHeader_write(tile_header_ref, TileHeader(stroke_n, item_header));
        // Current segment write position and the end of the currently reserved
        // range. Both start at 0, so the first item with a non-zero
        // reservation triggers an allocation.
        SegmentRef seg_ref = SegmentRef(0);
        uint seg_limit = 0;
        // Iterate through items; stroke_n holds count remaining.
        while (true) {
            // Current chunk exhausted: follow the link to the next one.
            if (chunk.chunk_n == 0) {
                chunk_ref = chunk.next;
                chunk = Chunk_read(chunk_ref);
                stroke_ref = InstanceRef(chunk_ref.offset + Chunk_size);
            }
            Instance ins = Instance_read(stroke_ref);
            PietStrokePolyLine poly = PietItem_Poly_read(PietItemRef(ins.item_ref));

            // Process the stroke polyline item.
            // A polyline with n points has at most n - 1 segments. Guard the
            // empty case: an unguarded `n_points - 1` would wrap to 0xFFFFFFFF
            // and turn the segment loop below into a runaway loop.
            uint max_n_segs = poly.n_points > 0 ? poly.n_points - 1 : 0u;
            uint reserve = max_n_segs * Segment_size;
            if (seg_ref.offset + reserve > seg_limit) {
                // This is a heuristic to balance atomic bandwidth and utilization.
                // The output always gets a contiguous allocation. We might use
                // all, some, or none of the capacity.
                uint capacity_bytes = stroke_n > 1 ? reserve * 2 + 128 : reserve;
                seg_ref.offset = atomicAdd(alloc, capacity_bytes);
                seg_limit = seg_ref.offset + capacity_bytes;
            }

            // Tile boundaries padded by half-width. These depend only on the
            // tile and the item, so compute them once per item instead of once
            // per segment.
            float half_width = 0.5 * poly.width;
            float xmin = xy0.x - half_width;
            float ymin = xy0.y - half_width;
            float xmax = xy0.x + float(TILE_WIDTH_PX) + half_width;
            float ymax = xy0.y + float(TILE_HEIGHT_PX) + half_width;

            uint n_segs = 0;
            vec2 start = Point_read(poly.points).xy;
            for (uint j = 0; j < max_n_segs; j++) {
                poly.points.offset += Point_size;
                vec2 end = Point_read(poly.points).xy;

                // Process one segment.

                // This logic just tests for collision. What we probably want to do
                // is a clipping algorithm like Liang-Barsky, and then store coords
                // relative to the tile in f16. See also:
                // https://tavianator.com/fast-branchless-raybounding-box-intersections/

                // Also note that when we go to the fancy version, we want to compute
                // the (horizontal projection of) the bounding box of the intersection
                // once per tilegroup, so we can assign work to individual tiles.

                // Implicit line equation a*x + b*y + c = 0 through start and end.
                float a = end.y - start.y;
                float b = start.x - end.x;
                float c = -(a * start.x + b * start.y);
                // Which side of the line each corner of the padded tile lies on.
                float s00 = sign(b * ymin + a * xmin + c);
                float s01 = sign(b * ymin + a * xmax + c);
                float s10 = sign(b * ymax + a * xmin + c);
                float s11 = sign(b * ymax + a * xmax + c);
                // If bounding boxes intersect and not all four corners are on the same side, hit.
                // Also note: this is designed to be false on NAN input.
                // NOTE(review): the comparisons are strict, so a segment with zero
                // extent on either axis (exactly horizontal, exactly vertical, or
                // zero-length) never passes the bounding-box test. Confirm whether
                // dropping exactly axis-aligned segments is intended.
                if (max(min(start.x, end.x), xmin) < min(max(start.x, end.x), xmax)
                    && max(min(start.y, end.y), ymin) < min(max(start.y, end.y), ymax)
                    && s00 * s01 + s00 * s10 + s00 * s11 < 3.0)
                {
                    Segment seg = Segment(start, end);
                    Segment_write(Segment_index(seg_ref, n_segs), seg);
                    n_segs++;
                }

                // The end of this segment is the start of the next.
                start = end;
            }
            ItemHeader_write(item_header, ItemHeader(n_segs, seg_ref));
            if (--stroke_n == 0) {
                break;
            }
            // Advance past the segments actually written; unused reserved
            // capacity stays available for the next item.
            seg_ref.offset += n_segs * Segment_size;

            stroke_ref.offset += Instance_size;
            chunk.chunk_n--;
            item_header.offset += ItemHeader_size;
        }
    } else {
        // As an optimization, we could just write 0 for the size.
        TileHeader_write(tile_header_ref, TileHeader(stroke_n, ItemHeaderRef(0)));
    }
}