vello/piet-gpu/shader/kernel1.comp

// This is "kernel 1" in a 4-kernel pipeline. It traverses the scene graph
// and outputs "instances" (references to item + translation) for each item
// that intersects the tilegroup.
//
// This implementation is simplistic and leaves a lot of performance on the
// table. A fancier implementation would use threadgroup shared memory or
// subgroups (or possibly both) to parallelize the reading of the input and
// the computation of tilegroup intersection.
//
// In addition, there are some features currently missing, such as support
// for clipping.
#version 450
#extension GL_GOOGLE_include_directive : enable

// It's possible we should lay this out with x and do our own math.
layout(local_size_x = 1, local_size_y = 32) in;
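
// Bindings: the scene is read-only input, the tilegroup buffer receives the
// per-tilegroup instance lists, and alloc is a global bump-allocation cursor
// shared by all invocations via atomicAdd.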
layout(set = 0, binding = 0) readonly buffer SceneBuf {
    uint[] scene;
};

layout(set = 0, binding = 1) buffer TilegroupBuf {
    uint[] tilegroup;
};

layout(set = 0, binding = 2) buffer AllocBuf {
    uint alloc;
};

#include "scene.h"
#include "tilegroup.h"
#include "setup.h"
#define MAX_STACK 8

struct StackElement {
    PietItemRef group;
    uint index;
    vec2 offset;
};

void main() {
    StackElement stack[MAX_STACK];
    uint stack_ix = 0;
    uint tilegroup_ix = gl_GlobalInvocationID.y * WIDTH_IN_TILEGROUPS + gl_GlobalInvocationID.x;
    TileGroupRef tg_ref = TileGroupRef(tilegroup_ix * TILEGROUP_STRIDE);
    uint tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
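    // tg_limit leaves two records of headroom so that a Jump (and the final
    // End marker) can always be written after the last Instance.

    // Strokes and fills are additionally gathered into chunked linked lists:
    // each chunk is a Chunk header (count, next) followed by Instance records,
    // and a fresh chunk is bump-allocated from alloc when the current one fills.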
    // State for stroke references.
    TileGroupRef stroke_start = TileGroupRef(tg_ref.offset + TILEGROUP_STROKE_START);
    ChunkRef stroke_chunk_start = ChunkRef(stroke_start.offset + 4);
    InstanceRef stroke_ref = InstanceRef(stroke_chunk_start.offset + Chunk_size);
    uint stroke_limit = stroke_start.offset + TILEGROUP_INITIAL_STROKE_ALLOC - Instance_size;
    uint stroke_chunk_n = 0;
    uint stroke_n = 0;

    // State for fill references. All this is a bit cut'n'paste, but making a
    // proper abstraction isn't easy.
    TileGroupRef fill_start = TileGroupRef(tg_ref.offset + TILEGROUP_FILL_START);
    ChunkRef fill_chunk_start = ChunkRef(fill_start.offset + 4);
    InstanceRef fill_ref = InstanceRef(fill_chunk_start.offset + Chunk_size);
    uint fill_limit = fill_start.offset + TILEGROUP_INITIAL_FILL_ALLOC - Instance_size;
    uint fill_chunk_n = 0;
    uint fill_n = 0;

    vec2 xy0 = vec2(gl_GlobalInvocationID.xy) * vec2(TILEGROUP_WIDTH_PX, TILEGROUP_HEIGHT_PX);
    PietItemRef root = PietItemRef(0);
    SimpleGroup group = PietItem_Group_read(root);
    StackElement tos = StackElement(root, 0, group.offset.xy);
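
    // Depth-first traversal of the scene graph: visit each item of the current
    // group in turn, emit an Instance for every leaf whose bbox overlaps this
    // tilegroup, and descend into child groups via the explicit stack.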
    while (true) {
        if (tos.index < group.n_items) {
            Bbox bbox = Bbox_read(Bbox_index(group.bboxes, tos.index));
            vec4 bb = vec4(bbox.bbox) + tos.offset.xyxy;
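            // Interval-overlap test between the translated item bbox and this
            // tilegroup's rectangle.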
            bool hit = max(bb.x, xy0.x) < min(bb.z, xy0.x + float(TILEGROUP_WIDTH_PX))
                && max(bb.y, xy0.y) < min(bb.w, xy0.y + float(TILEGROUP_HEIGHT_PX));
            bool is_group = false;
            uint tag;
            if (hit) {
                PietItemRef item_ref = PietItem_index(group.items, tos.index);
                tag = PietItem_tag(item_ref);
                is_group = tag == PietItem_Group;
            }
            if (hit && !is_group) {
                PietItemRef item_ref = PietItem_index(group.items, tos.index);
                Instance ins = Instance(item_ref.offset, tos.offset);
                if (tg_ref.offset > tg_limit) {
                    // Allocation exceeded; do atomic bump alloc.
                    uint new_tg = atomicAdd(alloc, TILEGROUP_INITIAL_ALLOC);
                    Jump jump = Jump(TileGroupRef(new_tg));
                    TileGroup_Jump_write(tg_ref, jump);
                    tg_ref = TileGroupRef(new_tg);
                    tg_limit = tg_ref.offset + TILEGROUP_INITIAL_ALLOC - 2 * TileGroup_size;
                }
                TileGroup_Instance_write(tg_ref, ins);
                tg_ref.offset += TileGroup_size;
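                // Strokes and fills are additionally recorded in their own
                // chunked lists so that later kernels in the pipeline can
                // consume them without re-walking the main instance stream.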
                if (tag == PietItem_Poly) {
                    if (stroke_ref.offset > stroke_limit) {
                        uint new_stroke = atomicAdd(alloc, TILEGROUP_STROKE_ALLOC);
                        Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(new_stroke)));
                        stroke_chunk_start = ChunkRef(new_stroke);
                        stroke_ref = InstanceRef(new_stroke + Chunk_size);
                        stroke_n += stroke_chunk_n;
                        stroke_chunk_n = 0;
                        stroke_limit = new_stroke + TILEGROUP_STROKE_ALLOC - Instance_size;
                    }
                    Instance_write(stroke_ref, ins);
                    stroke_chunk_n++;
                    stroke_ref.offset += Instance_size;
                } else if (tag == PietItem_Fill) {
                    if (fill_ref.offset > fill_limit) {
                        uint new_fill = atomicAdd(alloc, TILEGROUP_FILL_ALLOC);
                        Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(new_fill)));
                        fill_chunk_start = ChunkRef(new_fill);
                        fill_ref = InstanceRef(new_fill + Chunk_size);
                        fill_n += fill_chunk_n;
                        fill_chunk_n = 0;
                        fill_limit = new_fill + TILEGROUP_FILL_ALLOC - Instance_size;
                    }
                    Instance_write(fill_ref, ins);
                    fill_chunk_n++;
                    fill_ref.offset += Instance_size;
                }
            }
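            // Descend into a group: save the parent's resume point on the
            // stack (only if it still has items left), then make the child
            // the new top of stack, accumulating its translation offset.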
            if (is_group) {
                PietItemRef item_ref = PietItem_index(group.items, tos.index);
                tos.index++;
                if (tos.index < group.n_items) {
                    stack[stack_ix++] = tos;
                }
                group = PietItem_Group_read(item_ref);
                tos = StackElement(item_ref, 0, tos.offset + group.offset.xy);
            } else {
                tos.index++;
            }
        } else {
            // processed all items in this group; pop the stack
            if (stack_ix == 0) {
                break;
            }
            tos = stack[--stack_ix];
            group = PietItem_Group_read(tos.group);
        }
    }
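
    // Epilogue: terminate this tilegroup's instance list, close out the last
    // stroke and fill chunks, and store the total counts at the head of each
    // list (the slot reserved by the +4 above).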
    TileGroup_End_write(tg_ref);

    stroke_n += stroke_chunk_n;
    if (stroke_n > 0) {
        Chunk_write(stroke_chunk_start, Chunk(stroke_chunk_n, ChunkRef(0)));
    }
    tilegroup[stroke_start.offset >> 2] = stroke_n;

    fill_n += fill_chunk_n;
    if (fill_n > 0) {
        Chunk_write(fill_chunk_start, Chunk(fill_chunk_n, ChunkRef(0)));
    }
    tilegroup[fill_start.offset >> 2] = fill_n;
}