// vello/piet-gpu/shader/draw_leaf.comp

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The leaf scan pass for draw tag scan implemented as a tree reduction.
// This stage can be fused with its consumer but is separate now.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

// Number of consecutive draw tags handled sequentially by each invocation.
#define N_ROWS 8
// log2 of the workgroup size. LG_WG_FACTOR is not defined in this file;
// presumably it comes from one of the included headers -- TODO confirm.
#define LG_WG_SIZE (7 + LG_WG_FACTOR)
// Invocations per workgroup; also the length of the shared scratch array.
#define WG_SIZE (1 << LG_WG_SIZE)
// Number of draw tags covered by one workgroup.
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

// One-dimensional workgroup of WG_SIZE invocations.
layout(local_size_x = WG_SIZE, local_size_y = 1) in;

// Read-only configuration block. main() reads byte offsets from it
// (drawtag_offset, drawdata_offset, and the .offset of several *_alloc
// members) and shifts them right by 2 to obtain 32-bit word indices.
layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

// Read-only scene stream, addressed as an array of 32-bit words. main()
// reads the draw tags and the gradient draw data from it.
layout(binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

#include "scene.h"
#include "tile.h"
#include "drawtag.h"
#include "blend.h"

// Local alias so the scan code below can refer to the element type generically.
#define Monoid DrawMonoid

// Read-only per-workgroup prefixes. main() reads parent[gl_WorkGroupID.x - 1]
// as the prefix for every workgroup but the first; presumably an earlier pass
// fills this with the scanned partition aggregates -- TODO confirm.
layout(set = 0, binding = 3) readonly buffer ParentBuf {
    Monoid[] parent;
};

// Workgroup-shared scratch, one slot per invocation, used by main() for the
// in-workgroup inclusive scan of the per-invocation aggregates.
shared Monoid sh_scratch[WG_SIZE];

// Leaf pass of the draw-tag scan.
//
// Each invocation owns N_ROWS consecutive draw tags starting at
// gl_GlobalInvocationID.x * N_ROWS. The pass runs in four phases:
//   1. a sequential inclusive scan over the invocation's own tags,
//   2. a workgroup-wide inclusive scan of the per-invocation aggregates
//      through sh_scratch,
//   3. combination with the prefix read from `parent` for this workgroup,
//   4. per draw object: write the exclusive-scan monoid, the draw info
//      (fills and gradients) and the clip stream entry (begin/end clip).
//
// NOTE(review): no bounds check is made against the number of draw objects;
// this assumes the dispatch and buffers are sized/padded to a whole number
// of partitions -- confirm against the host-side dispatch code.
void main() {
    Monoid local[N_ROWS];

    // Index of the first draw object handled by this invocation.
    uint ix = gl_GlobalInvocationID.x * N_ROWS;
    // Config offsets are in bytes; >> 2 converts to 32-bit word indices.
    uint drawtag_base = conf.drawtag_offset >> 2;
    uint tag_word = scene[drawtag_base + ix];

    // Phase 1: inclusive scan of this invocation's N_ROWS tags.
    Monoid agg = map_tag(tag_word);
    local[0] = agg;
    for (uint i = 1; i < N_ROWS; i++) {
        tag_word = scene[drawtag_base + ix + i];
        agg = combine_draw_monoid(agg, map_tag(tag_word));
        local[i] = agg;
    }

    // Phase 2: inclusive scan of the per-invocation aggregates across the
    // workgroup. Each round reads the slot (1 << i) positions to the left;
    // the barriers keep every read of a round ahead of that round's writes
    // and every write ahead of the next round's reads.
    sh_scratch[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1u << i)) {
            Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
            agg = combine_draw_monoid(other, agg);
        }
        barrier();
        sh_scratch[gl_LocalInvocationID.x] = agg;
    }

    // Make the final scan results visible before reading a neighbour's slot.
    barrier();

    // Phase 3: exclusive prefix for this invocation = prefix of all earlier
    // workgroups combined with the inclusive scan of the invocation to the left.
    Monoid row = draw_monoid_identity();
    if (gl_WorkGroupID.x > 0) {
        row = parent[gl_WorkGroupID.x - 1];
    }
    if (gl_LocalInvocationID.x > 0) {
        row = combine_draw_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
    }

    uint drawdata_base = conf.drawdata_offset >> 2;
    uint drawinfo_base = conf.drawinfo_alloc.offset >> 2;
    // Four output words are written per draw object.
    uint out_base = (conf.drawmonoid_alloc.offset >> 2) + ix * 4;
    uint clip_out_base = conf.clip_alloc.offset >> 2;

    // Phase 4: emit the outputs for each owned draw object.
    for (uint i = 0; i < N_ROWS; i++) {
        Monoid m = row;
        if (i > 0) {
            m = combine_draw_monoid(m, local[i - 1]);
        }
        // m now holds exclusive scan of draw monoid
        memory[out_base + i * 4] = m.path_ix;
        memory[out_base + i * 4 + 1] = m.clip_ix;
        memory[out_base + i * 4 + 2] = m.scene_offset;
        memory[out_base + i * 4 + 3] = m.info_offset;

        // u32 offset of drawobj data
        uint dd = drawdata_base + (m.scene_offset >> 2);
        // u32 offset of the draw info record for this object
        uint di = drawinfo_base + (m.info_offset >> 2);

        tag_word = scene[drawtag_base + ix + i];
        if (tag_word == Drawtag_FillColor || tag_word == Drawtag_FillLinGradient || tag_word == Drawtag_FillImage ||
            tag_word == Drawtag_BeginClip) {
            // Six words per path in the bbox allocation. Word 4 is the line
            // width as float bits and word 5 the transform index; words 0-3
            // are not needed by this stage.
            uint bbox_offset = (conf.path_bbox_alloc.offset >> 2) + 6 * m.path_ix;
            float linewidth = uintBitsToFloat(memory[bbox_offset + 4]);
            // mat / translate are loaded only on the paths that consume them.
            vec4 mat;
            vec2 translate;
            if (linewidth >= 0.0 || tag_word == Drawtag_FillLinGradient) {
                uint trans_ix = memory[bbox_offset + 5];
                // Six words per transform: 2x2 matrix followed by translation.
                uint t = (conf.trans_alloc.offset >> 2) + 6 * trans_ix;
                mat = uintBitsToFloat(uvec4(memory[t], memory[t + 1], memory[t + 2], memory[t + 3]));
                if (tag_word == Drawtag_FillLinGradient) {
                    translate = uintBitsToFloat(uvec2(memory[t + 4], memory[t + 5]));
                }
            }
            if (linewidth >= 0.0) {
                // Scale the line width by sqrt(|det(mat)|).
                // TODO: need to deal with anisotropic case
                linewidth *= sqrt(abs(mat.x * mat.w - mat.y * mat.z));
            }
            switch (tag_word) {
            case Drawtag_FillColor:
            case Drawtag_FillImage:
                memory[di] = floatBitsToUint(linewidth);
                break;
            case Drawtag_FillLinGradient: {
                memory[di] = floatBitsToUint(linewidth);
                // Gradient endpoints follow the first word of the draw data.
                vec2 p0 = uintBitsToFloat(uvec2(scene[dd + 1], scene[dd + 2]));
                vec2 p1 = uintBitsToFloat(uvec2(scene[dd + 3], scene[dd + 4]));
                // Apply the affine transform to both endpoints.
                p0 = mat.xy * p0.x + mat.zw * p0.y + translate;
                p1 = mat.xy * p1.x + mat.zw * p1.y + translate;
                // Line coefficients such that line_x * x + line_y * y + line_c
                // is 0 at p0 and 1 at p1.
                // NOTE(review): coincident endpoints make `scale` infinite.
                vec2 dxy = p1 - p0;
                float scale = 1.0 / (dxy.x * dxy.x + dxy.y * dxy.y);
                float line_x = dxy.x * scale;
                float line_y = dxy.y * scale;
                float line_c = -(p0.x * line_x + p0.y * line_y);
                memory[di + 1] = floatBitsToUint(line_x);
                memory[di + 2] = floatBitsToUint(line_y);
                memory[di + 3] = floatBitsToUint(line_c);
                break;
            }
            case Drawtag_BeginClip:
                // No draw info is written for a clip begin.
                break;
            }
        }
        // Generate clip stream.
        if (tag_word == Drawtag_BeginClip || tag_word == Drawtag_EndClip) {
            // BeginClip stores the path index; EndClip stores the bitwise
            // complement of the draw object index.
            uint path_ix = ~(ix + i);
            if (tag_word == Drawtag_BeginClip) {
                path_ix = m.path_ix;
            }
            memory[clip_out_base + m.clip_ix] = path_ix;
        }
    }
}