// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The second dispatch of clip stack processing.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE WG_SIZE

layout(local_size_x = WG_SIZE) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

// Some of this is cut'n'paste duplication with the reduce pass, and
// arguably should be moved to a common .h file.
// The bicyclic monoid

struct ClipEl {
    // index of parent node
    uint parent_ix;
    // bounding box
    vec4 bbox;
};

struct Bic {
    uint a;
    uint b;
};

Bic bic_combine(Bic x, Bic y) {
    uint m = min(x.b, y.a);
    return Bic(x.a + y.a - m, x.b + y.b - m);
}

// Load path's bbox from bbox (as written by pathseg).
vec4 load_path_bbox(uint path_ix) {
    uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
    float bbox_l = float(memory[base]) - 32768.0;
    float bbox_t = float(memory[base + 1]) - 32768.0;
    float bbox_r = float(memory[base + 2]) - 32768.0;
    float bbox_b = float(memory[base + 3]) - 32768.0;
    vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
    return bbox;
}

vec4 bbox_intersect(vec4 a, vec4 b) {
    return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
}

shared Bic sh_bic[WG_SIZE * 2 - 2];
shared uint sh_stack[PARTITION_SIZE];
shared vec4 sh_stack_bbox[PARTITION_SIZE];
shared uint sh_link[PARTITION_SIZE];
shared vec4 sh_bbox[PARTITION_SIZE];

// This is adapted directly from the stack monoid impl.
// Return value is reference within partition if >= 0,
// otherwise reference to stack.
uint search_link(inout Bic bic) {
    uint ix = gl_LocalInvocationID.x;
    uint j = 0;
    while (j < LG_WG_SIZE) {
        uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
        if (((ix >> j) & 1) != 0) {
            Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
            if (test.b > 0) {
                break;
            }
            bic = test;
            ix -= 1u << j;
        }
        j++;
    }
    if (ix > 0) {
        while (j > 0) {
            j--;
            uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));
            Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);
            if (test.b == 0) {
                bic = test;
                ix -= 1u << j;
            }
        }
    }
    // ix is the smallest value such that reduce(ix..th).b == 0
    if (ix > 0) {
        return ix - 1;
    } else {
        return ~0u - bic.a;
    }
}

Bic load_bic(uint ix) {
    uint base = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
    return Bic(memory[base], memory[base + 1]);
}

ClipEl load_clip_el(uint ix) {
    uint base = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
    uint parent_ix = memory[base];
    float x0 = uintBitsToFloat(memory[base + 1]);
    float y0 = uintBitsToFloat(memory[base + 2]);
    float x1 = uintBitsToFloat(memory[base + 3]);
    float y1 = uintBitsToFloat(memory[base + 4]);
    vec4 bbox = vec4(x0, y0, x1, y1);
    return ClipEl(parent_ix, bbox);
}

uint load_path_ix(uint ix) {
    // This is one approach to a partial final block. Another would be
    // to do a memset to the padding in the command queue.
    if (ix < conf.n_clip) {
        return memory[(conf.clip_alloc.offset >> 2) + ix];
    } else {
        // EndClip tags don't implicate further loads.
        return 0x80000000;
    }
}

void store_clip_bbox(uint ix, vec4 bbox) {
    uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix;
    memory[base] = floatBitsToUint(bbox.x);
    memory[base + 1] = floatBitsToUint(bbox.y);
    memory[base + 2] = floatBitsToUint(bbox.z);
    memory[base + 3] = floatBitsToUint(bbox.w);
}

void main() {
    // materialize stack up to the start of this partition. This
    // is based on the pure stack monoid, but with two additions.

    // First, (this only matters if the stack goes deeper than the
    // partition size, which might be unlikely in practice), the
    // topmost stack element from each partition is picked, then an
    // exclusive scan of those. Also note that if this is skipped,
    // a scan is not needed in the reduce stage.

    // Second, after the stream compaction, do a scan of the retrieved
    // bbox values.
    uint th = gl_LocalInvocationID.x;
    Bic bic = Bic(0, 0);
    if (th < gl_WorkGroupID.x) {
        bic = load_bic(th);
    }
    sh_bic[th] = bic;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (th + (1u << i) < WG_SIZE) {
            Bic other = sh_bic[th + (1u << i)];
            bic = bic_combine(bic, other);
        }
        barrier();
        sh_bic[th] = bic;
    }
    barrier();
    uint stack_size = sh_bic[0].b;

    // TODO: do bbox scan here (to unlock greater stack depth)

    // binary search in stack
    uint sp = PARTITION_SIZE - 1 - th;
    uint ix = 0;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        uint probe = ix + (uint(PARTITION_SIZE / 2) >> i);
        if (sp < sh_bic[probe].b) {
            ix = probe;
        }
    }
    // ix is largest value such that sp < sh_bic[ix].b (if any)
    uint b = sh_bic[ix].b;
    vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9);
    if (sp < b) {
        // maybe store the index here for future use?
        ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);
        sh_stack[th] = el.parent_ix;
        bbox = el.bbox;
        // other element values here?
    }

    // forward scan of bbox values of prefix stack
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        sh_stack_bbox[th] = bbox;
        barrier();
        if (th >= (1u << i)) {
            bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox);
        }
        barrier();
    }
    sh_stack_bbox[th] = bbox;

    // Read input and compute bicyclic semigroup binary tree
    uint inp = load_path_ix(gl_GlobalInvocationID.x);
    bool is_push = int(inp) >= 0;
    bic = Bic(1 - uint(is_push), uint(is_push));
    sh_bic[th] = bic;
    if (is_push) {
        bbox = load_path_bbox(inp);
    } else {
        bbox = vec4(-1e9, -1e9, 1e9, 1e9);
    }
    uint inbase = 0;
    for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
        uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
        barrier();
        if (th < (1u << (LG_WG_SIZE - 1 - i))) {
            sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);
        }
        inbase = outbase;
    }
    barrier();
    // Search for predecessor node
    bic = Bic(0, 0);
    uint link = search_link(bic);
    // we use N_SEQ > 1 convention here:
    // link >= 0 is index within partition
    // link < 0 is reference to stack

    // We want grandparent bbox for pop nodes, so follow those links.
    sh_link[th] = link;
    barrier();
    uint grandparent;
    if (int(link) >= 0) {
        grandparent = sh_link[link];
    } else {
        grandparent = link - 1;
    }

    // Resolve parent
    uint parent;
    if (int(link) >= 0) {
        parent = gl_WorkGroupID.x * PARTITION_SIZE + link;
    } else if (int(link + stack_size) >= 0) {
        parent = sh_stack[PARTITION_SIZE + link];
    } else {
        parent = ~0u;
    }

    // bbox scan along parent links
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        // sh_link was already stored for first iteration
        if (i != 0) {
            sh_link[th] = link;
        }
        sh_bbox[th] = bbox;
        barrier();
        if (int(link) >= 0) {
            bbox = bbox_intersect(sh_bbox[link], bbox);
            link = sh_link[link];
        }
        barrier();
    }
    if (int(link + stack_size) >= 0) {
        bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);
    }
    // At this point, bbox is the reduction of bounding boxes along the tree.
    sh_bbox[th] = bbox;
    barrier();

    uint path_ix = inp;
    if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) {
        // Is this load expensive? If so, it's loaded earlier for in-partition
        // and is in the ClipEl for cross-partition.
        // If not, can probably get rid of it in the stack intermediate buf.
        path_ix = load_path_ix(parent);
        uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp;
        // Fix up drawmonoid so path_ix at EndClip matches BeginClip
        memory[drawmonoid_out_base] = path_ix;

        if (int(grandparent) >= 0) {
            bbox = sh_bbox[grandparent];
        } else if (int(grandparent + stack_size) >= 0) {
            bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];
        } else {
            bbox = vec4(-1e9, -1e9, 1e9, 1e9);
        }
    }
    store_clip_bbox(gl_GlobalInvocationID.x, bbox);
}