// The coarse rasterizer stage of the pipeline.
//
// One workgroup per bin (tile group). Each workgroup performs a k-way merge
// of the N_WG per-binning-workgroup chunk lists for its bin into a shared
// ring buffer of element indices, then reads the merged elements back out
// N_TILE at a time.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "setup.h"

layout(local_size_x = N_TILE, local_size_y = 1) in;

layout(set = 0, binding = 0) buffer AnnotatedBuf {
    uint[] annotated;
};

layout(set = 0, binding = 1) buffer BinsBuf {
    uint[] bins;
};

layout(set = 0, binding = 2) buffer AllocBuf {
    uint alloc;
};

layout(set = 0, binding = 3) buffer PtclBuf {
    uint[] ptcl;
};

#include "annotated.h"
#include "bins.h"
#include "ptcl.h"

#define N_RINGBUF 512

// Ring buffer of merged element indices; wr_ix/rd_ix cursors are monotonic
// and wrapped mod N_RINGBUF on each access.
shared uint sh_elements[N_RINGBUF];
// Per-producer (one producer per binning workgroup) chunk-list cursor state.
shared uint sh_chunk[N_WG];
shared uint sh_chunk_next[N_WG];
shared uint sh_chunk_n[N_WG];
// Scratch for the atomicMin reduction that selects the next chunk to merge.
shared uint sh_min_buf;
// Some of these are kept in shared memory to ease register
// pressure, but it could go either way.
shared uint sh_first_el[N_WG];
shared uint sh_selected_n;
shared uint sh_elements_ref;
shared uint sh_bitmaps[N_SLICE][N_TILE];

void main() {
    // Could use either linear or 2d layouts for both dispatch and
    // invocations within the workgroup. We'll use variables to abstract.
    uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
    uint th_ix = gl_LocalInvocationID.x;
    // Monotonic ring-buffer cursors; wr_ix - rd_ix is the number of merged
    // elements not yet consumed.
    uint wr_ix = 0;
    uint rd_ix = 0;
    // Threads 0..N_WG-1 each own one producer: load its first chunk header
    // and the first element (or the ~0 "exhausted" sentinel if empty).
    if (th_ix < N_WG) {
        uint start_chunk = (bin_ix * N_WG + th_ix) * BIN_INITIAL_ALLOC;
        sh_chunk[th_ix] = start_chunk;
        BinChunk chunk = BinChunk_read(BinChunkRef(start_chunk));
        sh_chunk_n[th_ix] = chunk.n;
        sh_chunk_next[th_ix] = chunk.next.offset;
        sh_first_el[th_ix] = chunk.n > 0 ?
            BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0;
    }
    uint probe = 0; // for debugging
    do {
        for (uint i = 0; i < N_SLICE; i++) {
            sh_bitmaps[i][th_ix] = 0;
        }

        // Refill the ring buffer while there is room for at least one more
        // chunk (chunks hold at most N_TILE elements).
        while (wr_ix - rd_ix <= N_TILE) {
            // Choose segment with least element.
            if (th_ix < N_WG) {
                if (th_ix == 0) {
                    sh_selected_n = 0;
                    // Initialize to ~1, one below the ~0 sentinel: if every
                    // producer is exhausted, no thread matches the min below,
                    // sh_selected_n stays 0, and the merge terminates.
                    sh_min_buf = ~1;
                }
            }
            barrier();
            // Tempting to do this with subgroups, but atomic should be good enough.
            if (th_ix < N_WG) {
                // Read guarded by th_ix < N_WG: sh_first_el has only N_WG
                // entries, so the original unguarded read was out of bounds
                // for threads N_WG..N_TILE-1.
                uint my_min = sh_first_el[th_ix];
                atomicMin(sh_min_buf, my_min);
            }
            barrier();
            if (th_ix < N_WG) {
                // NOTE(review): assumes first elements are distinct across
                // producers so exactly one thread wins — confirm against the
                // binning stage's output.
                if (sh_first_el[th_ix] == sh_min_buf) {
                    // Publish the winning chunk, then advance this producer
                    // to its next chunk (or mark it exhausted with ~0).
                    sh_elements_ref = sh_chunk[th_ix] + BinChunk_size;
                    uint selected_n = sh_chunk_n[th_ix];
                    sh_selected_n = selected_n;
                    uint next_chunk = sh_chunk_next[th_ix];
                    if (next_chunk == 0) {
                        sh_first_el[th_ix] = ~0;
                    } else {
                        sh_chunk[th_ix] = next_chunk;
                        BinChunk chunk = BinChunk_read(BinChunkRef(next_chunk));
                        sh_chunk_n[th_ix] = chunk.n;
                        sh_chunk_next[th_ix] = chunk.next.offset;
                        sh_first_el[th_ix] = BinInstance_read(
                            BinInstanceRef(next_chunk + BinChunk_size)).element_ix;
                    }
                }
            }
            barrier();
            uint chunk_n = sh_selected_n;
            if (chunk_n == 0) {
                // All chunks consumed
                break;
            }
            // Cooperatively copy the selected chunk into the ring buffer,
            // one element per thread.
            BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref);
            if (th_ix < chunk_n) {
                uint el = BinInstance_read(BinInstance_index(inst_ref, th_ix)).element_ix;
                sh_elements[(wr_ix + th_ix) % N_RINGBUF] = el;
                probe = el;
            }
            wr_ix += chunk_n;
        }

        // We've done the merge and filled the buffer.
        uint tag = Annotated_Nop;
        AnnotatedRef ref;
        if (th_ix + rd_ix < wr_ix) {
            // Fix: wrap the ring-buffer *index*, mirroring the write above.
            // The original computed (sh_elements[rd_ix] + th_ix) % N_RINGBUF,
            // which wrapped the element value and read the same slot rd_ix
            // (unwrapped) for every thread.
            uint element_ix = sh_elements[(rd_ix + th_ix) % N_RINGBUF];
            ref = AnnotatedRef(element_ix * Annotated_size);
            tag = Annotated_tag(ref);
            probe = tag;
        }
        rd_ix += N_TILE;
    } while (wr_ix > rd_ix);
    // Debug output: dump each thread's probe value into the ptcl buffer.
    ptcl[bin_ix * N_TILE + th_ix] = probe;
}