// vello/piet-gpu/shader/coarse.comp
// The coarse rasterizer stage of the pipeline.
#version 450
#extension GL_GOOGLE_include_directive : enable
// Shared pipeline constants; N_TILE, N_TILE_X, N_WG, N_SLICE and
// BIN_INITIAL_ALLOC are defined here. NOTE(review): their values are not
// visible in this file — confirm against setup.h.
#include "setup.h"
// One invocation per tile slot; linear layout within the workgroup.
layout(local_size_x = N_TILE, local_size_y = 1) in;
// Input: annotated elements produced by an earlier pipeline stage,
// accessed through the accessors in annotated.h.
layout(set = 0, binding = 0) buffer AnnotatedBuf {
uint[] annotated;
};
// Input: per-bin chunk lists written by the binning stage (bins.h).
layout(set = 0, binding = 1) buffer BinsBuf {
uint[] bins;
};
// Allocation cursor. NOTE(review): declared but not referenced in this
// version of the shader — presumably for ptcl allocation later.
layout(set = 0, binding = 2) buffer AllocBuf {
uint alloc;
};
// Output: per-tile command lists; currently only receives debug output.
layout(set = 0, binding = 3) buffer PtclBuf {
uint[] ptcl;
};
#include "annotated.h"
#include "bins.h"
#include "ptcl.h"
// Capacity of the shared ring buffer of merged element indices.
#define N_RINGBUF 512
// Ring buffer holding element indices merged (in sorted order) from the
// per-workgroup bin chunk lists.
shared uint sh_elements[N_RINGBUF];
// Per-segment merge state, one entry per binning workgroup:
// current chunk offset, next-chunk link, and element count of the chunk.
shared uint sh_chunk[N_WG];
shared uint sh_chunk_next[N_WG];
shared uint sh_chunk_n[N_WG];
// atomicMin scratch used to pick the segment with the least first element.
shared uint sh_min_buf;
// Some of these are kept in shared memory to ease register
// pressure, but it could go either way.
// First (least) element index of each segment; ~0 marks an exhausted segment.
shared uint sh_first_el[N_WG];
// Element count of the chunk selected in the current merge round.
shared uint sh_selected_n;
// Buffer offset of the selected chunk's first BinInstance.
shared uint sh_elements_ref;
// Per-slice tile bitmaps; cleared each outer iteration but not otherwise
// consumed in this version of the shader (work in progress).
shared uint sh_bitmaps[N_SLICE][N_TILE];
// Merge the per-workgroup bin chunk lists for this bin into a shared ring
// buffer (in ascending element-index order), then read the annotated
// elements back out tile-sized batch by batch. Work in progress: the final
// store of `probe` into ptcl is debug output only.
void main() {
    // Could use either linear or 2d layouts for both dispatch and
    // invocations within the workgroup. We'll use variables to abstract.
    uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
    uint th_ix = gl_LocalInvocationID.x;
    // Monotonic ring-buffer cursors; wrapped (mod N_RINGBUF) on access.
    uint wr_ix = 0;
    uint rd_ix = 0;
    uint first_el;
    if (th_ix < N_WG) {
        // Each of the first N_WG threads walks the chunk list written by
        // one binning workgroup for this bin.
        uint start_chunk = (bin_ix * N_WG + th_ix) * BIN_INITIAL_ALLOC;
        sh_chunk[th_ix] = start_chunk;
        BinChunk chunk = BinChunk_read(BinChunkRef(start_chunk));
        sh_chunk_n[th_ix] = chunk.n;
        sh_chunk_next[th_ix] = chunk.next.offset;
        // ~0 is the "segment exhausted" sentinel; see min selection below.
        sh_first_el[th_ix] = chunk.n > 0 ?
            BinInstance_read(BinInstanceRef(start_chunk + BinChunk_size)).element_ix : ~0;
    }
    uint probe = 0; // for debugging
    do {
        for (uint i = 0; i < N_SLICE; i++) {
            sh_bitmaps[i][th_ix] = 0;
        }
        while (wr_ix - rd_ix <= N_TILE) {
            // Choose segment with least element.
            uint my_min;
            if (th_ix < N_WG) {
                if (th_ix == 0) {
                    sh_selected_n = 0;
                    // ~1 (not ~0) so exhausted segments, whose first_el is
                    // ~0, can never equal the minimum and get re-selected.
                    sh_min_buf = ~1;
                }
            }
            barrier();
            // Tempting to do this with subgroups, but atomic should be good enough.
            if (th_ix < N_WG) {
                // Read guarded: sh_first_el has only N_WG entries while
                // th_ix spans the full N_TILE workgroup; the unguarded read
                // in the original was out of bounds for th_ix >= N_WG.
                my_min = sh_first_el[th_ix];
                atomicMin(sh_min_buf, my_min);
            }
            barrier();
            if (th_ix < N_WG) {
                if (sh_first_el[th_ix] == sh_min_buf) {
                    // This thread owns the winning segment: publish its
                    // chunk, then advance to the next chunk in its list.
                    sh_elements_ref = sh_chunk[th_ix] + BinChunk_size;
                    uint selected_n = sh_chunk_n[th_ix];
                    sh_selected_n = selected_n;
                    uint next_chunk = sh_chunk_next[th_ix];
                    if (next_chunk == 0) {
                        // End of list; mark segment exhausted.
                        sh_first_el[th_ix] = ~0;
                    } else {
                        sh_chunk[th_ix] = next_chunk;
                        BinChunk chunk = BinChunk_read(BinChunkRef(next_chunk));
                        sh_chunk_n[th_ix] = chunk.n;
                        sh_chunk_next[th_ix] = chunk.next.offset;
                        sh_first_el[th_ix] = BinInstance_read(
                            BinInstanceRef(next_chunk + BinChunk_size)).element_ix;
                    }
                }
            }
            barrier();
            uint chunk_n = sh_selected_n;
            if (chunk_n == 0) {
                // All chunks consumed
                break;
            }
            // Copy the selected chunk's element indices into the ring
            // buffer, one element per thread.
            BinInstanceRef inst_ref = BinInstanceRef(sh_elements_ref);
            if (th_ix < chunk_n) {
                uint el = BinInstance_read(BinInstance_index(inst_ref, th_ix)).element_ix;
                sh_elements[(wr_ix + th_ix) % N_RINGBUF] = el;
                probe = el;
            }
            wr_ix += chunk_n;
        }
        // Make the ring-buffer writes above visible: the merge loop can
        // exit through its condition with no intervening barrier (the break
        // path is already covered by the barrier before it). Control flow
        // here is uniform, so this barrier is legal.
        barrier();
        // We've done the merge and filled the buffer.
        uint tag = Annotated_Nop;
        AnnotatedRef ref;
        if (th_ix + rd_ix < wr_ix) {
            // Read slot (rd_ix + th_ix) of the ring, mirroring the write
            // side above. The original wrapped the wrong quantity:
            // (sh_elements[rd_ix] + th_ix) % N_RINGBUF indexed the ring at
            // an un-wrapped rd_ix and mangled the element index itself.
            uint element_ix = sh_elements[(rd_ix + th_ix) % N_RINGBUF];
            ref = AnnotatedRef(element_ix * Annotated_size);
            tag = Annotated_tag(ref);
            probe = tag;
        }
        rd_ix += N_TILE;
    } while (wr_ix > rd_ix);
    ptcl[bin_ix * N_TILE + th_ix] = probe;
}