vello/piet-gpu/shader/clip_leaf.comp

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The second dispatch of clip stack processing.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

#define LG_WG_SIZE (7 + LG_WG_FACTOR)
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE WG_SIZE

layout(local_size_x = WG_SIZE) in;

layout(binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

// Some of this is cut'n'paste duplication with the reduce pass, and
// arguably should be moved to a common .h file.
// The bicyclic monoid

// One entry of the clip stack intermediate buffer, in the form returned
// by load_clip_el (which reads it as five consecutive 32-bit words: the
// parent index followed by the four bbox components).
struct ClipEl {
    // index of parent node
    uint parent_ix;
    // bounding box, stored as (x0, y0, x1, y1)
    vec4 bbox;
};

// Element of the bicyclic monoid used to track clip stack nesting.
// In main() a push (BeginClip) is encoded as Bic(0, 1) and a pop
// (EndClip) as Bic(1, 0); bic_combine cancels the pushes of its left
// argument against the pops of its right argument.
struct Bic {
    // number of pops not matched by an earlier push in the sequence
    uint a;
    // number of pushes not matched by a later pop in the sequence
    uint b;
};

// Associative combination of two bicyclic monoid elements, where x
// precedes y in the sequence. Pushes left open by x (x.b) are cancelled
// against pops at the start of y (y.a); whatever is left over on either
// side is carried into the result.
Bic bic_combine(Bic x, Bic y) {
    uint cancelled = min(x.b, y.a);
    Bic combined;
    combined.a = x.a + y.a - cancelled;
    combined.b = x.b + y.b - cancelled;
    return combined;
}

// Load a path's bounding box from the path bbox buffer (as written by
// pathseg). Each path occupies 6 words there; the first four hold the
// left, top, right and bottom edges as unsigned integers biased by
// 32768, so the bias is subtracted after conversion to float.
vec4 load_path_bbox(uint path_ix) {
    uint first = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
    vec4 biased = vec4(
        float(memory[first]),
        float(memory[first + 1]),
        float(memory[first + 2]),
        float(memory[first + 3])
    );
    return biased - 32768.0;
}

// Intersection of two bounding boxes given as (x0, y0, x1, y1): the
// component-wise max of the two minimum corners together with the
// component-wise min of the two maximum corners. The result is not
// normalized, so disjoint inputs yield a box with min > max.
vec4 bbox_intersect(vec4 a, vec4 b) {
    vec2 lo = max(a.xy, b.xy);
    vec2 hi = min(a.zw, b.zw);
    return vec4(lo, hi);
}

// Workgroup-shared scratch storage.

// Used twice in main(): first (entries 0..WG_SIZE-1) for the scan over the
// bics of preceding partitions, then as a binary reduction tree over this
// partition's elements, which search_link walks. The tree stores its
// levels back to back, hence the 2 * WG_SIZE - 2 entries.
shared Bic sh_bic[WG_SIZE * 2 - 2];
// parent_ix of each materialized entry of the prefix stack
shared uint sh_stack[PARTITION_SIZE];
// scanned (cumulatively intersected) bboxes of the prefix stack
shared vec4 sh_stack_bbox[PARTITION_SIZE];
// per-element link to the predecessor node, as returned by search_link
shared uint sh_link[PARTITION_SIZE];
// per-element bbox, intersected along the links during the bbox scan
shared vec4 sh_bbox[PARTITION_SIZE];

// Find the predecessor (enclosing push) of the calling invocation's
// element by walking the reduction tree in sh_bic. Adapted directly from
// the stack monoid implementation.
//
// bic is an in/out accumulator: the caller passes the identity Bic(0, 0)
// and on return it holds the reduction of the elements that were skipped.
//
// The return value, when reinterpreted as a signed int, is an index
// within the partition if >= 0, otherwise a reference into the prefix
// stack (~0u - bic.a, i.e. -1 minus the number of unmatched pops).
uint search_link(inout Bic bic) {
    uint pos = gl_LocalInvocationID.x;
    uint level = 0;
    // Upward phase: absorb ever larger aligned blocks to the left for as
    // long as the accumulated reduction contains no unmatched push.
    for (; level < LG_WG_SIZE; level++) {
        if (((pos >> level) & 1) == 0) {
            continue;
        }
        uint level_base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - level));
        Bic candidate = bic_combine(sh_bic[level_base + (pos >> level) - 1], bic);
        if (candidate.b > 0) {
            // The block at this level contains the push we are looking for.
            break;
        }
        bic = candidate;
        pos -= 1u << level;
    }
    // Downward phase: narrow the block found above, skipping every
    // sub-block that does not leave an unmatched push.
    if (pos > 0) {
        while (level > 0) {
            level--;
            uint level_base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - level));
            Bic candidate = bic_combine(sh_bic[level_base + (pos >> level) - 1], bic);
            if (candidate.b == 0) {
                bic = candidate;
                pos -= 1u << level;
            }
        }
    }
    // pos is the smallest value such that reduce(pos..th).b == 0
    return (pos > 0) ? pos - 1 : ~0u - bic.a;
}

// Read the bicyclic monoid element stored for partition ix in the clip
// bic buffer (two consecutive words: a, then b).
Bic load_bic(uint ix) {
    uint word = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;
    Bic loaded;
    loaded.a = memory[word];
    loaded.b = memory[word + 1];
    return loaded;
}

// Read entry ix of the clip stack buffer. Each entry is five words: the
// parent index followed by the bbox components (x0, y0, x1, y1) stored as
// raw float bits.
ClipEl load_clip_el(uint ix) {
    uint word = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;
    ClipEl el;
    el.parent_ix = memory[word];
    el.bbox = vec4(
        uintBitsToFloat(memory[word + 1]),
        uintBitsToFloat(memory[word + 2]),
        uintBitsToFloat(memory[word + 3]),
        uintBitsToFloat(memory[word + 4])
    );
    return el;
}

// Read the clip input word for element ix. Indices at or beyond
// conf.n_clip yield a word with the sign bit set, which main() treats as
// an EndClip.
uint load_path_ix(uint ix) {
    // This is one approach to a partial final block. Another would be
    // to do a memset to the padding in the command queue.
    if (ix >= conf.n_clip) {
        // EndClip tags don't implicate further loads.
        return 0x80000000;
    }
    return memory[(conf.clip_alloc.offset >> 2) + ix];
}

// Write bbox as entry ix of the clip bbox buffer: four consecutive words
// holding the raw float bits of x, y, z and w.
void store_clip_bbox(uint ix, vec4 bbox) {
    uint word = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix;
    uvec4 bits = floatBitsToUint(bbox);
    memory[word] = bits.x;
    memory[word + 1] = bits.y;
    memory[word + 2] = bits.z;
    memory[word + 3] = bits.w;
}

// Entry point. Each invocation handles one clip element (BeginClip or
// EndClip) of its partition and writes that element's clip bounding box.
// The phases below communicate through the shared arrays and rely on the
// exact placement of the barrier() calls.
void main() {
    // materialize stack up to the start of this partition. This
    // is based on the pure stack monoid, but with two additions.

    // First, (this only matters if the stack goes deeper than the
    // partition size, which might be unlikely in practice), the
    // topmost stack element from each partition is picked, then an
    // exclusive scan of those. Also note that if this is skipped,
    // a scan is not needed in the reduce stage.

    // Second, after the stream compaction, do a scan of the retrieved
    // bbox values.
    uint th = gl_LocalInvocationID.x;
    Bic bic = Bic(0, 0);
    // Thread th loads the bic of preceding partition th; the remaining
    // threads contribute the identity.
    // NOTE(review): th < WG_SIZE, so at most WG_SIZE preceding partitions
    // are taken into account here -- confirm this limit is acceptable.
    if (th < gl_WorkGroupID.x) {
        bic = load_bic(th);
    }
    sh_bic[th] = bic;
    // Scan towards lower indices: after the loop sh_bic[th] holds the
    // reduction of entries th..WG_SIZE-1.
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (th + (1u << i) < WG_SIZE) {
            Bic other = sh_bic[th + (1u << i)];
            bic = bic_combine(bic, other);
        }
        barrier();
        sh_bic[th] = bic;
    }
    barrier();
    // Unmatched pushes over all loaded preceding partitions.
    uint stack_size = sh_bic[0].b;

    // TODO: do bbox scan here (to unlock greater stack depth)

    // binary search in stack
    // Thread th materializes the stack slot that lies sp entries below
    // the top, so the top of the stack lands in the last thread.
    uint sp = PARTITION_SIZE - 1 - th;
    uint ix = 0;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        uint probe = ix + (uint(PARTITION_SIZE / 2) >> i);
        if (sp < sh_bic[probe].b) {
            ix = probe;
        }
    }
    // ix is largest value such that sp < sh_bic[ix].b (if any)
    uint b = sh_bic[ix].b;
    // Very large box: acts as the identity for bbox_intersect.
    vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9);
    if (sp < b) {
        // maybe store the index here for future use?
        ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);
        sh_stack[th] = el.parent_ix;
        bbox = el.bbox;
        // other element values here?
    }

    // forward scan of bbox values of prefix stack
    // (inclusive: slot th ends up intersected with all slots below it)
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        sh_stack_bbox[th] = bbox;
        barrier();
        if (th >= (1u << i)) {
            bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox);
        }
        barrier();
    }
    sh_stack_bbox[th] = bbox;

    // Read input and compute bicyclic semigroup binary tree
    uint inp = load_path_ix(gl_GlobalInvocationID.x);
    // A clear sign bit marks a push (BeginClip) carrying a path index;
    // a set sign bit marks a pop (EndClip).
    bool is_push = int(inp) >= 0;
    // push -> Bic(0, 1), pop -> Bic(1, 0)
    bic = Bic(1 - uint(is_push), uint(is_push));
    // sh_bic is reused here; level 0 of the tree is the per-element bics.
    sh_bic[th] = bic;
    if (is_push) {
        bbox = load_path_bbox(inp);
    } else {
        bbox = vec4(-1e9, -1e9, 1e9, 1e9);
    }
    uint inbase = 0;
    // Each level is half the size of the previous one and is stored
    // directly after it in sh_bic.
    for (uint i = 0; i < LG_WG_SIZE - 1; i++) {
        uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));
        barrier();
        if (th < (1u << (LG_WG_SIZE - 1 - i))) {
            sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);
        }
        inbase = outbase;
    }
    barrier();
    // Search for predecessor node
    bic = Bic(0, 0);
    uint link = search_link(bic);
    // we use N_SEQ > 1 convention here:
    // link >= 0 is index within partition
    // link < 0 is reference to stack
    // (link is a uint; the sign tests below reinterpret it as an int)

    // We want grandparent bbox for pop nodes, so follow those links.
    sh_link[th] = link;
    barrier();
    uint grandparent;
    if (int(link) >= 0) {
        grandparent = sh_link[link];
    } else {
        // Already in the prefix stack: step one entry further down.
        grandparent = link - 1;
    }

    // Resolve parent
    uint parent;
    if (int(link) >= 0) {
        // Parent is in this partition: convert to a global element index.
        parent = gl_WorkGroupID.x * PARTITION_SIZE + link;
    } else if (int(link + stack_size) >= 0) {
        // Parent is in the materialized prefix stack; link is negative,
        // so this indexes down from the top slot.
        parent = sh_stack[PARTITION_SIZE + link];
    } else {
        // No parent: the reference goes below the bottom of the stack.
        parent = ~0u;
    }

    // bbox scan along parent links
    // Each round intersects with the linked element's bbox and then
    // adopts that element's link, so the distance covered grows every
    // round until the link leaves the partition.
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        // sh_link was already stored for first iteration
        if (i != 0) {
            sh_link[th] = link;
        }
        sh_bbox[th] = bbox;
        barrier();
        if (int(link) >= 0) {
            bbox = bbox_intersect(sh_bbox[link], bbox);
            link = sh_link[link];
        }
        barrier();
    }
    // Finish with the scanned bbox of the prefix stack entry, if the
    // remaining link refers to one.
    if (int(link + stack_size) >= 0) {
        bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);
    }
    // At this point, bbox is the reduction of bounding boxes along the tree.
    sh_bbox[th] = bbox;
    barrier();

    uint path_ix = inp;
    if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) {
        // Is this load expensive? If so, it's loaded earlier for in-partition
        // and is in the ClipEl for cross-partition.
        // If not, can probably get rid of it in the stack intermediate buf.
        path_ix = load_path_ix(parent);
        // ~inp clears the sign bit pattern of the pop word and is used as
        // the index of the drawmonoid entry (4 words per entry).
        uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp;
        // Fix up drawmonoid so path_ix at EndClip matches BeginClip
        memory[drawmonoid_out_base] = path_ix;

        // A pop takes the bbox of its grandparent, from the partition,
        // from the prefix stack, or the very large default if none.
        if (int(grandparent) >= 0) {
            bbox = sh_bbox[grandparent];
        } else if (int(grandparent + stack_size) >= 0) {
            bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];
        } else {
            bbox = vec4(-1e9, -1e9, 1e9, 1e9);
        }
    }
    // NOTE(review): this store is not guarded by conf.n_clip, so padding
    // invocations of a partial final partition also write an entry.
    // Presumably the clip bbox allocation is rounded up to a whole
    // partition -- confirm against the host-side allocation.
    store_clip_bbox(gl_GlobalInvocationID.x, bbox);
}
New clip implementation This PR reworks the clip implementation. The highlight is that clip bounding box accounting is now done on GPU rather than CPU. The clip mask is also rasterized on EndClip rather than BeginClip, which decreases memory traffic needed for the clip stack. This is a pretty good working state, but not all cleanup has been applied. An important next step is to remove the CPU clip accounting (it is computed and encoded, but that result is not used). Another step is to remove the Annotated structure entirely. Fixes #88. Also relevant to #119 2022-02-18 11:25:41 +11:00			`// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense`

			`// The second dispatch of clip stack processing.`

			`#version 450`
			`#extension GL_GOOGLE_include_directive : enable`

			`#include "mem.h"`
			`#include "setup.h"`

			`#define LG_WG_SIZE (7 + LG_WG_FACTOR)`
			`#define WG_SIZE (1 << LG_WG_SIZE)`
			`#define PARTITION_SIZE WG_SIZE`

			`layout(local_size_x = WG_SIZE) in;`

			`layout(binding = 1) readonly buffer ConfigBuf {`
			`Config conf;`
			`};`

			`// Some of this is cut'n'paste duplication with the reduce pass, and`
			`// arguably should be moved to a common .h file.`
			`// The bicyclic monoid`

			`struct ClipEl {`
			`// index of parent node`
			`uint parent_ix;`
			`// bounding box`
			`vec4 bbox;`
			`};`

			`struct Bic {`
			`uint a;`
			`uint b;`
			`};`

			`Bic bic_combine(Bic x, Bic y) {`
			`uint m = min(x.b, y.a);`
			`return Bic(x.a + y.a - m, x.b + y.b - m);`
			`}`

			`// Load path's bbox from bbox (as written by pathseg).`
			`vec4 load_path_bbox(uint path_ix) {`
Variable size encoding of draw objects This patch switches to a variable size encoding of draw objects. In addition to the CPU-side scene encoding, it changes the representation of intermediate per draw object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of "arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying). Prefix sums, computed in `DrawMonoid` track the offset of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style). This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken. Closes #152 2022-03-03 09:44:03 +11:00			`uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;`
New clip implementation This PR reworks the clip implementation. The highlight is that clip bounding box accounting is now done on GPU rather than CPU. The clip mask is also rasterized on EndClip rather than BeginClip, which decreases memory traffic needed for the clip stack. This is a pretty good working state, but not all cleanup has been applied. An important next step is to remove the CPU clip accounting (it is computed and encoded, but that result is not used). Another step is to remove the Annotated structure entirely. Fixes #88. Also relevant to #119 2022-02-18 11:25:41 +11:00			`float bbox_l = float(memory[base]) - 32768.0;`
			`float bbox_t = float(memory[base + 1]) - 32768.0;`
			`float bbox_r = float(memory[base + 2]) - 32768.0;`
			`float bbox_b = float(memory[base + 3]) - 32768.0;`
			`vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);`
			`return bbox;`
			`}`

			`vec4 bbox_intersect(vec4 a, vec4 b) {`
			`return vec4(max(a.xy, b.xy), min(a.zw, b.zw));`
			`}`

			`shared Bic sh_bic[WG_SIZE * 2 - 2];`
			`shared uint sh_stack[PARTITION_SIZE];`
			`shared vec4 sh_stack_bbox[PARTITION_SIZE];`
			`shared uint sh_link[PARTITION_SIZE];`
			`shared vec4 sh_bbox[PARTITION_SIZE];`

			`// This is adapted directly from the stack monoid impl.`
			`// Return value is reference within partition if >= 0,`
			`// otherwise reference to stack.`
			`uint search_link(inout Bic bic) {`
			`uint ix = gl_LocalInvocationID.x;`
			`uint j = 0;`
			`while (j < LG_WG_SIZE) {`
			`uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));`
			`if (((ix >> j) & 1) != 0) {`
			`Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);`
			`if (test.b > 0) {`
			`break;`
			`}`
			`bic = test;`
			`ix -= 1u << j;`
			`}`
			`j++;`
			`}`
			`if (ix > 0) {`
			`while (j > 0) {`
			`j--;`
			`uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j));`
			`Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic);`
			`if (test.b == 0) {`
			`bic = test;`
			`ix -= 1u << j;`
			`}`
			`}`
			`}`
			`// ix is the smallest value such that reduce(ix..th).b == 0`
			`if (ix > 0) {`
			`return ix - 1;`
			`} else {`
			`return ~0u - bic.a;`
			`}`
			`}`

			`Bic load_bic(uint ix) {`
			`uint base = (conf.clip_bic_alloc.offset >> 2) + 2 * ix;`
			`return Bic(memory[base], memory[base + 1]);`
			`}`

			`ClipEl load_clip_el(uint ix) {`
			`uint base = (conf.clip_stack_alloc.offset >> 2) + 5 * ix;`
			`uint parent_ix = memory[base];`
			`float x0 = uintBitsToFloat(memory[base + 1]);`
			`float y0 = uintBitsToFloat(memory[base + 2]);`
			`float x1 = uintBitsToFloat(memory[base + 3]);`
			`float y1 = uintBitsToFloat(memory[base + 4]);`
			`vec4 bbox = vec4(x0, y0, x1, y1);`
			`return ClipEl(parent_ix, bbox);`
			`}`

			`uint load_path_ix(uint ix) {`
			`// This is one approach to a partial final block. Another would be`
			`// to do a memset to the padding in the command queue.`
			`if (ix < conf.n_clip) {`
			`return memory[(conf.clip_alloc.offset >> 2) + ix];`
			`} else {`
			`// EndClip tags don't implicate further loads.`
			`return 0x80000000;`
			`}`
			`}`

			`void store_clip_bbox(uint ix, vec4 bbox) {`
			`uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix;`
			`memory[base] = floatBitsToUint(bbox.x);`
			`memory[base + 1] = floatBitsToUint(bbox.y);`
			`memory[base + 2] = floatBitsToUint(bbox.z);`
			`memory[base + 3] = floatBitsToUint(bbox.w);`
			`}`

			`void main() {`
			`// materialize stack up to the start of this partition. This`
			`// is based on the pure stack monoid, but with two additions.`

			`// First, (this only matters if the stack goes deeper than the`
			`// partition size, which might be unlikely in practice), the`
			`// topmost stack element from each partition is picked, then an`
			`// exclusive scan of those. Also note that if this is skipped,`
			`// a scan is not needed in the reduce stage.`

			`// Second, after the stream compaction, do a scan of the retrieved`
			`// bbox values.`
			`uint th = gl_LocalInvocationID.x;`
			`Bic bic = Bic(0, 0);`
			`if (th < gl_WorkGroupID.x) {`
			`bic = load_bic(th);`
			`}`
			`sh_bic[th] = bic;`
			`for (uint i = 0; i < LG_WG_SIZE; i++) {`
			`barrier();`
			`if (th + (1u << i) < WG_SIZE) {`
			`Bic other = sh_bic[th + (1u << i)];`
			`bic = bic_combine(bic, other);`
			`}`
			`barrier();`
			`sh_bic[th] = bic;`
			`}`
			`barrier();`
			`uint stack_size = sh_bic[0].b;`

			`// TODO: do bbox scan here (to unlock greater stack depth)`

			`// binary search in stack`
			`uint sp = PARTITION_SIZE - 1 - th;`
			`uint ix = 0;`
			`for (uint i = 0; i < LG_WG_SIZE; i++) {`
			`uint probe = ix + (uint(PARTITION_SIZE / 2) >> i);`
			`if (sp < sh_bic[probe].b) {`
			`ix = probe;`
			`}`
			`}`
			`// ix is largest value such that sp < sh_bic[ix].b (if any)`
			`uint b = sh_bic[ix].b;`
			`vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9);`
			`if (sp < b) {`
			`// maybe store the index here for future use?`
			`ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1);`
			`sh_stack[th] = el.parent_ix;`
			`bbox = el.bbox;`
			`// other element values here?`
			`}`

			`// forward scan of bbox values of prefix stack`
			`for (uint i = 0; i < LG_WG_SIZE; i++) {`
			`sh_stack_bbox[th] = bbox;`
			`barrier();`
			`if (th >= (1u << i)) {`
			`bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox);`
			`}`
			`barrier();`
			`}`
			`sh_stack_bbox[th] = bbox;`

			`// Read input and compute bicyclic semigroup binary tree`
			`uint inp = load_path_ix(gl_GlobalInvocationID.x);`
			`bool is_push = int(inp) >= 0;`
			`bic = Bic(1 - uint(is_push), uint(is_push));`
			`sh_bic[th] = bic;`
			`if (is_push) {`
			`bbox = load_path_bbox(inp);`
			`} else {`
			`bbox = vec4(-1e9, -1e9, 1e9, 1e9);`
			`}`
			`uint inbase = 0;`
			`for (uint i = 0; i < LG_WG_SIZE - 1; i++) {`
			`uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i));`
			`barrier();`
			`if (th < (1u << (LG_WG_SIZE - 1 - i))) {`
			`sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]);`
			`}`
			`inbase = outbase;`
			`}`
			`barrier();`
			`// Search for predecessor node`
			`bic = Bic(0, 0);`
			`uint link = search_link(bic);`
			`// we use N_SEQ > 1 convention here:`
			`// link >= 0 is index within partition`
			`// link < 0 is reference to stack`

			`// We want grandparent bbox for pop nodes, so follow those links.`
			`sh_link[th] = link;`
			`barrier();`
			`uint grandparent;`
			`if (int(link) >= 0) {`
			`grandparent = sh_link[link];`
			`} else {`
			`grandparent = link - 1;`
			`}`

			`// Resolve parent`
			`uint parent;`
			`if (int(link) >= 0) {`
			`parent = gl_WorkGroupID.x * PARTITION_SIZE + link;`
			`} else if (int(link + stack_size) >= 0) {`
			`parent = sh_stack[PARTITION_SIZE + link];`
			`} else {`
			`parent = ~0u;`
			`}`

			`// bbox scan along parent links`
			`for (uint i = 0; i < LG_WG_SIZE; i++) {`
			`// sh_link was already stored for first iteration`
			`if (i != 0) {`
			`sh_link[th] = link;`
			`}`
			`sh_bbox[th] = bbox;`
			`barrier();`
			`if (int(link) >= 0) {`
			`bbox = bbox_intersect(sh_bbox[link], bbox);`
			`link = sh_link[link];`
			`}`
			`barrier();`
			`}`
			`if (int(link + stack_size) >= 0) {`
			`bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox);`
			`}`
			`// At this point, bbox is the reduction of bounding boxes along the tree.`
			`sh_bbox[th] = bbox;`
			`barrier();`

			`uint path_ix = inp;`
			`if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) {`
			`// Is this load expensive? If so, it's loaded earlier for in-partition`
			`// and is in the ClipEl for cross-partition.`
			`// If not, can probably get rid of it in the stack intermediate buf.`
			`path_ix = load_path_ix(parent);`
Variable size encoding of draw objects This patch switches to a variable size encoding of draw objects. In addition to the CPU-side scene encoding, it changes the representation of intermediate per draw object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of "arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying). Prefix sums, computed in `DrawMonoid` track the offset of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style). This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken. Closes #152 2022-03-03 09:44:03 +11:00			`uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 4 * ~inp;`
New clip implementation This PR reworks the clip implementation. The highlight is that clip bounding box accounting is now done on GPU rather than CPU. The clip mask is also rasterized on EndClip rather than BeginClip, which decreases memory traffic needed for the clip stack. This is a pretty good working state, but not all cleanup has been applied. An important next step is to remove the CPU clip accounting (it is computed and encoded, but that result is not used). Another step is to remove the Annotated structure entirely. Fixes #88. Also relevant to #119 2022-02-18 11:25:41 +11:00			`// Fix up drawmonoid so path_ix at EndClip matches BeginClip`
			`memory[drawmonoid_out_base] = path_ix;`

			`if (int(grandparent) >= 0) {`
			`bbox = sh_bbox[grandparent];`
			`} else if (int(grandparent + stack_size) >= 0) {`
			`bbox = sh_stack_bbox[PARTITION_SIZE + grandparent];`
			`} else {`
			`bbox = vec4(-1e9, -1e9, 1e9, 1e9);`
			`}`
			`}`
			`store_clip_bbox(gl_GlobalInvocationID.x, bbox);`
			`}`