vello/piet-gpu/shader/path_coarse.comp

// Coarse rasterization of path segments.

// Allocates tile segments for each path segment and links them into the per-tile segment lists.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "setup.h"

// log2 of the workgroup size. main() also uses it as the number of doubling
// steps in its prefix sum and the number of probes in its binary search.
#define LG_COARSE_WG 5
// Invocations per workgroup (1 << 5 = 32). main() handles one path segment
// per invocation, indexed by gl_GlobalInvocationID.x.
#define COARSE_WG (1 << LG_COARSE_WG)

// One-dimensional workgroup of COARSE_WG invocations.
layout(local_size_x = COARSE_WG, local_size_y = 1) in;

// Path segment storage, exposed as raw 32-bit words.
// NOTE(review): presumably decoded by the PathSeg_* accessors from pathseg.h,
// which is included after this declaration - confirm there.
layout(set = 0, binding = 0) buffer PathSegBuf {
    uint[] pathseg;
};

// Element counts and the shared allocation cursor.
layout(set = 0, binding = 1) buffer AllocBuf {
    // Not referenced by this shader.
    uint n_paths;
    // Number of valid path segments; invocations with
    // gl_GlobalInvocationID.x >= n_pathseg keep the PathSeg_Nop tag.
    uint n_pathseg;
    // Allocation cursor. Each workgroup advances it once with atomicAdd by
    // (total tile slots * TileSeg_size) and writes its TileSeg records
    // starting at the returned offset.
    uint alloc;
};

// Tile storage, exposed as raw 32-bit words. main() atomically exchanges the
// word at a tile's offset with the offset of a newly written TileSeg, so that
// word acts as the head of the tile's segment list.
// NOTE(review): presumably also the target of TileSeg_write from tile.h -
// confirm there.
layout(set = 0, binding = 2) buffer TileBuf {
    uint[] tile;
};

#include "pathseg.h"
#include "tile.h"

// Scale factors from pixel coordinates to tile coordinates: multiplying a
// pixel x (resp. y) by SX (resp. SY) gives a fractional tile column (row).
// TILE_WIDTH_PX / TILE_HEIGHT_PX are defined outside this file.
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))

// Workgroup-shared scratch, one slot per invocation. During setup each
// invocation writes its own slot (index th_ix); the output loop in main()
// then reads the slot of whichever segment (index el_ix) a tile slot maps to.

// Tile-slot count per segment, turned in place into an inclusive prefix sum.
shared uint sh_tile_count[COARSE_WG];
// Width in tiles of the segment's clamped bounding box (x1 - x0).
shared uint sh_width[COARSE_WG];
// Tile slots reserved per tile row: min(width, 1 + ceil(2 * c)).
shared uint sh_draw_width[COARSE_WG];
// Segment endpoints; copied into every TileSeg written for the segment.
shared vec2 sh_p0[COARSE_WG];
shared vec2 sh_p1[COARSE_WG];
// Left tile column and top tile row of the clamped bounding box.
shared int sh_x0[COARSE_WG];
shared int sh_y0[COARSE_WG];
// Coefficients of the per-row coverage formula: tile row y is covered over
// the tile columns between (a + b * y - c) and (a + b * y + c).
shared float sh_a[COARSE_WG];
shared float sh_b[COARSE_WG];
shared float sh_c[COARSE_WG];
// Offset chosen so that base + (y * stride + x) * Tile_size addresses tile
// (x, y) of the segment's path (a byte offset, judging by the `>> 2` applied
// before indexing `tile`).
shared uint sh_base[COARSE_WG];
// Tiles per row in the path's tile grid (bbox.z - bbox.x).
shared uint sh_stride[COARSE_WG];
// Offset returned by the workgroup's single atomicAdd on `alloc`; start of
// the range holding this workgroup's TileSeg records.
shared uint sh_alloc_start;

// Coarse rasterization entry point: one path segment per invocation,
// COARSE_WG segments per workgroup.
//
// 1. Setup (per segment): compute the segment's tile bounding box clamped to
//    its path's bbox, the per-row coverage coefficients a/b/c, and the number
//    of tile slots the segment needs. Everything the output phase needs is
//    published to shared memory.
// 2. Allocation (per workgroup): inclusive prefix sum over the slot counts,
//    then a single atomicAdd on `alloc` reserving room for all TileSegs.
// 3. Output (load balanced): the workgroup's slots are distributed evenly
//    over its invocations; each slot is mapped back to its segment by binary
//    search, and if the slot falls inside the row's coverage span a TileSeg
//    is written and pushed onto the tile's list with atomicExchange.
//
// Invocations that do not hold a line segment (including the PathSeg_Nop
// padding past n_pathseg) contribute zero slots. They still execute every
// barrier() so the workgroup stays in uniform control flow.
void main() {
    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
    PathSegRef ref = PathSegRef(element_ix * PathSeg_size);

    uint tag = PathSeg_Nop;
    if (element_ix < n_pathseg) {
        tag = PathSeg_tag(ref);
    }

    // Stays zero unless this invocation holds a line. Segments with zero
    // slots are never selected by the binary search below, so their shared
    // slots (other than sh_tile_count) are never read and may stay unwritten.
    uint tile_count = 0;
    if (tag == PathSeg_FillLine || tag == PathSeg_StrokeLine) {
        // Both line kinds are decoded through the stroke-line reader.
        PathStrokeLine line = PathSeg_StrokeLine_read(ref);
        sh_p0[th_ix] = line.p0;
        sh_p1[th_ix] = line.p1;

        // Bounding box of element in pixel coordinates.
        float xmin = min(line.p0.x, line.p1.x) - line.stroke.x;
        float xmax = max(line.p0.x, line.p1.x) + line.stroke.x;
        float ymin = min(line.p0.y, line.p1.y) - line.stroke.y;
        float ymax = max(line.p0.y, line.p1.y) + line.stroke.y;

        // Set up for per-scanline coverage formula, below.
        float dx = line.p1.x - line.p0.x;
        float dy = line.p1.y - line.p0.y;
        // (Near-)horizontal lines get a large finite slope instead of a
        // division by zero.
        float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
        // Half-width, in tiles, of the span the segment can touch in one row.
        float c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;
        float b = invslope; // Note: assumes square tiles, otherwise scale.
        // Tile x of the line at the vertical centre of tile row 0.
        float a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
        sh_a[th_ix] = a;
        sh_b[th_ix] = b;
        sh_c[th_ix] = c;

        // Bounding box in tile coordinates, clamped to the path's tile bbox.
        int x0 = int(floor((xmin) * SX));
        int x1 = int(ceil((xmax) * SX));
        int y0 = int(floor((ymin) * SY));
        int y1 = int(ceil((ymax) * SY));

        uint path_ix = line.path_ix;
        Path path = Path_read(PathRef(path_ix * Path_size));
        ivec4 bbox = ivec4(path.bbox);
        x0 = clamp(x0, bbox.x, bbox.z);
        y0 = clamp(y0, bbox.y, bbox.w);
        x1 = clamp(x1, bbox.x, bbox.z);
        y1 = clamp(y1, bbox.y, bbox.w);
        sh_x0[th_ix] = x0;
        // TODO: can get rid of this (fold into base), with care (also need to update `a`)
        sh_y0[th_ix] = y0;
        int stride = bbox.z - bbox.x;
        sh_stride[th_ix] = stride;
        // Pre-subtract the bbox origin so tiles can be addressed with
        // absolute tile coordinates.
        sh_base[th_ix] = path.tiles.offset - (bbox.y * stride + bbox.x) * Tile_size;
        uint width = uint(x1 - x0);
        sh_width[th_ix] = width;
        // Slots reserved per row. The min is taken in float before the
        // conversion: for near-horizontal lines 2 * c can exceed the uint
        // range, and converting such a value is undefined.
        uint draw_width = uint(min(float(width), 1.0 + ceil(2.0 * c)));
        sh_draw_width[th_ix] = draw_width;
        tile_count = draw_width * uint(y1 - y0);
    }

    // Inclusive prefix sum of tile_count across the workgroup. The first
    // barrier makes the previous round's writes visible; the second keeps
    // any invocation from overwriting a slot that another is still reading.
    sh_tile_count[th_ix] = tile_count;
    for (uint i = 0; i < LG_COARSE_WG; i++) {
        barrier();
        if (th_ix >= (1 << i)) {
            tile_count += sh_tile_count[th_ix - (1 << i)];
        }
        barrier();
        sh_tile_count[th_ix] = tile_count;
    }
    // The last invocation holds the workgroup total; it reserves the space
    // for every TileSeg of the workgroup with a single atomic.
    if (th_ix == COARSE_WG - 1) {
        sh_alloc_start = atomicAdd(alloc, tile_count * TileSeg_size);
    }
    barrier();
    uint alloc_start = sh_alloc_start;
    uint total_tile_count = sh_tile_count[COARSE_WG - 1];

    // Distribute the slots evenly over the invocations, independent of which
    // segment they belong to.
    for (uint ix = th_ix; ix < total_tile_count; ix += COARSE_WG) {
        // Binary search to find element: the segment whose prefix-sum range
        // contains ix.
        uint el_ix = 0;
        for (uint i = 0; i < LG_COARSE_WG; i++) {
            uint probe = el_ix + ((COARSE_WG / 2) >> i);
            if (ix >= sh_tile_count[probe - 1]) {
                el_ix = probe;
            }
        }
        // Index of this slot within its segment.
        uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
        uint draw_width = sh_draw_width[el_ix];
        int x0 = sh_x0[el_ix];
        int x1 = x0 + int(sh_width[el_ix]);
        // Slots are laid out row-major, draw_width slots per tile row.
        int dx = int(seq_ix % draw_width);
        uint y = sh_y0[el_ix] + seq_ix / draw_width;
        // Coverage span of the segment in this tile row, clamped to the
        // bounding box. The clamp is done in float before converting, since
        // t +/- c can lie outside the int range for near-horizontal lines
        // and converting such a value is undefined.
        float t = sh_a[el_ix] + sh_b[el_ix] * float(y);
        float c = sh_c[el_ix];
        float fx0 = float(x0);
        float fx1 = float(x1);
        int xx0 = int(clamp(floor(t - c), fx0, fx1));
        int xx1 = int(clamp(ceil(t + c), fx0, fx1));
        int x = xx0 + dx;
        // Slots beyond the row's actual span stay unused.
        if (x < xx1) {
            uint tile_offset = alloc_start + ix * TileSeg_size;
            uint tile_el = (sh_base[el_ix] + uint(y * sh_stride[el_ix] + x) * Tile_size) >> 2;
            // Push onto the tile's list: swap in the new head, then link the
            // new record to the previous head.
            uint old = atomicExchange(tile[tile_el], tile_offset);
            TileSeg tile_seg;
            tile_seg.start = sh_p0[el_ix];
            tile_seg.end = sh_p1[el_ix];
            tile_seg.next.offset = old;
            TileSeg_write(TileSegRef(tile_offset), tile_seg);
        }
    }
}
Experiment with new sorting scheme Path segments are unsorted, but other elements are using the same sort-middle approach as before. This is a checkpoint. At this point, there are unoptimized versions of tile init and coarse path raster, but it isn't wired up into a working pipeline. Also observing about a 3x performance regression in element processing, which needs to be investigated. 2020-06-03 10:10:20 +10:00			`// Coarse rasterization of path segments.`

			`// Allocation and initialization of tiles for paths.`

			`#version 450`
			`#extension GL_GOOGLE_include_directive : enable`

			`#include "setup.h"`

More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. 2020-06-05 08:58:38 +10:00			`#define LG_COARSE_WG 5`
			`#define COARSE_WG (1 << LG_COARSE_WG)`
Experiment with new sorting scheme Path segments are unsorted, but other elements are using the same sort-middle approach as before. This is a checkpoint. At this point, there are unoptimized versions of tile init and coarse path raster, but it isn't wired up into a working pipeline. Also observing about a 3x performance regression in element processing, which needs to be investigated. 2020-06-03 10:10:20 +10:00
More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. 2020-06-05 08:58:38 +10:00			`layout(local_size_x = COARSE_WG, local_size_y = 1) in;`
Experiment with new sorting scheme Path segments are unsorted, but other elements are using the same sort-middle approach as before. This is a checkpoint. At this point, there are unoptimized versions of tile init and coarse path raster, but it isn't wired up into a working pipeline. Also observing about a 3x performance regression in element processing, which needs to be investigated. 2020-06-03 10:10:20 +10:00
			`layout(set = 0, binding = 0) buffer PathSegBuf {`
			`uint[] pathseg;`
			`};`

			`layout(set = 0, binding = 1) buffer AllocBuf {`
			`uint n_paths;`
			`uint n_pathseg;`
			`uint alloc;`
			`};`

			`layout(set = 0, binding = 2) buffer TileBuf {`
			`uint[] tile;`
			`};`

			`#include "pathseg.h"`
			`#include "tile.h"`

			`// scale factors useful for converting coordinates to tiles`
			`#define SX (1.0 / float(TILE_WIDTH_PX))`
			`#define SY (1.0 / float(TILE_HEIGHT_PX))`

More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. 2020-06-05 08:58:38 +10:00			`shared uint sh_tile_count[COARSE_WG];`
			`shared uint sh_width[COARSE_WG];`
			`shared uint sh_draw_width[COARSE_WG];`
			`shared vec2 sh_p0[COARSE_WG];`
			`shared vec2 sh_p1[COARSE_WG];`
			`shared int sh_x0[COARSE_WG];`
			`shared int sh_y0[COARSE_WG];`
			`shared float sh_a[COARSE_WG];`
			`shared float sh_b[COARSE_WG];`
			`shared float sh_c[COARSE_WG];`
			`shared uint sh_base[COARSE_WG];`
			`shared uint sh_stride[COARSE_WG];`
			`shared uint sh_alloc_start;`

Experiment with new sorting scheme Path segments are unsorted, but other elements are using the same sort-middle approach as before. This is a checkpoint. At this point, there are unoptimized versions of tile init and coarse path raster, but it isn't wired up into a working pipeline. Also observing about a 3x performance regression in element processing, which needs to be investigated. 2020-06-03 10:10:20 +10:00			`void main() {`
More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. 2020-06-05 08:58:38 +10:00			`uint th_ix = gl_LocalInvocationID.x;`
Experiment with new sorting scheme Path segments are unsorted, but other elements are using the same sort-middle approach as before. This is a checkpoint. At this point, there are unoptimized versions of tile init and coarse path raster, but it isn't wired up into a working pipeline. Also observing about a 3x performance regression in element processing, which needs to be investigated. 2020-06-03 10:10:20 +10:00			`uint element_ix = gl_GlobalInvocationID.x;`
			`PathSegRef ref = PathSegRef(element_ix * PathSeg_size);`

			`uint tag = PathSeg_Nop;`
			`if (element_ix < n_pathseg) {`
			`tag = PathSeg_tag(ref);`
			`}`
			`// Setup for coverage algorithm.`
			`float a, b, c;`
			`// Bounding box of element in pixel coordinates.`
			`float xmin, xmax, ymin, ymax;`
			`PathStrokeLine line;`
			`switch (tag) {`
			`case PathSeg_FillLine:`
			`case PathSeg_StrokeLine:`
			`line = PathSeg_StrokeLine_read(ref);`
More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. 2020-06-05 08:58:38 +10:00			`sh_p0[th_ix] = line.p0;`
			`sh_p1[th_ix] = line.p1;`
Experiment with new sorting scheme Path segments are unsorted, but other elements are using the same sort-middle approach as before. This is a checkpoint. At this point, there are unoptimized versions of tile init and coarse path raster, but it isn't wired up into a working pipeline. Also observing about a 3x performance regression in element processing, which needs to be investigated. 2020-06-03 10:10:20 +10:00			`xmin = min(line.p0.x, line.p1.x) - line.stroke.x;`
			`xmax = max(line.p0.x, line.p1.x) + line.stroke.x;`
			`ymin = min(line.p0.y, line.p1.y) - line.stroke.y;`
			`ymax = max(line.p0.y, line.p1.y) + line.stroke.y;`
			`float dx = line.p1.x - line.p0.x;`
			`float dy = line.p1.y - line.p0.y;`
			`// Set up for per-scanline coverage formula, below.`
			`float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;`
			`c = (line.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + line.stroke.y)) * SX;`
			`b = invslope; // Note: assumes square tiles, otherwise scale.`
			`a = (line.p0.x - (line.p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;`
More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. 2020-06-05 08:58:38 +10:00			`sh_a[th_ix] = a;`
			`sh_b[th_ix] = b;`
			`sh_c[th_ix] = c;`
Experiment with new sorting scheme Path segments are unsorted, but other elements are using the same sort-middle approach as before. This is a checkpoint. At this point, there are unoptimized versions of tile init and coarse path raster, but it isn't wired up into a working pipeline. Also observing about a 3x performance regression in element processing, which needs to be investigated. 2020-06-03 10:10:20 +10:00			`break;`
			`}`
			`int x0 = int(floor((xmin) * SX));`
			`int x1 = int(ceil((xmax) * SX));`
			`int y0 = int(floor((ymin) * SY));`
			`int y1 = int(ceil((ymax) * SY));`

			`uint path_ix = line.path_ix;`
			`Path path = Path_read(PathRef(path_ix * Path_size));`
			`ivec4 bbox = ivec4(path.bbox);`
			`x0 = clamp(x0, bbox.x, bbox.z);`
			`y0 = clamp(y0, bbox.y, bbox.w);`
			`x1 = clamp(x1, bbox.x, bbox.z);`
			`y1 = clamp(y1, bbox.y, bbox.w);`
More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. 2020-06-05 08:58:38 +10:00			`sh_x0[th_ix] = x0;`
			// TODO: can get rid of this (fold into base), with care (also need to update `a`)
			`sh_y0[th_ix] = y0;`
Experiment with new sorting scheme Path segments are unsorted, but other elements are using the same sort-middle approach as before. This is a checkpoint. At this point, there are unoptimized versions of tile init and coarse path raster, but it isn't wired up into a working pipeline. Also observing about a 3x performance regression in element processing, which needs to be investigated. 2020-06-03 10:10:20 +10:00			`int stride = bbox.z - bbox.x;`
More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. 2020-06-05 08:58:38 +10:00			`sh_stride[th_ix] = stride;`
			`sh_base[th_ix] = path.tiles.offset - (bbox.y * stride + bbox.x) * Tile_size;`
			`uint width = uint(x1 - x0);`
			`sh_width[th_ix] = width;`
			`uint draw_width = min(width, uint(1.0 + ceil(2.0 * c)));`
			`sh_draw_width[th_ix] = draw_width;`
			`uint tile_count = draw_width * uint(y1 - y0);`

			`sh_tile_count[th_ix] = tile_count;`
			`for (uint i = 0; i < LG_COARSE_WG; i++) {`
			`barrier();`
			`if (th_ix >= (1 << i)) {`
			`tile_count += sh_tile_count[th_ix - (1 << i)];`
			`}`
			`barrier();`
			`sh_tile_count[th_ix] = tile_count;`
			`}`
			`if (th_ix == COARSE_WG - 1) {`
			`sh_alloc_start = atomicAdd(alloc, tile_count * TileSeg_size);`
			`}`
			`barrier();`
			`uint alloc_start = sh_alloc_start;`
			`uint total_tile_count = sh_tile_count[COARSE_WG - 1];`

			`for (uint ix = th_ix; ix < total_tile_count; ix += COARSE_WG) {`
			`// Binary search to find element`
			`uint el_ix = 0;`
			`for (uint i = 0; i < LG_COARSE_WG; i++) {`
			`uint probe = el_ix + ((COARSE_WG / 2) >> i);`
			`if (ix >= sh_tile_count[probe - 1]) {`
			`el_ix = probe;`
			`}`
			`}`
			`uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);`
			`uint draw_width = sh_draw_width[el_ix];`
			`int x0 = sh_x0[el_ix];`
			`int x1 = x0 + int(sh_width[el_ix]);`
			`int dx = int(seq_ix % draw_width);`
			`uint y = sh_y0[el_ix] + seq_ix / draw_width;`
			`float t = sh_a[el_ix] + sh_b[el_ix] * float(y);`
			`float c = sh_c[el_ix];`
Experiment with new sorting scheme Path segments are unsorted, but other elements are using the same sort-middle approach as before. This is a checkpoint. At this point, there are unoptimized versions of tile init and coarse path raster, but it isn't wired up into a working pipeline. Also observing about a 3x performance regression in element processing, which needs to be investigated. 2020-06-03 10:10:20 +10:00			`int xx0 = clamp(int(floor(t - c)), x0, x1);`
			`int xx1 = clamp(int(ceil(t + c)), x0, x1);`
More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. 2020-06-05 08:58:38 +10:00			`int x = xx0 + dx;`
			`if (x < xx1) {`
			`uint tile_offset = alloc_start + ix * TileSeg_size;`
			`uint tile_el = (sh_base[el_ix] + uint(y * sh_stride[el_ix] + x) * Tile_size) >> 2;`
Use atomicExchange over atomicCompSwap Significant perf win (approx 2x in the path coarse rasterizer) 2020-06-06 01:24:26 +10:00			`uint old = atomicExchange(tile[tile_el], tile_offset);`
More parallel path coarse raster Use fancier load balancing algorithm for coarse rendering of paths. Seems to work and an improvement in some cases. 2020-06-05 08:58:38 +10:00			`TileSeg tile_seg;`
			`tile_seg.start = sh_p0[el_ix];`
			`tile_seg.end = sh_p1[el_ix];`
Experiment with new sorting scheme Path segments are unsorted, but other elements are using the same sort-middle approach as before. This is a checkpoint. At this point, there are unoptimized versions of tile init and coarse path raster, but it isn't wired up into a working pipeline. Also observing about a 3x performance regression in element processing, which needs to be investigated. 2020-06-03 10:10:20 +10:00			`tile_seg.next.offset = old;`
			`TileSeg_write(TileSegRef(tile_offset), tile_seg);`
			`}`
			`}`
			`}`