vello/piet-gpu/shader/binning.comp
Raph Levien acb3933d94 Variable size encoding of draw objects
This patch switches to a variable size encoding of draw objects.

In addition to the CPU-side scene encoding, it changes the representation of intermediate per draw object state from the `Annotated` struct to a variable "info" encoding. In addition, the bounding boxes are moved to a separate array (for a more "structure of "arrays" approach). Data that's unchanged from the scene encoding is not copied. Rather, downstream stages can access the data from the scene buffer (reducing allocation and copying).

Prefix sums, computed in `DrawMonoid` track the offset of both scene and intermediate data. The tags for the CPU-side encoding have been split into their own stream (again a change from AoS to SoA style).

This is not necessarily the final form. There's some stuff (including at least one piet-gpu-derive type) that can be deleted. In addition, the linewidth field should probably move from the info to path-specific. Also, the 1:1 correspondence between draw object and path has not yet been broken.

Closes #152
2022-03-14 16:32:08 -07:00

195 lines
6.7 KiB
GLSL

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The binning stage of the pipeline.
//
// Each workgroup processes N_TILE paths.
// Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask
// based on the path bounding box to bin the paths.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
Config conf;
};
#include "bins.h"
#include "drawtag.h"
// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
#define INFINITY (1.0 / 0.0)
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared Alloc sh_chunk_alloc[N_TILE];
shared bool sh_alloc_failed;
DrawMonoid load_draw_monoid(uint element_ix) {
uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix;
uint path_ix = memory[base];
uint clip_ix = memory[base + 1];
uint scene_offset = memory[base + 2];
uint info_offset = memory[base + 3];
return DrawMonoid(path_ix, clip_ix, scene_offset, info_offset);
}
// Load bounding box computed by clip processing
vec4 load_clip_bbox(uint clip_ix) {
uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * clip_ix;
float x0 = uintBitsToFloat(memory[base]);
float y0 = uintBitsToFloat(memory[base + 1]);
float x1 = uintBitsToFloat(memory[base + 2]);
float y1 = uintBitsToFloat(memory[base + 3]);
vec4 bbox = vec4(x0, y0, x1, y1);
return bbox;
}
vec4 bbox_intersect(vec4 a, vec4 b) {
return vec4(max(a.xy, b.xy), min(a.zw, b.zw));
}
// Load path's bbox from bbox (as written by pathseg).
vec4 load_path_bbox(uint path_ix) {
uint base = (conf.path_bbox_alloc.offset >> 2) + 6 * path_ix;
float bbox_l = float(memory[base]) - 32768.0;
float bbox_t = float(memory[base + 1]) - 32768.0;
float bbox_r = float(memory[base + 2]) - 32768.0;
float bbox_b = float(memory[base + 3]) - 32768.0;
vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b);
return bbox;
}
void store_draw_bbox(uint draw_ix, vec4 bbox) {
uint base = (conf.draw_bbox_alloc.offset >> 2) + 4 * draw_ix;
memory[base] = floatBitsToUint(bbox.x);
memory[base + 1] = floatBitsToUint(bbox.y);
memory[base + 2] = floatBitsToUint(bbox.z);
memory[base + 3] = floatBitsToUint(bbox.w);
}
void main() {
uint my_partition = gl_WorkGroupID.x;
for (uint i = 0; i < N_SLICE; i++) {
bitmaps[i][gl_LocalInvocationID.x] = 0;
}
if (gl_LocalInvocationID.x == 0) {
sh_alloc_failed = false;
}
barrier();
// Read inputs and determine coverage of bins
uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
if (element_ix < conf.n_elements) {
DrawMonoid draw_monoid = load_draw_monoid(element_ix);
uint path_ix = draw_monoid.path_ix;
vec4 clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9);
uint clip_ix = draw_monoid.clip_ix;
if (clip_ix > 0) {
clip_bbox = load_clip_bbox(clip_ix - 1);
}
// For clip elements, clip_bbox is the bbox of the clip path, intersected
// with enclosing clips.
// For other elements, it is the bbox of the enclosing clips.
vec4 path_bbox = load_path_bbox(path_ix);
vec4 bbox = bbox_intersect(path_bbox, clip_bbox);
// Avoid negative-size bbox (is this necessary)?
bbox.zw = max(bbox.xy, bbox.zw);
// Store clip-intersected bbox for tile_alloc.
store_draw_bbox(element_ix, bbox);
x0 = int(floor(bbox.x * SX));
y0 = int(floor(bbox.y * SY));
x1 = int(ceil(bbox.z * SX));
y1 = int(ceil(bbox.w * SY));
}
// At this point, we run an iterator over the coverage area,
// trying to keep divergence low.
// Right now, it's just a bbox, but we'll get finer with
// segments.
uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1) / N_TILE_Y;
x0 = clamp(x0, 0, int(width_in_bins));
x1 = clamp(x1, x0, int(width_in_bins));
y0 = clamp(y0, 0, int(height_in_bins));
y1 = clamp(y1, y0, int(height_in_bins));
if (x0 == x1)
y1 = y0;
int x = x0, y = y0;
uint my_slice = gl_LocalInvocationID.x / 32;
uint my_mask = 1u << (gl_LocalInvocationID.x & 31);
while (y < y1) {
atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
x++;
if (x == x1) {
x = x0;
y++;
}
}
barrier();
// Allocate output segments.
uint element_count = 0;
for (uint i = 0; i < N_SLICE; i++) {
element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
count[i][gl_LocalInvocationID.x] = element_count;
}
// element_count is number of elements covering bin for this invocation.
Alloc chunk_alloc = new_alloc(0, 0, true);
if (element_count != 0) {
// TODO: aggregate atomic adds (subgroup is probably fastest)
MallocResult chunk = malloc(element_count * BinInstance_size);
chunk_alloc = chunk.alloc;
sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
if (chunk.failed) {
sh_alloc_failed = true;
}
}
// Note: it might be more efficient for reading to do this in the
// other order (each bin is a contiguous sequence of partitions)
uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
write_mem(conf.bin_alloc, out_ix, element_count);
write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
barrier();
if (sh_alloc_failed || mem_error != NO_ERROR) {
return;
}
// Use similar strategy as Laine & Karras paper; loop over bbox of bins
// touched by this element
x = x0;
y = y0;
while (y < y1) {
uint bin_ix = y * width_in_bins + x;
uint out_mask = bitmaps[my_slice][bin_ix];
if ((out_mask & my_mask) != 0) {
uint idx = bitCount(out_mask & (my_mask - 1));
if (my_slice > 0) {
idx += count[my_slice - 1][bin_ix];
}
Alloc out_alloc = sh_chunk_alloc[bin_ix];
uint out_offset = out_alloc.offset + idx * BinInstance_size;
BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
}
x++;
if (x == x1) {
x = x0;
y++;
}
}
}