// vello/piet-gpu/shader/binning.comp

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The binning stage of the pipeline.
//
// Each workgroup processes N_TILE paths.
// Each thread processes one path and calculates an N_TILE_X x N_TILE_Y coverage mask
// based on the path's bounding box, which is used to bin the path.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
layout(local_size_x = N_TILE, local_size_y = 1) in;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};
#include "annotated.h"
#include "bins.h"
// scale factors useful for converting coordinates to bins
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
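// For example, with the common setup.h values of 16x16 px tiles and 16x16
// tiles per bin (TILE_WIDTH_PX = TILE_HEIGHT_PX = 16, N_TILE_X = N_TILE_Y = 16,
// an assumption for illustration only), a bin covers 256x256 px and
// SX = SY = 1/256, so multiplying a pixel coordinate by SX or SY converts it
// to bin units: floor(300.0 * SX) = 1, i.e. pixel x = 300 lies in bin column 1.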
// GLSL has no built-in infinity constant. Also consider uintBitsToFloat(0x7f800000).
#define INFINITY (1.0 / 0.0)
// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
shared uint bitmaps[N_SLICE][N_TILE];
shared uint count[N_SLICE][N_TILE];
shared Alloc sh_chunk_alloc[N_TILE];
shared bool sh_alloc_failed;
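// Layout note: thread t of this workgroup handles element
// my_partition * N_TILE + t and sets bit (t & 31) of bitmaps[t / 32][b] for
// every bin b its bounding box touches. With N_TILE = 256 and N_SLICE = 8,
// each bin's 256-bit coverage mask is split into eight 32-bit words, one per
// slice, so a single atomicOr updates one word.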
void main() {
    if (mem_error != NO_ERROR) {
        return;
    }
    uint my_n_elements = conf.n_elements;
    uint my_partition = gl_WorkGroupID.x;
    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }
    if (gl_LocalInvocationID.x == 0) {
        sh_alloc_failed = false;
    }
    barrier();
    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
    uint tag = Annotated_Nop;
    if (element_ix < my_n_elements) {
        tag = Annotated_tag(conf.anno_alloc, ref).tag;
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (tag) {
    case Annotated_Fill:
    case Annotated_FillImage:
    case Annotated_Stroke:
    case Annotated_BeginClip:
    case Annotated_EndClip:
        // Note: we take advantage of the fact that these drawing elements
        // have the bbox at the same place in their layout.
        AnnoFill fill = Annotated_Fill_read(conf.anno_alloc, ref);
        x0 = int(floor(fill.bbox.x * SX));
        y0 = int(floor(fill.bbox.y * SY));
        x1 = int(ceil(fill.bbox.z * SX));
        y1 = int(ceil(fill.bbox.w * SY));
        break;
    }
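    // Illustrative example (using the assumed 256x256 px bins from above, so
    // SX = SY = 1/256): a bbox of (100, 50, 600, 300) px yields x0 = 0, y0 = 0,
    // x1 = ceil(600/256) = 3, y1 = ceil(300/256) = 2, i.e. the element is binned
    // into columns [0, 3) and rows [0, 2).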
    // At this point, we run an iterator over the coverage area,
    // trying to keep divergence low.
    // Right now, it's just a bbox, but we'll get finer with
    // segments.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1) / N_TILE_Y;
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
    y1 = clamp(y1, y0, int(height_in_bins));
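    // If the clamped bbox is empty in x, make it empty in y as well; otherwise
    // the flattened loop below, which only advances y when x wraps from x1 back
    // to x0, would never terminate.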
    if (x0 == x1) y1 = y0;
    int x = x0, y = y0;
    uint my_slice = gl_LocalInvocationID.x / 32;
    uint my_mask = 1 << (gl_LocalInvocationID.x & 31);
    while (y < y1) {
        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
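    // At this point, bit t of bitmaps[t / 32][b] is set iff element
    // my_partition * N_TILE + t covers bin b. After the barrier, each thread
    // switches roles and reads back the coverage of the single bin whose index
    // equals its local invocation id.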
    barrier();
    // Allocate output segments.
    uint element_count = 0;
    for (uint i = 0; i < N_SLICE; i++) {
        element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
        count[i][gl_LocalInvocationID.x] = element_count;
    }
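    // count[i][b] (with b = this thread's bin) now holds the number of set bits
    // in slices 0..i, an inclusive prefix sum of per-slice populations; the
    // scatter loop below uses count[my_slice - 1][bin_ix] as the base output
    // index for elements whose thread lives in slice my_slice.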
    // element_count is the number of elements covering the bin owned by this invocation.
    Alloc chunk_alloc = new_alloc(0, 0);
    if (element_count != 0) {
        // TODO: aggregate atomic adds (subgroup is probably fastest)
        MallocResult chunk = malloc(element_count * BinInstance_size);
        chunk_alloc = chunk.alloc;
        sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
        if (chunk.failed) {
            sh_alloc_failed = true;
        }
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
    write_mem(conf.bin_alloc, out_ix, element_count);
    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
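    // The bin header thus stores two u32 words per (partition, bin) pair: the
    // element count and the chunk offset. out_ix is an index in u32 words; the
    // >> 2 converts bin_alloc's byte offset accordingly.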
    barrier();
    if (sh_alloc_failed) {
        return;
    }
    // Use a strategy similar to the Laine & Karras paper: loop over the bbox of
    // bins touched by this element.
    x = x0;
    y = y0;
    while (y < y1) {
        uint bin_ix = y * width_in_bins + x;
        uint out_mask = bitmaps[my_slice][bin_ix];
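        // If this element's bit is set for the bin, its slot in the bin's chunk
        // is its rank among covering elements: the set bits below my_mask in
        // this slice (bitCount(out_mask & (my_mask - 1))) plus all covering
        // elements in lower slices (count[my_slice - 1][bin_ix]).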
        if ((out_mask & my_mask) != 0) {
            uint idx = bitCount(out_mask & (my_mask - 1));
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
            Alloc out_alloc = sh_chunk_alloc[bin_ix];
            uint out_offset = out_alloc.offset + idx * BinInstance_size;
            BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
        }
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
}