vello/piet-gpu/shader/tile_alloc.comp


// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Allocation and initialization of tiles for paths.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
#define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
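// TILE_ALLOC_WG = 1 << (7 + LG_WG_FACTOR): 128 threads per workgroup, scaled
// by the LG_WG_FACTOR tuning factor from setup.h. Each thread processes one
// annotated element.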
layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};
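// Binding 0 is the global memory buffer, declared in mem.h.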
#include "annotated.h"
#include "tile.h"
// scale factors useful for converting coordinates to tiles
#define SX (1.0 / float(TILE_WIDTH_PX))
#define SY (1.0 / float(TILE_HEIGHT_PX))
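// Example: if TILE_WIDTH_PX is 16, an x coordinate of 40.0 maps to
// tile column floor(40.0 * SX) = 2.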
shared uint sh_tile_count[TILE_ALLOC_WG];
shared MallocResult sh_tile_alloc;
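// sh_tile_count holds the running prefix sum of per-element tile counts;
// sh_tile_alloc broadcasts the workgroup's single tile allocation.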
void main() {
    if (mem_error != NO_ERROR) {
        return;
    }
    uint th_ix = gl_LocalInvocationID.x;
    uint element_ix = gl_GlobalInvocationID.x;
    PathRef path_ref = PathRef(conf.tile_alloc.offset + element_ix * Path_size);
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
    uint tag = Annotated_Nop;
    if (element_ix < conf.n_elements) {
        tag = Annotated_tag(conf.anno_alloc, ref).tag;
    }
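    // Threads past n_elements keep Annotated_Nop: they get an empty bbox and
    // zero tiles, but still take part in the scan and the cooperative zeroing.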
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (tag) {
    case Annotated_Color:
    case Annotated_FillImage:
    case Annotated_BeginClip:
    case Annotated_EndClip:
        // Note: we take advantage of the fact that fills, strokes, and
        // clips have compatible layout.
        AnnoClip clip = Annotated_BeginClip_read(conf.anno_alloc, ref);
        x0 = int(floor(clip.bbox.x * SX));
        y0 = int(floor(clip.bbox.y * SY));
        x1 = int(ceil(clip.bbox.z * SX));
        y1 = int(ceil(clip.bbox.w * SY));
        break;
    }
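    // Clamp the tile-space bbox to the render target, so geometry outside the
    // viewport allocates no tiles.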
    x0 = clamp(x0, 0, int(conf.width_in_tiles));
    y0 = clamp(y0, 0, int(conf.height_in_tiles));
    x1 = clamp(x1, 0, int(conf.width_in_tiles));
    y1 = clamp(y1, 0, int(conf.height_in_tiles));
    Path path;
    path.bbox = uvec4(x0, y0, x1, y1);
    uint tile_count = (x1 - x0) * (y1 - y0);
    if (tag == Annotated_EndClip) {
        // Don't actually allocate tiles for an end clip, but we do want
        // the path structure (especially bbox) allocated for it.
        tile_count = 0;
    }
    sh_tile_count[th_ix] = tile_count;
    uint total_tile_count = tile_count;
    // Prefix sum of sh_tile_count
    for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) {
        barrier();
        if (th_ix >= (1 << i)) {
            total_tile_count += sh_tile_count[th_ix - (1 << i)];
        }
        barrier();
        sh_tile_count[th_ix] = total_tile_count;
    }
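    // After the scan, sh_tile_count[th_ix] is the inclusive prefix sum, so
    // the last slot holds the workgroup's total tile count; one thread
    // allocates the whole range with a single malloc.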
    if (th_ix == TILE_ALLOC_WG - 1) {
        sh_tile_alloc = malloc(total_tile_count * Tile_size);
    }
    barrier();
    MallocResult alloc_start = sh_tile_alloc;
    if (alloc_start.failed) {
        return;
    }
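    // This thread's slice starts at its exclusive prefix sum, i.e. the
    // inclusive sum of the previous thread (or 0 for thread 0).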
    if (element_ix < conf.n_elements) {
        uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
        Alloc tiles_alloc = slice_mem(alloc_start.alloc, Tile_size * tile_subix, Tile_size * tile_count);
        path.tiles = TileRef(tiles_alloc.offset);
        Path_write(conf.tile_alloc, path_ref, path);
    }
    // Zero out allocated tiles efficiently: each thread strides by the
    // workgroup size, so consecutive threads write consecutive 32-bit words
    // and the stores coalesce well.
    uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
    uint start_ix = alloc_start.alloc.offset >> 2;
    for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
        // Note: this interleaving is faster than using Tile_write
        // by a significant amount.
        write_mem(alloc_start.alloc, start_ix + i, 0);
    }
}