2020-12-12 01:01:48 +11:00
|
|
|
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
|
|
|
|
|
2020-05-14 08:35:19 +10:00
|
|
|
// The coarse rasterizer stage of the pipeline.
|
2020-06-28 23:37:27 +10:00
|
|
|
//
|
|
|
|
// As input we have the ordered partitions of paths from the binning phase and
|
|
|
|
// the annotated tile list of segments and backdrop per path.
|
|
|
|
//
|
|
|
|
// Each workgroup operates on one bin by stream compacting
|
|
|
|
// the elements corresponding to the bin.
|
|
|
|
//
|
|
|
|
// As output we have an ordered command stream per tile. Every tile from a path (backdrop + segment list) will be encoded.
|
2020-05-14 08:35:19 +10:00
|
|
|
|
|
|
|
#version 450
|
|
|
|
#extension GL_GOOGLE_include_directive : enable
|
|
|
|
|
2020-12-12 04:30:20 +11:00
|
|
|
#include "mem.h"
|
2020-12-24 22:00:53 +11:00
|
|
|
#include "setup.h"
|
2020-05-14 08:35:19 +10:00
|
|
|
|
|
|
|
layout(local_size_x = N_TILE, local_size_y = 1) in;
|
|
|
|
|
2020-12-12 04:30:20 +11:00
|
|
|
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
|
|
|
|
Config conf;
|
2020-05-14 08:35:19 +10:00
|
|
|
};
|
|
|
|
|
|
|
|
#include "annotated.h"
|
|
|
|
#include "bins.h"
|
2020-06-04 02:28:43 +10:00
|
|
|
#include "tile.h"
|
2020-05-14 08:35:19 +10:00
|
|
|
#include "ptcl.h"
|
|
|
|
|
2020-09-13 20:58:47 +10:00
|
|
|
#define LG_N_PART_READ (7 + LG_WG_FACTOR)
|
2020-05-31 08:37:34 +10:00
|
|
|
#define N_PART_READ (1 << LG_N_PART_READ)
|
|
|
|
|
2020-05-31 14:12:55 +10:00
|
|
|
shared uint sh_elements[N_TILE];
|
2020-05-14 08:35:19 +10:00
|
|
|
|
2020-05-31 08:37:34 +10:00
|
|
|
// Number of elements in the partition; prefix sum.
|
|
|
|
shared uint sh_part_count[N_PART_READ];
|
2020-12-24 22:00:53 +11:00
|
|
|
shared Alloc sh_part_elements[N_PART_READ];
|
2020-05-31 08:37:34 +10:00
|
|
|
|
2020-05-14 08:35:19 +10:00
|
|
|
shared uint sh_bitmaps[N_SLICE][N_TILE];
|
2020-05-23 07:18:39 +10:00
|
|
|
|
2020-06-04 10:55:42 +10:00
|
|
|
shared uint sh_tile_count[N_TILE];
|
|
|
|
// The width of the tile rect for the element, intersected with this bin
|
|
|
|
shared uint sh_tile_width[N_TILE];
|
|
|
|
shared uint sh_tile_x0[N_TILE];
|
|
|
|
shared uint sh_tile_y0[N_TILE];
|
2020-05-14 08:35:19 +10:00
|
|
|
|
2020-06-05 03:39:08 +10:00
|
|
|
// These are set up so base + tile_y * stride + tile_x points to a Tile.
|
|
|
|
shared uint sh_tile_base[N_TILE];
|
|
|
|
shared uint sh_tile_stride[N_TILE];
|
|
|
|
|
2020-12-24 22:00:53 +11:00
|
|
|
#ifdef MEM_DEBUG
// Store allocs only when MEM_DEBUG to save shared memory traffic.
shared Alloc sh_tile_alloc[N_TILE];
#endif

// Remember the tile allocation `a` for element slot `el_ix`.
// Without MEM_DEBUG nothing is stored and the call is a no-op.
void write_tile_alloc(uint el_ix, Alloc a) {
#ifdef MEM_DEBUG
    sh_tile_alloc[el_ix] = a;
#endif
}

// Return the tile allocation for element slot `el_ix`.
// With MEM_DEBUG this is the value previously stored by write_tile_alloc;
// otherwise it is an allocation starting at offset 0 that spans
// memory.length()*4, i.e. all of memory.
Alloc read_tile_alloc(uint el_ix) {
#ifdef MEM_DEBUG
    return sh_tile_alloc[el_ix];
#else
    // All memory.
    return new_alloc(0, memory.length()*4);
#endif
}
|
|
|
|
|
2021-03-18 22:47:14 +11:00
|
|
|
// The maximum number of commands per annotated element.
|
|
|
|
#define ANNO_COMMANDS 2
|
|
|
|
|
2020-12-24 22:00:53 +11:00
|
|
|
// Perhaps cmd_alloc should be a global? This is a style question.
|
|
|
|
// Make sure there is room to write commands at cmd_ref.
//
// If cmd_ref is still below cmd_limit nothing changes. Otherwise a new chunk of
// PTCL_INITIAL_ALLOC is requested from malloc, a Jump command to the new chunk is
// written at the current position, and cmd_alloc, cmd_ref and cmd_limit are
// switched over to the new chunk.
//
// Returns false (leaving all three arguments untouched) when malloc fails,
// true otherwise.
bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset >= cmd_limit) {
        // Current chunk is exhausted: chain to a freshly allocated one.
        MallocResult chunk = malloc(PTCL_INITIAL_ALLOC);
        if (chunk.failed) {
            return false;
        }
        // The jump is written into the old chunk, so it must happen before
        // cmd_alloc and cmd_ref are redirected.
        Cmd_Jump_write(cmd_alloc, cmd_ref, CmdJump(chunk.alloc.offset));
        cmd_alloc = chunk.alloc;
        cmd_ref = CmdRef(chunk.alloc.offset);
        // Reserve space for the maximum number of commands and a potential jump.
        cmd_limit = chunk.alloc.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
    }
    return true;
}
|
|
|
|
|
2020-05-14 08:35:19 +10:00
|
|
|
// Write the coverage command for one path in one tile at cmd_ref and advance
// cmd_ref past it:
// - fill mode MODE_NONZERO and the tile has a segment list (tile.tile.offset != 0):
//   a Fill command carrying the segment list offset and the backdrop;
// - fill mode MODE_NONZERO and no segment list: a Solid command;
// - any other fill mode: a Stroke command with half of `linewidth`.
// The caller must already have made room with alloc_cmd.
void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth) {
    if (fill_mode_from_flags(flags) == MODE_NONZERO) {
        if (tile.tile.offset != 0) {
            CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
            Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
            cmd_ref.offset += 4 + CmdFill_size;
        } else {
            Cmd_Solid_write(alloc, cmd_ref);
            cmd_ref.offset += 4;
        }
    } else {
        CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
        Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
        cmd_ref.offset += 4 + CmdStroke_size;
    }
}

// Entry point. Each workgroup handles one bin and each invocation one tile of
// that bin. The outer loop repeatedly
//   1. gathers up to N_TILE element indices for this bin into sh_elements,
//   2. computes, per element, which tiles of the bin it touches (sh_bitmaps),
//   3. lets every invocation walk the elements flagged for its own tile, in
//      order, appending commands to that tile's command list.
void main() {
    // Do nothing if a memory error has already been recorded.
    if (mem_error != NO_ERROR) {
        return;
    }

    // Could use either linear or 2d layouts for both dispatch and
    // invocations within the workgroup. We'll use variables to abstract.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
    uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x;
    uint partition_ix = 0;
    uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
    uint th_ix = gl_LocalInvocationID.x;

    // Coordinates of top left of bin, in tiles.
    uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
    uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;

    // Per-tile state
    uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
    uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
    uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
    // Reserve space for the maximum number of commands and a potential jump.
    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
    // The nesting depth of the clip stack
    uint clip_depth = 0;
    // State for the "clip zero" optimization. If it's nonzero, then we are
    // currently in a clip for which the entire tile has an alpha of zero, and
    // the value is the depth after the "begin clip" of that element.
    uint clip_zero_depth = 0;
    // State for the "clip one" optimization. If bit `i` is set, then that means
    // that the clip pushed at depth `i` has an alpha of all one.
    uint clip_one_mask = 0;

    // I'm sure we can figure out how to do this with at least one fewer register...
    // Items up to rd_ix have been read from sh_elements
    uint rd_ix = 0;
    // Items up to wr_ix have been written into sh_elements
    uint wr_ix = 0;
    // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
    uint part_start_ix = 0;
    uint ready_ix = 0;

    while (true) {
        // Each invocation clears its own column of the per-tile element bitmaps.
        for (uint i = 0; i < N_SLICE; i++) {
            sh_bitmaps[i][th_ix] = 0;
        }

        // parallel read of input partitions
        do {
            if (ready_ix == wr_ix && partition_ix < n_partitions) {
                part_start_ix = ready_ix;
                uint count = 0;
                if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
                    // Two words per (partition, bin) entry: element count, then offset.
                    uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
                    count = read_mem(conf.bin_alloc, in_ix);
                    uint offset = read_mem(conf.bin_alloc, in_ix + 1);
                    sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size);
                }
                // prefix sum of counts
                for (uint i = 0; i < LG_N_PART_READ; i++) {
                    if (th_ix < N_PART_READ) {
                        sh_part_count[th_ix] = count;
                    }
                    barrier();
                    if (th_ix < N_PART_READ) {
                        if (th_ix >= (1 << i)) {
                            count += sh_part_count[th_ix - (1 << i)];
                        }
                    }
                    barrier();
                }
                if (th_ix < N_PART_READ) {
                    // Store the inclusive sum shifted by the start of this batch.
                    sh_part_count[th_ix] = part_start_ix + count;
                }
                barrier();
                ready_ix = sh_part_count[N_PART_READ - 1];
                partition_ix += N_PART_READ;
            }
            // use binary search to find element to read
            uint ix = rd_ix + th_ix;
            if (ix >= wr_ix && ix < ready_ix) {
                uint part_ix = 0;
                for (uint i = 0; i < LG_N_PART_READ; i++) {
                    uint probe = part_ix + ((N_PART_READ / 2) >> i);
                    if (ix >= sh_part_count[probe - 1]) {
                        part_ix = probe;
                    }
                }
                // Turn ix into an index relative to the start of the partition found.
                ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
                Alloc bin_alloc = sh_part_elements[part_ix];
                BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset);
                BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix));
                sh_elements[th_ix] = inst.element_ix;
            }
            barrier();

            wr_ix = min(rd_ix + N_TILE, ready_ix);
        } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));

        // We've done the merge and filled the buffer.

        // Read one element, compute coverage.
        uint tag = Annotated_Nop;
        uint element_ix;
        AnnotatedRef ref;
        if (th_ix + rd_ix < wr_ix) {
            element_ix = sh_elements[th_ix];
            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
            tag = Annotated_tag(conf.anno_alloc, ref).tag;
        }

        // Number of tiles of this element's bounding box that fall inside this bin
        // (zero for elements that produce no output).
        uint tile_count;
        switch (tag) {
        case Annotated_Color:
        case Annotated_Image:
        case Annotated_BeginClip:
        case Annotated_EndClip: {
            // We have one "path" for each element, even if the element isn't
            // actually a path (currently EndClip, but images etc in the future).
            uint path_ix = element_ix;
            Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
            uint stride = path.bbox.z - path.bbox.x;
            sh_tile_stride[th_ix] = stride;
            // Bounding box relative to the bin origin, clamped to the bin.
            int dx = int(path.bbox.x) - int(bin_tile_x);
            int dy = int(path.bbox.y) - int(bin_tile_y);
            int x0 = clamp(dx, 0, N_TILE_X);
            int y0 = clamp(dy, 0, N_TILE_Y);
            int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X);
            int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y);
            sh_tile_width[th_ix] = uint(x1 - x0);
            sh_tile_x0[th_ix] = x0;
            sh_tile_y0[th_ix] = y0;
            tile_count = uint(x1 - x0) * uint(y1 - y0);
            // base relative to bin
            uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
            sh_tile_base[th_ix] = base;
            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size);
            write_tile_alloc(th_ix, path_alloc);
            break;
        }
        default:
            tile_count = 0;
            break;
        }

        // Prefix sum of sh_tile_count
        sh_tile_count[th_ix] = tile_count;
        for (uint i = 0; i < LG_N_TILE; i++) {
            barrier();
            if (th_ix >= (1 << i)) {
                tile_count += sh_tile_count[th_ix - (1 << i)];
            }
            barrier();
            sh_tile_count[th_ix] = tile_count;
        }
        barrier();
        uint total_tile_count = sh_tile_count[N_TILE - 1];
        // Spread the (element, tile) pairs evenly over the invocations.
        for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) {
            // Binary search to find element
            uint el_ix = 0;
            for (uint i = 0; i < LG_N_TILE; i++) {
                uint probe = el_ix + ((N_TILE / 2) >> i);
                if (ix >= sh_tile_count[probe - 1]) {
                    el_ix = probe;
                }
            }
            AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
            uint tag = Annotated_tag(conf.anno_alloc, ref).tag;
            // Position of this tile within the element's clamped rectangle.
            uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
            uint width = sh_tile_width[el_ix];
            uint x = sh_tile_x0[el_ix] + seq_ix % width;
            uint y = sh_tile_y0[el_ix] + seq_ix / width;
            bool include_tile;
            if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
                include_tile = true;
            } else {
                Tile tile = Tile_read(read_tile_alloc(el_ix), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
                // Include the path in the tile if
                // - the tile contains at least a segment (tile offset non-zero)
                // - the tile is completely covered (backdrop non-zero)
                include_tile = tile.tile.offset != 0 || tile.backdrop != 0;
            }
            if (include_tile) {
                uint el_slice = el_ix / 32;
                uint el_mask = 1 << (el_ix & 31);
                atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
            }
        }

        barrier();

        // Output non-segment elements for this tile. The thread does a sequential walk
        // through the non-segment elements.
        uint slice_ix = 0;
        uint bitmap = sh_bitmaps[0][th_ix];
        while (true) {
            if (bitmap == 0) {
                slice_ix++;
                if (slice_ix == N_SLICE) {
                    break;
                }
                bitmap = sh_bitmaps[slice_ix][th_ix];
                if (bitmap == 0) {
                    continue;
                }
            }
            uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
            uint element_ix = sh_elements[element_ref_ix];

            // Clear LSB
            bitmap &= bitmap - 1;

            // At this point, we read the element again from global memory.
            // If that turns out to be expensive, maybe we can pack it into
            // shared memory (or perhaps just the tag).
            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
            AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);

            if (clip_zero_depth == 0) {
                // Declared ahead of the switch because several cases assign and use it.
                Tile tile;
                switch (tag.tag) {
                case Annotated_Color: {
                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                        break;
                    }
                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill.linewidth);
                    Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
                    cmd_ref.offset += 4 + CmdColor_size;
                    break;
                }
                case Annotated_Image: {
                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                        break;
                    }
                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill_img.linewidth);
                    Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(fill_img.index, fill_img.offset));
                    cmd_ref.offset += 4 + CmdImage_size;
                    break;
                }
                case Annotated_BeginClip: {
                    tile = Tile_read(read_tile_alloc(element_ref_ix), TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    if (tile.tile.offset == 0 && tile.backdrop == 0) {
                        // No segments and no backdrop: enter the "clip zero" state.
                        clip_zero_depth = clip_depth + 1;
                    } else if (tile.tile.offset == 0 && clip_depth < 32) {
                        // No segments but non-zero backdrop: record a "clip one".
                        clip_one_mask |= (1 << clip_depth);
                    } else {
                        AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref);
                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                            break;
                        }
                        // Kept inline rather than using write_fill: the no-segment
                        // branch writes an Alpha command here, not a Solid command.
                        if (fill_mode_from_flags(tag.flags) == MODE_NONZERO) {
                            if (tile.tile.offset != 0) {
                                CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
                                Cmd_Fill_write(cmd_alloc, cmd_ref, cmd_fill);
                                cmd_ref.offset += 4 + CmdFill_size;
                            } else {
                                // TODO: here is where a bunch of optimization magic should happen
                                float alpha = tile.backdrop == 0 ? 0.0 : 1.0;
                                Cmd_Alpha_write(cmd_alloc, cmd_ref, CmdAlpha(alpha));
                                cmd_ref.offset += 4 + CmdAlpha_size;
                            }
                        } else {
                            CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * begin_clip.linewidth);
                            Cmd_Stroke_write(cmd_alloc, cmd_ref, cmd_stroke);
                            cmd_ref.offset += 4 + CmdStroke_size;
                        }
                        Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                        cmd_ref.offset += 4;
                        if (clip_depth < 32) {
                            clip_one_mask &= ~(1 << clip_depth);
                        }
                    }
                    clip_depth++;
                    break;
                }
                case Annotated_EndClip: {
                    clip_depth--;
                    // Only emit an EndClip when the matching BeginClip was emitted,
                    // i.e. it was not recorded as a "clip one".
                    if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                            break;
                        }
                        Cmd_Solid_write(cmd_alloc, cmd_ref);
                        cmd_ref.offset += 4;
                        Cmd_EndClip_write(cmd_alloc, cmd_ref);
                        cmd_ref.offset += 4;
                    }
                    break;
                }
                }
            } else {
                // In "clip zero" state, suppress all drawing
                switch (tag.tag) {
                case Annotated_BeginClip:
                    clip_depth++;
                    break;
                case Annotated_EndClip:
                    if (clip_depth == clip_zero_depth) {
                        clip_zero_depth = 0;
                    }
                    clip_depth--;
                    break;
                }
            }
        }
        barrier();

        rd_ix += N_TILE;
        if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
    }
    // Terminate the command list, but only for tiles inside the image.
    if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
        Cmd_End_write(cmd_alloc, cmd_ref);
    }
}
|