vello/piet-gpu/shader/setup.h

104 lines
3.1 KiB
GLSL
Raw Normal View History

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// Various constants for the sizes of groups and tiles.
// Much of this will be made dynamic in various ways, but for now it's easiest
// to hardcode and keep all in one place.
// A LG_WG_FACTOR of n scales workgroup sizes by 2^n. Use 0 for a
// maximum workgroup size of 128, or 1 for a maximum size of 256.
#define LG_WG_FACTOR 1
#define WG_FACTOR (1<<LG_WG_FACTOR)
#define TILE_WIDTH_PX 16
#define TILE_HEIGHT_PX 16
#define PTCL_INITIAL_ALLOC 1024
// These should probably be renamed and/or reworked. In the binning
// kernel, they represent the number of bins. Also, the workgroup size
// of that kernel is equal to the number of bins, but should probably
// be more flexible (it's 512 in the K&L paper).
#define N_TILE_X 16
#define N_TILE_Y (8 * WG_FACTOR)
#define N_TILE (N_TILE_X * N_TILE_Y)
#define LG_N_TILE (7 + LG_WG_FACTOR)
#define N_SLICE (N_TILE / 32)
#define GRADIENT_WIDTH 512
// We allocate this many blend stack entries in registers, and spill
// to memory for the overflow.
#define BLEND_STACK_SPLIT 4
Implement robust dynamic memory This is the core logic for robust dynamic memory. There are changes to both shaders and the driver logic. On the shader side, failure information is more useful and fine grained. In particular, it now reports which stage failed and how much memory would have been required to make that stage succeed. On the driver side, there is a new RenderDriver abstraction which owns command buffers (and associated query pools) and runs the logic to retry and reallocate buffers when necessary. There's also a fairly significant rework of the logic to produce the config block, as that overlaps the robust memory. The RenderDriver abstraction may not stay. It was done this way to minimize code disruption, but arguably it should just be combined with Renderer. Another change: the GLSL length() method on a buffer requires additional infrastructure (at least on Metal, where it needs a binding of its own), so we now pass that in as a field in the config. This also moves blend memory to its own buffer. This worked out well because coarse rasterization can simply report the size of the blend buffer and it can be reallocated without needing to rerun the pipeline. In the previous state, blend allocations and ptcl writes were interleaved in coarse rasterization, so a failure of the former would require rerunning coarse. This should fix #83 (finally!) There are a few loose ends. The binaries haven't (yet) been updated (I've been testing using a hand-written test program). Gradients weren't touched so still have a fixed size allocation. And the logic to calculate the new buffer size on allocation failure could be smarter. Closes #175
2022-06-23 08:48:26 -07:00
#ifdef MALLOC_FAILED
struct Config {
Implement robust dynamic memory This is the core logic for robust dynamic memory. There are changes to both shaders and the driver logic. On the shader side, failure information is more useful and fine grained. In particular, it now reports which stage failed and how much memory would have been required to make that stage succeed. On the driver side, there is a new RenderDriver abstraction which owns command buffers (and associated query pools) and runs the logic to retry and reallocate buffers when necessary. There's also a fairly significant rework of the logic to produce the config block, as that overlaps the robust memory. The RenderDriver abstraction may not stay. It was done this way to minimize code disruption, but arguably it should just be combined with Renderer. Another change: the GLSL length() method on a buffer requires additional infrastructure (at least on Metal, where it needs a binding of its own), so we now pass that in as a field in the config. This also moves blend memory to its own buffer. This worked out well because coarse rasterization can simply report the size of the blend buffer and it can be reallocated without needing to rerun the pipeline. In the previous state, blend allocations and ptcl writes were interleaved in coarse rasterization, so a failure of the former would require rerunning coarse. This should fix #83 (finally!) There are a few loose ends. The binaries haven't (yet) been updated (I've been testing using a hand-written test program). Gradients weren't touched so still have a fixed size allocation. And the logic to calculate the new buffer size on allocation failure could be smarter. Closes #175
2022-06-23 08:48:26 -07:00
uint mem_size; // in bytes
uint n_elements; // paths
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
// new element pipeline stuff follows
// Bounding boxes of paths, stored as int (so atomics work)
Alloc path_bbox_alloc;
// Monoid for draw objects
Alloc drawmonoid_alloc;
// BeginClip(path_ix) / EndClip
Alloc clip_alloc;
// Intermediate bicyclic semigroup
Alloc clip_bic_alloc;
// Intermediate stack
Alloc clip_stack_alloc;
// Clip processing results (path_ix + bbox)
Alloc clip_bbox_alloc;
// Bounding box per draw object
Alloc draw_bbox_alloc;
// Info computed in draw stage, per draw object
Alloc drawinfo_alloc;
// Number of transforms in scene
// This is probably not needed.
uint n_trans;
// This *should* count only actual paths, but in the current
// implementation is redundant with n_elements.
uint n_path;
// Total number of BeginClip and EndClip draw objects.
uint n_clip;
// Note: one of these offsets *could* be hardcoded to zero (as was the
// original element stream), but for now retain flexibility.
// Offset (in bytes) of transform stream in scene buffer
uint trans_offset;
// Offset (in bytes) of linewidth stream in scene
uint linewidth_offset;
// Offset (in bytes) of path tag stream in scene
uint pathtag_offset;
// Offset (in bytes) of path segment stream in scene
uint pathseg_offset;
// Offset (in bytes) of draw object tag stream in scene; see drawtag.h
uint drawtag_offset;
// Offset (in bytes) of draw payload stream in scene
uint drawdata_offset;
};
#endif
// Fill modes.
#define MODE_NONZERO 0
#define MODE_STROKE 1
// Size of kernel4 clip state, in words.
#define CLIP_STATE_SIZE 1
// fill_mode_from_flags extracts the fill mode from tag flags.
uint fill_mode_from_flags(uint flags) {
return flags & 0x1;
}