diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index a3a8ffd..2304ea2 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -20,6 +20,7 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf { #include "annotated.h" #include "bins.h" +#include "drawtag.h" // scale factors useful for converting coordinates to bins #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX)) @@ -35,6 +36,47 @@ shared uint count[N_SLICE][N_TILE]; shared Alloc sh_chunk_alloc[N_TILE]; shared bool sh_alloc_failed; +DrawMonoid load_draw_monoid(uint element_ix) { + uint base = (conf.drawmonoid_alloc.offset >> 2) + 2 * element_ix; + uint path_ix = memory[base]; + uint clip_ix = memory[base + 1]; + return DrawMonoid(path_ix, clip_ix); +} + +// Load bounding box computed by clip processing +vec4 load_clip_bbox(uint clip_ix) { + uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * clip_ix; + float x0 = uintBitsToFloat(memory[base]); + float y0 = uintBitsToFloat(memory[base + 1]); + float x1 = uintBitsToFloat(memory[base + 2]); + float y1 = uintBitsToFloat(memory[base + 3]); + vec4 bbox = vec4(x0, y0, x1, y1); + return bbox; +} + +vec4 bbox_intersect(vec4 a, vec4 b) { + return vec4(max(a.xy, b.xy), min(a.zw, b.zw)); +} + +// Load path's bbox from bbox (as written by pathseg). +vec4 load_path_bbox(uint path_ix) { + uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix; + float bbox_l = float(memory[base]) - 32768.0; + float bbox_t = float(memory[base + 1]) - 32768.0; + float bbox_r = float(memory[base + 2]) - 32768.0; + float bbox_b = float(memory[base + 3]) - 32768.0; + vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +void store_path_bbox(AnnotatedRef ref, vec4 bbox) { + uint ix = ref.offset >> 2; + memory[ix + 1] = floatBitsToUint(bbox.x); + memory[ix + 2] = floatBitsToUint(bbox.y); + memory[ix + 3] = floatBitsToUint(bbox.z); + memory[ix + 4] = floatBitsToUint(bbox.w); +} + void main() { uint my_n_elements = conf.n_elements; uint my_partition = gl_WorkGroupID.x; @@ -61,13 +103,27 @@ void main() { case Annotated_Image: case Annotated_BeginClip: case Annotated_EndClip: - // Note: we take advantage of the fact that these drawing elements - // have the bbox at the same place in their layout. - AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref); - x0 = int(floor(clip.bbox.x * SX)); - y0 = int(floor(clip.bbox.y * SY)); - x1 = int(ceil(clip.bbox.z * SX)); - y1 = int(ceil(clip.bbox.w * SY)); + DrawMonoid draw_monoid = load_draw_monoid(element_ix); + uint path_ix = draw_monoid.path_ix; + vec4 clip_bbox = vec4(-1e9, -1e9, 1e9, 1e9); + uint clip_ix = draw_monoid.clip_ix; + if (clip_ix > 0) { + clip_bbox = load_clip_bbox(clip_ix - 1); + } + // For clip elements, clip_bbox is the bbox of the clip path, intersected + // with enclosing clips. + // For other elements, it is the bbox of the enclosing clips. + + vec4 path_bbox = load_path_bbox(path_ix); + vec4 bbox = bbox_intersect(path_bbox, clip_bbox); + // Avoid negative-size bbox (is this necessary)? + bbox.zw = max(bbox.xy, bbox.zw); + // Store clip-intersected bbox for tile_alloc. + store_path_bbox(ref, bbox); + x0 = int(floor(bbox.x * SX)); + y0 = int(floor(bbox.y * SY)); + x1 = int(ceil(bbox.z * SX)); + y1 = int(ceil(bbox.w * SY)); break; } diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 7b80f6f..8b9998f 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -22,7 +22,7 @@ rule dxil rule msl command = $spirv_cross --msl $in --output $out $msl_flags -build gen/binning.spv: glsl binning.comp | annotated.h state.h bins.h setup.h mem.h +build gen/binning.spv: glsl binning.comp | annotated.h bins.h drawtag.h setup.h mem.h build gen/binning.hlsl: hlsl gen/binning.spv build gen/binning.dxil: dxil gen/binning.hlsl build gen/binning.msl: msl gen/binning.spv @@ -119,6 +119,16 @@ build gen/draw_leaf.hlsl: hlsl gen/draw_leaf.spv build gen/draw_leaf.dxil: dxil gen/draw_leaf.hlsl build gen/draw_leaf.msl: msl gen/draw_leaf.spv -build spv: phony gen/backdrop_lg.spv gen/backdrop.spv gen/bbox_clear.spv gen/binning.spv gen/coarse.spv gen/draw_leaf.spv gen/draw_reduce.spv gen/draw_root.spv gen/kernel4.spv gen/kernel4_gray.spv gen/path_coarse.spv gen/pathseg.spv gen/pathtag_reduce.spv gen/pathtag_root.spv gen/tile_alloc.spv gen/transform_leaf.spv gen/transform_reduce.spv gen/transform_root.spv -build dxil: phony gen/backdrop.hlsl gen/backdrop_lg.hlsl gen/bbox_clear.hlsl gen/binning.hlsl gen/coarse.hlsl gen/draw_leaf.hlsl gen/draw_reduce.hlsl gen/draw_root.hlsl gen/kernel4.hlsl gen/kernel4_gray.hlsl gen/path_coarse.hlsl gen/pathseg.hlsl gen/pathtag_reduce.hlsl gen/pathtag_root.hlsl gen/tile_alloc.hlsl gen/transform_leaf.hlsl gen/transform_reduce.hlsl gen/transform_root.hlsl -build msl: phony gen/backdrop_lg.msl gen/backdrop.msl gen/bbox_clear.msl gen/binning.msl gen/coarse.msl gen/draw_leaf.msl gen/draw_reduce.msl gen/draw_root.msl gen/kernel4.msl gen/kernel4_gray.msl gen/path_coarse.msl gen/pathseg.msl gen/pathtag_reduce.msl gen/pathtag_root.msl gen/tile_alloc.msl gen/transform_leaf.msl gen/transform_reduce.msl gen/transform_root.msl +build gen/clip_reduce.spv: glsl clip_reduce.comp | mem.h setup.h annotated.h +build gen/clip_reduce.hlsl: hlsl gen/clip_reduce.spv +build gen/clip_reduce.dxil: dxil gen/clip_reduce.hlsl +build gen/clip_reduce.msl: msl gen/clip_reduce.spv + +build gen/clip_leaf.spv: glsl clip_leaf.comp | mem.h setup.h annotated.h +build gen/clip_leaf.hlsl: hlsl gen/clip_leaf.spv +build gen/clip_leaf.dxil: dxil gen/clip_leaf.hlsl +build gen/clip_leaf.msl: msl gen/clip_leaf.spv + +build spv: phony gen/backdrop_lg.spv gen/backdrop.spv gen/bbox_clear.spv gen/binning.spv gen/clip_leaf.spv gen/clip_reduce.spv gen/coarse.spv gen/draw_leaf.spv gen/draw_reduce.spv gen/draw_root.spv gen/kernel4.spv gen/kernel4_gray.spv gen/path_coarse.spv gen/pathseg.spv gen/pathtag_reduce.spv gen/pathtag_root.spv gen/tile_alloc.spv gen/transform_leaf.spv gen/transform_reduce.spv gen/transform_root.spv +build dxil: phony gen/backdrop.hlsl gen/backdrop_lg.hlsl gen/bbox_clear.hlsl gen/binning.hlsl gen/clip_leaf.hlsl gen/clip_reduce.hlsl gen/coarse.hlsl gen/draw_leaf.hlsl gen/draw_reduce.hlsl gen/draw_root.hlsl gen/kernel4.hlsl gen/kernel4_gray.hlsl gen/path_coarse.hlsl gen/pathseg.hlsl gen/pathtag_reduce.hlsl gen/pathtag_root.hlsl gen/tile_alloc.hlsl gen/transform_leaf.hlsl gen/transform_reduce.hlsl gen/transform_root.hlsl +build msl: phony gen/backdrop_lg.msl gen/backdrop.msl gen/bbox_clear.msl gen/binning.msl gen/clip_leaf.msl gen/clip_reduce.msl gen/coarse.msl gen/draw_leaf.msl gen/draw_reduce.msl gen/draw_root.msl gen/kernel4.msl gen/kernel4_gray.msl gen/path_coarse.msl gen/pathseg.msl gen/pathtag_reduce.msl gen/pathtag_root.msl gen/tile_alloc.msl gen/transform_leaf.msl gen/transform_reduce.msl gen/transform_root.msl diff --git a/piet-gpu/shader/clip_leaf.comp b/piet-gpu/shader/clip_leaf.comp new file mode 100644 index 0000000..5f7e79b --- /dev/null +++ b/piet-gpu/shader/clip_leaf.comp @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// The second dispatch of clip stack processing. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +#define LG_WG_SIZE (7 + LG_WG_FACTOR) +#define WG_SIZE (1 << LG_WG_SIZE) +#define PARTITION_SIZE WG_SIZE + +layout(local_size_x = WG_SIZE) in; + +layout(binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +#include "annotated.h" + +// Some of this is cut'n'paste duplication with the reduce pass, and +// arguably should be moved to a common .h file. +// The bicyclic monoid + +struct ClipEl { + // index of parent node + uint parent_ix; + // bounding box + vec4 bbox; +}; + +struct Bic { + uint a; + uint b; +}; + +Bic bic_combine(Bic x, Bic y) { + uint m = min(x.b, y.a); + return Bic(x.a + y.a - m, x.b + y.b - m); +} + +// Load path's bbox from bbox (as written by pathseg). +vec4 load_path_bbox(uint path_ix) { + uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix; + float bbox_l = float(memory[base]) - 32768.0; + float bbox_t = float(memory[base + 1]) - 32768.0; + float bbox_r = float(memory[base + 2]) - 32768.0; + float bbox_b = float(memory[base + 3]) - 32768.0; + vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +vec4 bbox_intersect(vec4 a, vec4 b) { + return vec4(max(a.xy, b.xy), min(a.zw, b.zw)); +} + +shared Bic sh_bic[WG_SIZE * 2 - 2]; +shared uint sh_stack[PARTITION_SIZE]; +shared vec4 sh_stack_bbox[PARTITION_SIZE]; +shared uint sh_link[PARTITION_SIZE]; +shared vec4 sh_bbox[PARTITION_SIZE]; + +// This is adapted directly from the stack monoid impl. +// Return value is reference within partition if >= 0, +// otherwise reference to stack. +uint search_link(inout Bic bic) { + uint ix = gl_LocalInvocationID.x; + uint j = 0; + while (j < LG_WG_SIZE) { + uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j)); + if (((ix >> j) & 1) != 0) { + Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic); + if (test.b > 0) { + break; + } + bic = test; + ix -= 1u << j; + } + j++; + } + if (ix > 0) { + while (j > 0) { + j--; + uint base = 2 * WG_SIZE - (2u << (LG_WG_SIZE - j)); + Bic test = bic_combine(sh_bic[base + (ix >> j) - 1], bic); + if (test.b == 0) { + bic = test; + ix -= 1u << j; + } + } + } + // ix is the smallest value such that reduce(ix..th).b == 0 + if (ix > 0) { + return ix - 1; + } else { + return ~0u - bic.a; + } +} + +Bic load_bic(uint ix) { + uint base = (conf.clip_bic_alloc.offset >> 2) + 2 * ix; + return Bic(memory[base], memory[base + 1]); +} + +ClipEl load_clip_el(uint ix) { + uint base = (conf.clip_stack_alloc.offset >> 2) + 5 * ix; + uint parent_ix = memory[base]; + float x0 = uintBitsToFloat(memory[base + 1]); + float y0 = uintBitsToFloat(memory[base + 2]); + float x1 = uintBitsToFloat(memory[base + 3]); + float y1 = uintBitsToFloat(memory[base + 4]); + vec4 bbox = vec4(x0, y0, x1, y1); + return ClipEl(parent_ix, bbox); +} + +uint load_path_ix(uint ix) { + // This is one approach to a partial final block. Another would be + // to do a memset to the padding in the command queue. + if (ix < conf.n_clip) { + return memory[(conf.clip_alloc.offset >> 2) + ix]; + } else { + // EndClip tags don't implicate further loads. + return 0x80000000; + } +} + +void store_clip_bbox(uint ix, vec4 bbox) { + uint base = (conf.clip_bbox_alloc.offset >> 2) + 4 * ix; + memory[base] = floatBitsToUint(bbox.x); + memory[base + 1] = floatBitsToUint(bbox.y); + memory[base + 2] = floatBitsToUint(bbox.z); + memory[base + 3] = floatBitsToUint(bbox.w); +} + +void main() { + // materialize stack up to the start of this partition. This + // is based on the pure stack monoid, but with two additions. + + // First, (this only matters if the stack goes deeper than the + // partition size, which might be unlikely in practice), the + // topmost stack element from each partition is picked, then an + // exclusive scan of those. Also note that if this is skipped, + // a scan is not needed in the reduce stage. + + // Second, after the stream compaction, do a scan of the retrieved + // bbox values. + uint th = gl_LocalInvocationID.x; + Bic bic = Bic(0, 0); + if (th < gl_WorkGroupID.x) { + bic = load_bic(th); + } + sh_bic[th] = bic; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + if (th + (1u << i) < WG_SIZE) { + Bic other = sh_bic[th + (1u << i)]; + bic = bic_combine(bic, other); + } + barrier(); + sh_bic[th] = bic; + } + barrier(); + uint stack_size = sh_bic[0].b; + + // TODO: do bbox scan here (to unlock greater stack depth) + + // binary search in stack + uint sp = PARTITION_SIZE - 1 - th; + uint ix = 0; + for (uint i = 0; i < LG_WG_SIZE; i++) { + uint probe = ix + (uint(PARTITION_SIZE / 2) >> i); + if (sp < sh_bic[probe].b) { + ix = probe; + } + } + // ix is largest value such that sp < sh_bic[ix].b (if any) + uint b = sh_bic[ix].b; + vec4 bbox = vec4(-1e9, -1e9, 1e9, 1e9); + if (sp < b) { + // maybe store the index here for future use? + ClipEl el = load_clip_el(ix * PARTITION_SIZE + b - sp - 1); + sh_stack[th] = el.parent_ix; + bbox = el.bbox; + // other element values here? + } + + // forward scan of bbox values of prefix stack + for (uint i = 0; i < LG_WG_SIZE; i++) { + sh_stack_bbox[th] = bbox; + barrier(); + if (th >= (1u << i)) { + bbox = bbox_intersect(sh_stack_bbox[th - (1u << i)], bbox); + } + barrier(); + } + sh_stack_bbox[th] = bbox; + + // Read input and compute bicyclic semigroup binary tree + uint inp = load_path_ix(gl_GlobalInvocationID.x); + bool is_push = int(inp) >= 0; + bic = Bic(1 - uint(is_push), uint(is_push)); + sh_bic[th] = bic; + if (is_push) { + bbox = load_path_bbox(inp); + } else { + bbox = vec4(-1e9, -1e9, 1e9, 1e9); + } + uint inbase = 0; + for (uint i = 0; i < LG_WG_SIZE - 1; i++) { + uint outbase = 2 * WG_SIZE - (1u << (LG_WG_SIZE - i)); + barrier(); + if (th < (1u << (LG_WG_SIZE - 1 - i))) { + sh_bic[outbase + th] = bic_combine(sh_bic[inbase + th * 2], sh_bic[inbase + th * 2 + 1]); + } + inbase = outbase; + } + barrier(); + // Search for predecessor node + bic = Bic(0, 0); + uint link = search_link(bic); + // we use N_SEQ > 1 convention here: + // link >= 0 is index within partition + // link < 0 is reference to stack + + // We want grandparent bbox for pop nodes, so follow those links. + sh_link[th] = link; + barrier(); + uint grandparent; + if (int(link) >= 0) { + grandparent = sh_link[link]; + } else { + grandparent = link - 1; + } + + // Resolve parent + uint parent; + if (int(link) >= 0) { + parent = gl_WorkGroupID.x * PARTITION_SIZE + link; + } else if (int(link + stack_size) >= 0) { + parent = sh_stack[PARTITION_SIZE + link]; + } else { + parent = ~0u; + } + + // bbox scan along parent links + for (uint i = 0; i < LG_WG_SIZE; i++) { + // sh_link was already stored for first iteration + if (i != 0) { + sh_link[th] = link; + } + sh_bbox[th] = bbox; + barrier(); + if (int(link) >= 0) { + bbox = bbox_intersect(sh_bbox[link], bbox); + link = sh_link[link]; + } + barrier(); + } + if (int(link + stack_size) >= 0) { + bbox = bbox_intersect(sh_stack_bbox[PARTITION_SIZE + link], bbox); + } + // At this point, bbox is the reduction of bounding boxes along the tree. + sh_bbox[th] = bbox; + barrier(); + + uint path_ix = inp; + if (!is_push && gl_GlobalInvocationID.x < conf.n_clip) { + // Is this load expensive? If so, it's loaded earlier for in-partition + // and is in the ClipEl for cross-partition. + // If not, can probably get rid of it in the stack intermediate buf. + path_ix = load_path_ix(parent); + uint drawmonoid_out_base = (conf.drawmonoid_alloc.offset >> 2) + 2 * ~inp; + // Fix up drawmonoid so path_ix at EndClip matches BeginClip + memory[drawmonoid_out_base] = path_ix; + + if (int(grandparent) >= 0) { + bbox = sh_bbox[grandparent]; + } else if (int(grandparent + stack_size) >= 0) { + bbox = sh_stack_bbox[PARTITION_SIZE + grandparent]; + } else { + bbox = vec4(-1e9, -1e9, 1e9, 1e9); + } + } + store_clip_bbox(gl_GlobalInvocationID.x, bbox); +} diff --git a/piet-gpu/shader/clip_reduce.comp b/piet-gpu/shader/clip_reduce.comp new file mode 100644 index 0000000..c62b239 --- /dev/null +++ b/piet-gpu/shader/clip_reduce.comp @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// The reduce pass for clip stack processing. + +// The primary input is a sequence of path ids representing paths to +// push, with a special value of ~0 to represent pop. + +// For each path, the bounding box is found in the anno stream +// (anno_alloc), though this may change. + +// Output is a stack monoid reduction for the partition. The Bic +// is stored in the BicBuf, and the stack slice in StackBuf. + +// Note: for this shader, only pushes are represented in the stack +// monoid reduction output, so we don't have to worry about the +// interpretation of pops. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +#define LG_WG_SIZE (7 + LG_WG_FACTOR) +#define WG_SIZE (1 << LG_WG_SIZE) +#define PARTITION_SIZE WG_SIZE + +layout(local_size_x = WG_SIZE) in; + +layout(binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +#include "annotated.h" + +// The intermediate state for clip processing. +struct ClipEl { + // index of parent node + uint parent_ix; + // bounding box + vec4 bbox; +}; + +// The bicyclic monoid +struct Bic { + uint a; + uint b; +}; + +Bic bic_combine(Bic x, Bic y) { + uint m = min(x.b, y.a); + return Bic(x.a + y.a - m, x.b + y.b - m); +} + +shared Bic sh_bic[WG_SIZE]; +shared uint sh_parent[WG_SIZE]; +shared uint sh_path_ix[WG_SIZE]; +shared vec4 sh_bbox[WG_SIZE]; + +// Load path's bbox from bbox (as written by pathseg). +vec4 load_path_bbox(uint path_ix) { + uint base = (conf.bbox_alloc.offset >> 2) + 6 * path_ix; + float bbox_l = float(memory[base]) - 32768.0; + float bbox_t = float(memory[base + 1]) - 32768.0; + float bbox_r = float(memory[base + 2]) - 32768.0; + float bbox_b = float(memory[base + 3]) - 32768.0; + vec4 bbox = vec4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +vec4 bbox_intersect(vec4 a, vec4 b) { + return vec4(max(a.xy, b.xy), min(a.zw, b.zw)); +} + +void store_bic(uint ix, Bic bic) { + uint base = (conf.clip_bic_alloc.offset >> 2) + 2 * ix; + memory[base] = bic.a; + memory[base + 1] = bic.b; +} + +void store_clip_el(uint ix, ClipEl el) { + uint base = (conf.clip_stack_alloc.offset >> 2) + 5 * ix; + memory[base] = el.parent_ix; + memory[base + 1] = floatBitsToUint(el.bbox.x); + memory[base + 2] = floatBitsToUint(el.bbox.y); + memory[base + 3] = floatBitsToUint(el.bbox.z); + memory[base + 4] = floatBitsToUint(el.bbox.w); +} + +void main() { + uint th = gl_LocalInvocationID.x; + uint inp = memory[(conf.clip_alloc.offset >> 2) + gl_GlobalInvocationID.x]; + bool is_push = int(inp) >= 0; + // reverse scan of bicyclic semigroup + Bic bic = Bic(1 - uint(is_push), uint(is_push)); + sh_bic[gl_LocalInvocationID.x] = bic; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + if (th + (1u << i) < WG_SIZE) { + Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)]; + bic = bic_combine(bic, other); + } + barrier(); + sh_bic[th] = bic; + } + if (th == 0) { + store_bic(gl_WorkGroupID.x, bic); + } + barrier(); + uint size = sh_bic[0].b; + bic = Bic(0, 0); + if (th + 1 < WG_SIZE) { + bic = sh_bic[th + 1]; + } + if (is_push && bic.a == 0) { + uint local_ix = size - bic.b - 1; + sh_parent[local_ix] = th; + sh_path_ix[local_ix] = inp; + } + barrier(); + // Do forward scan of bounding box intersection + vec4 bbox; + uint path_ix; + if (th < size) { + path_ix = sh_path_ix[th]; + bbox = load_path_bbox(path_ix); + } + // Not necessary if depth is bounded by wg size +#if 0 + for (uint i = 0; i < LG_WG_SIZE; i++) { + // We gate so we never access uninit data, but it might + // be more efficient to avoid the conditionals. + if (th < size) { + sh_bbox[th] = bbox; + } + barrier(); + if (th < size && th >= (1u << i)) { + bbox = bbox_intersect(sh_bbox[th - (1u << i)], bbox); + } + barrier(); + } +#endif + if (th < size) { + uint parent_ix = sh_parent[th] + gl_WorkGroupID.x * PARTITION_SIZE; + ClipEl el = ClipEl(parent_ix, bbox); + store_clip_el(gl_GlobalInvocationID.x, el); + } +} diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index bf5f949..98ab270 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -136,9 +136,6 @@ void main() { // currently in a clip for which the entire tile has an alpha of zero, and // the value is the depth after the "begin clip" of that element. uint clip_zero_depth = 0; - // State for the "clip one" optimization. If bit `i` is set, then that means - // that the clip pushed at depth `i` has an alpha of all one. - uint clip_one_mask = 0; // I'm sure we can figure out how to do this with at least one fewer register... // Items up to rd_ix have been read from sh_elements @@ -227,9 +224,8 @@ void main() { case Annotated_LinGradient: case Annotated_BeginClip: case Annotated_EndClip: - // We have one "path" for each element, even if the element isn't - // actually a path (currently EndClip, but images etc in the future). - uint path_ix = element_ix; + uint drawmonoid_base = (conf.drawmonoid_alloc.offset >> 2) + 2 * element_ix; + uint path_ix = memory[drawmonoid_base]; Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size)); uint stride = path.bbox.z - path.bbox.x; sh_tile_stride[th_ix] = stride; @@ -283,15 +279,15 @@ void main() { uint x = sh_tile_x0[el_ix] + seq_ix % width; uint y = sh_tile_y0[el_ix] + seq_ix / width; bool include_tile = false; - if (tag == Annotated_BeginClip || tag == Annotated_EndClip) { - include_tile = true; - } else if (mem_ok) { + if (mem_ok) { Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size)); - // Include the path in the tile if - // - the tile contains at least a segment (tile offset non-zero) - // - the tile is completely covered (backdrop non-zero) - include_tile = tile.tile.offset != 0 || tile.backdrop != 0; + bool is_clip = tag == Annotated_BeginClip || tag == Annotated_EndClip; + // Always include the tile if it contains a path segment. + // For draws, include the tile if it is solid. + // For clips, include the tile if it is empty - this way, logic + // below will suppress the drawing of inner elements. + include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip; } if (include_tile) { uint el_slice = el_ix / 32; @@ -378,33 +374,26 @@ void main() { (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); if (tile.tile.offset == 0 && tile.backdrop == 0) { clip_zero_depth = clip_depth + 1; - } else if (tile.tile.offset == 0 && clip_depth < 32) { - clip_one_mask |= (1u << clip_depth); } else { - AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref); if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { break; } - write_fill(cmd_alloc, cmd_ref, tag.flags, tile, begin_clip.linewidth); Cmd_BeginClip_write(cmd_alloc, cmd_ref); cmd_ref.offset += 4; - if (clip_depth < 32) { - clip_one_mask &= ~(1u << clip_depth); - } } clip_depth++; break; case Annotated_EndClip: + tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), + TileRef(sh_tile_base[element_ref_ix] + + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); clip_depth--; - if (clip_depth >= 32 || (clip_one_mask & (1u << clip_depth)) == 0) { - if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { - break; - } - Cmd_Solid_write(cmd_alloc, cmd_ref); - cmd_ref.offset += 4; - Cmd_EndClip_write(cmd_alloc, cmd_ref); - cmd_ref.offset += 4; + if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) { + break; } + write_fill(cmd_alloc, cmd_ref, MODE_NONZERO, tile, 0.0); + Cmd_EndClip_write(cmd_alloc, cmd_ref); + cmd_ref.offset += 4; break; } } else { diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp index c020847..f236b7f 100644 --- a/piet-gpu/shader/draw_leaf.comp +++ b/piet-gpu/shader/draw_leaf.comp @@ -72,9 +72,14 @@ void main() { } uint out_ix = gl_GlobalInvocationID.x * N_ROWS; uint out_base = (conf.drawmonoid_alloc.offset >> 2) + out_ix * 2; + uint clip_out_base = conf.clip_alloc.offset >> 2; AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + out_ix * Annotated_size); for (uint i = 0; i < N_ROWS; i++) { - Monoid m = combine_tag_monoid(row, local[i]); + Monoid m = row; + if (i > 0) { + m = combine_tag_monoid(m, local[i - 1]); + } + // m now holds exclusive scan of draw monoid memory[out_base + i * 2] = m.path_ix; memory[out_base + i * 2 + 1] = m.clip_ix; @@ -83,8 +88,9 @@ void main() { // later stages read scene + bbox etc. ElementRef this_ref = Element_index(ref, i); tag_word = Element_tag(this_ref).tag; - if (tag_word == Element_FillColor || tag_word == Element_FillLinGradient || tag_word == Element_FillImage) { - uint bbox_offset = (conf.bbox_alloc.offset >> 2) + 6 * (m.path_ix - 1); + if (tag_word == Element_FillColor || tag_word == Element_FillLinGradient || tag_word == Element_FillImage || + tag_word == Element_BeginClip) { + uint bbox_offset = (conf.bbox_alloc.offset >> 2) + 6 * m.path_ix; float bbox_l = float(memory[bbox_offset]) - 32768.0; float bbox_t = float(memory[bbox_offset + 1]) - 32768.0; float bbox_r = float(memory[bbox_offset + 2]) - 32768.0; @@ -142,21 +148,27 @@ void main() { anno_img.offset = fill_img.offset; Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img); break; + case Element_BeginClip: + AnnoBeginClip anno_begin_clip; + anno_begin_clip.bbox = bbox; + anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke + Annotated_BeginClip_write(conf.anno_alloc, out_ref, 0, anno_begin_clip); + break; } - } else if (tag_word == Element_BeginClip) { - Clip begin_clip = Element_BeginClip_read(this_ref); - AnnoBeginClip anno_begin_clip; - // This is the absolute bbox, it's been transformed during encoding. - anno_begin_clip.bbox = begin_clip.bbox; - anno_begin_clip.linewidth = 0.0; // don't support clip-with-stroke - Annotated_BeginClip_write(conf.anno_alloc, out_ref, 0, anno_begin_clip); } else if (tag_word == Element_EndClip) { - Clip end_clip = Element_EndClip_read(this_ref); AnnoEndClip anno_end_clip; - // This bbox is expected to be the same as the begin one. - anno_end_clip.bbox = end_clip.bbox; + // The actual bbox will be reconstructed from clip stream output. + anno_end_clip.bbox = vec4(-1e9, -1e9, 1e9, 1e9); Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip); } + // Generate clip stream. + if (tag_word == Element_BeginClip || tag_word == Element_EndClip) { + uint path_ix = ~(out_ix + i); + if (tag_word == Element_BeginClip) { + path_ix = m.path_ix; + } + memory[clip_out_base + m.clip_ix] = path_ix; + } out_ref.offset += Annotated_size; } } diff --git a/piet-gpu/shader/gen/backdrop.hlsl b/piet-gpu/shader/gen/backdrop.hlsl index 65b969d..a2e71a8 100644 --- a/piet-gpu/shader/gen/backdrop.hlsl +++ b/piet-gpu/shader/gen/backdrop.hlsl @@ -44,8 +44,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/backdrop.msl b/piet-gpu/shader/gen/backdrop.msl index 7640ed0..be670fc 100644 --- a/piet-gpu/shader/gen/backdrop.msl +++ b/piet-gpu/shader/gen/backdrop.msl @@ -63,8 +63,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/backdrop.spv b/piet-gpu/shader/gen/backdrop.spv index f3a7824..f90bf6e 100644 Binary files a/piet-gpu/shader/gen/backdrop.spv and b/piet-gpu/shader/gen/backdrop.spv differ diff --git a/piet-gpu/shader/gen/backdrop_lg.hlsl b/piet-gpu/shader/gen/backdrop_lg.hlsl index 57bb6d3..5071af2 100644 --- a/piet-gpu/shader/gen/backdrop_lg.hlsl +++ b/piet-gpu/shader/gen/backdrop_lg.hlsl @@ -44,8 +44,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/backdrop_lg.msl b/piet-gpu/shader/gen/backdrop_lg.msl index 1c68980..31cd6cd 100644 --- a/piet-gpu/shader/gen/backdrop_lg.msl +++ b/piet-gpu/shader/gen/backdrop_lg.msl @@ -63,8 +63,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/backdrop_lg.spv b/piet-gpu/shader/gen/backdrop_lg.spv index a77d46d..3f90d2e 100644 Binary files a/piet-gpu/shader/gen/backdrop_lg.spv and b/piet-gpu/shader/gen/backdrop_lg.spv differ diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil index 9ce0add..26b5f16 100644 Binary files a/piet-gpu/shader/gen/bbox_clear.dxil and b/piet-gpu/shader/gen/bbox_clear.dxil differ diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl index 64b109f..903d84c 100644 --- a/piet-gpu/shader/gen/bbox_clear.hlsl +++ b/piet-gpu/shader/gen/bbox_clear.hlsl @@ -17,8 +17,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -39,7 +44,7 @@ struct SPIRV_Cross_Input void comp_main() { uint ix = gl_GlobalInvocationID.x; - if (ix < _21.Load(52)) + if (ix < _21.Load(68)) { uint out_ix = (_21.Load(40) >> uint(2)) + (6u * ix); _45.Store(out_ix * 4 + 8, 65535u); diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl index 9af5b11..e80f15e 100644 --- a/piet-gpu/shader/gen/bbox_clear.msl +++ b/piet-gpu/shader/gen/bbox_clear.msl @@ -22,8 +22,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/bbox_clear.spv b/piet-gpu/shader/gen/bbox_clear.spv index c459502..e3e88d7 100644 Binary files a/piet-gpu/shader/gen/bbox_clear.spv and b/piet-gpu/shader/gen/bbox_clear.spv differ diff --git a/piet-gpu/shader/gen/binning.dxil b/piet-gpu/shader/gen/binning.dxil index 50034cc..fe53f27 100644 Binary files a/piet-gpu/shader/gen/binning.dxil and b/piet-gpu/shader/gen/binning.dxil differ diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl index b13db37..e4de1b9 100644 --- a/piet-gpu/shader/gen/binning.hlsl +++ b/piet-gpu/shader/gen/binning.hlsl @@ -9,16 +9,6 @@ struct MallocResult bool failed; }; -struct AnnoEndClipRef -{ - uint offset; -}; - -struct AnnoEndClip -{ - float4 bbox; -}; - struct AnnotatedRef { uint offset; @@ -40,6 +30,12 @@ struct BinInstance uint element_ix; }; +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; +}; + struct Config { uint n_elements; @@ -54,8 +50,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -64,8 +65,8 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -RWByteAddressBuffer _84 : register(u0, space0); -ByteAddressBuffer _253 : register(t1, space0); +RWByteAddressBuffer _94 : register(u0, space0); +ByteAddressBuffer _202 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -93,7 +94,7 @@ uint read_mem(Alloc alloc, uint offset) { return 0u; } - uint v = _84.Load(offset * 4 + 8); + uint v = _94.Load(offset * 4 + 8); return v; } @@ -102,36 +103,53 @@ AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) Alloc param = a; uint param_1 = ref.offset >> uint(2); uint tag_and_flags = read_mem(param, param_1); - AnnotatedTag _221 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; - return _221; + AnnotatedTag _181 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _181; } -AnnoEndClip AnnoEndClip_read(Alloc a, AnnoEndClipRef ref) +DrawMonoid load_draw_monoid(uint element_ix) +{ + uint base = (_202.Load(44) >> uint(2)) + (2u * element_ix); + uint path_ix = _94.Load(base * 4 + 8); + uint clip_ix = _94.Load((base + 1u) * 4 + 8); + DrawMonoid _222 = { path_ix, clip_ix }; + return _222; +} + +float4 load_clip_bbox(uint clip_ix) +{ + uint base = (_202.Load(60) >> uint(2)) + (4u * clip_ix); + float x0 = asfloat(_94.Load(base * 4 + 8)); + float y0 = asfloat(_94.Load((base + 1u) * 4 + 8)); + float x1 = asfloat(_94.Load((base + 2u) * 4 + 8)); + float y1 = asfloat(_94.Load((base + 3u) * 4 + 8)); + float4 bbox = float4(x0, y0, x1, y1); + return bbox; +} + +float4 load_path_bbox(uint path_ix) +{ + uint base = (_202.Load(40) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_94.Load(base * 4 + 8)) - 32768.0f; + float bbox_t = float(_94.Load((base + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_94.Load((base + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_94.Load((base + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +float4 bbox_intersect(float4 a, float4 b) +{ + return float4(max(a.xy, b.xy), min(a.zw, b.zw)); +} + +void store_path_bbox(AnnotatedRef ref, float4 bbox) { uint ix = ref.offset >> uint(2); - Alloc param = a; - uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1); - Alloc param_2 = a; - uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3); - Alloc param_4 = a; - uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5); - Alloc param_6 = a; - uint param_7 = ix + 3u; - uint raw3 = read_mem(param_6, param_7); - AnnoEndClip s; - s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); - return s; -} - -AnnoEndClip Annotated_EndClip_read(Alloc a, AnnotatedRef ref) -{ - AnnoEndClipRef _228 = { ref.offset + 4u }; - Alloc param = a; - AnnoEndClipRef param_1 = _228; - return AnnoEndClip_read(param, param_1); + _94.Store((ix + 1u) * 4 + 8, asuint(bbox.x)); + _94.Store((ix + 2u) * 4 + 8, asuint(bbox.y)); + _94.Store((ix + 3u) * 4 + 8, asuint(bbox.z)); + _94.Store((ix + 4u) * 4 + 8, asuint(bbox.w)); } Alloc new_alloc(uint offset, uint size, bool mem_ok) @@ -143,22 +161,22 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok) MallocResult malloc(uint size) { - uint _90; - _84.InterlockedAdd(0, size, _90); - uint offset = _90; - uint _97; - _84.GetDimensions(_97); - _97 = (_97 - 8) / 4; + uint _100; + _94.InterlockedAdd(0, size, _100); + uint offset = _100; + uint _107; + _94.GetDimensions(_107); + _107 = (_107 - 8) / 4; MallocResult r; - r.failed = (offset + size) > uint(int(_97) * 4); + r.failed = (offset + size) > uint(int(_107) * 4); uint param = offset; uint param_1 = size; bool param_2 = !r.failed; r.alloc = new_alloc(param, param_1, param_2); if (r.failed) { - uint _119; - _84.InterlockedMax(4, 1u, _119); + uint _129; + _94.InterlockedMax(4, 1u, _129); return r; } return r; @@ -172,7 +190,7 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _84.Store(offset * 4 + 8, val); + _94.Store(offset * 4 + 8, val); } void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) @@ -186,7 +204,7 @@ void BinInstance_write(Alloc a, BinInstanceRef ref, BinInstance s) void comp_main() { - uint my_n_elements = _253.Load(0); + uint my_n_elements = _202.Load(0); uint my_partition = gl_WorkGroupID.x; for (uint i = 0u; i < 8u; i++) { @@ -198,15 +216,15 @@ void comp_main() } GroupMemoryBarrierWithGroupSync(); uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; - AnnotatedRef _308 = { _253.Load(32) + (element_ix * 40u) }; - AnnotatedRef ref = _308; + AnnotatedRef _415 = { _202.Load(32) + (element_ix * 40u) }; + AnnotatedRef ref = _415; uint tag = 0u; if (element_ix < my_n_elements) { - Alloc _318; - _318.offset = _253.Load(32); + Alloc _425; + _425.offset = _202.Load(32); Alloc param; - param.offset = _318.offset; + param.offset = _425.offset; AnnotatedRef param_1 = ref; tag = Annotated_tag(param, param_1).tag; } @@ -222,21 +240,38 @@ void comp_main() case 4u: case 5u: { - Alloc _336; - _336.offset = _253.Load(32); - Alloc param_2; - param_2.offset = _336.offset; - AnnotatedRef param_3 = ref; - AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3); - x0 = int(floor(clip.bbox.x * 0.00390625f)); - y0 = int(floor(clip.bbox.y * 0.00390625f)); - x1 = int(ceil(clip.bbox.z * 0.00390625f)); - y1 = int(ceil(clip.bbox.w * 0.00390625f)); + uint param_2 = element_ix; + DrawMonoid draw_monoid = load_draw_monoid(param_2); + uint path_ix = draw_monoid.path_ix; + float4 clip_bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + uint clip_ix = draw_monoid.clip_ix; + if (clip_ix > 0u) + { + uint param_3 = clip_ix - 1u; + clip_bbox = load_clip_bbox(param_3); + } + uint param_4 = path_ix; + float4 path_bbox = load_path_bbox(param_4); + float4 param_5 = path_bbox; + float4 param_6 = clip_bbox; + float4 bbox = bbox_intersect(param_5, param_6); + float4 _473 = bbox; + float4 _475 = bbox; + float2 _477 = max(_473.xy, _475.zw); + bbox.z = _477.x; + bbox.w = _477.y; + AnnotatedRef param_7 = ref; + float4 param_8 = bbox; + store_path_bbox(param_7, param_8); + x0 = int(floor(bbox.x * 0.00390625f)); + y0 = int(floor(bbox.y * 0.00390625f)); + x1 = int(ceil(bbox.z * 0.00390625f)); + y1 = int(ceil(bbox.w * 0.00390625f)); break; } } - uint width_in_bins = ((_253.Load(8) + 16u) - 1u) / 16u; - uint height_in_bins = ((_253.Load(12) + 16u) - 1u) / 16u; + uint width_in_bins = ((_202.Load(8) + 16u) - 1u) / 16u; + uint height_in_bins = ((_202.Load(12) + 16u) - 1u) / 16u; x0 = clamp(x0, 0, int(width_in_bins)); x1 = clamp(x1, x0, int(width_in_bins)); y0 = clamp(y0, 0, int(height_in_bins)); @@ -251,8 +286,8 @@ void comp_main() uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); while (y < y1) { - uint _437; - InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _437); + uint _581; + InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _581); x++; if (x == x1) { @@ -267,15 +302,15 @@ void comp_main() element_count += uint(int(countbits(bitmaps[i_1][gl_LocalInvocationID.x]))); count[i_1][gl_LocalInvocationID.x] = element_count; } - uint param_4 = 0u; - uint param_5 = 0u; - bool param_6 = true; - Alloc chunk_alloc = new_alloc(param_4, param_5, param_6); + uint param_9 = 0u; + uint param_10 = 0u; + bool param_11 = true; + Alloc chunk_alloc = new_alloc(param_9, param_10, param_11); if (element_count != 0u) { - uint param_7 = element_count * 4u; - MallocResult _487 = malloc(param_7); - MallocResult chunk = _487; + uint param_12 = element_count * 4u; + MallocResult _631 = malloc(param_12); + MallocResult chunk = _631; chunk_alloc = chunk.alloc; sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; if (chunk.failed) @@ -283,32 +318,32 @@ void comp_main() sh_alloc_failed = true; } } - uint out_ix = (_253.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); - Alloc _516; - _516.offset = _253.Load(20); - Alloc param_8; - param_8.offset = _516.offset; - uint param_9 = out_ix; - uint param_10 = element_count; - write_mem(param_8, param_9, param_10); - Alloc _528; - _528.offset = _253.Load(20); - Alloc param_11; - param_11.offset = _528.offset; - uint param_12 = out_ix + 1u; - uint param_13 = chunk_alloc.offset; - write_mem(param_11, param_12, param_13); + uint out_ix = (_202.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc _660; + _660.offset = _202.Load(20); + Alloc param_13; + param_13.offset = _660.offset; + uint param_14 = out_ix; + uint param_15 = element_count; + write_mem(param_13, param_14, param_15); + Alloc _672; + _672.offset = _202.Load(20); + Alloc param_16; + param_16.offset = _672.offset; + uint param_17 = out_ix + 1u; + uint param_18 = chunk_alloc.offset; + write_mem(param_16, param_17, param_18); GroupMemoryBarrierWithGroupSync(); - bool _543; + bool _687; if (!sh_alloc_failed) { - _543 = _84.Load(4) != 0u; + _687 = _94.Load(4) != 0u; } else { - _543 = sh_alloc_failed; + _687 = sh_alloc_failed; } - if (_543) + if (_687) { return; } @@ -327,12 +362,12 @@ void comp_main() } Alloc out_alloc = sh_chunk_alloc[bin_ix]; uint out_offset = out_alloc.offset + (idx * 4u); - BinInstanceRef _605 = { out_offset }; - BinInstance _607 = { element_ix }; - Alloc param_14 = out_alloc; - BinInstanceRef param_15 = _605; - BinInstance param_16 = _607; - BinInstance_write(param_14, param_15, param_16); + BinInstanceRef _749 = { out_offset }; + BinInstance _751 = { element_ix }; + Alloc param_19 = out_alloc; + BinInstanceRef param_20 = _749; + BinInstance param_21 = _751; + BinInstance_write(param_19, param_20, param_21); } x++; if (x == x1) diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl index 42a11ee..0e3b6c8 100644 --- a/piet-gpu/shader/gen/binning.msl +++ b/piet-gpu/shader/gen/binning.msl @@ -18,16 +18,6 @@ struct MallocResult bool failed; }; -struct AnnoEndClipRef -{ - uint offset; -}; - -struct AnnoEndClip -{ - float4 bbox; -}; - struct AnnotatedRef { uint offset; @@ -49,6 +39,12 @@ struct BinInstance uint element_ix; }; +struct DrawMonoid +{ + uint path_ix; + uint clip_ix; +}; + struct Memory { uint mem_offset; @@ -75,8 +71,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -97,7 +98,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset) } static inline __attribute__((always_inline)) -uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_84, constant uint& v_84BufferSize) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_94, constant uint& v_94BufferSize) { Alloc param = alloc; uint param_1 = offset; @@ -105,46 +106,66 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor { return 0u; } - uint v = v_84.memory[offset]; + uint v = v_94.memory[offset]; return v; } static inline __attribute__((always_inline)) -AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_94, constant uint& v_94BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); - uint tag_and_flags = read_mem(param, param_1, v_84, v_84BufferSize); + uint tag_and_flags = read_mem(param, param_1, v_94, v_94BufferSize); return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; } static inline __attribute__((always_inline)) -AnnoEndClip AnnoEndClip_read(thread const Alloc& a, thread const AnnoEndClipRef& ref, device Memory& v_84, constant uint& v_84BufferSize) +DrawMonoid load_draw_monoid(thread const uint& element_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202) { - uint ix = ref.offset >> uint(2); - Alloc param = a; - uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_84, v_84BufferSize); - Alloc param_2 = a; - uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_84, v_84BufferSize); - Alloc param_4 = a; - uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_84, v_84BufferSize); - Alloc param_6 = a; - uint param_7 = ix + 3u; - uint raw3 = read_mem(param_6, param_7, v_84, v_84BufferSize); - AnnoEndClip s; - s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); - return s; + uint base = (v_202.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix); + uint path_ix = v_94.memory[base]; + uint clip_ix = v_94.memory[base + 1u]; + return DrawMonoid{ path_ix, clip_ix }; } static inline __attribute__((always_inline)) -AnnoEndClip Annotated_EndClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_84, constant uint& v_84BufferSize) +float4 load_clip_bbox(thread const uint& clip_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202) { - Alloc param = a; - AnnoEndClipRef param_1 = AnnoEndClipRef{ ref.offset + 4u }; - return AnnoEndClip_read(param, param_1, v_84, v_84BufferSize); + uint base = (v_202.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * clip_ix); + float x0 = as_type(v_94.memory[base]); + float y0 = as_type(v_94.memory[base + 1u]); + float x1 = as_type(v_94.memory[base + 2u]); + float y1 = as_type(v_94.memory[base + 3u]); + float4 bbox = float4(x0, y0, x1, y1); + return bbox; +} + +static inline __attribute__((always_inline)) +float4 load_path_bbox(thread const uint& path_ix, device Memory& v_94, constant uint& v_94BufferSize, const device ConfigBuf& v_202) +{ + uint base = (v_202.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix); + float bbox_l = float(v_94.memory[base]) - 32768.0; + float bbox_t = float(v_94.memory[base + 1u]) - 32768.0; + float bbox_r = float(v_94.memory[base + 2u]) - 32768.0; + float bbox_b = float(v_94.memory[base + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +static inline __attribute__((always_inline)) +float4 bbox_intersect(thread const float4& a, thread const float4& b) +{ + return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw)); +} + +static inline __attribute__((always_inline)) +void store_path_bbox(thread const AnnotatedRef& ref, thread const float4& bbox, device Memory& v_94, constant uint& v_94BufferSize) +{ + uint ix = ref.offset >> uint(2); + v_94.memory[ix + 1u] = as_type(bbox.x); + v_94.memory[ix + 2u] = as_type(bbox.y); + v_94.memory[ix + 3u] = as_type(bbox.z); + v_94.memory[ix + 4u] = as_type(bbox.w); } static inline __attribute__((always_inline)) @@ -156,26 +177,26 @@ Alloc new_alloc(thread const uint& offset, thread const uint& size, thread const } static inline __attribute__((always_inline)) -MallocResult malloc(thread const uint& size, device Memory& v_84, constant uint& v_84BufferSize) +MallocResult malloc(thread const uint& size, device Memory& v_94, constant uint& v_94BufferSize) { - uint _90 = atomic_fetch_add_explicit((device atomic_uint*)&v_84.mem_offset, size, memory_order_relaxed); - uint offset = _90; + uint _100 = atomic_fetch_add_explicit((device atomic_uint*)&v_94.mem_offset, size, memory_order_relaxed); + uint offset = _100; MallocResult r; - r.failed = (offset + size) > uint(int((v_84BufferSize - 8) / 4) * 4); + r.failed = (offset + size) > uint(int((v_94BufferSize - 8) / 4) * 4); uint param = offset; uint param_1 = size; bool param_2 = !r.failed; r.alloc = new_alloc(param, param_1, param_2); if (r.failed) { - uint _119 = atomic_fetch_max_explicit((device atomic_uint*)&v_84.mem_error, 1u, memory_order_relaxed); + uint _129 = atomic_fetch_max_explicit((device atomic_uint*)&v_94.mem_error, 1u, memory_order_relaxed); return r; } return r; } static inline __attribute__((always_inline)) -void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_84, constant uint& v_84BufferSize) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_94, constant uint& v_94BufferSize) { Alloc param = alloc; uint param_1 = offset; @@ -183,27 +204,27 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons { return; } - v_84.memory[offset] = val; + v_94.memory[offset] = val; } static inline __attribute__((always_inline)) -void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_84, constant uint& v_84BufferSize) +void BinInstance_write(thread const Alloc& a, thread const BinInstanceRef& ref, thread const BinInstance& s, device Memory& v_94, constant uint& v_94BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.element_ix; - write_mem(param, param_1, param_2, v_84, v_84BufferSize); + write_mem(param, param_1, param_2, v_94, v_94BufferSize); } -kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_84 [[buffer(0)]], const device ConfigBuf& _253 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_94 [[buffer(0)]], const device ConfigBuf& v_202 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { threadgroup uint bitmaps[8][256]; threadgroup short sh_alloc_failed; threadgroup uint count[8][256]; threadgroup Alloc sh_chunk_alloc[256]; - constant uint& v_84BufferSize = spvBufferSizeConstants[0]; - uint my_n_elements = _253.conf.n_elements; + constant uint& v_94BufferSize = spvBufferSizeConstants[0]; + uint my_n_elements = v_202.conf.n_elements; uint my_partition = gl_WorkGroupID.x; for (uint i = 0u; i < 8u; i++) { @@ -215,14 +236,14 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } threadgroup_barrier(mem_flags::mem_threadgroup); uint element_ix = (my_partition * 256u) + gl_LocalInvocationID.x; - AnnotatedRef ref = AnnotatedRef{ _253.conf.anno_alloc.offset + (element_ix * 40u) }; + AnnotatedRef ref = AnnotatedRef{ v_202.conf.anno_alloc.offset + (element_ix * 40u) }; uint tag = 0u; if (element_ix < my_n_elements) { Alloc param; - param.offset = _253.conf.anno_alloc.offset; + param.offset = v_202.conf.anno_alloc.offset; AnnotatedRef param_1 = ref; - tag = Annotated_tag(param, param_1, v_84, v_84BufferSize).tag; + tag = Annotated_tag(param, param_1, v_94, v_94BufferSize).tag; } int x0 = 0; int y0 = 0; @@ -236,19 +257,38 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M case 4u: case 5u: { - Alloc param_2; - param_2.offset = _253.conf.anno_alloc.offset; - AnnotatedRef param_3 = ref; - AnnoEndClip clip = Annotated_EndClip_read(param_2, param_3, v_84, v_84BufferSize); - x0 = int(floor(clip.bbox.x * 0.00390625)); - y0 = int(floor(clip.bbox.y * 0.00390625)); - x1 = int(ceil(clip.bbox.z * 0.00390625)); - y1 = int(ceil(clip.bbox.w * 0.00390625)); + uint param_2 = element_ix; + DrawMonoid draw_monoid = load_draw_monoid(param_2, v_94, v_94BufferSize, v_202); + uint path_ix = draw_monoid.path_ix; + float4 clip_bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + uint clip_ix = draw_monoid.clip_ix; + if (clip_ix > 0u) + { + uint param_3 = clip_ix - 1u; + clip_bbox = load_clip_bbox(param_3, v_94, v_94BufferSize, v_202); + } + uint param_4 = path_ix; + float4 path_bbox = load_path_bbox(param_4, v_94, v_94BufferSize, v_202); + float4 param_5 = path_bbox; + float4 param_6 = clip_bbox; + float4 bbox = bbox_intersect(param_5, param_6); + float4 _473 = bbox; + float4 _475 = bbox; + float2 _477 = fast::max(_473.xy, _475.zw); + bbox.z = _477.x; + bbox.w = _477.y; + AnnotatedRef param_7 = ref; + float4 param_8 = bbox; + store_path_bbox(param_7, param_8, v_94, v_94BufferSize); + x0 = int(floor(bbox.x * 0.00390625)); + y0 = int(floor(bbox.y * 0.00390625)); + x1 = int(ceil(bbox.z * 0.00390625)); + y1 = int(ceil(bbox.w * 0.00390625)); break; } } - uint width_in_bins = ((_253.conf.width_in_tiles + 16u) - 1u) / 16u; - uint height_in_bins = ((_253.conf.height_in_tiles + 16u) - 1u) / 16u; + uint width_in_bins = ((v_202.conf.width_in_tiles + 16u) - 1u) / 16u; + uint height_in_bins = ((v_202.conf.height_in_tiles + 16u) - 1u) / 16u; x0 = clamp(x0, 0, int(width_in_bins)); x1 = clamp(x1, x0, int(width_in_bins)); y0 = clamp(y0, 0, int(height_in_bins)); @@ -263,7 +303,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); while (y < y1) { - uint _437 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); + uint _581 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); x++; if (x == x1) { @@ -278,15 +318,15 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M element_count += uint(int(popcount(bitmaps[i_1][gl_LocalInvocationID.x]))); count[i_1][gl_LocalInvocationID.x] = element_count; } - uint param_4 = 0u; - uint param_5 = 0u; - bool param_6 = true; - Alloc chunk_alloc = new_alloc(param_4, param_5, param_6); + uint param_9 = 0u; + uint param_10 = 0u; + bool param_11 = true; + Alloc chunk_alloc = new_alloc(param_9, param_10, param_11); if (element_count != 0u) { - uint param_7 = element_count * 4u; - MallocResult _487 = malloc(param_7, v_84, v_84BufferSize); - MallocResult chunk = _487; + uint param_12 = element_count * 4u; + MallocResult _631 = malloc(param_12, v_94, v_94BufferSize); + MallocResult chunk = _631; chunk_alloc = chunk.alloc; sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; if (chunk.failed) @@ -294,28 +334,28 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M sh_alloc_failed = short(true); } } - uint out_ix = (_253.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); - Alloc param_8; - param_8.offset = _253.conf.bin_alloc.offset; - uint param_9 = out_ix; - uint param_10 = element_count; - write_mem(param_8, param_9, param_10, v_84, v_84BufferSize); - Alloc param_11; - param_11.offset = _253.conf.bin_alloc.offset; - uint param_12 = out_ix + 1u; - uint param_13 = chunk_alloc.offset; - write_mem(param_11, param_12, param_13, v_84, v_84BufferSize); + uint out_ix = (v_202.conf.bin_alloc.offset >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); + Alloc param_13; + param_13.offset = v_202.conf.bin_alloc.offset; + uint param_14 = out_ix; + uint param_15 = element_count; + write_mem(param_13, param_14, param_15, v_94, v_94BufferSize); + Alloc param_16; + param_16.offset = v_202.conf.bin_alloc.offset; + uint param_17 = out_ix + 1u; + uint param_18 = chunk_alloc.offset; + write_mem(param_16, param_17, param_18, v_94, v_94BufferSize); threadgroup_barrier(mem_flags::mem_threadgroup); - bool _543; + bool _687; if (!bool(sh_alloc_failed)) { - _543 = v_84.mem_error != 0u; + _687 = v_94.mem_error != 0u; } else { - _543 = bool(sh_alloc_failed); + _687 = bool(sh_alloc_failed); } - if (_543) + if (_687) { return; } @@ -334,10 +374,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } Alloc out_alloc = sh_chunk_alloc[bin_ix]; uint out_offset = out_alloc.offset + (idx * 4u); - Alloc param_14 = out_alloc; - BinInstanceRef param_15 = BinInstanceRef{ out_offset }; - BinInstance param_16 = BinInstance{ element_ix }; - BinInstance_write(param_14, param_15, param_16, v_84, v_84BufferSize); + Alloc param_19 = out_alloc; + BinInstanceRef param_20 = BinInstanceRef{ out_offset }; + BinInstance param_21 = BinInstance{ element_ix }; + BinInstance_write(param_19, param_20, param_21, v_94, v_94BufferSize); } x++; if (x == x1) diff --git a/piet-gpu/shader/gen/binning.spv b/piet-gpu/shader/gen/binning.spv index 17043bc..eca0692 100644 Binary files a/piet-gpu/shader/gen/binning.spv and b/piet-gpu/shader/gen/binning.spv differ diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil new file mode 100644 index 0000000..fe87a11 Binary files /dev/null and b/piet-gpu/shader/gen/clip_leaf.dxil differ diff --git a/piet-gpu/shader/gen/clip_leaf.hlsl b/piet-gpu/shader/gen/clip_leaf.hlsl new file mode 100644 index 0000000..d570420 --- /dev/null +++ b/piet-gpu/shader/gen/clip_leaf.hlsl @@ -0,0 +1,367 @@ +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const Bic _393 = { 0u, 0u }; + +ByteAddressBuffer _80 : register(t1, space0); +RWByteAddressBuffer _96 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Bic sh_bic[510]; +groupshared uint sh_stack[256]; +groupshared float4 sh_stack_bbox[256]; +groupshared uint sh_link[256]; +groupshared float4 sh_bbox[256]; + +Bic load_bic(uint ix) +{ + uint base = (_80.Load(52) >> uint(2)) + (2u * ix); + Bic _286 = { _96.Load(base * 4 + 8), _96.Load((base + 1u) * 4 + 8) }; + return _286; +} + +Bic bic_combine(Bic x, Bic y) +{ + uint m = min(x.b, y.a); + Bic _72 = { (x.a + y.a) - m, (x.b + y.b) - m }; + return _72; +} + +ClipEl load_clip_el(uint ix) +{ + uint base = (_80.Load(56) >> uint(2)) + (5u * ix); + uint parent_ix = _96.Load(base * 4 + 8); + float x0 = asfloat(_96.Load((base + 1u) * 4 + 8)); + float y0 = asfloat(_96.Load((base + 2u) * 4 + 8)); + float x1 = asfloat(_96.Load((base + 3u) * 4 + 8)); + float y1 = asfloat(_96.Load((base + 4u) * 4 + 8)); + float4 bbox = float4(x0, y0, x1, y1); + ClipEl _335 = { parent_ix, bbox }; + return _335; +} + +float4 bbox_intersect(float4 a, float4 b) +{ + return float4(max(a.xy, b.xy), min(a.zw, b.zw)); +} + +uint load_path_ix(uint ix) +{ + if (ix < _80.Load(72)) + { + return _96.Load(((_80.Load(48) >> uint(2)) + ix) * 4 + 8); + } + else + { + return 2147483648u; + } +} + +float4 load_path_bbox(uint path_ix) +{ + uint base = (_80.Load(40) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_96.Load(base * 4 + 8)) - 32768.0f; + float bbox_t = float(_96.Load((base + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_96.Load((base + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_96.Load((base + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +uint search_link(inout Bic bic) +{ + uint ix = gl_LocalInvocationID.x; + uint j = 0u; + while (j < 8u) + { + uint base = 512u - (2u << (8u - j)); + if (((ix >> j) & 1u) != 0u) + { + Bic param = sh_bic[(base + (ix >> j)) - 1u]; + Bic param_1 = bic; + Bic test = bic_combine(param, param_1); + if (test.b > 0u) + { + break; + } + bic = test; + ix -= (1u << j); + } + j++; + } + if (ix > 0u) + { + while (j > 0u) + { + j--; + uint base_1 = 512u - (2u << (8u - j)); + Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u]; + Bic param_3 = bic; + Bic test_1 = bic_combine(param_2, param_3); + if (test_1.b == 0u) + { + bic = test_1; + ix -= (1u << j); + } + } + } + if (ix > 0u) + { + return ix - 1u; + } + else + { + return 4294967295u - bic.a; + } +} + +void store_clip_bbox(uint ix, float4 bbox) +{ + uint base = (_80.Load(60) >> uint(2)) + (4u * ix); + _96.Store(base * 4 + 8, asuint(bbox.x)); + _96.Store((base + 1u) * 4 + 8, asuint(bbox.y)); + _96.Store((base + 2u) * 4 + 8, asuint(bbox.z)); + _96.Store((base + 3u) * 4 + 8, asuint(bbox.w)); +} + +void comp_main() +{ + uint th = gl_LocalInvocationID.x; + Bic bic = _393; + if (th < gl_WorkGroupID.x) + { + uint param = th; + bic = load_bic(param); + } + sh_bic[th] = bic; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[th + (1u << i)]; + Bic param_1 = bic; + Bic param_2 = other; + bic = bic_combine(param_1, param_2); + } + GroupMemoryBarrierWithGroupSync(); + sh_bic[th] = bic; + } + GroupMemoryBarrierWithGroupSync(); + uint stack_size = sh_bic[0].b; + uint sp = 255u - th; + uint ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = ix + (128u >> i_1); + if (sp < sh_bic[probe].b) + { + ix = probe; + } + } + uint b = sh_bic[ix].b; + float4 bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + if (sp < b) + { + uint param_3 = (((ix * 256u) + b) - sp) - 1u; + ClipEl el = load_clip_el(param_3); + sh_stack[th] = el.parent_ix; + bbox = el.bbox; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + sh_stack_bbox[th] = bbox; + GroupMemoryBarrierWithGroupSync(); + if (th >= (1u << i_2)) + { + float4 param_4 = sh_stack_bbox[th - (1u << i_2)]; + float4 param_5 = bbox; + bbox = bbox_intersect(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + } + sh_stack_bbox[th] = bbox; + uint param_6 = gl_GlobalInvocationID.x; + uint inp = load_path_ix(param_6); + bool is_push = int(inp) >= 0; + Bic _559 = { 1u - uint(is_push), uint(is_push) }; + bic = _559; + sh_bic[th] = bic; + if (is_push) + { + uint param_7 = inp; + bbox = load_path_bbox(param_7); + } + else + { + bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + } + uint inbase = 0u; + for (uint i_3 = 0u; i_3 < 7u; i_3++) + { + uint outbase = 512u - (1u << (8u - i_3)); + GroupMemoryBarrierWithGroupSync(); + if (th < (1u << (7u - i_3))) + { + Bic param_8 = sh_bic[inbase + (th * 2u)]; + Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u]; + sh_bic[outbase + th] = bic_combine(param_8, param_9); + } + inbase = outbase; + } + GroupMemoryBarrierWithGroupSync(); + bic = _393; + Bic param_10 = bic; + uint _618 = search_link(param_10); + bic = param_10; + uint link = _618; + sh_link[th] = link; + GroupMemoryBarrierWithGroupSync(); + uint grandparent; + if (int(link) >= 0) + { + grandparent = sh_link[link]; + } + else + { + grandparent = link - 1u; + } + uint parent; + if (int(link) >= 0) + { + parent = (gl_WorkGroupID.x * 256u) + link; + } + else + { + if (int(link + stack_size) >= 0) + { + parent = sh_stack[256u + link]; + } + else + { + parent = 4294967295u; + } + } + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + if (i_4 != 0u) + { + sh_link[th] = link; + } + sh_bbox[th] = bbox; + GroupMemoryBarrierWithGroupSync(); + if (int(link) >= 0) + { + float4 param_11 = sh_bbox[link]; + float4 param_12 = bbox; + bbox = bbox_intersect(param_11, param_12); + link = sh_link[link]; + } + GroupMemoryBarrierWithGroupSync(); + } + if (int(link + stack_size) >= 0) + { + float4 param_13 = sh_stack_bbox[256u + link]; + float4 param_14 = bbox; + bbox = bbox_intersect(param_13, param_14); + } + sh_bbox[th] = bbox; + GroupMemoryBarrierWithGroupSync(); + uint path_ix = inp; + bool _717 = !is_push; + bool _725; + if (_717) + { + _725 = gl_GlobalInvocationID.x < _80.Load(72); + } + else + { + _725 = _717; + } + if (_725) + { + uint param_15 = parent; + path_ix = load_path_ix(param_15); + uint drawmonoid_out_base = (_80.Load(44) >> uint(2)) + (2u * (~inp)); + _96.Store(drawmonoid_out_base * 4 + 8, path_ix); + if (int(grandparent) >= 0) + { + bbox = sh_bbox[grandparent]; + } + else + { + if (int(grandparent + stack_size) >= 0) + { + bbox = sh_stack_bbox[256u + grandparent]; + } + else + { + bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + } + } + } + uint param_16 = gl_GlobalInvocationID.x; + float4 param_17 = bbox; + store_clip_bbox(param_16, param_17); +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/clip_leaf.msl b/piet-gpu/shader/gen/clip_leaf.msl new file mode 100644 index 0000000..4e2d059 --- /dev/null +++ b/piet-gpu/shader/gen/clip_leaf.msl @@ -0,0 +1,366 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Bic load_bic(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix); + return Bic{ v_96.memory[base], v_96.memory[base + 1u] }; +} + +static inline __attribute__((always_inline)) +Bic bic_combine(thread const Bic& x, thread const Bic& y) +{ + uint m = min(x.b, y.a); + return Bic{ (x.a + y.a) - m, (x.b + y.b) - m }; +} + +static inline __attribute__((always_inline)) +ClipEl load_clip_el(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix); + uint parent_ix = v_96.memory[base]; + float x0 = as_type(v_96.memory[base + 1u]); + float y0 = as_type(v_96.memory[base + 2u]); + float x1 = as_type(v_96.memory[base + 3u]); + float y1 = as_type(v_96.memory[base + 4u]); + float4 bbox = float4(x0, y0, x1, y1); + return ClipEl{ parent_ix, bbox }; +} + +static inline __attribute__((always_inline)) +float4 bbox_intersect(thread const float4& a, thread const float4& b) +{ + return float4(fast::max(a.xy, b.xy), fast::min(a.zw, b.zw)); +} + +static inline __attribute__((always_inline)) +uint load_path_ix(thread const uint& ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + if (ix < v_80.conf.n_clip) + { + return v_96.memory[(v_80.conf.clip_alloc.offset >> uint(2)) + ix]; + } + else + { + return 2147483648u; + } +} + +static inline __attribute__((always_inline)) +float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix); + float bbox_l = float(v_96.memory[base]) - 32768.0; + float bbox_t = float(v_96.memory[base + 1u]) - 32768.0; + float bbox_r = float(v_96.memory[base + 2u]) - 32768.0; + float bbox_b = float(v_96.memory[base + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +static inline __attribute__((always_inline)) +uint search_link(thread Bic& bic, thread uint3& gl_LocalInvocationID, threadgroup Bic (&sh_bic)[510]) +{ + uint ix = gl_LocalInvocationID.x; + uint j = 0u; + while (j < 8u) + { + uint base = 512u - (2u << (8u - j)); + if (((ix >> j) & 1u) != 0u) + { + Bic param = sh_bic[(base + (ix >> j)) - 1u]; + Bic param_1 = bic; + Bic test = bic_combine(param, param_1); + if (test.b > 0u) + { + break; + } + bic = test; + ix -= (1u << j); + } + j++; + } + if (ix > 0u) + { + while (j > 0u) + { + j--; + uint base_1 = 512u - (2u << (8u - j)); + Bic param_2 = sh_bic[(base_1 + (ix >> j)) - 1u]; + Bic param_3 = bic; + Bic test_1 = bic_combine(param_2, param_3); + if (test_1.b == 0u) + { + bic = test_1; + ix -= (1u << j); + } + } + } + if (ix > 0u) + { + return ix - 1u; + } + else + { + return 4294967295u - bic.a; + } +} + +static inline __attribute__((always_inline)) +void store_clip_bbox(thread const uint& ix, thread const float4& bbox, const device ConfigBuf& v_80, device Memory& v_96) +{ + uint base = (v_80.conf.clip_bbox_alloc.offset >> uint(2)) + (4u * ix); + v_96.memory[base] = as_type(bbox.x); + v_96.memory[base + 1u] = as_type(bbox.y); + v_96.memory[base + 2u] = as_type(bbox.z); + v_96.memory[base + 3u] = as_type(bbox.w); +} + +kernel void main0(device Memory& v_96 [[buffer(0)]], const device ConfigBuf& v_80 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + threadgroup Bic sh_bic[510]; + threadgroup uint sh_stack[256]; + threadgroup float4 sh_stack_bbox[256]; + threadgroup uint sh_link[256]; + threadgroup float4 sh_bbox[256]; + uint th = gl_LocalInvocationID.x; + Bic bic = Bic{ 0u, 0u }; + if (th < gl_WorkGroupID.x) + { + uint param = th; + bic = load_bic(param, v_80, v_96); + } + sh_bic[th] = bic; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[th + (1u << i)]; + Bic param_1 = bic; + Bic param_2 = other; + bic = bic_combine(param_1, param_2); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_bic[th] = bic; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint stack_size = sh_bic[0].b; + uint sp = 255u - th; + uint ix = 0u; + for (uint i_1 = 0u; i_1 < 8u; i_1++) + { + uint probe = ix + (128u >> i_1); + if (sp < sh_bic[probe].b) + { + ix = probe; + } + } + uint b = sh_bic[ix].b; + float4 bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + if (sp < b) + { + uint param_3 = (((ix * 256u) + b) - sp) - 1u; + ClipEl el = load_clip_el(param_3, v_80, v_96); + sh_stack[th] = el.parent_ix; + bbox = el.bbox; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + sh_stack_bbox[th] = bbox; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th >= (1u << i_2)) + { + float4 param_4 = sh_stack_bbox[th - (1u << i_2)]; + float4 param_5 = bbox; + bbox = bbox_intersect(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + sh_stack_bbox[th] = bbox; + uint param_6 = gl_GlobalInvocationID.x; + uint inp = load_path_ix(param_6, v_80, v_96); + bool is_push = int(inp) >= 0; + bic = Bic{ 1u - uint(is_push), uint(is_push) }; + sh_bic[th] = bic; + if (is_push) + { + uint param_7 = inp; + bbox = load_path_bbox(param_7, v_80, v_96); + } + else + { + bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + } + uint inbase = 0u; + for (uint i_3 = 0u; i_3 < 7u; i_3++) + { + uint outbase = 512u - (1u << (8u - i_3)); + threadgroup_barrier(mem_flags::mem_threadgroup); + if (th < (1u << (7u - i_3))) + { + Bic param_8 = sh_bic[inbase + (th * 2u)]; + Bic param_9 = sh_bic[(inbase + (th * 2u)) + 1u]; + sh_bic[outbase + th] = bic_combine(param_8, param_9); + } + inbase = outbase; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + bic = Bic{ 0u, 0u }; + Bic param_10 = bic; + uint _618 = search_link(param_10, gl_LocalInvocationID, sh_bic); + bic = param_10; + uint link = _618; + sh_link[th] = link; + threadgroup_barrier(mem_flags::mem_threadgroup); + uint grandparent; + if (int(link) >= 0) + { + grandparent = sh_link[link]; + } + else + { + grandparent = link - 1u; + } + uint parent; + if (int(link) >= 0) + { + parent = (gl_WorkGroupID.x * 256u) + link; + } + else + { + if (int(link + stack_size) >= 0) + { + parent = sh_stack[256u + link]; + } + else + { + parent = 4294967295u; + } + } + for (uint i_4 = 0u; i_4 < 8u; i_4++) + { + if (i_4 != 0u) + { + sh_link[th] = link; + } + sh_bbox[th] = bbox; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (int(link) >= 0) + { + float4 param_11 = sh_bbox[link]; + float4 param_12 = bbox; + bbox = bbox_intersect(param_11, param_12); + link = sh_link[link]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + if (int(link + stack_size) >= 0) + { + float4 param_13 = sh_stack_bbox[256u + link]; + float4 param_14 = bbox; + bbox = bbox_intersect(param_13, param_14); + } + sh_bbox[th] = bbox; + threadgroup_barrier(mem_flags::mem_threadgroup); + uint path_ix = inp; + bool _717 = !is_push; + bool _725; + if (_717) + { + _725 = gl_GlobalInvocationID.x < v_80.conf.n_clip; + } + else + { + _725 = _717; + } + if (_725) + { + uint param_15 = parent; + path_ix = load_path_ix(param_15, v_80, v_96); + uint drawmonoid_out_base = (v_80.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * (~inp)); + v_96.memory[drawmonoid_out_base] = path_ix; + if (int(grandparent) >= 0) + { + bbox = sh_bbox[grandparent]; + } + else + { + if (int(grandparent + stack_size) >= 0) + { + bbox = sh_stack_bbox[256u + grandparent]; + } + else + { + bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + } + } + } + uint param_16 = gl_GlobalInvocationID.x; + float4 param_17 = bbox; + store_clip_bbox(param_16, param_17, v_80, v_96); +} + diff --git a/piet-gpu/shader/gen/clip_leaf.spv b/piet-gpu/shader/gen/clip_leaf.spv new file mode 100644 index 0000000..7c4c174 Binary files /dev/null and b/piet-gpu/shader/gen/clip_leaf.spv differ diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil new file mode 100644 index 0000000..0dff71b Binary files /dev/null and b/piet-gpu/shader/gen/clip_reduce.dxil differ diff --git a/piet-gpu/shader/gen/clip_reduce.hlsl b/piet-gpu/shader/gen/clip_reduce.hlsl new file mode 100644 index 0000000..6851e63 --- /dev/null +++ b/piet-gpu/shader/gen/clip_reduce.hlsl @@ -0,0 +1,177 @@ +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); + +static const Bic _267 = { 0u, 0u }; + +ByteAddressBuffer _64 : register(t1, space0); +RWByteAddressBuffer _80 : register(u0, space0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Bic sh_bic[256]; +groupshared uint sh_parent[256]; +groupshared uint sh_path_ix[256]; +groupshared float4 sh_bbox[256]; + +Bic bic_combine(Bic x, Bic y) +{ + uint m = min(x.b, y.a); + Bic _56 = { (x.a + y.a) - m, (x.b + y.b) - m }; + return _56; +} + +void store_bic(uint ix, Bic bic) +{ + uint base = (_64.Load(52) >> uint(2)) + (2u * ix); + _80.Store(base * 4 + 8, bic.a); + _80.Store((base + 1u) * 4 + 8, bic.b); +} + +float4 load_path_bbox(uint path_ix) +{ + uint base = (_64.Load(40) >> uint(2)) + (6u * path_ix); + float bbox_l = float(_80.Load(base * 4 + 8)) - 32768.0f; + float bbox_t = float(_80.Load((base + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_80.Load((base + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_80.Load((base + 3u) * 4 + 8)) - 32768.0f; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +void store_clip_el(uint ix, ClipEl el) +{ + uint base = (_64.Load(56) >> uint(2)) + (5u * ix); + _80.Store(base * 4 + 8, el.parent_ix); + _80.Store((base + 1u) * 4 + 8, asuint(el.bbox.x)); + _80.Store((base + 2u) * 4 + 8, asuint(el.bbox.y)); + _80.Store((base + 3u) * 4 + 8, asuint(el.bbox.z)); + _80.Store((base + 4u) * 4 + 8, asuint(el.bbox.w)); +} + +void comp_main() +{ + uint th = gl_LocalInvocationID.x; + uint inp = _80.Load(((_64.Load(48) >> uint(2)) + gl_GlobalInvocationID.x) * 4 + 8); + bool is_push = int(inp) >= 0; + Bic _207 = { 1u - uint(is_push), uint(is_push) }; + Bic bic = _207; + sh_bic[gl_LocalInvocationID.x] = bic; + for (uint i = 0u; i < 8u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)]; + Bic param = bic; + Bic param_1 = other; + bic = bic_combine(param, param_1); + } + GroupMemoryBarrierWithGroupSync(); + sh_bic[th] = bic; + } + if (th == 0u) + { + uint param_2 = gl_WorkGroupID.x; + Bic param_3 = bic; + store_bic(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + uint size = sh_bic[0].b; + bic = _267; + if ((th + 1u) < 256u) + { + bic = sh_bic[th + 1u]; + } + bool _283; + if (is_push) + { + _283 = bic.a == 0u; + } + else + { + _283 = is_push; + } + if (_283) + { + uint local_ix = (size - bic.b) - 1u; + sh_parent[local_ix] = th; + sh_path_ix[local_ix] = inp; + } + GroupMemoryBarrierWithGroupSync(); + float4 bbox; + if (th < size) + { + uint path_ix = sh_path_ix[th]; + uint param_4 = path_ix; + bbox = load_path_bbox(param_4); + } + if (th < size) + { + uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); + ClipEl _331 = { parent_ix, bbox }; + ClipEl el = _331; + uint param_5 = gl_GlobalInvocationID.x; + ClipEl param_6 = el; + store_clip_el(param_5, param_6); + } +} + +[numthreads(256, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/clip_reduce.msl b/piet-gpu/shader/gen/clip_reduce.msl new file mode 100644 index 0000000..5845676 --- /dev/null +++ b/piet-gpu/shader/gen/clip_reduce.msl @@ -0,0 +1,173 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct Bic +{ + uint a; + uint b; +}; + +struct ClipEl +{ + uint parent_ix; + float4 bbox; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; + uint n_trans; + uint n_path; + uint n_clip; + uint trans_offset; + uint linewidth_offset; + uint pathtag_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); + +static inline __attribute__((always_inline)) +Bic bic_combine(thread const Bic& x, thread const Bic& y) +{ + uint m = min(x.b, y.a); + return Bic{ (x.a + y.a) - m, (x.b + y.b) - m }; +} + +static inline __attribute__((always_inline)) +void store_bic(thread const uint& ix, thread const Bic& bic, const device ConfigBuf& v_64, device Memory& v_80) +{ + uint base = (v_64.conf.clip_bic_alloc.offset >> uint(2)) + (2u * ix); + v_80.memory[base] = bic.a; + v_80.memory[base + 1u] = bic.b; +} + +static inline __attribute__((always_inline)) +float4 load_path_bbox(thread const uint& path_ix, const device ConfigBuf& v_64, device Memory& v_80) +{ + uint base = (v_64.conf.bbox_alloc.offset >> uint(2)) + (6u * path_ix); + float bbox_l = float(v_80.memory[base]) - 32768.0; + float bbox_t = float(v_80.memory[base + 1u]) - 32768.0; + float bbox_r = float(v_80.memory[base + 2u]) - 32768.0; + float bbox_b = float(v_80.memory[base + 3u]) - 32768.0; + float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); + return bbox; +} + +static inline __attribute__((always_inline)) +void store_clip_el(thread const uint& ix, thread const ClipEl& el, const device ConfigBuf& v_64, device Memory& v_80) +{ + uint base = (v_64.conf.clip_stack_alloc.offset >> uint(2)) + (5u * ix); + v_80.memory[base] = el.parent_ix; + v_80.memory[base + 1u] = as_type(el.bbox.x); + v_80.memory[base + 2u] = as_type(el.bbox.y); + v_80.memory[base + 3u] = as_type(el.bbox.z); + v_80.memory[base + 4u] = as_type(el.bbox.w); +} + +kernel void main0(device Memory& v_80 [[buffer(0)]], const device ConfigBuf& v_64 [[buffer(1)]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Bic sh_bic[256]; + threadgroup uint sh_parent[256]; + threadgroup uint sh_path_ix[256]; + threadgroup float4 sh_bbox[256]; + uint th = gl_LocalInvocationID.x; + uint inp = v_80.memory[(v_64.conf.clip_alloc.offset >> uint(2)) + gl_GlobalInvocationID.x]; + bool is_push = int(inp) >= 0; + Bic bic = Bic{ 1u - uint(is_push), uint(is_push) }; + sh_bic[gl_LocalInvocationID.x] = bic; + for (uint i = 0u; i < 8u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((th + (1u << i)) < 256u) + { + Bic other = sh_bic[gl_LocalInvocationID.x + (1u << i)]; + Bic param = bic; + Bic param_1 = other; + bic = bic_combine(param, param_1); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_bic[th] = bic; + } + if (th == 0u) + { + uint param_2 = gl_WorkGroupID.x; + Bic param_3 = bic; + store_bic(param_2, param_3, v_64, v_80); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint size = sh_bic[0].b; + bic = Bic{ 0u, 0u }; + if ((th + 1u) < 256u) + { + bic = sh_bic[th + 1u]; + } + bool _283; + if (is_push) + { + _283 = bic.a == 0u; + } + else + { + _283 = is_push; + } + if (_283) + { + uint local_ix = (size - bic.b) - 1u; + sh_parent[local_ix] = th; + sh_path_ix[local_ix] = inp; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + float4 bbox; + if (th < size) + { + uint path_ix = sh_path_ix[th]; + uint param_4 = path_ix; + bbox = load_path_bbox(param_4, v_64, v_80); + } + if (th < size) + { + uint parent_ix = sh_parent[th] + (gl_WorkGroupID.x * 256u); + ClipEl el = ClipEl{ parent_ix, bbox }; + uint param_5 = gl_GlobalInvocationID.x; + ClipEl param_6 = el; + store_clip_el(param_5, param_6, v_64, v_80); + } +} + diff --git a/piet-gpu/shader/gen/clip_reduce.spv b/piet-gpu/shader/gen/clip_reduce.spv new file mode 100644 index 0000000..cbe8c1f Binary files /dev/null and b/piet-gpu/shader/gen/clip_reduce.spv differ diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil index 16d47ce..0599eb8 100644 Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl index bc96cea..b5f3949 100644 --- a/piet-gpu/shader/gen/coarse.hlsl +++ b/piet-gpu/shader/gen/coarse.hlsl @@ -49,17 +49,6 @@ struct AnnoLinGradient float line_c; }; -struct AnnoBeginClipRef -{ - uint offset; -}; - -struct AnnoBeginClip -{ - float4 bbox; - float linewidth; -}; - struct AnnotatedRef { uint offset; @@ -193,8 +182,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -203,8 +197,8 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -RWByteAddressBuffer _296 : register(u0, space0); -ByteAddressBuffer _1249 : register(t1, space0); +RWByteAddressBuffer _283 : register(u0, space0); +ByteAddressBuffer _1169 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -227,8 +221,8 @@ groupshared uint sh_tile_count[256]; Alloc slice_mem(Alloc a, uint offset, uint size) { - Alloc _373 = { a.offset + offset }; - return _373; + Alloc _360 = { a.offset + offset }; + return _360; } bool touch_mem(Alloc alloc, uint offset) @@ -244,7 +238,7 @@ uint read_mem(Alloc alloc, uint offset) { return 0u; } - uint v = _296.Load(offset * 4 + 8); + uint v = _283.Load(offset * 4 + 8); return v; } @@ -257,8 +251,8 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok) BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) { - BinInstanceRef _754 = { ref.offset + (index * 4u) }; - return _754; + BinInstanceRef _674 = { ref.offset + (index * 4u) }; + return _674; } BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) @@ -277,8 +271,8 @@ AnnotatedTag Annotated_tag(Alloc a, AnnotatedRef ref) Alloc param = a; uint param_1 = ref.offset >> uint(2); uint tag_and_flags = read_mem(param, param_1); - AnnotatedTag _706 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; - return _706; + AnnotatedTag _636 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _636; } Path Path_read(Alloc a, PathRef ref) @@ -295,8 +289,8 @@ Path Path_read(Alloc a, PathRef ref) uint raw2 = read_mem(param_4, param_5); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); - TileRef _814 = { raw2 }; - s.tiles = _814; + TileRef _734 = { raw2 }; + s.tiles = _734; return s; } @@ -306,11 +300,11 @@ void write_tile_alloc(uint el_ix, Alloc a) Alloc read_tile_alloc(uint el_ix, bool mem_ok) { - uint _1135; - _296.GetDimensions(_1135); - _1135 = (_1135 - 8) / 4; + uint _1055; + _283.GetDimensions(_1055); + _1055 = (_1055 - 8) / 4; uint param = 0u; - uint param_1 = uint(int(_1135) * 4); + uint param_1 = uint(int(_1055) * 4); bool param_2 = mem_ok; return new_alloc(param, param_1, param_2); } @@ -324,9 +318,9 @@ Tile Tile_read(Alloc a, TileRef ref) Alloc param_2 = a; uint param_3 = ix + 1u; uint raw1 = read_mem(param_2, param_3); - TileSegRef _839 = { raw0 }; + TileSegRef _759 = { raw0 }; Tile s; - s.tile = _839; + s.tile = _759; s.backdrop = int(raw1); return s; } @@ -361,30 +355,30 @@ AnnoColor AnnoColor_read(Alloc a, AnnoColorRef ref) AnnoColor Annotated_Color_read(Alloc a, AnnotatedRef ref) { - AnnoColorRef _712 = { ref.offset + 4u }; + AnnoColorRef _642 = { ref.offset + 4u }; Alloc param = a; - AnnoColorRef param_1 = _712; + AnnoColorRef param_1 = _642; return AnnoColor_read(param, param_1); } MallocResult malloc(uint size) { - uint _302; - _296.InterlockedAdd(0, size, _302); - uint offset = _302; - uint _309; - _296.GetDimensions(_309); - _309 = (_309 - 8) / 4; + uint _289; + _283.InterlockedAdd(0, size, _289); + uint offset = _289; + uint _296; + _283.GetDimensions(_296); + _296 = (_296 - 8) / 4; MallocResult r; - r.failed = (offset + size) > uint(int(_309) * 4); + r.failed = (offset + size) > uint(int(_296) * 4); uint param = offset; uint param_1 = size; bool param_2 = !r.failed; r.alloc = new_alloc(param, param_1, param_2); if (r.failed) { - uint _331; - _296.InterlockedMax(4, 1u, _331); + uint _318; + _283.InterlockedMax(4, 1u, _318); return r; } return r; @@ -398,7 +392,7 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _296.Store(offset * 4 + 8, val); + _283.Store(offset * 4 + 8, val); } void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) @@ -416,9 +410,9 @@ void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) uint param_1 = ref.offset >> uint(2); uint param_2 = 10u; write_mem(param, param_1, param_2); - CmdJumpRef _1128 = { ref.offset + 4u }; + CmdJumpRef _1048 = { ref.offset + 4u }; Alloc param_3 = a; - CmdJumpRef param_4 = _1128; + CmdJumpRef param_4 = _1048; CmdJump param_5 = s; CmdJump_write(param_3, param_4, param_5); } @@ -430,21 +424,21 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit return true; } uint param = 1024u; - MallocResult _1156 = malloc(param); - MallocResult new_cmd = _1156; + MallocResult _1076 = malloc(param); + MallocResult new_cmd = _1076; if (new_cmd.failed) { return false; } - CmdJump _1166 = { new_cmd.alloc.offset }; - CmdJump jump = _1166; + CmdJump _1086 = { new_cmd.alloc.offset }; + CmdJump jump = _1086; Alloc param_1 = cmd_alloc; CmdRef param_2 = cmd_ref; CmdJump param_3 = jump; Cmd_Jump_write(param_1, param_2, param_3); cmd_alloc = new_cmd.alloc; - CmdRef _1178 = { cmd_alloc.offset }; - cmd_ref = _1178; + CmdRef _1098 = { cmd_alloc.offset }; + cmd_ref = _1098; cmd_limit = (cmd_alloc.offset + 1024u) - 60u; return true; } @@ -473,9 +467,9 @@ void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) uint param_1 = ref.offset >> uint(2); uint param_2 = 1u; write_mem(param, param_1, param_2); - CmdFillRef _1012 = { ref.offset + 4u }; + CmdFillRef _932 = { ref.offset + 4u }; Alloc param_3 = a; - CmdFillRef param_4 = _1012; + CmdFillRef param_4 = _932; CmdFill param_5 = s; CmdFill_write(param_3, param_4, param_5); } @@ -507,9 +501,9 @@ void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) uint param_1 = ref.offset >> uint(2); uint param_2 = 2u; write_mem(param, param_1, param_2); - CmdStrokeRef _1030 = { ref.offset + 4u }; + CmdStrokeRef _950 = { ref.offset + 4u }; Alloc param_3 = a; - CmdStrokeRef param_4 = _1030; + CmdStrokeRef param_4 = _950; CmdStroke param_5 = s; CmdStroke_write(param_3, param_4, param_5); } @@ -521,8 +515,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float { if (tile.tile.offset != 0u) { - CmdFill _1202 = { tile.tile.offset, tile.backdrop }; - CmdFill cmd_fill = _1202; + CmdFill _1122 = { tile.tile.offset, tile.backdrop }; + CmdFill cmd_fill = _1122; Alloc param_1 = alloc; CmdRef param_2 = cmd_ref; CmdFill param_3 = cmd_fill; @@ -539,8 +533,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float } else { - CmdStroke _1232 = { tile.tile.offset, 0.5f * linewidth }; - CmdStroke cmd_stroke = _1232; + CmdStroke _1152 = { tile.tile.offset, 0.5f * linewidth }; + CmdStroke cmd_stroke = _1152; Alloc param_6 = alloc; CmdRef param_7 = cmd_ref; CmdStroke param_8 = cmd_stroke; @@ -564,9 +558,9 @@ void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) uint param_1 = ref.offset >> uint(2); uint param_2 = 5u; write_mem(param, param_1, param_2); - CmdColorRef _1056 = { ref.offset + 4u }; + CmdColorRef _976 = { ref.offset + 4u }; Alloc param_3 = a; - CmdColorRef param_4 = _1056; + CmdColorRef param_4 = _976; CmdColor param_5 = s; CmdColor_write(param_3, param_4, param_5); } @@ -613,9 +607,9 @@ AnnoLinGradient AnnoLinGradient_read(Alloc a, AnnoLinGradientRef ref) AnnoLinGradient Annotated_LinGradient_read(Alloc a, AnnotatedRef ref) { - AnnoLinGradientRef _722 = { ref.offset + 4u }; + AnnoLinGradientRef _652 = { ref.offset + 4u }; Alloc param = a; - AnnoLinGradientRef param_1 = _722; + AnnoLinGradientRef param_1 = _652; return AnnoLinGradient_read(param, param_1); } @@ -646,9 +640,9 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) uint param_1 = ref.offset >> uint(2); uint param_2 = 6u; write_mem(param, param_1, param_2); - CmdLinGradRef _1074 = { ref.offset + 4u }; + CmdLinGradRef _994 = { ref.offset + 4u }; Alloc param_3 = a; - CmdLinGradRef param_4 = _1074; + CmdLinGradRef param_4 = _994; CmdLinGrad param_5 = s; CmdLinGrad_write(param_3, param_4, param_5); } @@ -687,9 +681,9 @@ AnnoImage AnnoImage_read(Alloc a, AnnoImageRef ref) AnnoImage Annotated_Image_read(Alloc a, AnnotatedRef ref) { - AnnoImageRef _732 = { ref.offset + 4u }; + AnnoImageRef _662 = { ref.offset + 4u }; Alloc param = a; - AnnoImageRef param_1 = _732; + AnnoImageRef param_1 = _662; return AnnoImage_read(param, param_1); } @@ -712,45 +706,13 @@ void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) uint param_1 = ref.offset >> uint(2); uint param_2 = 7u; write_mem(param, param_1, param_2); - CmdImageRef _1092 = { ref.offset + 4u }; + CmdImageRef _1012 = { ref.offset + 4u }; Alloc param_3 = a; - CmdImageRef param_4 = _1092; + CmdImageRef param_4 = _1012; CmdImage param_5 = s; CmdImage_write(param_3, param_4, param_5); } -AnnoBeginClip AnnoBeginClip_read(Alloc a, AnnoBeginClipRef ref) -{ - uint ix = ref.offset >> uint(2); - Alloc param = a; - uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1); - Alloc param_2 = a; - uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3); - Alloc param_4 = a; - uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5); - Alloc param_6 = a; - uint param_7 = ix + 3u; - uint raw3 = read_mem(param_6, param_7); - Alloc param_8 = a; - uint param_9 = ix + 4u; - uint raw4 = read_mem(param_8, param_9); - AnnoBeginClip s; - s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); - s.linewidth = asfloat(raw4); - return s; -} - -AnnoBeginClip Annotated_BeginClip_read(Alloc a, AnnotatedRef ref) -{ - AnnoBeginClipRef _742 = { ref.offset + 4u }; - Alloc param = a; - AnnoBeginClipRef param_1 = _742; - return AnnoBeginClip_read(param, param_1); -} - void Cmd_BeginClip_write(Alloc a, CmdRef ref) { Alloc param = a; @@ -777,44 +739,43 @@ void Cmd_End_write(Alloc a, CmdRef ref) void comp_main() { - uint width_in_bins = ((_1249.Load(8) + 16u) - 1u) / 16u; + uint width_in_bins = ((_1169.Load(8) + 16u) - 1u) / 16u; uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; uint partition_ix = 0u; - uint n_partitions = ((_1249.Load(0) + 256u) - 1u) / 256u; + uint n_partitions = ((_1169.Load(0) + 256u) - 1u) / 256u; uint th_ix = gl_LocalInvocationID.x; uint bin_tile_x = 16u * gl_WorkGroupID.x; uint bin_tile_y = 16u * gl_WorkGroupID.y; uint tile_x = gl_LocalInvocationID.x % 16u; uint tile_y = gl_LocalInvocationID.x / 16u; - uint this_tile_ix = (((bin_tile_y + tile_y) * _1249.Load(8)) + bin_tile_x) + tile_x; - Alloc _1314; - _1314.offset = _1249.Load(24); + uint this_tile_ix = (((bin_tile_y + tile_y) * _1169.Load(8)) + bin_tile_x) + tile_x; + Alloc _1234; + _1234.offset = _1169.Load(24); Alloc param; - param.offset = _1314.offset; + param.offset = _1234.offset; uint param_1 = this_tile_ix * 1024u; uint param_2 = 1024u; Alloc cmd_alloc = slice_mem(param, param_1, param_2); - CmdRef _1323 = { cmd_alloc.offset }; - CmdRef cmd_ref = _1323; + CmdRef _1243 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1243; uint cmd_limit = (cmd_ref.offset + 1024u) - 60u; uint clip_depth = 0u; uint clip_zero_depth = 0u; - uint clip_one_mask = 0u; uint rd_ix = 0u; uint wr_ix = 0u; uint part_start_ix = 0u; uint ready_ix = 0u; - bool mem_ok = _296.Load(4) == 0u; + bool mem_ok = _283.Load(4) == 0u; Alloc param_3; Alloc param_5; - uint _1529; + uint _1448; uint element_ix; AnnotatedRef ref; Alloc param_14; Alloc param_16; uint tile_count; Alloc param_23; - uint _1841; + uint _1770; Alloc param_29; Tile tile_1; AnnoColor fill; @@ -822,41 +783,40 @@ void comp_main() Alloc param_52; CmdLinGrad cmd_lin; Alloc param_69; - Alloc param_86; while (true) { for (uint i = 0u; i < 8u; i++) { sh_bitmaps[i][th_ix] = 0u; } - bool _1581; + bool _1500; for (;;) { if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) { part_start_ix = ready_ix; uint count = 0u; - bool _1379 = th_ix < 256u; - bool _1387; - if (_1379) + bool _1298 = th_ix < 256u; + bool _1306; + if (_1298) { - _1387 = (partition_ix + th_ix) < n_partitions; + _1306 = (partition_ix + th_ix) < n_partitions; } else { - _1387 = _1379; + _1306 = _1298; } - if (_1387) + if (_1306) { - uint in_ix = (_1249.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); - Alloc _1404; - _1404.offset = _1249.Load(20); - param_3.offset = _1404.offset; + uint in_ix = (_1169.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + Alloc _1323; + _1323.offset = _1169.Load(20); + param_3.offset = _1323.offset; uint param_4 = in_ix; count = read_mem(param_3, param_4); - Alloc _1415; - _1415.offset = _1249.Load(20); - param_5.offset = _1415.offset; + Alloc _1334; + _1334.offset = _1169.Load(20); + param_5.offset = _1334.offset; uint param_6 = in_ix + 1u; uint offset = read_mem(param_5, param_6); uint param_7 = offset; @@ -902,16 +862,16 @@ void comp_main() } if (part_ix > 0u) { - _1529 = sh_part_count[part_ix - 1u]; + _1448 = sh_part_count[part_ix - 1u]; } else { - _1529 = part_start_ix; + _1448 = part_start_ix; } - ix -= _1529; + ix -= _1448; Alloc bin_alloc = sh_part_elements[part_ix]; - BinInstanceRef _1548 = { bin_alloc.offset }; - BinInstanceRef inst_ref = _1548; + BinInstanceRef _1467 = { bin_alloc.offset }; + BinInstanceRef inst_ref = _1467; BinInstanceRef param_10 = inst_ref; uint param_11 = ix; Alloc param_12 = bin_alloc; @@ -921,16 +881,16 @@ void comp_main() } GroupMemoryBarrierWithGroupSync(); wr_ix = min((rd_ix + 256u), ready_ix); - bool _1571 = (wr_ix - rd_ix) < 256u; - if (_1571) + bool _1490 = (wr_ix - rd_ix) < 256u; + if (_1490) { - _1581 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + _1500 = (wr_ix < ready_ix) || (partition_ix < n_partitions); } else { - _1581 = _1571; + _1500 = _1490; } - if (_1581) + if (_1500) { continue; } @@ -943,11 +903,11 @@ void comp_main() if ((th_ix + rd_ix) < wr_ix) { element_ix = sh_elements[th_ix]; - AnnotatedRef _1602 = { _1249.Load(32) + (element_ix * 40u) }; - ref = _1602; - Alloc _1605; - _1605.offset = _1249.Load(32); - param_14.offset = _1605.offset; + AnnotatedRef _1521 = { _1169.Load(32) + (element_ix * 40u) }; + ref = _1521; + Alloc _1524; + _1524.offset = _1169.Load(32); + param_14.offset = _1524.offset; AnnotatedRef param_15 = ref; tag = Annotated_tag(param_14, param_15).tag; } @@ -959,12 +919,13 @@ void comp_main() case 4u: case 5u: { - uint path_ix = element_ix; - PathRef _1624 = { _1249.Load(16) + (path_ix * 12u) }; - Alloc _1627; - _1627.offset = _1249.Load(16); - param_16.offset = _1627.offset; - PathRef param_17 = _1624; + uint drawmonoid_base = (_1169.Load(44) >> uint(2)) + (2u * element_ix); + uint path_ix = _283.Load(drawmonoid_base * 4 + 8); + PathRef _1553 = { _1169.Load(16) + (path_ix * 12u) }; + Alloc _1556; + _1556.offset = _1169.Load(16); + param_16.offset = _1556.offset; + PathRef param_17 = _1553; Path path = Path_read(param_16, param_17); uint stride = path.bbox.z - path.bbox.x; sh_tile_stride[th_ix] = stride; @@ -1019,59 +980,53 @@ void comp_main() el_ix = probe_1; } } - AnnotatedRef _1826 = { _1249.Load(32) + (sh_elements[el_ix] * 40u) }; - AnnotatedRef ref_1 = _1826; - Alloc _1830; - _1830.offset = _1249.Load(32); - param_23.offset = _1830.offset; + AnnotatedRef _1755 = { _1169.Load(32) + (sh_elements[el_ix] * 40u) }; + AnnotatedRef ref_1 = _1755; + Alloc _1759; + _1759.offset = _1169.Load(32); + param_23.offset = _1759.offset; AnnotatedRef param_24 = ref_1; uint tag_1 = Annotated_tag(param_23, param_24).tag; if (el_ix > 0u) { - _1841 = sh_tile_count[el_ix - 1u]; + _1770 = sh_tile_count[el_ix - 1u]; } else { - _1841 = 0u; + _1770 = 0u; } - uint seq_ix = ix_1 - _1841; + uint seq_ix = ix_1 - _1770; uint width = sh_tile_width[el_ix]; uint x = sh_tile_x0[el_ix] + (seq_ix % width); uint y = sh_tile_y0[el_ix] + (seq_ix / width); bool include_tile = false; - if ((tag_1 == 4u) || (tag_1 == 5u)) + if (mem_ok) { - include_tile = true; - } - else - { - if (mem_ok) + uint param_25 = el_ix; + bool param_26 = mem_ok; + TileRef _1822 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Alloc param_27 = read_tile_alloc(param_25, param_26); + TileRef param_28 = _1822; + Tile tile = Tile_read(param_27, param_28); + bool is_clip = (tag_1 == 4u) || (tag_1 == 5u); + bool _1834 = tile.tile.offset != 0u; + bool _1843; + if (!_1834) { - uint param_25 = el_ix; - bool param_26 = mem_ok; - TileRef _1901 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; - Alloc param_27 = read_tile_alloc(param_25, param_26); - TileRef param_28 = _1901; - Tile tile = Tile_read(param_27, param_28); - bool _1907 = tile.tile.offset != 0u; - bool _1914; - if (!_1907) - { - _1914 = tile.backdrop != 0; - } - else - { - _1914 = _1907; - } - include_tile = _1914; + _1843 = (tile.backdrop == 0) == is_clip; } + else + { + _1843 = _1834; + } + include_tile = _1843; } if (include_tile) { uint el_slice = el_ix / 32u; uint el_mask = 1u << (el_ix & 31u); - uint _1934; - InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1934); + uint _1863; + InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1863); } } GroupMemoryBarrierWithGroupSync(); @@ -1095,11 +1050,11 @@ void comp_main() uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap))); uint element_ix_1 = sh_elements[element_ref_ix]; bitmap &= (bitmap - 1u); - AnnotatedRef _1988 = { _1249.Load(32) + (element_ix_1 * 40u) }; - ref = _1988; - Alloc _1993; - _1993.offset = _1249.Load(32); - param_29.offset = _1993.offset; + AnnotatedRef _1917 = { _1169.Load(32) + (element_ix_1 * 40u) }; + ref = _1917; + Alloc _1922; + _1922.offset = _1169.Load(32); + param_29.offset = _1922.offset; AnnotatedRef param_30 = ref; AnnotatedTag tag_2 = Annotated_tag(param_29, param_30); if (clip_zero_depth == 0u) @@ -1110,23 +1065,23 @@ void comp_main() { uint param_31 = element_ref_ix; bool param_32 = mem_ok; - TileRef _2029 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + TileRef _1958 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; Alloc param_33 = read_tile_alloc(param_31, param_32); - TileRef param_34 = _2029; + TileRef param_34 = _1958; tile_1 = Tile_read(param_33, param_34); - Alloc _2036; - _2036.offset = _1249.Load(32); - param_35.offset = _2036.offset; + Alloc _1965; + _1965.offset = _1169.Load(32); + param_35.offset = _1965.offset; AnnotatedRef param_36 = ref; fill = Annotated_Color_read(param_35, param_36); Alloc param_37 = cmd_alloc; CmdRef param_38 = cmd_ref; uint param_39 = cmd_limit; - bool _2048 = alloc_cmd(param_37, param_38, param_39); + bool _1977 = alloc_cmd(param_37, param_38, param_39); cmd_alloc = param_37; cmd_ref = param_38; cmd_limit = param_39; - if (!_2048) + if (!_1977) { break; } @@ -1137,10 +1092,10 @@ void comp_main() float param_44 = fill.linewidth; write_fill(param_40, param_41, param_42, param_43, param_44); cmd_ref = param_41; - CmdColor _2072 = { fill.rgba_color }; + CmdColor _2001 = { fill.rgba_color }; Alloc param_45 = cmd_alloc; CmdRef param_46 = cmd_ref; - CmdColor param_47 = _2072; + CmdColor param_47 = _2001; Cmd_Color_write(param_45, param_46, param_47); cmd_ref.offset += 8u; break; @@ -1149,23 +1104,23 @@ void comp_main() { uint param_48 = element_ref_ix; bool param_49 = mem_ok; - TileRef _2101 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + TileRef _2030 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; Alloc param_50 = read_tile_alloc(param_48, param_49); - TileRef param_51 = _2101; + TileRef param_51 = _2030; tile_1 = Tile_read(param_50, param_51); - Alloc _2108; - _2108.offset = _1249.Load(32); - param_52.offset = _2108.offset; + Alloc _2037; + _2037.offset = _1169.Load(32); + param_52.offset = _2037.offset; AnnotatedRef param_53 = ref; AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53); Alloc param_54 = cmd_alloc; CmdRef param_55 = cmd_ref; uint param_56 = cmd_limit; - bool _2120 = alloc_cmd(param_54, param_55, param_56); + bool _2049 = alloc_cmd(param_54, param_55, param_56); cmd_alloc = param_54; cmd_ref = param_55; cmd_limit = param_56; - if (!_2120) + if (!_2049) { break; } @@ -1191,23 +1146,23 @@ void comp_main() { uint param_65 = element_ref_ix; bool param_66 = mem_ok; - TileRef _2185 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + TileRef _2114 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; Alloc param_67 = read_tile_alloc(param_65, param_66); - TileRef param_68 = _2185; + TileRef param_68 = _2114; tile_1 = Tile_read(param_67, param_68); - Alloc _2192; - _2192.offset = _1249.Load(32); - param_69.offset = _2192.offset; + Alloc _2121; + _2121.offset = _1169.Load(32); + param_69.offset = _2121.offset; AnnotatedRef param_70 = ref; AnnoImage fill_img = Annotated_Image_read(param_69, param_70); Alloc param_71 = cmd_alloc; CmdRef param_72 = cmd_ref; uint param_73 = cmd_limit; - bool _2204 = alloc_cmd(param_71, param_72, param_73); + bool _2133 = alloc_cmd(param_71, param_72, param_73); cmd_alloc = param_71; cmd_ref = param_72; cmd_limit = param_73; - if (!_2204) + if (!_2133) { break; } @@ -1218,10 +1173,10 @@ void comp_main() float param_78 = fill_img.linewidth; write_fill(param_74, param_75, param_76, param_77, param_78); cmd_ref = param_75; - CmdImage _2230 = { fill_img.index, fill_img.offset }; + CmdImage _2159 = { fill_img.index, fill_img.offset }; Alloc param_79 = cmd_alloc; CmdRef param_80 = cmd_ref; - CmdImage param_81 = _2230; + CmdImage param_81 = _2159; Cmd_Image_write(param_79, param_80, param_81); cmd_ref.offset += 12u; break; @@ -1230,103 +1185,76 @@ void comp_main() { uint param_82 = element_ref_ix; bool param_83 = mem_ok; - TileRef _2259 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + TileRef _2188 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; Alloc param_84 = read_tile_alloc(param_82, param_83); - TileRef param_85 = _2259; + TileRef param_85 = _2188; tile_1 = Tile_read(param_84, param_85); - bool _2265 = tile_1.tile.offset == 0u; - bool _2271; - if (_2265) + bool _2194 = tile_1.tile.offset == 0u; + bool _2200; + if (_2194) { - _2271 = tile_1.backdrop == 0; + _2200 = tile_1.backdrop == 0; } else { - _2271 = _2265; + _2200 = _2194; } - if (_2271) + if (_2200) { clip_zero_depth = clip_depth + 1u; } else { - if ((tile_1.tile.offset == 0u) && (clip_depth < 32u)) + Alloc param_86 = cmd_alloc; + CmdRef param_87 = cmd_ref; + uint param_88 = cmd_limit; + bool _2212 = alloc_cmd(param_86, param_87, param_88); + cmd_alloc = param_86; + cmd_ref = param_87; + cmd_limit = param_88; + if (!_2212) { - clip_one_mask |= (1u << clip_depth); - } - else - { - Alloc _2293; - _2293.offset = _1249.Load(32); - param_86.offset = _2293.offset; - AnnotatedRef param_87 = ref; - AnnoBeginClip begin_clip = Annotated_BeginClip_read(param_86, param_87); - Alloc param_88 = cmd_alloc; - CmdRef param_89 = cmd_ref; - uint param_90 = cmd_limit; - bool _2305 = alloc_cmd(param_88, param_89, param_90); - cmd_alloc = param_88; - cmd_ref = param_89; - cmd_limit = param_90; - if (!_2305) - { - break; - } - Alloc param_91 = cmd_alloc; - CmdRef param_92 = cmd_ref; - uint param_93 = tag_2.flags; - Tile param_94 = tile_1; - float param_95 = begin_clip.linewidth; - write_fill(param_91, param_92, param_93, param_94, param_95); - cmd_ref = param_92; - Alloc param_96 = cmd_alloc; - CmdRef param_97 = cmd_ref; - Cmd_BeginClip_write(param_96, param_97); - cmd_ref.offset += 4u; - if (clip_depth < 32u) - { - clip_one_mask &= (~(1u << clip_depth)); - } + break; } + Alloc param_89 = cmd_alloc; + CmdRef param_90 = cmd_ref; + Cmd_BeginClip_write(param_89, param_90); + cmd_ref.offset += 4u; } clip_depth++; break; } case 5u: { + uint param_91 = element_ref_ix; + bool param_92 = mem_ok; + TileRef _2249 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_93 = read_tile_alloc(param_91, param_92); + TileRef param_94 = _2249; + tile_1 = Tile_read(param_93, param_94); clip_depth--; - bool _2351 = clip_depth >= 32u; - bool _2360; - if (!_2351) + Alloc param_95 = cmd_alloc; + CmdRef param_96 = cmd_ref; + uint param_97 = cmd_limit; + bool _2261 = alloc_cmd(param_95, param_96, param_97); + cmd_alloc = param_95; + cmd_ref = param_96; + cmd_limit = param_97; + if (!_2261) { - _2360 = (clip_one_mask & (1u << clip_depth)) == 0u; - } - else - { - _2360 = _2351; - } - if (_2360) - { - Alloc param_98 = cmd_alloc; - CmdRef param_99 = cmd_ref; - uint param_100 = cmd_limit; - bool _2369 = alloc_cmd(param_98, param_99, param_100); - cmd_alloc = param_98; - cmd_ref = param_99; - cmd_limit = param_100; - if (!_2369) - { - break; - } - Alloc param_101 = cmd_alloc; - CmdRef param_102 = cmd_ref; - Cmd_Solid_write(param_101, param_102); - cmd_ref.offset += 4u; - Alloc param_103 = cmd_alloc; - CmdRef param_104 = cmd_ref; - Cmd_EndClip_write(param_103, param_104); - cmd_ref.offset += 4u; + break; } + Alloc param_98 = cmd_alloc; + CmdRef param_99 = cmd_ref; + uint param_100 = 0u; + Tile param_101 = tile_1; + float param_102 = 0.0f; + write_fill(param_98, param_99, param_100, param_101, param_102); + cmd_ref = param_99; + Alloc param_103 = cmd_alloc; + CmdRef param_104 = cmd_ref; + Cmd_EndClip_write(param_103, param_104); + cmd_ref.offset += 4u; break; } } @@ -1359,17 +1287,17 @@ void comp_main() break; } } - bool _2432 = (bin_tile_x + tile_x) < _1249.Load(8); - bool _2441; - if (_2432) + bool _2326 = (bin_tile_x + tile_x) < _1169.Load(8); + bool _2335; + if (_2326) { - _2441 = (bin_tile_y + tile_y) < _1249.Load(12); + _2335 = (bin_tile_y + tile_y) < _1169.Load(12); } else { - _2441 = _2432; + _2335 = _2326; } - if (_2441) + if (_2335) { Alloc param_105 = cmd_alloc; CmdRef param_106 = cmd_ref; diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl index 096f710..e5a0f0d 100644 --- a/piet-gpu/shader/gen/coarse.msl +++ b/piet-gpu/shader/gen/coarse.msl @@ -65,17 +65,6 @@ struct AnnoLinGradient float line_c; }; -struct AnnoBeginClipRef -{ - uint offset; -}; - -struct AnnoBeginClip -{ - float4 bbox; - float linewidth; -}; - struct AnnotatedRef { uint offset; @@ -221,8 +210,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -249,7 +243,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset) } static inline __attribute__((always_inline)) -uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_296, constant uint& v_296BufferSize) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = alloc; uint param_1 = offset; @@ -257,7 +251,7 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor { return 0u; } - uint v = v_296.memory[offset]; + uint v = v_283.memory[offset]; return v; } @@ -276,39 +270,39 @@ BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const } static inline __attribute__((always_inline)) -BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize); BinInstance s; s.element_ix = raw0; return s; } static inline __attribute__((always_inline)) -AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +AnnotatedTag Annotated_tag(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); - uint tag_and_flags = read_mem(param, param_1, v_296, v_296BufferSize); + uint tag_and_flags = read_mem(param, param_1, v_283, v_283BufferSize); return AnnotatedTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; } static inline __attribute__((always_inline)) -Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize); Alloc param_4 = a; uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); s.tiles = TileRef{ raw2 }; @@ -321,24 +315,24 @@ void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a) } static inline __attribute__((always_inline)) -Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_296, constant uint& v_296BufferSize) +Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_283, constant uint& v_283BufferSize) { uint param = 0u; - uint param_1 = uint(int((v_296BufferSize - 8) / 4) * 4); + uint param_1 = uint(int((v_283BufferSize - 8) / 4) * 4); bool param_2 = mem_ok; return new_alloc(param, param_1, param_2); } static inline __attribute__((always_inline)) -Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize); Tile s; s.tile = TileSegRef{ raw0 }; s.backdrop = int(raw1); @@ -346,27 +340,27 @@ Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& } static inline __attribute__((always_inline)) -AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize); Alloc param_4 = a; uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize); Alloc param_6 = a; uint param_7 = ix + 3u; - uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize); Alloc param_8 = a; uint param_9 = ix + 4u; - uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize); Alloc param_10 = a; uint param_11 = ix + 5u; - uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize); + uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize); AnnoColor s; s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); s.linewidth = as_type(raw4); @@ -375,34 +369,34 @@ AnnoColor AnnoColor_read(thread const Alloc& a, thread const AnnoColorRef& ref, } static inline __attribute__((always_inline)) -AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +AnnoColor Annotated_Color_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; AnnoColorRef param_1 = AnnoColorRef{ ref.offset + 4u }; - return AnnoColor_read(param, param_1, v_296, v_296BufferSize); + return AnnoColor_read(param, param_1, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -MallocResult malloc(thread const uint& size, device Memory& v_296, constant uint& v_296BufferSize) +MallocResult malloc(thread const uint& size, device Memory& v_283, constant uint& v_283BufferSize) { - uint _302 = atomic_fetch_add_explicit((device atomic_uint*)&v_296.mem_offset, size, memory_order_relaxed); - uint offset = _302; + uint _289 = atomic_fetch_add_explicit((device atomic_uint*)&v_283.mem_offset, size, memory_order_relaxed); + uint offset = _289; MallocResult r; - r.failed = (offset + size) > uint(int((v_296BufferSize - 8) / 4) * 4); + r.failed = (offset + size) > uint(int((v_283BufferSize - 8) / 4) * 4); uint param = offset; uint param_1 = size; bool param_2 = !r.failed; r.alloc = new_alloc(param, param_1, param_2); if (r.failed) { - uint _331 = atomic_fetch_max_explicit((device atomic_uint*)&v_296.mem_error, 1u, memory_order_relaxed); + uint _318 = atomic_fetch_max_explicit((device atomic_uint*)&v_283.mem_error, 1u, memory_order_relaxed); return r; } return r; } static inline __attribute__((always_inline)) -void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_296, constant uint& v_296BufferSize) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = alloc; uint param_1 = offset; @@ -410,42 +404,42 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons { return; } - v_296.memory[offset] = val; + v_283.memory[offset] = val; } static inline __attribute__((always_inline)) -void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_296, constant uint& v_296BufferSize) +void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.new_ref; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_296, constant uint& v_296BufferSize) +void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 10u; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); Alloc param_3 = a; CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u }; CmdJump param_5 = s; - CmdJump_write(param_3, param_4, param_5, v_296, v_296BufferSize); + CmdJump_write(param_3, param_4, param_5, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_296, constant uint& v_296BufferSize) +bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_283, constant uint& v_283BufferSize) { if (cmd_ref.offset < cmd_limit) { return true; } uint param = 1024u; - MallocResult _1156 = malloc(param, v_296, v_296BufferSize); - MallocResult new_cmd = _1156; + MallocResult _1076 = malloc(param, v_283, v_283BufferSize); + MallocResult new_cmd = _1076; if (new_cmd.failed) { return false; @@ -454,7 +448,7 @@ bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd Alloc param_1 = cmd_alloc; CmdRef param_2 = cmd_ref; CmdJump param_3 = jump; - Cmd_Jump_write(param_1, param_2, param_3, v_296, v_296BufferSize); + Cmd_Jump_write(param_1, param_2, param_3, v_283, v_283BufferSize); cmd_alloc = new_cmd.alloc; cmd_ref = CmdRef{ cmd_alloc.offset }; cmd_limit = (cmd_alloc.offset + 1024u) - 60u; @@ -468,70 +462,70 @@ uint fill_mode_from_flags(thread const uint& flags) } static inline __attribute__((always_inline)) -void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_296, constant uint& v_296BufferSize) +void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.tile_ref; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = uint(s.backdrop); - write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); + write_mem(param_3, param_4, param_5, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_296, constant uint& v_296BufferSize) +void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 1u; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); Alloc param_3 = a; CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u }; CmdFill param_5 = s; - CmdFill_write(param_3, param_4, param_5, v_296, v_296BufferSize); + CmdFill_write(param_3, param_4, param_5, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 3u; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_296, constant uint& v_296BufferSize) +void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.tile_ref; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.half_width); - write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); + write_mem(param_3, param_4, param_5, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_296, constant uint& v_296BufferSize) +void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 2u; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); Alloc param_3 = a; CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u }; CmdStroke param_5 = s; - CmdStroke_write(param_3, param_4, param_5, v_296, v_296BufferSize); + CmdStroke_write(param_3, param_4, param_5, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_296, constant uint& v_296BufferSize) +void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const uint& flags, thread const Tile& tile, thread const float& linewidth, device Memory& v_283, constant uint& v_283BufferSize) { uint param = flags; if (fill_mode_from_flags(param) == 0u) @@ -542,14 +536,14 @@ void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Alloc param_1 = alloc; CmdRef param_2 = cmd_ref; CmdFill param_3 = cmd_fill; - Cmd_Fill_write(param_1, param_2, param_3, v_296, v_296BufferSize); + Cmd_Fill_write(param_1, param_2, param_3, v_283, v_283BufferSize); cmd_ref.offset += 12u; } else { Alloc param_4 = alloc; CmdRef param_5 = cmd_ref; - Cmd_Solid_write(param_4, param_5, v_296, v_296BufferSize); + Cmd_Solid_write(param_4, param_5, v_283, v_283BufferSize); cmd_ref.offset += 4u; } } @@ -559,65 +553,65 @@ void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Alloc param_6 = alloc; CmdRef param_7 = cmd_ref; CmdStroke param_8 = cmd_stroke; - Cmd_Stroke_write(param_6, param_7, param_8, v_296, v_296BufferSize); + Cmd_Stroke_write(param_6, param_7, param_8, v_283, v_283BufferSize); cmd_ref.offset += 12u; } } static inline __attribute__((always_inline)) -void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_296, constant uint& v_296BufferSize) +void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.rgba_color; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_296, constant uint& v_296BufferSize) +void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 5u; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); Alloc param_3 = a; CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u }; CmdColor param_5 = s; - CmdColor_write(param_3, param_4, param_5, v_296, v_296BufferSize); + CmdColor_write(param_3, param_4, param_5, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLinGradientRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize); Alloc param_4 = a; uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize); Alloc param_6 = a; uint param_7 = ix + 3u; - uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize); Alloc param_8 = a; uint param_9 = ix + 4u; - uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize); Alloc param_10 = a; uint param_11 = ix + 5u; - uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize); + uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize); Alloc param_12 = a; uint param_13 = ix + 6u; - uint raw6 = read_mem(param_12, param_13, v_296, v_296BufferSize); + uint raw6 = read_mem(param_12, param_13, v_283, v_283BufferSize); Alloc param_14 = a; uint param_15 = ix + 7u; - uint raw7 = read_mem(param_14, param_15, v_296, v_296BufferSize); + uint raw7 = read_mem(param_14, param_15, v_283, v_283BufferSize); Alloc param_16 = a; uint param_17 = ix + 8u; - uint raw8 = read_mem(param_16, param_17, v_296, v_296BufferSize); + uint raw8 = read_mem(param_16, param_17, v_283, v_283BufferSize); AnnoLinGradient s; s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); s.linewidth = as_type(raw4); @@ -629,73 +623,73 @@ AnnoLinGradient AnnoLinGradient_read(thread const Alloc& a, thread const AnnoLin } static inline __attribute__((always_inline)) -AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +AnnoLinGradient Annotated_LinGradient_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; AnnoLinGradientRef param_1 = AnnoLinGradientRef{ ref.offset + 4u }; - return AnnoLinGradient_read(param, param_1, v_296, v_296BufferSize); + return AnnoLinGradient_read(param, param_1, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_296, constant uint& v_296BufferSize) +void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.index; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.line_x); - write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); + write_mem(param_3, param_4, param_5, v_283, v_283BufferSize); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.line_y); - write_mem(param_6, param_7, param_8, v_296, v_296BufferSize); + write_mem(param_6, param_7, param_8, v_283, v_283BufferSize); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.line_c); - write_mem(param_9, param_10, param_11, v_296, v_296BufferSize); + write_mem(param_9, param_10, param_11, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_296, constant uint& v_296BufferSize) +void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 6u; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); Alloc param_3 = a; CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u }; CmdLinGrad param_5 = s; - CmdLinGrad_write(param_3, param_4, param_5, v_296, v_296BufferSize); + CmdLinGrad_write(param_3, param_4, param_5, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); + uint raw0 = read_mem(param, param_1, v_283, v_283BufferSize); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); + uint raw1 = read_mem(param_2, param_3, v_283, v_283BufferSize); Alloc param_4 = a; uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); + uint raw2 = read_mem(param_4, param_5, v_283, v_283BufferSize); Alloc param_6 = a; uint param_7 = ix + 3u; - uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); + uint raw3 = read_mem(param_6, param_7, v_283, v_283BufferSize); Alloc param_8 = a; uint param_9 = ix + 4u; - uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); + uint raw4 = read_mem(param_8, param_9, v_283, v_283BufferSize); Alloc param_10 = a; uint param_11 = ix + 5u; - uint raw5 = read_mem(param_10, param_11, v_296, v_296BufferSize); + uint raw5 = read_mem(param_10, param_11, v_283, v_283BufferSize); Alloc param_12 = a; uint param_13 = ix + 6u; - uint raw6 = read_mem(param_12, param_13, v_296, v_296BufferSize); + uint raw6 = read_mem(param_12, param_13, v_283, v_283BufferSize); AnnoImage s; s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); s.linewidth = as_type(raw4); @@ -705,101 +699,68 @@ AnnoImage AnnoImage_read(thread const Alloc& a, thread const AnnoImageRef& ref, } static inline __attribute__((always_inline)) -AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +AnnoImage Annotated_Image_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; AnnoImageRef param_1 = AnnoImageRef{ ref.offset + 4u }; - return AnnoImage_read(param, param_1, v_296, v_296BufferSize); + return AnnoImage_read(param, param_1, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_296, constant uint& v_296BufferSize) +void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_283, constant uint& v_283BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.index; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); - write_mem(param_3, param_4, param_5, v_296, v_296BufferSize); + write_mem(param_3, param_4, param_5, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_296, constant uint& v_296BufferSize) +void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 7u; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); Alloc param_3 = a; CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u }; CmdImage param_5 = s; - CmdImage_write(param_3, param_4, param_5, v_296, v_296BufferSize); + CmdImage_write(param_3, param_4, param_5, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -AnnoBeginClip AnnoBeginClip_read(thread const Alloc& a, thread const AnnoBeginClipRef& ref, device Memory& v_296, constant uint& v_296BufferSize) -{ - uint ix = ref.offset >> uint(2); - Alloc param = a; - uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_296, v_296BufferSize); - Alloc param_2 = a; - uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_296, v_296BufferSize); - Alloc param_4 = a; - uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_296, v_296BufferSize); - Alloc param_6 = a; - uint param_7 = ix + 3u; - uint raw3 = read_mem(param_6, param_7, v_296, v_296BufferSize); - Alloc param_8 = a; - uint param_9 = ix + 4u; - uint raw4 = read_mem(param_8, param_9, v_296, v_296BufferSize); - AnnoBeginClip s; - s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); - s.linewidth = as_type(raw4); - return s; -} - -static inline __attribute__((always_inline)) -AnnoBeginClip Annotated_BeginClip_read(thread const Alloc& a, thread const AnnotatedRef& ref, device Memory& v_296, constant uint& v_296BufferSize) -{ - Alloc param = a; - AnnoBeginClipRef param_1 = AnnoBeginClipRef{ ref.offset + 4u }; - return AnnoBeginClip_read(param, param_1, v_296, v_296BufferSize); -} - -static inline __attribute__((always_inline)) -void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 8u; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 9u; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); } static inline __attribute__((always_inline)) -void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_296, constant uint& v_296BufferSize) +void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_283, constant uint& v_283BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 0u; - write_mem(param, param_1, param_2, v_296, v_296BufferSize); + write_mem(param, param_1, param_2, v_283, v_283BufferSize); } -kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_296 [[buffer(0)]], const device ConfigBuf& _1249 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_283 [[buffer(0)]], const device ConfigBuf& _1169 [[buffer(1)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { threadgroup uint sh_bitmaps[8][256]; threadgroup Alloc sh_part_elements[256]; @@ -811,19 +772,19 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M threadgroup uint sh_tile_y0[256]; threadgroup uint sh_tile_base[256]; threadgroup uint sh_tile_count[256]; - constant uint& v_296BufferSize = spvBufferSizeConstants[0]; - uint width_in_bins = ((_1249.conf.width_in_tiles + 16u) - 1u) / 16u; + constant uint& v_283BufferSize = spvBufferSizeConstants[0]; + uint width_in_bins = ((_1169.conf.width_in_tiles + 16u) - 1u) / 16u; uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; uint partition_ix = 0u; - uint n_partitions = ((_1249.conf.n_elements + 256u) - 1u) / 256u; + uint n_partitions = ((_1169.conf.n_elements + 256u) - 1u) / 256u; uint th_ix = gl_LocalInvocationID.x; uint bin_tile_x = 16u * gl_WorkGroupID.x; uint bin_tile_y = 16u * gl_WorkGroupID.y; uint tile_x = gl_LocalInvocationID.x % 16u; uint tile_y = gl_LocalInvocationID.x / 16u; - uint this_tile_ix = (((bin_tile_y + tile_y) * _1249.conf.width_in_tiles) + bin_tile_x) + tile_x; + uint this_tile_ix = (((bin_tile_y + tile_y) * _1169.conf.width_in_tiles) + bin_tile_x) + tile_x; Alloc param; - param.offset = _1249.conf.ptcl_alloc.offset; + param.offset = _1169.conf.ptcl_alloc.offset; uint param_1 = this_tile_ix * 1024u; uint param_2 = 1024u; Alloc cmd_alloc = slice_mem(param, param_1, param_2); @@ -831,22 +792,21 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint cmd_limit = (cmd_ref.offset + 1024u) - 60u; uint clip_depth = 0u; uint clip_zero_depth = 0u; - uint clip_one_mask = 0u; uint rd_ix = 0u; uint wr_ix = 0u; uint part_start_ix = 0u; uint ready_ix = 0u; - bool mem_ok = v_296.mem_error == 0u; + bool mem_ok = v_283.mem_error == 0u; Alloc param_3; Alloc param_5; - uint _1529; + uint _1448; uint element_ix; AnnotatedRef ref; Alloc param_14; Alloc param_16; uint tile_count; Alloc param_23; - uint _1841; + uint _1770; Alloc param_29; Tile tile_1; AnnoColor fill; @@ -854,39 +814,38 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_52; CmdLinGrad cmd_lin; Alloc param_69; - Alloc param_86; while (true) { for (uint i = 0u; i < 8u; i++) { sh_bitmaps[i][th_ix] = 0u; } - bool _1581; + bool _1500; for (;;) { if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) { part_start_ix = ready_ix; uint count = 0u; - bool _1379 = th_ix < 256u; - bool _1387; - if (_1379) + bool _1298 = th_ix < 256u; + bool _1306; + if (_1298) { - _1387 = (partition_ix + th_ix) < n_partitions; + _1306 = (partition_ix + th_ix) < n_partitions; } else { - _1387 = _1379; + _1306 = _1298; } - if (_1387) + if (_1306) { - uint in_ix = (_1249.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); - param_3.offset = _1249.conf.bin_alloc.offset; + uint in_ix = (_1169.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + param_3.offset = _1169.conf.bin_alloc.offset; uint param_4 = in_ix; - count = read_mem(param_3, param_4, v_296, v_296BufferSize); - param_5.offset = _1249.conf.bin_alloc.offset; + count = read_mem(param_3, param_4, v_283, v_283BufferSize); + param_5.offset = _1169.conf.bin_alloc.offset; uint param_6 = in_ix + 1u; - uint offset = read_mem(param_5, param_6, v_296, v_296BufferSize); + uint offset = read_mem(param_5, param_6, v_283, v_283BufferSize); uint param_7 = offset; uint param_8 = count * 4u; bool param_9 = mem_ok; @@ -930,34 +889,34 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } if (part_ix > 0u) { - _1529 = sh_part_count[part_ix - 1u]; + _1448 = sh_part_count[part_ix - 1u]; } else { - _1529 = part_start_ix; + _1448 = part_start_ix; } - ix -= _1529; + ix -= _1448; Alloc bin_alloc = sh_part_elements[part_ix]; BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset }; BinInstanceRef param_10 = inst_ref; uint param_11 = ix; Alloc param_12 = bin_alloc; BinInstanceRef param_13 = BinInstance_index(param_10, param_11); - BinInstance inst = BinInstance_read(param_12, param_13, v_296, v_296BufferSize); + BinInstance inst = BinInstance_read(param_12, param_13, v_283, v_283BufferSize); sh_elements[th_ix] = inst.element_ix; } threadgroup_barrier(mem_flags::mem_threadgroup); wr_ix = min((rd_ix + 256u), ready_ix); - bool _1571 = (wr_ix - rd_ix) < 256u; - if (_1571) + bool _1490 = (wr_ix - rd_ix) < 256u; + if (_1490) { - _1581 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + _1500 = (wr_ix < ready_ix) || (partition_ix < n_partitions); } else { - _1581 = _1571; + _1500 = _1490; } - if (_1581) + if (_1500) { continue; } @@ -970,10 +929,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M if ((th_ix + rd_ix) < wr_ix) { element_ix = sh_elements[th_ix]; - ref = AnnotatedRef{ _1249.conf.anno_alloc.offset + (element_ix * 40u) }; - param_14.offset = _1249.conf.anno_alloc.offset; + ref = AnnotatedRef{ _1169.conf.anno_alloc.offset + (element_ix * 40u) }; + param_14.offset = _1169.conf.anno_alloc.offset; AnnotatedRef param_15 = ref; - tag = Annotated_tag(param_14, param_15, v_296, v_296BufferSize).tag; + tag = Annotated_tag(param_14, param_15, v_283, v_283BufferSize).tag; } switch (tag) { @@ -983,10 +942,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M case 4u: case 5u: { - uint path_ix = element_ix; - param_16.offset = _1249.conf.tile_alloc.offset; - PathRef param_17 = PathRef{ _1249.conf.tile_alloc.offset + (path_ix * 12u) }; - Path path = Path_read(param_16, param_17, v_296, v_296BufferSize); + uint drawmonoid_base = (_1169.conf.drawmonoid_alloc.offset >> uint(2)) + (2u * element_ix); + uint path_ix = v_283.memory[drawmonoid_base]; + param_16.offset = _1169.conf.tile_alloc.offset; + PathRef param_17 = PathRef{ _1169.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_16, param_17, v_283, v_283BufferSize); uint stride = path.bbox.z - path.bbox.x; sh_tile_stride[th_ix] = stride; int dx = int(path.bbox.x) - int(bin_tile_x); @@ -1040,54 +1000,48 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M el_ix = probe_1; } } - AnnotatedRef ref_1 = AnnotatedRef{ _1249.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) }; - param_23.offset = _1249.conf.anno_alloc.offset; + AnnotatedRef ref_1 = AnnotatedRef{ _1169.conf.anno_alloc.offset + (sh_elements[el_ix] * 40u) }; + param_23.offset = _1169.conf.anno_alloc.offset; AnnotatedRef param_24 = ref_1; - uint tag_1 = Annotated_tag(param_23, param_24, v_296, v_296BufferSize).tag; + uint tag_1 = Annotated_tag(param_23, param_24, v_283, v_283BufferSize).tag; if (el_ix > 0u) { - _1841 = sh_tile_count[el_ix - 1u]; + _1770 = sh_tile_count[el_ix - 1u]; } else { - _1841 = 0u; + _1770 = 0u; } - uint seq_ix = ix_1 - _1841; + uint seq_ix = ix_1 - _1770; uint width = sh_tile_width[el_ix]; uint x = sh_tile_x0[el_ix] + (seq_ix % width); uint y = sh_tile_y0[el_ix] + (seq_ix / width); bool include_tile = false; - if ((tag_1 == 4u) || (tag_1 == 5u)) + if (mem_ok) { - include_tile = true; - } - else - { - if (mem_ok) + uint param_25 = el_ix; + bool param_26 = mem_ok; + Alloc param_27 = read_tile_alloc(param_25, param_26, v_283, v_283BufferSize); + TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Tile tile = Tile_read(param_27, param_28, v_283, v_283BufferSize); + bool is_clip = (tag_1 == 4u) || (tag_1 == 5u); + bool _1834 = tile.tile.offset != 0u; + bool _1843; + if (!_1834) { - uint param_25 = el_ix; - bool param_26 = mem_ok; - Alloc param_27 = read_tile_alloc(param_25, param_26, v_296, v_296BufferSize); - TileRef param_28 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; - Tile tile = Tile_read(param_27, param_28, v_296, v_296BufferSize); - bool _1907 = tile.tile.offset != 0u; - bool _1914; - if (!_1907) - { - _1914 = tile.backdrop != 0; - } - else - { - _1914 = _1907; - } - include_tile = _1914; + _1843 = (tile.backdrop == 0) == is_clip; } + else + { + _1843 = _1834; + } + include_tile = _1843; } if (include_tile) { uint el_slice = el_ix / 32u; uint el_mask = 1u << (el_ix & 31u); - uint _1934 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); + uint _1863 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); } } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -1111,10 +1065,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap))); uint element_ix_1 = sh_elements[element_ref_ix]; bitmap &= (bitmap - 1u); - ref = AnnotatedRef{ _1249.conf.anno_alloc.offset + (element_ix_1 * 40u) }; - param_29.offset = _1249.conf.anno_alloc.offset; + ref = AnnotatedRef{ _1169.conf.anno_alloc.offset + (element_ix_1 * 40u) }; + param_29.offset = _1169.conf.anno_alloc.offset; AnnotatedRef param_30 = ref; - AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_296, v_296BufferSize); + AnnotatedTag tag_2 = Annotated_tag(param_29, param_30, v_283, v_283BufferSize); if (clip_zero_depth == 0u) { switch (tag_2.tag) @@ -1123,20 +1077,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { uint param_31 = element_ref_ix; bool param_32 = mem_ok; - Alloc param_33 = read_tile_alloc(param_31, param_32, v_296, v_296BufferSize); + Alloc param_33 = read_tile_alloc(param_31, param_32, v_283, v_283BufferSize); TileRef param_34 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; - tile_1 = Tile_read(param_33, param_34, v_296, v_296BufferSize); - param_35.offset = _1249.conf.anno_alloc.offset; + tile_1 = Tile_read(param_33, param_34, v_283, v_283BufferSize); + param_35.offset = _1169.conf.anno_alloc.offset; AnnotatedRef param_36 = ref; - fill = Annotated_Color_read(param_35, param_36, v_296, v_296BufferSize); + fill = Annotated_Color_read(param_35, param_36, v_283, v_283BufferSize); Alloc param_37 = cmd_alloc; CmdRef param_38 = cmd_ref; uint param_39 = cmd_limit; - bool _2048 = alloc_cmd(param_37, param_38, param_39, v_296, v_296BufferSize); + bool _1977 = alloc_cmd(param_37, param_38, param_39, v_283, v_283BufferSize); cmd_alloc = param_37; cmd_ref = param_38; cmd_limit = param_39; - if (!_2048) + if (!_1977) { break; } @@ -1145,12 +1099,12 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint param_42 = tag_2.flags; Tile param_43 = tile_1; float param_44 = fill.linewidth; - write_fill(param_40, param_41, param_42, param_43, param_44, v_296, v_296BufferSize); + write_fill(param_40, param_41, param_42, param_43, param_44, v_283, v_283BufferSize); cmd_ref = param_41; Alloc param_45 = cmd_alloc; CmdRef param_46 = cmd_ref; CmdColor param_47 = CmdColor{ fill.rgba_color }; - Cmd_Color_write(param_45, param_46, param_47, v_296, v_296BufferSize); + Cmd_Color_write(param_45, param_46, param_47, v_283, v_283BufferSize); cmd_ref.offset += 8u; break; } @@ -1158,20 +1112,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { uint param_48 = element_ref_ix; bool param_49 = mem_ok; - Alloc param_50 = read_tile_alloc(param_48, param_49, v_296, v_296BufferSize); + Alloc param_50 = read_tile_alloc(param_48, param_49, v_283, v_283BufferSize); TileRef param_51 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; - tile_1 = Tile_read(param_50, param_51, v_296, v_296BufferSize); - param_52.offset = _1249.conf.anno_alloc.offset; + tile_1 = Tile_read(param_50, param_51, v_283, v_283BufferSize); + param_52.offset = _1169.conf.anno_alloc.offset; AnnotatedRef param_53 = ref; - AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_296, v_296BufferSize); + AnnoLinGradient lin = Annotated_LinGradient_read(param_52, param_53, v_283, v_283BufferSize); Alloc param_54 = cmd_alloc; CmdRef param_55 = cmd_ref; uint param_56 = cmd_limit; - bool _2120 = alloc_cmd(param_54, param_55, param_56, v_296, v_296BufferSize); + bool _2049 = alloc_cmd(param_54, param_55, param_56, v_283, v_283BufferSize); cmd_alloc = param_54; cmd_ref = param_55; cmd_limit = param_56; - if (!_2120) + if (!_2049) { break; } @@ -1180,7 +1134,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint param_59 = tag_2.flags; Tile param_60 = tile_1; float param_61 = fill.linewidth; - write_fill(param_57, param_58, param_59, param_60, param_61, v_296, v_296BufferSize); + write_fill(param_57, param_58, param_59, param_60, param_61, v_283, v_283BufferSize); cmd_ref = param_58; cmd_lin.index = lin.index; cmd_lin.line_x = lin.line_x; @@ -1189,7 +1143,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_62 = cmd_alloc; CmdRef param_63 = cmd_ref; CmdLinGrad param_64 = cmd_lin; - Cmd_LinGrad_write(param_62, param_63, param_64, v_296, v_296BufferSize); + Cmd_LinGrad_write(param_62, param_63, param_64, v_283, v_283BufferSize); cmd_ref.offset += 20u; break; } @@ -1197,20 +1151,20 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { uint param_65 = element_ref_ix; bool param_66 = mem_ok; - Alloc param_67 = read_tile_alloc(param_65, param_66, v_296, v_296BufferSize); + Alloc param_67 = read_tile_alloc(param_65, param_66, v_283, v_283BufferSize); TileRef param_68 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; - tile_1 = Tile_read(param_67, param_68, v_296, v_296BufferSize); - param_69.offset = _1249.conf.anno_alloc.offset; + tile_1 = Tile_read(param_67, param_68, v_283, v_283BufferSize); + param_69.offset = _1169.conf.anno_alloc.offset; AnnotatedRef param_70 = ref; - AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_296, v_296BufferSize); + AnnoImage fill_img = Annotated_Image_read(param_69, param_70, v_283, v_283BufferSize); Alloc param_71 = cmd_alloc; CmdRef param_72 = cmd_ref; uint param_73 = cmd_limit; - bool _2204 = alloc_cmd(param_71, param_72, param_73, v_296, v_296BufferSize); + bool _2133 = alloc_cmd(param_71, param_72, param_73, v_283, v_283BufferSize); cmd_alloc = param_71; cmd_ref = param_72; cmd_limit = param_73; - if (!_2204) + if (!_2133) { break; } @@ -1219,12 +1173,12 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint param_76 = tag_2.flags; Tile param_77 = tile_1; float param_78 = fill_img.linewidth; - write_fill(param_74, param_75, param_76, param_77, param_78, v_296, v_296BufferSize); + write_fill(param_74, param_75, param_76, param_77, param_78, v_283, v_283BufferSize); cmd_ref = param_75; Alloc param_79 = cmd_alloc; CmdRef param_80 = cmd_ref; CmdImage param_81 = CmdImage{ fill_img.index, fill_img.offset }; - Cmd_Image_write(param_79, param_80, param_81, v_296, v_296BufferSize); + Cmd_Image_write(param_79, param_80, param_81, v_283, v_283BufferSize); cmd_ref.offset += 12u; break; } @@ -1232,100 +1186,74 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { uint param_82 = element_ref_ix; bool param_83 = mem_ok; - Alloc param_84 = read_tile_alloc(param_82, param_83, v_296, v_296BufferSize); + Alloc param_84 = read_tile_alloc(param_82, param_83, v_283, v_283BufferSize); TileRef param_85 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; - tile_1 = Tile_read(param_84, param_85, v_296, v_296BufferSize); - bool _2265 = tile_1.tile.offset == 0u; - bool _2271; - if (_2265) + tile_1 = Tile_read(param_84, param_85, v_283, v_283BufferSize); + bool _2194 = tile_1.tile.offset == 0u; + bool _2200; + if (_2194) { - _2271 = tile_1.backdrop == 0; + _2200 = tile_1.backdrop == 0; } else { - _2271 = _2265; + _2200 = _2194; } - if (_2271) + if (_2200) { clip_zero_depth = clip_depth + 1u; } else { - if ((tile_1.tile.offset == 0u) && (clip_depth < 32u)) + Alloc param_86 = cmd_alloc; + CmdRef param_87 = cmd_ref; + uint param_88 = cmd_limit; + bool _2212 = alloc_cmd(param_86, param_87, param_88, v_283, v_283BufferSize); + cmd_alloc = param_86; + cmd_ref = param_87; + cmd_limit = param_88; + if (!_2212) { - clip_one_mask |= (1u << clip_depth); - } - else - { - param_86.offset = _1249.conf.anno_alloc.offset; - AnnotatedRef param_87 = ref; - AnnoBeginClip begin_clip = Annotated_BeginClip_read(param_86, param_87, v_296, v_296BufferSize); - Alloc param_88 = cmd_alloc; - CmdRef param_89 = cmd_ref; - uint param_90 = cmd_limit; - bool _2305 = alloc_cmd(param_88, param_89, param_90, v_296, v_296BufferSize); - cmd_alloc = param_88; - cmd_ref = param_89; - cmd_limit = param_90; - if (!_2305) - { - break; - } - Alloc param_91 = cmd_alloc; - CmdRef param_92 = cmd_ref; - uint param_93 = tag_2.flags; - Tile param_94 = tile_1; - float param_95 = begin_clip.linewidth; - write_fill(param_91, param_92, param_93, param_94, param_95, v_296, v_296BufferSize); - cmd_ref = param_92; - Alloc param_96 = cmd_alloc; - CmdRef param_97 = cmd_ref; - Cmd_BeginClip_write(param_96, param_97, v_296, v_296BufferSize); - cmd_ref.offset += 4u; - if (clip_depth < 32u) - { - clip_one_mask &= (~(1u << clip_depth)); - } + break; } + Alloc param_89 = cmd_alloc; + CmdRef param_90 = cmd_ref; + Cmd_BeginClip_write(param_89, param_90, v_283, v_283BufferSize); + cmd_ref.offset += 4u; } clip_depth++; break; } case 5u: { + uint param_91 = element_ref_ix; + bool param_92 = mem_ok; + Alloc param_93 = read_tile_alloc(param_91, param_92, v_283, v_283BufferSize); + TileRef param_94 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + tile_1 = Tile_read(param_93, param_94, v_283, v_283BufferSize); clip_depth--; - bool _2351 = clip_depth >= 32u; - bool _2360; - if (!_2351) + Alloc param_95 = cmd_alloc; + CmdRef param_96 = cmd_ref; + uint param_97 = cmd_limit; + bool _2261 = alloc_cmd(param_95, param_96, param_97, v_283, v_283BufferSize); + cmd_alloc = param_95; + cmd_ref = param_96; + cmd_limit = param_97; + if (!_2261) { - _2360 = (clip_one_mask & (1u << clip_depth)) == 0u; - } - else - { - _2360 = _2351; - } - if (_2360) - { - Alloc param_98 = cmd_alloc; - CmdRef param_99 = cmd_ref; - uint param_100 = cmd_limit; - bool _2369 = alloc_cmd(param_98, param_99, param_100, v_296, v_296BufferSize); - cmd_alloc = param_98; - cmd_ref = param_99; - cmd_limit = param_100; - if (!_2369) - { - break; - } - Alloc param_101 = cmd_alloc; - CmdRef param_102 = cmd_ref; - Cmd_Solid_write(param_101, param_102, v_296, v_296BufferSize); - cmd_ref.offset += 4u; - Alloc param_103 = cmd_alloc; - CmdRef param_104 = cmd_ref; - Cmd_EndClip_write(param_103, param_104, v_296, v_296BufferSize); - cmd_ref.offset += 4u; + break; } + Alloc param_98 = cmd_alloc; + CmdRef param_99 = cmd_ref; + uint param_100 = 0u; + Tile param_101 = tile_1; + float param_102 = 0.0; + write_fill(param_98, param_99, param_100, param_101, param_102, v_283, v_283BufferSize); + cmd_ref = param_99; + Alloc param_103 = cmd_alloc; + CmdRef param_104 = cmd_ref; + Cmd_EndClip_write(param_103, param_104, v_283, v_283BufferSize); + cmd_ref.offset += 4u; break; } } @@ -1358,21 +1286,21 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M break; } } - bool _2432 = (bin_tile_x + tile_x) < _1249.conf.width_in_tiles; - bool _2441; - if (_2432) + bool _2326 = (bin_tile_x + tile_x) < _1169.conf.width_in_tiles; + bool _2335; + if (_2326) { - _2441 = (bin_tile_y + tile_y) < _1249.conf.height_in_tiles; + _2335 = (bin_tile_y + tile_y) < _1169.conf.height_in_tiles; } else { - _2441 = _2432; + _2335 = _2326; } - if (_2441) + if (_2335) { Alloc param_105 = cmd_alloc; CmdRef param_106 = cmd_ref; - Cmd_End_write(param_105, param_106, v_296, v_296BufferSize); + Cmd_End_write(param_105, param_106, v_283, v_283BufferSize); } } diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv index fbe025d..b30e2d8 100644 Binary files a/piet-gpu/shader/gen/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil index 17bfd04..d901a80 100644 Binary files a/piet-gpu/shader/gen/draw_leaf.dxil and b/piet-gpu/shader/gen/draw_leaf.dxil differ diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl index d0bef52..0ca5843 100644 --- a/piet-gpu/shader/gen/draw_leaf.hlsl +++ b/piet-gpu/shader/gen/draw_leaf.hlsl @@ -41,16 +41,6 @@ struct FillImage int2 offset; }; -struct ClipRef -{ - uint offset; -}; - -struct Clip -{ - float4 bbox; -}; - struct ElementTag { uint tag; @@ -143,8 +133,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -153,14 +148,14 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -static const DrawMonoid _418 = { 0u, 0u }; -static const DrawMonoid _442 = { 1u, 0u }; -static const DrawMonoid _444 = { 1u, 1u }; +static const DrawMonoid _348 = { 0u, 0u }; +static const DrawMonoid _372 = { 1u, 0u }; +static const DrawMonoid _374 = { 1u, 1u }; -RWByteAddressBuffer _201 : register(u0, space0); -ByteAddressBuffer _225 : register(t2, space0); -ByteAddressBuffer _1004 : register(t3, space0); -ByteAddressBuffer _1038 : register(t1, space0); +RWByteAddressBuffer _187 : register(u0, space0); +ByteAddressBuffer _211 : register(t2, space0); +ByteAddressBuffer _934 : register(t3, space0); +ByteAddressBuffer _968 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -176,9 +171,9 @@ groupshared DrawMonoid sh_scratch[256]; ElementTag Element_tag(ElementRef ref) { - uint tag_and_flags = _225.Load((ref.offset >> uint(2)) * 4 + 0); - ElementTag _375 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; - return _375; + uint tag_and_flags = _211.Load((ref.offset >> uint(2)) * 4 + 0); + ElementTag _321 = { tag_and_flags & 65535u, tag_and_flags >> uint(16) }; + return _321; } DrawMonoid map_tag(uint tag_word) @@ -189,24 +184,24 @@ DrawMonoid map_tag(uint tag_word) case 5u: case 6u: { - return _442; + return _372; } case 9u: case 10u: { - return _444; + return _374; } default: { - return _418; + return _348; } } } ElementRef Element_index(ElementRef ref, uint index) { - ElementRef _214 = { ref.offset + (index * 36u) }; - return _214; + ElementRef _200 = { ref.offset + (index * 36u) }; + return _200; } DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b) @@ -219,13 +214,13 @@ DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b) DrawMonoid tag_monoid_identity() { - return _418; + return _348; } FillColor FillColor_read(FillColorRef ref) { uint ix = ref.offset >> uint(2); - uint raw0 = _225.Load((ix + 0u) * 4 + 0); + uint raw0 = _211.Load((ix + 0u) * 4 + 0); FillColor s; s.rgba_color = raw0; return s; @@ -233,8 +228,8 @@ FillColor FillColor_read(FillColorRef ref) FillColor Element_FillColor_read(ElementRef ref) { - FillColorRef _381 = { ref.offset + 4u }; - FillColorRef param = _381; + FillColorRef _327 = { ref.offset + 4u }; + FillColorRef param = _327; return FillColor_read(param); } @@ -251,7 +246,7 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _201.Store(offset * 4 + 8, val); + _187.Store(offset * 4 + 8, val); } void AnnoColor_write(Alloc a, AnnoColorRef ref, AnnoColor s) @@ -289,9 +284,9 @@ void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s) uint param_1 = ref.offset >> uint(2); uint param_2 = (flags << uint(16)) | 1u; write_mem(param, param_1, param_2); - AnnoColorRef _805 = { ref.offset + 4u }; + AnnoColorRef _735 = { ref.offset + 4u }; Alloc param_3 = a; - AnnoColorRef param_4 = _805; + AnnoColorRef param_4 = _735; AnnoColor param_5 = s; AnnoColor_write(param_3, param_4, param_5); } @@ -299,11 +294,11 @@ void Annotated_Color_write(Alloc a, AnnotatedRef ref, uint flags, AnnoColor s) FillLinGradient FillLinGradient_read(FillLinGradientRef ref) { uint ix = ref.offset >> uint(2); - uint raw0 = _225.Load((ix + 0u) * 4 + 0); - uint raw1 = _225.Load((ix + 1u) * 4 + 0); - uint raw2 = _225.Load((ix + 2u) * 4 + 0); - uint raw3 = _225.Load((ix + 3u) * 4 + 0); - uint raw4 = _225.Load((ix + 4u) * 4 + 0); + uint raw0 = _211.Load((ix + 0u) * 4 + 0); + uint raw1 = _211.Load((ix + 1u) * 4 + 0); + uint raw2 = _211.Load((ix + 2u) * 4 + 0); + uint raw3 = _211.Load((ix + 3u) * 4 + 0); + uint raw4 = _211.Load((ix + 4u) * 4 + 0); FillLinGradient s; s.index = raw0; s.p0 = float2(asfloat(raw1), asfloat(raw2)); @@ -313,8 +308,8 @@ FillLinGradient FillLinGradient_read(FillLinGradientRef ref) FillLinGradient Element_FillLinGradient_read(ElementRef ref) { - FillLinGradientRef _389 = { ref.offset + 4u }; - FillLinGradientRef param = _389; + FillLinGradientRef _335 = { ref.offset + 4u }; + FillLinGradientRef param = _335; return FillLinGradient_read(param); } @@ -365,9 +360,9 @@ void Annotated_LinGradient_write(Alloc a, AnnotatedRef ref, uint flags, AnnoLinG uint param_1 = ref.offset >> uint(2); uint param_2 = (flags << uint(16)) | 2u; write_mem(param, param_1, param_2); - AnnoLinGradientRef _826 = { ref.offset + 4u }; + AnnoLinGradientRef _756 = { ref.offset + 4u }; Alloc param_3 = a; - AnnoLinGradientRef param_4 = _826; + AnnoLinGradientRef param_4 = _756; AnnoLinGradient param_5 = s; AnnoLinGradient_write(param_3, param_4, param_5); } @@ -375,8 +370,8 @@ void Annotated_LinGradient_write(Alloc a, AnnotatedRef ref, uint flags, AnnoLinG FillImage FillImage_read(FillImageRef ref) { uint ix = ref.offset >> uint(2); - uint raw0 = _225.Load((ix + 0u) * 4 + 0); - uint raw1 = _225.Load((ix + 1u) * 4 + 0); + uint raw0 = _211.Load((ix + 0u) * 4 + 0); + uint raw1 = _211.Load((ix + 1u) * 4 + 0); FillImage s; s.index = raw0; s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); @@ -385,8 +380,8 @@ FillImage FillImage_read(FillImageRef ref) FillImage Element_FillImage_read(ElementRef ref) { - FillImageRef _397 = { ref.offset + 4u }; - FillImageRef param = _397; + FillImageRef _343 = { ref.offset + 4u }; + FillImageRef param = _343; return FillImage_read(param); } @@ -429,32 +424,13 @@ void Annotated_Image_write(Alloc a, AnnotatedRef ref, uint flags, AnnoImage s) uint param_1 = ref.offset >> uint(2); uint param_2 = (flags << uint(16)) | 3u; write_mem(param, param_1, param_2); - AnnoImageRef _847 = { ref.offset + 4u }; + AnnoImageRef _777 = { ref.offset + 4u }; Alloc param_3 = a; - AnnoImageRef param_4 = _847; + AnnoImageRef param_4 = _777; AnnoImage param_5 = s; AnnoImage_write(param_3, param_4, param_5); } -Clip Clip_read(ClipRef ref) -{ - uint ix = ref.offset >> uint(2); - uint raw0 = _225.Load((ix + 0u) * 4 + 0); - uint raw1 = _225.Load((ix + 1u) * 4 + 0); - uint raw2 = _225.Load((ix + 2u) * 4 + 0); - uint raw3 = _225.Load((ix + 3u) * 4 + 0); - Clip s; - s.bbox = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); - return s; -} - -Clip Element_BeginClip_read(ElementRef ref) -{ - ClipRef _405 = { ref.offset + 4u }; - ClipRef param = _405; - return Clip_read(param); -} - void AnnoBeginClip_write(Alloc a, AnnoBeginClipRef ref, AnnoBeginClip s) { uint ix = ref.offset >> uint(2); @@ -486,20 +462,13 @@ void Annotated_BeginClip_write(Alloc a, AnnotatedRef ref, uint flags, AnnoBeginC uint param_1 = ref.offset >> uint(2); uint param_2 = (flags << uint(16)) | 4u; write_mem(param, param_1, param_2); - AnnoBeginClipRef _868 = { ref.offset + 4u }; + AnnoBeginClipRef _798 = { ref.offset + 4u }; Alloc param_3 = a; - AnnoBeginClipRef param_4 = _868; + AnnoBeginClipRef param_4 = _798; AnnoBeginClip param_5 = s; AnnoBeginClip_write(param_3, param_4, param_5); } -Clip Element_EndClip_read(ElementRef ref) -{ - ClipRef _413 = { ref.offset + 4u }; - ClipRef param = _413; - return Clip_read(param); -} - void AnnoEndClip_write(Alloc a, AnnoEndClipRef ref, AnnoEndClip s) { uint ix = ref.offset >> uint(2); @@ -527,9 +496,9 @@ void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s) uint param_1 = ref.offset >> uint(2); uint param_2 = 5u; write_mem(param, param_1, param_2); - AnnoEndClipRef _886 = { ref.offset + 4u }; + AnnoEndClipRef _816 = { ref.offset + 4u }; Alloc param_3 = a; - AnnoEndClipRef param_4 = _886; + AnnoEndClipRef param_4 = _816; AnnoEndClip param_5 = s; AnnoEndClip_write(param_3, param_4, param_5); } @@ -537,8 +506,8 @@ void Annotated_EndClip_write(Alloc a, AnnotatedRef ref, AnnoEndClip s) void comp_main() { uint ix = gl_GlobalInvocationID.x * 8u; - ElementRef _904 = { ix * 36u }; - ElementRef ref = _904; + ElementRef _834 = { ix * 36u }; + ElementRef ref = _834; ElementRef param = ref; uint tag_word = Element_tag(param).tag; uint param_1 = tag_word; @@ -575,11 +544,11 @@ void comp_main() DrawMonoid row = tag_monoid_identity(); if (gl_WorkGroupID.x > 0u) { - DrawMonoid _1010; - _1010.path_ix = _1004.Load((gl_WorkGroupID.x - 1u) * 8 + 0); - _1010.clip_ix = _1004.Load((gl_WorkGroupID.x - 1u) * 8 + 4); - row.path_ix = _1010.path_ix; - row.clip_ix = _1010.clip_ix; + DrawMonoid _940; + _940.path_ix = _934.Load((gl_WorkGroupID.x - 1u) * 8 + 0); + _940.clip_ix = _934.Load((gl_WorkGroupID.x - 1u) * 8 + 4); + row.path_ix = _940.path_ix; + row.clip_ix = _940.clip_ix; } if (gl_LocalInvocationID.x > 0u) { @@ -588,9 +557,10 @@ void comp_main() row = combine_tag_monoid(param_10, param_11); } uint out_ix = gl_GlobalInvocationID.x * 8u; - uint out_base = (_1038.Load(44) >> uint(2)) + (out_ix * 2u); - AnnotatedRef _1054 = { _1038.Load(32) + (out_ix * 40u) }; - AnnotatedRef out_ref = _1054; + uint out_base = (_968.Load(44) >> uint(2)) + (out_ix * 2u); + uint clip_out_base = _968.Load(48) >> uint(2); + AnnotatedRef _989 = { _968.Load(32) + (out_ix * 40u) }; + AnnotatedRef out_ref = _989; float4 mat; float2 translate; AnnoColor anno_fill; @@ -600,39 +570,43 @@ void comp_main() AnnoImage anno_img; Alloc param_28; AnnoBeginClip anno_begin_clip; - Alloc param_33; + Alloc param_32; AnnoEndClip anno_end_clip; - Alloc param_38; + Alloc param_36; for (uint i_2 = 0u; i_2 < 8u; i_2++) { - DrawMonoid param_12 = row; - DrawMonoid param_13 = local[i_2]; - DrawMonoid m = combine_tag_monoid(param_12, param_13); - _201.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix); - _201.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix); + DrawMonoid m = row; + if (i_2 > 0u) + { + DrawMonoid param_12 = m; + DrawMonoid param_13 = local[i_2 - 1u]; + m = combine_tag_monoid(param_12, param_13); + } + _187.Store((out_base + (i_2 * 2u)) * 4 + 8, m.path_ix); + _187.Store(((out_base + (i_2 * 2u)) + 1u) * 4 + 8, m.clip_ix); ElementRef param_14 = ref; uint param_15 = i_2; ElementRef this_ref = Element_index(param_14, param_15); ElementRef param_16 = this_ref; tag_word = Element_tag(param_16).tag; - if (((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) + if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u)) { - uint bbox_offset = (_1038.Load(40) >> uint(2)) + (6u * (m.path_ix - 1u)); - float bbox_l = float(_201.Load(bbox_offset * 4 + 8)) - 32768.0f; - float bbox_t = float(_201.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f; - float bbox_r = float(_201.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f; - float bbox_b = float(_201.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f; + uint bbox_offset = (_968.Load(40) >> uint(2)) + (6u * m.path_ix); + float bbox_l = float(_187.Load(bbox_offset * 4 + 8)) - 32768.0f; + float bbox_t = float(_187.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f; + float bbox_r = float(_187.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f; + float bbox_b = float(_187.Load((bbox_offset + 3u) * 4 + 8)) - 32768.0f; float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); - float linewidth = asfloat(_201.Load((bbox_offset + 4u) * 4 + 8)); + float linewidth = asfloat(_187.Load((bbox_offset + 4u) * 4 + 8)); uint fill_mode = uint(linewidth >= 0.0f); if ((linewidth >= 0.0f) || (tag_word == 5u)) { - uint trans_ix = _201.Load((bbox_offset + 5u) * 4 + 8); - uint t = (_1038.Load(36) >> uint(2)) + (6u * trans_ix); - mat = asfloat(uint4(_201.Load(t * 4 + 8), _201.Load((t + 1u) * 4 + 8), _201.Load((t + 2u) * 4 + 8), _201.Load((t + 3u) * 4 + 8))); + uint trans_ix = _187.Load((bbox_offset + 5u) * 4 + 8); + uint t = (_968.Load(36) >> uint(2)) + (6u * trans_ix); + mat = asfloat(uint4(_187.Load(t * 4 + 8), _187.Load((t + 1u) * 4 + 8), _187.Load((t + 2u) * 4 + 8), _187.Load((t + 3u) * 4 + 8))); if (tag_word == 5u) { - translate = asfloat(uint2(_201.Load((t + 4u) * 4 + 8), _201.Load((t + 5u) * 4 + 8))); + translate = asfloat(uint2(_187.Load((t + 4u) * 4 + 8), _187.Load((t + 5u) * 4 + 8))); } } if (linewidth >= 0.0f) @@ -649,9 +623,9 @@ void comp_main() anno_fill.bbox = bbox; anno_fill.linewidth = linewidth; anno_fill.rgba_color = fill.rgba_color; - Alloc _1257; - _1257.offset = _1038.Load(32); - param_18.offset = _1257.offset; + Alloc _1203; + _1203.offset = _968.Load(32); + param_18.offset = _1203.offset; AnnotatedRef param_19 = out_ref; uint param_20 = fill_mode; AnnoColor param_21 = anno_fill; @@ -674,9 +648,9 @@ void comp_main() anno_lin.line_x = line_x; anno_lin.line_y = line_y; anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y)); - Alloc _1353; - _1353.offset = _1038.Load(32); - param_23.offset = _1353.offset; + Alloc _1299; + _1299.offset = _968.Load(32); + param_23.offset = _1299.offset; AnnotatedRef param_24 = out_ref; uint param_25 = fill_mode; AnnoLinGradient param_26 = anno_lin; @@ -691,48 +665,51 @@ void comp_main() anno_img.linewidth = linewidth; anno_img.index = fill_img.index; anno_img.offset = fill_img.offset; - Alloc _1381; - _1381.offset = _1038.Load(32); - param_28.offset = _1381.offset; + Alloc _1327; + _1327.offset = _968.Load(32); + param_28.offset = _1327.offset; AnnotatedRef param_29 = out_ref; uint param_30 = fill_mode; AnnoImage param_31 = anno_img; Annotated_Image_write(param_28, param_29, param_30, param_31); break; } + case 9u: + { + anno_begin_clip.bbox = bbox; + anno_begin_clip.linewidth = 0.0f; + Alloc _1344; + _1344.offset = _968.Load(32); + param_32.offset = _1344.offset; + AnnotatedRef param_33 = out_ref; + uint param_34 = 0u; + AnnoBeginClip param_35 = anno_begin_clip; + Annotated_BeginClip_write(param_32, param_33, param_34, param_35); + break; + } } } else { + if (tag_word == 10u) + { + anno_end_clip.bbox = float4(-1000000000.0f, -1000000000.0f, 1000000000.0f, 1000000000.0f); + Alloc _1368; + _1368.offset = _968.Load(32); + param_36.offset = _1368.offset; + AnnotatedRef param_37 = out_ref; + AnnoEndClip param_38 = anno_end_clip; + Annotated_EndClip_write(param_36, param_37, param_38); + } + } + if ((tag_word == 9u) || (tag_word == 10u)) + { + uint path_ix = ~(out_ix + i_2); if (tag_word == 9u) { - ElementRef param_32 = this_ref; - Clip begin_clip = Element_BeginClip_read(param_32); - anno_begin_clip.bbox = begin_clip.bbox; - anno_begin_clip.linewidth = 0.0f; - Alloc _1410; - _1410.offset = _1038.Load(32); - param_33.offset = _1410.offset; - AnnotatedRef param_34 = out_ref; - uint param_35 = 0u; - AnnoBeginClip param_36 = anno_begin_clip; - Annotated_BeginClip_write(param_33, param_34, param_35, param_36); - } - else - { - if (tag_word == 10u) - { - ElementRef param_37 = this_ref; - Clip end_clip = Element_EndClip_read(param_37); - anno_end_clip.bbox = end_clip.bbox; - Alloc _1435; - _1435.offset = _1038.Load(32); - param_38.offset = _1435.offset; - AnnotatedRef param_39 = out_ref; - AnnoEndClip param_40 = anno_end_clip; - Annotated_EndClip_write(param_38, param_39, param_40); - } + path_ix = m.path_ix; } + _187.Store((clip_out_base + m.clip_ix) * 4 + 8, path_ix); } out_ref.offset += 40u; } diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl index 06a4e23..8de5379 100644 --- a/piet-gpu/shader/gen/draw_leaf.msl +++ b/piet-gpu/shader/gen/draw_leaf.msl @@ -87,16 +87,6 @@ struct FillImage int2 offset; }; -struct ClipRef -{ - uint offset; -}; - -struct Clip -{ - float4 bbox; -}; - struct ElementTag { uint tag; @@ -217,8 +207,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -233,9 +228,9 @@ struct ConfigBuf constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) -ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_225) +ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_211) { - uint tag_and_flags = v_225.scene[ref.offset >> uint(2)]; + uint tag_and_flags = v_211.scene[ref.offset >> uint(2)]; return ElementTag{ tag_and_flags & 65535u, tag_and_flags >> uint(16) }; } @@ -284,20 +279,20 @@ DrawMonoid tag_monoid_identity() } static inline __attribute__((always_inline)) -FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_225) +FillColor FillColor_read(thread const FillColorRef& ref, const device SceneBuf& v_211) { uint ix = ref.offset >> uint(2); - uint raw0 = v_225.scene[ix + 0u]; + uint raw0 = v_211.scene[ix + 0u]; FillColor s; s.rgba_color = raw0; return s; } static inline __attribute__((always_inline)) -FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_225) +FillColor Element_FillColor_read(thread const ElementRef& ref, const device SceneBuf& v_211) { FillColorRef param = FillColorRef{ ref.offset + 4u }; - return FillColor_read(param, v_225); + return FillColor_read(param, v_211); } static inline __attribute__((always_inline)) @@ -307,7 +302,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset) } static inline __attribute__((always_inline)) -void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_201) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_187) { Alloc param = alloc; uint param_1 = offset; @@ -315,61 +310,61 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons { return; } - v_201.memory[offset] = val; + v_187.memory[offset] = val; } static inline __attribute__((always_inline)) -void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_201) +void AnnoColor_write(thread const Alloc& a, thread const AnnoColorRef& ref, thread const AnnoColor& s, device Memory& v_187) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = as_type(s.bbox.x); - write_mem(param, param_1, param_2, v_201); + write_mem(param, param_1, param_2, v_187); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.bbox.y); - write_mem(param_3, param_4, param_5, v_201); + write_mem(param_3, param_4, param_5, v_187); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.bbox.z); - write_mem(param_6, param_7, param_8, v_201); + write_mem(param_6, param_7, param_8, v_187); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.bbox.w); - write_mem(param_9, param_10, param_11, v_201); + write_mem(param_9, param_10, param_11, v_187); Alloc param_12 = a; uint param_13 = ix + 4u; uint param_14 = as_type(s.linewidth); - write_mem(param_12, param_13, param_14, v_201); + write_mem(param_12, param_13, param_14, v_187); Alloc param_15 = a; uint param_16 = ix + 5u; uint param_17 = s.rgba_color; - write_mem(param_15, param_16, param_17, v_201); + write_mem(param_15, param_16, param_17, v_187); } static inline __attribute__((always_inline)) -void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_201) +void Annotated_Color_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoColor& s, device Memory& v_187) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = (flags << uint(16)) | 1u; - write_mem(param, param_1, param_2, v_201); + write_mem(param, param_1, param_2, v_187); Alloc param_3 = a; AnnoColorRef param_4 = AnnoColorRef{ ref.offset + 4u }; AnnoColor param_5 = s; - AnnoColor_write(param_3, param_4, param_5, v_201); + AnnoColor_write(param_3, param_4, param_5, v_187); } static inline __attribute__((always_inline)) -FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_225) +FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const device SceneBuf& v_211) { uint ix = ref.offset >> uint(2); - uint raw0 = v_225.scene[ix + 0u]; - uint raw1 = v_225.scene[ix + 1u]; - uint raw2 = v_225.scene[ix + 2u]; - uint raw3 = v_225.scene[ix + 3u]; - uint raw4 = v_225.scene[ix + 4u]; + uint raw0 = v_211.scene[ix + 0u]; + uint raw1 = v_211.scene[ix + 1u]; + uint raw2 = v_211.scene[ix + 2u]; + uint raw3 = v_211.scene[ix + 3u]; + uint raw4 = v_211.scene[ix + 4u]; FillLinGradient s; s.index = raw0; s.p0 = float2(as_type(raw1), as_type(raw2)); @@ -378,73 +373,73 @@ FillLinGradient FillLinGradient_read(thread const FillLinGradientRef& ref, const } static inline __attribute__((always_inline)) -FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_225) +FillLinGradient Element_FillLinGradient_read(thread const ElementRef& ref, const device SceneBuf& v_211) { FillLinGradientRef param = FillLinGradientRef{ ref.offset + 4u }; - return FillLinGradient_read(param, v_225); + return FillLinGradient_read(param, v_211); } static inline __attribute__((always_inline)) -void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_201) +void AnnoLinGradient_write(thread const Alloc& a, thread const AnnoLinGradientRef& ref, thread const AnnoLinGradient& s, device Memory& v_187) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = as_type(s.bbox.x); - write_mem(param, param_1, param_2, v_201); + write_mem(param, param_1, param_2, v_187); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.bbox.y); - write_mem(param_3, param_4, param_5, v_201); + write_mem(param_3, param_4, param_5, v_187); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.bbox.z); - write_mem(param_6, param_7, param_8, v_201); + write_mem(param_6, param_7, param_8, v_187); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.bbox.w); - write_mem(param_9, param_10, param_11, v_201); + write_mem(param_9, param_10, param_11, v_187); Alloc param_12 = a; uint param_13 = ix + 4u; uint param_14 = as_type(s.linewidth); - write_mem(param_12, param_13, param_14, v_201); + write_mem(param_12, param_13, param_14, v_187); Alloc param_15 = a; uint param_16 = ix + 5u; uint param_17 = s.index; - write_mem(param_15, param_16, param_17, v_201); + write_mem(param_15, param_16, param_17, v_187); Alloc param_18 = a; uint param_19 = ix + 6u; uint param_20 = as_type(s.line_x); - write_mem(param_18, param_19, param_20, v_201); + write_mem(param_18, param_19, param_20, v_187); Alloc param_21 = a; uint param_22 = ix + 7u; uint param_23 = as_type(s.line_y); - write_mem(param_21, param_22, param_23, v_201); + write_mem(param_21, param_22, param_23, v_187); Alloc param_24 = a; uint param_25 = ix + 8u; uint param_26 = as_type(s.line_c); - write_mem(param_24, param_25, param_26, v_201); + write_mem(param_24, param_25, param_26, v_187); } static inline __attribute__((always_inline)) -void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_201) +void Annotated_LinGradient_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoLinGradient& s, device Memory& v_187) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = (flags << uint(16)) | 2u; - write_mem(param, param_1, param_2, v_201); + write_mem(param, param_1, param_2, v_187); Alloc param_3 = a; AnnoLinGradientRef param_4 = AnnoLinGradientRef{ ref.offset + 4u }; AnnoLinGradient param_5 = s; - AnnoLinGradient_write(param_3, param_4, param_5, v_201); + AnnoLinGradient_write(param_3, param_4, param_5, v_187); } static inline __attribute__((always_inline)) -FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_225) +FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& v_211) { uint ix = ref.offset >> uint(2); - uint raw0 = v_225.scene[ix + 0u]; - uint raw1 = v_225.scene[ix + 1u]; + uint raw0 = v_211.scene[ix + 0u]; + uint raw1 = v_211.scene[ix + 1u]; FillImage s; s.index = raw0; s.offset = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); @@ -452,167 +447,140 @@ FillImage FillImage_read(thread const FillImageRef& ref, const device SceneBuf& } static inline __attribute__((always_inline)) -FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_225) +FillImage Element_FillImage_read(thread const ElementRef& ref, const device SceneBuf& v_211) { FillImageRef param = FillImageRef{ ref.offset + 4u }; - return FillImage_read(param, v_225); + return FillImage_read(param, v_211); } static inline __attribute__((always_inline)) -void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_201) +void AnnoImage_write(thread const Alloc& a, thread const AnnoImageRef& ref, thread const AnnoImage& s, device Memory& v_187) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = as_type(s.bbox.x); - write_mem(param, param_1, param_2, v_201); + write_mem(param, param_1, param_2, v_187); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.bbox.y); - write_mem(param_3, param_4, param_5, v_201); + write_mem(param_3, param_4, param_5, v_187); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.bbox.z); - write_mem(param_6, param_7, param_8, v_201); + write_mem(param_6, param_7, param_8, v_187); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.bbox.w); - write_mem(param_9, param_10, param_11, v_201); + write_mem(param_9, param_10, param_11, v_187); Alloc param_12 = a; uint param_13 = ix + 4u; uint param_14 = as_type(s.linewidth); - write_mem(param_12, param_13, param_14, v_201); + write_mem(param_12, param_13, param_14, v_187); Alloc param_15 = a; uint param_16 = ix + 5u; uint param_17 = s.index; - write_mem(param_15, param_16, param_17, v_201); + write_mem(param_15, param_16, param_17, v_187); Alloc param_18 = a; uint param_19 = ix + 6u; uint param_20 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); - write_mem(param_18, param_19, param_20, v_201); + write_mem(param_18, param_19, param_20, v_187); } static inline __attribute__((always_inline)) -void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_201) +void Annotated_Image_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoImage& s, device Memory& v_187) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = (flags << uint(16)) | 3u; - write_mem(param, param_1, param_2, v_201); + write_mem(param, param_1, param_2, v_187); Alloc param_3 = a; AnnoImageRef param_4 = AnnoImageRef{ ref.offset + 4u }; AnnoImage param_5 = s; - AnnoImage_write(param_3, param_4, param_5, v_201); + AnnoImage_write(param_3, param_4, param_5, v_187); } static inline __attribute__((always_inline)) -Clip Clip_read(thread const ClipRef& ref, const device SceneBuf& v_225) -{ - uint ix = ref.offset >> uint(2); - uint raw0 = v_225.scene[ix + 0u]; - uint raw1 = v_225.scene[ix + 1u]; - uint raw2 = v_225.scene[ix + 2u]; - uint raw3 = v_225.scene[ix + 3u]; - Clip s; - s.bbox = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); - return s; -} - -static inline __attribute__((always_inline)) -Clip Element_BeginClip_read(thread const ElementRef& ref, const device SceneBuf& v_225) -{ - ClipRef param = ClipRef{ ref.offset + 4u }; - return Clip_read(param, v_225); -} - -static inline __attribute__((always_inline)) -void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_201) +void AnnoBeginClip_write(thread const Alloc& a, thread const AnnoBeginClipRef& ref, thread const AnnoBeginClip& s, device Memory& v_187) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = as_type(s.bbox.x); - write_mem(param, param_1, param_2, v_201); + write_mem(param, param_1, param_2, v_187); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.bbox.y); - write_mem(param_3, param_4, param_5, v_201); + write_mem(param_3, param_4, param_5, v_187); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.bbox.z); - write_mem(param_6, param_7, param_8, v_201); + write_mem(param_6, param_7, param_8, v_187); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.bbox.w); - write_mem(param_9, param_10, param_11, v_201); + write_mem(param_9, param_10, param_11, v_187); Alloc param_12 = a; uint param_13 = ix + 4u; uint param_14 = as_type(s.linewidth); - write_mem(param_12, param_13, param_14, v_201); + write_mem(param_12, param_13, param_14, v_187); } static inline __attribute__((always_inline)) -void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_201) +void Annotated_BeginClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const uint& flags, thread const AnnoBeginClip& s, device Memory& v_187) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = (flags << uint(16)) | 4u; - write_mem(param, param_1, param_2, v_201); + write_mem(param, param_1, param_2, v_187); Alloc param_3 = a; AnnoBeginClipRef param_4 = AnnoBeginClipRef{ ref.offset + 4u }; AnnoBeginClip param_5 = s; - AnnoBeginClip_write(param_3, param_4, param_5, v_201); + AnnoBeginClip_write(param_3, param_4, param_5, v_187); } static inline __attribute__((always_inline)) -Clip Element_EndClip_read(thread const ElementRef& ref, const device SceneBuf& v_225) -{ - ClipRef param = ClipRef{ ref.offset + 4u }; - return Clip_read(param, v_225); -} - -static inline __attribute__((always_inline)) -void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_201) +void AnnoEndClip_write(thread const Alloc& a, thread const AnnoEndClipRef& ref, thread const AnnoEndClip& s, device Memory& v_187) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = as_type(s.bbox.x); - write_mem(param, param_1, param_2, v_201); + write_mem(param, param_1, param_2, v_187); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.bbox.y); - write_mem(param_3, param_4, param_5, v_201); + write_mem(param_3, param_4, param_5, v_187); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.bbox.z); - write_mem(param_6, param_7, param_8, v_201); + write_mem(param_6, param_7, param_8, v_187); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.bbox.w); - write_mem(param_9, param_10, param_11, v_201); + write_mem(param_9, param_10, param_11, v_187); } static inline __attribute__((always_inline)) -void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const AnnoEndClip& s, device Memory& v_201) +void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& ref, thread const AnnoEndClip& s, device Memory& v_187) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 5u; - write_mem(param, param_1, param_2, v_201); + write_mem(param, param_1, param_2, v_187); Alloc param_3 = a; AnnoEndClipRef param_4 = AnnoEndClipRef{ ref.offset + 4u }; AnnoEndClip param_5 = s; - AnnoEndClip_write(param_3, param_4, param_5, v_201); + AnnoEndClip_write(param_3, param_4, param_5, v_187); } -kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1038 [[buffer(1)]], const device SceneBuf& v_225 [[buffer(2)]], const device ParentBuf& _1004 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(device Memory& v_187 [[buffer(0)]], const device ConfigBuf& _968 [[buffer(1)]], const device SceneBuf& v_211 [[buffer(2)]], const device ParentBuf& _934 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { threadgroup DrawMonoid sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; ElementRef ref = ElementRef{ ix * 36u }; ElementRef param = ref; - uint tag_word = Element_tag(param, v_225).tag; + uint tag_word = Element_tag(param, v_211).tag; uint param_1 = tag_word; DrawMonoid agg = map_tag(param_1); spvUnsafeArray local; @@ -622,7 +590,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 ElementRef param_2 = ref; uint param_3 = i; ElementRef param_4 = Element_index(param_2, param_3); - tag_word = Element_tag(param_4, v_225).tag; + tag_word = Element_tag(param_4, v_211).tag; uint param_5 = tag_word; DrawMonoid param_6 = agg; DrawMonoid param_7 = map_tag(param_5); @@ -647,9 +615,9 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 DrawMonoid row = tag_monoid_identity(); if (gl_WorkGroupID.x > 0u) { - uint _1007 = gl_WorkGroupID.x - 1u; - row.path_ix = _1004.parent[_1007].path_ix; - row.clip_ix = _1004.parent[_1007].clip_ix; + uint _937 = gl_WorkGroupID.x - 1u; + row.path_ix = _934.parent[_937].path_ix; + row.clip_ix = _934.parent[_937].clip_ix; } if (gl_LocalInvocationID.x > 0u) { @@ -658,8 +626,9 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 row = combine_tag_monoid(param_10, param_11); } uint out_ix = gl_GlobalInvocationID.x * 8u; - uint out_base = (_1038.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u); - AnnotatedRef out_ref = AnnotatedRef{ _1038.conf.anno_alloc.offset + (out_ix * 40u) }; + uint out_base = (_968.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u); + uint clip_out_base = _968.conf.clip_alloc.offset >> uint(2); + AnnotatedRef out_ref = AnnotatedRef{ _968.conf.anno_alloc.offset + (out_ix * 40u) }; float4 mat; float2 translate; AnnoColor anno_fill; @@ -669,39 +638,43 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 AnnoImage anno_img; Alloc param_28; AnnoBeginClip anno_begin_clip; - Alloc param_33; + Alloc param_32; AnnoEndClip anno_end_clip; - Alloc param_38; + Alloc param_36; for (uint i_2 = 0u; i_2 < 8u; i_2++) { - DrawMonoid param_12 = row; - DrawMonoid param_13 = local[i_2]; - DrawMonoid m = combine_tag_monoid(param_12, param_13); - v_201.memory[out_base + (i_2 * 2u)] = m.path_ix; - v_201.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix; + DrawMonoid m = row; + if (i_2 > 0u) + { + DrawMonoid param_12 = m; + DrawMonoid param_13 = local[i_2 - 1u]; + m = combine_tag_monoid(param_12, param_13); + } + v_187.memory[out_base + (i_2 * 2u)] = m.path_ix; + v_187.memory[(out_base + (i_2 * 2u)) + 1u] = m.clip_ix; ElementRef param_14 = ref; uint param_15 = i_2; ElementRef this_ref = Element_index(param_14, param_15); ElementRef param_16 = this_ref; - tag_word = Element_tag(param_16, v_225).tag; - if (((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) + tag_word = Element_tag(param_16, v_211).tag; + if ((((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) || (tag_word == 9u)) { - uint bbox_offset = (_1038.conf.bbox_alloc.offset >> uint(2)) + (6u * (m.path_ix - 1u)); - float bbox_l = float(v_201.memory[bbox_offset]) - 32768.0; - float bbox_t = float(v_201.memory[bbox_offset + 1u]) - 32768.0; - float bbox_r = float(v_201.memory[bbox_offset + 2u]) - 32768.0; - float bbox_b = float(v_201.memory[bbox_offset + 3u]) - 32768.0; + uint bbox_offset = (_968.conf.bbox_alloc.offset >> uint(2)) + (6u * m.path_ix); + float bbox_l = float(v_187.memory[bbox_offset]) - 32768.0; + float bbox_t = float(v_187.memory[bbox_offset + 1u]) - 32768.0; + float bbox_r = float(v_187.memory[bbox_offset + 2u]) - 32768.0; + float bbox_b = float(v_187.memory[bbox_offset + 3u]) - 32768.0; float4 bbox = float4(bbox_l, bbox_t, bbox_r, bbox_b); - float linewidth = as_type(v_201.memory[bbox_offset + 4u]); + float linewidth = as_type(v_187.memory[bbox_offset + 4u]); uint fill_mode = uint(linewidth >= 0.0); if ((linewidth >= 0.0) || (tag_word == 5u)) { - uint trans_ix = v_201.memory[bbox_offset + 5u]; - uint t = (_1038.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix); - mat = as_type(uint4(v_201.memory[t], v_201.memory[t + 1u], v_201.memory[t + 2u], v_201.memory[t + 3u])); + uint trans_ix = v_187.memory[bbox_offset + 5u]; + uint t = (_968.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix); + mat = as_type(uint4(v_187.memory[t], v_187.memory[t + 1u], v_187.memory[t + 2u], v_187.memory[t + 3u])); if (tag_word == 5u) { - translate = as_type(uint2(v_201.memory[t + 4u], v_201.memory[t + 5u])); + translate = as_type(uint2(v_187.memory[t + 4u], v_187.memory[t + 5u])); } } if (linewidth >= 0.0) @@ -714,21 +687,21 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 case 4u: { ElementRef param_17 = this_ref; - FillColor fill = Element_FillColor_read(param_17, v_225); + FillColor fill = Element_FillColor_read(param_17, v_211); anno_fill.bbox = bbox; anno_fill.linewidth = linewidth; anno_fill.rgba_color = fill.rgba_color; - param_18.offset = _1038.conf.anno_alloc.offset; + param_18.offset = _968.conf.anno_alloc.offset; AnnotatedRef param_19 = out_ref; uint param_20 = fill_mode; AnnoColor param_21 = anno_fill; - Annotated_Color_write(param_18, param_19, param_20, param_21, v_201); + Annotated_Color_write(param_18, param_19, param_20, param_21, v_187); break; } case 5u: { ElementRef param_22 = this_ref; - FillLinGradient lin = Element_FillLinGradient_read(param_22, v_225); + FillLinGradient lin = Element_FillLinGradient_read(param_22, v_211); anno_lin.bbox = bbox; anno_lin.linewidth = linewidth; anno_lin.index = lin.index; @@ -741,57 +714,60 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 anno_lin.line_x = line_x; anno_lin.line_y = line_y; anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y)); - param_23.offset = _1038.conf.anno_alloc.offset; + param_23.offset = _968.conf.anno_alloc.offset; AnnotatedRef param_24 = out_ref; uint param_25 = fill_mode; AnnoLinGradient param_26 = anno_lin; - Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_201); + Annotated_LinGradient_write(param_23, param_24, param_25, param_26, v_187); break; } case 6u: { ElementRef param_27 = this_ref; - FillImage fill_img = Element_FillImage_read(param_27, v_225); + FillImage fill_img = Element_FillImage_read(param_27, v_211); anno_img.bbox = bbox; anno_img.linewidth = linewidth; anno_img.index = fill_img.index; anno_img.offset = fill_img.offset; - param_28.offset = _1038.conf.anno_alloc.offset; + param_28.offset = _968.conf.anno_alloc.offset; AnnotatedRef param_29 = out_ref; uint param_30 = fill_mode; AnnoImage param_31 = anno_img; - Annotated_Image_write(param_28, param_29, param_30, param_31, v_201); + Annotated_Image_write(param_28, param_29, param_30, param_31, v_187); + break; + } + case 9u: + { + anno_begin_clip.bbox = bbox; + anno_begin_clip.linewidth = 0.0; + param_32.offset = _968.conf.anno_alloc.offset; + AnnotatedRef param_33 = out_ref; + uint param_34 = 0u; + AnnoBeginClip param_35 = anno_begin_clip; + Annotated_BeginClip_write(param_32, param_33, param_34, param_35, v_187); break; } } } else { + if (tag_word == 10u) + { + anno_end_clip.bbox = float4(-1000000000.0, -1000000000.0, 1000000000.0, 1000000000.0); + param_36.offset = _968.conf.anno_alloc.offset; + AnnotatedRef param_37 = out_ref; + AnnoEndClip param_38 = anno_end_clip; + Annotated_EndClip_write(param_36, param_37, param_38, v_187); + } + } + if ((tag_word == 9u) || (tag_word == 10u)) + { + uint path_ix = ~(out_ix + i_2); if (tag_word == 9u) { - ElementRef param_32 = this_ref; - Clip begin_clip = Element_BeginClip_read(param_32, v_225); - anno_begin_clip.bbox = begin_clip.bbox; - anno_begin_clip.linewidth = 0.0; - param_33.offset = _1038.conf.anno_alloc.offset; - AnnotatedRef param_34 = out_ref; - uint param_35 = 0u; - AnnoBeginClip param_36 = anno_begin_clip; - Annotated_BeginClip_write(param_33, param_34, param_35, param_36, v_201); - } - else - { - if (tag_word == 10u) - { - ElementRef param_37 = this_ref; - Clip end_clip = Element_EndClip_read(param_37, v_225); - anno_end_clip.bbox = end_clip.bbox; - param_38.offset = _1038.conf.anno_alloc.offset; - AnnotatedRef param_39 = out_ref; - AnnoEndClip param_40 = anno_end_clip; - Annotated_EndClip_write(param_38, param_39, param_40, v_201); - } + path_ix = m.path_ix; } + v_187.memory[clip_out_base + m.clip_ix] = path_ix; } out_ref.offset += 40u; } diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv index 7e92cbb..d5e9136 100644 Binary files a/piet-gpu/shader/gen/draw_leaf.spv and b/piet-gpu/shader/gen/draw_leaf.spv differ diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl index a6ccde9..0553c9b 100644 --- a/piet-gpu/shader/gen/draw_reduce.hlsl +++ b/piet-gpu/shader/gen/draw_reduce.hlsl @@ -36,8 +36,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl index 8a87f15..064c515 100644 --- a/piet-gpu/shader/gen/draw_reduce.msl +++ b/piet-gpu/shader/gen/draw_reduce.msl @@ -66,8 +66,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv index aabdf6b..a45627d 100644 Binary files a/piet-gpu/shader/gen/draw_reduce.spv and b/piet-gpu/shader/gen/draw_reduce.spv differ diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil index 3b3c42e..0a14cfa 100644 Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl index 8b0699a..9457d14 100644 --- a/piet-gpu/shader/gen/kernel4.hlsl +++ b/piet-gpu/shader/gen/kernel4.hlsl @@ -117,8 +117,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -457,7 +462,6 @@ void comp_main() TileSegRef tile_seg_ref; float area[8]; uint blend_stack[128][8]; - float blend_alpha_stack[128][8]; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -640,7 +644,6 @@ void comp_main() float4 param_34 = float4(rgba[k_11]); uint _1390 = packsRGB(param_34); blend_stack[d_2][k_11] = _1390; - blend_alpha_stack[d_2][k_11] = clamp(abs(area[k_11]), 0.0f, 1.0f); rgba[k_11] = 0.0f.xxxx; } clip_depth++; @@ -655,7 +658,7 @@ void comp_main() uint d_3 = min(clip_depth, 127u); uint param_35 = blend_stack[d_3][k_12]; float4 bg = unpacksRGB(param_35); - float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12]; + float4 fg_1 = rgba[k_12] * area[k_12]; rgba[k_12] = (bg * (1.0f - fg_1.w)) + fg_1; } cmd_ref.offset += 4u; @@ -665,8 +668,8 @@ void comp_main() { Alloc param_36 = cmd_alloc; CmdRef param_37 = cmd_ref; - CmdRef _1469 = { Cmd_Jump_read(param_36, param_37).new_ref }; - cmd_ref = _1469; + CmdRef _1453 = { Cmd_Jump_read(param_36, param_37).new_ref }; + cmd_ref = _1453; cmd_alloc.offset = cmd_ref.offset; break; } diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl index 9318cc8..3dc7517 100644 --- a/piet-gpu/shader/gen/kernel4.msl +++ b/piet-gpu/shader/gen/kernel4.msl @@ -175,8 +175,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -507,7 +512,6 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7 TileSegRef tile_seg_ref; spvUnsafeArray area; spvUnsafeArray, 128> blend_stack; - spvUnsafeArray, 128> blend_alpha_stack; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -687,7 +691,6 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7 float4 param_34 = float4(rgba[k_11]); uint _1390 = packsRGB(param_34); blend_stack[d_2][k_11] = _1390; - blend_alpha_stack[d_2][k_11] = fast::clamp(abs(area[k_11]), 0.0, 1.0); rgba[k_11] = float4(0.0); } clip_depth++; @@ -702,7 +705,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7 uint d_3 = min(clip_depth, 127u); uint param_35 = blend_stack[d_3][k_12]; float4 bg = unpacksRGB(param_35); - float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12]; + float4 fg_1 = rgba[k_12] * area[k_12]; rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1; } cmd_ref.offset += 4u; diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv index 0eb1e5a..31f11c9 100644 Binary files a/piet-gpu/shader/gen/kernel4.spv and b/piet-gpu/shader/gen/kernel4.spv differ diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil index 6ec6d57..f3bd028 100644 Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl index 7426758..5ff97fb 100644 --- a/piet-gpu/shader/gen/kernel4_gray.hlsl +++ b/piet-gpu/shader/gen/kernel4_gray.hlsl @@ -117,8 +117,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -457,7 +462,6 @@ void comp_main() TileSegRef tile_seg_ref; float area[8]; uint blend_stack[128][8]; - float blend_alpha_stack[128][8]; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -640,7 +644,6 @@ void comp_main() float4 param_34 = float4(rgba[k_11]); uint _1390 = packsRGB(param_34); blend_stack[d_2][k_11] = _1390; - blend_alpha_stack[d_2][k_11] = clamp(abs(area[k_11]), 0.0f, 1.0f); rgba[k_11] = 0.0f.xxxx; } clip_depth++; @@ -655,7 +658,7 @@ void comp_main() uint d_3 = min(clip_depth, 127u); uint param_35 = blend_stack[d_3][k_12]; float4 bg = unpacksRGB(param_35); - float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12]; + float4 fg_1 = rgba[k_12] * area[k_12]; rgba[k_12] = (bg * (1.0f - fg_1.w)) + fg_1; } cmd_ref.offset += 4u; @@ -665,8 +668,8 @@ void comp_main() { Alloc param_36 = cmd_alloc; CmdRef param_37 = cmd_ref; - CmdRef _1469 = { Cmd_Jump_read(param_36, param_37).new_ref }; - cmd_ref = _1469; + CmdRef _1453 = { Cmd_Jump_read(param_36, param_37).new_ref }; + cmd_ref = _1453; cmd_alloc.offset = cmd_ref.offset; break; } diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl index e672020..15351a0 100644 --- a/piet-gpu/shader/gen/kernel4_gray.msl +++ b/piet-gpu/shader/gen/kernel4_gray.msl @@ -175,8 +175,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -507,7 +512,6 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7 TileSegRef tile_seg_ref; spvUnsafeArray area; spvUnsafeArray, 128> blend_stack; - spvUnsafeArray, 128> blend_alpha_stack; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -687,7 +691,6 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7 float4 param_34 = float4(rgba[k_11]); uint _1390 = packsRGB(param_34); blend_stack[d_2][k_11] = _1390; - blend_alpha_stack[d_2][k_11] = fast::clamp(abs(area[k_11]), 0.0, 1.0); rgba[k_11] = float4(0.0); } clip_depth++; @@ -702,7 +705,7 @@ kernel void main0(device Memory& v_202 [[buffer(0)]], const device ConfigBuf& _7 uint d_3 = min(clip_depth, 127u); uint param_35 = blend_stack[d_3][k_12]; float4 bg = unpacksRGB(param_35); - float4 fg_1 = (rgba[k_12] * area[k_12]) * blend_alpha_stack[d_3][k_12]; + float4 fg_1 = rgba[k_12] * area[k_12]; rgba[k_12] = (bg * (1.0 - fg_1.w)) + fg_1; } cmd_ref.offset += 4u; diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv index 61e5b1c..42964c8 100644 Binary files a/piet-gpu/shader/gen/kernel4_gray.spv and b/piet-gpu/shader/gen/kernel4_gray.spv differ diff --git a/piet-gpu/shader/gen/path_coarse.hlsl b/piet-gpu/shader/gen/path_coarse.hlsl index 6025bde..59cd7a6 100644 --- a/piet-gpu/shader/gen/path_coarse.hlsl +++ b/piet-gpu/shader/gen/path_coarse.hlsl @@ -86,8 +86,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/path_coarse.msl b/piet-gpu/shader/gen/path_coarse.msl index d263f31..f3cead8 100644 --- a/piet-gpu/shader/gen/path_coarse.msl +++ b/piet-gpu/shader/gen/path_coarse.msl @@ -146,8 +146,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/path_coarse.spv b/piet-gpu/shader/gen/path_coarse.spv index 0da044f..7c452bf 100644 Binary files a/piet-gpu/shader/gen/path_coarse.spv and b/piet-gpu/shader/gen/path_coarse.spv differ diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil index 3c81315..3895131 100644 Binary files a/piet-gpu/shader/gen/pathseg.dxil and b/piet-gpu/shader/gen/pathseg.dxil differ diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl index f7c9e2d..0501f6f 100644 --- a/piet-gpu/shader/gen/pathseg.hlsl +++ b/piet-gpu/shader/gen/pathseg.hlsl @@ -64,8 +64,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -80,7 +85,7 @@ static const Monoid _567 = { 0.0f.xxxx, 0u }; RWByteAddressBuffer _111 : register(u0, space0); ByteAddressBuffer _574 : register(t2, space0); ByteAddressBuffer _639 : register(t1, space0); -ByteAddressBuffer _709 : register(t3, space0); +ByteAddressBuffer _710 : register(t3, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -356,7 +361,7 @@ uint round_up(float x) void comp_main() { uint ix = gl_GlobalInvocationID.x * 4u; - uint tag_word = _574.Load(((_639.Load(64) >> uint(2)) + (ix >> uint(2))) * 4 + 0); + uint tag_word = _574.Load(((_639.Load(84) >> uint(2)) + (ix >> uint(2))) * 4 + 0); uint param = tag_word; TagMonoid local_tm = reduce_tag(param); sh_tag[gl_LocalInvocationID.x] = local_tm; @@ -377,17 +382,17 @@ void comp_main() TagMonoid tm = tag_monoid_identity(); if (gl_WorkGroupID.x > 0u) { - TagMonoid _715; - _715.trans_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 0); - _715.linewidth_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 4); - _715.pathseg_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 8); - _715.path_ix = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 12); - _715.pathseg_offset = _709.Load((gl_WorkGroupID.x - 1u) * 20 + 16); - tm.trans_ix = _715.trans_ix; - tm.linewidth_ix = _715.linewidth_ix; - tm.pathseg_ix = _715.pathseg_ix; - tm.path_ix = _715.path_ix; - tm.pathseg_offset = _715.pathseg_offset; + TagMonoid _716; + _716.trans_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 0); + _716.linewidth_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 4); + _716.pathseg_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 8); + _716.path_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 12); + _716.pathseg_offset = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 16); + tm.trans_ix = _716.trans_ix; + tm.linewidth_ix = _716.linewidth_ix; + tm.pathseg_ix = _716.pathseg_ix; + tm.path_ix = _716.path_ix; + tm.pathseg_offset = _716.pathseg_offset; } if (gl_LocalInvocationID.x > 0u) { @@ -395,14 +400,14 @@ void comp_main() TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; tm = combine_tag_monoid(param_3, param_4); } - uint ps_ix = (_639.Load(68) >> uint(2)) + tm.pathseg_offset; - uint lw_ix = (_639.Load(60) >> uint(2)) + tm.linewidth_ix; + uint ps_ix = (_639.Load(88) >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.Load(80) >> uint(2)) + tm.linewidth_ix; uint save_path_ix = tm.path_ix; uint trans_ix = tm.trans_ix; - TransformSegRef _770 = { _639.Load(36) + (trans_ix * 24u) }; - TransformSegRef trans_ref = _770; - PathSegRef _780 = { _639.Load(28) + (tm.pathseg_ix * 52u) }; - PathSegRef ps_ref = _780; + TransformSegRef _771 = { _639.Load(36) + (trans_ix * 24u) }; + TransformSegRef trans_ref = _771; + PathSegRef _781 = { _639.Load(28) + (tm.pathseg_ix * 52u) }; + PathSegRef ps_ref = _781; float linewidth[4]; uint save_trans_ix[4]; float2 p0; @@ -455,9 +460,9 @@ void comp_main() } } } - Alloc _876; - _876.offset = _639.Load(36); - param_13.offset = _876.offset; + Alloc _877; + _877.offset = _639.Load(36); + param_13.offset = _877.offset; TransformSegRef param_14 = trans_ref; TransformSeg transform = TransformSeg_read(param_13, param_14); p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; @@ -466,25 +471,25 @@ void comp_main() if (seg_type >= 2u) { p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; - float4 _946 = bbox; - float2 _949 = min(_946.xy, p2); - bbox.x = _949.x; - bbox.y = _949.y; - float4 _954 = bbox; - float2 _957 = max(_954.zw, p2); - bbox.z = _957.x; - bbox.w = _957.y; + float4 _947 = bbox; + float2 _950 = min(_947.xy, p2); + bbox.x = _950.x; + bbox.y = _950.y; + float4 _955 = bbox; + float2 _958 = max(_955.zw, p2); + bbox.z = _958.x; + bbox.w = _958.y; if (seg_type == 3u) { p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; - float4 _982 = bbox; - float2 _985 = min(_982.xy, p3); - bbox.x = _985.x; - bbox.y = _985.y; - float4 _990 = bbox; - float2 _993 = max(_990.zw, p3); - bbox.z = _993.x; - bbox.w = _993.y; + float4 _983 = bbox; + float2 _986 = min(_983.xy, p3); + bbox.x = _986.x; + bbox.y = _986.y; + float4 _991 = bbox; + float2 _994 = max(_991.zw, p3); + bbox.z = _994.x; + bbox.w = _994.y; } else { @@ -515,9 +520,9 @@ void comp_main() cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; cubic.stroke = stroke; uint fill_mode = uint(linewidth[i_1] >= 0.0f); - Alloc _1088; - _1088.offset = _639.Load(28); - param_15.offset = _1088.offset; + Alloc _1089; + _1089.offset = _639.Load(28); + param_15.offset = _1089.offset; PathSegRef param_16 = ps_ref; uint param_17 = fill_mode; PathCubic param_18 = cubic; @@ -574,17 +579,17 @@ void comp_main() Monoid param_24 = local[i_4]; Monoid m = combine_monoid(param_23, param_24); bool do_atomic = false; - bool _1263 = i_4 == 3u; - bool _1269; - if (_1263) + bool _1264 = i_4 == 3u; + bool _1270; + if (_1264) { - _1269 = gl_LocalInvocationID.x == 255u; + _1270 = gl_LocalInvocationID.x == 255u; } else { - _1269 = _1263; + _1270 = _1264; } - if (_1269) + if (_1270) { do_atomic = true; } @@ -612,30 +617,30 @@ void comp_main() } if (do_atomic) { - bool _1334 = m.bbox.z > m.bbox.x; - bool _1343; - if (!_1334) + bool _1335 = m.bbox.z > m.bbox.x; + bool _1344; + if (!_1335) { - _1343 = m.bbox.w > m.bbox.y; + _1344 = m.bbox.w > m.bbox.y; } else { - _1343 = _1334; + _1344 = _1335; } - if (_1343) + if (_1344) { float param_29 = m.bbox.x; - uint _1352; - _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1352); + uint _1353; + _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1353); float param_30 = m.bbox.y; - uint _1360; - _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1360); + uint _1361; + _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1361); float param_31 = m.bbox.z; - uint _1368; - _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1368); + uint _1369; + _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1369); float param_32 = m.bbox.w; - uint _1376; - _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1376); + uint _1377; + _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1377); } bbox_out_ix += 6u; } diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl index 9708585..0e97d68 100644 --- a/piet-gpu/shader/gen/pathseg.msl +++ b/piet-gpu/shader/gen/pathseg.msl @@ -129,8 +129,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -430,7 +435,7 @@ uint round_up(thread const float& x) return uint(fast::min(65535.0, ceil(x) + 32768.0)); } -kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _709 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _710 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { threadgroup TagMonoid sh_tag[256]; threadgroup Monoid sh_scratch[256]; @@ -456,12 +461,12 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 TagMonoid tm = tag_monoid_identity(); if (gl_WorkGroupID.x > 0u) { - uint _712 = gl_WorkGroupID.x - 1u; - tm.trans_ix = _709.parent[_712].trans_ix; - tm.linewidth_ix = _709.parent[_712].linewidth_ix; - tm.pathseg_ix = _709.parent[_712].pathseg_ix; - tm.path_ix = _709.parent[_712].path_ix; - tm.pathseg_offset = _709.parent[_712].pathseg_offset; + uint _713 = gl_WorkGroupID.x - 1u; + tm.trans_ix = _710.parent[_713].trans_ix; + tm.linewidth_ix = _710.parent[_713].linewidth_ix; + tm.pathseg_ix = _710.parent[_713].pathseg_ix; + tm.path_ix = _710.parent[_713].path_ix; + tm.pathseg_offset = _710.parent[_713].pathseg_offset; } if (gl_LocalInvocationID.x > 0u) { @@ -536,25 +541,25 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 if (seg_type >= 2u) { p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; - float4 _946 = bbox; - float2 _949 = fast::min(_946.xy, p2); - bbox.x = _949.x; - bbox.y = _949.y; - float4 _954 = bbox; - float2 _957 = fast::max(_954.zw, p2); - bbox.z = _957.x; - bbox.w = _957.y; + float4 _947 = bbox; + float2 _950 = fast::min(_947.xy, p2); + bbox.x = _950.x; + bbox.y = _950.y; + float4 _955 = bbox; + float2 _958 = fast::max(_955.zw, p2); + bbox.z = _958.x; + bbox.w = _958.y; if (seg_type == 3u) { p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; - float4 _982 = bbox; - float2 _985 = fast::min(_982.xy, p3); - bbox.x = _985.x; - bbox.y = _985.y; - float4 _990 = bbox; - float2 _993 = fast::max(_990.zw, p3); - bbox.z = _993.x; - bbox.w = _993.y; + float4 _983 = bbox; + float2 _986 = fast::min(_983.xy, p3); + bbox.x = _986.x; + bbox.y = _986.y; + float4 _991 = bbox; + float2 _994 = fast::max(_991.zw, p3); + bbox.z = _994.x; + bbox.w = _994.y; } else { @@ -642,17 +647,17 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 Monoid param_24 = local[i_4]; Monoid m = combine_monoid(param_23, param_24); bool do_atomic = false; - bool _1263 = i_4 == 3u; - bool _1269; - if (_1263) + bool _1264 = i_4 == 3u; + bool _1270; + if (_1264) { - _1269 = gl_LocalInvocationID.x == 255u; + _1270 = gl_LocalInvocationID.x == 255u; } else { - _1269 = _1263; + _1270 = _1264; } - if (_1269) + if (_1270) { do_atomic = true; } @@ -680,26 +685,26 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 } if (do_atomic) { - bool _1334 = m.bbox.z > m.bbox.x; - bool _1343; - if (!_1334) + bool _1335 = m.bbox.z > m.bbox.x; + bool _1344; + if (!_1335) { - _1343 = m.bbox.w > m.bbox.y; + _1344 = m.bbox.w > m.bbox.y; } else { - _1343 = _1334; + _1344 = _1335; } - if (_1343) + if (_1344) { float param_29 = m.bbox.x; - uint _1352 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); + uint _1353 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); float param_30 = m.bbox.y; - uint _1360 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); + uint _1361 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); float param_31 = m.bbox.z; - uint _1368 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); + uint _1369 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); float param_32 = m.bbox.w; - uint _1376 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); + uint _1377 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); } bbox_out_ix += 6u; } diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv index 37c9847..a1f223c 100644 Binary files a/piet-gpu/shader/gen/pathseg.spv and b/piet-gpu/shader/gen/pathseg.spv differ diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil index 245c492..89fb562 100644 Binary files a/piet-gpu/shader/gen/pathtag_reduce.dxil and b/piet-gpu/shader/gen/pathtag_reduce.dxil differ diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl index 1332429..754e6e9 100644 --- a/piet-gpu/shader/gen/pathtag_reduce.hlsl +++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl @@ -26,8 +26,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -37,9 +42,9 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u); ByteAddressBuffer _139 : register(t1, space0); -ByteAddressBuffer _150 : register(t2, space0); -RWByteAddressBuffer _237 : register(u3, space0); -RWByteAddressBuffer _257 : register(u0, space0); +ByteAddressBuffer _151 : register(t2, space0); +RWByteAddressBuffer _238 : register(u3, space0); +RWByteAddressBuffer _258 : register(u0, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -83,13 +88,13 @@ TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) void comp_main() { uint ix = gl_GlobalInvocationID.x * 2u; - uint scene_ix = (_139.Load(64) >> uint(2)) + ix; - uint tag_word = _150.Load(scene_ix * 4 + 0); + uint scene_ix = (_139.Load(84) >> uint(2)) + ix; + uint tag_word = _151.Load(scene_ix * 4 + 0); uint param = tag_word; TagMonoid agg = reduce_tag(param); for (uint i = 1u; i < 2u; i++) { - tag_word = _150.Load((scene_ix + i) * 4 + 0); + tag_word = _151.Load((scene_ix + i) * 4 + 0); uint param_1 = tag_word; TagMonoid param_2 = agg; TagMonoid param_3 = reduce_tag(param_1); @@ -111,11 +116,11 @@ void comp_main() } if (gl_LocalInvocationID.x == 0u) { - _237.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); - _237.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); - _237.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); - _237.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); - _237.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); + _238.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); + _238.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); + _238.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); + _238.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); + _238.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); } } diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl index 6c0a64f..83a8208 100644 --- a/piet-gpu/shader/gen/pathtag_reduce.msl +++ b/piet-gpu/shader/gen/pathtag_reduce.msl @@ -33,8 +33,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -103,17 +108,17 @@ TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& return c; } -kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _150 [[buffer(2)]], device OutBuf& _237 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _151 [[buffer(2)]], device OutBuf& _238 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { threadgroup TagMonoid sh_scratch[128]; uint ix = gl_GlobalInvocationID.x * 2u; uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix; - uint tag_word = _150.scene[scene_ix]; + uint tag_word = _151.scene[scene_ix]; uint param = tag_word; TagMonoid agg = reduce_tag(param); for (uint i = 1u; i < 2u; i++) { - tag_word = _150.scene[scene_ix + i]; + tag_word = _151.scene[scene_ix + i]; uint param_1 = tag_word; TagMonoid param_2 = agg; TagMonoid param_3 = reduce_tag(param_1); @@ -135,11 +140,11 @@ kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device Scene } if (gl_LocalInvocationID.x == 0u) { - _237.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; - _237.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; - _237.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; - _237.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; - _237.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; + _238.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; + _238.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; + _238.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; + _238.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _238.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; } } diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv index 9fc105f..feaad0a 100644 Binary files a/piet-gpu/shader/gen/pathtag_reduce.spv and b/piet-gpu/shader/gen/pathtag_reduce.spv differ diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl index 5231c1d..97e1c23 100644 --- a/piet-gpu/shader/gen/tile_alloc.hlsl +++ b/piet-gpu/shader/gen/tile_alloc.hlsl @@ -60,8 +60,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl index 49bd1c4..bb10cf0 100644 --- a/piet-gpu/shader/gen/tile_alloc.msl +++ b/piet-gpu/shader/gen/tile_alloc.msl @@ -81,8 +81,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv index 55d62ad..12277f1 100644 Binary files a/piet-gpu/shader/gen/tile_alloc.spv and b/piet-gpu/shader/gen/tile_alloc.spv differ diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil index 32ec399..8a3200b 100644 Binary files a/piet-gpu/shader/gen/transform_leaf.dxil and b/piet-gpu/shader/gen/transform_leaf.dxil differ diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl index 38136c9..3528d6c 100644 --- a/piet-gpu/shader/gen/transform_leaf.hlsl +++ b/piet-gpu/shader/gen/transform_leaf.hlsl @@ -39,8 +39,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -150,7 +155,7 @@ void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) void comp_main() { uint ix = gl_GlobalInvocationID.x * 8u; - TransformRef _285 = { _278.Load(56) + (ix * 24u) }; + TransformRef _285 = { _278.Load(76) + (ix * 24u) }; TransformRef ref = _285; TransformRef param = ref; Transform agg = Transform_read(param); diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl index 6a55784..6a99fae 100644 --- a/piet-gpu/shader/gen/transform_leaf.msl +++ b/piet-gpu/shader/gen/transform_leaf.msl @@ -102,8 +102,13 @@ struct Config Alloc_1 trans_alloc; Alloc_1 bbox_alloc; Alloc_1 drawmonoid_alloc; + Alloc_1 clip_alloc; + Alloc_1 clip_bic_alloc; + Alloc_1 clip_stack_alloc; + Alloc_1 clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv index f418bbe..b9a0a83 100644 Binary files a/piet-gpu/shader/gen/transform_leaf.spv and b/piet-gpu/shader/gen/transform_leaf.spv differ diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil index 63df381..6c94a3a 100644 Binary files a/piet-gpu/shader/gen/transform_reduce.dxil and b/piet-gpu/shader/gen/transform_reduce.dxil differ diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl index af52cdb..cce1a22 100644 --- a/piet-gpu/shader/gen/transform_reduce.hlsl +++ b/piet-gpu/shader/gen/transform_reduce.hlsl @@ -28,8 +28,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; @@ -87,7 +92,7 @@ Transform combine_monoid(Transform a, Transform b) void comp_main() { uint ix = gl_GlobalInvocationID.x * 8u; - TransformRef _168 = { _161.Load(56) + (ix * 24u) }; + TransformRef _168 = { _161.Load(76) + (ix * 24u) }; TransformRef ref = _168; TransformRef param = ref; Transform agg = Transform_read(param); diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl index c387f03..3695563 100644 --- a/piet-gpu/shader/gen/transform_reduce.msl +++ b/piet-gpu/shader/gen/transform_reduce.msl @@ -40,8 +40,13 @@ struct Config Alloc trans_alloc; Alloc bbox_alloc; Alloc drawmonoid_alloc; + Alloc clip_alloc; + Alloc clip_bic_alloc; + Alloc clip_stack_alloc; + Alloc clip_bbox_alloc; uint n_trans; uint n_path; + uint n_clip; uint trans_offset; uint linewidth_offset; uint pathtag_offset; diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv index af5ffb9..e74cb8d 100644 Binary files a/piet-gpu/shader/gen/transform_reduce.spv and b/piet-gpu/shader/gen/transform_reduce.spv differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index 9aba204..dd4a855 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -91,7 +91,6 @@ void main() { vec2 xy = vec2(xy_uint); mediump vec4 rgba[CHUNK]; uint blend_stack[MAX_BLEND_STACK][CHUNK]; - mediump float blend_alpha_stack[MAX_BLEND_STACK][CHUNK]; for (uint i = 0; i < CHUNK; i++) { rgba[i] = vec4(0.0); } @@ -211,7 +210,6 @@ void main() { // The following is a sanity check so we don't corrupt memory should there be malformed inputs. uint d = min(clip_depth, MAX_BLEND_STACK - 1); blend_stack[d][k] = packsRGB(vec4(rgba[k])); - blend_alpha_stack[d][k] = clamp(abs(area[k]), 0.0, 1.0); rgba[k] = vec4(0.0); } clip_depth++; @@ -222,7 +220,7 @@ void main() { for (uint k = 0; k < CHUNK; k++) { uint d = min(clip_depth, MAX_BLEND_STACK - 1); mediump vec4 bg = unpacksRGB(blend_stack[d][k]); - mediump vec4 fg = rgba[k] * area[k] * blend_alpha_stack[d][k]; + mediump vec4 fg = rgba[k] * area[k]; rgba[k] = bg * (1.0 - fg.a) + fg; } cmd_ref.offset += 4; diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index 4211b08..0dccecb 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -46,11 +46,23 @@ struct Config { // Monoid for draw objects Alloc drawmonoid_alloc; + // BeginClip(path_ix) / EndClip + Alloc clip_alloc; + // Intermediate bicyclic semigroup + Alloc clip_bic_alloc; + // Intermediate stack + Alloc clip_stack_alloc; + // Clip processing results (path_ix + bbox) + Alloc clip_bbox_alloc; + // Number of transforms in scene // This is probably not needed. uint n_trans; - // This only counts actual paths, not EndClip. + // This *should* count only actual paths, but in the current + // implementation is redundant with n_elements. uint n_path; + // Total number of BeginClip and EndClip draw objects. + uint n_clip; // Offset (in bytes) of transform stream in scene buffer uint trans_offset; // Offset (in bytes) of linewidth stream in scene diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs index 87fff1c..767f4ba 100644 --- a/piet-gpu/src/encoder.rs +++ b/piet-gpu/src/encoder.rs @@ -20,7 +20,8 @@ use bytemuck::{Pod, Zeroable}; use piet_gpu_hal::BufWrite; use crate::stages::{ - self, Config, PathEncoder, Transform, DRAW_PART_SIZE, PATHSEG_PART_SIZE, TRANSFORM_PART_SIZE, + self, Config, PathEncoder, Transform, CLIP_PART_SIZE, DRAW_PART_SIZE, PATHSEG_PART_SIZE, + TRANSFORM_PART_SIZE, }; pub struct Encoder { @@ -31,6 +32,7 @@ pub struct Encoder { drawobj_stream: Vec, n_path: u32, n_pathseg: u32, + n_clip: u32, } /// A scene fragment encoding a glyph. @@ -98,6 +100,7 @@ impl Encoder { drawobj_stream: Vec::new(), n_path: 0, n_pathseg: 0, + n_clip: 0, } } @@ -155,6 +158,7 @@ impl Encoder { ..Default::default() }; self.drawobj_stream.extend(bytemuck::bytes_of(&element)); + self.n_clip += 1; saved } @@ -170,6 +174,7 @@ impl Encoder { // This is a dummy path, and will go away with the new clip impl. self.tag_stream.push(0x10); self.n_path += 1; + self.n_clip += 1; } /// Return a config for the element processing pipeline. @@ -203,6 +208,20 @@ impl Encoder { alloc += n_drawobj_padded * DRAWMONOID_SIZE; let anno_alloc = alloc; alloc += n_drawobj * ANNOTATED_SIZE; + let clip_alloc = alloc; + let n_clip = self.n_clip as usize; + const CLIP_SIZE: usize = 4; + alloc += n_clip * CLIP_SIZE; + let clip_bic_alloc = alloc; + const CLIP_BIC_SIZE: usize = 8; + // This can round down, as we only reduce the prefix + alloc += (n_clip / CLIP_PART_SIZE as usize) * CLIP_BIC_SIZE; + let clip_stack_alloc = alloc; + const CLIP_EL_SIZE: usize = 20; + alloc += n_clip * CLIP_EL_SIZE; + let clip_bbox_alloc = alloc; + const CLIP_BBOX_SIZE: usize = 16; + alloc += align_up(n_clip as usize, CLIP_PART_SIZE as usize) * CLIP_BBOX_SIZE; let config = Config { n_elements: n_drawobj as u32, @@ -212,8 +231,13 @@ impl Encoder { trans_alloc: trans_alloc as u32, bbox_alloc: bbox_alloc as u32, drawmonoid_alloc: drawmonoid_alloc as u32, + clip_alloc: clip_alloc as u32, + clip_bic_alloc: clip_bic_alloc as u32, + clip_stack_alloc: clip_stack_alloc as u32, + clip_bbox_alloc: clip_bbox_alloc as u32, n_trans: n_trans as u32, n_path: self.n_path, + n_clip: self.n_clip, trans_offset: trans_offset as u32, linewidth_offset: linewidth_offset as u32, pathtag_offset: pathtag_offset as u32, @@ -261,6 +285,10 @@ impl Encoder { self.tag_stream.len() } + pub(crate) fn n_clip(&self) -> u32 { + self.n_clip + } + pub(crate) fn encode_glyph(&mut self, glyph: &GlyphEncoder) { self.tag_stream.extend(&glyph.tag_stream); self.pathseg_stream.extend(&glyph.pathseg_stream); diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 97e1f28..b8b7532 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -20,9 +20,9 @@ use piet_gpu_hal::{ }; use pico_svg::PicoSvg; -use stages::{ElementBinding, ElementCode}; +use stages::{ClipBinding, ElementBinding, ElementCode}; -use crate::stages::{Config, ElementStage}; +use crate::stages::{ClipCode, Config, ElementStage}; const TILE_W: usize = 16; const TILE_H: usize = 16; @@ -86,6 +86,9 @@ pub struct Renderer { element_stage: ElementStage, element_bindings: Vec, + clip_code: ClipCode, + clip_binding: ClipBinding, + tile_pipeline: Pipeline, tile_ds: DescriptorSet, @@ -110,6 +113,7 @@ pub struct Renderer { n_paths: usize, n_pathseg: usize, n_pathtag: usize, + n_clip: u32, // Keep a reference to the image so that it is not destroyed. _bg_image: Image, @@ -191,18 +195,20 @@ impl Renderer { let element_stage = ElementStage::new(session, &element_code); let element_bindings = scene_bufs .iter() - .zip(&config_bufs) - .map(|(scene_buf, config_buf)| { + .map(|scene_buf| { element_stage.bind( session, &element_code, - config_buf, + &config_buf, scene_buf, &memory_buf_dev, ) }) .collect(); + let clip_code = ClipCode::new(session); + let clip_binding = ClipBinding::new(session, &clip_code, &config_buf, &memory_buf_dev); + let tile_alloc_code = include_shader!(session, "../shader/gen/tile_alloc"); let tile_pipeline = session .create_compute_pipeline(tile_alloc_code, &[BindType::Buffer, BindType::BufReadOnly])?; @@ -286,6 +292,8 @@ impl Renderer { element_code, element_stage, element_bindings, + clip_code, + clip_binding, tile_pipeline, tile_ds, path_pipeline, @@ -304,6 +312,7 @@ impl Renderer { n_paths: 0, n_pathseg: 0, n_pathtag: 0, + n_clip: 0, _bg_image: bg_image, gradient_bufs, gradients, @@ -329,6 +338,7 @@ impl Renderer { self.n_drawobj = render_ctx.n_drawobj(); self.n_pathseg = render_ctx.n_pathseg() as usize; self.n_pathtag = render_ctx.n_pathtag(); + self.n_clip = render_ctx.n_clip(); // These constants depend on encoding and may need to be updated. // Perhaps we can plumb these from piet-gpu-derive? @@ -342,6 +352,7 @@ impl Renderer { alloc += ((n_drawobj + 255) & !255) * BIN_SIZE; let ptcl_base = alloc; alloc += width_in_tiles * height_in_tiles * PTCL_INITIAL_ALLOC; + config.width_in_tiles = width_in_tiles as u32; config.height_in_tiles = height_in_tiles as u32; config.tile_alloc = tile_base as u32; @@ -401,6 +412,19 @@ impl Renderer { cmd_buf.end_debug_label(); cmd_buf.write_timestamp(&query_pool, 1); cmd_buf.memory_barrier(); + cmd_buf.begin_debug_label("Clip bounding box calculation"); + self.clip_binding + .record(cmd_buf, &self.clip_code, self.n_clip as u32); + cmd_buf.end_debug_label(); + cmd_buf.begin_debug_label("Element binning"); + cmd_buf.dispatch( + &self.bin_pipeline, + &self.bin_ds, + (((self.n_paths + 255) / 256) as u32, 1, 1), + (256, 1, 1), + ); + cmd_buf.end_debug_label(); + cmd_buf.memory_barrier(); cmd_buf.begin_debug_label("Tile allocation"); cmd_buf.dispatch( &self.tile_pipeline, @@ -430,18 +454,7 @@ impl Renderer { ); cmd_buf.end_debug_label(); cmd_buf.write_timestamp(&query_pool, 4); - // Note: this barrier is not needed as an actual dependency between - // pipeline stages, but I am keeping it in so that timer queries are - // easier to interpret. - cmd_buf.memory_barrier(); - cmd_buf.begin_debug_label("Element binning"); - cmd_buf.dispatch( - &self.bin_pipeline, - &self.bin_ds, - (((self.n_paths + 255) / 256) as u32, 1, 1), - (256, 1, 1), - ); - cmd_buf.end_debug_label(); + // TODO: redo query accounting cmd_buf.write_timestamp(&query_pool, 5); cmd_buf.memory_barrier(); cmd_buf.begin_debug_label("Coarse raster"); diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index 96bbf03..ef0a3a7 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -123,6 +123,10 @@ impl PietGpuRenderContext { self.new_encoder.n_transform() } + pub fn n_clip(&self) -> u32 { + self.new_encoder.n_clip() + } + pub fn write_scene(&self, buf: &mut BufWrite) { self.new_encoder.write_scene(buf); } diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs index 014cef4..e155c50 100644 --- a/piet-gpu/src/stages.rs +++ b/piet-gpu/src/stages.rs @@ -16,12 +16,14 @@ //! Stages for new element pipeline, exposed for testing. +mod clip; mod draw; mod path; mod transform; use bytemuck::{Pod, Zeroable}; +pub use clip::{ClipBinding, ClipCode, CLIP_PART_SIZE}; pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE}; pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE}; use piet_gpu_hal::{Buffer, CmdBuf, Session}; @@ -47,8 +49,13 @@ pub struct Config { pub trans_alloc: u32, pub bbox_alloc: u32, pub drawmonoid_alloc: u32, + pub clip_alloc: u32, + pub clip_bic_alloc: u32, + pub clip_stack_alloc: u32, + pub clip_bbox_alloc: u32, pub n_trans: u32, pub n_path: u32, + pub n_clip: u32, pub trans_offset: u32, pub linewidth_offset: u32, pub pathtag_offset: u32, diff --git a/piet-gpu/src/stages/clip.rs b/piet-gpu/src/stages/clip.rs new file mode 100644 index 0000000..e4bc3db --- /dev/null +++ b/piet-gpu/src/stages/clip.rs @@ -0,0 +1,94 @@ +// Copyright 2022 The piet-gpu authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +//! The clip processing stage (includes substages). + +use piet_gpu_hal::{include_shader, BindType, Buffer, CmdBuf, DescriptorSet, Pipeline, Session}; + +// Note that this isn't the code/stage/binding pattern of most of the other stages +// in the new element processing pipeline. We want to move those temporary buffers +// into common memory and converge on this pattern. +pub struct ClipCode { + reduce_pipeline: Pipeline, + leaf_pipeline: Pipeline, +} + +pub struct ClipBinding { + reduce_ds: DescriptorSet, + leaf_ds: DescriptorSet, +} + +pub const CLIP_PART_SIZE: u32 = 256; + +impl ClipCode { + pub unsafe fn new(session: &Session) -> ClipCode { + let reduce_code = include_shader!(session, "../../shader/gen/clip_reduce"); + let reduce_pipeline = session + .create_compute_pipeline(reduce_code, &[BindType::Buffer, BindType::BufReadOnly]) + .unwrap(); + let leaf_code = include_shader!(session, "../../shader/gen/clip_leaf"); + let leaf_pipeline = session + .create_compute_pipeline(leaf_code, &[BindType::Buffer, BindType::BufReadOnly]) + .unwrap(); + ClipCode { + reduce_pipeline, + leaf_pipeline, + } + } +} + +impl ClipBinding { + pub unsafe fn new( + session: &Session, + code: &ClipCode, + config: &Buffer, + memory: &Buffer, + ) -> ClipBinding { + let reduce_ds = session + .create_simple_descriptor_set(&code.reduce_pipeline, &[memory, config]) + .unwrap(); + let leaf_ds = session + .create_simple_descriptor_set(&code.leaf_pipeline, &[memory, config]) + .unwrap(); + ClipBinding { reduce_ds, leaf_ds } + } + + /// Record the clip dispatches. + /// + /// Assumes memory barrier on entry. Provides memory barrier on exit. + pub unsafe fn record(&self, cmd_buf: &mut CmdBuf, code: &ClipCode, n_clip: u32) { + let n_wg_reduce = n_clip.saturating_sub(1) / CLIP_PART_SIZE; + if n_wg_reduce > 0 { + cmd_buf.dispatch( + &code.reduce_pipeline, + &self.reduce_ds, + (n_wg_reduce, 1, 1), + (CLIP_PART_SIZE, 1, 1), + ); + cmd_buf.memory_barrier(); + } + let n_wg = (n_clip + CLIP_PART_SIZE - 1) / CLIP_PART_SIZE; + if n_wg > 0 { + cmd_buf.dispatch( + &code.leaf_pipeline, + &self.leaf_ds, + (n_wg, 1, 1), + (CLIP_PART_SIZE, 1, 1), + ); + cmd_buf.memory_barrier(); + } + } +} diff --git a/tests/src/clip.rs b/tests/src/clip.rs new file mode 100644 index 0000000..cfd8a35 --- /dev/null +++ b/tests/src/clip.rs @@ -0,0 +1,237 @@ +// Copyright 2022 The piet-gpu authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +//! Tests for the piet-gpu clip processing stage. + +use bytemuck::{Pod, Zeroable}; +use rand::Rng; + +use piet_gpu::stages::{self, ClipBinding, ClipCode, DrawMonoid}; +use piet_gpu_hal::{BufWrite, BufferUsage}; + +use crate::{Config, Runner, TestResult}; + +struct ClipData { + clip_stream: Vec, + // In the atomic-int friendly encoding + path_bbox_stream: Vec, +} + +#[derive(Copy, Clone, Debug, Pod, Zeroable, Default)] +#[repr(C)] +struct PathBbox { + bbox: [u32; 4], + linewidth: f32, + trans_ix: u32, +} + +pub unsafe fn clip_test(runner: &mut Runner, config: &Config) -> TestResult { + let mut result = TestResult::new("clip"); + let n_clip: u64 = config.size.choose(1 << 8, 1 << 12, 1 << 16); + let data = ClipData::new(n_clip); + let stage_config = data.get_config(); + let config_buf = runner + .session + .create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE) + .unwrap(); + // Need to actually get data uploaded + let mut memory = runner.buf_down(data.memory_size(), BufferUsage::STORAGE); + { + let mut buf_write = memory.map_write(..); + data.fill_memory(&mut buf_write); + } + + let code = ClipCode::new(&runner.session); + let binding = ClipBinding::new(&runner.session, &code, &config_buf, &memory.dev_buf); + + let mut commands = runner.commands(); + commands.write_timestamp(0); + commands.upload(&memory); + binding.record(&mut commands.cmd_buf, &code, n_clip as u32); + commands.download(&memory); + commands.write_timestamp(1); + runner.submit(commands); + let dst = memory.map_read(..); + if let Some(failure) = data.verify(&dst) { + result.fail(failure); + } + result +} + +fn rand_bbox() -> [u32; 4] { + let mut rng = rand::thread_rng(); + const Y_MIN: u32 = 32768; + const Y_MAX: u32 = Y_MIN + 1000; + let mut x0 = rng.gen_range(Y_MIN, Y_MAX); + let mut y0 = rng.gen_range(Y_MIN, Y_MAX); + let mut x1 = rng.gen_range(Y_MIN, Y_MAX); + let mut y1 = rng.gen_range(Y_MIN, Y_MAX); + if x0 > x1 { + std::mem::swap(&mut x0, &mut x1); + } + if y0 > y1 { + std::mem::swap(&mut y0, &mut y1); + } + [x0, y0, x1, y1] +} + +/// Convert from atomic-friendly to normal float bbox. +fn decode_bbox(raw: [u32; 4]) -> [f32; 4] { + fn decode(x: u32) -> f32 { + x as f32 - 32768.0 + } + [ + decode(raw[0]), + decode(raw[1]), + decode(raw[2]), + decode(raw[3]), + ] +} + +fn intersect_bbox(b0: [f32; 4], b1: [f32; 4]) -> [f32; 4] { + [ + b0[0].max(b1[0]), + b0[1].max(b1[1]), + b0[2].min(b1[2]), + b0[3].min(b1[3]), + ] +} + +const INFTY_BBOX: [f32; 4] = [-1e9, -1e9, 1e9, 1e9]; + +impl ClipData { + /// Generate a random clip sequence + fn new(n: u64) -> ClipData { + // Simple LCG random generator, for deterministic results + let mut z = 20170705u64; + let mut depth = 0; + let mut path_bbox_stream = Vec::new(); + let clip_stream = (0..n) + .map(|i| { + let is_push = if depth == 0 { + true + } else if depth >= 255 { + false + } else { + z = z.wrapping_mul(742938285) % ((1 << 31) - 1); + (z % 2) != 0 + }; + if is_push { + depth += 1; + let path_ix = path_bbox_stream.len() as u32; + let bbox = rand_bbox(); + let path_bbox = PathBbox { + bbox, + ..Default::default() + }; + path_bbox_stream.push(path_bbox); + path_ix + } else { + depth -= 1; + !(i as u32) + } + }) + .collect(); + ClipData { + clip_stream, + path_bbox_stream, + } + } + + fn get_config(&self) -> stages::Config { + let n_clip = self.clip_stream.len(); + let n_path = self.path_bbox_stream.len(); + let clip_alloc = 0; + let path_bbox_alloc = clip_alloc + 4 * n_clip; + let drawmonoid_alloc = path_bbox_alloc + 24 * n_path; + let clip_bic_alloc = drawmonoid_alloc + 8 * n_clip; + // TODO: this is over-allocated, we only need one bic per wg + let clip_stack_alloc = clip_bic_alloc + 8 * n_clip; + let clip_bbox_alloc = clip_stack_alloc + 20 * n_clip; + stages::Config { + clip_alloc: clip_alloc as u32, + // TODO: this wants to be renamed to path_bbox_alloc + bbox_alloc: path_bbox_alloc as u32, + drawmonoid_alloc: drawmonoid_alloc as u32, + clip_bic_alloc: clip_bic_alloc as u32, + clip_stack_alloc: clip_stack_alloc as u32, + clip_bbox_alloc: clip_bbox_alloc as u32, + n_clip: n_clip as u32, + ..Default::default() + } + } + + fn memory_size(&self) -> u64 { + (8 + self.clip_stream.len() * (4 + 8 + 8 + 20 + 16) + self.path_bbox_stream.len() * 24) + as u64 + } + + fn fill_memory(&self, buf: &mut BufWrite) { + // offset / header; no dynamic allocation + buf.fill_zero(8); + buf.extend_slice(&self.clip_stream); + buf.extend_slice(&self.path_bbox_stream); + // drawmonoid is left uninitialized + } + + fn verify(&self, buf: &[u8]) -> Option { + let n_clip = self.clip_stream.len(); + let n_path = self.path_bbox_stream.len(); + let clip_bbox_start = 8 + n_clip * (4 + 8 + 8 + 20) + n_path * 24; + let clip_range = clip_bbox_start..(clip_bbox_start + n_clip * 16); + let clip_result = bytemuck::cast_slice::(&buf[clip_range]); + let draw_start = 8 + n_clip * 4 + n_path * 24; + let draw_range = draw_start..(draw_start + n_clip * 8); + let draw_result = bytemuck::cast_slice::(&buf[draw_range]); + let mut bbox_stack = Vec::new(); + let mut parent_stack = Vec::new(); + for (i, path_ix) in self.clip_stream.iter().enumerate() { + let mut expected_path = None; + if *path_ix >= 0x8000_0000 { + let parent = parent_stack.pop().unwrap(); + expected_path = Some(self.clip_stream[parent as usize]); + bbox_stack.pop().unwrap(); + } else { + parent_stack.push(i); + let path_bbox_stream = self.path_bbox_stream[*path_ix as usize]; + let bbox = decode_bbox(path_bbox_stream.bbox); + let new = match bbox_stack.last() { + None => bbox, + Some(old) => intersect_bbox(*old, bbox), + }; + bbox_stack.push(new); + }; + let expected = bbox_stack.last().copied().unwrap_or(INFTY_BBOX); + let clip_bbox = clip_result[i]; + if clip_bbox != expected { + return Some(format!( + "{}: path_ix={}, expected bbox={:?}, clip_bbox={:?}", + i, path_ix, expected, clip_bbox + )); + } + if let Some(expected_path) = expected_path { + let actual_path = draw_result[i].path_ix; + if expected_path != actual_path { + return Some(format!( + "{}: expected path {}, actual {}", + i, expected_path, actual_path + )); + } + } + } + None + } +} diff --git a/tests/src/draw.rs b/tests/src/draw.rs index d79a9d9..7b264d4 100644 --- a/tests/src/draw.rs +++ b/tests/src/draw.rs @@ -102,17 +102,21 @@ impl DrawTestData { // Layout of memory let drawmonoid_alloc = 0; let anno_alloc = drawmonoid_alloc + 8 * n_tags; + let clip_alloc = anno_alloc + ANNOTATED_SIZE * n_tags; let stage_config = stages::Config { n_elements: n_tags as u32, anno_alloc: anno_alloc as u32, drawmonoid_alloc: drawmonoid_alloc as u32, + clip_alloc: clip_alloc as u32, ..Default::default() }; stage_config } fn memory_size(&self) -> u64 { - (8 + self.tags.len() * (8 + ANNOTATED_SIZE)) as u64 + // Note: this overallocates the clip buf a bit - only needed for the + // total number of begin_clip and end_clip tags. + (8 + self.tags.len() * (8 + 4 + ANNOTATED_SIZE)) as u64 } fn fill_scene(&self, buf: &mut BufWrite) { @@ -128,14 +132,13 @@ impl DrawTestData { let actual = bytemuck::cast_slice::(&buf[8..8 + size]); let mut expected = DrawMonoid::default(); for (i, (tag, actual)) in self.tags.iter().zip(actual).enumerate() { - // We compute an inclusive prefix sum, but for this application - // exclusive would be slightly better. We can adapt though. + // Verify exclusive prefix sum. let (path_ix, clip_ix) = Self::reduce_tag(*tag); - expected.path_ix += path_ix; - expected.clip_ix += clip_ix; if *actual != expected { return Some(format!("draw mismatch at {}", i)); } + expected.path_ix += path_ix; + expected.clip_ix += clip_ix; } None } diff --git a/tests/src/main.rs b/tests/src/main.rs index e52ce85..5599f70 100644 --- a/tests/src/main.rs +++ b/tests/src/main.rs @@ -17,6 +17,7 @@ //! Tests for piet-gpu shaders and GPU capabilities. mod clear; +mod clip; mod config; mod draw; mod linkedlist; @@ -139,6 +140,7 @@ fn main() { report(&transform::transform_test(&mut runner, &config)); report(&path::path_test(&mut runner, &config)); report(&draw::draw_test(&mut runner, &config)); + report(&clip::clip_test(&mut runner, &config)); } } } diff --git a/tests/src/runner.rs b/tests/src/runner.rs index 9bca26b..1fd6774 100644 --- a/tests/src/runner.rs +++ b/tests/src/runner.rs @@ -20,8 +20,8 @@ use std::ops::RangeBounds; use bytemuck::Pod; use piet_gpu_hal::{ - BackendType, BufReadGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags, QueryPool, - Session, + BackendType, BufReadGuard, BufWriteGuard, Buffer, BufferUsage, CmdBuf, Instance, InstanceFlags, + QueryPool, Session, }; pub struct Runner { @@ -37,15 +37,8 @@ pub struct Commands { query_pool: QueryPool, } -/// Buffer for uploading data to GPU. -#[allow(unused)] -pub struct BufUp { - pub stage_buf: Buffer, - pub dev_buf: Buffer, -} - -/// Buffer for downloading data from GPU. -pub struct BufDown { +/// Buffer for both uploading and downloading +pub struct BufStage { pub stage_buf: Buffer, pub dev_buf: Buffer, } @@ -92,7 +85,7 @@ impl Runner { } #[allow(unused)] - pub fn buf_up(&self, size: u64) -> BufUp { + pub fn buf_up(&self, size: u64) -> BufStage { let stage_buf = self .session .create_buffer(size, BufferUsage::MAP_WRITE | BufferUsage::COPY_SRC) @@ -101,13 +94,13 @@ impl Runner { .session .create_buffer(size, BufferUsage::COPY_DST | BufferUsage::STORAGE) .unwrap(); - BufUp { stage_buf, dev_buf } + BufStage { stage_buf, dev_buf } } /// Create a buffer for download (readback). /// /// The `usage` parameter need not include COPY_SRC and STORAGE. - pub fn buf_down(&self, size: u64, usage: BufferUsage) -> BufDown { + pub fn buf_down(&self, size: u64, usage: BufferUsage) -> BufStage { let stage_buf = self .session .create_buffer(size, BufferUsage::MAP_READ | BufferUsage::COPY_DST) @@ -116,7 +109,7 @@ impl Runner { .session .create_buffer(size, usage | BufferUsage::COPY_SRC | BufferUsage::STORAGE) .unwrap(); - BufDown { stage_buf, dev_buf } + BufStage { stage_buf, dev_buf } } pub fn backend_type(&self) -> BackendType { @@ -129,17 +122,16 @@ impl Commands { self.cmd_buf.write_timestamp(&self.query_pool, query); } - #[allow(unused)] - pub unsafe fn upload(&mut self, buf: &BufUp) { + pub unsafe fn upload(&mut self, buf: &BufStage) { self.cmd_buf.copy_buffer(&buf.stage_buf, &buf.dev_buf); } - pub unsafe fn download(&mut self, buf: &BufDown) { + pub unsafe fn download(&mut self, buf: &BufStage) { self.cmd_buf.copy_buffer(&buf.dev_buf, &buf.stage_buf); } } -impl BufDown { +impl BufStage { pub unsafe fn read(&self, dst: &mut Vec) { self.stage_buf.read(dst).unwrap() } @@ -147,4 +139,8 @@ impl BufDown { pub unsafe fn map_read<'a>(&'a self, range: impl RangeBounds) -> BufReadGuard<'a> { self.stage_buf.map_read(range).unwrap() } + + pub unsafe fn map_write<'a>(&'a mut self, range: impl RangeBounds) -> BufWriteGuard { + self.stage_buf.map_write(range).unwrap() + } }