diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index 1b3f252..fc6df21 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -151,6 +151,12 @@ void main() { uint part_start_ix = 0; uint ready_ix = 0; + Alloc scratch_alloc = slice_mem(cmd_alloc, 0, Alloc_size); + cmd_ref.offset += 4; + // Accounting for allocation of blend memory + uint render_blend_depth = 0; + uint max_blend_depth = 0; + uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2; uint drawtag_start = conf.drawtag_offset >> 2; uint drawdata_start = conf.drawdata_offset >> 2; @@ -414,6 +420,8 @@ void main() { } Cmd_BeginClip_write(cmd_alloc, cmd_ref); cmd_ref.offset += 4; + render_blend_depth++; + max_blend_depth = max(max_blend_depth, render_blend_depth); } clip_depth++; break; @@ -426,6 +434,7 @@ void main() { uint blend = scene[dd]; Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend)); cmd_ref.offset += 4 + CmdEndClip_size; + render_blend_depth--; break; } } else { @@ -451,5 +460,10 @@ void main() { } if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) { Cmd_End_write(cmd_alloc, cmd_ref); + if (max_blend_depth > BLEND_STACK_SPLIT) { + uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4; + MallocResult scratch = malloc(scratch_size); + alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc); + } } } diff --git a/piet-gpu/shader/gen/backdrop.dxil b/piet-gpu/shader/gen/backdrop.dxil index df2be88..0fb9622 100644 Binary files a/piet-gpu/shader/gen/backdrop.dxil and b/piet-gpu/shader/gen/backdrop.dxil differ diff --git a/piet-gpu/shader/gen/backdrop_lg.dxil b/piet-gpu/shader/gen/backdrop_lg.dxil index 81f9b65..e24a6d3 100644 Binary files a/piet-gpu/shader/gen/backdrop_lg.dxil and b/piet-gpu/shader/gen/backdrop_lg.dxil differ diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil index 6b3efaf..6655b7f 100644 Binary files a/piet-gpu/shader/gen/bbox_clear.dxil and b/piet-gpu/shader/gen/bbox_clear.dxil differ diff --git a/piet-gpu/shader/gen/clip_leaf.dxil b/piet-gpu/shader/gen/clip_leaf.dxil index b681a65..29a158e 100644 Binary files a/piet-gpu/shader/gen/clip_leaf.dxil and b/piet-gpu/shader/gen/clip_leaf.dxil differ diff --git a/piet-gpu/shader/gen/clip_reduce.dxil b/piet-gpu/shader/gen/clip_reduce.dxil index 0ccaac9..0dff71b 100644 Binary files a/piet-gpu/shader/gen/clip_reduce.dxil and b/piet-gpu/shader/gen/clip_reduce.dxil differ diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil index c91fcdf..f71cc04 100644 Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl index 0331e33..a7f769f 100644 --- a/piet-gpu/shader/gen/coarse.hlsl +++ b/piet-gpu/shader/gen/coarse.hlsl @@ -175,9 +175,9 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); -RWByteAddressBuffer _260 : register(u0, space0); -ByteAddressBuffer _1005 : register(t1, space0); -ByteAddressBuffer _1372 : register(t2, space0); +RWByteAddressBuffer _266 : register(u0, space0); +ByteAddressBuffer _1020 : register(t1, space0); +ByteAddressBuffer _1399 : register(t2, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -200,8 +200,8 @@ groupshared uint sh_tile_count[256]; Alloc slice_mem(Alloc a, uint offset, uint size) { - Alloc _337 = { a.offset + offset }; - return _337; + Alloc _343 = { a.offset + offset }; + return _343; } bool touch_mem(Alloc alloc, uint offset) @@ -217,7 +217,7 @@ uint read_mem(Alloc alloc, uint offset) { return 0u; } - uint v = _260.Load(offset * 4 + 8); + uint v = _266.Load(offset * 4 + 8); return v; } @@ -230,8 +230,8 @@ Alloc new_alloc(uint offset, uint size, bool mem_ok) BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) { - BinInstanceRef _346 = { ref.offset + (index * 4u) }; - return _346; + BinInstanceRef _361 = { ref.offset + (index * 4u) }; + return _361; } BinInstance BinInstance_read(Alloc a, BinInstanceRef ref) @@ -259,8 +259,8 @@ Path Path_read(Alloc a, PathRef ref) uint raw2 = read_mem(param_4, param_5); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); - TileRef _409 = { raw2 }; - s.tiles = _409; + TileRef _424 = { raw2 }; + s.tiles = _424; return s; } @@ -270,11 +270,11 @@ void write_tile_alloc(uint el_ix, Alloc a) Alloc read_tile_alloc(uint el_ix, bool mem_ok) { - uint _892; - _260.GetDimensions(_892); - _892 = (_892 - 8) / 4; + uint _907; + _266.GetDimensions(_907); + _907 = (_907 - 8) / 4; uint param = 0u; - uint param_1 = uint(int(_892) * 4); + uint param_1 = uint(int(_907) * 4); bool param_2 = mem_ok; return new_alloc(param, param_1, param_2); } @@ -288,31 +288,31 @@ Tile Tile_read(Alloc a, TileRef ref) Alloc param_2 = a; uint param_3 = ix + 1u; uint raw1 = read_mem(param_2, param_3); - TileSegRef _434 = { raw0 }; + TileSegRef _449 = { raw0 }; Tile s; - s.tile = _434; + s.tile = _449; s.backdrop = int(raw1); return s; } MallocResult malloc(uint size) { - uint _266; - _260.InterlockedAdd(0, size, _266); - uint offset = _266; - uint _273; - _260.GetDimensions(_273); - _273 = (_273 - 8) / 4; + uint _272; + _266.InterlockedAdd(0, size, _272); + uint offset = _272; + uint _279; + _266.GetDimensions(_279); + _279 = (_279 - 8) / 4; MallocResult r; - r.failed = (offset + size) > uint(int(_273) * 4); + r.failed = (offset + size) > uint(int(_279) * 4); uint param = offset; uint param_1 = size; bool param_2 = !r.failed; r.alloc = new_alloc(param, param_1, param_2); if (r.failed) { - uint _295; - _260.InterlockedMax(4, 1u, _295); + uint _301; + _266.InterlockedMax(4, 1u, _301); return r; } return r; @@ -326,7 +326,7 @@ void write_mem(Alloc alloc, uint offset, uint val) { return; } - _260.Store(offset * 4 + 8, val); + _266.Store(offset * 4 + 8, val); } void CmdJump_write(Alloc a, CmdJumpRef ref, CmdJump s) @@ -344,9 +344,9 @@ void Cmd_Jump_write(Alloc a, CmdRef ref, CmdJump s) uint param_1 = ref.offset >> uint(2); uint param_2 = 11u; write_mem(param, param_1, param_2); - CmdJumpRef _885 = { ref.offset + 4u }; + CmdJumpRef _900 = { ref.offset + 4u }; Alloc param_3 = a; - CmdJumpRef param_4 = _885; + CmdJumpRef param_4 = _900; CmdJump param_5 = s; CmdJump_write(param_3, param_4, param_5); } @@ -358,21 +358,21 @@ bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit return true; } uint param = 1024u; - MallocResult _913 = malloc(param); - MallocResult new_cmd = _913; + MallocResult _928 = malloc(param); + MallocResult new_cmd = _928; if (new_cmd.failed) { return false; } - CmdJump _923 = { new_cmd.alloc.offset }; - CmdJump jump = _923; + CmdJump _938 = { new_cmd.alloc.offset }; + CmdJump jump = _938; Alloc param_1 = cmd_alloc; CmdRef param_2 = cmd_ref; CmdJump param_3 = jump; Cmd_Jump_write(param_1, param_2, param_3); cmd_alloc = new_cmd.alloc; - CmdRef _935 = { cmd_alloc.offset }; - cmd_ref = _935; + CmdRef _950 = { cmd_alloc.offset }; + cmd_ref = _950; cmd_limit = (cmd_alloc.offset + 1024u) - 144u; return true; } @@ -396,9 +396,9 @@ void Cmd_Fill_write(Alloc a, CmdRef ref, CmdFill s) uint param_1 = ref.offset >> uint(2); uint param_2 = 1u; write_mem(param, param_1, param_2); - CmdFillRef _742 = { ref.offset + 4u }; + CmdFillRef _757 = { ref.offset + 4u }; Alloc param_3 = a; - CmdFillRef param_4 = _742; + CmdFillRef param_4 = _757; CmdFill param_5 = s; CmdFill_write(param_3, param_4, param_5); } @@ -430,9 +430,9 @@ void Cmd_Stroke_write(Alloc a, CmdRef ref, CmdStroke s) uint param_1 = ref.offset >> uint(2); uint param_2 = 2u; write_mem(param, param_1, param_2); - CmdStrokeRef _760 = { ref.offset + 4u }; + CmdStrokeRef _775 = { ref.offset + 4u }; Alloc param_3 = a; - CmdStrokeRef param_4 = _760; + CmdStrokeRef param_4 = _775; CmdStroke param_5 = s; CmdStroke_write(param_3, param_4, param_5); } @@ -443,8 +443,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) { if (tile.tile.offset != 0u) { - CmdFill _958 = { tile.tile.offset, tile.backdrop }; - CmdFill cmd_fill = _958; + CmdFill _973 = { tile.tile.offset, tile.backdrop }; + CmdFill cmd_fill = _973; Alloc param = alloc; CmdRef param_1 = cmd_ref; CmdFill param_2 = cmd_fill; @@ -461,8 +461,8 @@ void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) } else { - CmdStroke _988 = { tile.tile.offset, 0.5f * linewidth }; - CmdStroke cmd_stroke = _988; + CmdStroke _1003 = { tile.tile.offset, 0.5f * linewidth }; + CmdStroke cmd_stroke = _1003; Alloc param_5 = alloc; CmdRef param_6 = cmd_ref; CmdStroke param_7 = cmd_stroke; @@ -486,9 +486,9 @@ void Cmd_Color_write(Alloc a, CmdRef ref, CmdColor s) uint param_1 = ref.offset >> uint(2); uint param_2 = 5u; write_mem(param, param_1, param_2); - CmdColorRef _786 = { ref.offset + 4u }; + CmdColorRef _801 = { ref.offset + 4u }; Alloc param_3 = a; - CmdColorRef param_4 = _786; + CmdColorRef param_4 = _801; CmdColor param_5 = s; CmdColor_write(param_3, param_4, param_5); } @@ -520,9 +520,9 @@ void Cmd_LinGrad_write(Alloc a, CmdRef ref, CmdLinGrad s) uint param_1 = ref.offset >> uint(2); uint param_2 = 6u; write_mem(param, param_1, param_2); - CmdLinGradRef _804 = { ref.offset + 4u }; + CmdLinGradRef _819 = { ref.offset + 4u }; Alloc param_3 = a; - CmdLinGradRef param_4 = _804; + CmdLinGradRef param_4 = _819; CmdLinGrad param_5 = s; CmdLinGrad_write(param_3, param_4, param_5); } @@ -582,9 +582,9 @@ void Cmd_RadGrad_write(Alloc a, CmdRef ref, CmdRadGrad s) uint param_1 = ref.offset >> uint(2); uint param_2 = 7u; write_mem(param, param_1, param_2); - CmdRadGradRef _822 = { ref.offset + 4u }; + CmdRadGradRef _837 = { ref.offset + 4u }; Alloc param_3 = a; - CmdRadGradRef param_4 = _822; + CmdRadGradRef param_4 = _837; CmdRadGrad param_5 = s; CmdRadGrad_write(param_3, param_4, param_5); } @@ -608,9 +608,9 @@ void Cmd_Image_write(Alloc a, CmdRef ref, CmdImage s) uint param_1 = ref.offset >> uint(2); uint param_2 = 8u; write_mem(param, param_1, param_2); - CmdImageRef _840 = { ref.offset + 4u }; + CmdImageRef _855 = { ref.offset + 4u }; Alloc param_3 = a; - CmdImageRef param_4 = _840; + CmdImageRef param_4 = _855; CmdImage param_5 = s; CmdImage_write(param_3, param_4, param_5); } @@ -638,9 +638,9 @@ void Cmd_EndClip_write(Alloc a, CmdRef ref, CmdEndClip s) uint param_1 = ref.offset >> uint(2); uint param_2 = 10u; write_mem(param, param_1, param_2); - CmdEndClipRef _866 = { ref.offset + 4u }; + CmdEndClipRef _881 = { ref.offset + 4u }; Alloc param_3 = a; - CmdEndClipRef param_4 = _866; + CmdEndClipRef param_4 = _881; CmdEndClip param_5 = s; CmdEndClip_write(param_3, param_4, param_5); } @@ -653,27 +653,35 @@ void Cmd_End_write(Alloc a, CmdRef ref) write_mem(param, param_1, param_2); } +void alloc_write(Alloc a, uint offset, Alloc alloc) +{ + Alloc param = a; + uint param_1 = offset >> uint(2); + uint param_2 = alloc.offset; + write_mem(param, param_1, param_2); +} + void comp_main() { - uint width_in_bins = ((_1005.Load(8) + 16u) - 1u) / 16u; + uint width_in_bins = ((_1020.Load(8) + 16u) - 1u) / 16u; uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; uint partition_ix = 0u; - uint n_partitions = ((_1005.Load(0) + 256u) - 1u) / 256u; + uint n_partitions = ((_1020.Load(0) + 256u) - 1u) / 256u; uint th_ix = gl_LocalInvocationID.x; uint bin_tile_x = 16u * gl_WorkGroupID.x; uint bin_tile_y = 16u * gl_WorkGroupID.y; uint tile_x = gl_LocalInvocationID.x % 16u; uint tile_y = gl_LocalInvocationID.x / 16u; - uint this_tile_ix = (((bin_tile_y + tile_y) * _1005.Load(8)) + bin_tile_x) + tile_x; - Alloc _1070; - _1070.offset = _1005.Load(24); + uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.Load(8)) + bin_tile_x) + tile_x; + Alloc _1085; + _1085.offset = _1020.Load(24); Alloc param; - param.offset = _1070.offset; + param.offset = _1085.offset; uint param_1 = this_tile_ix * 1024u; uint param_2 = 1024u; Alloc cmd_alloc = slice_mem(param, param_1, param_2); - CmdRef _1079 = { cmd_alloc.offset }; - CmdRef cmd_ref = _1079; + CmdRef _1094 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1094; uint cmd_limit = (cmd_ref.offset + 1024u) - 144u; uint clip_depth = 0u; uint clip_zero_depth = 0u; @@ -681,18 +689,25 @@ void comp_main() uint wr_ix = 0u; uint part_start_ix = 0u; uint ready_ix = 0u; - uint drawmonoid_start = _1005.Load(44) >> uint(2); - uint drawtag_start = _1005.Load(100) >> uint(2); - uint drawdata_start = _1005.Load(104) >> uint(2); - uint drawinfo_start = _1005.Load(68) >> uint(2); - bool mem_ok = _260.Load(4) == 0u; - Alloc param_3; - Alloc param_5; - uint _1304; + Alloc param_3 = cmd_alloc; + uint param_4 = 0u; + uint param_5 = 8u; + Alloc scratch_alloc = slice_mem(param_3, param_4, param_5); + cmd_ref.offset += 4u; + uint render_blend_depth = 0u; + uint max_blend_depth = 0u; + uint drawmonoid_start = _1020.Load(44) >> uint(2); + uint drawtag_start = _1020.Load(100) >> uint(2); + uint drawdata_start = _1020.Load(104) >> uint(2); + uint drawinfo_start = _1020.Load(68) >> uint(2); + bool mem_ok = _266.Load(4) == 0u; + Alloc param_6; + Alloc param_8; + uint _1331; uint element_ix; - Alloc param_14; + Alloc param_17; uint tile_count; - uint _1605; + uint _1632; float linewidth; CmdLinGrad cmd_lin; CmdRadGrad cmd_rad; @@ -702,40 +717,40 @@ void comp_main() { sh_bitmaps[i][th_ix] = 0u; } - bool _1356; + bool _1383; for (;;) { if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) { part_start_ix = ready_ix; uint count = 0u; - bool _1154 = th_ix < 256u; - bool _1162; - if (_1154) + bool _1181 = th_ix < 256u; + bool _1189; + if (_1181) { - _1162 = (partition_ix + th_ix) < n_partitions; + _1189 = (partition_ix + th_ix) < n_partitions; } else { - _1162 = _1154; + _1189 = _1181; } - if (_1162) + if (_1189) { - uint in_ix = (_1005.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); - Alloc _1179; - _1179.offset = _1005.Load(20); - param_3.offset = _1179.offset; - uint param_4 = in_ix; - count = read_mem(param_3, param_4); - Alloc _1190; - _1190.offset = _1005.Load(20); - param_5.offset = _1190.offset; - uint param_6 = in_ix + 1u; - uint offset = read_mem(param_5, param_6); - uint param_7 = offset; - uint param_8 = count * 4u; - bool param_9 = mem_ok; - sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9); + uint in_ix = (_1020.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + Alloc _1206; + _1206.offset = _1020.Load(20); + param_6.offset = _1206.offset; + uint param_7 = in_ix; + count = read_mem(param_6, param_7); + Alloc _1217; + _1217.offset = _1020.Load(20); + param_8.offset = _1217.offset; + uint param_9 = in_ix + 1u; + uint offset = read_mem(param_8, param_9); + uint param_10 = offset; + uint param_11 = count * 4u; + bool param_12 = mem_ok; + sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12); } for (uint i_1 = 0u; i_1 < 8u; i_1++) { @@ -775,35 +790,35 @@ void comp_main() } if (part_ix > 0u) { - _1304 = sh_part_count[part_ix - 1u]; + _1331 = sh_part_count[part_ix - 1u]; } else { - _1304 = part_start_ix; + _1331 = part_start_ix; } - ix -= _1304; + ix -= _1331; Alloc bin_alloc = sh_part_elements[part_ix]; - BinInstanceRef _1323 = { bin_alloc.offset }; - BinInstanceRef inst_ref = _1323; - BinInstanceRef param_10 = inst_ref; - uint param_11 = ix; - Alloc param_12 = bin_alloc; - BinInstanceRef param_13 = BinInstance_index(param_10, param_11); - BinInstance inst = BinInstance_read(param_12, param_13); + BinInstanceRef _1350 = { bin_alloc.offset }; + BinInstanceRef inst_ref = _1350; + BinInstanceRef param_13 = inst_ref; + uint param_14 = ix; + Alloc param_15 = bin_alloc; + BinInstanceRef param_16 = BinInstance_index(param_13, param_14); + BinInstance inst = BinInstance_read(param_15, param_16); sh_elements[th_ix] = inst.element_ix; } GroupMemoryBarrierWithGroupSync(); wr_ix = min((rd_ix + 256u), ready_ix); - bool _1346 = (wr_ix - rd_ix) < 256u; - if (_1346) + bool _1373 = (wr_ix - rd_ix) < 256u; + if (_1373) { - _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions); } else { - _1356 = _1346; + _1383 = _1373; } - if (_1356) + if (_1383) { continue; } @@ -816,7 +831,7 @@ void comp_main() if ((th_ix + rd_ix) < wr_ix) { element_ix = sh_elements[th_ix]; - tag = _1372.Load((drawtag_start + element_ix) * 4 + 0); + tag = _1399.Load((drawtag_start + element_ix) * 4 + 0); } switch (tag) { @@ -828,13 +843,13 @@ void comp_main() case 37u: { uint drawmonoid_base = drawmonoid_start + (4u * element_ix); - uint path_ix = _260.Load(drawmonoid_base * 4 + 8); - PathRef _1397 = { _1005.Load(16) + (path_ix * 12u) }; - Alloc _1400; - _1400.offset = _1005.Load(16); - param_14.offset = _1400.offset; - PathRef param_15 = _1397; - Path path = Path_read(param_14, param_15); + uint path_ix = _266.Load(drawmonoid_base * 4 + 8); + PathRef _1424 = { _1020.Load(16) + (path_ix * 12u) }; + Alloc _1427; + _1427.offset = _1020.Load(16); + param_17.offset = _1427.offset; + PathRef param_18 = _1424; + Path path = Path_read(param_17, param_18); uint stride = path.bbox.z - path.bbox.x; sh_tile_stride[th_ix] = stride; int dx = int(path.bbox.x) - int(bin_tile_x); @@ -849,13 +864,13 @@ void comp_main() tile_count = uint(x1 - x0) * uint(y1 - y0); uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); sh_tile_base[th_ix] = base; - uint param_16 = path.tiles.offset; - uint param_17 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; - bool param_18 = mem_ok; - Alloc path_alloc = new_alloc(param_16, param_17, param_18); - uint param_19 = th_ix; - Alloc param_20 = path_alloc; - write_tile_alloc(param_19, param_20); + uint param_19 = path.tiles.offset; + uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_21 = mem_ok; + Alloc path_alloc = new_alloc(param_19, param_20, param_21); + uint param_22 = th_ix; + Alloc param_23 = path_alloc; + write_tile_alloc(param_22, param_23); break; } default: @@ -889,56 +904,56 @@ void comp_main() } } uint element_ix_1 = sh_elements[el_ix]; - uint tag_1 = _1372.Load((drawtag_start + element_ix_1) * 4 + 0); + uint tag_1 = _1399.Load((drawtag_start + element_ix_1) * 4 + 0); if (el_ix > 0u) { - _1605 = sh_tile_count[el_ix - 1u]; + _1632 = sh_tile_count[el_ix - 1u]; } else { - _1605 = 0u; + _1632 = 0u; } - uint seq_ix = ix_1 - _1605; + uint seq_ix = ix_1 - _1632; uint width = sh_tile_width[el_ix]; uint x = sh_tile_x0[el_ix] + (seq_ix % width); uint y = sh_tile_y0[el_ix] + (seq_ix / width); bool include_tile = false; if (mem_ok) { - uint param_21 = el_ix; - bool param_22 = mem_ok; - TileRef _1657 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; - Alloc param_23 = read_tile_alloc(param_21, param_22); - TileRef param_24 = _1657; - Tile tile = Tile_read(param_23, param_24); + uint param_24 = el_ix; + bool param_25 = mem_ok; + TileRef _1684 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Alloc param_26 = read_tile_alloc(param_24, param_25); + TileRef param_27 = _1684; + Tile tile = Tile_read(param_26, param_27); bool is_clip = (tag_1 & 1u) != 0u; bool is_blend = false; if (is_clip) { uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); - uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8); + uint scene_offset = _266.Load((drawmonoid_base_1 + 2u) * 4 + 8); uint dd = drawdata_start + (scene_offset >> uint(2)); - uint blend = _1372.Load(dd * 4 + 0); + uint blend = _1399.Load(dd * 4 + 0); is_blend = blend != 32771u; } - bool _1693 = tile.tile.offset != 0u; - bool _1702; - if (!_1693) + bool _1720 = tile.tile.offset != 0u; + bool _1729; + if (!_1720) { - _1702 = (tile.backdrop == 0) == is_clip; + _1729 = (tile.backdrop == 0) == is_clip; } else { - _1702 = _1693; + _1729 = _1720; } - include_tile = _1702 || is_blend; + include_tile = _1729 || is_blend; } if (include_tile) { uint el_slice = el_ix / 32u; uint el_mask = 1u << (el_ix & 31u); - uint _1724; - InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1724); + uint _1751; + InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1751); } } GroupMemoryBarrierWithGroupSync(); @@ -962,179 +977,181 @@ void comp_main() uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap))); uint element_ix_2 = sh_elements[element_ref_ix]; bitmap &= (bitmap - 1u); - uint drawtag = _1372.Load((drawtag_start + element_ix_2) * 4 + 0); + uint drawtag = _1399.Load((drawtag_start + element_ix_2) * 4 + 0); if (clip_zero_depth == 0u) { - uint param_25 = element_ref_ix; - bool param_26 = mem_ok; - TileRef _1801 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; - Alloc param_27 = read_tile_alloc(param_25, param_26); - TileRef param_28 = _1801; - Tile tile_1 = Tile_read(param_27, param_28); + uint param_28 = element_ref_ix; + bool param_29 = mem_ok; + TileRef _1828 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Alloc param_30 = read_tile_alloc(param_28, param_29); + TileRef param_31 = _1828; + Tile tile_1 = Tile_read(param_30, param_31); uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); - uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8); - uint info_offset = _260.Load((drawmonoid_base_2 + 3u) * 4 + 8); + uint scene_offset_1 = _266.Load((drawmonoid_base_2 + 2u) * 4 + 8); + uint info_offset = _266.Load((drawmonoid_base_2 + 3u) * 4 + 8); uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2)); uint di = drawinfo_start + (info_offset >> uint(2)); switch (drawtag) { case 68u: { - linewidth = asfloat(_260.Load(di * 4 + 8)); - Alloc param_29 = cmd_alloc; - CmdRef param_30 = cmd_ref; - uint param_31 = cmd_limit; - bool _1849 = alloc_cmd(param_29, param_30, param_31); - cmd_alloc = param_29; - cmd_ref = param_30; - cmd_limit = param_31; - if (!_1849) + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_32 = cmd_alloc; + CmdRef param_33 = cmd_ref; + uint param_34 = cmd_limit; + bool _1876 = alloc_cmd(param_32, param_33, param_34); + cmd_alloc = param_32; + cmd_ref = param_33; + cmd_limit = param_34; + if (!_1876) { break; } - Alloc param_32 = cmd_alloc; - CmdRef param_33 = cmd_ref; - Tile param_34 = tile_1; - float param_35 = linewidth; - write_fill(param_32, param_33, param_34, param_35); - cmd_ref = param_33; - uint rgba = _1372.Load(dd_1 * 4 + 0); - CmdColor _1872 = { rgba }; - Alloc param_36 = cmd_alloc; - CmdRef param_37 = cmd_ref; - CmdColor param_38 = _1872; - Cmd_Color_write(param_36, param_37, param_38); + Alloc param_35 = cmd_alloc; + CmdRef param_36 = cmd_ref; + Tile param_37 = tile_1; + float param_38 = linewidth; + write_fill(param_35, param_36, param_37, param_38); + cmd_ref = param_36; + uint rgba = _1399.Load(dd_1 * 4 + 0); + CmdColor _1899 = { rgba }; + Alloc param_39 = cmd_alloc; + CmdRef param_40 = cmd_ref; + CmdColor param_41 = _1899; + Cmd_Color_write(param_39, param_40, param_41); cmd_ref.offset += 8u; break; } case 276u: { - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - uint param_41 = cmd_limit; - bool _1890 = alloc_cmd(param_39, param_40, param_41); - cmd_alloc = param_39; - cmd_ref = param_40; - cmd_limit = param_41; - if (!_1890) + Alloc param_42 = cmd_alloc; + CmdRef param_43 = cmd_ref; + uint param_44 = cmd_limit; + bool _1917 = alloc_cmd(param_42, param_43, param_44); + cmd_alloc = param_42; + cmd_ref = param_43; + cmd_limit = param_44; + if (!_1917) { break; } - linewidth = asfloat(_260.Load(di * 4 + 8)); - Alloc param_42 = cmd_alloc; - CmdRef param_43 = cmd_ref; - Tile param_44 = tile_1; - float param_45 = linewidth; - write_fill(param_42, param_43, param_44, param_45); - cmd_ref = param_43; - cmd_lin.index = _1372.Load(dd_1 * 4 + 0); - cmd_lin.line_x = asfloat(_260.Load((di + 1u) * 4 + 8)); - cmd_lin.line_y = asfloat(_260.Load((di + 2u) * 4 + 8)); - cmd_lin.line_c = asfloat(_260.Load((di + 3u) * 4 + 8)); - Alloc param_46 = cmd_alloc; - CmdRef param_47 = cmd_ref; - CmdLinGrad param_48 = cmd_lin; - Cmd_LinGrad_write(param_46, param_47, param_48); + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_45 = cmd_alloc; + CmdRef param_46 = cmd_ref; + Tile param_47 = tile_1; + float param_48 = linewidth; + write_fill(param_45, param_46, param_47, param_48); + cmd_ref = param_46; + cmd_lin.index = _1399.Load(dd_1 * 4 + 0); + cmd_lin.line_x = asfloat(_266.Load((di + 1u) * 4 + 8)); + cmd_lin.line_y = asfloat(_266.Load((di + 2u) * 4 + 8)); + cmd_lin.line_c = asfloat(_266.Load((di + 3u) * 4 + 8)); + Alloc param_49 = cmd_alloc; + CmdRef param_50 = cmd_ref; + CmdLinGrad param_51 = cmd_lin; + Cmd_LinGrad_write(param_49, param_50, param_51); cmd_ref.offset += 20u; break; } case 732u: { - Alloc param_49 = cmd_alloc; - CmdRef param_50 = cmd_ref; - uint param_51 = cmd_limit; - bool _1954 = alloc_cmd(param_49, param_50, param_51); - cmd_alloc = param_49; - cmd_ref = param_50; - cmd_limit = param_51; - if (!_1954) + Alloc param_52 = cmd_alloc; + CmdRef param_53 = cmd_ref; + uint param_54 = cmd_limit; + bool _1981 = alloc_cmd(param_52, param_53, param_54); + cmd_alloc = param_52; + cmd_ref = param_53; + cmd_limit = param_54; + if (!_1981) { break; } - linewidth = asfloat(_260.Load(di * 4 + 8)); - Alloc param_52 = cmd_alloc; - CmdRef param_53 = cmd_ref; - Tile param_54 = tile_1; - float param_55 = linewidth; - write_fill(param_52, param_53, param_54, param_55); - cmd_ref = param_53; - cmd_rad.index = _1372.Load(dd_1 * 4 + 0); - cmd_rad.mat = asfloat(uint4(_260.Load((di + 1u) * 4 + 8), _260.Load((di + 2u) * 4 + 8), _260.Load((di + 3u) * 4 + 8), _260.Load((di + 4u) * 4 + 8))); - cmd_rad.xlat = asfloat(uint2(_260.Load((di + 5u) * 4 + 8), _260.Load((di + 6u) * 4 + 8))); - cmd_rad.c1 = asfloat(uint2(_260.Load((di + 7u) * 4 + 8), _260.Load((di + 8u) * 4 + 8))); - cmd_rad.ra = asfloat(_260.Load((di + 9u) * 4 + 8)); - cmd_rad.roff = asfloat(_260.Load((di + 10u) * 4 + 8)); - Alloc param_56 = cmd_alloc; - CmdRef param_57 = cmd_ref; - CmdRadGrad param_58 = cmd_rad; - Cmd_RadGrad_write(param_56, param_57, param_58); + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_55 = cmd_alloc; + CmdRef param_56 = cmd_ref; + Tile param_57 = tile_1; + float param_58 = linewidth; + write_fill(param_55, param_56, param_57, param_58); + cmd_ref = param_56; + cmd_rad.index = _1399.Load(dd_1 * 4 + 0); + cmd_rad.mat = asfloat(uint4(_266.Load((di + 1u) * 4 + 8), _266.Load((di + 2u) * 4 + 8), _266.Load((di + 3u) * 4 + 8), _266.Load((di + 4u) * 4 + 8))); + cmd_rad.xlat = asfloat(uint2(_266.Load((di + 5u) * 4 + 8), _266.Load((di + 6u) * 4 + 8))); + cmd_rad.c1 = asfloat(uint2(_266.Load((di + 7u) * 4 + 8), _266.Load((di + 8u) * 4 + 8))); + cmd_rad.ra = asfloat(_266.Load((di + 9u) * 4 + 8)); + cmd_rad.roff = asfloat(_266.Load((di + 10u) * 4 + 8)); + Alloc param_59 = cmd_alloc; + CmdRef param_60 = cmd_ref; + CmdRadGrad param_61 = cmd_rad; + Cmd_RadGrad_write(param_59, param_60, param_61); cmd_ref.offset += 48u; break; } case 72u: { - linewidth = asfloat(_260.Load(di * 4 + 8)); - Alloc param_59 = cmd_alloc; - CmdRef param_60 = cmd_ref; - uint param_61 = cmd_limit; - bool _2060 = alloc_cmd(param_59, param_60, param_61); - cmd_alloc = param_59; - cmd_ref = param_60; - cmd_limit = param_61; - if (!_2060) + linewidth = asfloat(_266.Load(di * 4 + 8)); + Alloc param_62 = cmd_alloc; + CmdRef param_63 = cmd_ref; + uint param_64 = cmd_limit; + bool _2087 = alloc_cmd(param_62, param_63, param_64); + cmd_alloc = param_62; + cmd_ref = param_63; + cmd_limit = param_64; + if (!_2087) { break; } - Alloc param_62 = cmd_alloc; - CmdRef param_63 = cmd_ref; - Tile param_64 = tile_1; - float param_65 = linewidth; - write_fill(param_62, param_63, param_64, param_65); - cmd_ref = param_63; - uint index = _1372.Load(dd_1 * 4 + 0); - uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0); + Alloc param_65 = cmd_alloc; + CmdRef param_66 = cmd_ref; + Tile param_67 = tile_1; + float param_68 = linewidth; + write_fill(param_65, param_66, param_67, param_68); + cmd_ref = param_66; + uint index = _1399.Load(dd_1 * 4 + 0); + uint raw1 = _1399.Load((dd_1 + 1u) * 4 + 0); int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); - CmdImage _2099 = { index, offset_1 }; - Alloc param_66 = cmd_alloc; - CmdRef param_67 = cmd_ref; - CmdImage param_68 = _2099; - Cmd_Image_write(param_66, param_67, param_68); + CmdImage _2126 = { index, offset_1 }; + Alloc param_69 = cmd_alloc; + CmdRef param_70 = cmd_ref; + CmdImage param_71 = _2126; + Cmd_Image_write(param_69, param_70, param_71); cmd_ref.offset += 12u; break; } case 5u: { - bool _2113 = tile_1.tile.offset == 0u; - bool _2119; - if (_2113) + bool _2140 = tile_1.tile.offset == 0u; + bool _2146; + if (_2140) { - _2119 = tile_1.backdrop == 0; + _2146 = tile_1.backdrop == 0; } else { - _2119 = _2113; + _2146 = _2140; } - if (_2119) + if (_2146) { clip_zero_depth = clip_depth + 1u; } else { - Alloc param_69 = cmd_alloc; - CmdRef param_70 = cmd_ref; - uint param_71 = cmd_limit; - bool _2131 = alloc_cmd(param_69, param_70, param_71); - cmd_alloc = param_69; - cmd_ref = param_70; - cmd_limit = param_71; - if (!_2131) + Alloc param_72 = cmd_alloc; + CmdRef param_73 = cmd_ref; + uint param_74 = cmd_limit; + bool _2158 = alloc_cmd(param_72, param_73, param_74); + cmd_alloc = param_72; + cmd_ref = param_73; + cmd_limit = param_74; + if (!_2158) { break; } - Alloc param_72 = cmd_alloc; - CmdRef param_73 = cmd_ref; - Cmd_BeginClip_write(param_72, param_73); + Alloc param_75 = cmd_alloc; + CmdRef param_76 = cmd_ref; + Cmd_BeginClip_write(param_75, param_76); cmd_ref.offset += 4u; + render_blend_depth++; + max_blend_depth = max(max_blend_depth, render_blend_depth); } clip_depth++; break; @@ -1142,30 +1159,31 @@ void comp_main() case 37u: { clip_depth--; - Alloc param_74 = cmd_alloc; - CmdRef param_75 = cmd_ref; - uint param_76 = cmd_limit; - bool _2159 = alloc_cmd(param_74, param_75, param_76); - cmd_alloc = param_74; - cmd_ref = param_75; - cmd_limit = param_76; - if (!_2159) + Alloc param_77 = cmd_alloc; + CmdRef param_78 = cmd_ref; + uint param_79 = cmd_limit; + bool _2191 = alloc_cmd(param_77, param_78, param_79); + cmd_alloc = param_77; + cmd_ref = param_78; + cmd_limit = param_79; + if (!_2191) { break; } - Alloc param_77 = cmd_alloc; - CmdRef param_78 = cmd_ref; - Tile param_79 = tile_1; - float param_80 = -1.0f; - write_fill(param_77, param_78, param_79, param_80); - cmd_ref = param_78; - uint blend_1 = _1372.Load(dd_1 * 4 + 0); - CmdEndClip _2182 = { blend_1 }; - Alloc param_81 = cmd_alloc; - CmdRef param_82 = cmd_ref; - CmdEndClip param_83 = _2182; - Cmd_EndClip_write(param_81, param_82, param_83); + Alloc param_80 = cmd_alloc; + CmdRef param_81 = cmd_ref; + Tile param_82 = tile_1; + float param_83 = -1.0f; + write_fill(param_80, param_81, param_82, param_83); + cmd_ref = param_81; + uint blend_1 = _1399.Load(dd_1 * 4 + 0); + CmdEndClip _2214 = { blend_1 }; + Alloc param_84 = cmd_alloc; + CmdRef param_85 = cmd_ref; + CmdEndClip param_86 = _2214; + Cmd_EndClip_write(param_84, param_85, param_86); cmd_ref.offset += 8u; + render_blend_depth--; break; } } @@ -1198,21 +1216,32 @@ void comp_main() break; } } - bool _2229 = (bin_tile_x + tile_x) < _1005.Load(8); - bool _2238; - if (_2229) + bool _2263 = (bin_tile_x + tile_x) < _1020.Load(8); + bool _2272; + if (_2263) { - _2238 = (bin_tile_y + tile_y) < _1005.Load(12); + _2272 = (bin_tile_y + tile_y) < _1020.Load(12); } else { - _2238 = _2229; + _2272 = _2263; } - if (_2238) + if (_2272) { - Alloc param_84 = cmd_alloc; - CmdRef param_85 = cmd_ref; - Cmd_End_write(param_84, param_85); + Alloc param_87 = cmd_alloc; + CmdRef param_88 = cmd_ref; + Cmd_End_write(param_87, param_88); + if (max_blend_depth > 4u) + { + uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u; + uint param_89 = scratch_size; + MallocResult _2293 = malloc(param_89); + MallocResult scratch = _2293; + Alloc param_90 = scratch_alloc; + uint param_91 = scratch_alloc.offset; + Alloc param_92 = scratch.alloc; + alloc_write(param_90, param_91, param_92); + } } } diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl index 854d243..d84add1 100644 --- a/piet-gpu/shader/gen/coarse.msl +++ b/piet-gpu/shader/gen/coarse.msl @@ -226,7 +226,7 @@ bool touch_mem(thread const Alloc& alloc, thread const uint& offset) } static inline __attribute__((always_inline)) -uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_260, constant uint& v_260BufferSize) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = alloc; uint param_1 = offset; @@ -234,7 +234,7 @@ uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memor { return 0u; } - uint v = v_260.memory[offset]; + uint v = v_266.memory[offset]; return v; } @@ -253,30 +253,30 @@ BinInstanceRef BinInstance_index(thread const BinInstanceRef& ref, thread const } static inline __attribute__((always_inline)) -BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_260, constant uint& v_260BufferSize) +BinInstance BinInstance_read(thread const Alloc& a, thread const BinInstanceRef& ref, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize); + uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); BinInstance s; s.element_ix = raw0; return s; } static inline __attribute__((always_inline)) -Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_260, constant uint& v_260BufferSize) +Path Path_read(thread const Alloc& a, thread const PathRef& ref, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize); + uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_260, v_260BufferSize); + uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize); Alloc param_4 = a; uint param_5 = ix + 2u; - uint raw2 = read_mem(param_4, param_5, v_260, v_260BufferSize); + uint raw2 = read_mem(param_4, param_5, v_266, v_266BufferSize); Path s; s.bbox = uint4(raw0 & 65535u, raw0 >> uint(16), raw1 & 65535u, raw1 >> uint(16)); s.tiles = TileRef{ raw2 }; @@ -289,24 +289,24 @@ void write_tile_alloc(thread const uint& el_ix, thread const Alloc& a) } static inline __attribute__((always_inline)) -Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_260, constant uint& v_260BufferSize) +Alloc read_tile_alloc(thread const uint& el_ix, thread const bool& mem_ok, device Memory& v_266, constant uint& v_266BufferSize) { uint param = 0u; - uint param_1 = uint(int((v_260BufferSize - 8) / 4) * 4); + uint param_1 = uint(int((v_266BufferSize - 8) / 4) * 4); bool param_2 = mem_ok; return new_alloc(param, param_1, param_2); } static inline __attribute__((always_inline)) -Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_260, constant uint& v_260BufferSize) +Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; - uint raw0 = read_mem(param, param_1, v_260, v_260BufferSize); + uint raw0 = read_mem(param, param_1, v_266, v_266BufferSize); Alloc param_2 = a; uint param_3 = ix + 1u; - uint raw1 = read_mem(param_2, param_3, v_260, v_260BufferSize); + uint raw1 = read_mem(param_2, param_3, v_266, v_266BufferSize); Tile s; s.tile = TileSegRef{ raw0 }; s.backdrop = int(raw1); @@ -314,26 +314,26 @@ Tile Tile_read(thread const Alloc& a, thread const TileRef& ref, device Memory& } static inline __attribute__((always_inline)) -MallocResult malloc(thread const uint& size, device Memory& v_260, constant uint& v_260BufferSize) +MallocResult malloc(thread const uint& size, device Memory& v_266, constant uint& v_266BufferSize) { - uint _266 = atomic_fetch_add_explicit((device atomic_uint*)&v_260.mem_offset, size, memory_order_relaxed); - uint offset = _266; + uint _272 = atomic_fetch_add_explicit((device atomic_uint*)&v_266.mem_offset, size, memory_order_relaxed); + uint offset = _272; MallocResult r; - r.failed = (offset + size) > uint(int((v_260BufferSize - 8) / 4) * 4); + r.failed = (offset + size) > uint(int((v_266BufferSize - 8) / 4) * 4); uint param = offset; uint param_1 = size; bool param_2 = !r.failed; r.alloc = new_alloc(param, param_1, param_2); if (r.failed) { - uint _295 = atomic_fetch_max_explicit((device atomic_uint*)&v_260.mem_error, 1u, memory_order_relaxed); + uint _301 = atomic_fetch_max_explicit((device atomic_uint*)&v_266.mem_error, 1u, memory_order_relaxed); return r; } return r; } static inline __attribute__((always_inline)) -void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_260, constant uint& v_260BufferSize) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = alloc; uint param_1 = offset; @@ -341,42 +341,42 @@ void write_mem(thread const Alloc& alloc, thread const uint& offset, thread cons { return; } - v_260.memory[offset] = val; + v_266.memory[offset] = val; } static inline __attribute__((always_inline)) -void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_260, constant uint& v_260BufferSize) +void CmdJump_write(thread const Alloc& a, thread const CmdJumpRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.new_ref; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_Jump_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdJump& s, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 11u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; CmdJumpRef param_4 = CmdJumpRef{ ref.offset + 4u }; CmdJump param_5 = s; - CmdJump_write(param_3, param_4, param_5, v_260, v_260BufferSize); + CmdJump_write(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_260, constant uint& v_260BufferSize) +bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd_limit, device Memory& v_266, constant uint& v_266BufferSize) { if (cmd_ref.offset < cmd_limit) { return true; } uint param = 1024u; - MallocResult _913 = malloc(param, v_260, v_260BufferSize); - MallocResult new_cmd = _913; + MallocResult _928 = malloc(param, v_266, v_266BufferSize); + MallocResult new_cmd = _928; if (new_cmd.failed) { return false; @@ -385,7 +385,7 @@ bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd Alloc param_1 = cmd_alloc; CmdRef param_2 = cmd_ref; CmdJump param_3 = jump; - Cmd_Jump_write(param_1, param_2, param_3, v_260, v_260BufferSize); + Cmd_Jump_write(param_1, param_2, param_3, v_266, v_266BufferSize); cmd_alloc = new_cmd.alloc; cmd_ref = CmdRef{ cmd_alloc.offset }; cmd_limit = (cmd_alloc.offset + 1024u) - 144u; @@ -393,70 +393,70 @@ bool alloc_cmd(thread Alloc& cmd_alloc, thread CmdRef& cmd_ref, thread uint& cmd } static inline __attribute__((always_inline)) -void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_260, constant uint& v_260BufferSize) +void CmdFill_write(thread const Alloc& a, thread const CmdFillRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.tile_ref; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = uint(s.backdrop); - write_mem(param_3, param_4, param_5, v_260, v_260BufferSize); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_Fill_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdFill& s, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 1u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; CmdFillRef param_4 = CmdFillRef{ ref.offset + 4u }; CmdFill param_5 = s; - CmdFill_write(param_3, param_4, param_5, v_260, v_260BufferSize); + CmdFill_write(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_Solid_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 3u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_260, constant uint& v_260BufferSize) +void CmdStroke_write(thread const Alloc& a, thread const CmdStrokeRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.tile_ref; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.half_width); - write_mem(param_3, param_4, param_5, v_260, v_260BufferSize); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_Stroke_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdStroke& s, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 2u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; CmdStrokeRef param_4 = CmdStrokeRef{ ref.offset + 4u }; CmdStroke param_5 = s; - CmdStroke_write(param_3, param_4, param_5, v_260, v_260BufferSize); + CmdStroke_write(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_260, constant uint& v_260BufferSize) +void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Tile& tile, thread const float& linewidth, device Memory& v_266, constant uint& v_266BufferSize) { if (linewidth < 0.0) { @@ -466,14 +466,14 @@ void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Alloc param = alloc; CmdRef param_1 = cmd_ref; CmdFill param_2 = cmd_fill; - Cmd_Fill_write(param, param_1, param_2, v_260, v_260BufferSize); + Cmd_Fill_write(param, param_1, param_2, v_266, v_266BufferSize); cmd_ref.offset += 12u; } else { Alloc param_3 = alloc; CmdRef param_4 = cmd_ref; - Cmd_Solid_write(param_3, param_4, v_260, v_260BufferSize); + Cmd_Solid_write(param_3, param_4, v_266, v_266BufferSize); cmd_ref.offset += 4u; } } @@ -483,201 +483,210 @@ void write_fill(thread const Alloc& alloc, thread CmdRef& cmd_ref, thread const Alloc param_5 = alloc; CmdRef param_6 = cmd_ref; CmdStroke param_7 = cmd_stroke; - Cmd_Stroke_write(param_5, param_6, param_7, v_260, v_260BufferSize); + Cmd_Stroke_write(param_5, param_6, param_7, v_266, v_266BufferSize); cmd_ref.offset += 12u; } } static inline __attribute__((always_inline)) -void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_260, constant uint& v_260BufferSize) +void CmdColor_write(thread const Alloc& a, thread const CmdColorRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.rgba_color; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_Color_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdColor& s, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 5u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; CmdColorRef param_4 = CmdColorRef{ ref.offset + 4u }; CmdColor param_5 = s; - CmdColor_write(param_3, param_4, param_5, v_260, v_260BufferSize); + CmdColor_write(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_260, constant uint& v_260BufferSize) +void CmdLinGrad_write(thread const Alloc& a, thread const CmdLinGradRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.index; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.line_x); - write_mem(param_3, param_4, param_5, v_260, v_260BufferSize); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.line_y); - write_mem(param_6, param_7, param_8, v_260, v_260BufferSize); + write_mem(param_6, param_7, param_8, v_266, v_266BufferSize); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.line_c); - write_mem(param_9, param_10, param_11, v_260, v_260BufferSize); + write_mem(param_9, param_10, param_11, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_LinGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdLinGrad& s, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 6u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; CmdLinGradRef param_4 = CmdLinGradRef{ ref.offset + 4u }; CmdLinGrad param_5 = s; - CmdLinGrad_write(param_3, param_4, param_5, v_260, v_260BufferSize); + CmdLinGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_260, constant uint& v_260BufferSize) +void CmdRadGrad_write(thread const Alloc& a, thread const CmdRadGradRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.index; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = as_type(s.mat.x); - write_mem(param_3, param_4, param_5, v_260, v_260BufferSize); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); Alloc param_6 = a; uint param_7 = ix + 2u; uint param_8 = as_type(s.mat.y); - write_mem(param_6, param_7, param_8, v_260, v_260BufferSize); + write_mem(param_6, param_7, param_8, v_266, v_266BufferSize); Alloc param_9 = a; uint param_10 = ix + 3u; uint param_11 = as_type(s.mat.z); - write_mem(param_9, param_10, param_11, v_260, v_260BufferSize); + write_mem(param_9, param_10, param_11, v_266, v_266BufferSize); Alloc param_12 = a; uint param_13 = ix + 4u; uint param_14 = as_type(s.mat.w); - write_mem(param_12, param_13, param_14, v_260, v_260BufferSize); + write_mem(param_12, param_13, param_14, v_266, v_266BufferSize); Alloc param_15 = a; uint param_16 = ix + 5u; uint param_17 = as_type(s.xlat.x); - write_mem(param_15, param_16, param_17, v_260, v_260BufferSize); + write_mem(param_15, param_16, param_17, v_266, v_266BufferSize); Alloc param_18 = a; uint param_19 = ix + 6u; uint param_20 = as_type(s.xlat.y); - write_mem(param_18, param_19, param_20, v_260, v_260BufferSize); + write_mem(param_18, param_19, param_20, v_266, v_266BufferSize); Alloc param_21 = a; uint param_22 = ix + 7u; uint param_23 = as_type(s.c1.x); - write_mem(param_21, param_22, param_23, v_260, v_260BufferSize); + write_mem(param_21, param_22, param_23, v_266, v_266BufferSize); Alloc param_24 = a; uint param_25 = ix + 8u; uint param_26 = as_type(s.c1.y); - write_mem(param_24, param_25, param_26, v_260, v_260BufferSize); + write_mem(param_24, param_25, param_26, v_266, v_266BufferSize); Alloc param_27 = a; uint param_28 = ix + 9u; uint param_29 = as_type(s.ra); - write_mem(param_27, param_28, param_29, v_260, v_260BufferSize); + write_mem(param_27, param_28, param_29, v_266, v_266BufferSize); Alloc param_30 = a; uint param_31 = ix + 10u; uint param_32 = as_type(s.roff); - write_mem(param_30, param_31, param_32, v_260, v_260BufferSize); + write_mem(param_30, param_31, param_32, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_RadGrad_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdRadGrad& s, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 7u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; CmdRadGradRef param_4 = CmdRadGradRef{ ref.offset + 4u }; CmdRadGrad param_5 = s; - CmdRadGrad_write(param_3, param_4, param_5, v_260, v_260BufferSize); + CmdRadGrad_write(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_260, constant uint& v_260BufferSize) +void CmdImage_write(thread const Alloc& a, thread const CmdImageRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.index; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; uint param_4 = ix + 1u; uint param_5 = (uint(s.offset.x) & 65535u) | (uint(s.offset.y) << uint(16)); - write_mem(param_3, param_4, param_5, v_260, v_260BufferSize); + write_mem(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_Image_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdImage& s, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 8u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; CmdImageRef param_4 = CmdImageRef{ ref.offset + 4u }; CmdImage param_5 = s; - CmdImage_write(param_3, param_4, param_5, v_260, v_260BufferSize); + CmdImage_write(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_BeginClip_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 9u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_260, constant uint& v_260BufferSize) +void CmdEndClip_write(thread const Alloc& a, thread const CmdEndClipRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize) { uint ix = ref.offset >> uint(2); Alloc param = a; uint param_1 = ix + 0u; uint param_2 = s.blend; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_EndClip_write(thread const Alloc& a, thread const CmdRef& ref, thread const CmdEndClip& s, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 10u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); Alloc param_3 = a; CmdEndClipRef param_4 = CmdEndClipRef{ ref.offset + 4u }; CmdEndClip param_5 = s; - CmdEndClip_write(param_3, param_4, param_5, v_260, v_260BufferSize); + CmdEndClip_write(param_3, param_4, param_5, v_266, v_266BufferSize); } static inline __attribute__((always_inline)) -void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_260, constant uint& v_260BufferSize) +void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memory& v_266, constant uint& v_266BufferSize) { Alloc param = a; uint param_1 = ref.offset >> uint(2); uint param_2 = 0u; - write_mem(param, param_1, param_2, v_260, v_260BufferSize); + write_mem(param, param_1, param_2, v_266, v_266BufferSize); } -kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1372 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +static inline __attribute__((always_inline)) +void alloc_write(thread const Alloc& a, thread const uint& offset, thread const Alloc& alloc, device Memory& v_266, constant uint& v_266BufferSize) +{ + Alloc param = a; + uint param_1 = offset >> uint(2); + uint param_2 = alloc.offset; + write_mem(param, param_1, param_2, v_266, v_266BufferSize); +} + +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_266 [[buffer(0)]], const device ConfigBuf& _1020 [[buffer(1)]], const device SceneBuf& _1399 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { threadgroup uint sh_bitmaps[8][256]; threadgroup Alloc sh_part_elements[256]; @@ -689,19 +698,19 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M threadgroup uint sh_tile_y0[256]; threadgroup uint sh_tile_base[256]; threadgroup uint sh_tile_count[256]; - constant uint& v_260BufferSize = spvBufferSizeConstants[0]; - uint width_in_bins = ((_1005.conf.width_in_tiles + 16u) - 1u) / 16u; + constant uint& v_266BufferSize = spvBufferSizeConstants[0]; + uint width_in_bins = ((_1020.conf.width_in_tiles + 16u) - 1u) / 16u; uint bin_ix = (width_in_bins * gl_WorkGroupID.y) + gl_WorkGroupID.x; uint partition_ix = 0u; - uint n_partitions = ((_1005.conf.n_elements + 256u) - 1u) / 256u; + uint n_partitions = ((_1020.conf.n_elements + 256u) - 1u) / 256u; uint th_ix = gl_LocalInvocationID.x; uint bin_tile_x = 16u * gl_WorkGroupID.x; uint bin_tile_y = 16u * gl_WorkGroupID.y; uint tile_x = gl_LocalInvocationID.x % 16u; uint tile_y = gl_LocalInvocationID.x / 16u; - uint this_tile_ix = (((bin_tile_y + tile_y) * _1005.conf.width_in_tiles) + bin_tile_x) + tile_x; + uint this_tile_ix = (((bin_tile_y + tile_y) * _1020.conf.width_in_tiles) + bin_tile_x) + tile_x; Alloc param; - param.offset = _1005.conf.ptcl_alloc.offset; + param.offset = _1020.conf.ptcl_alloc.offset; uint param_1 = this_tile_ix * 1024u; uint param_2 = 1024u; Alloc cmd_alloc = slice_mem(param, param_1, param_2); @@ -713,18 +722,25 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint wr_ix = 0u; uint part_start_ix = 0u; uint ready_ix = 0u; - uint drawmonoid_start = _1005.conf.drawmonoid_alloc.offset >> uint(2); - uint drawtag_start = _1005.conf.drawtag_offset >> uint(2); - uint drawdata_start = _1005.conf.drawdata_offset >> uint(2); - uint drawinfo_start = _1005.conf.drawinfo_alloc.offset >> uint(2); - bool mem_ok = v_260.mem_error == 0u; - Alloc param_3; - Alloc param_5; - uint _1304; + Alloc param_3 = cmd_alloc; + uint param_4 = 0u; + uint param_5 = 8u; + Alloc scratch_alloc = slice_mem(param_3, param_4, param_5); + cmd_ref.offset += 4u; + uint render_blend_depth = 0u; + uint max_blend_depth = 0u; + uint drawmonoid_start = _1020.conf.drawmonoid_alloc.offset >> uint(2); + uint drawtag_start = _1020.conf.drawtag_offset >> uint(2); + uint drawdata_start = _1020.conf.drawdata_offset >> uint(2); + uint drawinfo_start = _1020.conf.drawinfo_alloc.offset >> uint(2); + bool mem_ok = v_266.mem_error == 0u; + Alloc param_6; + Alloc param_8; + uint _1331; uint element_ix; - Alloc param_14; + Alloc param_17; uint tile_count; - uint _1605; + uint _1632; float linewidth; CmdLinGrad cmd_lin; CmdRadGrad cmd_rad; @@ -734,36 +750,36 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { sh_bitmaps[i][th_ix] = 0u; } - bool _1356; + bool _1383; for (;;) { if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) { part_start_ix = ready_ix; uint count = 0u; - bool _1154 = th_ix < 256u; - bool _1162; - if (_1154) + bool _1181 = th_ix < 256u; + bool _1189; + if (_1181) { - _1162 = (partition_ix + th_ix) < n_partitions; + _1189 = (partition_ix + th_ix) < n_partitions; } else { - _1162 = _1154; + _1189 = _1181; } - if (_1162) + if (_1189) { - uint in_ix = (_1005.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); - param_3.offset = _1005.conf.bin_alloc.offset; - uint param_4 = in_ix; - count = read_mem(param_3, param_4, v_260, v_260BufferSize); - param_5.offset = _1005.conf.bin_alloc.offset; - uint param_6 = in_ix + 1u; - uint offset = read_mem(param_5, param_6, v_260, v_260BufferSize); - uint param_7 = offset; - uint param_8 = count * 4u; - bool param_9 = mem_ok; - sh_part_elements[th_ix] = new_alloc(param_7, param_8, param_9); + uint in_ix = (_1020.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); + param_6.offset = _1020.conf.bin_alloc.offset; + uint param_7 = in_ix; + count = read_mem(param_6, param_7, v_266, v_266BufferSize); + param_8.offset = _1020.conf.bin_alloc.offset; + uint param_9 = in_ix + 1u; + uint offset = read_mem(param_8, param_9, v_266, v_266BufferSize); + uint param_10 = offset; + uint param_11 = count * 4u; + bool param_12 = mem_ok; + sh_part_elements[th_ix] = new_alloc(param_10, param_11, param_12); } for (uint i_1 = 0u; i_1 < 8u; i_1++) { @@ -803,34 +819,34 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } if (part_ix > 0u) { - _1304 = sh_part_count[part_ix - 1u]; + _1331 = sh_part_count[part_ix - 1u]; } else { - _1304 = part_start_ix; + _1331 = part_start_ix; } - ix -= _1304; + ix -= _1331; Alloc bin_alloc = sh_part_elements[part_ix]; BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset }; - BinInstanceRef param_10 = inst_ref; - uint param_11 = ix; - Alloc param_12 = bin_alloc; - BinInstanceRef param_13 = BinInstance_index(param_10, param_11); - BinInstance inst = BinInstance_read(param_12, param_13, v_260, v_260BufferSize); + BinInstanceRef param_13 = inst_ref; + uint param_14 = ix; + Alloc param_15 = bin_alloc; + BinInstanceRef param_16 = BinInstance_index(param_13, param_14); + BinInstance inst = BinInstance_read(param_15, param_16, v_266, v_266BufferSize); sh_elements[th_ix] = inst.element_ix; } threadgroup_barrier(mem_flags::mem_threadgroup); wr_ix = min((rd_ix + 256u), ready_ix); - bool _1346 = (wr_ix - rd_ix) < 256u; - if (_1346) + bool _1373 = (wr_ix - rd_ix) < 256u; + if (_1373) { - _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + _1383 = (wr_ix < ready_ix) || (partition_ix < n_partitions); } else { - _1356 = _1346; + _1383 = _1373; } - if (_1356) + if (_1383) { continue; } @@ -843,7 +859,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M if ((th_ix + rd_ix) < wr_ix) { element_ix = sh_elements[th_ix]; - tag = _1372.scene[drawtag_start + element_ix]; + tag = _1399.scene[drawtag_start + element_ix]; } switch (tag) { @@ -855,10 +871,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M case 37u: { uint drawmonoid_base = drawmonoid_start + (4u * element_ix); - uint path_ix = v_260.memory[drawmonoid_base]; - param_14.offset = _1005.conf.tile_alloc.offset; - PathRef param_15 = PathRef{ _1005.conf.tile_alloc.offset + (path_ix * 12u) }; - Path path = Path_read(param_14, param_15, v_260, v_260BufferSize); + uint path_ix = v_266.memory[drawmonoid_base]; + param_17.offset = _1020.conf.tile_alloc.offset; + PathRef param_18 = PathRef{ _1020.conf.tile_alloc.offset + (path_ix * 12u) }; + Path path = Path_read(param_17, param_18, v_266, v_266BufferSize); uint stride = path.bbox.z - path.bbox.x; sh_tile_stride[th_ix] = stride; int dx = int(path.bbox.x) - int(bin_tile_x); @@ -873,13 +889,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M tile_count = uint(x1 - x0) * uint(y1 - y0); uint base = path.tiles.offset - (((uint(dy) * stride) + uint(dx)) * 8u); sh_tile_base[th_ix] = base; - uint param_16 = path.tiles.offset; - uint param_17 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; - bool param_18 = mem_ok; - Alloc path_alloc = new_alloc(param_16, param_17, param_18); - uint param_19 = th_ix; - Alloc param_20 = path_alloc; - write_tile_alloc(param_19, param_20); + uint param_19 = path.tiles.offset; + uint param_20 = ((path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y)) * 8u; + bool param_21 = mem_ok; + Alloc path_alloc = new_alloc(param_19, param_20, param_21); + uint param_22 = th_ix; + Alloc param_23 = path_alloc; + write_tile_alloc(param_22, param_23); break; } default: @@ -913,54 +929,54 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } } uint element_ix_1 = sh_elements[el_ix]; - uint tag_1 = _1372.scene[drawtag_start + element_ix_1]; + uint tag_1 = _1399.scene[drawtag_start + element_ix_1]; if (el_ix > 0u) { - _1605 = sh_tile_count[el_ix - 1u]; + _1632 = sh_tile_count[el_ix - 1u]; } else { - _1605 = 0u; + _1632 = 0u; } - uint seq_ix = ix_1 - _1605; + uint seq_ix = ix_1 - _1632; uint width = sh_tile_width[el_ix]; uint x = sh_tile_x0[el_ix] + (seq_ix % width); uint y = sh_tile_y0[el_ix] + (seq_ix / width); bool include_tile = false; if (mem_ok) { - uint param_21 = el_ix; - bool param_22 = mem_ok; - Alloc param_23 = read_tile_alloc(param_21, param_22, v_260, v_260BufferSize); - TileRef param_24 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; - Tile tile = Tile_read(param_23, param_24, v_260, v_260BufferSize); + uint param_24 = el_ix; + bool param_25 = mem_ok; + Alloc param_26 = read_tile_alloc(param_24, param_25, v_266, v_266BufferSize); + TileRef param_27 = TileRef{ sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + Tile tile = Tile_read(param_26, param_27, v_266, v_266BufferSize); bool is_clip = (tag_1 & 1u) != 0u; bool is_blend = false; if (is_clip) { uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); - uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u]; + uint scene_offset = v_266.memory[drawmonoid_base_1 + 2u]; uint dd = drawdata_start + (scene_offset >> uint(2)); - uint blend = _1372.scene[dd]; + uint blend = _1399.scene[dd]; is_blend = blend != 32771u; } - bool _1693 = tile.tile.offset != 0u; - bool _1702; - if (!_1693) + bool _1720 = tile.tile.offset != 0u; + bool _1729; + if (!_1720) { - _1702 = (tile.backdrop == 0) == is_clip; + _1729 = (tile.backdrop == 0) == is_clip; } else { - _1702 = _1693; + _1729 = _1720; } - include_tile = _1702 || is_blend; + include_tile = _1729 || is_blend; } if (include_tile) { uint el_slice = el_ix / 32u; uint el_mask = 1u << (el_ix & 31u); - uint _1724 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); + uint _1751 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); } } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -984,176 +1000,178 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap))); uint element_ix_2 = sh_elements[element_ref_ix]; bitmap &= (bitmap - 1u); - uint drawtag = _1372.scene[drawtag_start + element_ix_2]; + uint drawtag = _1399.scene[drawtag_start + element_ix_2]; if (clip_zero_depth == 0u) { - uint param_25 = element_ref_ix; - bool param_26 = mem_ok; - Alloc param_27 = read_tile_alloc(param_25, param_26, v_260, v_260BufferSize); - TileRef param_28 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; - Tile tile_1 = Tile_read(param_27, param_28, v_260, v_260BufferSize); + uint param_28 = element_ref_ix; + bool param_29 = mem_ok; + Alloc param_30 = read_tile_alloc(param_28, param_29, v_266, v_266BufferSize); + TileRef param_31 = TileRef{ sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + Tile tile_1 = Tile_read(param_30, param_31, v_266, v_266BufferSize); uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); - uint scene_offset_1 = v_260.memory[drawmonoid_base_2 + 2u]; - uint info_offset = v_260.memory[drawmonoid_base_2 + 3u]; + uint scene_offset_1 = v_266.memory[drawmonoid_base_2 + 2u]; + uint info_offset = v_266.memory[drawmonoid_base_2 + 3u]; uint dd_1 = drawdata_start + (scene_offset_1 >> uint(2)); uint di = drawinfo_start + (info_offset >> uint(2)); switch (drawtag) { case 68u: { - linewidth = as_type(v_260.memory[di]); - Alloc param_29 = cmd_alloc; - CmdRef param_30 = cmd_ref; - uint param_31 = cmd_limit; - bool _1849 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize); - cmd_alloc = param_29; - cmd_ref = param_30; - cmd_limit = param_31; - if (!_1849) + linewidth = as_type(v_266.memory[di]); + Alloc param_32 = cmd_alloc; + CmdRef param_33 = cmd_ref; + uint param_34 = cmd_limit; + bool _1876 = alloc_cmd(param_32, param_33, param_34, v_266, v_266BufferSize); + cmd_alloc = param_32; + cmd_ref = param_33; + cmd_limit = param_34; + if (!_1876) { break; } - Alloc param_32 = cmd_alloc; - CmdRef param_33 = cmd_ref; - Tile param_34 = tile_1; - float param_35 = linewidth; - write_fill(param_32, param_33, param_34, param_35, v_260, v_260BufferSize); - cmd_ref = param_33; - uint rgba = _1372.scene[dd_1]; - Alloc param_36 = cmd_alloc; - CmdRef param_37 = cmd_ref; - CmdColor param_38 = CmdColor{ rgba }; - Cmd_Color_write(param_36, param_37, param_38, v_260, v_260BufferSize); + Alloc param_35 = cmd_alloc; + CmdRef param_36 = cmd_ref; + Tile param_37 = tile_1; + float param_38 = linewidth; + write_fill(param_35, param_36, param_37, param_38, v_266, v_266BufferSize); + cmd_ref = param_36; + uint rgba = _1399.scene[dd_1]; + Alloc param_39 = cmd_alloc; + CmdRef param_40 = cmd_ref; + CmdColor param_41 = CmdColor{ rgba }; + Cmd_Color_write(param_39, param_40, param_41, v_266, v_266BufferSize); cmd_ref.offset += 8u; break; } case 276u: { - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - uint param_41 = cmd_limit; - bool _1890 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize); - cmd_alloc = param_39; - cmd_ref = param_40; - cmd_limit = param_41; - if (!_1890) + Alloc param_42 = cmd_alloc; + CmdRef param_43 = cmd_ref; + uint param_44 = cmd_limit; + bool _1917 = alloc_cmd(param_42, param_43, param_44, v_266, v_266BufferSize); + cmd_alloc = param_42; + cmd_ref = param_43; + cmd_limit = param_44; + if (!_1917) { break; } - linewidth = as_type(v_260.memory[di]); - Alloc param_42 = cmd_alloc; - CmdRef param_43 = cmd_ref; - Tile param_44 = tile_1; - float param_45 = linewidth; - write_fill(param_42, param_43, param_44, param_45, v_260, v_260BufferSize); - cmd_ref = param_43; - cmd_lin.index = _1372.scene[dd_1]; - cmd_lin.line_x = as_type(v_260.memory[di + 1u]); - cmd_lin.line_y = as_type(v_260.memory[di + 2u]); - cmd_lin.line_c = as_type(v_260.memory[di + 3u]); - Alloc param_46 = cmd_alloc; - CmdRef param_47 = cmd_ref; - CmdLinGrad param_48 = cmd_lin; - Cmd_LinGrad_write(param_46, param_47, param_48, v_260, v_260BufferSize); + linewidth = as_type(v_266.memory[di]); + Alloc param_45 = cmd_alloc; + CmdRef param_46 = cmd_ref; + Tile param_47 = tile_1; + float param_48 = linewidth; + write_fill(param_45, param_46, param_47, param_48, v_266, v_266BufferSize); + cmd_ref = param_46; + cmd_lin.index = _1399.scene[dd_1]; + cmd_lin.line_x = as_type(v_266.memory[di + 1u]); + cmd_lin.line_y = as_type(v_266.memory[di + 2u]); + cmd_lin.line_c = as_type(v_266.memory[di + 3u]); + Alloc param_49 = cmd_alloc; + CmdRef param_50 = cmd_ref; + CmdLinGrad param_51 = cmd_lin; + Cmd_LinGrad_write(param_49, param_50, param_51, v_266, v_266BufferSize); cmd_ref.offset += 20u; break; } case 732u: { - Alloc param_49 = cmd_alloc; - CmdRef param_50 = cmd_ref; - uint param_51 = cmd_limit; - bool _1954 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize); - cmd_alloc = param_49; - cmd_ref = param_50; - cmd_limit = param_51; - if (!_1954) + Alloc param_52 = cmd_alloc; + CmdRef param_53 = cmd_ref; + uint param_54 = cmd_limit; + bool _1981 = alloc_cmd(param_52, param_53, param_54, v_266, v_266BufferSize); + cmd_alloc = param_52; + cmd_ref = param_53; + cmd_limit = param_54; + if (!_1981) { break; } - linewidth = as_type(v_260.memory[di]); - Alloc param_52 = cmd_alloc; - CmdRef param_53 = cmd_ref; - Tile param_54 = tile_1; - float param_55 = linewidth; - write_fill(param_52, param_53, param_54, param_55, v_260, v_260BufferSize); - cmd_ref = param_53; - cmd_rad.index = _1372.scene[dd_1]; - cmd_rad.mat = as_type(uint4(v_260.memory[di + 1u], v_260.memory[di + 2u], v_260.memory[di + 3u], v_260.memory[di + 4u])); - cmd_rad.xlat = as_type(uint2(v_260.memory[di + 5u], v_260.memory[di + 6u])); - cmd_rad.c1 = as_type(uint2(v_260.memory[di + 7u], v_260.memory[di + 8u])); - cmd_rad.ra = as_type(v_260.memory[di + 9u]); - cmd_rad.roff = as_type(v_260.memory[di + 10u]); - Alloc param_56 = cmd_alloc; - CmdRef param_57 = cmd_ref; - CmdRadGrad param_58 = cmd_rad; - Cmd_RadGrad_write(param_56, param_57, param_58, v_260, v_260BufferSize); + linewidth = as_type(v_266.memory[di]); + Alloc param_55 = cmd_alloc; + CmdRef param_56 = cmd_ref; + Tile param_57 = tile_1; + float param_58 = linewidth; + write_fill(param_55, param_56, param_57, param_58, v_266, v_266BufferSize); + cmd_ref = param_56; + cmd_rad.index = _1399.scene[dd_1]; + cmd_rad.mat = as_type(uint4(v_266.memory[di + 1u], v_266.memory[di + 2u], v_266.memory[di + 3u], v_266.memory[di + 4u])); + cmd_rad.xlat = as_type(uint2(v_266.memory[di + 5u], v_266.memory[di + 6u])); + cmd_rad.c1 = as_type(uint2(v_266.memory[di + 7u], v_266.memory[di + 8u])); + cmd_rad.ra = as_type(v_266.memory[di + 9u]); + cmd_rad.roff = as_type(v_266.memory[di + 10u]); + Alloc param_59 = cmd_alloc; + CmdRef param_60 = cmd_ref; + CmdRadGrad param_61 = cmd_rad; + Cmd_RadGrad_write(param_59, param_60, param_61, v_266, v_266BufferSize); cmd_ref.offset += 48u; break; } case 72u: { - linewidth = as_type(v_260.memory[di]); - Alloc param_59 = cmd_alloc; - CmdRef param_60 = cmd_ref; - uint param_61 = cmd_limit; - bool _2060 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize); - cmd_alloc = param_59; - cmd_ref = param_60; - cmd_limit = param_61; - if (!_2060) + linewidth = as_type(v_266.memory[di]); + Alloc param_62 = cmd_alloc; + CmdRef param_63 = cmd_ref; + uint param_64 = cmd_limit; + bool _2087 = alloc_cmd(param_62, param_63, param_64, v_266, v_266BufferSize); + cmd_alloc = param_62; + cmd_ref = param_63; + cmd_limit = param_64; + if (!_2087) { break; } - Alloc param_62 = cmd_alloc; - CmdRef param_63 = cmd_ref; - Tile param_64 = tile_1; - float param_65 = linewidth; - write_fill(param_62, param_63, param_64, param_65, v_260, v_260BufferSize); - cmd_ref = param_63; - uint index = _1372.scene[dd_1]; - uint raw1 = _1372.scene[dd_1 + 1u]; + Alloc param_65 = cmd_alloc; + CmdRef param_66 = cmd_ref; + Tile param_67 = tile_1; + float param_68 = linewidth; + write_fill(param_65, param_66, param_67, param_68, v_266, v_266BufferSize); + cmd_ref = param_66; + uint index = _1399.scene[dd_1]; + uint raw1 = _1399.scene[dd_1 + 1u]; int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); - Alloc param_66 = cmd_alloc; - CmdRef param_67 = cmd_ref; - CmdImage param_68 = CmdImage{ index, offset_1 }; - Cmd_Image_write(param_66, param_67, param_68, v_260, v_260BufferSize); + Alloc param_69 = cmd_alloc; + CmdRef param_70 = cmd_ref; + CmdImage param_71 = CmdImage{ index, offset_1 }; + Cmd_Image_write(param_69, param_70, param_71, v_266, v_266BufferSize); cmd_ref.offset += 12u; break; } case 5u: { - bool _2113 = tile_1.tile.offset == 0u; - bool _2119; - if (_2113) + bool _2140 = tile_1.tile.offset == 0u; + bool _2146; + if (_2140) { - _2119 = tile_1.backdrop == 0; + _2146 = tile_1.backdrop == 0; } else { - _2119 = _2113; + _2146 = _2140; } - if (_2119) + if (_2146) { clip_zero_depth = clip_depth + 1u; } else { - Alloc param_69 = cmd_alloc; - CmdRef param_70 = cmd_ref; - uint param_71 = cmd_limit; - bool _2131 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize); - cmd_alloc = param_69; - cmd_ref = param_70; - cmd_limit = param_71; - if (!_2131) + Alloc param_72 = cmd_alloc; + CmdRef param_73 = cmd_ref; + uint param_74 = cmd_limit; + bool _2158 = alloc_cmd(param_72, param_73, param_74, v_266, v_266BufferSize); + cmd_alloc = param_72; + cmd_ref = param_73; + cmd_limit = param_74; + if (!_2158) { break; } - Alloc param_72 = cmd_alloc; - CmdRef param_73 = cmd_ref; - Cmd_BeginClip_write(param_72, param_73, v_260, v_260BufferSize); + Alloc param_75 = cmd_alloc; + CmdRef param_76 = cmd_ref; + Cmd_BeginClip_write(param_75, param_76, v_266, v_266BufferSize); cmd_ref.offset += 4u; + render_blend_depth++; + max_blend_depth = max(max_blend_depth, render_blend_depth); } clip_depth++; break; @@ -1161,29 +1179,30 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M case 37u: { clip_depth--; - Alloc param_74 = cmd_alloc; - CmdRef param_75 = cmd_ref; - uint param_76 = cmd_limit; - bool _2159 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize); - cmd_alloc = param_74; - cmd_ref = param_75; - cmd_limit = param_76; - if (!_2159) + Alloc param_77 = cmd_alloc; + CmdRef param_78 = cmd_ref; + uint param_79 = cmd_limit; + bool _2191 = alloc_cmd(param_77, param_78, param_79, v_266, v_266BufferSize); + cmd_alloc = param_77; + cmd_ref = param_78; + cmd_limit = param_79; + if (!_2191) { break; } - Alloc param_77 = cmd_alloc; - CmdRef param_78 = cmd_ref; - Tile param_79 = tile_1; - float param_80 = -1.0; - write_fill(param_77, param_78, param_79, param_80, v_260, v_260BufferSize); - cmd_ref = param_78; - uint blend_1 = _1372.scene[dd_1]; - Alloc param_81 = cmd_alloc; - CmdRef param_82 = cmd_ref; - CmdEndClip param_83 = CmdEndClip{ blend_1 }; - Cmd_EndClip_write(param_81, param_82, param_83, v_260, v_260BufferSize); + Alloc param_80 = cmd_alloc; + CmdRef param_81 = cmd_ref; + Tile param_82 = tile_1; + float param_83 = -1.0; + write_fill(param_80, param_81, param_82, param_83, v_266, v_266BufferSize); + cmd_ref = param_81; + uint blend_1 = _1399.scene[dd_1]; + Alloc param_84 = cmd_alloc; + CmdRef param_85 = cmd_ref; + CmdEndClip param_86 = CmdEndClip{ blend_1 }; + Cmd_EndClip_write(param_84, param_85, param_86, v_266, v_266BufferSize); cmd_ref.offset += 8u; + render_blend_depth--; break; } } @@ -1216,21 +1235,32 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M break; } } - bool _2229 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles; - bool _2238; - if (_2229) + bool _2263 = (bin_tile_x + tile_x) < _1020.conf.width_in_tiles; + bool _2272; + if (_2263) { - _2238 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles; + _2272 = (bin_tile_y + tile_y) < _1020.conf.height_in_tiles; } else { - _2238 = _2229; + _2272 = _2263; } - if (_2238) + if (_2272) { - Alloc param_84 = cmd_alloc; - CmdRef param_85 = cmd_ref; - Cmd_End_write(param_84, param_85, v_260, v_260BufferSize); + Alloc param_87 = cmd_alloc; + CmdRef param_88 = cmd_ref; + Cmd_End_write(param_87, param_88, v_266, v_266BufferSize); + if (max_blend_depth > 4u) + { + uint scratch_size = (((max_blend_depth * 16u) * 16u) * 1u) * 4u; + uint param_89 = scratch_size; + MallocResult _2293 = malloc(param_89, v_266, v_266BufferSize); + MallocResult scratch = _2293; + Alloc param_90 = scratch_alloc; + uint param_91 = scratch_alloc.offset; + Alloc param_92 = scratch.alloc; + alloc_write(param_90, param_91, param_92, v_266, v_266BufferSize); + } } } diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv index 56a87e5..fe5eeee 100644 Binary files a/piet-gpu/shader/gen/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil index 7399fe4..c48d59d 100644 Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl index 4839db2..0a6c022 100644 --- a/piet-gpu/shader/gen/kernel4.hlsl +++ b/piet-gpu/shader/gen/kernel4.hlsl @@ -999,6 +999,8 @@ void comp_main() Alloc cmd_alloc = slice_mem(param, param_1, param_2); CmdRef _1705 = { cmd_alloc.offset }; CmdRef cmd_ref = _1705; + uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8); + cmd_ref.offset += 4u; uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); float2 xy = float2(xy_uint); float4 rgba[8]; @@ -1011,7 +1013,9 @@ void comp_main() float df[8]; TileSegRef tile_seg_ref; float area[8]; - uint blend_stack[128][8]; + uint blend_stack[4][8]; + uint base_ix_1; + uint bg_rgba; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -1032,8 +1036,8 @@ void comp_main() { df[k] = 1000000000.0f; } - TileSegRef _1800 = { stroke.tile_ref }; - tile_seg_ref = _1800; + TileSegRef _1810 = { stroke.tile_ref }; + tile_seg_ref = _1810; do { uint param_7 = tile_seg_ref.offset; @@ -1069,8 +1073,8 @@ void comp_main() { area[k_3] = float(fill.backdrop); } - TileSegRef _1920 = { fill.tile_ref }; - tile_seg_ref = _1920; + TileSegRef _1930 = { fill.tile_ref }; + tile_seg_ref = _1930; do { uint param_15 = tile_seg_ref.offset; @@ -1159,10 +1163,10 @@ void comp_main() int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba = gradients[int2(x, int(lin.index))]; float3 param_29 = fg_rgba.xyz; - float3 _2254 = fromsRGB(param_29); - fg_rgba.x = _2254.x; - fg_rgba.y = _2254.y; - fg_rgba.z = _2254.z; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1; } @@ -1185,10 +1189,10 @@ void comp_main() int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))]; float3 param_33 = fg_rgba_1.xyz; - float3 _2364 = fromsRGB(param_33); - fg_rgba_1.x = _2364.x; - fg_rgba_1.y = _2364.y; - fg_rgba_1.z = _2364.z; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2; } @@ -1202,9 +1206,9 @@ void comp_main() CmdImage fill_img = Cmd_Image_read(param_34, param_35); uint2 param_36 = xy_uint; CmdImage param_37 = fill_img; - float4 _2407[8]; - fillImage(_2407, param_36, param_37); - float4 img[8] = _2407; + float4 _2417[8]; + fillImage(_2417, param_36, param_37); + float4 img[8] = _2417; for (uint k_11 = 0u; k_11 < 8u; k_11++) { float4 fg_k_3 = img[k_11] * area[k_11]; @@ -1215,13 +1219,26 @@ void comp_main() } case 9u: { - for (uint k_12 = 0u; k_12 < 8u; k_12++) + if (clip_depth < 4u) { - uint d_2 = min(clip_depth, 127u); - float4 param_38 = float4(rgba[k_12]); - uint _2470 = packsRGB(param_38); - blend_stack[d_2][k_12] = _2470; - rgba[k_12] = 0.0f.xxxx; + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = 0.0f.xxxx; + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + _297.Store((base_ix + k_13) * 4 + 8, _2522); + rgba[k_13] = 0.0f.xxxx; + } } clip_depth++; cmd_ref.offset += 4u; @@ -1229,32 +1246,41 @@ void comp_main() } case 10u: { - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40); - uint blend_mode = end_clip.blend >> uint(8); - uint comp_mode = end_clip.blend & 255u; + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41); clip_depth--; - for (uint k_13 = 0u; k_13 < 8u; k_13++) + if (clip_depth >= 4u) { - uint d_3 = min(clip_depth, 127u); - uint param_41 = blend_stack[d_3][k_13]; - float4 bg = unpacksRGB(param_41); - float4 fg_1 = rgba[k_13] * area[k_13]; - float4 param_42 = bg; - float4 param_43 = fg_1; - uint param_44 = end_clip.blend; - rgba[k_13] = mix_blend_compose(param_42, param_43, param_44); + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8); + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); } cmd_ref.offset += 8u; break; } case 11u: { - Alloc param_45 = cmd_alloc; - CmdRef param_46 = cmd_ref; - CmdRef _2548 = { Cmd_Jump_read(param_45, param_46).new_ref }; - cmd_ref = _2548; + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref }; + cmd_ref = _2621; cmd_alloc.offset = cmd_ref.offset; break; } @@ -1262,9 +1288,9 @@ void comp_main() } for (uint i_1 = 0u; i_1 < 8u; i_1++) { - uint param_47 = i_1; - float3 param_48 = rgba[i_1].xyz; - image[int2(xy_uint + chunk_offset(param_47))] = float4(tosRGB(param_48), rgba[i_1].w); + uint param_48 = i_1; + float3 param_49 = rgba[i_1].xyz; + image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w); } } diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl index 4caeaf0..4d7068d 100644 --- a/piet-gpu/shader/gen/kernel4.msl +++ b/piet-gpu/shader/gen/kernel4.msl @@ -1056,6 +1056,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 uint param_2 = 1024u; Alloc cmd_alloc = slice_mem(param, param_1, param_2); CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)]; + cmd_ref.offset += 4u; uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); float2 xy = float2(xy_uint); spvUnsafeArray rgba; @@ -1068,7 +1070,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 spvUnsafeArray df; TileSegRef tile_seg_ref; spvUnsafeArray area; - spvUnsafeArray, 128> blend_stack; + spvUnsafeArray, 4> blend_stack; + uint base_ix_1; + uint bg_rgba; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -1214,10 +1218,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); float3 param_29 = fg_rgba.xyz; - float3 _2254 = fromsRGB(param_29); - fg_rgba.x = _2254.x; - fg_rgba.y = _2254.y; - fg_rgba.z = _2254.z; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1; } @@ -1240,10 +1244,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); float3 param_33 = fg_rgba_1.xyz; - float3 _2364 = fromsRGB(param_33); - fg_rgba_1.x = _2364.x; - fg_rgba_1.y = _2364.y; - fg_rgba_1.z = _2364.z; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2; } @@ -1269,13 +1273,26 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } case 9u: { - for (uint k_12 = 0u; k_12 < 8u; k_12++) + if (clip_depth < 4u) { - uint d_2 = min(clip_depth, 127u); - float4 param_38 = float4(rgba[k_12]); - uint _2470 = packsRGB(param_38); - blend_stack[d_2][k_12] = _2470; - rgba[k_12] = float4(0.0); + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = float4(0.0); + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + v_297.memory[base_ix + k_13] = _2522; + rgba[k_13] = float4(0.0); + } } clip_depth++; cmd_ref.offset += 4u; @@ -1283,31 +1300,40 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } case 10u: { - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297); - uint blend_mode = end_clip.blend >> uint(8); - uint comp_mode = end_clip.blend & 255u; + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297); clip_depth--; - for (uint k_13 = 0u; k_13 < 8u; k_13++) + if (clip_depth >= 4u) { - uint d_3 = min(clip_depth, 127u); - uint param_41 = blend_stack[d_3][k_13]; - float4 bg = unpacksRGB(param_41); - float4 fg_1 = rgba[k_13] * area[k_13]; - float4 param_42 = bg; - float4 param_43 = fg_1; - uint param_44 = end_clip.blend; - rgba[k_13] = mix_blend_compose(param_42, param_43, param_44); + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = v_297.memory[base_ix_1 + k_14]; + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); } cmd_ref.offset += 8u; break; } case 11u: { - Alloc param_45 = cmd_alloc; - CmdRef param_46 = cmd_ref; - cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref }; + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref }; cmd_alloc.offset = cmd_ref.offset; break; } @@ -1315,9 +1341,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } for (uint i_1 = 0u; i_1 < 8u; i_1++) { - uint param_47 = i_1; - float3 param_48 = rgba[i_1].xyz; - image.write(float4(tosRGB(param_48), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47)))); + uint param_48 = i_1; + float3 param_49 = rgba[i_1].xyz; + image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48)))); } } diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv index f0e2963..c388941 100644 Binary files a/piet-gpu/shader/gen/kernel4.spv and b/piet-gpu/shader/gen/kernel4.spv differ diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil index 7b7c19f..7390167 100644 Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl index 5d9b88d..ffada37 100644 --- a/piet-gpu/shader/gen/kernel4_gray.hlsl +++ b/piet-gpu/shader/gen/kernel4_gray.hlsl @@ -999,6 +999,8 @@ void comp_main() Alloc cmd_alloc = slice_mem(param, param_1, param_2); CmdRef _1705 = { cmd_alloc.offset }; CmdRef cmd_ref = _1705; + uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8); + cmd_ref.offset += 4u; uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); float2 xy = float2(xy_uint); float4 rgba[8]; @@ -1011,7 +1013,9 @@ void comp_main() float df[8]; TileSegRef tile_seg_ref; float area[8]; - uint blend_stack[128][8]; + uint blend_stack[4][8]; + uint base_ix_1; + uint bg_rgba; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -1032,8 +1036,8 @@ void comp_main() { df[k] = 1000000000.0f; } - TileSegRef _1800 = { stroke.tile_ref }; - tile_seg_ref = _1800; + TileSegRef _1810 = { stroke.tile_ref }; + tile_seg_ref = _1810; do { uint param_7 = tile_seg_ref.offset; @@ -1069,8 +1073,8 @@ void comp_main() { area[k_3] = float(fill.backdrop); } - TileSegRef _1920 = { fill.tile_ref }; - tile_seg_ref = _1920; + TileSegRef _1930 = { fill.tile_ref }; + tile_seg_ref = _1930; do { uint param_15 = tile_seg_ref.offset; @@ -1159,10 +1163,10 @@ void comp_main() int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba = gradients[int2(x, int(lin.index))]; float3 param_29 = fg_rgba.xyz; - float3 _2254 = fromsRGB(param_29); - fg_rgba.x = _2254.x; - fg_rgba.y = _2254.y; - fg_rgba.z = _2254.z; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1; } @@ -1185,10 +1189,10 @@ void comp_main() int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))]; float3 param_33 = fg_rgba_1.xyz; - float3 _2364 = fromsRGB(param_33); - fg_rgba_1.x = _2364.x; - fg_rgba_1.y = _2364.y; - fg_rgba_1.z = _2364.z; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2; } @@ -1202,9 +1206,9 @@ void comp_main() CmdImage fill_img = Cmd_Image_read(param_34, param_35); uint2 param_36 = xy_uint; CmdImage param_37 = fill_img; - float4 _2407[8]; - fillImage(_2407, param_36, param_37); - float4 img[8] = _2407; + float4 _2417[8]; + fillImage(_2417, param_36, param_37); + float4 img[8] = _2417; for (uint k_11 = 0u; k_11 < 8u; k_11++) { float4 fg_k_3 = img[k_11] * area[k_11]; @@ -1215,13 +1219,26 @@ void comp_main() } case 9u: { - for (uint k_12 = 0u; k_12 < 8u; k_12++) + if (clip_depth < 4u) { - uint d_2 = min(clip_depth, 127u); - float4 param_38 = float4(rgba[k_12]); - uint _2470 = packsRGB(param_38); - blend_stack[d_2][k_12] = _2470; - rgba[k_12] = 0.0f.xxxx; + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = 0.0f.xxxx; + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + _297.Store((base_ix + k_13) * 4 + 8, _2522); + rgba[k_13] = 0.0f.xxxx; + } } clip_depth++; cmd_ref.offset += 4u; @@ -1229,32 +1246,41 @@ void comp_main() } case 10u: { - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40); - uint blend_mode = end_clip.blend >> uint(8); - uint comp_mode = end_clip.blend & 255u; + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41); clip_depth--; - for (uint k_13 = 0u; k_13 < 8u; k_13++) + if (clip_depth >= 4u) { - uint d_3 = min(clip_depth, 127u); - uint param_41 = blend_stack[d_3][k_13]; - float4 bg = unpacksRGB(param_41); - float4 fg_1 = rgba[k_13] * area[k_13]; - float4 param_42 = bg; - float4 param_43 = fg_1; - uint param_44 = end_clip.blend; - rgba[k_13] = mix_blend_compose(param_42, param_43, param_44); + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8); + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); } cmd_ref.offset += 8u; break; } case 11u: { - Alloc param_45 = cmd_alloc; - CmdRef param_46 = cmd_ref; - CmdRef _2548 = { Cmd_Jump_read(param_45, param_46).new_ref }; - cmd_ref = _2548; + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + CmdRef _2621 = { Cmd_Jump_read(param_46, param_47).new_ref }; + cmd_ref = _2621; cmd_alloc.offset = cmd_ref.offset; break; } @@ -1262,8 +1288,8 @@ void comp_main() } for (uint i_1 = 0u; i_1 < 8u; i_1++) { - uint param_47 = i_1; - image[int2(xy_uint + chunk_offset(param_47))] = rgba[i_1].w.x; + uint param_48 = i_1; + image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x; } } diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl index 8c608c3..a190ab6 100644 --- a/piet-gpu/shader/gen/kernel4_gray.msl +++ b/piet-gpu/shader/gen/kernel4_gray.msl @@ -1056,6 +1056,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 uint param_2 = 1024u; Alloc cmd_alloc = slice_mem(param, param_1, param_2); CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)]; + cmd_ref.offset += 4u; uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); float2 xy = float2(xy_uint); spvUnsafeArray rgba; @@ -1068,7 +1070,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 spvUnsafeArray df; TileSegRef tile_seg_ref; spvUnsafeArray area; - spvUnsafeArray, 128> blend_stack; + spvUnsafeArray, 4> blend_stack; + uint base_ix_1; + uint bg_rgba; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -1214,10 +1218,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); float3 param_29 = fg_rgba.xyz; - float3 _2254 = fromsRGB(param_29); - fg_rgba.x = _2254.x; - fg_rgba.y = _2254.y; - fg_rgba.z = _2254.z; + float3 _2264 = fromsRGB(param_29); + fg_rgba.x = _2264.x; + fg_rgba.y = _2264.y; + fg_rgba.z = _2264.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1; } @@ -1240,10 +1244,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); float3 param_33 = fg_rgba_1.xyz; - float3 _2364 = fromsRGB(param_33); - fg_rgba_1.x = _2364.x; - fg_rgba_1.y = _2364.y; - fg_rgba_1.z = _2364.z; + float3 _2374 = fromsRGB(param_33); + fg_rgba_1.x = _2374.x; + fg_rgba_1.y = _2374.y; + fg_rgba_1.z = _2374.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2; } @@ -1269,13 +1273,26 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } case 9u: { - for (uint k_12 = 0u; k_12 < 8u; k_12++) + if (clip_depth < 4u) { - uint d_2 = min(clip_depth, 127u); - float4 param_38 = float4(rgba[k_12]); - uint _2470 = packsRGB(param_38); - blend_stack[d_2][k_12] = _2470; - rgba[k_12] = float4(0.0); + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2479 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2479; + rgba[k_12] = float4(0.0); + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2522 = packsRGB(param_39); + v_297.memory[base_ix + k_13] = _2522; + rgba[k_13] = float4(0.0); + } } clip_depth++; cmd_ref.offset += 4u; @@ -1283,31 +1300,40 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } case 10u: { - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297); - uint blend_mode = end_clip.blend >> uint(8); - uint comp_mode = end_clip.blend & 255u; + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297); clip_depth--; - for (uint k_13 = 0u; k_13 < 8u; k_13++) + if (clip_depth >= 4u) { - uint d_3 = min(clip_depth, 127u); - uint param_41 = blend_stack[d_3][k_13]; - float4 bg = unpacksRGB(param_41); - float4 fg_1 = rgba[k_13] * area[k_13]; - float4 param_42 = bg; - float4 param_43 = fg_1; - uint param_44 = end_clip.blend; - rgba[k_13] = mix_blend_compose(param_42, param_43, param_44); + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = v_297.memory[base_ix_1 + k_14]; + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); } cmd_ref.offset += 8u; break; } case 11u: { - Alloc param_45 = cmd_alloc; - CmdRef param_46 = cmd_ref; - cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref }; + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref }; cmd_alloc.offset = cmd_ref.offset; break; } @@ -1315,8 +1341,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } for (uint i_1 = 0u; i_1 < 8u; i_1++) { - uint param_47 = i_1; - image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47)))); + uint param_48 = i_1; + image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48)))); } } diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv index 6ff1791..17c7531 100644 Binary files a/piet-gpu/shader/gen/kernel4_gray.spv and b/piet-gpu/shader/gen/kernel4_gray.spv differ diff --git a/piet-gpu/shader/gen/path_coarse.dxil b/piet-gpu/shader/gen/path_coarse.dxil index b6c9398..9fd593c 100644 Binary files a/piet-gpu/shader/gen/path_coarse.dxil and b/piet-gpu/shader/gen/path_coarse.dxil differ diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil index 7ce4684..6130712 100644 Binary files a/piet-gpu/shader/gen/pathseg.dxil and b/piet-gpu/shader/gen/pathseg.dxil differ diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil index ff544b8..4c2bd23 100644 Binary files a/piet-gpu/shader/gen/pathtag_reduce.dxil and b/piet-gpu/shader/gen/pathtag_reduce.dxil differ diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil index 48584bd..77f12e6 100644 Binary files a/piet-gpu/shader/gen/pathtag_root.dxil and b/piet-gpu/shader/gen/pathtag_root.dxil differ diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil index 0c1e376..f9f31e6 100644 Binary files a/piet-gpu/shader/gen/transform_leaf.dxil and b/piet-gpu/shader/gen/transform_leaf.dxil differ diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil index fc3a311..978dd98 100644 Binary files a/piet-gpu/shader/gen/transform_reduce.dxil and b/piet-gpu/shader/gen/transform_reduce.dxil differ diff --git a/piet-gpu/shader/gen/transform_root.dxil b/piet-gpu/shader/gen/transform_root.dxil index a33ff7f..5b4f059 100644 Binary files a/piet-gpu/shader/gen/transform_root.dxil and b/piet-gpu/shader/gen/transform_root.dxil differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index 99fd22e..86751d4 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -100,11 +100,14 @@ void main() { Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC); CmdRef cmd_ref = CmdRef(cmd_alloc.offset); + uint blend_offset = memory[cmd_ref.offset >> 2]; + cmd_ref.offset += 4; + uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y); vec2 xy = vec2(xy_uint); mediump vec4 rgba[CHUNK]; - uint blend_stack[MAX_BLEND_STACK][CHUNK]; + uint blend_stack[BLEND_STACK_SPLIT][CHUNK]; for (uint i = 0; i < CHUNK; i++) { rgba[i] = vec4(0.0); } @@ -236,24 +239,38 @@ void main() { cmd_ref.offset += 4 + CmdImage_size; break; case Cmd_BeginClip: - for (uint k = 0; k < CHUNK; k++) { - // We reject any inputs that might overflow in render_ctx.rs. - // The following is a sanity check so we don't corrupt memory should there be malformed inputs. - uint d = min(clip_depth, MAX_BLEND_STACK - 1); - blend_stack[d][k] = packsRGB(vec4(rgba[k])); - rgba[k] = vec4(0.0); + if (clip_depth < BLEND_STACK_SPLIT) { + for (uint k = 0; k < CHUNK; k++) { + blend_stack[clip_depth][k] = packsRGB(vec4(rgba[k])); + rgba[k] = vec4(0.0); + } + } else { + uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX + + CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y); + for (uint k = 0; k < CHUNK; k++) { + memory[base_ix + k] = packsRGB(vec4(rgba[k])); + rgba[k] = vec4(0.0); + } } clip_depth++; cmd_ref.offset += 4; break; case Cmd_EndClip: CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref); - uint blend_mode = uint(end_clip.blend >> 8); - uint comp_mode = uint(end_clip.blend & 0xFF); clip_depth--; + uint base_ix; + if (clip_depth >= BLEND_STACK_SPLIT) { + base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX + + CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y); + } for (uint k = 0; k < CHUNK; k++) { - uint d = min(clip_depth, MAX_BLEND_STACK - 1); - mediump vec4 bg = unpacksRGB(blend_stack[d][k]); + uint bg_rgba; + if (clip_depth < BLEND_STACK_SPLIT) { + bg_rgba = blend_stack[clip_depth][k]; + } else { + bg_rgba = memory[base_ix + k]; + } + mediump vec4 bg = unpacksRGB(bg_rgba); mediump vec4 fg = rgba[k] * area[k]; rgba[k] = mix_blend_compose(bg, fg, end_clip.blend); } diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index ec17188..21206e5 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -27,6 +27,10 @@ #define GRADIENT_WIDTH 512 +// We allocate this many blend stack entries in registers, and spill +// to memory for the overflow. +#define BLEND_STACK_SPLIT 4 + #ifdef ERR_MALLOC_FAILED struct Config { uint n_elements; // paths @@ -91,7 +95,7 @@ struct Config { #define MODE_STROKE 1 // Size of kernel4 clip state, in words. -#define CLIP_STATE_SIZE 2 +#define CLIP_STATE_SIZE 1 // fill_mode_from_flags extracts the fill mode from tag flags. uint fill_mode_from_flags(uint flags) { diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index c75f41f..1ebb5cf 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -34,8 +34,6 @@ const TILE_H: usize = 16; const PTCL_INITIAL_ALLOC: usize = 1024; -const MAX_BLEND_STACK: usize = 128; - #[allow(unused)] fn dump_scene(buf: &[u8]) { for i in 0..(buf.len() / 4) { diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index 14f2561..5d4ffd3 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -5,7 +5,6 @@ use std::borrow::Cow; use crate::encoder::GlyphEncoder; use crate::stages::{Config, Transform}; -use crate::MAX_BLEND_STACK; use piet::kurbo::{Affine, PathEl, Point, Rect, Shape}; use piet::{ Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext, @@ -233,9 +232,6 @@ impl RenderContext for PietGpuRenderContext { let path = shape.path_elements(TOLERANCE); self.encode_path(path, true); self.new_encoder.begin_clip(None); - if self.clip_stack.len() >= MAX_BLEND_STACK { - panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK); - } self.clip_stack.push(ClipElement { blend: None }); if let Some(tos) = self.state_stack.last_mut() { tos.n_clip += 1; @@ -337,9 +333,6 @@ impl PietGpuRenderContext { let path = shape.path_elements(TOLERANCE); self.encode_path(path, true); self.new_encoder.begin_clip(Some(blend)); - if self.clip_stack.len() >= MAX_BLEND_STACK { - panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK); - } self.clip_stack.push(ClipElement { blend: Some(blend) }); if let Some(tos) = self.state_stack.last_mut() { tos.n_clip += 1;