diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index 3abb2e0..c93d002 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -151,6 +151,11 @@ void main() { uint part_start_ix = 0; uint ready_ix = 0; + cmd_ref.offset += 4; + // Accounting for allocation of blend memory + uint render_blend_depth = 0; + uint max_blend_depth = 0; + uint drawmonoid_start = conf.drawmonoid_alloc.offset >> 2; uint drawtag_start = conf.drawtag_offset >> 2; uint drawdata_start = conf.drawdata_offset >> 2; @@ -414,6 +419,8 @@ void main() { } Cmd_BeginClip_write(cmd_alloc, cmd_ref); cmd_ref.offset += 4; + render_blend_depth++; + max_blend_depth = max(max_blend_depth, render_blend_depth); } clip_depth++; break; @@ -426,6 +433,7 @@ void main() { uint blend = scene[dd]; Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend)); cmd_ref.offset += 4 + CmdEndClip_size; + render_blend_depth--; break; } } else { @@ -451,5 +459,8 @@ void main() { } if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) { Cmd_End_write(cmd_alloc, cmd_ref); + if (max_blend_depth > BLEND_STACK_SPLIT) { + // TODO: allocate blend memory and write result + } } } diff --git a/piet-gpu/shader/gen/coarse.dxil b/piet-gpu/shader/gen/coarse.dxil index 910925d..cbebec0 100644 Binary files a/piet-gpu/shader/gen/coarse.dxil and b/piet-gpu/shader/gen/coarse.dxil differ diff --git a/piet-gpu/shader/gen/coarse.hlsl b/piet-gpu/shader/gen/coarse.hlsl index 04529bb..0519a63 100644 --- a/piet-gpu/shader/gen/coarse.hlsl +++ b/piet-gpu/shader/gen/coarse.hlsl @@ -177,7 +177,7 @@ static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); RWByteAddressBuffer _260 : register(u0, space0); ByteAddressBuffer _1005 : register(t1, space0); -ByteAddressBuffer _1372 : register(t2, space0); +ByteAddressBuffer _1378 : register(t2, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -681,6 +681,9 @@ void comp_main() uint wr_ix = 0u; uint part_start_ix = 0u; uint ready_ix = 0u; + cmd_ref.offset += 4u; + uint render_blend_depth = 0u; + uint max_blend_depth = 0u; uint drawmonoid_start = _1005.Load(44) >> uint(2); uint drawtag_start = _1005.Load(100) >> uint(2); uint drawdata_start = _1005.Load(104) >> uint(2); @@ -688,11 +691,11 @@ void comp_main() bool mem_ok = _260.Load(4) == 0u; Alloc param_3; Alloc param_5; - uint _1304; + uint _1310; uint element_ix; Alloc param_14; uint tile_count; - uint _1605; + uint _1611; float linewidth; CmdLinGrad cmd_lin; CmdRadGrad cmd_rad; @@ -702,34 +705,34 @@ void comp_main() { sh_bitmaps[i][th_ix] = 0u; } - bool _1356; + bool _1362; for (;;) { if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) { part_start_ix = ready_ix; uint count = 0u; - bool _1154 = th_ix < 256u; - bool _1162; - if (_1154) + bool _1160 = th_ix < 256u; + bool _1168; + if (_1160) { - _1162 = (partition_ix + th_ix) < n_partitions; + _1168 = (partition_ix + th_ix) < n_partitions; } else { - _1162 = _1154; + _1168 = _1160; } - if (_1162) + if (_1168) { uint in_ix = (_1005.Load(20) >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); - Alloc _1179; - _1179.offset = _1005.Load(20); - param_3.offset = _1179.offset; + Alloc _1185; + _1185.offset = _1005.Load(20); + param_3.offset = _1185.offset; uint param_4 = in_ix; count = read_mem(param_3, param_4); - Alloc _1190; - _1190.offset = _1005.Load(20); - param_5.offset = _1190.offset; + Alloc _1196; + _1196.offset = _1005.Load(20); + param_5.offset = _1196.offset; uint param_6 = in_ix + 1u; uint offset = read_mem(param_5, param_6); uint param_7 = offset; @@ -775,16 +778,16 @@ void comp_main() } if (part_ix > 0u) { - _1304 = sh_part_count[part_ix - 1u]; + _1310 = sh_part_count[part_ix - 1u]; } else { - _1304 = part_start_ix; + _1310 = part_start_ix; } - ix -= _1304; + ix -= _1310; Alloc bin_alloc = sh_part_elements[part_ix]; - BinInstanceRef _1323 = { bin_alloc.offset }; - BinInstanceRef inst_ref = _1323; + BinInstanceRef _1329 = { bin_alloc.offset }; + BinInstanceRef inst_ref = _1329; BinInstanceRef param_10 = inst_ref; uint param_11 = ix; Alloc param_12 = bin_alloc; @@ -794,16 +797,16 @@ void comp_main() } GroupMemoryBarrierWithGroupSync(); wr_ix = min((rd_ix + 256u), ready_ix); - bool _1346 = (wr_ix - rd_ix) < 256u; - if (_1346) + bool _1352 = (wr_ix - rd_ix) < 256u; + if (_1352) { - _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions); } else { - _1356 = _1346; + _1362 = _1352; } - if (_1356) + if (_1362) { continue; } @@ -816,7 +819,7 @@ void comp_main() if ((th_ix + rd_ix) < wr_ix) { element_ix = sh_elements[th_ix]; - tag = _1372.Load((drawtag_start + element_ix) * 4 + 0); + tag = _1378.Load((drawtag_start + element_ix) * 4 + 0); } switch (tag) { @@ -829,11 +832,11 @@ void comp_main() { uint drawmonoid_base = drawmonoid_start + (4u * element_ix); uint path_ix = _260.Load(drawmonoid_base * 4 + 8); - PathRef _1397 = { _1005.Load(16) + (path_ix * 12u) }; - Alloc _1400; - _1400.offset = _1005.Load(16); - param_14.offset = _1400.offset; - PathRef param_15 = _1397; + PathRef _1403 = { _1005.Load(16) + (path_ix * 12u) }; + Alloc _1406; + _1406.offset = _1005.Load(16); + param_14.offset = _1406.offset; + PathRef param_15 = _1403; Path path = Path_read(param_14, param_15); uint stride = path.bbox.z - path.bbox.x; sh_tile_stride[th_ix] = stride; @@ -889,16 +892,16 @@ void comp_main() } } uint element_ix_1 = sh_elements[el_ix]; - uint tag_1 = _1372.Load((drawtag_start + element_ix_1) * 4 + 0); + uint tag_1 = _1378.Load((drawtag_start + element_ix_1) * 4 + 0); if (el_ix > 0u) { - _1605 = sh_tile_count[el_ix - 1u]; + _1611 = sh_tile_count[el_ix - 1u]; } else { - _1605 = 0u; + _1611 = 0u; } - uint seq_ix = ix_1 - _1605; + uint seq_ix = ix_1 - _1611; uint width = sh_tile_width[el_ix]; uint x = sh_tile_x0[el_ix] + (seq_ix % width); uint y = sh_tile_y0[el_ix] + (seq_ix / width); @@ -907,9 +910,9 @@ void comp_main() { uint param_21 = el_ix; bool param_22 = mem_ok; - TileRef _1657 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; + TileRef _1663 = { sh_tile_base[el_ix] + (((sh_tile_stride[el_ix] * y) + x) * 8u) }; Alloc param_23 = read_tile_alloc(param_21, param_22); - TileRef param_24 = _1657; + TileRef param_24 = _1663; Tile tile = Tile_read(param_23, param_24); bool is_clip = (tag_1 & 1u) != 0u; bool is_blend = false; @@ -918,27 +921,27 @@ void comp_main() uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); uint scene_offset = _260.Load((drawmonoid_base_1 + 2u) * 4 + 8); uint dd = drawdata_start + (scene_offset >> uint(2)); - uint blend = _1372.Load(dd * 4 + 0); + uint blend = _1378.Load(dd * 4 + 0); is_blend = blend != 3u; } - bool _1692 = tile.tile.offset != 0u; - bool _1701; - if (!_1692) + bool _1698 = tile.tile.offset != 0u; + bool _1707; + if (!_1698) { - _1701 = (tile.backdrop == 0) == is_clip; + _1707 = (tile.backdrop == 0) == is_clip; } else { - _1701 = _1692; + _1707 = _1698; } - include_tile = _1701 || is_blend; + include_tile = _1707 || is_blend; } if (include_tile) { uint el_slice = el_ix / 32u; uint el_mask = 1u << (el_ix & 31u); - uint _1723; - InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1723); + uint _1729; + InterlockedOr(sh_bitmaps[el_slice][(y * 16u) + x], el_mask, _1729); } } GroupMemoryBarrierWithGroupSync(); @@ -962,14 +965,14 @@ void comp_main() uint element_ref_ix = (slice_ix * 32u) + uint(int(firstbitlow(bitmap))); uint element_ix_2 = sh_elements[element_ref_ix]; bitmap &= (bitmap - 1u); - uint drawtag = _1372.Load((drawtag_start + element_ix_2) * 4 + 0); + uint drawtag = _1378.Load((drawtag_start + element_ix_2) * 4 + 0); if (clip_zero_depth == 0u) { uint param_25 = element_ref_ix; bool param_26 = mem_ok; - TileRef _1800 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; + TileRef _1806 = { sh_tile_base[element_ref_ix] + (((sh_tile_stride[element_ref_ix] * tile_y) + tile_x) * 8u) }; Alloc param_27 = read_tile_alloc(param_25, param_26); - TileRef param_28 = _1800; + TileRef param_28 = _1806; Tile tile_1 = Tile_read(param_27, param_28); uint drawmonoid_base_2 = drawmonoid_start + (4u * element_ix_2); uint scene_offset_1 = _260.Load((drawmonoid_base_2 + 2u) * 4 + 8); @@ -984,11 +987,11 @@ void comp_main() Alloc param_29 = cmd_alloc; CmdRef param_30 = cmd_ref; uint param_31 = cmd_limit; - bool _1848 = alloc_cmd(param_29, param_30, param_31); + bool _1854 = alloc_cmd(param_29, param_30, param_31); cmd_alloc = param_29; cmd_ref = param_30; cmd_limit = param_31; - if (!_1848) + if (!_1854) { break; } @@ -998,11 +1001,11 @@ void comp_main() float param_35 = linewidth; write_fill(param_32, param_33, param_34, param_35); cmd_ref = param_33; - uint rgba = _1372.Load(dd_1 * 4 + 0); - CmdColor _1871 = { rgba }; + uint rgba = _1378.Load(dd_1 * 4 + 0); + CmdColor _1877 = { rgba }; Alloc param_36 = cmd_alloc; CmdRef param_37 = cmd_ref; - CmdColor param_38 = _1871; + CmdColor param_38 = _1877; Cmd_Color_write(param_36, param_37, param_38); cmd_ref.offset += 8u; break; @@ -1012,11 +1015,11 @@ void comp_main() Alloc param_39 = cmd_alloc; CmdRef param_40 = cmd_ref; uint param_41 = cmd_limit; - bool _1889 = alloc_cmd(param_39, param_40, param_41); + bool _1895 = alloc_cmd(param_39, param_40, param_41); cmd_alloc = param_39; cmd_ref = param_40; cmd_limit = param_41; - if (!_1889) + if (!_1895) { break; } @@ -1027,7 +1030,7 @@ void comp_main() float param_45 = linewidth; write_fill(param_42, param_43, param_44, param_45); cmd_ref = param_43; - cmd_lin.index = _1372.Load(dd_1 * 4 + 0); + cmd_lin.index = _1378.Load(dd_1 * 4 + 0); cmd_lin.line_x = asfloat(_260.Load((di + 1u) * 4 + 8)); cmd_lin.line_y = asfloat(_260.Load((di + 2u) * 4 + 8)); cmd_lin.line_c = asfloat(_260.Load((di + 3u) * 4 + 8)); @@ -1043,11 +1046,11 @@ void comp_main() Alloc param_49 = cmd_alloc; CmdRef param_50 = cmd_ref; uint param_51 = cmd_limit; - bool _1953 = alloc_cmd(param_49, param_50, param_51); + bool _1959 = alloc_cmd(param_49, param_50, param_51); cmd_alloc = param_49; cmd_ref = param_50; cmd_limit = param_51; - if (!_1953) + if (!_1959) { break; } @@ -1058,7 +1061,7 @@ void comp_main() float param_55 = linewidth; write_fill(param_52, param_53, param_54, param_55); cmd_ref = param_53; - cmd_rad.index = _1372.Load(dd_1 * 4 + 0); + cmd_rad.index = _1378.Load(dd_1 * 4 + 0); cmd_rad.mat = asfloat(uint4(_260.Load((di + 1u) * 4 + 8), _260.Load((di + 2u) * 4 + 8), _260.Load((di + 3u) * 4 + 8), _260.Load((di + 4u) * 4 + 8))); cmd_rad.xlat = asfloat(uint2(_260.Load((di + 5u) * 4 + 8), _260.Load((di + 6u) * 4 + 8))); cmd_rad.c1 = asfloat(uint2(_260.Load((di + 7u) * 4 + 8), _260.Load((di + 8u) * 4 + 8))); @@ -1077,11 +1080,11 @@ void comp_main() Alloc param_59 = cmd_alloc; CmdRef param_60 = cmd_ref; uint param_61 = cmd_limit; - bool _2059 = alloc_cmd(param_59, param_60, param_61); + bool _2065 = alloc_cmd(param_59, param_60, param_61); cmd_alloc = param_59; cmd_ref = param_60; cmd_limit = param_61; - if (!_2059) + if (!_2065) { break; } @@ -1091,30 +1094,30 @@ void comp_main() float param_65 = linewidth; write_fill(param_62, param_63, param_64, param_65); cmd_ref = param_63; - uint index = _1372.Load(dd_1 * 4 + 0); - uint raw1 = _1372.Load((dd_1 + 1u) * 4 + 0); + uint index = _1378.Load(dd_1 * 4 + 0); + uint raw1 = _1378.Load((dd_1 + 1u) * 4 + 0); int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); - CmdImage _2098 = { index, offset_1 }; + CmdImage _2104 = { index, offset_1 }; Alloc param_66 = cmd_alloc; CmdRef param_67 = cmd_ref; - CmdImage param_68 = _2098; + CmdImage param_68 = _2104; Cmd_Image_write(param_66, param_67, param_68); cmd_ref.offset += 12u; break; } case 5u: { - bool _2112 = tile_1.tile.offset == 0u; - bool _2118; - if (_2112) + bool _2118 = tile_1.tile.offset == 0u; + bool _2124; + if (_2118) { - _2118 = tile_1.backdrop == 0; + _2124 = tile_1.backdrop == 0; } else { - _2118 = _2112; + _2124 = _2118; } - if (_2118) + if (_2124) { clip_zero_depth = clip_depth + 1u; } @@ -1123,11 +1126,11 @@ void comp_main() Alloc param_69 = cmd_alloc; CmdRef param_70 = cmd_ref; uint param_71 = cmd_limit; - bool _2130 = alloc_cmd(param_69, param_70, param_71); + bool _2136 = alloc_cmd(param_69, param_70, param_71); cmd_alloc = param_69; cmd_ref = param_70; cmd_limit = param_71; - if (!_2130) + if (!_2136) { break; } @@ -1135,6 +1138,8 @@ void comp_main() CmdRef param_73 = cmd_ref; Cmd_BeginClip_write(param_72, param_73); cmd_ref.offset += 4u; + render_blend_depth++; + max_blend_depth = max(max_blend_depth, render_blend_depth); } clip_depth++; break; @@ -1145,11 +1150,11 @@ void comp_main() Alloc param_74 = cmd_alloc; CmdRef param_75 = cmd_ref; uint param_76 = cmd_limit; - bool _2158 = alloc_cmd(param_74, param_75, param_76); + bool _2169 = alloc_cmd(param_74, param_75, param_76); cmd_alloc = param_74; cmd_ref = param_75; cmd_limit = param_76; - if (!_2158) + if (!_2169) { break; } @@ -1159,13 +1164,14 @@ void comp_main() float param_80 = -1.0f; write_fill(param_77, param_78, param_79, param_80); cmd_ref = param_78; - uint blend_1 = _1372.Load(dd_1 * 4 + 0); - CmdEndClip _2181 = { blend_1 }; + uint blend_1 = _1378.Load(dd_1 * 4 + 0); + CmdEndClip _2192 = { blend_1 }; Alloc param_81 = cmd_alloc; CmdRef param_82 = cmd_ref; - CmdEndClip param_83 = _2181; + CmdEndClip param_83 = _2192; Cmd_EndClip_write(param_81, param_82, param_83); cmd_ref.offset += 8u; + render_blend_depth--; break; } } @@ -1198,21 +1204,24 @@ void comp_main() break; } } - bool _2228 = (bin_tile_x + tile_x) < _1005.Load(8); - bool _2237; - if (_2228) + bool _2241 = (bin_tile_x + tile_x) < _1005.Load(8); + bool _2250; + if (_2241) { - _2237 = (bin_tile_y + tile_y) < _1005.Load(12); + _2250 = (bin_tile_y + tile_y) < _1005.Load(12); } else { - _2237 = _2228; + _2250 = _2241; } - if (_2237) + if (_2250) { Alloc param_84 = cmd_alloc; CmdRef param_85 = cmd_ref; Cmd_End_write(param_84, param_85); + if (max_blend_depth > 4u) + { + } } } diff --git a/piet-gpu/shader/gen/coarse.msl b/piet-gpu/shader/gen/coarse.msl index 55812d4..578fa37 100644 --- a/piet-gpu/shader/gen/coarse.msl +++ b/piet-gpu/shader/gen/coarse.msl @@ -677,7 +677,7 @@ void Cmd_End_write(thread const Alloc& a, thread const CmdRef& ref, device Memor write_mem(param, param_1, param_2, v_260, v_260BufferSize); } -kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1372 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device Memory& v_260 [[buffer(0)]], const device ConfigBuf& _1005 [[buffer(1)]], const device SceneBuf& _1378 [[buffer(2)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { threadgroup uint sh_bitmaps[8][256]; threadgroup Alloc sh_part_elements[256]; @@ -713,6 +713,9 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint wr_ix = 0u; uint part_start_ix = 0u; uint ready_ix = 0u; + cmd_ref.offset += 4u; + uint render_blend_depth = 0u; + uint max_blend_depth = 0u; uint drawmonoid_start = _1005.conf.drawmonoid_alloc.offset >> uint(2); uint drawtag_start = _1005.conf.drawtag_offset >> uint(2); uint drawdata_start = _1005.conf.drawdata_offset >> uint(2); @@ -720,11 +723,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M bool mem_ok = v_260.mem_error == 0u; Alloc param_3; Alloc param_5; - uint _1304; + uint _1310; uint element_ix; Alloc param_14; uint tile_count; - uint _1605; + uint _1611; float linewidth; CmdLinGrad cmd_lin; CmdRadGrad cmd_rad; @@ -734,24 +737,24 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M { sh_bitmaps[i][th_ix] = 0u; } - bool _1356; + bool _1362; for (;;) { if ((ready_ix == wr_ix) && (partition_ix < n_partitions)) { part_start_ix = ready_ix; uint count = 0u; - bool _1154 = th_ix < 256u; - bool _1162; - if (_1154) + bool _1160 = th_ix < 256u; + bool _1168; + if (_1160) { - _1162 = (partition_ix + th_ix) < n_partitions; + _1168 = (partition_ix + th_ix) < n_partitions; } else { - _1162 = _1154; + _1168 = _1160; } - if (_1162) + if (_1168) { uint in_ix = (_1005.conf.bin_alloc.offset >> uint(2)) + ((((partition_ix + th_ix) * 256u) + bin_ix) * 2u); param_3.offset = _1005.conf.bin_alloc.offset; @@ -803,13 +806,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } if (part_ix > 0u) { - _1304 = sh_part_count[part_ix - 1u]; + _1310 = sh_part_count[part_ix - 1u]; } else { - _1304 = part_start_ix; + _1310 = part_start_ix; } - ix -= _1304; + ix -= _1310; Alloc bin_alloc = sh_part_elements[part_ix]; BinInstanceRef inst_ref = BinInstanceRef{ bin_alloc.offset }; BinInstanceRef param_10 = inst_ref; @@ -821,16 +824,16 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } threadgroup_barrier(mem_flags::mem_threadgroup); wr_ix = min((rd_ix + 256u), ready_ix); - bool _1346 = (wr_ix - rd_ix) < 256u; - if (_1346) + bool _1352 = (wr_ix - rd_ix) < 256u; + if (_1352) { - _1356 = (wr_ix < ready_ix) || (partition_ix < n_partitions); + _1362 = (wr_ix < ready_ix) || (partition_ix < n_partitions); } else { - _1356 = _1346; + _1362 = _1352; } - if (_1356) + if (_1362) { continue; } @@ -843,7 +846,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M if ((th_ix + rd_ix) < wr_ix) { element_ix = sh_elements[th_ix]; - tag = _1372.scene[drawtag_start + element_ix]; + tag = _1378.scene[drawtag_start + element_ix]; } switch (tag) { @@ -913,16 +916,16 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } } uint element_ix_1 = sh_elements[el_ix]; - uint tag_1 = _1372.scene[drawtag_start + element_ix_1]; + uint tag_1 = _1378.scene[drawtag_start + element_ix_1]; if (el_ix > 0u) { - _1605 = sh_tile_count[el_ix - 1u]; + _1611 = sh_tile_count[el_ix - 1u]; } else { - _1605 = 0u; + _1611 = 0u; } - uint seq_ix = ix_1 - _1605; + uint seq_ix = ix_1 - _1611; uint width = sh_tile_width[el_ix]; uint x = sh_tile_x0[el_ix] + (seq_ix % width); uint y = sh_tile_y0[el_ix] + (seq_ix / width); @@ -941,26 +944,26 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint drawmonoid_base_1 = drawmonoid_start + (4u * element_ix_1); uint scene_offset = v_260.memory[drawmonoid_base_1 + 2u]; uint dd = drawdata_start + (scene_offset >> uint(2)); - uint blend = _1372.scene[dd]; + uint blend = _1378.scene[dd]; is_blend = blend != 3u; } - bool _1692 = tile.tile.offset != 0u; - bool _1701; - if (!_1692) + bool _1698 = tile.tile.offset != 0u; + bool _1707; + if (!_1698) { - _1701 = (tile.backdrop == 0) == is_clip; + _1707 = (tile.backdrop == 0) == is_clip; } else { - _1701 = _1692; + _1707 = _1698; } - include_tile = _1701 || is_blend; + include_tile = _1707 || is_blend; } if (include_tile) { uint el_slice = el_ix / 32u; uint el_mask = 1u << (el_ix & 31u); - uint _1723 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); + uint _1729 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&sh_bitmaps[el_slice][(y * 16u) + x], el_mask, memory_order_relaxed); } } threadgroup_barrier(mem_flags::mem_threadgroup); @@ -984,7 +987,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint element_ref_ix = (slice_ix * 32u) + uint(int(spvFindLSB(bitmap))); uint element_ix_2 = sh_elements[element_ref_ix]; bitmap &= (bitmap - 1u); - uint drawtag = _1372.scene[drawtag_start + element_ix_2]; + uint drawtag = _1378.scene[drawtag_start + element_ix_2]; if (clip_zero_depth == 0u) { uint param_25 = element_ref_ix; @@ -1005,11 +1008,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_29 = cmd_alloc; CmdRef param_30 = cmd_ref; uint param_31 = cmd_limit; - bool _1848 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize); + bool _1854 = alloc_cmd(param_29, param_30, param_31, v_260, v_260BufferSize); cmd_alloc = param_29; cmd_ref = param_30; cmd_limit = param_31; - if (!_1848) + if (!_1854) { break; } @@ -1019,7 +1022,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M float param_35 = linewidth; write_fill(param_32, param_33, param_34, param_35, v_260, v_260BufferSize); cmd_ref = param_33; - uint rgba = _1372.scene[dd_1]; + uint rgba = _1378.scene[dd_1]; Alloc param_36 = cmd_alloc; CmdRef param_37 = cmd_ref; CmdColor param_38 = CmdColor{ rgba }; @@ -1032,11 +1035,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_39 = cmd_alloc; CmdRef param_40 = cmd_ref; uint param_41 = cmd_limit; - bool _1889 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize); + bool _1895 = alloc_cmd(param_39, param_40, param_41, v_260, v_260BufferSize); cmd_alloc = param_39; cmd_ref = param_40; cmd_limit = param_41; - if (!_1889) + if (!_1895) { break; } @@ -1047,7 +1050,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M float param_45 = linewidth; write_fill(param_42, param_43, param_44, param_45, v_260, v_260BufferSize); cmd_ref = param_43; - cmd_lin.index = _1372.scene[dd_1]; + cmd_lin.index = _1378.scene[dd_1]; cmd_lin.line_x = as_type(v_260.memory[di + 1u]); cmd_lin.line_y = as_type(v_260.memory[di + 2u]); cmd_lin.line_c = as_type(v_260.memory[di + 3u]); @@ -1063,11 +1066,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_49 = cmd_alloc; CmdRef param_50 = cmd_ref; uint param_51 = cmd_limit; - bool _1953 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize); + bool _1959 = alloc_cmd(param_49, param_50, param_51, v_260, v_260BufferSize); cmd_alloc = param_49; cmd_ref = param_50; cmd_limit = param_51; - if (!_1953) + if (!_1959) { break; } @@ -1078,7 +1081,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M float param_55 = linewidth; write_fill(param_52, param_53, param_54, param_55, v_260, v_260BufferSize); cmd_ref = param_53; - cmd_rad.index = _1372.scene[dd_1]; + cmd_rad.index = _1378.scene[dd_1]; cmd_rad.mat = as_type(uint4(v_260.memory[di + 1u], v_260.memory[di + 2u], v_260.memory[di + 3u], v_260.memory[di + 4u])); cmd_rad.xlat = as_type(uint2(v_260.memory[di + 5u], v_260.memory[di + 6u])); cmd_rad.c1 = as_type(uint2(v_260.memory[di + 7u], v_260.memory[di + 8u])); @@ -1097,11 +1100,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_59 = cmd_alloc; CmdRef param_60 = cmd_ref; uint param_61 = cmd_limit; - bool _2059 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize); + bool _2065 = alloc_cmd(param_59, param_60, param_61, v_260, v_260BufferSize); cmd_alloc = param_59; cmd_ref = param_60; cmd_limit = param_61; - if (!_2059) + if (!_2065) { break; } @@ -1111,8 +1114,8 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M float param_65 = linewidth; write_fill(param_62, param_63, param_64, param_65, v_260, v_260BufferSize); cmd_ref = param_63; - uint index = _1372.scene[dd_1]; - uint raw1 = _1372.scene[dd_1 + 1u]; + uint index = _1378.scene[dd_1]; + uint raw1 = _1378.scene[dd_1 + 1u]; int2 offset_1 = int2(int(raw1 << uint(16)) >> 16, int(raw1) >> 16); Alloc param_66 = cmd_alloc; CmdRef param_67 = cmd_ref; @@ -1123,17 +1126,17 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M } case 5u: { - bool _2112 = tile_1.tile.offset == 0u; - bool _2118; - if (_2112) + bool _2118 = tile_1.tile.offset == 0u; + bool _2124; + if (_2118) { - _2118 = tile_1.backdrop == 0; + _2124 = tile_1.backdrop == 0; } else { - _2118 = _2112; + _2124 = _2118; } - if (_2118) + if (_2124) { clip_zero_depth = clip_depth + 1u; } @@ -1142,11 +1145,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_69 = cmd_alloc; CmdRef param_70 = cmd_ref; uint param_71 = cmd_limit; - bool _2130 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize); + bool _2136 = alloc_cmd(param_69, param_70, param_71, v_260, v_260BufferSize); cmd_alloc = param_69; cmd_ref = param_70; cmd_limit = param_71; - if (!_2130) + if (!_2136) { break; } @@ -1154,6 +1157,8 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M CmdRef param_73 = cmd_ref; Cmd_BeginClip_write(param_72, param_73, v_260, v_260BufferSize); cmd_ref.offset += 4u; + render_blend_depth++; + max_blend_depth = max(max_blend_depth, render_blend_depth); } clip_depth++; break; @@ -1164,11 +1169,11 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M Alloc param_74 = cmd_alloc; CmdRef param_75 = cmd_ref; uint param_76 = cmd_limit; - bool _2158 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize); + bool _2169 = alloc_cmd(param_74, param_75, param_76, v_260, v_260BufferSize); cmd_alloc = param_74; cmd_ref = param_75; cmd_limit = param_76; - if (!_2158) + if (!_2169) { break; } @@ -1178,12 +1183,13 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M float param_80 = -1.0; write_fill(param_77, param_78, param_79, param_80, v_260, v_260BufferSize); cmd_ref = param_78; - uint blend_1 = _1372.scene[dd_1]; + uint blend_1 = _1378.scene[dd_1]; Alloc param_81 = cmd_alloc; CmdRef param_82 = cmd_ref; CmdEndClip param_83 = CmdEndClip{ blend_1 }; Cmd_EndClip_write(param_81, param_82, param_83, v_260, v_260BufferSize); cmd_ref.offset += 8u; + render_blend_depth--; break; } } @@ -1216,21 +1222,24 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M break; } } - bool _2228 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles; - bool _2237; - if (_2228) + bool _2241 = (bin_tile_x + tile_x) < _1005.conf.width_in_tiles; + bool _2250; + if (_2241) { - _2237 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles; + _2250 = (bin_tile_y + tile_y) < _1005.conf.height_in_tiles; } else { - _2237 = _2228; + _2250 = _2241; } - if (_2237) + if (_2250) { Alloc param_84 = cmd_alloc; CmdRef param_85 = cmd_ref; Cmd_End_write(param_84, param_85, v_260, v_260BufferSize); + if (max_blend_depth > 4u) + { + } } } diff --git a/piet-gpu/shader/gen/coarse.spv b/piet-gpu/shader/gen/coarse.spv index 6d33ee7..718acca 100644 Binary files a/piet-gpu/shader/gen/coarse.spv and b/piet-gpu/shader/gen/coarse.spv differ diff --git a/piet-gpu/shader/gen/kernel4.dxil b/piet-gpu/shader/gen/kernel4.dxil index da6c563..5617c51 100644 Binary files a/piet-gpu/shader/gen/kernel4.dxil and b/piet-gpu/shader/gen/kernel4.dxil differ diff --git a/piet-gpu/shader/gen/kernel4.hlsl b/piet-gpu/shader/gen/kernel4.hlsl index 5d6f839..30779b7 100644 --- a/piet-gpu/shader/gen/kernel4.hlsl +++ b/piet-gpu/shader/gen/kernel4.hlsl @@ -162,7 +162,7 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); RWByteAddressBuffer _297 : register(u0, space0); -ByteAddressBuffer _1749 : register(t1, space0); +ByteAddressBuffer _1725 : register(t1, space0); RWTexture2D image_atlas : register(u3, space0); RWTexture2D gradients : register(u4, space0); RWTexture2D image : register(u2, space0); @@ -477,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img) int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; float4 fg_rgba = image_atlas[uv]; float3 param_1 = fg_rgba.xyz; - float3 _1721 = fromsRGB(param_1); - fg_rgba.x = _1721.x; - fg_rgba.y = _1721.y; - fg_rgba.z = _1721.z; + float3 _1697 = fromsRGB(param_1); + fg_rgba.x = _1697.x; + fg_rgba.y = _1697.y; + fg_rgba.z = _1697.z; rgba[i] = fg_rgba; } spvReturnValue = rgba; @@ -918,12 +918,6 @@ float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode) break; } case 13u: - { - float rev_as = 1.0f - as; - float rev_ab = 1.0f - ab; - return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab)); - } - case 14u: { return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab)); } @@ -992,16 +986,18 @@ CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) void comp_main() { - uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x; - Alloc _1764; - _1764.offset = _1749.Load(24); + uint tile_ix = (gl_WorkGroupID.y * _1725.Load(8)) + gl_WorkGroupID.x; + Alloc _1740; + _1740.offset = _1725.Load(24); Alloc param; - param.offset = _1764.offset; + param.offset = _1740.offset; uint param_1 = tile_ix * 1024u; uint param_2 = 1024u; Alloc cmd_alloc = slice_mem(param, param_1, param_2); - CmdRef _1773 = { cmd_alloc.offset }; - CmdRef cmd_ref = _1773; + CmdRef _1749 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1749; + uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8); + cmd_ref.offset += 4u; uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); float2 xy = float2(xy_uint); float4 rgba[8]; @@ -1014,7 +1010,9 @@ void comp_main() float df[8]; TileSegRef tile_seg_ref; float area[8]; - uint blend_stack[128][8]; + uint blend_stack[4][8]; + uint base_ix_1; + uint bg_rgba; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -1035,8 +1033,8 @@ void comp_main() { df[k] = 1000000000.0f; } - TileSegRef _1867 = { stroke.tile_ref }; - tile_seg_ref = _1867; + TileSegRef _1854 = { stroke.tile_ref }; + tile_seg_ref = _1854; do { uint param_7 = tile_seg_ref.offset; @@ -1072,8 +1070,8 @@ void comp_main() { area[k_3] = float(fill.backdrop); } - TileSegRef _1987 = { fill.tile_ref }; - tile_seg_ref = _1987; + TileSegRef _1974 = { fill.tile_ref }; + tile_seg_ref = _1974; do { uint param_15 = tile_seg_ref.offset; @@ -1162,10 +1160,10 @@ void comp_main() int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba = gradients[int2(x, int(lin.index))]; float3 param_29 = fg_rgba.xyz; - float3 _2321 = fromsRGB(param_29); - fg_rgba.x = _2321.x; - fg_rgba.y = _2321.y; - fg_rgba.z = _2321.z; + float3 _2308 = fromsRGB(param_29); + fg_rgba.x = _2308.x; + fg_rgba.y = _2308.y; + fg_rgba.z = _2308.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1; } @@ -1188,10 +1186,10 @@ void comp_main() int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))]; float3 param_33 = fg_rgba_1.xyz; - float3 _2431 = fromsRGB(param_33); - fg_rgba_1.x = _2431.x; - fg_rgba_1.y = _2431.y; - fg_rgba_1.z = _2431.z; + float3 _2418 = fromsRGB(param_33); + fg_rgba_1.x = _2418.x; + fg_rgba_1.y = _2418.y; + fg_rgba_1.z = _2418.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2; } @@ -1205,9 +1203,9 @@ void comp_main() CmdImage fill_img = Cmd_Image_read(param_34, param_35); uint2 param_36 = xy_uint; CmdImage param_37 = fill_img; - float4 _2474[8]; - fillImage(_2474, param_36, param_37); - float4 img[8] = _2474; + float4 _2461[8]; + fillImage(_2461, param_36, param_37); + float4 img[8] = _2461; for (uint k_11 = 0u; k_11 < 8u; k_11++) { float4 fg_k_3 = img[k_11] * area[k_11]; @@ -1218,13 +1216,26 @@ void comp_main() } case 9u: { - for (uint k_12 = 0u; k_12 < 8u; k_12++) + if (clip_depth < 4u) { - uint d_2 = min(clip_depth, 127u); - float4 param_38 = float4(rgba[k_12]); - uint _2537 = packsRGB(param_38); - blend_stack[d_2][k_12] = _2537; - rgba[k_12] = 0.0f.xxxx; + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2523 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2523; + rgba[k_12] = 0.0f.xxxx; + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2566 = packsRGB(param_39); + _297.Store((base_ix + k_13) * 4 + 8, _2566); + rgba[k_13] = 0.0f.xxxx; + } } clip_depth++; cmd_ref.offset += 4u; @@ -1232,32 +1243,41 @@ void comp_main() } case 10u: { - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40); - uint blend_mode = end_clip.blend >> uint(8); - uint comp_mode = end_clip.blend & 255u; + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41); clip_depth--; - for (uint k_13 = 0u; k_13 < 8u; k_13++) + if (clip_depth < 4u) { - uint d_3 = min(clip_depth, 127u); - uint param_41 = blend_stack[d_3][k_13]; - float4 bg = unpacksRGB(param_41); - float4 fg_1 = rgba[k_13] * area[k_13]; - float4 param_42 = bg; - float4 param_43 = fg_1; - uint param_44 = end_clip.blend; - rgba[k_13] = mix_blend_compose(param_42, param_43, param_44); + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8); + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); } cmd_ref.offset += 8u; break; } case 11u: { - Alloc param_45 = cmd_alloc; - CmdRef param_46 = cmd_ref; - CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref }; - cmd_ref = _2615; + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + CmdRef _2665 = { Cmd_Jump_read(param_46, param_47).new_ref }; + cmd_ref = _2665; cmd_alloc.offset = cmd_ref.offset; break; } @@ -1265,9 +1285,9 @@ void comp_main() } for (uint i_1 = 0u; i_1 < 8u; i_1++) { - uint param_47 = i_1; - float3 param_48 = rgba[i_1].xyz; - image[int2(xy_uint + chunk_offset(param_47))] = float4(tosRGB(param_48), rgba[i_1].w); + uint param_48 = i_1; + float3 param_49 = rgba[i_1].xyz; + image[int2(xy_uint + chunk_offset(param_48))] = float4(tosRGB(param_49), rgba[i_1].w); } } diff --git a/piet-gpu/shader/gen/kernel4.msl b/piet-gpu/shader/gen/kernel4.msl index 796043b..6325914 100644 --- a/piet-gpu/shader/gen/kernel4.msl +++ b/piet-gpu/shader/gen/kernel4.msl @@ -528,10 +528,10 @@ spvUnsafeArray fillImage(thread const uint2& xy, thread const CmdImag int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; float4 fg_rgba = image_atlas.read(uint2(uv)); float3 param_1 = fg_rgba.xyz; - float3 _1721 = fromsRGB(param_1); - fg_rgba.x = _1721.x; - fg_rgba.y = _1721.y; - fg_rgba.z = _1721.z; + float3 _1697 = fromsRGB(param_1); + fg_rgba.x = _1697.x; + fg_rgba.y = _1697.y; + fg_rgba.z = _1697.z; rgba[i] = fg_rgba; } return rgba; @@ -985,12 +985,6 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons break; } case 13u: - { - float rev_as = 1.0 - as; - float rev_ab = 1.0 - ab; - return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab)); - } - case 14u: { return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab)); } @@ -1059,15 +1053,17 @@ CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Me return CmdJump_read(param, param_1, v_297); } -kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1725 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { - uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x; + uint tile_ix = (gl_WorkGroupID.y * _1725.conf.width_in_tiles) + gl_WorkGroupID.x; Alloc param; - param.offset = _1749.conf.ptcl_alloc.offset; + param.offset = _1725.conf.ptcl_alloc.offset; uint param_1 = tile_ix * 1024u; uint param_2 = 1024u; Alloc cmd_alloc = slice_mem(param, param_1, param_2); CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)]; + cmd_ref.offset += 4u; uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); float2 xy = float2(xy_uint); spvUnsafeArray rgba; @@ -1080,7 +1076,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 spvUnsafeArray df; TileSegRef tile_seg_ref; spvUnsafeArray area; - spvUnsafeArray, 128> blend_stack; + spvUnsafeArray, 4> blend_stack; + uint base_ix_1; + uint bg_rgba; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -1226,10 +1224,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); float3 param_29 = fg_rgba.xyz; - float3 _2321 = fromsRGB(param_29); - fg_rgba.x = _2321.x; - fg_rgba.y = _2321.y; - fg_rgba.z = _2321.z; + float3 _2308 = fromsRGB(param_29); + fg_rgba.x = _2308.x; + fg_rgba.y = _2308.y; + fg_rgba.z = _2308.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1; } @@ -1252,10 +1250,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); float3 param_33 = fg_rgba_1.xyz; - float3 _2431 = fromsRGB(param_33); - fg_rgba_1.x = _2431.x; - fg_rgba_1.y = _2431.y; - fg_rgba_1.z = _2431.z; + float3 _2418 = fromsRGB(param_33); + fg_rgba_1.x = _2418.x; + fg_rgba_1.y = _2418.y; + fg_rgba_1.z = _2418.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2; } @@ -1281,13 +1279,26 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } case 9u: { - for (uint k_12 = 0u; k_12 < 8u; k_12++) + if (clip_depth < 4u) { - uint d_2 = min(clip_depth, 127u); - float4 param_38 = float4(rgba[k_12]); - uint _2537 = packsRGB(param_38); - blend_stack[d_2][k_12] = _2537; - rgba[k_12] = float4(0.0); + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2523 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2523; + rgba[k_12] = float4(0.0); + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2566 = packsRGB(param_39); + v_297.memory[base_ix + k_13] = _2566; + rgba[k_13] = float4(0.0); + } } clip_depth++; cmd_ref.offset += 4u; @@ -1295,31 +1306,40 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } case 10u: { - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297); - uint blend_mode = end_clip.blend >> uint(8); - uint comp_mode = end_clip.blend & 255u; + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297); clip_depth--; - for (uint k_13 = 0u; k_13 < 8u; k_13++) + if (clip_depth < 4u) { - uint d_3 = min(clip_depth, 127u); - uint param_41 = blend_stack[d_3][k_13]; - float4 bg = unpacksRGB(param_41); - float4 fg_1 = rgba[k_13] * area[k_13]; - float4 param_42 = bg; - float4 param_43 = fg_1; - uint param_44 = end_clip.blend; - rgba[k_13] = mix_blend_compose(param_42, param_43, param_44); + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = v_297.memory[base_ix_1 + k_14]; + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); } cmd_ref.offset += 8u; break; } case 11u: { - Alloc param_45 = cmd_alloc; - CmdRef param_46 = cmd_ref; - cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref }; + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref }; cmd_alloc.offset = cmd_ref.offset; break; } @@ -1327,9 +1347,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } for (uint i_1 = 0u; i_1 < 8u; i_1++) { - uint param_47 = i_1; - float3 param_48 = rgba[i_1].xyz; - image.write(float4(tosRGB(param_48), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47)))); + uint param_48 = i_1; + float3 param_49 = rgba[i_1].xyz; + image.write(float4(tosRGB(param_49), rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48)))); } } diff --git a/piet-gpu/shader/gen/kernel4.spv b/piet-gpu/shader/gen/kernel4.spv index b145245..978e0a2 100644 Binary files a/piet-gpu/shader/gen/kernel4.spv and b/piet-gpu/shader/gen/kernel4.spv differ diff --git a/piet-gpu/shader/gen/kernel4_gray.dxil b/piet-gpu/shader/gen/kernel4_gray.dxil index abe1d22..37fe62c 100644 Binary files a/piet-gpu/shader/gen/kernel4_gray.dxil and b/piet-gpu/shader/gen/kernel4_gray.dxil differ diff --git a/piet-gpu/shader/gen/kernel4_gray.hlsl b/piet-gpu/shader/gen/kernel4_gray.hlsl index f402268..5bd7b3b 100644 --- a/piet-gpu/shader/gen/kernel4_gray.hlsl +++ b/piet-gpu/shader/gen/kernel4_gray.hlsl @@ -162,7 +162,7 @@ struct Config static const uint3 gl_WorkGroupSize = uint3(8u, 4u, 1u); RWByteAddressBuffer _297 : register(u0, space0); -ByteAddressBuffer _1749 : register(t1, space0); +ByteAddressBuffer _1725 : register(t1, space0); RWTexture2D image_atlas : register(u3, space0); RWTexture2D gradients : register(u4, space0); RWTexture2D image : register(u2, space0); @@ -477,10 +477,10 @@ void fillImage(out float4 spvReturnValue[8], uint2 xy, CmdImage cmd_img) int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; float4 fg_rgba = image_atlas[uv]; float3 param_1 = fg_rgba.xyz; - float3 _1721 = fromsRGB(param_1); - fg_rgba.x = _1721.x; - fg_rgba.y = _1721.y; - fg_rgba.z = _1721.z; + float3 _1697 = fromsRGB(param_1); + fg_rgba.x = _1697.x; + fg_rgba.y = _1697.y; + fg_rgba.z = _1697.z; rgba[i] = fg_rgba; } spvReturnValue = rgba; @@ -918,12 +918,6 @@ float4 mix_compose(float3 cb, float3 cs, float ab, float as, uint mode) break; } case 13u: - { - float rev_as = 1.0f - as; - float rev_ab = 1.0f - ab; - return max(0.0f.xxxx, float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab)); - } - case 14u: { return min(1.0f.xxxx, float4((cs * as) + (cb * ab), as + ab)); } @@ -992,16 +986,18 @@ CmdJump Cmd_Jump_read(Alloc a, CmdRef ref) void comp_main() { - uint tile_ix = (gl_WorkGroupID.y * _1749.Load(8)) + gl_WorkGroupID.x; - Alloc _1764; - _1764.offset = _1749.Load(24); + uint tile_ix = (gl_WorkGroupID.y * _1725.Load(8)) + gl_WorkGroupID.x; + Alloc _1740; + _1740.offset = _1725.Load(24); Alloc param; - param.offset = _1764.offset; + param.offset = _1740.offset; uint param_1 = tile_ix * 1024u; uint param_2 = 1024u; Alloc cmd_alloc = slice_mem(param, param_1, param_2); - CmdRef _1773 = { cmd_alloc.offset }; - CmdRef cmd_ref = _1773; + CmdRef _1749 = { cmd_alloc.offset }; + CmdRef cmd_ref = _1749; + uint blend_offset = _297.Load((cmd_ref.offset >> uint(2)) * 4 + 8); + cmd_ref.offset += 4u; uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); float2 xy = float2(xy_uint); float4 rgba[8]; @@ -1014,7 +1010,9 @@ void comp_main() float df[8]; TileSegRef tile_seg_ref; float area[8]; - uint blend_stack[128][8]; + uint blend_stack[4][8]; + uint base_ix_1; + uint bg_rgba; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -1035,8 +1033,8 @@ void comp_main() { df[k] = 1000000000.0f; } - TileSegRef _1867 = { stroke.tile_ref }; - tile_seg_ref = _1867; + TileSegRef _1854 = { stroke.tile_ref }; + tile_seg_ref = _1854; do { uint param_7 = tile_seg_ref.offset; @@ -1072,8 +1070,8 @@ void comp_main() { area[k_3] = float(fill.backdrop); } - TileSegRef _1987 = { fill.tile_ref }; - tile_seg_ref = _1987; + TileSegRef _1974 = { fill.tile_ref }; + tile_seg_ref = _1974; do { uint param_15 = tile_seg_ref.offset; @@ -1162,10 +1160,10 @@ void comp_main() int x = int(round(clamp(my_d, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba = gradients[int2(x, int(lin.index))]; float3 param_29 = fg_rgba.xyz; - float3 _2321 = fromsRGB(param_29); - fg_rgba.x = _2321.x; - fg_rgba.y = _2321.y; - fg_rgba.z = _2321.z; + float3 _2308 = fromsRGB(param_29); + fg_rgba.x = _2308.x; + fg_rgba.y = _2308.y; + fg_rgba.z = _2308.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0f - fg_k_1.w)) + fg_k_1; } @@ -1188,10 +1186,10 @@ void comp_main() int x_1 = int(round(clamp(t_2, 0.0f, 1.0f) * 511.0f)); float4 fg_rgba_1 = gradients[int2(x_1, int(rad.index))]; float3 param_33 = fg_rgba_1.xyz; - float3 _2431 = fromsRGB(param_33); - fg_rgba_1.x = _2431.x; - fg_rgba_1.y = _2431.y; - fg_rgba_1.z = _2431.z; + float3 _2418 = fromsRGB(param_33); + fg_rgba_1.x = _2418.x; + fg_rgba_1.y = _2418.y; + fg_rgba_1.z = _2418.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0f - fg_k_2.w)) + fg_k_2; } @@ -1205,9 +1203,9 @@ void comp_main() CmdImage fill_img = Cmd_Image_read(param_34, param_35); uint2 param_36 = xy_uint; CmdImage param_37 = fill_img; - float4 _2474[8]; - fillImage(_2474, param_36, param_37); - float4 img[8] = _2474; + float4 _2461[8]; + fillImage(_2461, param_36, param_37); + float4 img[8] = _2461; for (uint k_11 = 0u; k_11 < 8u; k_11++) { float4 fg_k_3 = img[k_11] * area[k_11]; @@ -1218,13 +1216,26 @@ void comp_main() } case 9u: { - for (uint k_12 = 0u; k_12 < 8u; k_12++) + if (clip_depth < 4u) { - uint d_2 = min(clip_depth, 127u); - float4 param_38 = float4(rgba[k_12]); - uint _2537 = packsRGB(param_38); - blend_stack[d_2][k_12] = _2537; - rgba[k_12] = 0.0f.xxxx; + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2523 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2523; + rgba[k_12] = 0.0f.xxxx; + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2566 = packsRGB(param_39); + _297.Store((base_ix + k_13) * 4 + 8, _2566); + rgba[k_13] = 0.0f.xxxx; + } } clip_depth++; cmd_ref.offset += 4u; @@ -1232,32 +1243,41 @@ void comp_main() } case 10u: { - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40); - uint blend_mode = end_clip.blend >> uint(8); - uint comp_mode = end_clip.blend & 255u; + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41); clip_depth--; - for (uint k_13 = 0u; k_13 < 8u; k_13++) + if (clip_depth < 4u) { - uint d_3 = min(clip_depth, 127u); - uint param_41 = blend_stack[d_3][k_13]; - float4 bg = unpacksRGB(param_41); - float4 fg_1 = rgba[k_13] * area[k_13]; - float4 param_42 = bg; - float4 param_43 = fg_1; - uint param_44 = end_clip.blend; - rgba[k_13] = mix_blend_compose(param_42, param_43, param_44); + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = _297.Load((base_ix_1 + k_14) * 4 + 8); + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); } cmd_ref.offset += 8u; break; } case 11u: { - Alloc param_45 = cmd_alloc; - CmdRef param_46 = cmd_ref; - CmdRef _2615 = { Cmd_Jump_read(param_45, param_46).new_ref }; - cmd_ref = _2615; + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + CmdRef _2665 = { Cmd_Jump_read(param_46, param_47).new_ref }; + cmd_ref = _2665; cmd_alloc.offset = cmd_ref.offset; break; } @@ -1265,8 +1285,8 @@ void comp_main() } for (uint i_1 = 0u; i_1 < 8u; i_1++) { - uint param_47 = i_1; - image[int2(xy_uint + chunk_offset(param_47))] = rgba[i_1].w.x; + uint param_48 = i_1; + image[int2(xy_uint + chunk_offset(param_48))] = rgba[i_1].w.x; } } diff --git a/piet-gpu/shader/gen/kernel4_gray.msl b/piet-gpu/shader/gen/kernel4_gray.msl index 9647001..2b550b8 100644 --- a/piet-gpu/shader/gen/kernel4_gray.msl +++ b/piet-gpu/shader/gen/kernel4_gray.msl @@ -528,10 +528,10 @@ spvUnsafeArray fillImage(thread const uint2& xy, thread const CmdImag int2 uv = int2(xy + chunk_offset(param)) + cmd_img.offset; float4 fg_rgba = image_atlas.read(uint2(uv)); float3 param_1 = fg_rgba.xyz; - float3 _1721 = fromsRGB(param_1); - fg_rgba.x = _1721.x; - fg_rgba.y = _1721.y; - fg_rgba.z = _1721.z; + float3 _1697 = fromsRGB(param_1); + fg_rgba.x = _1697.x; + fg_rgba.y = _1697.y; + fg_rgba.z = _1697.z; rgba[i] = fg_rgba; } return rgba; @@ -985,12 +985,6 @@ float4 mix_compose(thread const float3& cb, thread const float3& cs, thread cons break; } case 13u: - { - float rev_as = 1.0 - as; - float rev_ab = 1.0 - ab; - return fast::max(float4(0.0), float4((cs * rev_as) + (cb * rev_ab), rev_as + rev_ab)); - } - case 14u: { return fast::min(float4(1.0), float4((cs * as) + (cb * ab), as + ab)); } @@ -1059,15 +1053,17 @@ CmdJump Cmd_Jump_read(thread const Alloc& a, thread const CmdRef& ref, device Me return CmdJump_read(param, param_1, v_297); } -kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1749 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1725 [[buffer(1)]], texture2d image [[texture(2)]], texture2d image_atlas [[texture(3)]], texture2d gradients [[texture(4)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { - uint tile_ix = (gl_WorkGroupID.y * _1749.conf.width_in_tiles) + gl_WorkGroupID.x; + uint tile_ix = (gl_WorkGroupID.y * _1725.conf.width_in_tiles) + gl_WorkGroupID.x; Alloc param; - param.offset = _1749.conf.ptcl_alloc.offset; + param.offset = _1725.conf.ptcl_alloc.offset; uint param_1 = tile_ix * 1024u; uint param_2 = 1024u; Alloc cmd_alloc = slice_mem(param, param_1, param_2); CmdRef cmd_ref = CmdRef{ cmd_alloc.offset }; + uint blend_offset = v_297.memory[cmd_ref.offset >> uint(2)]; + cmd_ref.offset += 4u; uint2 xy_uint = uint2(gl_LocalInvocationID.x + (16u * gl_WorkGroupID.x), gl_LocalInvocationID.y + (16u * gl_WorkGroupID.y)); float2 xy = float2(xy_uint); spvUnsafeArray rgba; @@ -1080,7 +1076,9 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 spvUnsafeArray df; TileSegRef tile_seg_ref; spvUnsafeArray area; - spvUnsafeArray, 128> blend_stack; + spvUnsafeArray, 4> blend_stack; + uint base_ix_1; + uint bg_rgba; while (mem_ok) { Alloc param_3 = cmd_alloc; @@ -1226,10 +1224,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x = int(round(fast::clamp(my_d, 0.0, 1.0) * 511.0)); float4 fg_rgba = gradients.read(uint2(int2(x, int(lin.index)))); float3 param_29 = fg_rgba.xyz; - float3 _2321 = fromsRGB(param_29); - fg_rgba.x = _2321.x; - fg_rgba.y = _2321.y; - fg_rgba.z = _2321.z; + float3 _2308 = fromsRGB(param_29); + fg_rgba.x = _2308.x; + fg_rgba.y = _2308.y; + fg_rgba.z = _2308.z; float4 fg_k_1 = fg_rgba * area[k_9]; rgba[k_9] = (rgba[k_9] * (1.0 - fg_k_1.w)) + fg_k_1; } @@ -1252,10 +1250,10 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 int x_1 = int(round(fast::clamp(t_2, 0.0, 1.0) * 511.0)); float4 fg_rgba_1 = gradients.read(uint2(int2(x_1, int(rad.index)))); float3 param_33 = fg_rgba_1.xyz; - float3 _2431 = fromsRGB(param_33); - fg_rgba_1.x = _2431.x; - fg_rgba_1.y = _2431.y; - fg_rgba_1.z = _2431.z; + float3 _2418 = fromsRGB(param_33); + fg_rgba_1.x = _2418.x; + fg_rgba_1.y = _2418.y; + fg_rgba_1.z = _2418.z; float4 fg_k_2 = fg_rgba_1 * area[k_10]; rgba[k_10] = (rgba[k_10] * (1.0 - fg_k_2.w)) + fg_k_2; } @@ -1281,13 +1279,26 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } case 9u: { - for (uint k_12 = 0u; k_12 < 8u; k_12++) + if (clip_depth < 4u) { - uint d_2 = min(clip_depth, 127u); - float4 param_38 = float4(rgba[k_12]); - uint _2537 = packsRGB(param_38); - blend_stack[d_2][k_12] = _2537; - rgba[k_12] = float4(0.0); + for (uint k_12 = 0u; k_12 < 8u; k_12++) + { + float4 param_38 = float4(rgba[k_12]); + uint _2523 = packsRGB(param_38); + blend_stack[clip_depth][k_12] = _2523; + rgba[k_12] = float4(0.0); + } + } + else + { + uint base_ix = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + for (uint k_13 = 0u; k_13 < 8u; k_13++) + { + float4 param_39 = float4(rgba[k_13]); + uint _2566 = packsRGB(param_39); + v_297.memory[base_ix + k_13] = _2566; + rgba[k_13] = float4(0.0); + } } clip_depth++; cmd_ref.offset += 4u; @@ -1295,31 +1306,40 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } case 10u: { - Alloc param_39 = cmd_alloc; - CmdRef param_40 = cmd_ref; - CmdEndClip end_clip = Cmd_EndClip_read(param_39, param_40, v_297); - uint blend_mode = end_clip.blend >> uint(8); - uint comp_mode = end_clip.blend & 255u; + Alloc param_40 = cmd_alloc; + CmdRef param_41 = cmd_ref; + CmdEndClip end_clip = Cmd_EndClip_read(param_40, param_41, v_297); clip_depth--; - for (uint k_13 = 0u; k_13 < 8u; k_13++) + if (clip_depth < 4u) { - uint d_3 = min(clip_depth, 127u); - uint param_41 = blend_stack[d_3][k_13]; - float4 bg = unpacksRGB(param_41); - float4 fg_1 = rgba[k_13] * area[k_13]; - float4 param_42 = bg; - float4 param_43 = fg_1; - uint param_44 = end_clip.blend; - rgba[k_13] = mix_blend_compose(param_42, param_43, param_44); + base_ix_1 = ((blend_offset >> uint(2)) + (((clip_depth - 4u) * 16u) * 16u)) + (8u * (gl_LocalInvocationID.x + (8u * gl_LocalInvocationID.y))); + } + for (uint k_14 = 0u; k_14 < 8u; k_14++) + { + if (clip_depth < 4u) + { + bg_rgba = blend_stack[clip_depth][k_14]; + } + else + { + bg_rgba = v_297.memory[base_ix_1 + k_14]; + } + uint param_42 = bg_rgba; + float4 bg = unpacksRGB(param_42); + float4 fg_1 = rgba[k_14] * area[k_14]; + float4 param_43 = bg; + float4 param_44 = fg_1; + uint param_45 = end_clip.blend; + rgba[k_14] = mix_blend_compose(param_43, param_44, param_45); } cmd_ref.offset += 8u; break; } case 11u: { - Alloc param_45 = cmd_alloc; - CmdRef param_46 = cmd_ref; - cmd_ref = CmdRef{ Cmd_Jump_read(param_45, param_46, v_297).new_ref }; + Alloc param_46 = cmd_alloc; + CmdRef param_47 = cmd_ref; + cmd_ref = CmdRef{ Cmd_Jump_read(param_46, param_47, v_297).new_ref }; cmd_alloc.offset = cmd_ref.offset; break; } @@ -1327,8 +1347,8 @@ kernel void main0(device Memory& v_297 [[buffer(0)]], const device ConfigBuf& _1 } for (uint i_1 = 0u; i_1 < 8u; i_1++) { - uint param_47 = i_1; - image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_47)))); + uint param_48 = i_1; + image.write(float4(rgba[i_1].w), uint2(int2(xy_uint + chunk_offset(param_48)))); } } diff --git a/piet-gpu/shader/gen/kernel4_gray.spv b/piet-gpu/shader/gen/kernel4_gray.spv index 2dd46c0..bacd9a8 100644 Binary files a/piet-gpu/shader/gen/kernel4_gray.spv and b/piet-gpu/shader/gen/kernel4_gray.spv differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index a0710d2..c9b5dd3 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -87,11 +87,14 @@ void main() { Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC); CmdRef cmd_ref = CmdRef(cmd_alloc.offset); + uint blend_offset = memory[cmd_ref.offset >> 2]; + cmd_ref.offset += 4; + uvec2 xy_uint = uvec2(gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_WorkGroupID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y); vec2 xy = vec2(xy_uint); mediump vec4 rgba[CHUNK]; - uint blend_stack[MAX_BLEND_STACK][CHUNK]; + uint blend_stack[BLEND_STACK_SPLIT][CHUNK]; for (uint i = 0; i < CHUNK; i++) { rgba[i] = vec4(0.0); } @@ -223,24 +226,38 @@ void main() { cmd_ref.offset += 4 + CmdImage_size; break; case Cmd_BeginClip: - for (uint k = 0; k < CHUNK; k++) { - // We reject any inputs that might overflow in render_ctx.rs. - // The following is a sanity check so we don't corrupt memory should there be malformed inputs. - uint d = min(clip_depth, MAX_BLEND_STACK - 1); - blend_stack[d][k] = packsRGB(vec4(rgba[k])); - rgba[k] = vec4(0.0); + if (clip_depth < BLEND_STACK_SPLIT) { + for (uint k = 0; k < CHUNK; k++) { + blend_stack[clip_depth][k] = packsRGB(vec4(rgba[k])); + rgba[k] = vec4(0.0); + } + } else { + uint base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX + + CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y); + for (uint k = 0; k < CHUNK; k++) { + memory[base_ix + k] = packsRGB(vec4(rgba[k])); + rgba[k] = vec4(0.0); + } } clip_depth++; cmd_ref.offset += 4; break; case Cmd_EndClip: CmdEndClip end_clip = Cmd_EndClip_read(cmd_alloc, cmd_ref); - uint blend_mode = uint(end_clip.blend >> 8); - uint comp_mode = uint(end_clip.blend & 0xFF); clip_depth--; + uint base_ix; + if (clip_depth < BLEND_STACK_SPLIT) { + base_ix = (blend_offset >> 2) + (clip_depth - BLEND_STACK_SPLIT) * TILE_HEIGHT_PX * TILE_WIDTH_PX + + CHUNK * (gl_LocalInvocationID.x + CHUNK_DX * gl_LocalInvocationID.y); + } for (uint k = 0; k < CHUNK; k++) { - uint d = min(clip_depth, MAX_BLEND_STACK - 1); - mediump vec4 bg = unpacksRGB(blend_stack[d][k]); + uint bg_rgba; + if (clip_depth < BLEND_STACK_SPLIT) { + bg_rgba = blend_stack[clip_depth][k]; + } else { + bg_rgba = memory[base_ix + k]; + } + mediump vec4 bg = unpacksRGB(bg_rgba); mediump vec4 fg = rgba[k] * area[k]; rgba[k] = mix_blend_compose(bg, fg, end_clip.blend); } diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index ec17188..21206e5 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -27,6 +27,10 @@ #define GRADIENT_WIDTH 512 +// We allocate this many blend stack entries in registers, and spill +// to memory for the overflow. +#define BLEND_STACK_SPLIT 4 + #ifdef ERR_MALLOC_FAILED struct Config { uint n_elements; // paths @@ -91,7 +95,7 @@ struct Config { #define MODE_STROKE 1 // Size of kernel4 clip state, in words. -#define CLIP_STATE_SIZE 2 +#define CLIP_STATE_SIZE 1 // fill_mode_from_flags extracts the fill mode from tag flags. uint fill_mode_from_flags(uint flags) { diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index d32a9c5..8915de4 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -34,8 +34,6 @@ const TILE_H: usize = 16; const PTCL_INITIAL_ALLOC: usize = 1024; -const MAX_BLEND_STACK: usize = 128; - #[allow(unused)] fn dump_scene(buf: &[u8]) { for i in 0..(buf.len() / 4) { diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index dca03eb..f78e8ab 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -2,7 +2,6 @@ use std::borrow::Cow; use crate::encoder::GlyphEncoder; use crate::stages::{Config, Transform}; -use crate::MAX_BLEND_STACK; use piet::kurbo::{Affine, Insets, PathEl, Point, Rect, Shape}; use piet::{ Color, Error, FixedGradient, ImageFormat, InterpolationMode, IntoBrush, RenderContext, @@ -230,9 +229,6 @@ impl RenderContext for PietGpuRenderContext { let path = shape.path_elements(TOLERANCE); self.encode_path(path, true); self.new_encoder.begin_clip(None); - if self.clip_stack.len() >= MAX_BLEND_STACK { - panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK); - } self.clip_stack.push(ClipElement { blend: None }); if let Some(tos) = self.state_stack.last_mut() { tos.n_clip += 1; @@ -334,9 +330,6 @@ impl PietGpuRenderContext { let path = shape.path_elements(TOLERANCE); self.encode_path(path, true); self.new_encoder.begin_clip(Some(blend)); - if self.clip_stack.len() >= MAX_BLEND_STACK { - panic!("Maximum clip/blend stack size {} exceeded", MAX_BLEND_STACK); - } self.clip_stack.push(ClipElement { blend: Some(blend) }); if let Some(tos) = self.state_stack.last_mut() { tos.n_clip += 1;