diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 6f225d9..c8b4858 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -59,10 +59,22 @@ build gen/transform_leaf.dxil: dxil gen/transform_leaf.hlsl build gen/transform_leaf.msl: msl gen/transform_leaf.spv build gen/pathtag_reduce.spv: glsl pathtag_reduce.comp | pathtag.h setup.h mem.h +build gen/pathtag_reduce.hlsl: hlsl gen/pathtag_reduce.spv +build gen/pathtag_reduce.dxil: dxil gen/pathtag_reduce.hlsl +build gen/pathtag_reduce.msl: msl gen/pathtag_reduce.spv build gen/pathtag_root.spv: glsl pathtag_scan.comp | pathtag.h flags = -DROOT +build gen/pathtag_root.hlsl: hlsl gen/pathtag_root.spv +build gen/pathtag_root.dxil: dxil gen/pathtag_root.hlsl +build gen/pathtag_root.msl: msl gen/pathtag_root.spv build gen/bbox_clear.spv: glsl bbox_clear.comp | setup.h mem.h +build gen/bbox_clear.hlsl: hlsl gen/bbox_clear.spv +build gen/bbox_clear.dxil: dxil gen/bbox_clear.hlsl +build gen/bbox_clear.msl: msl gen/bbox_clear.spv build gen/pathseg.spv: glsl pathseg.comp | tile.h pathseg.h pathtag.h setup.h mem.h +build gen/pathseg.hlsl: hlsl gen/pathseg.spv +build gen/pathseg.dxil: dxil gen/pathseg.hlsl +build gen/pathseg.msl: msl gen/pathseg.spv diff --git a/piet-gpu/shader/gen/bbox_clear.dxil b/piet-gpu/shader/gen/bbox_clear.dxil new file mode 100644 index 0000000..8a46725 Binary files /dev/null and b/piet-gpu/shader/gen/bbox_clear.dxil differ diff --git a/piet-gpu/shader/gen/bbox_clear.hlsl b/piet-gpu/shader/gen/bbox_clear.hlsl new file mode 100644 index 0000000..ae40b13 --- /dev/null +++ b/piet-gpu/shader/gen/bbox_clear.hlsl @@ -0,0 +1,55 @@ +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +ByteAddressBuffer _21 : register(t1); +RWByteAddressBuffer _44 : register(u0); + +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _21.Load(0)) + { + uint out_ix = (_21.Load(40) >> uint(2)) + (4u * ix); + _44.Store(out_ix * 4 + 8, 65535u); + _44.Store((out_ix + 1u) * 4 + 8, 65535u); + _44.Store((out_ix + 2u) * 4 + 8, 0u); + _44.Store((out_ix + 3u) * 4 + 8, 0u); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/bbox_clear.msl b/piet-gpu/shader/gen/bbox_clear.msl new file mode 100644 index 0000000..f424448 --- /dev/null +++ b/piet-gpu/shader/gen/bbox_clear.msl @@ -0,0 +1,57 @@ +#include +#include + +using namespace metal; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +kernel void main0(device Memory& _44 [[buffer(0)]], const device ConfigBuf& _21 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]]) +{ + uint ix = gl_GlobalInvocationID.x; + if (ix < _21.conf.n_elements) + { + uint out_ix = (_21.conf.bbox_alloc.offset >> uint(2)) + (4u * ix); + _44.memory[out_ix] = 65535u; + _44.memory[out_ix + 1u] = 65535u; + _44.memory[out_ix + 2u] = 0u; + _44.memory[out_ix + 3u] = 0u; + } +} + diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil new file mode 100644 index 0000000..5ad35e7 Binary files /dev/null and b/piet-gpu/shader/gen/pathseg.dxil differ diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl new file mode 100644 index 0000000..40e60cd --- /dev/null +++ b/piet-gpu/shader/gen/pathseg.hlsl @@ -0,0 +1,643 @@ +struct Alloc +{ + uint offset; +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct Monoid +{ + float4 bbox; + uint flags; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const TagMonoid _135 = { 0u, 0u, 0u, 0u, 0u }; +static const Monoid _567 = { 0.0f.xxxx, 0u }; + +RWByteAddressBuffer _111 : register(u0); +ByteAddressBuffer _574 : register(t2); +ByteAddressBuffer _639 : register(t1); +ByteAddressBuffer _710 : register(t3); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared TagMonoid sh_tag[512]; +groupshared Monoid sh_scratch[512]; + +TagMonoid reduce_tag(uint tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(countbits((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(countbits(tag_word & 1077952576u))); + c.path_ix = uint(int(countbits(tag_word & 269488144u))); + c.trans_ix = uint(int(countbits(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +TagMonoid tag_monoid_identity() +{ + return _135; +} + +float2 read_f32_point(uint ix) +{ + float x = asfloat(_574.Load(ix * 4 + 0)); + float y = asfloat(_574.Load((ix + 1u) * 4 + 0)); + return float2(x, y); +} + +float2 read_i16_point(uint ix) +{ + uint raw = _574.Load(ix * 4 + 0); + float x = float(int(raw << uint(16)) >> 16); + float y = float(int(raw) >> 16); + return float2(x, y); +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +uint read_mem(Alloc alloc, uint offset) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = _111.Load(offset * 4 + 8); + return v; +} + +TransformSeg TransformSeg_read(Alloc a, TransformSegRef ref) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11); + TransformSeg s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _111.Store(offset * 4 + 8, val); +} + +void PathCubic_write(Alloc a, PathCubicRef ref, PathCubic s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.p0.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.p0.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.p1.x); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.p1.y); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.p2.x); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.p2.y); + write_mem(param_15, param_16, param_17); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = asuint(s.p3.x); + write_mem(param_18, param_19, param_20); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = asuint(s.p3.y); + write_mem(param_21, param_22, param_23); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = s.path_ix; + write_mem(param_24, param_25, param_26); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = s.trans_ix; + write_mem(param_27, param_28, param_29); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = asuint(s.stroke.x); + write_mem(param_30, param_31, param_32); + Alloc param_33 = a; + uint param_34 = ix + 11u; + uint param_35 = asuint(s.stroke.y); + write_mem(param_33, param_34, param_35); +} + +void PathSeg_Cubic_write(Alloc a, PathSegRef ref, uint flags, PathCubic s) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = (flags << uint(16)) | 1u; + write_mem(param, param_1, param_2); + PathCubicRef _458 = { ref.offset + 4u }; + Alloc param_3 = a; + PathCubicRef param_4 = _458; + PathCubic param_5 = s; + PathCubic_write(param_3, param_4, param_5); +} + +Monoid combine_monoid(Monoid a, Monoid b) +{ + Monoid c; + c.bbox = b.bbox; + bool _472 = (a.flags & 1u) == 0u; + bool _480; + if (_472) + { + _480 = b.bbox.z <= b.bbox.x; + } + else + { + _480 = _472; + } + bool _488; + if (_480) + { + _488 = b.bbox.w <= b.bbox.y; + } + else + { + _488 = _480; + } + if (_488) + { + c.bbox = a.bbox; + } + else + { + bool _498 = (a.flags & 1u) == 0u; + bool _505; + if (_498) + { + _505 = (b.flags & 2u) == 0u; + } + else + { + _505 = _498; + } + bool _522; + if (_505) + { + bool _512 = a.bbox.z > a.bbox.x; + bool _521; + if (!_512) + { + _521 = a.bbox.w > a.bbox.y; + } + else + { + _521 = _512; + } + _522 = _521; + } + else + { + _522 = _505; + } + if (_522) + { + float4 _529 = c.bbox; + float2 _531 = min(a.bbox.xy, _529.xy); + c.bbox.x = _531.x; + c.bbox.y = _531.y; + float4 _540 = c.bbox; + float2 _542 = max(a.bbox.zw, _540.zw); + c.bbox.z = _542.x; + c.bbox.w = _542.y; + } + } + c.flags = (a.flags & 2u) | b.flags; + c.flags |= ((a.flags & 1u) << uint(1)); + return c; +} + +Monoid monoid_identity() +{ + return _567; +} + +uint round_down(float x) +{ + return uint(max(0.0f, floor(x) + 32768.0f)); +} + +uint round_up(float x) +{ + return uint(min(65535.0f, ceil(x) + 32768.0f)); +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 4u; + uint tag_word = _574.Load(((_639.Load(52) >> uint(2)) + (ix >> uint(2))) * 4 + 0); + uint param = tag_word; + TagMonoid local_tm = reduce_tag(param); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i = 0u; i < 9u; i++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i)) + { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + TagMonoid param_1 = other; + TagMonoid param_2 = local_tm; + local_tm = combine_tag_monoid(param_1, param_2); + } + GroupMemoryBarrierWithGroupSync(); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + GroupMemoryBarrierWithGroupSync(); + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + TagMonoid _716; + _716.trans_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 0); + _716.linewidth_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 4); + _716.pathseg_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 8); + _716.path_ix = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 12); + _716.pathseg_offset = _710.Load((gl_WorkGroupID.x - 1u) * 20 + 16); + tm.trans_ix = _716.trans_ix; + tm.linewidth_ix = _716.linewidth_ix; + tm.pathseg_ix = _716.pathseg_ix; + tm.path_ix = _716.path_ix; + tm.pathseg_offset = _716.pathseg_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + TagMonoid param_3 = tm; + TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; + tm = combine_tag_monoid(param_3, param_4); + } + uint ps_ix = (_639.Load(60) >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.Load(56) >> uint(2)) + tm.linewidth_ix; + uint save_path_ix = tm.path_ix; + TransformSegRef _769 = { _639.Load(36) + (tm.trans_ix * 24u) }; + TransformSegRef trans_ref = _769; + PathSegRef _779 = { _639.Load(28) + (tm.pathseg_ix * 52u) }; + PathSegRef ps_ref = _779; + float2 p0; + float2 p1; + float2 p2; + float2 p3; + Alloc param_13; + Monoid local[4]; + PathCubic cubic; + Alloc param_15; + for (uint i_1 = 0u; i_1 < 4u; i_1++) + { + uint tag_byte = tag_word >> (i_1 * 8u); + uint seg_type = tag_byte & 3u; + if (seg_type != 0u) + { + if ((tag_byte & 8u) != 0u) + { + uint param_5 = ps_ix; + p0 = read_f32_point(param_5); + uint param_6 = ps_ix + 2u; + p1 = read_f32_point(param_6); + if (seg_type >= 2u) + { + uint param_7 = ps_ix + 4u; + p2 = read_f32_point(param_7); + if (seg_type == 3u) + { + uint param_8 = ps_ix + 6u; + p3 = read_f32_point(param_8); + } + } + } + else + { + uint param_9 = ps_ix; + p0 = read_i16_point(param_9); + uint param_10 = ps_ix + 1u; + p1 = read_i16_point(param_10); + if (seg_type >= 2u) + { + uint param_11 = ps_ix + 2u; + p2 = read_i16_point(param_11); + if (seg_type == 3u) + { + uint param_12 = ps_ix + 3u; + p3 = read_i16_point(param_12); + } + } + } + float linewidth = asfloat(_574.Load(lw_ix * 4 + 0)); + Alloc _865; + _865.offset = _639.Load(36); + param_13.offset = _865.offset; + TransformSegRef param_14 = trans_ref; + TransformSeg transform = TransformSeg_read(param_13, param_14); + p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; + p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate; + float4 bbox = float4(min(p0, p1), max(p0, p1)); + if (seg_type >= 2u) + { + p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; + float4 _935 = bbox; + float2 _938 = min(_935.xy, p2); + bbox.x = _938.x; + bbox.y = _938.y; + float4 _943 = bbox; + float2 _946 = max(_943.zw, p2); + bbox.z = _946.x; + bbox.w = _946.y; + if (seg_type == 3u) + { + p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; + float4 _971 = bbox; + float2 _974 = min(_971.xy, p3); + bbox.x = _974.x; + bbox.y = _974.y; + float4 _979 = bbox; + float2 _982 = max(_979.zw, p3); + bbox.z = _982.x; + bbox.w = _982.y; + } + else + { + p3 = p2; + p2 = lerp(p1, p2, 0.3333333432674407958984375f.xx); + p1 = lerp(p1, p0, 0.3333333432674407958984375f.xx); + } + } + else + { + p3 = p1; + p2 = lerp(p3, p0, 0.3333333432674407958984375f.xx); + p1 = lerp(p0, p3, 0.3333333432674407958984375f.xx); + } + float2 stroke = 0.0f.xx; + if (linewidth >= 0.0f) + { + stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5f * linewidth); + bbox += float4(-stroke, stroke); + } + local[i_1].bbox = bbox; + local[i_1].flags = 0u; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth >= 0.0f); + Alloc _1071; + _1071.offset = _639.Load(28); + param_15.offset = _1071.offset; + PathSegRef param_16 = ps_ref; + uint param_17 = fill_mode; + PathCubic param_18 = cubic; + PathSeg_Cubic_write(param_15, param_16, param_17, param_18); + ps_ref.offset += 52u; + uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); + uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); + ps_ix += n_words; + } + else + { + local[i_1].bbox = 0.0f.xxxx; + uint is_path = (tag_byte >> uint(4)) & 1u; + local[i_1].flags = is_path; + tm.path_ix += is_path; + trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u); + lw_ix += ((tag_byte >> uint(6)) & 1u); + } + } + Monoid agg = local[0]; + for (uint i_2 = 1u; i_2 < 4u; i_2++) + { + Monoid param_19 = agg; + Monoid param_20 = local[i_2]; + agg = combine_monoid(param_19, param_20); + local[i_2] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_3 = 0u; i_3 < 9u; i_3++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_3)) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)]; + Monoid param_21 = other_1; + Monoid param_22 = agg; + agg = combine_monoid(param_21, param_22); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + uint path_ix = save_path_ix; + uint bbox_out_ix = (_639.Load(40) >> uint(2)) + (path_ix * 4u); + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_4 = 0u; i_4 < 4u; i_4++) + { + Monoid param_23 = row; + Monoid param_24 = local[i_4]; + Monoid m = combine_monoid(param_23, param_24); + bool do_atomic = false; + bool _1241 = i_4 == 3u; + bool _1248; + if (_1241) + { + _1248 = gl_LocalInvocationID.x == 511u; + } + else + { + _1248 = _1241; + } + if (_1248) + { + do_atomic = true; + } + if ((m.flags & 1u) != 0u) + { + if ((m.flags & 2u) == 0u) + { + do_atomic = true; + } + else + { + float param_25 = m.bbox.x; + _111.Store(bbox_out_ix * 4 + 8, round_down(param_25)); + float param_26 = m.bbox.y; + _111.Store((bbox_out_ix + 1u) * 4 + 8, round_down(param_26)); + float param_27 = m.bbox.z; + _111.Store((bbox_out_ix + 2u) * 4 + 8, round_up(param_27)); + float param_28 = m.bbox.w; + _111.Store((bbox_out_ix + 3u) * 4 + 8, round_up(param_28)); + bbox_out_ix += 4u; + do_atomic = false; + } + } + if (do_atomic) + { + bool _1300 = m.bbox.z > m.bbox.x; + bool _1309; + if (!_1300) + { + _1309 = m.bbox.w > m.bbox.y; + } + else + { + _1309 = _1300; + } + if (_1309) + { + float param_29 = m.bbox.x; + uint _1318; + _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1318); + float param_30 = m.bbox.y; + uint _1326; + _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1326); + float param_31 = m.bbox.z; + uint _1334; + _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1334); + float param_32 = m.bbox.w; + uint _1342; + _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1342); + } + bbox_out_ix += 4u; + } + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl new file mode 100644 index 0000000..25d001f --- /dev/null +++ b/piet-gpu/shader/gen/pathseg.msl @@ -0,0 +1,699 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" +#pragma clang diagnostic ignored "-Wunused-variable" + +#include +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct PathCubicRef +{ + uint offset; +}; + +struct PathCubic +{ + float2 p0; + float2 p1; + float2 p2; + float2 p3; + uint path_ix; + uint trans_ix; + float2 stroke; +}; + +struct PathSegRef +{ + uint offset; +}; + +struct Monoid +{ + float4 bbox; + uint flags; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + Alloc_1 bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct ParentBuf +{ + TagMonoid_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid reduce_tag(thread const uint& tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); + c.path_ix = uint(int(popcount(tag_word & 269488144u))); + c.trans_ix = uint(int(popcount(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid tag_monoid_identity() +{ + return TagMonoid{ 0u, 0u, 0u, 0u, 0u }; +} + +static inline __attribute__((always_inline)) +float2 read_f32_point(thread const uint& ix, const device SceneBuf& v_574) +{ + float x = as_type(v_574.scene[ix]); + float y = as_type(v_574.scene[ix + 1u]); + return float2(x, y); +} + +static inline __attribute__((always_inline)) +float2 read_i16_point(thread const uint& ix, const device SceneBuf& v_574) +{ + uint raw = v_574.scene[ix]; + float x = float(int(raw << uint(16)) >> 16); + float y = float(int(raw) >> 16); + return float2(x, y); +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +uint read_mem(thread const Alloc& alloc, thread const uint& offset, device Memory& v_111) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return 0u; + } + uint v = v_111.memory[offset]; + return v; +} + +static inline __attribute__((always_inline)) +TransformSeg TransformSeg_read(thread const Alloc& a, thread const TransformSegRef& ref, device Memory& v_111) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint raw0 = read_mem(param, param_1, v_111); + Alloc param_2 = a; + uint param_3 = ix + 1u; + uint raw1 = read_mem(param_2, param_3, v_111); + Alloc param_4 = a; + uint param_5 = ix + 2u; + uint raw2 = read_mem(param_4, param_5, v_111); + Alloc param_6 = a; + uint param_7 = ix + 3u; + uint raw3 = read_mem(param_6, param_7, v_111); + Alloc param_8 = a; + uint param_9 = ix + 4u; + uint raw4 = read_mem(param_8, param_9, v_111); + Alloc param_10 = a; + uint param_11 = ix + 5u; + uint raw5 = read_mem(param_10, param_11, v_111); + TransformSeg s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_111) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_111.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void PathCubic_write(thread const Alloc& a, thread const PathCubicRef& ref, thread const PathCubic& s, device Memory& v_111) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.p0.x); + write_mem(param, param_1, param_2, v_111); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.p0.y); + write_mem(param_3, param_4, param_5, v_111); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.p1.x); + write_mem(param_6, param_7, param_8, v_111); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.p1.y); + write_mem(param_9, param_10, param_11, v_111); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.p2.x); + write_mem(param_12, param_13, param_14, v_111); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.p2.y); + write_mem(param_15, param_16, param_17, v_111); + Alloc param_18 = a; + uint param_19 = ix + 6u; + uint param_20 = as_type(s.p3.x); + write_mem(param_18, param_19, param_20, v_111); + Alloc param_21 = a; + uint param_22 = ix + 7u; + uint param_23 = as_type(s.p3.y); + write_mem(param_21, param_22, param_23, v_111); + Alloc param_24 = a; + uint param_25 = ix + 8u; + uint param_26 = s.path_ix; + write_mem(param_24, param_25, param_26, v_111); + Alloc param_27 = a; + uint param_28 = ix + 9u; + uint param_29 = s.trans_ix; + write_mem(param_27, param_28, param_29, v_111); + Alloc param_30 = a; + uint param_31 = ix + 10u; + uint param_32 = as_type(s.stroke.x); + write_mem(param_30, param_31, param_32, v_111); + Alloc param_33 = a; + uint param_34 = ix + 11u; + uint param_35 = as_type(s.stroke.y); + write_mem(param_33, param_34, param_35, v_111); +} + +static inline __attribute__((always_inline)) +void PathSeg_Cubic_write(thread const Alloc& a, thread const PathSegRef& ref, thread const uint& flags, thread const PathCubic& s, device Memory& v_111) +{ + Alloc param = a; + uint param_1 = ref.offset >> uint(2); + uint param_2 = (flags << uint(16)) | 1u; + write_mem(param, param_1, param_2, v_111); + Alloc param_3 = a; + PathCubicRef param_4 = PathCubicRef{ ref.offset + 4u }; + PathCubic param_5 = s; + PathCubic_write(param_3, param_4, param_5, v_111); +} + +static inline __attribute__((always_inline)) +Monoid combine_monoid(thread const Monoid& a, thread const Monoid& b) +{ + Monoid c; + c.bbox = b.bbox; + bool _472 = (a.flags & 1u) == 0u; + bool _480; + if (_472) + { + _480 = b.bbox.z <= b.bbox.x; + } + else + { + _480 = _472; + } + bool _488; + if (_480) + { + _488 = b.bbox.w <= b.bbox.y; + } + else + { + _488 = _480; + } + if (_488) + { + c.bbox = a.bbox; + } + else + { + bool _498 = (a.flags & 1u) == 0u; + bool _505; + if (_498) + { + _505 = (b.flags & 2u) == 0u; + } + else + { + _505 = _498; + } + bool _522; + if (_505) + { + bool _512 = a.bbox.z > a.bbox.x; + bool _521; + if (!_512) + { + _521 = a.bbox.w > a.bbox.y; + } + else + { + _521 = _512; + } + _522 = _521; + } + else + { + _522 = _505; + } + if (_522) + { + float4 _529 = c.bbox; + float2 _531 = fast::min(a.bbox.xy, _529.xy); + c.bbox.x = _531.x; + c.bbox.y = _531.y; + float4 _540 = c.bbox; + float2 _542 = fast::max(a.bbox.zw, _540.zw); + c.bbox.z = _542.x; + c.bbox.w = _542.y; + } + } + c.flags = (a.flags & 2u) | b.flags; + c.flags |= ((a.flags & 1u) << uint(1)); + return c; +} + +static inline __attribute__((always_inline)) +Monoid monoid_identity() +{ + return Monoid{ float4(0.0), 0u }; +} + +static inline __attribute__((always_inline)) +uint round_down(thread const float& x) +{ + return uint(fast::max(0.0, floor(x) + 32768.0)); +} + +static inline __attribute__((always_inline)) +uint round_up(thread const float& x) +{ + return uint(fast::min(65535.0, ceil(x) + 32768.0)); +} + +kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _710 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup TagMonoid sh_tag[512]; + threadgroup Monoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 4u; + uint tag_word = v_574.scene[(_639.conf.pathtag_offset >> uint(2)) + (ix >> uint(2))]; + uint param = tag_word; + TagMonoid local_tm = reduce_tag(param); + sh_tag[gl_LocalInvocationID.x] = local_tm; + for (uint i = 0u; i < 9u; i++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i)) + { + TagMonoid other = sh_tag[gl_LocalInvocationID.x - (1u << i)]; + TagMonoid param_1 = other; + TagMonoid param_2 = local_tm; + local_tm = combine_tag_monoid(param_1, param_2); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_tag[gl_LocalInvocationID.x] = local_tm; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + TagMonoid tm = tag_monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _713 = gl_WorkGroupID.x - 1u; + tm.trans_ix = _710.parent[_713].trans_ix; + tm.linewidth_ix = _710.parent[_713].linewidth_ix; + tm.pathseg_ix = _710.parent[_713].pathseg_ix; + tm.path_ix = _710.parent[_713].path_ix; + tm.pathseg_offset = _710.parent[_713].pathseg_offset; + } + if (gl_LocalInvocationID.x > 0u) + { + TagMonoid param_3 = tm; + TagMonoid param_4 = sh_tag[gl_LocalInvocationID.x - 1u]; + tm = combine_tag_monoid(param_3, param_4); + } + uint ps_ix = (_639.conf.pathseg_offset >> uint(2)) + tm.pathseg_offset; + uint lw_ix = (_639.conf.linewidth_offset >> uint(2)) + tm.linewidth_ix; + uint save_path_ix = tm.path_ix; + TransformSegRef trans_ref = TransformSegRef{ _639.conf.trans_alloc.offset + (tm.trans_ix * 24u) }; + PathSegRef ps_ref = PathSegRef{ _639.conf.pathseg_alloc.offset + (tm.pathseg_ix * 52u) }; + float2 p0; + float2 p1; + float2 p2; + float2 p3; + Alloc param_13; + spvUnsafeArray local; + PathCubic cubic; + Alloc param_15; + for (uint i_1 = 0u; i_1 < 4u; i_1++) + { + uint tag_byte = tag_word >> (i_1 * 8u); + uint seg_type = tag_byte & 3u; + if (seg_type != 0u) + { + if ((tag_byte & 8u) != 0u) + { + uint param_5 = ps_ix; + p0 = read_f32_point(param_5, v_574); + uint param_6 = ps_ix + 2u; + p1 = read_f32_point(param_6, v_574); + if (seg_type >= 2u) + { + uint param_7 = ps_ix + 4u; + p2 = read_f32_point(param_7, v_574); + if (seg_type == 3u) + { + uint param_8 = ps_ix + 6u; + p3 = read_f32_point(param_8, v_574); + } + } + } + else + { + uint param_9 = ps_ix; + p0 = read_i16_point(param_9, v_574); + uint param_10 = ps_ix + 1u; + p1 = read_i16_point(param_10, v_574); + if (seg_type >= 2u) + { + uint param_11 = ps_ix + 2u; + p2 = read_i16_point(param_11, v_574); + if (seg_type == 3u) + { + uint param_12 = ps_ix + 3u; + p3 = read_i16_point(param_12, v_574); + } + } + } + float linewidth = as_type(v_574.scene[lw_ix]); + param_13.offset = _639.conf.trans_alloc.offset; + TransformSegRef param_14 = trans_ref; + TransformSeg transform = TransformSeg_read(param_13, param_14, v_111); + p0 = ((transform.mat.xy * p0.x) + (transform.mat.zw * p0.y)) + transform.translate; + p1 = ((transform.mat.xy * p1.x) + (transform.mat.zw * p1.y)) + transform.translate; + float4 bbox = float4(fast::min(p0, p1), fast::max(p0, p1)); + if (seg_type >= 2u) + { + p2 = ((transform.mat.xy * p2.x) + (transform.mat.zw * p2.y)) + transform.translate; + float4 _935 = bbox; + float2 _938 = fast::min(_935.xy, p2); + bbox.x = _938.x; + bbox.y = _938.y; + float4 _943 = bbox; + float2 _946 = fast::max(_943.zw, p2); + bbox.z = _946.x; + bbox.w = _946.y; + if (seg_type == 3u) + { + p3 = ((transform.mat.xy * p3.x) + (transform.mat.zw * p3.y)) + transform.translate; + float4 _971 = bbox; + float2 _974 = fast::min(_971.xy, p3); + bbox.x = _974.x; + bbox.y = _974.y; + float4 _979 = bbox; + float2 _982 = fast::max(_979.zw, p3); + bbox.z = _982.x; + bbox.w = _982.y; + } + else + { + p3 = p2; + p2 = mix(p1, p2, float2(0.3333333432674407958984375)); + p1 = mix(p1, p0, float2(0.3333333432674407958984375)); + } + } + else + { + p3 = p1; + p2 = mix(p3, p0, float2(0.3333333432674407958984375)); + p1 = mix(p0, p3, float2(0.3333333432674407958984375)); + } + float2 stroke = float2(0.0); + if (linewidth >= 0.0) + { + stroke = float2(length(transform.mat.xz), length(transform.mat.yw)) * (0.5 * linewidth); + bbox += float4(-stroke, stroke); + } + local[i_1].bbox = bbox; + local[i_1].flags = 0u; + cubic.p0 = p0; + cubic.p1 = p1; + cubic.p2 = p2; + cubic.p3 = p3; + cubic.path_ix = tm.path_ix; + cubic.trans_ix = (gl_GlobalInvocationID.x * 4u) + i_1; + cubic.stroke = stroke; + uint fill_mode = uint(linewidth >= 0.0); + param_15.offset = _639.conf.pathseg_alloc.offset; + PathSegRef param_16 = ps_ref; + uint param_17 = fill_mode; + PathCubic param_18 = cubic; + PathSeg_Cubic_write(param_15, param_16, param_17, param_18, v_111); + ps_ref.offset += 52u; + uint n_points = (tag_byte & 3u) + ((tag_byte >> uint(2)) & 1u); + uint n_words = n_points + (n_points & (((tag_byte >> uint(3)) & 1u) * 15u)); + ps_ix += n_words; + } + else + { + local[i_1].bbox = float4(0.0); + uint is_path = (tag_byte >> uint(4)) & 1u; + local[i_1].flags = is_path; + tm.path_ix += is_path; + trans_ref.offset += (((tag_byte >> uint(5)) & 1u) * 24u); + lw_ix += ((tag_byte >> uint(6)) & 1u); + } + } + Monoid agg = local[0]; + for (uint i_2 = 1u; i_2 < 4u; i_2++) + { + Monoid param_19 = agg; + Monoid param_20 = local[i_2]; + agg = combine_monoid(param_19, param_20); + local[i_2] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_3 = 0u; i_3 < 9u; i_3++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_3)) + { + Monoid other_1 = sh_scratch[gl_LocalInvocationID.x - (1u << i_3)]; + Monoid param_21 = other_1; + Monoid param_22 = agg; + agg = combine_monoid(param_21, param_22); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + uint path_ix = save_path_ix; + uint bbox_out_ix = (_639.conf.bbox_alloc.offset >> uint(2)) + (path_ix * 4u); + Monoid row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_4 = 0u; i_4 < 4u; i_4++) + { + Monoid param_23 = row; + Monoid param_24 = local[i_4]; + Monoid m = combine_monoid(param_23, param_24); + bool do_atomic = false; + bool _1241 = i_4 == 3u; + bool _1248; + if (_1241) + { + _1248 = gl_LocalInvocationID.x == 511u; + } + else + { + _1248 = _1241; + } + if (_1248) + { + do_atomic = true; + } + if ((m.flags & 1u) != 0u) + { + if ((m.flags & 2u) == 0u) + { + do_atomic = true; + } + else + { + float param_25 = m.bbox.x; + v_111.memory[bbox_out_ix] = round_down(param_25); + float param_26 = m.bbox.y; + v_111.memory[bbox_out_ix + 1u] = round_down(param_26); + float param_27 = m.bbox.z; + v_111.memory[bbox_out_ix + 2u] = round_up(param_27); + float param_28 = m.bbox.w; + v_111.memory[bbox_out_ix + 3u] = round_up(param_28); + bbox_out_ix += 4u; + do_atomic = false; + } + } + if (do_atomic) + { + bool _1300 = m.bbox.z > m.bbox.x; + bool _1309; + if (!_1300) + { + _1309 = m.bbox.w > m.bbox.y; + } + else + { + _1309 = _1300; + } + if (_1309) + { + float param_29 = m.bbox.x; + uint _1318 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); + float param_30 = m.bbox.y; + uint _1326 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); + float param_31 = m.bbox.z; + uint _1334 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); + float param_32 = m.bbox.w; + uint _1342 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); + } + bbox_out_ix += 4u; + } + } +} + diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil new file mode 100644 index 0000000..81448e7 Binary files /dev/null and b/piet-gpu/shader/gen/pathtag_reduce.dxil differ diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl new file mode 100644 index 0000000..5ed84b8 --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl @@ -0,0 +1,127 @@ +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u); + +ByteAddressBuffer _139 : register(t1); +ByteAddressBuffer _151 : register(t2); +RWByteAddressBuffer _239 : register(u3); +RWByteAddressBuffer _259 : register(u0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared TagMonoid sh_scratch[128]; + +TagMonoid reduce_tag(uint tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(countbits((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(countbits(tag_word & 1077952576u))); + c.path_ix = uint(int(countbits(tag_word & 269488144u))); + c.trans_ix = uint(int(countbits(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 4u; + uint scene_ix = (_139.Load(52) >> uint(2)) + ix; + uint tag_word = _151.Load(scene_ix * 4 + 0); + uint param = tag_word; + TagMonoid agg = reduce_tag(param); + for (uint i = 1u; i < 4u; i++) + { + tag_word = _151.Load((scene_ix + i) * 4 + 0); + uint param_1 = tag_word; + TagMonoid param_2 = agg; + TagMonoid param_3 = reduce_tag(param_1); + agg = combine_tag_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 7u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + TagMonoid param_4 = agg; + TagMonoid param_5 = other; + agg = combine_tag_monoid(param_4, param_5); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _239.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); + _239.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); + _239.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); + _239.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); + _239.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); + } +} + +[numthreads(128, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl new file mode 100644 index 0000000..edb6d03 --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_reduce.msl @@ -0,0 +1,143 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + Alloc bbox_alloc; + uint n_trans; + uint trans_offset; + uint pathtag_offset; + uint linewidth_offset; + uint pathseg_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct OutBuf +{ + TagMonoid_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(128u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid reduce_tag(thread const uint& tag_word) +{ + uint point_count = tag_word & 50529027u; + TagMonoid c; + c.pathseg_ix = uint(int(popcount((point_count * 7u) & 67372036u))); + c.linewidth_ix = uint(int(popcount(tag_word & 1077952576u))); + c.path_ix = uint(int(popcount(tag_word & 269488144u))); + c.trans_ix = uint(int(popcount(tag_word & 538976288u))); + uint n_points = point_count + ((tag_word >> uint(2)) & 16843009u); + uint a = n_points + (n_points & (((tag_word >> uint(3)) & 16843009u) * 15u)); + a += (a >> uint(8)); + a += (a >> uint(16)); + c.pathseg_offset = a & 255u; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _151 [[buffer(2)]], device OutBuf& _239 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup TagMonoid sh_scratch[128]; + uint ix = gl_GlobalInvocationID.x * 4u; + uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix; + uint tag_word = _151.scene[scene_ix]; + uint param = tag_word; + TagMonoid agg = reduce_tag(param); + for (uint i = 1u; i < 4u; i++) + { + tag_word = _151.scene[scene_ix + i]; + uint param_1 = tag_word; + TagMonoid param_2 = agg; + TagMonoid param_3 = reduce_tag(param_1); + agg = combine_tag_monoid(param_2, param_3); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 7u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 128u) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + TagMonoid param_4 = agg; + TagMonoid param_5 = other; + agg = combine_tag_monoid(param_4, param_5); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _239.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; + _239.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; + _239.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; + _239.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _239.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; + } +} + diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil new file mode 100644 index 0000000..1f27f26 Binary files /dev/null and b/piet-gpu/shader/gen/pathtag_root.dxil differ diff --git a/piet-gpu/shader/gen/pathtag_root.hlsl b/piet-gpu/shader/gen/pathtag_root.hlsl new file mode 100644 index 0000000..388f99d --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_root.hlsl @@ -0,0 +1,115 @@ +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const TagMonoid _18 = { 0u, 0u, 0u, 0u, 0u }; + +RWByteAddressBuffer _78 : register(u0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared TagMonoid sh_scratch[512]; + +TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +TagMonoid tag_monoid_identity() +{ + return _18; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TagMonoid _82; + _82.trans_ix = _78.Load(ix * 20 + 0); + _82.linewidth_ix = _78.Load(ix * 20 + 4); + _82.pathseg_ix = _78.Load(ix * 20 + 8); + _82.path_ix = _78.Load(ix * 20 + 12); + _82.pathseg_offset = _78.Load(ix * 20 + 16); + TagMonoid local[8]; + local[0].trans_ix = _82.trans_ix; + local[0].linewidth_ix = _82.linewidth_ix; + local[0].pathseg_ix = _82.pathseg_ix; + local[0].path_ix = _82.path_ix; + local[0].pathseg_offset = _82.pathseg_offset; + TagMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + TagMonoid param = local[i - 1u]; + TagMonoid _115; + _115.trans_ix = _78.Load((ix + i) * 20 + 0); + _115.linewidth_ix = _78.Load((ix + i) * 20 + 4); + _115.pathseg_ix = _78.Load((ix + i) * 20 + 8); + _115.path_ix = _78.Load((ix + i) * 20 + 12); + _115.pathseg_offset = _78.Load((ix + i) * 20 + 16); + param_1.trans_ix = _115.trans_ix; + param_1.linewidth_ix = _115.linewidth_ix; + param_1.pathseg_ix = _115.pathseg_ix; + param_1.path_ix = _115.path_ix; + param_1.pathseg_offset = _115.pathseg_offset; + local[i] = combine_tag_monoid(param, param_1); + } + TagMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + TagMonoid param_2 = other; + TagMonoid param_3 = agg; + agg = combine_tag_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + TagMonoid row = tag_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + TagMonoid param_4 = row; + TagMonoid param_5 = local[i_2]; + TagMonoid m = combine_tag_monoid(param_4, param_5); + uint _211 = ix + i_2; + _78.Store(_211 * 20 + 0, m.trans_ix); + _78.Store(_211 * 20 + 4, m.linewidth_ix); + _78.Store(_211 * 20 + 8, m.pathseg_ix); + _78.Store(_211 * 20 + 12, m.path_ix); + _78.Store(_211 * 20 + 16, m.pathseg_offset); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/pathtag_root.msl b/piet-gpu/shader/gen/pathtag_root.msl new file mode 100644 index 0000000..923e77c --- /dev/null +++ b/piet-gpu/shader/gen/pathtag_root.msl @@ -0,0 +1,146 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct TagMonoid +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct TagMonoid_1 +{ + uint trans_ix; + uint linewidth_ix; + uint pathseg_ix; + uint path_ix; + uint pathseg_offset; +}; + +struct DataBuf +{ + TagMonoid_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) +{ + TagMonoid c; + c.trans_ix = a.trans_ix + b.trans_ix; + c.linewidth_ix = a.linewidth_ix + b.linewidth_ix; + c.pathseg_ix = a.pathseg_ix + b.pathseg_ix; + c.path_ix = a.path_ix + b.path_ix; + c.pathseg_offset = a.pathseg_offset + b.pathseg_offset; + return c; +} + +static inline __attribute__((always_inline)) +TagMonoid tag_monoid_identity() +{ + return TagMonoid{ 0u, 0u, 0u, 0u, 0u }; +} + +kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup TagMonoid sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].trans_ix = _78.data[ix].trans_ix; + local[0].linewidth_ix = _78.data[ix].linewidth_ix; + local[0].pathseg_ix = _78.data[ix].pathseg_ix; + local[0].path_ix = _78.data[ix].path_ix; + local[0].pathseg_offset = _78.data[ix].pathseg_offset; + TagMonoid param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _109 = ix + i; + TagMonoid param = local[i - 1u]; + param_1.trans_ix = _78.data[_109].trans_ix; + param_1.linewidth_ix = _78.data[_109].linewidth_ix; + param_1.pathseg_ix = _78.data[_109].pathseg_ix; + param_1.path_ix = _78.data[_109].path_ix; + param_1.pathseg_offset = _78.data[_109].pathseg_offset; + local[i] = combine_tag_monoid(param, param_1); + } + TagMonoid agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + TagMonoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + TagMonoid param_2 = other; + TagMonoid param_3 = agg; + agg = combine_tag_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + TagMonoid row = tag_monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + TagMonoid param_4 = row; + TagMonoid param_5 = local[i_2]; + TagMonoid m = combine_tag_monoid(param_4, param_5); + uint _211 = ix + i_2; + _78.data[_211].trans_ix = m.trans_ix; + _78.data[_211].linewidth_ix = m.linewidth_ix; + _78.data[_211].pathseg_ix = m.pathseg_ix; + _78.data[_211].path_ix = m.path_ix; + _78.data[_211].pathseg_offset = m.pathseg_offset; + } +} + diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs index b3f417e..e3786fc 100644 --- a/piet-gpu/src/stages/path.rs +++ b/piet-gpu/src/stages/path.rs @@ -17,7 +17,7 @@ //! The path stage (includes substages). use piet_gpu_hal::{ - BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, ShaderCode, + BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, include_shader, }; pub struct PathCode { @@ -55,7 +55,7 @@ const CLEAR_WG: u32 = 512; impl PathCode { pub unsafe fn new(session: &Session) -> PathCode { // TODO: add cross-compilation - let reduce_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathtag_reduce.spv")); + let reduce_code = include_shader!(session, "../../shader/gen/pathtag_reduce"); let reduce_pipeline = session .create_compute_pipeline( reduce_code, @@ -67,15 +67,15 @@ impl PathCode { ], ) .unwrap(); - let tag_root_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathtag_root.spv")); + let tag_root_code = include_shader!(session, "../../shader/gen/pathtag_root"); let tag_root_pipeline = session .create_compute_pipeline(tag_root_code, &[BindType::Buffer]) .unwrap(); - let clear_code = ShaderCode::Spv(include_bytes!("../../shader/gen/bbox_clear.spv")); + let clear_code = include_shader!(session, "../../shader/gen/bbox_clear"); let clear_pipeline = session .create_compute_pipeline(clear_code, &[BindType::Buffer, BindType::BufReadOnly]) .unwrap(); - let pathseg_code = ShaderCode::Spv(include_bytes!("../../shader/gen/pathseg.spv")); + let pathseg_code = include_shader!(session, "../../shader/gen/pathseg"); let pathseg_pipeline = session .create_compute_pipeline( pathseg_code,