diff --git a/Cargo.lock b/Cargo.lock index e65ac2f..737c033 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,6 +96,20 @@ name = "bytemuck" version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72957246c41db82b8ef88a5486143830adeb8227ef9837740bdec67724cf2c5b" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e215f8c2f9f79cb53c8335e687ffd07d5bfcb6fe5fc80723762d0be46e7cc54" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "byteorder" @@ -872,6 +886,7 @@ dependencies = [ name = "piet-gpu" version = "0.1.0" dependencies = [ + "bytemuck", "clap", "ndk", "ndk-glue", @@ -920,7 +935,10 @@ version = "0.1.0" dependencies = [ "bytemuck", "clap", + "kurbo", + "piet-gpu", "piet-gpu-hal", + "rand", ] [[package]] diff --git a/piet-gpu/Cargo.toml b/piet-gpu/Cargo.toml index f8f5c0a..faaffbd 100644 --- a/piet-gpu/Cargo.toml +++ b/piet-gpu/Cargo.toml @@ -33,6 +33,7 @@ roxmltree = "0.13" winit = "0.25" clap = "2.33" swash = "0.1.4" +bytemuck = { version = "1.7.2", features = ["derive"] } [target.'cfg(target_os = "android")'.dependencies] ndk = "0.3" diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv index 450c916..870abe4 100644 Binary files a/piet-gpu/shader/backdrop.spv and b/piet-gpu/shader/backdrop.spv differ diff --git a/piet-gpu/shader/backdrop_lg.spv b/piet-gpu/shader/backdrop_lg.spv index 859e3e7..a8b1fd9 100644 Binary files a/piet-gpu/shader/backdrop_lg.spv and b/piet-gpu/shader/backdrop_lg.spv differ diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index 516cc1d..669585a 100644 Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index c1fcc92..777a77f 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -1,12 +1,27 @@ # Build file for shaders. -# You must have glslangValidator in your path, or patch here. +# You must have Vulkan tools in your path, or patch here. glslang_validator = glslangValidator +spirv_cross = spirv-cross +dxc = dxc + +# See https://github.com/KhronosGroup/SPIRV-Cross/issues/1248 for +# why we set this. +msl_flags = --msl-decoration-binding rule glsl command = $glslang_validator $flags -V -o $out $in +rule hlsl + command = $spirv_cross --hlsl $in --output $out + +rule dxil + command = $dxc -T cs_6_0 $in -Fo $out + +rule msl + command = $spirv_cross --msl $in --output $out $msl_flags + build elements.spv: glsl elements.comp | scene.h state.h annotated.h @@ -24,3 +39,21 @@ build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h + +# New element pipeline follows + +build gen/transform_reduce.spv: glsl transform_reduce.comp | scene.h setup.h mem.h +build gen/transform_reduce.hlsl: hlsl gen/transform_reduce.spv +build gen/transform_reduce.dxil: dxil gen/transform_reduce.hlsl +build gen/transform_reduce.msl: msl gen/transform_reduce.spv + +build gen/transform_root.spv: glsl transform_scan.comp + flags = -DROOT +build gen/transform_root.hlsl: hlsl gen/transform_root.spv +build gen/transform_root.dxil: dxil gen/transform_root.hlsl +build gen/transform_root.msl: msl gen/transform_root.spv + +build gen/transform_leaf.spv: glsl transform_leaf.comp | scene.h tile.h setup.h mem.h +build gen/transform_leaf.hlsl: hlsl gen/transform_leaf.spv +build gen/transform_leaf.dxil: dxil gen/transform_leaf.hlsl +build gen/transform_leaf.msl: msl gen/transform_leaf.spv diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index eabb301..4167197 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil new file mode 100644 index 0000000..bc4f941 Binary files /dev/null and b/piet-gpu/shader/gen/transform_leaf.dxil differ diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl new file mode 100644 index 0000000..80b5434 --- /dev/null +++ b/piet-gpu/shader/gen/transform_leaf.hlsl @@ -0,0 +1,219 @@ +struct Alloc +{ + uint offset; +}; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + uint n_trans; + uint trans_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; + +RWByteAddressBuffer _71 : register(u0); +ByteAddressBuffer _96 : register(t2); +ByteAddressBuffer _278 : register(t1); +ByteAddressBuffer _377 : register(t3); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[512]; + +Transform Transform_read(TransformRef ref) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = _96.Load((ix + 0u) * 4 + 0); + uint raw1 = _96.Load((ix + 1u) * 4 + 0); + uint raw2 = _96.Load((ix + 2u) * 4 + 0); + uint raw3 = _96.Load((ix + 3u) * 4 + 0); + uint raw4 = _96.Load((ix + 4u) * 4 + 0); + uint raw5 = _96.Load((ix + 5u) * 4 + 0); + Transform s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +TransformRef Transform_index(TransformRef ref, uint index) +{ + TransformRef _85 = { ref.offset + (index * 24u) }; + return _85; +} + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +Transform monoid_identity() +{ + return _224; +} + +bool touch_mem(Alloc alloc, uint offset) +{ + return true; +} + +void write_mem(Alloc alloc, uint offset, uint val) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + _71.Store(offset * 4 + 8, val); +} + +void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = asuint(s.mat.x); + write_mem(param, param_1, param_2); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = asuint(s.mat.y); + write_mem(param_3, param_4, param_5); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = asuint(s.mat.z); + write_mem(param_6, param_7, param_8); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = asuint(s.mat.w); + write_mem(param_9, param_10, param_11); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = asuint(s.translate.x); + write_mem(param_12, param_13, param_14); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = asuint(s.translate.y); + write_mem(param_15, param_16, param_17); +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef _285 = { _278.Load(44) + (ix * 24u) }; + TransformRef ref = _285; + TransformRef param = ref; + Transform agg = Transform_read(param); + Transform local[8]; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3); + agg = combine_monoid(param_4, param_5); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_6 = other; + Transform param_7 = agg; + agg = combine_monoid(param_6, param_7); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Transform row = monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + Transform _383; + _383.mat = asfloat(_377.Load4((gl_WorkGroupID.x - 1u) * 32 + 0)); + _383.translate = asfloat(_377.Load2((gl_WorkGroupID.x - 1u) * 32 + 16)); + row.mat = _383.mat; + row.translate = _383.translate; + } + if (gl_LocalInvocationID.x > 0u) + { + Transform param_8 = row; + Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_8, param_9); + } + Alloc param_12; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_10 = row; + Transform param_11 = local[i_2]; + Transform m = combine_monoid(param_10, param_11); + TransformSeg _423 = { m.mat, m.translate }; + TransformSeg transform = _423; + TransformSegRef _433 = { _278.Load(36) + ((ix + i_2) * 24u) }; + TransformSegRef trans_ref = _433; + Alloc _437; + _437.offset = _278.Load(36); + param_12.offset = _437.offset; + TransformSegRef param_13 = trans_ref; + TransformSeg param_14 = transform; + TransformSeg_write(param_12, param_13, param_14); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl new file mode 100644 index 0000000..6229b25 --- /dev/null +++ b/piet-gpu/shader/gen/transform_leaf.msl @@ -0,0 +1,272 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Alloc +{ + uint offset; +}; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct TransformSegRef +{ + uint offset; +}; + +struct TransformSeg +{ + float4 mat; + float2 translate; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc_1 +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc_1 tile_alloc; + Alloc_1 bin_alloc; + Alloc_1 ptcl_alloc; + Alloc_1 pathseg_alloc; + Alloc_1 anno_alloc; + Alloc_1 trans_alloc; + uint n_trans; + uint trans_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct ParentBuf +{ + Transform_1 parent[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = v_96.scene[ix + 0u]; + uint raw1 = v_96.scene[ix + 1u]; + uint raw2 = v_96.scene[ix + 2u]; + uint raw3 = v_96.scene[ix + 3u]; + uint raw4 = v_96.scene[ix + 4u]; + uint raw5 = v_96.scene[ix + 5u]; + Transform s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index) +{ + return TransformRef{ ref.offset + (index * 24u) }; +} + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +static inline __attribute__((always_inline)) +Transform monoid_identity() +{ + return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) }; +} + +static inline __attribute__((always_inline)) +bool touch_mem(thread const Alloc& alloc, thread const uint& offset) +{ + return true; +} + +static inline __attribute__((always_inline)) +void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_71) +{ + Alloc param = alloc; + uint param_1 = offset; + if (!touch_mem(param, param_1)) + { + return; + } + v_71.memory[offset] = val; +} + +static inline __attribute__((always_inline)) +void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref, thread const TransformSeg& s, device Memory& v_71) +{ + uint ix = ref.offset >> uint(2); + Alloc param = a; + uint param_1 = ix + 0u; + uint param_2 = as_type(s.mat.x); + write_mem(param, param_1, param_2, v_71); + Alloc param_3 = a; + uint param_4 = ix + 1u; + uint param_5 = as_type(s.mat.y); + write_mem(param_3, param_4, param_5, v_71); + Alloc param_6 = a; + uint param_7 = ix + 2u; + uint param_8 = as_type(s.mat.z); + write_mem(param_6, param_7, param_8, v_71); + Alloc param_9 = a; + uint param_10 = ix + 3u; + uint param_11 = as_type(s.mat.w); + write_mem(param_9, param_10, param_11, v_71); + Alloc param_12 = a; + uint param_13 = ix + 4u; + uint param_14 = as_type(s.translate.x); + write_mem(param_12, param_13, param_14, v_71); + Alloc param_15 = a; + uint param_16 = ix + 5u; + uint param_17 = as_type(s.translate.y); + write_mem(param_15, param_16, param_17, v_71); +} + +kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _377 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Transform sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) }; + TransformRef param = ref; + Transform agg = Transform_read(param, v_96); + spvUnsafeArray local; + local[0] = agg; + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3, v_96); + agg = combine_monoid(param_4, param_5); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_6 = other; + Transform param_7 = agg; + agg = combine_monoid(param_6, param_7); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Transform row = monoid_identity(); + if (gl_WorkGroupID.x > 0u) + { + uint _380 = gl_WorkGroupID.x - 1u; + row.mat = _377.parent[_380].mat; + row.translate = _377.parent[_380].translate; + } + if (gl_LocalInvocationID.x > 0u) + { + Transform param_8 = row; + Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u]; + row = combine_monoid(param_8, param_9); + } + Alloc param_12; + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_10 = row; + Transform param_11 = local[i_2]; + Transform m = combine_monoid(param_10, param_11); + TransformSeg transform = TransformSeg{ m.mat, m.translate }; + TransformSegRef trans_ref = TransformSegRef{ _278.conf.trans_alloc.offset + ((ix + i_2) * 24u) }; + param_12.offset = _278.conf.trans_alloc.offset; + TransformSegRef param_13 = trans_ref; + TransformSeg param_14 = transform; + TransformSeg_write(param_12, param_13, param_14, v_71); + } +} + diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv new file mode 100644 index 0000000..ec47a9f Binary files /dev/null and b/piet-gpu/shader/gen/transform_leaf.spv differ diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil new file mode 100644 index 0000000..65ff944 Binary files /dev/null and b/piet-gpu/shader/gen/transform_reduce.dxil differ diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl new file mode 100644 index 0000000..09504f6 --- /dev/null +++ b/piet-gpu/shader/gen/transform_reduce.hlsl @@ -0,0 +1,125 @@ +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + uint n_trans; + uint trans_offset; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +ByteAddressBuffer _49 : register(t2); +ByteAddressBuffer _161 : register(t1); +RWByteAddressBuffer _251 : register(u3); +RWByteAddressBuffer _267 : register(u0); + +static uint3 gl_WorkGroupID; +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_WorkGroupID : SV_GroupID; + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[512]; + +Transform Transform_read(TransformRef ref) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = _49.Load((ix + 0u) * 4 + 0); + uint raw1 = _49.Load((ix + 1u) * 4 + 0); + uint raw2 = _49.Load((ix + 2u) * 4 + 0); + uint raw3 = _49.Load((ix + 3u) * 4 + 0); + uint raw4 = _49.Load((ix + 4u) * 4 + 0); + uint raw5 = _49.Load((ix + 5u) * 4 + 0); + Transform s; + s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3)); + s.translate = float2(asfloat(raw4), asfloat(raw5)); + return s; +} + +TransformRef Transform_index(TransformRef ref, uint index) +{ + TransformRef _37 = { ref.offset + (index * 24u) }; + return _37; +} + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef _168 = { _161.Load(44) + (ix * 24u) }; + TransformRef ref = _168; + TransformRef param = ref; + Transform agg = Transform_read(param); + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3); + agg = combine_monoid(param_4, param_5); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + { + Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Transform param_6 = agg; + Transform param_7 = other; + agg = combine_monoid(param_6, param_7); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _251.Store4(gl_WorkGroupID.x * 32 + 0, asuint(agg.mat)); + _251.Store2(gl_WorkGroupID.x * 32 + 16, asuint(agg.translate)); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_WorkGroupID = stage_input.gl_WorkGroupID; + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl new file mode 100644 index 0000000..71e9935 --- /dev/null +++ b/piet-gpu/shader/gen/transform_reduce.msl @@ -0,0 +1,138 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" + +#include +#include + +using namespace metal; + +struct TransformRef +{ + uint offset; +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct SceneBuf +{ + uint scene[1]; +}; + +struct Alloc +{ + uint offset; +}; + +struct Config +{ + uint n_elements; + uint n_pathseg; + uint width_in_tiles; + uint height_in_tiles; + Alloc tile_alloc; + Alloc bin_alloc; + Alloc ptcl_alloc; + Alloc pathseg_alloc; + Alloc anno_alloc; + Alloc trans_alloc; + uint n_trans; + uint trans_offset; +}; + +struct ConfigBuf +{ + Config conf; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct OutBuf +{ + Transform_1 outbuf[1]; +}; + +struct Memory +{ + uint mem_offset; + uint mem_error; + uint memory[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_49) +{ + uint ix = ref.offset >> uint(2); + uint raw0 = v_49.scene[ix + 0u]; + uint raw1 = v_49.scene[ix + 1u]; + uint raw2 = v_49.scene[ix + 2u]; + uint raw3 = v_49.scene[ix + 3u]; + uint raw4 = v_49.scene[ix + 4u]; + uint raw5 = v_49.scene[ix + 5u]; + Transform s; + s.mat = float4(as_type(raw0), as_type(raw1), as_type(raw2), as_type(raw3)); + s.translate = float2(as_type(raw4), as_type(raw5)); + return s; +} + +static inline __attribute__((always_inline)) +TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index) +{ + return TransformRef{ ref.offset + (index * 24u) }; +} + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], device OutBuf& _251 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +{ + threadgroup Transform sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + TransformRef ref = TransformRef{ _161.conf.trans_offset + (ix * 24u) }; + TransformRef param = ref; + Transform agg = Transform_read(param, v_49); + for (uint i = 1u; i < 8u; i++) + { + TransformRef param_1 = ref; + uint param_2 = i; + TransformRef param_3 = Transform_index(param_1, param_2); + Transform param_4 = agg; + Transform param_5 = Transform_read(param_3, v_49); + agg = combine_monoid(param_4, param_5); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + { + Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; + Transform param_6 = agg; + Transform param_7 = other; + agg = combine_monoid(param_6, param_7); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0u) + { + _251.outbuf[gl_WorkGroupID.x].mat = agg.mat; + _251.outbuf[gl_WorkGroupID.x].translate = agg.translate; + } +} + diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv new file mode 100644 index 0000000..d6f84a2 Binary files /dev/null and b/piet-gpu/shader/gen/transform_reduce.spv differ diff --git a/piet-gpu/shader/gen/transform_root.dxil b/piet-gpu/shader/gen/transform_root.dxil new file mode 100644 index 0000000..0d16d04 Binary files /dev/null and b/piet-gpu/shader/gen/transform_root.dxil differ diff --git a/piet-gpu/shader/gen/transform_root.hlsl b/piet-gpu/shader/gen/transform_root.hlsl new file mode 100644 index 0000000..42bbd38 --- /dev/null +++ b/piet-gpu/shader/gen/transform_root.hlsl @@ -0,0 +1,94 @@ +struct Transform +{ + float4 mat; + float2 translate; +}; + +static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); + +static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; + +RWByteAddressBuffer _89 : register(u0); + +static uint3 gl_LocalInvocationID; +static uint3 gl_GlobalInvocationID; +struct SPIRV_Cross_Input +{ + uint3 gl_LocalInvocationID : SV_GroupThreadID; + uint3 gl_GlobalInvocationID : SV_DispatchThreadID; +}; + +groupshared Transform sh_scratch[512]; + +Transform combine_monoid(Transform a, Transform b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +Transform monoid_identity() +{ + return _23; +} + +void comp_main() +{ + uint ix = gl_GlobalInvocationID.x * 8u; + Transform _93; + _93.mat = asfloat(_89.Load4(ix * 32 + 0)); + _93.translate = asfloat(_89.Load2(ix * 32 + 16)); + Transform local[8]; + local[0].mat = _93.mat; + local[0].translate = _93.translate; + Transform param_1; + for (uint i = 1u; i < 8u; i++) + { + Transform param = local[i - 1u]; + Transform _119; + _119.mat = asfloat(_89.Load4((ix + i) * 32 + 0)); + _119.translate = asfloat(_89.Load2((ix + i) * 32 + 16)); + param_1.mat = _119.mat; + param_1.translate = _119.translate; + local[i] = combine_monoid(param, param_1); + } + Transform agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + GroupMemoryBarrierWithGroupSync(); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_2 = other; + Transform param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + GroupMemoryBarrierWithGroupSync(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + GroupMemoryBarrierWithGroupSync(); + Transform row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_4 = row; + Transform param_5 = local[i_2]; + Transform m = combine_monoid(param_4, param_5); + uint _209 = ix + i_2; + _89.Store4(_209 * 32 + 0, asuint(m.mat)); + _89.Store2(_209 * 32 + 16, asuint(m.translate)); + } +} + +[numthreads(512, 1, 1)] +void main(SPIRV_Cross_Input stage_input) +{ + gl_LocalInvocationID = stage_input.gl_LocalInvocationID; + gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID; + comp_main(); +} diff --git a/piet-gpu/shader/gen/transform_root.msl b/piet-gpu/shader/gen/transform_root.msl new file mode 100644 index 0000000..2c58c06 --- /dev/null +++ b/piet-gpu/shader/gen/transform_root.msl @@ -0,0 +1,129 @@ +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wmissing-braces" + +#include +#include + +using namespace metal; + +template +struct spvUnsafeArray +{ + T elements[Num ? Num : 1]; + + thread T& operator [] (size_t pos) thread + { + return elements[pos]; + } + constexpr const thread T& operator [] (size_t pos) const thread + { + return elements[pos]; + } + + device T& operator [] (size_t pos) device + { + return elements[pos]; + } + constexpr const device T& operator [] (size_t pos) const device + { + return elements[pos]; + } + + constexpr const constant T& operator [] (size_t pos) const constant + { + return elements[pos]; + } + + threadgroup T& operator [] (size_t pos) threadgroup + { + return elements[pos]; + } + constexpr const threadgroup T& operator [] (size_t pos) const threadgroup + { + return elements[pos]; + } +}; + +struct Transform +{ + float4 mat; + float2 translate; +}; + +struct Transform_1 +{ + float4 mat; + float2 translate; + char _m0_final_padding[8]; +}; + +struct DataBuf +{ + Transform_1 data[1]; +}; + +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); + +static inline __attribute__((always_inline)) +Transform combine_monoid(thread const Transform& a, thread const Transform& b) +{ + Transform c; + c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww); + c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate; + return c; +} + +static inline __attribute__((always_inline)) +Transform monoid_identity() +{ + return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) }; +} + +kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) +{ + threadgroup Transform sh_scratch[512]; + uint ix = gl_GlobalInvocationID.x * 8u; + spvUnsafeArray local; + local[0].mat = _89.data[ix].mat; + local[0].translate = _89.data[ix].translate; + Transform param_1; + for (uint i = 1u; i < 8u; i++) + { + uint _113 = ix + i; + Transform param = local[i - 1u]; + param_1.mat = _89.data[_113].mat; + param_1.translate = _89.data[_113].translate; + local[i] = combine_monoid(param, param_1); + } + Transform agg = local[7]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i_1 = 0u; i_1 < 9u; i_1++) + { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (gl_LocalInvocationID.x >= (1u << i_1)) + { + Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)]; + Transform param_2 = other; + Transform param_3 = agg; + agg = combine_monoid(param_2, param_3); + } + threadgroup_barrier(mem_flags::mem_threadgroup); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + Transform row = monoid_identity(); + if (gl_LocalInvocationID.x > 0u) + { + row = sh_scratch[gl_LocalInvocationID.x - 1u]; + } + for (uint i_2 = 0u; i_2 < 8u; i_2++) + { + Transform param_4 = row; + Transform param_5 = local[i_2]; + Transform m = combine_monoid(param_4, param_5); + uint _209 = ix + i_2; + _89.data[_209].mat = m.mat; + _89.data[_209].translate = m.translate; + } +} + diff --git a/piet-gpu/shader/gen/transform_root.spv b/piet-gpu/shader/gen/transform_root.spv new file mode 100644 index 0000000..7824d09 Binary files /dev/null and b/piet-gpu/shader/gen/transform_root.spv differ diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index 322a047..072910b 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv index d86ff9b..54814b7 100644 Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index cb41a4b..52ea6e4 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -38,6 +38,12 @@ struct Config { Alloc pathseg_alloc; Alloc anno_alloc; Alloc trans_alloc; + // new element pipeline stuff follows + + // Number of transforms in scene + uint n_trans; + // Offset (in bytes) of transform stream in scene buffer + uint trans_offset; }; // Fill modes. diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv index 2d7363d..b123f18 100644 Binary files a/piet-gpu/shader/tile_alloc.spv and b/piet-gpu/shader/tile_alloc.spv differ diff --git a/piet-gpu/shader/transform_leaf.comp b/piet-gpu/shader/transform_leaf.comp new file mode 100644 index 0000000..e158c50 --- /dev/null +++ b/piet-gpu/shader/transform_leaf.comp @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// A scan for a tree reduction prefix scan that outputs the final result. +// Output is written into memory at trans_alloc. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +#define N_ROWS 8 +#define LG_WG_SIZE 9 +#define WG_SIZE (1 << LG_WG_SIZE) +#define PARTITION_SIZE (WG_SIZE * N_ROWS) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +layout(binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +layout(binding = 2) readonly buffer SceneBuf { + uint[] scene; +}; + +#include "scene.h" +#include "tile.h" + +#define Monoid Transform + +layout(set = 0, binding = 3) readonly buffer ParentBuf { + Monoid[] parent; +}; + +Monoid monoid_identity() { + return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0)); +} + +Monoid combine_monoid(Monoid a, Monoid b) { + Monoid c; + c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww; + c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate; + return c; +} + +shared Monoid sh_scratch[WG_SIZE]; + +void main() { + Monoid local[N_ROWS]; + + uint ix = gl_GlobalInvocationID.x * N_ROWS; + TransformRef ref = TransformRef(conf.trans_offset + ix * Transform_size); + + Monoid agg = Transform_read(ref); + local[0] = agg; + for (uint i = 1; i < N_ROWS; i++) { + agg = combine_monoid(agg, Transform_read(Transform_index(ref, i))); + local[i] = agg; + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + if (gl_LocalInvocationID.x >= (1u << i)) { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)]; + agg = combine_monoid(other, agg); + } + barrier(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + + barrier(); + Monoid row = monoid_identity(); + if (gl_WorkGroupID.x > 0) { + row = parent[gl_WorkGroupID.x - 1]; + } + if (gl_LocalInvocationID.x > 0) { + row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]); + } + for (uint i = 0; i < N_ROWS; i++) { + Monoid m = combine_monoid(row, local[i]); + TransformSeg transform = TransformSeg(m.mat, m.translate); + TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (ix + i) * TransformSeg_size); + TransformSeg_write(conf.trans_alloc, trans_ref, transform); + } +} diff --git a/piet-gpu/shader/transform_reduce.comp b/piet-gpu/shader/transform_reduce.comp new file mode 100644 index 0000000..4b72b11 --- /dev/null +++ b/piet-gpu/shader/transform_reduce.comp @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// The reduction phase for transform scan implemented as a tree reduction. + +#version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "mem.h" +#include "setup.h" + +#define N_ROWS 8 +#define LG_WG_SIZE 9 +#define WG_SIZE (1 << LG_WG_SIZE) +#define PARTITION_SIZE (WG_SIZE * N_ROWS) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +layout(binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +layout(binding = 2) readonly buffer SceneBuf { + uint[] scene; +}; + +#include "scene.h" + +#define Monoid Transform + +layout(set = 0, binding = 3) buffer OutBuf { + Monoid[] outbuf; +}; + +Monoid monoid_identity() { + return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0)); +} + +Monoid combine_monoid(Monoid a, Monoid b) { + Monoid c; + c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww; + c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate; + return c; +} + +shared Monoid sh_scratch[WG_SIZE]; + +void main() { + uint ix = gl_GlobalInvocationID.x * N_ROWS; + TransformRef ref = TransformRef(conf.trans_offset + ix * Transform_size); + + Monoid agg = Transform_read(ref); + for (uint i = 1; i < N_ROWS; i++) { + agg = combine_monoid(agg, Transform_read(Transform_index(ref, i))); + } + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + // We could make this predicate tighter, but would it help? + if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) { + Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)]; + agg = combine_monoid(agg, other); + } + barrier(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + if (gl_LocalInvocationID.x == 0) { + outbuf[gl_WorkGroupID.x] = agg; + } +} diff --git a/piet-gpu/shader/transform_scan.comp b/piet-gpu/shader/transform_scan.comp new file mode 100644 index 0000000..e8e0019 --- /dev/null +++ b/piet-gpu/shader/transform_scan.comp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +// A scan for a tree reduction prefix scan (either root or not, by ifdef). + +#version 450 + +#define N_ROWS 8 +#define LG_WG_SIZE 9 +#define WG_SIZE (1 << LG_WG_SIZE) +#define PARTITION_SIZE (WG_SIZE * N_ROWS) + +layout(local_size_x = WG_SIZE, local_size_y = 1) in; + +// This is copy-pasted from scene.h. It might be better for DRY +// to include it, but that pulls in more stuff we don't need. +struct Transform { + vec4 mat; + vec2 translate; +}; + +#define Monoid Transform + +layout(binding = 0) buffer DataBuf { + Monoid[] data; +}; + +#ifndef ROOT +layout(binding = 1) readonly buffer ParentBuf { + Monoid[] parent; +}; +#endif + +Monoid monoid_identity() { + return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0)); +} + +Monoid combine_monoid(Monoid a, Monoid b) { + Monoid c; + c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww; + c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate; + return c; +} + +shared Monoid sh_scratch[WG_SIZE]; + +void main() { + Monoid local[N_ROWS]; + + uint ix = gl_GlobalInvocationID.x * N_ROWS; + + // TODO: gate buffer read + local[0] = data[ix]; + for (uint i = 1; i < N_ROWS; i++) { + local[i] = combine_monoid(local[i - 1], data[ix + i]); + } + Monoid agg = local[N_ROWS - 1]; + sh_scratch[gl_LocalInvocationID.x] = agg; + for (uint i = 0; i < LG_WG_SIZE; i++) { + barrier(); + if (gl_LocalInvocationID.x >= (1u << i)) { + Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)]; + agg = combine_monoid(other, agg); + } + barrier(); + sh_scratch[gl_LocalInvocationID.x] = agg; + } + + barrier(); + // This could be a semigroup instead of a monoid if we reworked the + // conditional logic, but that might impact performance. + Monoid row = monoid_identity(); +#ifdef ROOT + if (gl_LocalInvocationID.x > 0) { + row = sh_scratch[gl_LocalInvocationID.x - 1]; + } +#else + if (gl_WorkGroupID.x > 0) { + row = parent[gl_WorkGroupID.x - 1]; + } + if (gl_LocalInvocationID.x > 0) { + row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]); + } +#endif + for (uint i = 0; i < N_ROWS; i++) { + Monoid m = combine_monoid(row, local[i]); + // TODO: gate buffer write + data[ix + i] = m; + } +} diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index bee07aa..8d21fe6 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -1,6 +1,7 @@ mod gradient; mod pico_svg; mod render_ctx; +pub mod stages; pub mod test_scenes; mod text; @@ -20,6 +21,8 @@ use piet_gpu_hal::{ use pico_svg::PicoSvg; +use crate::stages::Config; + const TILE_W: usize = 16; const TILE_H: usize = 16; @@ -123,7 +126,7 @@ impl Renderer { let image_dev = session.create_image2d(width as u32, height as u32)?; // Note: this must be updated when the config struct size changes. - const CONFIG_BUFFER_SIZE: u64 = 40; + const CONFIG_BUFFER_SIZE: u64 = std::mem::size_of::() as u64; let config_buf = session.create_buffer(CONFIG_BUFFER_SIZE, dev).unwrap(); // TODO: separate staging buffer (if needed) let config_bufs = (0..n_bufs) @@ -295,25 +298,28 @@ impl Renderer { alloc += (n_paths * ANNO_SIZE + 3) & !3; let trans_base = alloc; alloc += (n_trans * TRANS_SIZE + 3) & !3; - let config = &[ - n_paths as u32, - n_pathseg as u32, - width_in_tiles as u32, - height_in_tiles as u32, - tile_base as u32, - bin_base as u32, - ptcl_base as u32, - pathseg_base as u32, - anno_base as u32, - trans_base as u32, - ]; + let trans_offset = 0; // For new element pipeline, not yet used + let config = Config { + n_elements: n_paths as u32, + n_pathseg: n_pathseg as u32, + width_in_tiles: width_in_tiles as u32, + height_in_tiles: height_in_tiles as u32, + tile_alloc: tile_base as u32, + bin_alloc: bin_base as u32, + ptcl_alloc: ptcl_base as u32, + pathseg_alloc: pathseg_base as u32, + anno_alloc: anno_base as u32, + trans_alloc: trans_base as u32, + n_trans: n_trans as u32, + trans_offset: trans_offset as u32, + }; unsafe { let scene = render_ctx.get_scene_buf(); self.n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size(); // TODO: reallocate scene buffer if size is inadequate assert!(self.scene_bufs[buf_ix].size() as usize >= scene.len()); self.scene_bufs[buf_ix].write(scene)?; - self.config_bufs[buf_ix].write(config)?; + self.config_bufs[buf_ix].write(&[config])?; self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?; // Upload gradient data. diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs new file mode 100644 index 0000000..0613585 --- /dev/null +++ b/piet-gpu/src/stages.rs @@ -0,0 +1,209 @@ +// Copyright 2021 The piet-gpu authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +//! Stages for new element pipeline, exposed for testing. + +use bytemuck::{Pod, Zeroable}; + +use piet::kurbo::Affine; +use piet_gpu_hal::{ + include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session, +}; + +/// The configuration block passed to piet-gpu shaders. +/// +/// Note: this should be kept in sync with the version in setup.h. +#[repr(C)] +#[derive(Clone, Copy, Default, Zeroable, Pod)] +pub struct Config { + pub n_elements: u32, // paths + pub n_pathseg: u32, + pub width_in_tiles: u32, + pub height_in_tiles: u32, + pub tile_alloc: u32, + pub bin_alloc: u32, + pub ptcl_alloc: u32, + pub pathseg_alloc: u32, + pub anno_alloc: u32, + pub trans_alloc: u32, + pub n_trans: u32, + pub trans_offset: u32, +} + +// The individual stages will probably be separate files but for now, all in one. + +// This is equivalent to the version in piet-gpu-types, but the bytemuck +// representation will likely be faster. +#[repr(C)] +#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)] +pub struct Transform { + pub mat: [f32; 4], + pub translate: [f32; 2], +} + +const TRANSFORM_WG: u64 = 512; +const TRANSFORM_N_ROWS: u64 = 8; +const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS; + +pub struct TransformCode { + reduce_pipeline: Pipeline, + root_pipeline: Pipeline, + leaf_pipeline: Pipeline, +} + +pub struct TransformStage { + // Right now we're limited to partition^2 (~16M) elements. This can be + // expanded but is tedious. + root_buf: Buffer, + root_ds: DescriptorSet, +} + +pub struct TransformBinding { + reduce_ds: DescriptorSet, + leaf_ds: DescriptorSet, +} + +impl TransformCode { + pub unsafe fn new(session: &Session) -> TransformCode { + let reduce_code = include_shader!(session, "../shader/gen/transform_reduce"); + let reduce_pipeline = session + .create_compute_pipeline( + reduce_code, + &[ + BindType::Buffer, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::Buffer, + ], + ) + .unwrap(); + let root_code = include_shader!(session, "../shader/gen/transform_root"); + let root_pipeline = session + .create_compute_pipeline(root_code, &[BindType::Buffer]) + .unwrap(); + let leaf_code = include_shader!(session, "../shader/gen/transform_leaf"); + let leaf_pipeline = session + .create_compute_pipeline( + leaf_code, + &[ + BindType::Buffer, + BindType::BufReadOnly, + BindType::BufReadOnly, + BindType::BufReadOnly, + ], + ) + .unwrap(); + TransformCode { + reduce_pipeline, + root_pipeline, + leaf_pipeline, + } + } +} + +impl TransformStage { + pub unsafe fn new(session: &Session, code: &TransformCode) -> TransformStage { + // We're limited to TRANSFORM_PART_SIZE^2 + // Also note: size here allows padding + let root_buf_size = TRANSFORM_PART_SIZE * 32; + let root_buf = session + .create_buffer(root_buf_size, BufferUsage::STORAGE) + .unwrap(); + let root_ds = session + .create_simple_descriptor_set(&code.root_pipeline, &[&root_buf]) + .unwrap(); + TransformStage { root_buf, root_ds } + } + + pub unsafe fn bind( + &self, + session: &Session, + code: &TransformCode, + config_buf: &Buffer, + scene_buf: &Buffer, + memory_buf: &Buffer, + ) -> TransformBinding { + let reduce_ds = session + .create_simple_descriptor_set( + &code.reduce_pipeline, + &[memory_buf, config_buf, scene_buf, &self.root_buf], + ) + .unwrap(); + let leaf_ds = session + .create_simple_descriptor_set( + &code.leaf_pipeline, + &[memory_buf, config_buf, scene_buf, &self.root_buf], + ) + .unwrap(); + TransformBinding { reduce_ds, leaf_ds } + } + + pub unsafe fn record( + &self, + cmd_buf: &mut CmdBuf, + code: &TransformCode, + binding: &TransformBinding, + size: u64, + ) { + if size > TRANSFORM_PART_SIZE.pow(2) { + panic!("very large scan not yet implemented"); + } + let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE; + if n_workgroups > 1 { + cmd_buf.dispatch( + &code.reduce_pipeline, + &binding.reduce_ds, + (n_workgroups as u32, 1, 1), + (TRANSFORM_WG as u32, 1, 1), + ); + cmd_buf.memory_barrier(); + cmd_buf.dispatch( + &code.root_pipeline, + &self.root_ds, + (1, 1, 1), + (TRANSFORM_WG as u32, 1, 1), + ); + cmd_buf.memory_barrier(); + } + cmd_buf.dispatch( + &code.leaf_pipeline, + &binding.leaf_ds, + (n_workgroups as u32, 1, 1), + (TRANSFORM_WG as u32, 1, 1), + ); + } +} + +impl Transform { + pub fn from_kurbo(a: Affine) -> Transform { + let c = a.as_coeffs(); + Transform { + mat: [c[0] as f32, c[1] as f32, c[2] as f32, c[3] as f32], + translate: [c[4] as f32, c[5] as f32], + } + } + + pub fn to_kurbo(self) -> Affine { + Affine::new([ + self.mat[0] as f64, + self.mat[1] as f64, + self.mat[2] as f64, + self.mat[3] as f64, + self.translate[0] as f64, + self.translate[1] as f64, + ]) + } +} diff --git a/tests/Cargo.toml b/tests/Cargo.toml index a987c9e..1f0760a 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -6,9 +6,18 @@ description = "Tests for piet-gpu shaders and generic GPU capabilities." license = "MIT/Apache-2.0" edition = "2021" +[features] +default = ["piet-gpu"] + [dependencies] clap = "2.33" bytemuck = "1.7.2" +kurbo = "0.7.1" +rand = "0.7.3" [dependencies.piet-gpu-hal] path = "../piet-gpu-hal" + +[dependencies.piet-gpu] +path = "../piet-gpu" +optional = true diff --git a/tests/src/clear.rs b/tests/src/clear.rs index 7d8bee0..009360b 100644 --- a/tests/src/clear.rs +++ b/tests/src/clear.rs @@ -55,12 +55,12 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult commands.write_timestamp(0); stage.record(&mut commands, &code, &binding); commands.write_timestamp(1); - if i == 0 { + if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); } total_elapsed += runner.submit(commands); - if i == 0 { + if i == 0 || config.verify_all { let mut dst: Vec = Default::default(); out_buf.read(&mut dst); if let Some(failure) = verify(&dst) { diff --git a/tests/src/config.rs b/tests/src/config.rs index edc1140..2593ed9 100644 --- a/tests/src/config.rs +++ b/tests/src/config.rs @@ -22,6 +22,7 @@ pub struct Config { pub groups: Groups, pub size: Size, pub n_iter: u64, + pub verify_all: bool, } pub struct Groups(String); @@ -40,10 +41,12 @@ impl Config { .value_of("n_iter") .and_then(|s| s.parse().ok()) .unwrap_or(1000); + let verify_all = matches.is_present("verify_all"); Config { groups, size, n_iter, + verify_all, } } } diff --git a/tests/src/linkedlist.rs b/tests/src/linkedlist.rs index b3d03ed..3102a73 100644 --- a/tests/src/linkedlist.rs +++ b/tests/src/linkedlist.rs @@ -48,12 +48,12 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR commands.write_timestamp(0); stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf); commands.write_timestamp(1); - if i == 0 { + if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&mem_buf); } total_elapsed += runner.submit(commands); - if i == 0 { + if i == 0 || config.verify_all { let mut dst: Vec = Default::default(); mem_buf.read(&mut dst); if !verify(&dst) { diff --git a/tests/src/main.rs b/tests/src/main.rs index dd6f4bd..0ab9340 100644 --- a/tests/src/main.rs +++ b/tests/src/main.rs @@ -25,6 +25,9 @@ mod prefix_tree; mod runner; mod test_result; +#[cfg(feature = "piet-gpu")] +mod transform; + use clap::{App, Arg}; use piet_gpu_hal::InstanceFlags; @@ -62,6 +65,11 @@ fn main() { .help("Number of iterations") .takes_value(true), ) + .arg( + Arg::with_name("verify_all") + .long("verify_all") + .help("Verify all iterations"), + ) .arg( Arg::with_name("dx12") .long("dx12") @@ -123,5 +131,9 @@ fn main() { } report(&linkedlist::run_linkedlist_test(&mut runner, &config)); } + #[cfg(feature = "piet-gpu")] + if config.groups.matches("piet") { + report(&transform::transform_test(&mut runner, &config)); + } } } diff --git a/tests/src/prefix.rs b/tests/src/prefix.rs index 71be865..00a69ca 100644 --- a/tests/src/prefix.rs +++ b/tests/src/prefix.rs @@ -85,12 +85,12 @@ pub unsafe fn run_prefix_test( commands.write_timestamp(0); stage.record(&mut commands, &code, &binding); commands.write_timestamp(1); - if i == 0 { + if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); } total_elapsed += runner.submit(commands); - if i == 0 { + if i == 0 || config.verify_all { let mut dst: Vec = Default::default(); out_buf.read(&mut dst); if let Some(failure) = verify(&dst) { diff --git a/tests/src/prefix_tree.rs b/tests/src/prefix_tree.rs index 9603385..5957e88 100644 --- a/tests/src/prefix_tree.rs +++ b/tests/src/prefix_tree.rs @@ -66,12 +66,12 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul commands.write_timestamp(0); stage.record(&mut commands, &code, &binding); commands.write_timestamp(1); - if i == 0 { + if i == 0 || config.verify_all { commands.cmd_buf.memory_barrier(); commands.download(&out_buf); } total_elapsed += runner.submit(commands); - if i == 0 { + if i == 0 || config.verify_all { let mut dst: Vec = Default::default(); out_buf.read(&mut dst); if let Some(failure) = verify(&dst) { diff --git a/tests/src/transform.rs b/tests/src/transform.rs new file mode 100644 index 0000000..d696b10 --- /dev/null +++ b/tests/src/transform.rs @@ -0,0 +1,134 @@ +// Copyright 2021 The piet-gpu authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Also licensed under MIT license, at your choice. + +//! Tests for piet-gpu shaders. + +use crate::{Config, Runner, TestResult}; + +use kurbo::Affine; +use piet_gpu::stages::{self, Transform, TransformCode, TransformStage}; +use piet_gpu_hal::BufferUsage; +use rand::Rng; + +struct AffineTestData { + input_data: Vec, + expected: Vec, +} + +pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult { + let mut result = TestResult::new("transform"); + let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 24); + // Validate with real transform data. + let data = AffineTestData::new(n_elements as usize); + let data_buf = runner + .session + .create_buffer_init(&data.input_data, BufferUsage::STORAGE) + .unwrap(); + let memory = runner.buf_down(data_buf.size() + 24, BufferUsage::empty()); + let stage_config = stages::Config { + n_trans: n_elements as u32, + // This is a hack to get elements aligned. + trans_alloc: 16, + ..Default::default() + }; + let config_buf = runner + .session + .create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE) + .unwrap(); + + let code = TransformCode::new(&runner.session); + let stage = TransformStage::new(&runner.session, &code); + let binding = stage.bind( + &runner.session, + &code, + &config_buf, + &data_buf, + &memory.dev_buf, + ); + let mut total_elapsed = 0.0; + let n_iter = config.n_iter; + for i in 0..n_iter { + let mut commands = runner.commands(); + commands.write_timestamp(0); + stage.record(&mut commands.cmd_buf, &code, &binding, n_elements); + commands.write_timestamp(1); + if i == 0 || config.verify_all { + commands.cmd_buf.memory_barrier(); + commands.download(&memory); + } + total_elapsed += runner.submit(commands); + if i == 0 || config.verify_all { + let mut dst: Vec = Default::default(); + memory.read(&mut dst); + if let Some(failure) = data.verify(&dst[1..]) { + result.fail(failure); + } + } + } + result.timing(total_elapsed, n_elements * n_iter); + result +} + +impl AffineTestData { + fn new(n: usize) -> AffineTestData { + let mut rng = rand::thread_rng(); + let mut a = Affine::default(); + let mut input_data = Vec::with_capacity(n); + let mut expected = Vec::with_capacity(n); + for _ in 0..n { + loop { + let b = Affine::new([ + rng.gen_range(-3.0, 3.0), + rng.gen_range(-3.0, 3.0), + rng.gen_range(-3.0, 3.0), + rng.gen_range(-3.0, 3.0), + rng.gen_range(-3.0, 3.0), + rng.gen_range(-3.0, 3.0), + ]); + if b.determinant().abs() >= 1.0 { + expected.push(b); + let c = a.inverse() * b; + input_data.push(Transform::from_kurbo(c)); + a = b; + break; + } + } + } + AffineTestData { + input_data, + expected, + } + } + + fn verify(&self, actual: &[Transform]) -> Option { + for (i, (actual, expected)) in actual.iter().zip(&self.expected).enumerate() { + let error: f64 = actual + .to_kurbo() + .as_coeffs() + .iter() + .zip(expected.as_coeffs()) + .map(|(actual, expected)| (actual - expected).powi(2)) + .sum(); + // Hopefully this is right; most of the time the error is much + // smaller, but occasionally we see outliers. + let tolerance = 1e-9 * (i + 1) as f64; + if error > tolerance { + return Some(format!("{}: {} {}", i, error, tolerance)); + } + } + None + } +}