Start work on new element pipeline

There's a bit of reorganizing as well. Shader stages are made available
from piet-gpu to the test rig, config is now a proper structure
(marshaled with bytemuck).

This commit just has the transform stage, which is a simple monoid scan
of affine transforms.

Progress toward #119
This commit is contained in:
Raph Levien 2021-11-23 07:28:50 -08:00
parent a8103a4c20
commit 47f8812e2f
36 changed files with 1674 additions and 23 deletions

18
Cargo.lock generated
View file

@ -96,6 +96,20 @@ name = "bytemuck"
version = "1.7.2" version = "1.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72957246c41db82b8ef88a5486143830adeb8227ef9837740bdec67724cf2c5b" checksum = "72957246c41db82b8ef88a5486143830adeb8227ef9837740bdec67724cf2c5b"
dependencies = [
"bytemuck_derive",
]
[[package]]
name = "bytemuck_derive"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e215f8c2f9f79cb53c8335e687ffd07d5bfcb6fe5fc80723762d0be46e7cc54"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "byteorder" name = "byteorder"
@ -872,6 +886,7 @@ dependencies = [
name = "piet-gpu" name = "piet-gpu"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"bytemuck",
"clap", "clap",
"ndk", "ndk",
"ndk-glue", "ndk-glue",
@ -920,7 +935,10 @@ version = "0.1.0"
dependencies = [ dependencies = [
"bytemuck", "bytemuck",
"clap", "clap",
"kurbo",
"piet-gpu",
"piet-gpu-hal", "piet-gpu-hal",
"rand",
] ]
[[package]] [[package]]

View file

@ -33,6 +33,7 @@ roxmltree = "0.13"
winit = "0.25" winit = "0.25"
clap = "2.33" clap = "2.33"
swash = "0.1.4" swash = "0.1.4"
bytemuck = { version = "1.7.2", features = ["derive"] }
[target.'cfg(target_os = "android")'.dependencies] [target.'cfg(target_os = "android")'.dependencies]
ndk = "0.3" ndk = "0.3"

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1,12 +1,27 @@
# Build file for shaders. # Build file for shaders.
# You must have glslangValidator in your path, or patch here. # You must have Vulkan tools in your path, or patch here.
glslang_validator = glslangValidator glslang_validator = glslangValidator
spirv_cross = spirv-cross
dxc = dxc
# See https://github.com/KhronosGroup/SPIRV-Cross/issues/1248 for
# why we set this.
msl_flags = --msl-decoration-binding
rule glsl rule glsl
command = $glslang_validator $flags -V -o $out $in command = $glslang_validator $flags -V -o $out $in
rule hlsl
command = $spirv_cross --hlsl $in --output $out
rule dxil
command = $dxc -T cs_6_0 $in -Fo $out
rule msl
command = $spirv_cross --msl $in --output $out $msl_flags
build elements.spv: glsl elements.comp | scene.h state.h annotated.h build elements.spv: glsl elements.comp | scene.h state.h annotated.h
@ -24,3 +39,21 @@ build backdrop_lg.spv: glsl backdrop.comp | annotated.h tile.h setup.h
build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h build coarse.spv: glsl coarse.comp | annotated.h bins.h ptcl.h setup.h
build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h build kernel4.spv: glsl kernel4.comp | ptcl.h setup.h
# New element pipeline follows
build gen/transform_reduce.spv: glsl transform_reduce.comp | scene.h setup.h mem.h
build gen/transform_reduce.hlsl: hlsl gen/transform_reduce.spv
build gen/transform_reduce.dxil: dxil gen/transform_reduce.hlsl
build gen/transform_reduce.msl: msl gen/transform_reduce.spv
build gen/transform_root.spv: glsl transform_scan.comp
flags = -DROOT
build gen/transform_root.hlsl: hlsl gen/transform_root.spv
build gen/transform_root.dxil: dxil gen/transform_root.hlsl
build gen/transform_root.msl: msl gen/transform_root.spv
build gen/transform_leaf.spv: glsl transform_leaf.comp | scene.h tile.h setup.h mem.h
build gen/transform_leaf.hlsl: hlsl gen/transform_leaf.spv
build gen/transform_leaf.dxil: dxil gen/transform_leaf.hlsl
build gen/transform_leaf.msl: msl gen/transform_leaf.spv

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,219 @@
struct Alloc
{
uint offset;
};
struct TransformRef
{
uint offset;
};
struct Transform
{
float4 mat;
float2 translate;
};
struct TransformSegRef
{
uint offset;
};
struct TransformSeg
{
float4 mat;
float2 translate;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
uint n_trans;
uint trans_offset;
};
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
RWByteAddressBuffer _71 : register(u0);
ByteAddressBuffer _96 : register(t2);
ByteAddressBuffer _278 : register(t1);
ByteAddressBuffer _377 : register(t3);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_WorkGroupID : SV_GroupID;
uint3 gl_LocalInvocationID : SV_GroupThreadID;
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
groupshared Transform sh_scratch[512];
Transform Transform_read(TransformRef ref)
{
uint ix = ref.offset >> uint(2);
uint raw0 = _96.Load((ix + 0u) * 4 + 0);
uint raw1 = _96.Load((ix + 1u) * 4 + 0);
uint raw2 = _96.Load((ix + 2u) * 4 + 0);
uint raw3 = _96.Load((ix + 3u) * 4 + 0);
uint raw4 = _96.Load((ix + 4u) * 4 + 0);
uint raw5 = _96.Load((ix + 5u) * 4 + 0);
Transform s;
s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
s.translate = float2(asfloat(raw4), asfloat(raw5));
return s;
}
TransformRef Transform_index(TransformRef ref, uint index)
{
TransformRef _85 = { ref.offset + (index * 24u) };
return _85;
}
Transform combine_monoid(Transform a, Transform b)
{
Transform c;
c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
return c;
}
Transform monoid_identity()
{
return _224;
}
bool touch_mem(Alloc alloc, uint offset)
{
return true;
}
void write_mem(Alloc alloc, uint offset, uint val)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
_71.Store(offset * 4 + 8, val);
}
void TransformSeg_write(Alloc a, TransformSegRef ref, TransformSeg s)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = asuint(s.mat.x);
write_mem(param, param_1, param_2);
Alloc param_3 = a;
uint param_4 = ix + 1u;
uint param_5 = asuint(s.mat.y);
write_mem(param_3, param_4, param_5);
Alloc param_6 = a;
uint param_7 = ix + 2u;
uint param_8 = asuint(s.mat.z);
write_mem(param_6, param_7, param_8);
Alloc param_9 = a;
uint param_10 = ix + 3u;
uint param_11 = asuint(s.mat.w);
write_mem(param_9, param_10, param_11);
Alloc param_12 = a;
uint param_13 = ix + 4u;
uint param_14 = asuint(s.translate.x);
write_mem(param_12, param_13, param_14);
Alloc param_15 = a;
uint param_16 = ix + 5u;
uint param_17 = asuint(s.translate.y);
write_mem(param_15, param_16, param_17);
}
void comp_main()
{
uint ix = gl_GlobalInvocationID.x * 8u;
TransformRef _285 = { _278.Load(44) + (ix * 24u) };
TransformRef ref = _285;
TransformRef param = ref;
Transform agg = Transform_read(param);
Transform local[8];
local[0] = agg;
for (uint i = 1u; i < 8u; i++)
{
TransformRef param_1 = ref;
uint param_2 = i;
TransformRef param_3 = Transform_index(param_1, param_2);
Transform param_4 = agg;
Transform param_5 = Transform_read(param_3);
agg = combine_monoid(param_4, param_5);
local[i] = agg;
}
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Transform param_6 = other;
Transform param_7 = agg;
agg = combine_monoid(param_6, param_7);
}
GroupMemoryBarrierWithGroupSync();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
GroupMemoryBarrierWithGroupSync();
Transform row = monoid_identity();
if (gl_WorkGroupID.x > 0u)
{
Transform _383;
_383.mat = asfloat(_377.Load4((gl_WorkGroupID.x - 1u) * 32 + 0));
_383.translate = asfloat(_377.Load2((gl_WorkGroupID.x - 1u) * 32 + 16));
row.mat = _383.mat;
row.translate = _383.translate;
}
if (gl_LocalInvocationID.x > 0u)
{
Transform param_8 = row;
Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u];
row = combine_monoid(param_8, param_9);
}
Alloc param_12;
for (uint i_2 = 0u; i_2 < 8u; i_2++)
{
Transform param_10 = row;
Transform param_11 = local[i_2];
Transform m = combine_monoid(param_10, param_11);
TransformSeg _423 = { m.mat, m.translate };
TransformSeg transform = _423;
TransformSegRef _433 = { _278.Load(36) + ((ix + i_2) * 24u) };
TransformSegRef trans_ref = _433;
Alloc _437;
_437.offset = _278.Load(36);
param_12.offset = _437.offset;
TransformSegRef param_13 = trans_ref;
TransformSeg param_14 = transform;
TransformSeg_write(param_12, param_13, param_14);
}
}
[numthreads(512, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_WorkGroupID = stage_input.gl_WorkGroupID;
gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
comp_main();
}

View file

@ -0,0 +1,272 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wmissing-braces"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
template<typename T, size_t Num>
struct spvUnsafeArray
{
T elements[Num ? Num : 1];
thread T& operator [] (size_t pos) thread
{
return elements[pos];
}
constexpr const thread T& operator [] (size_t pos) const thread
{
return elements[pos];
}
device T& operator [] (size_t pos) device
{
return elements[pos];
}
constexpr const device T& operator [] (size_t pos) const device
{
return elements[pos];
}
constexpr const constant T& operator [] (size_t pos) const constant
{
return elements[pos];
}
threadgroup T& operator [] (size_t pos) threadgroup
{
return elements[pos];
}
constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
{
return elements[pos];
}
};
struct Alloc
{
uint offset;
};
struct TransformRef
{
uint offset;
};
struct Transform
{
float4 mat;
float2 translate;
};
struct TransformSegRef
{
uint offset;
};
struct TransformSeg
{
float4 mat;
float2 translate;
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
struct SceneBuf
{
uint scene[1];
};
struct Alloc_1
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc_1 tile_alloc;
Alloc_1 bin_alloc;
Alloc_1 ptcl_alloc;
Alloc_1 pathseg_alloc;
Alloc_1 anno_alloc;
Alloc_1 trans_alloc;
uint n_trans;
uint trans_offset;
};
struct ConfigBuf
{
Config conf;
};
struct Transform_1
{
float4 mat;
float2 translate;
char _m0_final_padding[8];
};
struct ParentBuf
{
Transform_1 parent[1];
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
static inline __attribute__((always_inline))
Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96)
{
uint ix = ref.offset >> uint(2);
uint raw0 = v_96.scene[ix + 0u];
uint raw1 = v_96.scene[ix + 1u];
uint raw2 = v_96.scene[ix + 2u];
uint raw3 = v_96.scene[ix + 3u];
uint raw4 = v_96.scene[ix + 4u];
uint raw5 = v_96.scene[ix + 5u];
Transform s;
s.mat = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
s.translate = float2(as_type<float>(raw4), as_type<float>(raw5));
return s;
}
static inline __attribute__((always_inline))
TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index)
{
return TransformRef{ ref.offset + (index * 24u) };
}
static inline __attribute__((always_inline))
Transform combine_monoid(thread const Transform& a, thread const Transform& b)
{
Transform c;
c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
return c;
}
static inline __attribute__((always_inline))
Transform monoid_identity()
{
return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) };
}
static inline __attribute__((always_inline))
bool touch_mem(thread const Alloc& alloc, thread const uint& offset)
{
return true;
}
static inline __attribute__((always_inline))
void write_mem(thread const Alloc& alloc, thread const uint& offset, thread const uint& val, device Memory& v_71)
{
Alloc param = alloc;
uint param_1 = offset;
if (!touch_mem(param, param_1))
{
return;
}
v_71.memory[offset] = val;
}
static inline __attribute__((always_inline))
void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref, thread const TransformSeg& s, device Memory& v_71)
{
uint ix = ref.offset >> uint(2);
Alloc param = a;
uint param_1 = ix + 0u;
uint param_2 = as_type<uint>(s.mat.x);
write_mem(param, param_1, param_2, v_71);
Alloc param_3 = a;
uint param_4 = ix + 1u;
uint param_5 = as_type<uint>(s.mat.y);
write_mem(param_3, param_4, param_5, v_71);
Alloc param_6 = a;
uint param_7 = ix + 2u;
uint param_8 = as_type<uint>(s.mat.z);
write_mem(param_6, param_7, param_8, v_71);
Alloc param_9 = a;
uint param_10 = ix + 3u;
uint param_11 = as_type<uint>(s.mat.w);
write_mem(param_9, param_10, param_11, v_71);
Alloc param_12 = a;
uint param_13 = ix + 4u;
uint param_14 = as_type<uint>(s.translate.x);
write_mem(param_12, param_13, param_14, v_71);
Alloc param_15 = a;
uint param_16 = ix + 5u;
uint param_17 = as_type<uint>(s.translate.y);
write_mem(param_15, param_16, param_17, v_71);
}
kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _377 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
{
threadgroup Transform sh_scratch[512];
uint ix = gl_GlobalInvocationID.x * 8u;
TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) };
TransformRef param = ref;
Transform agg = Transform_read(param, v_96);
spvUnsafeArray<Transform, 8> local;
local[0] = agg;
for (uint i = 1u; i < 8u; i++)
{
TransformRef param_1 = ref;
uint param_2 = i;
TransformRef param_3 = Transform_index(param_1, param_2);
Transform param_4 = agg;
Transform param_5 = Transform_read(param_3, v_96);
agg = combine_monoid(param_4, param_5);
local[i] = agg;
}
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Transform param_6 = other;
Transform param_7 = agg;
agg = combine_monoid(param_6, param_7);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sh_scratch[gl_LocalInvocationID.x] = agg;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
Transform row = monoid_identity();
if (gl_WorkGroupID.x > 0u)
{
uint _380 = gl_WorkGroupID.x - 1u;
row.mat = _377.parent[_380].mat;
row.translate = _377.parent[_380].translate;
}
if (gl_LocalInvocationID.x > 0u)
{
Transform param_8 = row;
Transform param_9 = sh_scratch[gl_LocalInvocationID.x - 1u];
row = combine_monoid(param_8, param_9);
}
Alloc param_12;
for (uint i_2 = 0u; i_2 < 8u; i_2++)
{
Transform param_10 = row;
Transform param_11 = local[i_2];
Transform m = combine_monoid(param_10, param_11);
TransformSeg transform = TransformSeg{ m.mat, m.translate };
TransformSegRef trans_ref = TransformSegRef{ _278.conf.trans_alloc.offset + ((ix + i_2) * 24u) };
param_12.offset = _278.conf.trans_alloc.offset;
TransformSegRef param_13 = trans_ref;
TransformSeg param_14 = transform;
TransformSeg_write(param_12, param_13, param_14, v_71);
}
}

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,125 @@
struct TransformRef
{
uint offset;
};
struct Transform
{
float4 mat;
float2 translate;
};
struct Alloc
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
uint n_trans;
uint trans_offset;
};
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
ByteAddressBuffer _49 : register(t2);
ByteAddressBuffer _161 : register(t1);
RWByteAddressBuffer _251 : register(u3);
RWByteAddressBuffer _267 : register(u0);
static uint3 gl_WorkGroupID;
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_WorkGroupID : SV_GroupID;
uint3 gl_LocalInvocationID : SV_GroupThreadID;
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
groupshared Transform sh_scratch[512];
Transform Transform_read(TransformRef ref)
{
uint ix = ref.offset >> uint(2);
uint raw0 = _49.Load((ix + 0u) * 4 + 0);
uint raw1 = _49.Load((ix + 1u) * 4 + 0);
uint raw2 = _49.Load((ix + 2u) * 4 + 0);
uint raw3 = _49.Load((ix + 3u) * 4 + 0);
uint raw4 = _49.Load((ix + 4u) * 4 + 0);
uint raw5 = _49.Load((ix + 5u) * 4 + 0);
Transform s;
s.mat = float4(asfloat(raw0), asfloat(raw1), asfloat(raw2), asfloat(raw3));
s.translate = float2(asfloat(raw4), asfloat(raw5));
return s;
}
TransformRef Transform_index(TransformRef ref, uint index)
{
TransformRef _37 = { ref.offset + (index * 24u) };
return _37;
}
Transform combine_monoid(Transform a, Transform b)
{
Transform c;
c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
return c;
}
void comp_main()
{
uint ix = gl_GlobalInvocationID.x * 8u;
TransformRef _168 = { _161.Load(44) + (ix * 24u) };
TransformRef ref = _168;
TransformRef param = ref;
Transform agg = Transform_read(param);
for (uint i = 1u; i < 8u; i++)
{
TransformRef param_1 = ref;
uint param_2 = i;
TransformRef param_3 = Transform_index(param_1, param_2);
Transform param_4 = agg;
Transform param_5 = Transform_read(param_3);
agg = combine_monoid(param_4, param_5);
}
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
{
Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
Transform param_6 = agg;
Transform param_7 = other;
agg = combine_monoid(param_6, param_7);
}
GroupMemoryBarrierWithGroupSync();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
if (gl_LocalInvocationID.x == 0u)
{
_251.Store4(gl_WorkGroupID.x * 32 + 0, asuint(agg.mat));
_251.Store2(gl_WorkGroupID.x * 32 + 16, asuint(agg.translate));
}
}
[numthreads(512, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_WorkGroupID = stage_input.gl_WorkGroupID;
gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
comp_main();
}

View file

@ -0,0 +1,138 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct TransformRef
{
uint offset;
};
struct Transform
{
float4 mat;
float2 translate;
};
struct SceneBuf
{
uint scene[1];
};
struct Alloc
{
uint offset;
};
struct Config
{
uint n_elements;
uint n_pathseg;
uint width_in_tiles;
uint height_in_tiles;
Alloc tile_alloc;
Alloc bin_alloc;
Alloc ptcl_alloc;
Alloc pathseg_alloc;
Alloc anno_alloc;
Alloc trans_alloc;
uint n_trans;
uint trans_offset;
};
struct ConfigBuf
{
Config conf;
};
struct Transform_1
{
float4 mat;
float2 translate;
char _m0_final_padding[8];
};
struct OutBuf
{
Transform_1 outbuf[1];
};
struct Memory
{
uint mem_offset;
uint mem_error;
uint memory[1];
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
static inline __attribute__((always_inline))
Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_49)
{
uint ix = ref.offset >> uint(2);
uint raw0 = v_49.scene[ix + 0u];
uint raw1 = v_49.scene[ix + 1u];
uint raw2 = v_49.scene[ix + 2u];
uint raw3 = v_49.scene[ix + 3u];
uint raw4 = v_49.scene[ix + 4u];
uint raw5 = v_49.scene[ix + 5u];
Transform s;
s.mat = float4(as_type<float>(raw0), as_type<float>(raw1), as_type<float>(raw2), as_type<float>(raw3));
s.translate = float2(as_type<float>(raw4), as_type<float>(raw5));
return s;
}
static inline __attribute__((always_inline))
TransformRef Transform_index(thread const TransformRef& ref, thread const uint& index)
{
return TransformRef{ ref.offset + (index * 24u) };
}
static inline __attribute__((always_inline))
Transform combine_monoid(thread const Transform& a, thread const Transform& b)
{
Transform c;
c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
return c;
}
kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], device OutBuf& _251 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]])
{
threadgroup Transform sh_scratch[512];
uint ix = gl_GlobalInvocationID.x * 8u;
TransformRef ref = TransformRef{ _161.conf.trans_offset + (ix * 24u) };
TransformRef param = ref;
Transform agg = Transform_read(param, v_49);
for (uint i = 1u; i < 8u; i++)
{
TransformRef param_1 = ref;
uint param_2 = i;
TransformRef param_3 = Transform_index(param_1, param_2);
Transform param_4 = agg;
Transform param_5 = Transform_read(param_3, v_49);
agg = combine_monoid(param_4, param_5);
}
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u)
{
Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)];
Transform param_6 = agg;
Transform param_7 = other;
agg = combine_monoid(param_6, param_7);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sh_scratch[gl_LocalInvocationID.x] = agg;
}
if (gl_LocalInvocationID.x == 0u)
{
_251.outbuf[gl_WorkGroupID.x].mat = agg.mat;
_251.outbuf[gl_WorkGroupID.x].translate = agg.translate;
}
}

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,94 @@
struct Transform
{
float4 mat;
float2 translate;
};
static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u);
static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx };
RWByteAddressBuffer _89 : register(u0);
static uint3 gl_LocalInvocationID;
static uint3 gl_GlobalInvocationID;
struct SPIRV_Cross_Input
{
uint3 gl_LocalInvocationID : SV_GroupThreadID;
uint3 gl_GlobalInvocationID : SV_DispatchThreadID;
};
groupshared Transform sh_scratch[512];
Transform combine_monoid(Transform a, Transform b)
{
Transform c;
c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
return c;
}
Transform monoid_identity()
{
return _23;
}
void comp_main()
{
uint ix = gl_GlobalInvocationID.x * 8u;
Transform _93;
_93.mat = asfloat(_89.Load4(ix * 32 + 0));
_93.translate = asfloat(_89.Load2(ix * 32 + 16));
Transform local[8];
local[0].mat = _93.mat;
local[0].translate = _93.translate;
Transform param_1;
for (uint i = 1u; i < 8u; i++)
{
Transform param = local[i - 1u];
Transform _119;
_119.mat = asfloat(_89.Load4((ix + i) * 32 + 0));
_119.translate = asfloat(_89.Load2((ix + i) * 32 + 16));
param_1.mat = _119.mat;
param_1.translate = _119.translate;
local[i] = combine_monoid(param, param_1);
}
Transform agg = local[7];
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
GroupMemoryBarrierWithGroupSync();
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Transform param_2 = other;
Transform param_3 = agg;
agg = combine_monoid(param_2, param_3);
}
GroupMemoryBarrierWithGroupSync();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
GroupMemoryBarrierWithGroupSync();
Transform row = monoid_identity();
if (gl_LocalInvocationID.x > 0u)
{
row = sh_scratch[gl_LocalInvocationID.x - 1u];
}
for (uint i_2 = 0u; i_2 < 8u; i_2++)
{
Transform param_4 = row;
Transform param_5 = local[i_2];
Transform m = combine_monoid(param_4, param_5);
uint _209 = ix + i_2;
_89.Store4(_209 * 32 + 0, asuint(m.mat));
_89.Store2(_209 * 32 + 16, asuint(m.translate));
}
}
[numthreads(512, 1, 1)]
void main(SPIRV_Cross_Input stage_input)
{
gl_LocalInvocationID = stage_input.gl_LocalInvocationID;
gl_GlobalInvocationID = stage_input.gl_GlobalInvocationID;
comp_main();
}

View file

@ -0,0 +1,129 @@
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wmissing-braces"
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
template<typename T, size_t Num>
struct spvUnsafeArray
{
T elements[Num ? Num : 1];
thread T& operator [] (size_t pos) thread
{
return elements[pos];
}
constexpr const thread T& operator [] (size_t pos) const thread
{
return elements[pos];
}
device T& operator [] (size_t pos) device
{
return elements[pos];
}
constexpr const device T& operator [] (size_t pos) const device
{
return elements[pos];
}
constexpr const constant T& operator [] (size_t pos) const constant
{
return elements[pos];
}
threadgroup T& operator [] (size_t pos) threadgroup
{
return elements[pos];
}
constexpr const threadgroup T& operator [] (size_t pos) const threadgroup
{
return elements[pos];
}
};
struct Transform
{
float4 mat;
float2 translate;
};
struct Transform_1
{
float4 mat;
float2 translate;
char _m0_final_padding[8];
};
struct DataBuf
{
Transform_1 data[1];
};
constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u);
static inline __attribute__((always_inline))
Transform combine_monoid(thread const Transform& a, thread const Transform& b)
{
Transform c;
c.mat = (a.mat.xyxy * b.mat.xxzz) + (a.mat.zwzw * b.mat.yyww);
c.translate = ((a.mat.xy * b.translate.x) + (a.mat.zw * b.translate.y)) + a.translate;
return c;
}
static inline __attribute__((always_inline))
Transform monoid_identity()
{
return Transform{ float4(1.0, 0.0, 0.0, 1.0), float2(0.0) };
}
kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
{
threadgroup Transform sh_scratch[512];
uint ix = gl_GlobalInvocationID.x * 8u;
spvUnsafeArray<Transform, 8> local;
local[0].mat = _89.data[ix].mat;
local[0].translate = _89.data[ix].translate;
Transform param_1;
for (uint i = 1u; i < 8u; i++)
{
uint _113 = ix + i;
Transform param = local[i - 1u];
param_1.mat = _89.data[_113].mat;
param_1.translate = _89.data[_113].translate;
local[i] = combine_monoid(param, param_1);
}
Transform agg = local[7];
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i_1 = 0u; i_1 < 9u; i_1++)
{
threadgroup_barrier(mem_flags::mem_threadgroup);
if (gl_LocalInvocationID.x >= (1u << i_1))
{
Transform other = sh_scratch[gl_LocalInvocationID.x - (1u << i_1)];
Transform param_2 = other;
Transform param_3 = agg;
agg = combine_monoid(param_2, param_3);
}
threadgroup_barrier(mem_flags::mem_threadgroup);
sh_scratch[gl_LocalInvocationID.x] = agg;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
Transform row = monoid_identity();
if (gl_LocalInvocationID.x > 0u)
{
row = sh_scratch[gl_LocalInvocationID.x - 1u];
}
for (uint i_2 = 0u; i_2 < 8u; i_2++)
{
Transform param_4 = row;
Transform param_5 = local[i_2];
Transform m = combine_monoid(param_4, param_5);
uint _209 = ix + i_2;
_89.data[_209].mat = m.mat;
_89.data[_209].translate = m.translate;
}
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -38,6 +38,12 @@ struct Config {
Alloc pathseg_alloc; Alloc pathseg_alloc;
Alloc anno_alloc; Alloc anno_alloc;
Alloc trans_alloc; Alloc trans_alloc;
// new element pipeline stuff follows
// Number of transforms in scene
uint n_trans;
// Offset (in bytes) of transform stream in scene buffer
uint trans_offset;
}; };
// Fill modes. // Fill modes.

Binary file not shown.

View file

@ -0,0 +1,86 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// A scan for a tree reduction prefix scan that outputs the final result.
// Output is written into memory at trans_alloc.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define N_ROWS 8
#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
layout(binding = 1) readonly buffer ConfigBuf {
Config conf;
};
layout(binding = 2) readonly buffer SceneBuf {
uint[] scene;
};
#include "scene.h"
#include "tile.h"
#define Monoid Transform
layout(set = 0, binding = 3) readonly buffer ParentBuf {
Monoid[] parent;
};
Monoid monoid_identity() {
return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0));
}
Monoid combine_monoid(Monoid a, Monoid b) {
Monoid c;
c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww;
c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate;
return c;
}
shared Monoid sh_scratch[WG_SIZE];
void main() {
Monoid local[N_ROWS];
uint ix = gl_GlobalInvocationID.x * N_ROWS;
TransformRef ref = TransformRef(conf.trans_offset + ix * Transform_size);
Monoid agg = Transform_read(ref);
local[0] = agg;
for (uint i = 1; i < N_ROWS; i++) {
agg = combine_monoid(agg, Transform_read(Transform_index(ref, i)));
local[i] = agg;
}
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1u << i)) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
agg = combine_monoid(other, agg);
}
barrier();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
barrier();
Monoid row = monoid_identity();
if (gl_WorkGroupID.x > 0) {
row = parent[gl_WorkGroupID.x - 1];
}
if (gl_LocalInvocationID.x > 0) {
row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
}
for (uint i = 0; i < N_ROWS; i++) {
Monoid m = combine_monoid(row, local[i]);
TransformSeg transform = TransformSeg(m.mat, m.translate);
TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (ix + i) * TransformSeg_size);
TransformSeg_write(conf.trans_alloc, trans_ref, transform);
}
}

View file

@ -0,0 +1,69 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// The reduction phase for transform scan implemented as a tree reduction.
#version 450
#extension GL_GOOGLE_include_directive : enable
#include "mem.h"
#include "setup.h"
#define N_ROWS 8
#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
layout(binding = 1) readonly buffer ConfigBuf {
Config conf;
};
layout(binding = 2) readonly buffer SceneBuf {
uint[] scene;
};
#include "scene.h"
#define Monoid Transform
layout(set = 0, binding = 3) buffer OutBuf {
Monoid[] outbuf;
};
Monoid monoid_identity() {
return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0));
}
Monoid combine_monoid(Monoid a, Monoid b) {
Monoid c;
c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww;
c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate;
return c;
}
shared Monoid sh_scratch[WG_SIZE];
void main() {
uint ix = gl_GlobalInvocationID.x * N_ROWS;
TransformRef ref = TransformRef(conf.trans_offset + ix * Transform_size);
Monoid agg = Transform_read(ref);
for (uint i = 1; i < N_ROWS; i++) {
agg = combine_monoid(agg, Transform_read(Transform_index(ref, i)));
}
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
// We could make this predicate tighter, but would it help?
if (gl_LocalInvocationID.x + (1u << i) < WG_SIZE) {
Monoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i)];
agg = combine_monoid(agg, other);
}
barrier();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
if (gl_LocalInvocationID.x == 0) {
outbuf[gl_WorkGroupID.x] = agg;
}
}

View file

@ -0,0 +1,89 @@
// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
// A scan for a tree reduction prefix scan (either root or not, by ifdef).
#version 450
#define N_ROWS 8
#define LG_WG_SIZE 9
#define WG_SIZE (1 << LG_WG_SIZE)
#define PARTITION_SIZE (WG_SIZE * N_ROWS)
layout(local_size_x = WG_SIZE, local_size_y = 1) in;
// This is copy-pasted from scene.h. It might be better for DRY
// to include it, but that pulls in more stuff we don't need.
struct Transform {
vec4 mat;
vec2 translate;
};
#define Monoid Transform
layout(binding = 0) buffer DataBuf {
Monoid[] data;
};
#ifndef ROOT
layout(binding = 1) readonly buffer ParentBuf {
Monoid[] parent;
};
#endif
Monoid monoid_identity() {
return Monoid(vec4(1.0, 0.0, 0.0, 1.0), vec2(0.0, 0.0));
}
Monoid combine_monoid(Monoid a, Monoid b) {
Monoid c;
c.mat = a.mat.xyxy * b.mat.xxzz + a.mat.zwzw * b.mat.yyww;
c.translate = a.mat.xy * b.translate.x + a.mat.zw * b.translate.y + a.translate;
return c;
}
shared Monoid sh_scratch[WG_SIZE];
void main() {
Monoid local[N_ROWS];
uint ix = gl_GlobalInvocationID.x * N_ROWS;
// TODO: gate buffer read
local[0] = data[ix];
for (uint i = 1; i < N_ROWS; i++) {
local[i] = combine_monoid(local[i - 1], data[ix + i]);
}
Monoid agg = local[N_ROWS - 1];
sh_scratch[gl_LocalInvocationID.x] = agg;
for (uint i = 0; i < LG_WG_SIZE; i++) {
barrier();
if (gl_LocalInvocationID.x >= (1u << i)) {
Monoid other = sh_scratch[gl_LocalInvocationID.x - (1u << i)];
agg = combine_monoid(other, agg);
}
barrier();
sh_scratch[gl_LocalInvocationID.x] = agg;
}
barrier();
// This could be a semigroup instead of a monoid if we reworked the
// conditional logic, but that might impact performance.
Monoid row = monoid_identity();
#ifdef ROOT
if (gl_LocalInvocationID.x > 0) {
row = sh_scratch[gl_LocalInvocationID.x - 1];
}
#else
if (gl_WorkGroupID.x > 0) {
row = parent[gl_WorkGroupID.x - 1];
}
if (gl_LocalInvocationID.x > 0) {
row = combine_monoid(row, sh_scratch[gl_LocalInvocationID.x - 1]);
}
#endif
for (uint i = 0; i < N_ROWS; i++) {
Monoid m = combine_monoid(row, local[i]);
// TODO: gate buffer write
data[ix + i] = m;
}
}

View file

@ -1,6 +1,7 @@
mod gradient; mod gradient;
mod pico_svg; mod pico_svg;
mod render_ctx; mod render_ctx;
pub mod stages;
pub mod test_scenes; pub mod test_scenes;
mod text; mod text;
@ -20,6 +21,8 @@ use piet_gpu_hal::{
use pico_svg::PicoSvg; use pico_svg::PicoSvg;
use crate::stages::Config;
const TILE_W: usize = 16; const TILE_W: usize = 16;
const TILE_H: usize = 16; const TILE_H: usize = 16;
@ -123,7 +126,7 @@ impl Renderer {
let image_dev = session.create_image2d(width as u32, height as u32)?; let image_dev = session.create_image2d(width as u32, height as u32)?;
// Note: this must be updated when the config struct size changes. // Note: this must be updated when the config struct size changes.
const CONFIG_BUFFER_SIZE: u64 = 40; const CONFIG_BUFFER_SIZE: u64 = std::mem::size_of::<Config>() as u64;
let config_buf = session.create_buffer(CONFIG_BUFFER_SIZE, dev).unwrap(); let config_buf = session.create_buffer(CONFIG_BUFFER_SIZE, dev).unwrap();
// TODO: separate staging buffer (if needed) // TODO: separate staging buffer (if needed)
let config_bufs = (0..n_bufs) let config_bufs = (0..n_bufs)
@ -295,25 +298,28 @@ impl Renderer {
alloc += (n_paths * ANNO_SIZE + 3) & !3; alloc += (n_paths * ANNO_SIZE + 3) & !3;
let trans_base = alloc; let trans_base = alloc;
alloc += (n_trans * TRANS_SIZE + 3) & !3; alloc += (n_trans * TRANS_SIZE + 3) & !3;
let config = &[ let trans_offset = 0; // For new element pipeline, not yet used
n_paths as u32, let config = Config {
n_pathseg as u32, n_elements: n_paths as u32,
width_in_tiles as u32, n_pathseg: n_pathseg as u32,
height_in_tiles as u32, width_in_tiles: width_in_tiles as u32,
tile_base as u32, height_in_tiles: height_in_tiles as u32,
bin_base as u32, tile_alloc: tile_base as u32,
ptcl_base as u32, bin_alloc: bin_base as u32,
pathseg_base as u32, ptcl_alloc: ptcl_base as u32,
anno_base as u32, pathseg_alloc: pathseg_base as u32,
trans_base as u32, anno_alloc: anno_base as u32,
]; trans_alloc: trans_base as u32,
n_trans: n_trans as u32,
trans_offset: trans_offset as u32,
};
unsafe { unsafe {
let scene = render_ctx.get_scene_buf(); let scene = render_ctx.get_scene_buf();
self.n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size(); self.n_elements = scene.len() / piet_gpu_types::scene::Element::fixed_size();
// TODO: reallocate scene buffer if size is inadequate // TODO: reallocate scene buffer if size is inadequate
assert!(self.scene_bufs[buf_ix].size() as usize >= scene.len()); assert!(self.scene_bufs[buf_ix].size() as usize >= scene.len());
self.scene_bufs[buf_ix].write(scene)?; self.scene_bufs[buf_ix].write(scene)?;
self.config_bufs[buf_ix].write(config)?; self.config_bufs[buf_ix].write(&[config])?;
self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?; self.memory_buf_host[buf_ix].write(&[alloc as u32, 0 /* Overflow flag */])?;
// Upload gradient data. // Upload gradient data.

209
piet-gpu/src/stages.rs Normal file
View file

@ -0,0 +1,209 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
//! Stages for new element pipeline, exposed for testing.
use bytemuck::{Pod, Zeroable};
use piet::kurbo::Affine;
use piet_gpu_hal::{
include_shader, BindType, Buffer, BufferUsage, CmdBuf, DescriptorSet, Pipeline, Session,
};
/// The configuration block passed to piet-gpu shaders.
///
/// Note: this should be kept in sync with the version in setup.h.
#[repr(C)]
#[derive(Clone, Copy, Default, Zeroable, Pod)]
pub struct Config {
pub n_elements: u32, // paths
pub n_pathseg: u32,
pub width_in_tiles: u32,
pub height_in_tiles: u32,
pub tile_alloc: u32,
pub bin_alloc: u32,
pub ptcl_alloc: u32,
pub pathseg_alloc: u32,
pub anno_alloc: u32,
pub trans_alloc: u32,
pub n_trans: u32,
pub trans_offset: u32,
}
// The individual stages will probably be separate files but for now, all in one.
// This is equivalent to the version in piet-gpu-types, but the bytemuck
// representation will likely be faster.
#[repr(C)]
#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
pub struct Transform {
pub mat: [f32; 4],
pub translate: [f32; 2],
}
const TRANSFORM_WG: u64 = 512;
const TRANSFORM_N_ROWS: u64 = 8;
const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS;
pub struct TransformCode {
reduce_pipeline: Pipeline,
root_pipeline: Pipeline,
leaf_pipeline: Pipeline,
}
pub struct TransformStage {
// Right now we're limited to partition^2 (~16M) elements. This can be
// expanded but is tedious.
root_buf: Buffer,
root_ds: DescriptorSet,
}
pub struct TransformBinding {
reduce_ds: DescriptorSet,
leaf_ds: DescriptorSet,
}
impl TransformCode {
pub unsafe fn new(session: &Session) -> TransformCode {
let reduce_code = include_shader!(session, "../shader/gen/transform_reduce");
let reduce_pipeline = session
.create_compute_pipeline(
reduce_code,
&[
BindType::Buffer,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::Buffer,
],
)
.unwrap();
let root_code = include_shader!(session, "../shader/gen/transform_root");
let root_pipeline = session
.create_compute_pipeline(root_code, &[BindType::Buffer])
.unwrap();
let leaf_code = include_shader!(session, "../shader/gen/transform_leaf");
let leaf_pipeline = session
.create_compute_pipeline(
leaf_code,
&[
BindType::Buffer,
BindType::BufReadOnly,
BindType::BufReadOnly,
BindType::BufReadOnly,
],
)
.unwrap();
TransformCode {
reduce_pipeline,
root_pipeline,
leaf_pipeline,
}
}
}
impl TransformStage {
pub unsafe fn new(session: &Session, code: &TransformCode) -> TransformStage {
// We're limited to TRANSFORM_PART_SIZE^2
// Also note: size here allows padding
let root_buf_size = TRANSFORM_PART_SIZE * 32;
let root_buf = session
.create_buffer(root_buf_size, BufferUsage::STORAGE)
.unwrap();
let root_ds = session
.create_simple_descriptor_set(&code.root_pipeline, &[&root_buf])
.unwrap();
TransformStage { root_buf, root_ds }
}
pub unsafe fn bind(
&self,
session: &Session,
code: &TransformCode,
config_buf: &Buffer,
scene_buf: &Buffer,
memory_buf: &Buffer,
) -> TransformBinding {
let reduce_ds = session
.create_simple_descriptor_set(
&code.reduce_pipeline,
&[memory_buf, config_buf, scene_buf, &self.root_buf],
)
.unwrap();
let leaf_ds = session
.create_simple_descriptor_set(
&code.leaf_pipeline,
&[memory_buf, config_buf, scene_buf, &self.root_buf],
)
.unwrap();
TransformBinding { reduce_ds, leaf_ds }
}
pub unsafe fn record(
&self,
cmd_buf: &mut CmdBuf,
code: &TransformCode,
binding: &TransformBinding,
size: u64,
) {
if size > TRANSFORM_PART_SIZE.pow(2) {
panic!("very large scan not yet implemented");
}
let n_workgroups = (size + TRANSFORM_PART_SIZE - 1) / TRANSFORM_PART_SIZE;
if n_workgroups > 1 {
cmd_buf.dispatch(
&code.reduce_pipeline,
&binding.reduce_ds,
(n_workgroups as u32, 1, 1),
(TRANSFORM_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
cmd_buf.dispatch(
&code.root_pipeline,
&self.root_ds,
(1, 1, 1),
(TRANSFORM_WG as u32, 1, 1),
);
cmd_buf.memory_barrier();
}
cmd_buf.dispatch(
&code.leaf_pipeline,
&binding.leaf_ds,
(n_workgroups as u32, 1, 1),
(TRANSFORM_WG as u32, 1, 1),
);
}
}
impl Transform {
pub fn from_kurbo(a: Affine) -> Transform {
let c = a.as_coeffs();
Transform {
mat: [c[0] as f32, c[1] as f32, c[2] as f32, c[3] as f32],
translate: [c[4] as f32, c[5] as f32],
}
}
pub fn to_kurbo(self) -> Affine {
Affine::new([
self.mat[0] as f64,
self.mat[1] as f64,
self.mat[2] as f64,
self.mat[3] as f64,
self.translate[0] as f64,
self.translate[1] as f64,
])
}
}

View file

@ -6,9 +6,18 @@ description = "Tests for piet-gpu shaders and generic GPU capabilities."
license = "MIT/Apache-2.0" license = "MIT/Apache-2.0"
edition = "2021" edition = "2021"
[features]
default = ["piet-gpu"]
[dependencies] [dependencies]
clap = "2.33" clap = "2.33"
bytemuck = "1.7.2" bytemuck = "1.7.2"
kurbo = "0.7.1"
rand = "0.7.3"
[dependencies.piet-gpu-hal] [dependencies.piet-gpu-hal]
path = "../piet-gpu-hal" path = "../piet-gpu-hal"
[dependencies.piet-gpu]
path = "../piet-gpu"
optional = true

View file

@ -55,12 +55,12 @@ pub unsafe fn run_clear_test(runner: &mut Runner, config: &Config) -> TestResult
commands.write_timestamp(0); commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding); stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1); commands.write_timestamp(1);
if i == 0 { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
} }
total_elapsed += runner.submit(commands); total_elapsed += runner.submit(commands);
if i == 0 { if i == 0 || config.verify_all {
let mut dst: Vec<u32> = Default::default(); let mut dst: Vec<u32> = Default::default();
out_buf.read(&mut dst); out_buf.read(&mut dst);
if let Some(failure) = verify(&dst) { if let Some(failure) = verify(&dst) {

View file

@ -22,6 +22,7 @@ pub struct Config {
pub groups: Groups, pub groups: Groups,
pub size: Size, pub size: Size,
pub n_iter: u64, pub n_iter: u64,
pub verify_all: bool,
} }
pub struct Groups(String); pub struct Groups(String);
@ -40,10 +41,12 @@ impl Config {
.value_of("n_iter") .value_of("n_iter")
.and_then(|s| s.parse().ok()) .and_then(|s| s.parse().ok())
.unwrap_or(1000); .unwrap_or(1000);
let verify_all = matches.is_present("verify_all");
Config { Config {
groups, groups,
size, size,
n_iter, n_iter,
verify_all,
} }
} }
} }

View file

@ -48,12 +48,12 @@ pub unsafe fn run_linkedlist_test(runner: &mut Runner, config: &Config) -> TestR
commands.write_timestamp(0); commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf); stage.record(&mut commands, &code, &binding, &mem_buf.dev_buf);
commands.write_timestamp(1); commands.write_timestamp(1);
if i == 0 { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&mem_buf); commands.download(&mem_buf);
} }
total_elapsed += runner.submit(commands); total_elapsed += runner.submit(commands);
if i == 0 { if i == 0 || config.verify_all {
let mut dst: Vec<u32> = Default::default(); let mut dst: Vec<u32> = Default::default();
mem_buf.read(&mut dst); mem_buf.read(&mut dst);
if !verify(&dst) { if !verify(&dst) {

View file

@ -25,6 +25,9 @@ mod prefix_tree;
mod runner; mod runner;
mod test_result; mod test_result;
#[cfg(feature = "piet-gpu")]
mod transform;
use clap::{App, Arg}; use clap::{App, Arg};
use piet_gpu_hal::InstanceFlags; use piet_gpu_hal::InstanceFlags;
@ -62,6 +65,11 @@ fn main() {
.help("Number of iterations") .help("Number of iterations")
.takes_value(true), .takes_value(true),
) )
.arg(
Arg::with_name("verify_all")
.long("verify_all")
.help("Verify all iterations"),
)
.arg( .arg(
Arg::with_name("dx12") Arg::with_name("dx12")
.long("dx12") .long("dx12")
@ -123,5 +131,9 @@ fn main() {
} }
report(&linkedlist::run_linkedlist_test(&mut runner, &config)); report(&linkedlist::run_linkedlist_test(&mut runner, &config));
} }
#[cfg(feature = "piet-gpu")]
if config.groups.matches("piet") {
report(&transform::transform_test(&mut runner, &config));
}
} }
} }

View file

@ -85,12 +85,12 @@ pub unsafe fn run_prefix_test(
commands.write_timestamp(0); commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding); stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1); commands.write_timestamp(1);
if i == 0 { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
} }
total_elapsed += runner.submit(commands); total_elapsed += runner.submit(commands);
if i == 0 { if i == 0 || config.verify_all {
let mut dst: Vec<u32> = Default::default(); let mut dst: Vec<u32> = Default::default();
out_buf.read(&mut dst); out_buf.read(&mut dst);
if let Some(failure) = verify(&dst) { if let Some(failure) = verify(&dst) {

View file

@ -66,12 +66,12 @@ pub unsafe fn run_prefix_test(runner: &mut Runner, config: &Config) -> TestResul
commands.write_timestamp(0); commands.write_timestamp(0);
stage.record(&mut commands, &code, &binding); stage.record(&mut commands, &code, &binding);
commands.write_timestamp(1); commands.write_timestamp(1);
if i == 0 { if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier(); commands.cmd_buf.memory_barrier();
commands.download(&out_buf); commands.download(&out_buf);
} }
total_elapsed += runner.submit(commands); total_elapsed += runner.submit(commands);
if i == 0 { if i == 0 || config.verify_all {
let mut dst: Vec<u32> = Default::default(); let mut dst: Vec<u32> = Default::default();
out_buf.read(&mut dst); out_buf.read(&mut dst);
if let Some(failure) = verify(&dst) { if let Some(failure) = verify(&dst) {

133
tests/src/transform.rs Normal file
View file

@ -0,0 +1,133 @@
// Copyright 2021 The piet-gpu authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Also licensed under MIT license, at your choice.
//! Tests for piet-gpu shaders.
use crate::{Config, Runner, TestResult};
use kurbo::Affine;
use piet_gpu::stages::{self, Transform, TransformCode, TransformStage};
use piet_gpu_hal::BufferUsage;
use rand::Rng;
struct AffineTestData {
input_data: Vec<Transform>,
expected: Vec<Affine>,
}
pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult {
let mut result = TestResult::new("transform");
let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 24);
// TODO: would be nice to validate with real transform.
let data = AffineTestData::new(n_elements as usize);
let data_buf = runner
.session
.create_buffer_init(&data.input_data, BufferUsage::STORAGE)
.unwrap();
let memory = runner.buf_down(data_buf.size() + 24, BufferUsage::empty());
let stage_config = stages::Config {
n_trans: n_elements as u32,
// This is a hack to get elements aligned.
trans_alloc: 16,
..Default::default()
};
let config_buf = runner
.session
.create_buffer_init(std::slice::from_ref(&stage_config), BufferUsage::STORAGE)
.unwrap();
let code = TransformCode::new(&runner.session);
let stage = TransformStage::new(&runner.session, &code);
let binding = stage.bind(
&runner.session,
&code,
&config_buf,
&data_buf,
&memory.dev_buf,
);
let mut total_elapsed = 0.0;
let n_iter = config.n_iter;
for i in 0..n_iter {
let mut commands = runner.commands();
commands.write_timestamp(0);
stage.record(&mut commands.cmd_buf, &code, &binding, n_elements);
commands.write_timestamp(1);
if i == 0 || config.verify_all {
commands.cmd_buf.memory_barrier();
commands.download(&memory);
}
total_elapsed += runner.submit(commands);
if i == 0 || config.verify_all {
let mut dst: Vec<Transform> = Default::default();
memory.read(&mut dst);
if let Some(failure) = data.verify(&dst[1..]) {
result.fail(failure);
}
}
}
result.timing(total_elapsed, n_elements * n_iter);
result
}
impl AffineTestData {
fn new(n: usize) -> AffineTestData {
let mut rng = rand::thread_rng();
let mut a = Affine::default();
let mut b;
let mut input_data = Vec::with_capacity(n);
let mut expected = Vec::with_capacity(n);
for _ in 0..n {
loop {
b = Affine::new([
rng.gen_range(-10.0, 10.0),
rng.gen_range(-10.0, 10.0),
rng.gen_range(-10.0, 10.0),
rng.gen_range(-10.0, 10.0),
rng.gen_range(-10.0, 10.0),
rng.gen_range(-10.0, 10.0),
]);
if b.determinant() >= 1.0 {
break;
}
}
expected.push(b);
let c = a.inverse() * b;
input_data.push(Transform::from_kurbo(c));
a = b;
}
AffineTestData {
input_data,
expected,
}
}
fn verify(&self, actual: &[Transform]) -> Option<String> {
for (i, (actual, expected)) in actual.iter().zip(&self.expected).enumerate() {
let error: f64 = actual
.to_kurbo()
.as_coeffs()
.iter()
.zip(expected.as_coeffs())
.map(|(actual, expected)| (actual - expected).powi(2))
.sum();
let tolerance = 1e-6 * (i + 1) as f64;
if error > tolerance {
return Some(format!("{}: {} {}", i, error, tolerance));
}
}
None
}
}