diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index 313310e..c2b81fd 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -84,7 +84,7 @@ void main() { if (x0 == x1) y1 = y0; int x = x0, y = y0; uint my_slice = gl_LocalInvocationID.x / 32; - uint my_mask = 1 << (gl_LocalInvocationID.x & 31); + uint my_mask = 1u << (gl_LocalInvocationID.x & 31); while (y < y1) { atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask); x++; diff --git a/piet-gpu/shader/build.ninja b/piet-gpu/shader/build.ninja index 6ed2140..6a59f59 100644 --- a/piet-gpu/shader/build.ninja +++ b/piet-gpu/shader/build.ninja @@ -65,7 +65,7 @@ build gen/transform_reduce.hlsl: hlsl gen/transform_reduce.spv build gen/transform_reduce.dxil: dxil gen/transform_reduce.hlsl build gen/transform_reduce.msl: msl gen/transform_reduce.spv -build gen/transform_root.spv: glsl transform_scan.comp +build gen/transform_root.spv: glsl transform_scan.comp | setup.h flags = -DROOT build gen/transform_root.hlsl: hlsl gen/transform_root.spv build gen/transform_root.dxil: dxil gen/transform_root.hlsl @@ -81,7 +81,7 @@ build gen/pathtag_reduce.hlsl: hlsl gen/pathtag_reduce.spv build gen/pathtag_reduce.dxil: dxil gen/pathtag_reduce.hlsl build gen/pathtag_reduce.msl: msl gen/pathtag_reduce.spv -build gen/pathtag_root.spv: glsl pathtag_scan.comp | pathtag.h +build gen/pathtag_root.spv: glsl pathtag_scan.comp | pathtag.h setup.h flags = -DROOT build gen/pathtag_root.hlsl: hlsl gen/pathtag_root.spv build gen/pathtag_root.dxil: dxil gen/pathtag_root.hlsl @@ -102,7 +102,7 @@ build gen/draw_reduce.hlsl: hlsl gen/draw_reduce.spv build gen/draw_reduce.dxil: dxil gen/draw_reduce.hlsl build gen/draw_reduce.msl: msl gen/draw_reduce.spv -build gen/draw_root.spv: glsl draw_scan.comp | drawtag.h +build gen/draw_root.spv: glsl draw_scan.comp | drawtag.h setup.h flags = -DROOT build gen/draw_root.hlsl: hlsl gen/draw_root.spv build gen/draw_root.dxil: dxil gen/draw_root.hlsl diff --git a/piet-gpu/shader/draw_leaf.comp b/piet-gpu/shader/draw_leaf.comp index 85d9528..5de2652 100644 --- a/piet-gpu/shader/draw_leaf.comp +++ b/piet-gpu/shader/draw_leaf.comp @@ -11,7 +11,7 @@ #include "setup.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/draw_reduce.comp b/piet-gpu/shader/draw_reduce.comp index fe9ab2c..68d43e9 100644 --- a/piet-gpu/shader/draw_reduce.comp +++ b/piet-gpu/shader/draw_reduce.comp @@ -9,7 +9,7 @@ #include "setup.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/draw_scan.comp b/piet-gpu/shader/draw_scan.comp index d883671..2afc9ba 100644 --- a/piet-gpu/shader/draw_scan.comp +++ b/piet-gpu/shader/draw_scan.comp @@ -5,10 +5,11 @@ #version 450 #extension GL_GOOGLE_include_directive : enable +#include "setup.h" #include "drawtag.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/gen/binning.hlsl b/piet-gpu/shader/gen/binning.hlsl index 2b0901e..b13db37 100644 --- a/piet-gpu/shader/gen/binning.hlsl +++ b/piet-gpu/shader/gen/binning.hlsl @@ -248,11 +248,11 @@ void comp_main() int x = x0; int y = y0; uint my_slice = gl_LocalInvocationID.x / 32u; - uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u)); + uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); while (y < y1) { - uint _438; - InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _438); + uint _437; + InterlockedOr(bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, _437); x++; if (x == x1) { @@ -274,8 +274,8 @@ void comp_main() if (element_count != 0u) { uint param_7 = element_count * 4u; - MallocResult _488 = malloc(param_7); - MallocResult chunk = _488; + MallocResult _487 = malloc(param_7); + MallocResult chunk = _487; chunk_alloc = chunk.alloc; sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; if (chunk.failed) @@ -284,31 +284,31 @@ void comp_main() } } uint out_ix = (_253.Load(20) >> uint(2)) + (((my_partition * 256u) + gl_LocalInvocationID.x) * 2u); - Alloc _517; - _517.offset = _253.Load(20); + Alloc _516; + _516.offset = _253.Load(20); Alloc param_8; - param_8.offset = _517.offset; + param_8.offset = _516.offset; uint param_9 = out_ix; uint param_10 = element_count; write_mem(param_8, param_9, param_10); - Alloc _529; - _529.offset = _253.Load(20); + Alloc _528; + _528.offset = _253.Load(20); Alloc param_11; - param_11.offset = _529.offset; + param_11.offset = _528.offset; uint param_12 = out_ix + 1u; uint param_13 = chunk_alloc.offset; write_mem(param_11, param_12, param_13); GroupMemoryBarrierWithGroupSync(); - bool _544; + bool _543; if (!sh_alloc_failed) { - _544 = _84.Load(4) != 0u; + _543 = _84.Load(4) != 0u; } else { - _544 = sh_alloc_failed; + _543 = sh_alloc_failed; } - if (_544) + if (_543) { return; } @@ -327,11 +327,11 @@ void comp_main() } Alloc out_alloc = sh_chunk_alloc[bin_ix]; uint out_offset = out_alloc.offset + (idx * 4u); - BinInstanceRef _606 = { out_offset }; - BinInstance _608 = { element_ix }; + BinInstanceRef _605 = { out_offset }; + BinInstance _607 = { element_ix }; Alloc param_14 = out_alloc; - BinInstanceRef param_15 = _606; - BinInstance param_16 = _608; + BinInstanceRef param_15 = _605; + BinInstance param_16 = _607; BinInstance_write(param_14, param_15, param_16); } x++; diff --git a/piet-gpu/shader/gen/binning.msl b/piet-gpu/shader/gen/binning.msl index f6e0505..42a11ee 100644 --- a/piet-gpu/shader/gen/binning.msl +++ b/piet-gpu/shader/gen/binning.msl @@ -260,10 +260,10 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M int x = x0; int y = y0; uint my_slice = gl_LocalInvocationID.x / 32u; - uint my_mask = uint(1 << int(gl_LocalInvocationID.x & 31u)); + uint my_mask = 1u << (gl_LocalInvocationID.x & 31u); while (y < y1) { - uint _438 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); + uint _437 = atomic_fetch_or_explicit((threadgroup atomic_uint*)&bitmaps[my_slice][(uint(y) * width_in_bins) + uint(x)], my_mask, memory_order_relaxed); x++; if (x == x1) { @@ -285,8 +285,8 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M if (element_count != 0u) { uint param_7 = element_count * 4u; - MallocResult _488 = malloc(param_7, v_84, v_84BufferSize); - MallocResult chunk = _488; + MallocResult _487 = malloc(param_7, v_84, v_84BufferSize); + MallocResult chunk = _487; chunk_alloc = chunk.alloc; sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; if (chunk.failed) @@ -306,16 +306,16 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M uint param_13 = chunk_alloc.offset; write_mem(param_11, param_12, param_13, v_84, v_84BufferSize); threadgroup_barrier(mem_flags::mem_threadgroup); - bool _544; + bool _543; if (!bool(sh_alloc_failed)) { - _544 = v_84.mem_error != 0u; + _543 = v_84.mem_error != 0u; } else { - _544 = bool(sh_alloc_failed); + _543 = bool(sh_alloc_failed); } - if (_544) + if (_543) { return; } diff --git a/piet-gpu/shader/gen/binning.spv b/piet-gpu/shader/gen/binning.spv index 5ec7aec..17043bc 100644 Binary files a/piet-gpu/shader/gen/binning.spv and b/piet-gpu/shader/gen/binning.spv differ diff --git a/piet-gpu/shader/gen/draw_leaf.dxil b/piet-gpu/shader/gen/draw_leaf.dxil index 86b37e9..17bfd04 100644 Binary files a/piet-gpu/shader/gen/draw_leaf.dxil and b/piet-gpu/shader/gen/draw_leaf.dxil differ diff --git a/piet-gpu/shader/gen/draw_leaf.hlsl b/piet-gpu/shader/gen/draw_leaf.hlsl index e3cb387..d0bef52 100644 --- a/piet-gpu/shader/gen/draw_leaf.hlsl +++ b/piet-gpu/shader/gen/draw_leaf.hlsl @@ -151,7 +151,7 @@ struct Config uint pathseg_offset; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const DrawMonoid _418 = { 0u, 0u }; static const DrawMonoid _442 = { 1u, 0u }; @@ -159,8 +159,8 @@ static const DrawMonoid _444 = { 1u, 1u }; RWByteAddressBuffer _201 : register(u0, space0); ByteAddressBuffer _225 : register(t2, space0); -ByteAddressBuffer _1005 : register(t3, space0); -ByteAddressBuffer _1039 : register(t1, space0); +ByteAddressBuffer _1004 : register(t3, space0); +ByteAddressBuffer _1038 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -172,7 +172,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared DrawMonoid sh_scratch[512]; +groupshared DrawMonoid sh_scratch[256]; ElementTag Element_tag(ElementRef ref) { @@ -558,7 +558,7 @@ void comp_main() local[i] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -575,11 +575,11 @@ void comp_main() DrawMonoid row = tag_monoid_identity(); if (gl_WorkGroupID.x > 0u) { - DrawMonoid _1011; - _1011.path_ix = _1005.Load((gl_WorkGroupID.x - 1u) * 8 + 0); - _1011.clip_ix = _1005.Load((gl_WorkGroupID.x - 1u) * 8 + 4); - row.path_ix = _1011.path_ix; - row.clip_ix = _1011.clip_ix; + DrawMonoid _1010; + _1010.path_ix = _1004.Load((gl_WorkGroupID.x - 1u) * 8 + 0); + _1010.clip_ix = _1004.Load((gl_WorkGroupID.x - 1u) * 8 + 4); + row.path_ix = _1010.path_ix; + row.clip_ix = _1010.clip_ix; } if (gl_LocalInvocationID.x > 0u) { @@ -588,9 +588,9 @@ void comp_main() row = combine_tag_monoid(param_10, param_11); } uint out_ix = gl_GlobalInvocationID.x * 8u; - uint out_base = (_1039.Load(44) >> uint(2)) + (out_ix * 2u); - AnnotatedRef _1055 = { _1039.Load(32) + (out_ix * 40u) }; - AnnotatedRef out_ref = _1055; + uint out_base = (_1038.Load(44) >> uint(2)) + (out_ix * 2u); + AnnotatedRef _1054 = { _1038.Load(32) + (out_ix * 40u) }; + AnnotatedRef out_ref = _1054; float4 mat; float2 translate; AnnoColor anno_fill; @@ -617,7 +617,7 @@ void comp_main() tag_word = Element_tag(param_16).tag; if (((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) { - uint bbox_offset = (_1039.Load(40) >> uint(2)) + (6u * (m.path_ix - 1u)); + uint bbox_offset = (_1038.Load(40) >> uint(2)) + (6u * (m.path_ix - 1u)); float bbox_l = float(_201.Load(bbox_offset * 4 + 8)) - 32768.0f; float bbox_t = float(_201.Load((bbox_offset + 1u) * 4 + 8)) - 32768.0f; float bbox_r = float(_201.Load((bbox_offset + 2u) * 4 + 8)) - 32768.0f; @@ -628,7 +628,7 @@ void comp_main() if ((linewidth >= 0.0f) || (tag_word == 5u)) { uint trans_ix = _201.Load((bbox_offset + 5u) * 4 + 8); - uint t = (_1039.Load(36) >> uint(2)) + (6u * trans_ix); + uint t = (_1038.Load(36) >> uint(2)) + (6u * trans_ix); mat = asfloat(uint4(_201.Load(t * 4 + 8), _201.Load((t + 1u) * 4 + 8), _201.Load((t + 2u) * 4 + 8), _201.Load((t + 3u) * 4 + 8))); if (tag_word == 5u) { @@ -649,9 +649,9 @@ void comp_main() anno_fill.bbox = bbox; anno_fill.linewidth = linewidth; anno_fill.rgba_color = fill.rgba_color; - Alloc _1258; - _1258.offset = _1039.Load(32); - param_18.offset = _1258.offset; + Alloc _1257; + _1257.offset = _1038.Load(32); + param_18.offset = _1257.offset; AnnotatedRef param_19 = out_ref; uint param_20 = fill_mode; AnnoColor param_21 = anno_fill; @@ -674,9 +674,9 @@ void comp_main() anno_lin.line_x = line_x; anno_lin.line_y = line_y; anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y)); - Alloc _1354; - _1354.offset = _1039.Load(32); - param_23.offset = _1354.offset; + Alloc _1353; + _1353.offset = _1038.Load(32); + param_23.offset = _1353.offset; AnnotatedRef param_24 = out_ref; uint param_25 = fill_mode; AnnoLinGradient param_26 = anno_lin; @@ -691,9 +691,9 @@ void comp_main() anno_img.linewidth = linewidth; anno_img.index = fill_img.index; anno_img.offset = fill_img.offset; - Alloc _1382; - _1382.offset = _1039.Load(32); - param_28.offset = _1382.offset; + Alloc _1381; + _1381.offset = _1038.Load(32); + param_28.offset = _1381.offset; AnnotatedRef param_29 = out_ref; uint param_30 = fill_mode; AnnoImage param_31 = anno_img; @@ -711,7 +711,7 @@ void comp_main() anno_begin_clip.bbox = begin_clip.bbox; anno_begin_clip.linewidth = 0.0f; Alloc _1410; - _1410.offset = _1039.Load(32); + _1410.offset = _1038.Load(32); param_33.offset = _1410.offset; AnnotatedRef param_34 = out_ref; uint param_35 = 0u; @@ -726,7 +726,7 @@ void comp_main() Clip end_clip = Element_EndClip_read(param_37); anno_end_clip.bbox = end_clip.bbox; Alloc _1435; - _1435.offset = _1039.Load(32); + _1435.offset = _1038.Load(32); param_38.offset = _1435.offset; AnnotatedRef param_39 = out_ref; AnnoEndClip param_40 = anno_end_clip; @@ -738,7 +738,7 @@ void comp_main() } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_WorkGroupID = stage_input.gl_WorkGroupID; diff --git a/piet-gpu/shader/gen/draw_leaf.msl b/piet-gpu/shader/gen/draw_leaf.msl index e20fcb2..06a4e23 100644 --- a/piet-gpu/shader/gen/draw_leaf.msl +++ b/piet-gpu/shader/gen/draw_leaf.msl @@ -230,7 +230,7 @@ struct ConfigBuf Config conf; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) ElementTag Element_tag(thread const ElementRef& ref, const device SceneBuf& v_225) @@ -606,9 +606,9 @@ void Annotated_EndClip_write(thread const Alloc& a, thread const AnnotatedRef& r AnnoEndClip_write(param_3, param_4, param_5, v_201); } -kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1039 [[buffer(1)]], const device SceneBuf& v_225 [[buffer(2)]], const device ParentBuf& _1005 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1038 [[buffer(1)]], const device SceneBuf& v_225 [[buffer(2)]], const device ParentBuf& _1004 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { - threadgroup DrawMonoid sh_scratch[512]; + threadgroup DrawMonoid sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; ElementRef ref = ElementRef{ ix * 36u }; ElementRef param = ref; @@ -630,7 +630,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 local[i] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -647,9 +647,9 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 DrawMonoid row = tag_monoid_identity(); if (gl_WorkGroupID.x > 0u) { - uint _1008 = gl_WorkGroupID.x - 1u; - row.path_ix = _1005.parent[_1008].path_ix; - row.clip_ix = _1005.parent[_1008].clip_ix; + uint _1007 = gl_WorkGroupID.x - 1u; + row.path_ix = _1004.parent[_1007].path_ix; + row.clip_ix = _1004.parent[_1007].clip_ix; } if (gl_LocalInvocationID.x > 0u) { @@ -658,8 +658,8 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 row = combine_tag_monoid(param_10, param_11); } uint out_ix = gl_GlobalInvocationID.x * 8u; - uint out_base = (_1039.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u); - AnnotatedRef out_ref = AnnotatedRef{ _1039.conf.anno_alloc.offset + (out_ix * 40u) }; + uint out_base = (_1038.conf.drawmonoid_alloc.offset >> uint(2)) + (out_ix * 2u); + AnnotatedRef out_ref = AnnotatedRef{ _1038.conf.anno_alloc.offset + (out_ix * 40u) }; float4 mat; float2 translate; AnnoColor anno_fill; @@ -686,7 +686,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 tag_word = Element_tag(param_16, v_225).tag; if (((tag_word == 4u) || (tag_word == 5u)) || (tag_word == 6u)) { - uint bbox_offset = (_1039.conf.bbox_alloc.offset >> uint(2)) + (6u * (m.path_ix - 1u)); + uint bbox_offset = (_1038.conf.bbox_alloc.offset >> uint(2)) + (6u * (m.path_ix - 1u)); float bbox_l = float(v_201.memory[bbox_offset]) - 32768.0; float bbox_t = float(v_201.memory[bbox_offset + 1u]) - 32768.0; float bbox_r = float(v_201.memory[bbox_offset + 2u]) - 32768.0; @@ -697,7 +697,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 if ((linewidth >= 0.0) || (tag_word == 5u)) { uint trans_ix = v_201.memory[bbox_offset + 5u]; - uint t = (_1039.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix); + uint t = (_1038.conf.trans_alloc.offset >> uint(2)) + (6u * trans_ix); mat = as_type(uint4(v_201.memory[t], v_201.memory[t + 1u], v_201.memory[t + 2u], v_201.memory[t + 3u])); if (tag_word == 5u) { @@ -718,7 +718,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 anno_fill.bbox = bbox; anno_fill.linewidth = linewidth; anno_fill.rgba_color = fill.rgba_color; - param_18.offset = _1039.conf.anno_alloc.offset; + param_18.offset = _1038.conf.anno_alloc.offset; AnnotatedRef param_19 = out_ref; uint param_20 = fill_mode; AnnoColor param_21 = anno_fill; @@ -741,7 +741,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 anno_lin.line_x = line_x; anno_lin.line_y = line_y; anno_lin.line_c = -((p0.x * line_x) + (p0.y * line_y)); - param_23.offset = _1039.conf.anno_alloc.offset; + param_23.offset = _1038.conf.anno_alloc.offset; AnnotatedRef param_24 = out_ref; uint param_25 = fill_mode; AnnoLinGradient param_26 = anno_lin; @@ -756,7 +756,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 anno_img.linewidth = linewidth; anno_img.index = fill_img.index; anno_img.offset = fill_img.offset; - param_28.offset = _1039.conf.anno_alloc.offset; + param_28.offset = _1038.conf.anno_alloc.offset; AnnotatedRef param_29 = out_ref; uint param_30 = fill_mode; AnnoImage param_31 = anno_img; @@ -773,7 +773,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 Clip begin_clip = Element_BeginClip_read(param_32, v_225); anno_begin_clip.bbox = begin_clip.bbox; anno_begin_clip.linewidth = 0.0; - param_33.offset = _1039.conf.anno_alloc.offset; + param_33.offset = _1038.conf.anno_alloc.offset; AnnotatedRef param_34 = out_ref; uint param_35 = 0u; AnnoBeginClip param_36 = anno_begin_clip; @@ -786,7 +786,7 @@ kernel void main0(device Memory& v_201 [[buffer(0)]], const device ConfigBuf& _1 ElementRef param_37 = this_ref; Clip end_clip = Element_EndClip_read(param_37, v_225); anno_end_clip.bbox = end_clip.bbox; - param_38.offset = _1039.conf.anno_alloc.offset; + param_38.offset = _1038.conf.anno_alloc.offset; AnnotatedRef param_39 = out_ref; AnnoEndClip param_40 = anno_end_clip; Annotated_EndClip_write(param_38, param_39, param_40, v_201); diff --git a/piet-gpu/shader/gen/draw_leaf.spv b/piet-gpu/shader/gen/draw_leaf.spv index 77ed9cf..7e92cbb 100644 Binary files a/piet-gpu/shader/gen/draw_leaf.spv and b/piet-gpu/shader/gen/draw_leaf.spv differ diff --git a/piet-gpu/shader/gen/draw_reduce.dxil b/piet-gpu/shader/gen/draw_reduce.dxil index c001e89..9b1b0fd 100644 Binary files a/piet-gpu/shader/gen/draw_reduce.dxil and b/piet-gpu/shader/gen/draw_reduce.dxil differ diff --git a/piet-gpu/shader/gen/draw_reduce.hlsl b/piet-gpu/shader/gen/draw_reduce.hlsl index e56ec3d..a6ccde9 100644 --- a/piet-gpu/shader/gen/draw_reduce.hlsl +++ b/piet-gpu/shader/gen/draw_reduce.hlsl @@ -15,7 +15,7 @@ struct DrawMonoid uint clip_ix; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); struct Alloc { @@ -49,9 +49,9 @@ static const DrawMonoid _89 = { 1u, 1u }; static const DrawMonoid _91 = { 0u, 0u }; ByteAddressBuffer _46 : register(t2, space0); -RWByteAddressBuffer _200 : register(u3, space0); -RWByteAddressBuffer _214 : register(u0, space0); -ByteAddressBuffer _220 : register(t1, space0); +RWByteAddressBuffer _199 : register(u3, space0); +RWByteAddressBuffer _213 : register(u0, space0); +ByteAddressBuffer _219 : register(t1, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -63,7 +63,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared DrawMonoid sh_scratch[512]; +groupshared DrawMonoid sh_scratch[256]; ElementTag Element_tag(ElementRef ref) { @@ -129,10 +129,10 @@ void comp_main() agg = combine_tag_monoid(param_6, param_7); } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); - if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) { DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; DrawMonoid param_8 = agg; @@ -144,12 +144,12 @@ void comp_main() } if (gl_LocalInvocationID.x == 0u) { - _200.Store(gl_WorkGroupID.x * 8 + 0, agg.path_ix); - _200.Store(gl_WorkGroupID.x * 8 + 4, agg.clip_ix); + _199.Store(gl_WorkGroupID.x * 8 + 0, agg.path_ix); + _199.Store(gl_WorkGroupID.x * 8 + 4, agg.clip_ix); } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_WorkGroupID = stage_input.gl_WorkGroupID; diff --git a/piet-gpu/shader/gen/draw_reduce.msl b/piet-gpu/shader/gen/draw_reduce.msl index 26a8793..8a87f15 100644 --- a/piet-gpu/shader/gen/draw_reduce.msl +++ b/piet-gpu/shader/gen/draw_reduce.msl @@ -45,7 +45,7 @@ struct Memory uint memory[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); struct Alloc { @@ -124,9 +124,9 @@ DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoi return c; } -kernel void main0(const device SceneBuf& v_46 [[buffer(2)]], device OutBuf& _200 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(const device SceneBuf& v_46 [[buffer(2)]], device OutBuf& _199 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { - threadgroup DrawMonoid sh_scratch[512]; + threadgroup DrawMonoid sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; ElementRef ref = ElementRef{ ix * 36u }; ElementRef param = ref; @@ -145,10 +145,10 @@ kernel void main0(const device SceneBuf& v_46 [[buffer(2)]], device OutBuf& _200 agg = combine_tag_monoid(param_6, param_7); } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); - if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) { DrawMonoid other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; DrawMonoid param_8 = agg; @@ -160,8 +160,8 @@ kernel void main0(const device SceneBuf& v_46 [[buffer(2)]], device OutBuf& _200 } if (gl_LocalInvocationID.x == 0u) { - _200.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; - _200.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix; + _199.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _199.outbuf[gl_WorkGroupID.x].clip_ix = agg.clip_ix; } } diff --git a/piet-gpu/shader/gen/draw_reduce.spv b/piet-gpu/shader/gen/draw_reduce.spv index 73430c6..aabdf6b 100644 Binary files a/piet-gpu/shader/gen/draw_reduce.spv and b/piet-gpu/shader/gen/draw_reduce.spv differ diff --git a/piet-gpu/shader/gen/draw_root.dxil b/piet-gpu/shader/gen/draw_root.dxil index da5cfe2..a84fd4a 100644 Binary files a/piet-gpu/shader/gen/draw_root.dxil and b/piet-gpu/shader/gen/draw_root.dxil differ diff --git a/piet-gpu/shader/gen/draw_root.hlsl b/piet-gpu/shader/gen/draw_root.hlsl index ec75d5c..56b513f 100644 --- a/piet-gpu/shader/gen/draw_root.hlsl +++ b/piet-gpu/shader/gen/draw_root.hlsl @@ -4,7 +4,7 @@ struct DrawMonoid uint clip_ix; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const DrawMonoid _18 = { 0u, 0u }; @@ -18,7 +18,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared DrawMonoid sh_scratch[512]; +groupshared DrawMonoid sh_scratch[256]; DrawMonoid combine_tag_monoid(DrawMonoid a, DrawMonoid b) { @@ -55,7 +55,7 @@ void comp_main() } DrawMonoid agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -79,13 +79,13 @@ void comp_main() DrawMonoid param_4 = row; DrawMonoid param_5 = local[i_2]; DrawMonoid m = combine_tag_monoid(param_4, param_5); - uint _178 = ix + i_2; - _57.Store(_178 * 8 + 0, m.path_ix); - _57.Store(_178 * 8 + 4, m.clip_ix); + uint _177 = ix + i_2; + _57.Store(_177 * 8 + 0, m.path_ix); + _57.Store(_177 * 8 + 4, m.clip_ix); } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_LocalInvocationID = stage_input.gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/draw_root.msl b/piet-gpu/shader/gen/draw_root.msl index 2ed7ba2..0d22e4b 100644 --- a/piet-gpu/shader/gen/draw_root.msl +++ b/piet-gpu/shader/gen/draw_root.msl @@ -61,7 +61,7 @@ struct DataBuf DrawMonoid_1 data[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) DrawMonoid combine_tag_monoid(thread const DrawMonoid& a, thread const DrawMonoid& b) @@ -80,7 +80,7 @@ DrawMonoid tag_monoid_identity() kernel void main0(device DataBuf& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { - threadgroup DrawMonoid sh_scratch[512]; + threadgroup DrawMonoid sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; spvUnsafeArray local; local[0].path_ix = _57.data[ix].path_ix; @@ -96,7 +96,7 @@ kernel void main0(device DataBuf& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID } DrawMonoid agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -120,9 +120,9 @@ kernel void main0(device DataBuf& _57 [[buffer(0)]], uint3 gl_GlobalInvocationID DrawMonoid param_4 = row; DrawMonoid param_5 = local[i_2]; DrawMonoid m = combine_tag_monoid(param_4, param_5); - uint _178 = ix + i_2; - _57.data[_178].path_ix = m.path_ix; - _57.data[_178].clip_ix = m.clip_ix; + uint _177 = ix + i_2; + _57.data[_177].path_ix = m.path_ix; + _57.data[_177].clip_ix = m.clip_ix; } } diff --git a/piet-gpu/shader/gen/draw_root.spv b/piet-gpu/shader/gen/draw_root.spv index acecee3..1c11414 100644 Binary files a/piet-gpu/shader/gen/draw_root.spv and b/piet-gpu/shader/gen/draw_root.spv differ diff --git a/piet-gpu/shader/gen/pathseg.dxil b/piet-gpu/shader/gen/pathseg.dxil index 0ca0d18..3c81315 100644 Binary files a/piet-gpu/shader/gen/pathseg.dxil and b/piet-gpu/shader/gen/pathseg.dxil differ diff --git a/piet-gpu/shader/gen/pathseg.hlsl b/piet-gpu/shader/gen/pathseg.hlsl index a9cee25..f7c9e2d 100644 --- a/piet-gpu/shader/gen/pathseg.hlsl +++ b/piet-gpu/shader/gen/pathseg.hlsl @@ -72,7 +72,7 @@ struct Config uint pathseg_offset; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const TagMonoid _135 = { 0u, 0u, 0u, 0u, 0u }; static const Monoid _567 = { 0.0f.xxxx, 0u }; @@ -92,8 +92,8 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared TagMonoid sh_tag[512]; -groupshared Monoid sh_scratch[512]; +groupshared TagMonoid sh_tag[256]; +groupshared Monoid sh_scratch[256]; TagMonoid reduce_tag(uint tag_word) { @@ -360,7 +360,7 @@ void comp_main() uint param = tag_word; TagMonoid local_tm = reduce_tag(param); sh_tag[gl_LocalInvocationID.x] = local_tm; - for (uint i = 0u; i < 9u; i++) + for (uint i = 0u; i < 8u; i++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i)) @@ -547,7 +547,7 @@ void comp_main() local[i_2] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_3 = 0u; i_3 < 9u; i_3++) + for (uint i_3 = 0u; i_3 < 8u; i_3++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_3)) @@ -575,16 +575,16 @@ void comp_main() Monoid m = combine_monoid(param_23, param_24); bool do_atomic = false; bool _1263 = i_4 == 3u; - bool _1270; + bool _1269; if (_1263) { - _1270 = gl_LocalInvocationID.x == 511u; + _1269 = gl_LocalInvocationID.x == 255u; } else { - _1270 = _1263; + _1269 = _1263; } - if (_1270) + if (_1269) { do_atomic = true; } @@ -612,37 +612,37 @@ void comp_main() } if (do_atomic) { - bool _1335 = m.bbox.z > m.bbox.x; - bool _1344; - if (!_1335) + bool _1334 = m.bbox.z > m.bbox.x; + bool _1343; + if (!_1334) { - _1344 = m.bbox.w > m.bbox.y; + _1343 = m.bbox.w > m.bbox.y; } else { - _1344 = _1335; + _1343 = _1334; } - if (_1344) + if (_1343) { float param_29 = m.bbox.x; - uint _1353; - _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1353); + uint _1352; + _111.InterlockedMin(bbox_out_ix * 4 + 8, round_down(param_29), _1352); float param_30 = m.bbox.y; - uint _1361; - _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1361); + uint _1360; + _111.InterlockedMin((bbox_out_ix + 1u) * 4 + 8, round_down(param_30), _1360); float param_31 = m.bbox.z; - uint _1369; - _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1369); + uint _1368; + _111.InterlockedMax((bbox_out_ix + 2u) * 4 + 8, round_up(param_31), _1368); float param_32 = m.bbox.w; - uint _1377; - _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1377); + uint _1376; + _111.InterlockedMax((bbox_out_ix + 3u) * 4 + 8, round_up(param_32), _1376); } bbox_out_ix += 6u; } } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_WorkGroupID = stage_input.gl_WorkGroupID; diff --git a/piet-gpu/shader/gen/pathseg.msl b/piet-gpu/shader/gen/pathseg.msl index 0f60d4d..9708585 100644 --- a/piet-gpu/shader/gen/pathseg.msl +++ b/piet-gpu/shader/gen/pathseg.msl @@ -156,7 +156,7 @@ struct ParentBuf TagMonoid_1 parent[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) TagMonoid reduce_tag(thread const uint& tag_word) @@ -432,14 +432,14 @@ uint round_up(thread const float& x) kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _639 [[buffer(1)]], const device SceneBuf& v_574 [[buffer(2)]], const device ParentBuf& _709 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { - threadgroup TagMonoid sh_tag[512]; - threadgroup Monoid sh_scratch[512]; + threadgroup TagMonoid sh_tag[256]; + threadgroup Monoid sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 4u; uint tag_word = v_574.scene[(_639.conf.pathtag_offset >> uint(2)) + (ix >> uint(2))]; uint param = tag_word; TagMonoid local_tm = reduce_tag(param); sh_tag[gl_LocalInvocationID.x] = local_tm; - for (uint i = 0u; i < 9u; i++) + for (uint i = 0u; i < 8u; i++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i)) @@ -615,7 +615,7 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 local[i_2] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_3 = 0u; i_3 < 9u; i_3++) + for (uint i_3 = 0u; i_3 < 8u; i_3++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_3)) @@ -643,16 +643,16 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 Monoid m = combine_monoid(param_23, param_24); bool do_atomic = false; bool _1263 = i_4 == 3u; - bool _1270; + bool _1269; if (_1263) { - _1270 = gl_LocalInvocationID.x == 511u; + _1269 = gl_LocalInvocationID.x == 255u; } else { - _1270 = _1263; + _1269 = _1263; } - if (_1270) + if (_1269) { do_atomic = true; } @@ -680,26 +680,26 @@ kernel void main0(device Memory& v_111 [[buffer(0)]], const device ConfigBuf& _6 } if (do_atomic) { - bool _1335 = m.bbox.z > m.bbox.x; - bool _1344; - if (!_1335) + bool _1334 = m.bbox.z > m.bbox.x; + bool _1343; + if (!_1334) { - _1344 = m.bbox.w > m.bbox.y; + _1343 = m.bbox.w > m.bbox.y; } else { - _1344 = _1335; + _1343 = _1334; } - if (_1344) + if (_1343) { float param_29 = m.bbox.x; - uint _1353 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); + uint _1352 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix], round_down(param_29), memory_order_relaxed); float param_30 = m.bbox.y; - uint _1361 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); + uint _1360 = atomic_fetch_min_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 1u], round_down(param_30), memory_order_relaxed); float param_31 = m.bbox.z; - uint _1369 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); + uint _1368 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 2u], round_up(param_31), memory_order_relaxed); float param_32 = m.bbox.w; - uint _1377 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); + uint _1376 = atomic_fetch_max_explicit((device atomic_uint*)&v_111.memory[bbox_out_ix + 3u], round_up(param_32), memory_order_relaxed); } bbox_out_ix += 6u; } diff --git a/piet-gpu/shader/gen/pathseg.spv b/piet-gpu/shader/gen/pathseg.spv index fc63eb5..37c9847 100644 Binary files a/piet-gpu/shader/gen/pathseg.spv and b/piet-gpu/shader/gen/pathseg.spv differ diff --git a/piet-gpu/shader/gen/pathtag_reduce.dxil b/piet-gpu/shader/gen/pathtag_reduce.dxil index d585c96..245c492 100644 Binary files a/piet-gpu/shader/gen/pathtag_reduce.dxil and b/piet-gpu/shader/gen/pathtag_reduce.dxil differ diff --git a/piet-gpu/shader/gen/pathtag_reduce.hlsl b/piet-gpu/shader/gen/pathtag_reduce.hlsl index 291243e..1332429 100644 --- a/piet-gpu/shader/gen/pathtag_reduce.hlsl +++ b/piet-gpu/shader/gen/pathtag_reduce.hlsl @@ -38,8 +38,8 @@ static const uint3 gl_WorkGroupSize = uint3(128u, 1u, 1u); ByteAddressBuffer _139 : register(t1, space0); ByteAddressBuffer _150 : register(t2, space0); -RWByteAddressBuffer _238 : register(u3, space0); -RWByteAddressBuffer _258 : register(u0, space0); +RWByteAddressBuffer _237 : register(u3, space0); +RWByteAddressBuffer _257 : register(u0, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -82,12 +82,12 @@ TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) void comp_main() { - uint ix = gl_GlobalInvocationID.x * 4u; + uint ix = gl_GlobalInvocationID.x * 2u; uint scene_ix = (_139.Load(64) >> uint(2)) + ix; uint tag_word = _150.Load(scene_ix * 4 + 0); uint param = tag_word; TagMonoid agg = reduce_tag(param); - for (uint i = 1u; i < 4u; i++) + for (uint i = 1u; i < 2u; i++) { tag_word = _150.Load((scene_ix + i) * 4 + 0); uint param_1 = tag_word; @@ -111,11 +111,11 @@ void comp_main() } if (gl_LocalInvocationID.x == 0u) { - _238.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); - _238.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); - _238.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); - _238.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); - _238.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); + _237.Store(gl_WorkGroupID.x * 20 + 0, agg.trans_ix); + _237.Store(gl_WorkGroupID.x * 20 + 4, agg.linewidth_ix); + _237.Store(gl_WorkGroupID.x * 20 + 8, agg.pathseg_ix); + _237.Store(gl_WorkGroupID.x * 20 + 12, agg.path_ix); + _237.Store(gl_WorkGroupID.x * 20 + 16, agg.pathseg_offset); } } diff --git a/piet-gpu/shader/gen/pathtag_reduce.msl b/piet-gpu/shader/gen/pathtag_reduce.msl index e82577c..6c0a64f 100644 --- a/piet-gpu/shader/gen/pathtag_reduce.msl +++ b/piet-gpu/shader/gen/pathtag_reduce.msl @@ -103,15 +103,15 @@ TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& return c; } -kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _150 [[buffer(2)]], device OutBuf& _238 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device SceneBuf& _150 [[buffer(2)]], device OutBuf& _237 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { threadgroup TagMonoid sh_scratch[128]; - uint ix = gl_GlobalInvocationID.x * 4u; + uint ix = gl_GlobalInvocationID.x * 2u; uint scene_ix = (_139.conf.pathtag_offset >> uint(2)) + ix; uint tag_word = _150.scene[scene_ix]; uint param = tag_word; TagMonoid agg = reduce_tag(param); - for (uint i = 1u; i < 4u; i++) + for (uint i = 1u; i < 2u; i++) { tag_word = _150.scene[scene_ix + i]; uint param_1 = tag_word; @@ -135,11 +135,11 @@ kernel void main0(const device ConfigBuf& _139 [[buffer(1)]], const device Scene } if (gl_LocalInvocationID.x == 0u) { - _238.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; - _238.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; - _238.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; - _238.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; - _238.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; + _237.outbuf[gl_WorkGroupID.x].trans_ix = agg.trans_ix; + _237.outbuf[gl_WorkGroupID.x].linewidth_ix = agg.linewidth_ix; + _237.outbuf[gl_WorkGroupID.x].pathseg_ix = agg.pathseg_ix; + _237.outbuf[gl_WorkGroupID.x].path_ix = agg.path_ix; + _237.outbuf[gl_WorkGroupID.x].pathseg_offset = agg.pathseg_offset; } } diff --git a/piet-gpu/shader/gen/pathtag_reduce.spv b/piet-gpu/shader/gen/pathtag_reduce.spv index 6dc35b8..9fc105f 100644 Binary files a/piet-gpu/shader/gen/pathtag_reduce.spv and b/piet-gpu/shader/gen/pathtag_reduce.spv differ diff --git a/piet-gpu/shader/gen/pathtag_root.dxil b/piet-gpu/shader/gen/pathtag_root.dxil index 1f27f26..77f12e6 100644 Binary files a/piet-gpu/shader/gen/pathtag_root.dxil and b/piet-gpu/shader/gen/pathtag_root.dxil differ diff --git a/piet-gpu/shader/gen/pathtag_root.hlsl b/piet-gpu/shader/gen/pathtag_root.hlsl index f1ec389..7ad806c 100644 --- a/piet-gpu/shader/gen/pathtag_root.hlsl +++ b/piet-gpu/shader/gen/pathtag_root.hlsl @@ -7,7 +7,7 @@ struct TagMonoid uint pathseg_offset; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const TagMonoid _18 = { 0u, 0u, 0u, 0u, 0u }; @@ -21,7 +21,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared TagMonoid sh_scratch[512]; +groupshared TagMonoid sh_scratch[256]; TagMonoid combine_tag_monoid(TagMonoid a, TagMonoid b) { @@ -73,7 +73,7 @@ void comp_main() } TagMonoid agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -97,16 +97,16 @@ void comp_main() TagMonoid param_4 = row; TagMonoid param_5 = local[i_2]; TagMonoid m = combine_tag_monoid(param_4, param_5); - uint _211 = ix + i_2; - _78.Store(_211 * 20 + 0, m.trans_ix); - _78.Store(_211 * 20 + 4, m.linewidth_ix); - _78.Store(_211 * 20 + 8, m.pathseg_ix); - _78.Store(_211 * 20 + 12, m.path_ix); - _78.Store(_211 * 20 + 16, m.pathseg_offset); + uint _210 = ix + i_2; + _78.Store(_210 * 20 + 0, m.trans_ix); + _78.Store(_210 * 20 + 4, m.linewidth_ix); + _78.Store(_210 * 20 + 8, m.pathseg_ix); + _78.Store(_210 * 20 + 12, m.path_ix); + _78.Store(_210 * 20 + 16, m.pathseg_offset); } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_LocalInvocationID = stage_input.gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/pathtag_root.msl b/piet-gpu/shader/gen/pathtag_root.msl index 923e77c..65e3741 100644 --- a/piet-gpu/shader/gen/pathtag_root.msl +++ b/piet-gpu/shader/gen/pathtag_root.msl @@ -67,7 +67,7 @@ struct DataBuf TagMonoid_1 data[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) TagMonoid combine_tag_monoid(thread const TagMonoid& a, thread const TagMonoid& b) @@ -89,7 +89,7 @@ TagMonoid tag_monoid_identity() kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { - threadgroup TagMonoid sh_scratch[512]; + threadgroup TagMonoid sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; spvUnsafeArray local; local[0].trans_ix = _78.data[ix].trans_ix; @@ -111,7 +111,7 @@ kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID } TagMonoid agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -135,12 +135,12 @@ kernel void main0(device DataBuf& _78 [[buffer(0)]], uint3 gl_GlobalInvocationID TagMonoid param_4 = row; TagMonoid param_5 = local[i_2]; TagMonoid m = combine_tag_monoid(param_4, param_5); - uint _211 = ix + i_2; - _78.data[_211].trans_ix = m.trans_ix; - _78.data[_211].linewidth_ix = m.linewidth_ix; - _78.data[_211].pathseg_ix = m.pathseg_ix; - _78.data[_211].path_ix = m.path_ix; - _78.data[_211].pathseg_offset = m.pathseg_offset; + uint _210 = ix + i_2; + _78.data[_210].trans_ix = m.trans_ix; + _78.data[_210].linewidth_ix = m.linewidth_ix; + _78.data[_210].pathseg_ix = m.pathseg_ix; + _78.data[_210].path_ix = m.path_ix; + _78.data[_210].pathseg_offset = m.pathseg_offset; } } diff --git a/piet-gpu/shader/gen/pathtag_root.spv b/piet-gpu/shader/gen/pathtag_root.spv index 88e20b9..3783b49 100644 Binary files a/piet-gpu/shader/gen/pathtag_root.spv and b/piet-gpu/shader/gen/pathtag_root.spv differ diff --git a/piet-gpu/shader/gen/tile_alloc.hlsl b/piet-gpu/shader/gen/tile_alloc.hlsl index 010e714..5231c1d 100644 --- a/piet-gpu/shader/gen/tile_alloc.hlsl +++ b/piet-gpu/shader/gen/tile_alloc.hlsl @@ -261,7 +261,7 @@ void comp_main() for (uint i = 0u; i < 8u; i++) { GroupMemoryBarrierWithGroupSync(); - if (th_ix >= uint(1 << int(i))) + if (th_ix >= (1u << i)) { total_tile_count += sh_tile_count[th_ix - (1u << i)]; } @@ -271,46 +271,46 @@ void comp_main() if (th_ix == 255u) { uint param_4 = total_tile_count * 8u; - MallocResult _477 = malloc(param_4); - sh_tile_alloc = _477; + MallocResult _476 = malloc(param_4); + sh_tile_alloc = _476; } GroupMemoryBarrierWithGroupSync(); MallocResult alloc_start = sh_tile_alloc; - bool _488; + bool _487; if (!alloc_start.failed) { - _488 = _92.Load(4) != 0u; + _487 = _92.Load(4) != 0u; } else { - _488 = alloc_start.failed; + _487 = alloc_start.failed; } - if (_488) + if (_487) { return; } if (element_ix < _305.Load(0)) { - uint _501; + uint _500; if (th_ix > 0u) { - _501 = sh_tile_count[th_ix - 1u]; + _500 = sh_tile_count[th_ix - 1u]; } else { - _501 = 0u; + _500 = 0u; } - uint tile_subix = _501; + uint tile_subix = _500; Alloc param_5 = alloc_start.alloc; uint param_6 = 8u * tile_subix; uint param_7 = 8u * tile_count; Alloc tiles_alloc = slice_mem(param_5, param_6, param_7); - TileRef _523 = { tiles_alloc.offset }; - path.tiles = _523; - Alloc _528; - _528.offset = _305.Load(16); + TileRef _522 = { tiles_alloc.offset }; + path.tiles = _522; + Alloc _527; + _527.offset = _305.Load(16); Alloc param_8; - param_8.offset = _528.offset; + param_8.offset = _527.offset; PathRef param_9 = path_ref; Path param_10 = path; Path_write(param_8, param_9, param_10); diff --git a/piet-gpu/shader/gen/tile_alloc.msl b/piet-gpu/shader/gen/tile_alloc.msl index 3906536..49bd1c4 100644 --- a/piet-gpu/shader/gen/tile_alloc.msl +++ b/piet-gpu/shader/gen/tile_alloc.msl @@ -272,7 +272,7 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M for (uint i = 0u; i < 8u; i++) { threadgroup_barrier(mem_flags::mem_threadgroup); - if (th_ix >= uint(1 << int(i))) + if (th_ix >= (1u << i)) { total_tile_count += sh_tile_count[th_ix - (1u << i)]; } @@ -282,36 +282,36 @@ kernel void main0(constant uint* spvBufferSizeConstants [[buffer(25)]], device M if (th_ix == 255u) { uint param_4 = total_tile_count * 8u; - MallocResult _477 = malloc(param_4, v_92, v_92BufferSize); - sh_tile_alloc = _477; + MallocResult _476 = malloc(param_4, v_92, v_92BufferSize); + sh_tile_alloc = _476; } threadgroup_barrier(mem_flags::mem_threadgroup); MallocResult alloc_start = sh_tile_alloc; - bool _488; + bool _487; if (!alloc_start.failed) { - _488 = v_92.mem_error != 0u; + _487 = v_92.mem_error != 0u; } else { - _488 = alloc_start.failed; + _487 = alloc_start.failed; } - if (_488) + if (_487) { return; } if (element_ix < _305.conf.n_elements) { - uint _501; + uint _500; if (th_ix > 0u) { - _501 = sh_tile_count[th_ix - 1u]; + _500 = sh_tile_count[th_ix - 1u]; } else { - _501 = 0u; + _500 = 0u; } - uint tile_subix = _501; + uint tile_subix = _500; Alloc param_5 = alloc_start.alloc; uint param_6 = 8u * tile_subix; uint param_7 = 8u * tile_count; diff --git a/piet-gpu/shader/gen/tile_alloc.spv b/piet-gpu/shader/gen/tile_alloc.spv index d4a6e31..55d62ad 100644 Binary files a/piet-gpu/shader/gen/tile_alloc.spv and b/piet-gpu/shader/gen/tile_alloc.spv differ diff --git a/piet-gpu/shader/gen/transform_leaf.dxil b/piet-gpu/shader/gen/transform_leaf.dxil index 102d2f0..32ec399 100644 Binary files a/piet-gpu/shader/gen/transform_leaf.dxil and b/piet-gpu/shader/gen/transform_leaf.dxil differ diff --git a/piet-gpu/shader/gen/transform_leaf.hlsl b/piet-gpu/shader/gen/transform_leaf.hlsl index 7744e0f..38136c9 100644 --- a/piet-gpu/shader/gen/transform_leaf.hlsl +++ b/piet-gpu/shader/gen/transform_leaf.hlsl @@ -47,14 +47,14 @@ struct Config uint pathseg_offset; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const Transform _224 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; RWByteAddressBuffer _71 : register(u0, space0); ByteAddressBuffer _96 : register(t2, space0); ByteAddressBuffer _278 : register(t1, space0); -ByteAddressBuffer _377 : register(t3, space0); +ByteAddressBuffer _376 : register(t3, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -66,7 +66,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared Transform sh_scratch[512]; +groupshared Transform sh_scratch[256]; Transform Transform_read(TransformRef ref) { @@ -167,7 +167,7 @@ void comp_main() local[i] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -184,11 +184,11 @@ void comp_main() Transform row = monoid_identity(); if (gl_WorkGroupID.x > 0u) { - Transform _383; - _383.mat = asfloat(_377.Load4((gl_WorkGroupID.x - 1u) * 32 + 0)); - _383.translate = asfloat(_377.Load2((gl_WorkGroupID.x - 1u) * 32 + 16)); - row.mat = _383.mat; - row.translate = _383.translate; + Transform _382; + _382.mat = asfloat(_376.Load4((gl_WorkGroupID.x - 1u) * 32 + 0)); + _382.translate = asfloat(_376.Load2((gl_WorkGroupID.x - 1u) * 32 + 16)); + row.mat = _382.mat; + row.translate = _382.translate; } if (gl_LocalInvocationID.x > 0u) { @@ -202,20 +202,20 @@ void comp_main() Transform param_10 = row; Transform param_11 = local[i_2]; Transform m = combine_monoid(param_10, param_11); - TransformSeg _423 = { m.mat, m.translate }; - TransformSeg transform = _423; - TransformSegRef _433 = { _278.Load(36) + ((ix + i_2) * 24u) }; - TransformSegRef trans_ref = _433; - Alloc _437; - _437.offset = _278.Load(36); - param_12.offset = _437.offset; + TransformSeg _422 = { m.mat, m.translate }; + TransformSeg transform = _422; + TransformSegRef _432 = { _278.Load(36) + ((ix + i_2) * 24u) }; + TransformSegRef trans_ref = _432; + Alloc _436; + _436.offset = _278.Load(36); + param_12.offset = _436.offset; TransformSegRef param_13 = trans_ref; TransformSeg param_14 = transform; TransformSeg_write(param_12, param_13, param_14); } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_WorkGroupID = stage_input.gl_WorkGroupID; diff --git a/piet-gpu/shader/gen/transform_leaf.msl b/piet-gpu/shader/gen/transform_leaf.msl index 9c7e6b7..6a55784 100644 --- a/piet-gpu/shader/gen/transform_leaf.msl +++ b/piet-gpu/shader/gen/transform_leaf.msl @@ -127,7 +127,7 @@ struct ParentBuf Transform_1 parent[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_96) @@ -214,9 +214,9 @@ void TransformSeg_write(thread const Alloc& a, thread const TransformSegRef& ref write_mem(param_15, param_16, param_17, v_71); } -kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _377 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _278 [[buffer(1)]], const device SceneBuf& v_96 [[buffer(2)]], const device ParentBuf& _376 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { - threadgroup Transform sh_scratch[512]; + threadgroup Transform sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; TransformRef ref = TransformRef{ _278.conf.trans_offset + (ix * 24u) }; TransformRef param = ref; @@ -234,7 +234,7 @@ kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _27 local[i] = agg; } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -251,9 +251,9 @@ kernel void main0(device Memory& v_71 [[buffer(0)]], const device ConfigBuf& _27 Transform row = monoid_identity(); if (gl_WorkGroupID.x > 0u) { - uint _380 = gl_WorkGroupID.x - 1u; - row.mat = _377.parent[_380].mat; - row.translate = _377.parent[_380].translate; + uint _379 = gl_WorkGroupID.x - 1u; + row.mat = _376.parent[_379].mat; + row.translate = _376.parent[_379].translate; } if (gl_LocalInvocationID.x > 0u) { diff --git a/piet-gpu/shader/gen/transform_leaf.spv b/piet-gpu/shader/gen/transform_leaf.spv index e561e9d..f418bbe 100644 Binary files a/piet-gpu/shader/gen/transform_leaf.spv and b/piet-gpu/shader/gen/transform_leaf.spv differ diff --git a/piet-gpu/shader/gen/transform_reduce.dxil b/piet-gpu/shader/gen/transform_reduce.dxil index 1ed5e0e..63df381 100644 Binary files a/piet-gpu/shader/gen/transform_reduce.dxil and b/piet-gpu/shader/gen/transform_reduce.dxil differ diff --git a/piet-gpu/shader/gen/transform_reduce.hlsl b/piet-gpu/shader/gen/transform_reduce.hlsl index 5ada811..af52cdb 100644 --- a/piet-gpu/shader/gen/transform_reduce.hlsl +++ b/piet-gpu/shader/gen/transform_reduce.hlsl @@ -36,12 +36,12 @@ struct Config uint pathseg_offset; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); ByteAddressBuffer _49 : register(t2, space0); ByteAddressBuffer _161 : register(t1, space0); -RWByteAddressBuffer _251 : register(u3, space0); -RWByteAddressBuffer _267 : register(u0, space0); +RWByteAddressBuffer _250 : register(u3, space0); +RWByteAddressBuffer _266 : register(u0, space0); static uint3 gl_WorkGroupID; static uint3 gl_LocalInvocationID; @@ -53,7 +53,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared Transform sh_scratch[512]; +groupshared Transform sh_scratch[256]; Transform Transform_read(TransformRef ref) { @@ -101,10 +101,10 @@ void comp_main() agg = combine_monoid(param_4, param_5); } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); - if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) { Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; Transform param_6 = agg; @@ -116,12 +116,12 @@ void comp_main() } if (gl_LocalInvocationID.x == 0u) { - _251.Store4(gl_WorkGroupID.x * 32 + 0, asuint(agg.mat)); - _251.Store2(gl_WorkGroupID.x * 32 + 16, asuint(agg.translate)); + _250.Store4(gl_WorkGroupID.x * 32 + 0, asuint(agg.mat)); + _250.Store2(gl_WorkGroupID.x * 32 + 16, asuint(agg.translate)); } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_WorkGroupID = stage_input.gl_WorkGroupID; diff --git a/piet-gpu/shader/gen/transform_reduce.msl b/piet-gpu/shader/gen/transform_reduce.msl index ac586d9..c387f03 100644 --- a/piet-gpu/shader/gen/transform_reduce.msl +++ b/piet-gpu/shader/gen/transform_reduce.msl @@ -72,7 +72,7 @@ struct Memory uint memory[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) Transform Transform_read(thread const TransformRef& ref, const device SceneBuf& v_49) @@ -105,9 +105,9 @@ Transform combine_monoid(thread const Transform& a, thread const Transform& b) return c; } -kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], device OutBuf& _251 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) +kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device SceneBuf& v_49 [[buffer(2)]], device OutBuf& _250 [[buffer(3)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]]) { - threadgroup Transform sh_scratch[512]; + threadgroup Transform sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; TransformRef ref = TransformRef{ _161.conf.trans_offset + (ix * 24u) }; TransformRef param = ref; @@ -122,10 +122,10 @@ kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device Scene agg = combine_monoid(param_4, param_5); } sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); - if ((gl_LocalInvocationID.x + (1u << i_1)) < 512u) + if ((gl_LocalInvocationID.x + (1u << i_1)) < 256u) { Transform other = sh_scratch[gl_LocalInvocationID.x + (1u << i_1)]; Transform param_6 = agg; @@ -137,8 +137,8 @@ kernel void main0(const device ConfigBuf& _161 [[buffer(1)]], const device Scene } if (gl_LocalInvocationID.x == 0u) { - _251.outbuf[gl_WorkGroupID.x].mat = agg.mat; - _251.outbuf[gl_WorkGroupID.x].translate = agg.translate; + _250.outbuf[gl_WorkGroupID.x].mat = agg.mat; + _250.outbuf[gl_WorkGroupID.x].translate = agg.translate; } } diff --git a/piet-gpu/shader/gen/transform_reduce.spv b/piet-gpu/shader/gen/transform_reduce.spv index 5638afb..af5ffb9 100644 Binary files a/piet-gpu/shader/gen/transform_reduce.spv and b/piet-gpu/shader/gen/transform_reduce.spv differ diff --git a/piet-gpu/shader/gen/transform_root.dxil b/piet-gpu/shader/gen/transform_root.dxil index 0d16d04..5b4f059 100644 Binary files a/piet-gpu/shader/gen/transform_root.dxil and b/piet-gpu/shader/gen/transform_root.dxil differ diff --git a/piet-gpu/shader/gen/transform_root.hlsl b/piet-gpu/shader/gen/transform_root.hlsl index 35961b1..d447db6 100644 --- a/piet-gpu/shader/gen/transform_root.hlsl +++ b/piet-gpu/shader/gen/transform_root.hlsl @@ -4,7 +4,7 @@ struct Transform float2 translate; }; -static const uint3 gl_WorkGroupSize = uint3(512u, 1u, 1u); +static const uint3 gl_WorkGroupSize = uint3(256u, 1u, 1u); static const Transform _23 = { float4(1.0f, 0.0f, 0.0f, 1.0f), 0.0f.xx }; @@ -18,7 +18,7 @@ struct SPIRV_Cross_Input uint3 gl_GlobalInvocationID : SV_DispatchThreadID; }; -groupshared Transform sh_scratch[512]; +groupshared Transform sh_scratch[256]; Transform combine_monoid(Transform a, Transform b) { @@ -55,7 +55,7 @@ void comp_main() } Transform agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { GroupMemoryBarrierWithGroupSync(); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -79,13 +79,13 @@ void comp_main() Transform param_4 = row; Transform param_5 = local[i_2]; Transform m = combine_monoid(param_4, param_5); - uint _209 = ix + i_2; - _89.Store4(_209 * 32 + 0, asuint(m.mat)); - _89.Store2(_209 * 32 + 16, asuint(m.translate)); + uint _208 = ix + i_2; + _89.Store4(_208 * 32 + 0, asuint(m.mat)); + _89.Store2(_208 * 32 + 16, asuint(m.translate)); } } -[numthreads(512, 1, 1)] +[numthreads(256, 1, 1)] void main(SPIRV_Cross_Input stage_input) { gl_LocalInvocationID = stage_input.gl_LocalInvocationID; diff --git a/piet-gpu/shader/gen/transform_root.msl b/piet-gpu/shader/gen/transform_root.msl index 2c58c06..8b4b2a1 100644 --- a/piet-gpu/shader/gen/transform_root.msl +++ b/piet-gpu/shader/gen/transform_root.msl @@ -62,7 +62,7 @@ struct DataBuf Transform_1 data[1]; }; -constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(512u, 1u, 1u); +constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(256u, 1u, 1u); static inline __attribute__((always_inline)) Transform combine_monoid(thread const Transform& a, thread const Transform& b) @@ -81,7 +81,7 @@ Transform monoid_identity() kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]]) { - threadgroup Transform sh_scratch[512]; + threadgroup Transform sh_scratch[256]; uint ix = gl_GlobalInvocationID.x * 8u; spvUnsafeArray local; local[0].mat = _89.data[ix].mat; @@ -97,7 +97,7 @@ kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID } Transform agg = local[7]; sh_scratch[gl_LocalInvocationID.x] = agg; - for (uint i_1 = 0u; i_1 < 9u; i_1++) + for (uint i_1 = 0u; i_1 < 8u; i_1++) { threadgroup_barrier(mem_flags::mem_threadgroup); if (gl_LocalInvocationID.x >= (1u << i_1)) @@ -121,9 +121,9 @@ kernel void main0(device DataBuf& _89 [[buffer(0)]], uint3 gl_GlobalInvocationID Transform param_4 = row; Transform param_5 = local[i_2]; Transform m = combine_monoid(param_4, param_5); - uint _209 = ix + i_2; - _89.data[_209].mat = m.mat; - _89.data[_209].translate = m.translate; + uint _208 = ix + i_2; + _89.data[_208].mat = m.mat; + _89.data[_208].translate = m.translate; } } diff --git a/piet-gpu/shader/gen/transform_root.spv b/piet-gpu/shader/gen/transform_root.spv index 7824d09..1578842 100644 Binary files a/piet-gpu/shader/gen/transform_root.spv and b/piet-gpu/shader/gen/transform_root.spv differ diff --git a/piet-gpu/shader/pathseg.comp b/piet-gpu/shader/pathseg.comp index ec0a440..12104eb 100644 --- a/piet-gpu/shader/pathseg.comp +++ b/piet-gpu/shader/pathseg.comp @@ -10,7 +10,7 @@ #include "pathtag.h" #define N_SEQ 4 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_SEQ) diff --git a/piet-gpu/shader/pathtag_reduce.comp b/piet-gpu/shader/pathtag_reduce.comp index 86bb9d0..9e84bf8 100644 --- a/piet-gpu/shader/pathtag_reduce.comp +++ b/piet-gpu/shader/pathtag_reduce.comp @@ -11,8 +11,8 @@ // Note: the partition size is smaller than pathseg by a factor // of 4, as there are 4 tag bytes to a tag word. -#define N_ROWS 4 -#define LG_WG_SIZE 7 +#define N_ROWS 2 +#define LG_WG_SIZE (6 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/pathtag_scan.comp b/piet-gpu/shader/pathtag_scan.comp index c0d386e..7c1e74b 100644 --- a/piet-gpu/shader/pathtag_scan.comp +++ b/piet-gpu/shader/pathtag_scan.comp @@ -5,10 +5,11 @@ #version 450 #extension GL_GOOGLE_include_directive : enable +#include "setup.h" #include "pathtag.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index 5d4cc73..4211b08 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -27,6 +27,7 @@ #define GRADIENT_WIDTH 512 +#ifdef ERR_MALLOC_FAILED struct Config { uint n_elements; // paths uint n_pathseg; @@ -59,6 +60,7 @@ struct Config { // Offset (in bytes) of path segment stream in scene uint pathseg_offset; }; +#endif // Fill modes. #define MODE_NONZERO 0 diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp index 3761e9e..024f499 100644 --- a/piet-gpu/shader/tile_alloc.comp +++ b/piet-gpu/shader/tile_alloc.comp @@ -72,7 +72,7 @@ void main() { // Prefix sum of sh_tile_count for (uint i = 0; i < LG_TILE_ALLOC_WG; i++) { barrier(); - if (th_ix >= (1 << i)) { + if (th_ix >= (1u << i)) { total_tile_count += sh_tile_count[th_ix - (1u << i)]; } barrier(); diff --git a/piet-gpu/shader/transform_leaf.comp b/piet-gpu/shader/transform_leaf.comp index e158c50..c51dfe6 100644 --- a/piet-gpu/shader/transform_leaf.comp +++ b/piet-gpu/shader/transform_leaf.comp @@ -10,7 +10,7 @@ #include "setup.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/transform_reduce.comp b/piet-gpu/shader/transform_reduce.comp index 4b72b11..e59d559 100644 --- a/piet-gpu/shader/transform_reduce.comp +++ b/piet-gpu/shader/transform_reduce.comp @@ -9,7 +9,7 @@ #include "setup.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/shader/transform_scan.comp b/piet-gpu/shader/transform_scan.comp index 492bf04..c4d6745 100644 --- a/piet-gpu/shader/transform_scan.comp +++ b/piet-gpu/shader/transform_scan.comp @@ -3,9 +3,12 @@ // A scan for a tree reduction prefix scan (either root or not, by ifdef). #version 450 +#extension GL_GOOGLE_include_directive : enable + +#include "setup.h" #define N_ROWS 8 -#define LG_WG_SIZE 9 +#define LG_WG_SIZE (7 + LG_WG_FACTOR) #define WG_SIZE (1 << LG_WG_SIZE) #define PARTITION_SIZE (WG_SIZE * N_ROWS) diff --git a/piet-gpu/src/encoder.rs b/piet-gpu/src/encoder.rs index fb32f26..87fff1c 100644 --- a/piet-gpu/src/encoder.rs +++ b/piet-gpu/src/encoder.rs @@ -19,7 +19,9 @@ use bytemuck::{Pod, Zeroable}; use piet_gpu_hal::BufWrite; -use crate::stages::{self, Config, PathEncoder, Transform}; +use crate::stages::{ + self, Config, PathEncoder, Transform, DRAW_PART_SIZE, PATHSEG_PART_SIZE, TRANSFORM_PART_SIZE, +}; pub struct Encoder { transform_stream: Vec, @@ -52,12 +54,6 @@ const BBOX_SIZE: usize = 24; const DRAWMONOID_SIZE: usize = 8; const ANNOTATED_SIZE: usize = 40; -// Maybe pull these from the relevant stages? In any case, they may depend -// on runtime query of GPU (supported workgroup size). -const TRANSFORM_PART_SIZE: usize = 4096; -const PATHSEG_PART_SIZE: usize = 2048; -const DRAWOBJ_PART_SIZE: usize = 4096; - // These are bytemuck versions of elements currently defined in the // Element struct in piet-gpu-types; that's pretty much going away. @@ -183,15 +179,15 @@ impl Encoder { pub fn stage_config(&self) -> (Config, usize) { // Layout of scene buffer let n_drawobj = self.n_drawobj(); - let n_drawobj_padded = align_up(n_drawobj, DRAWOBJ_PART_SIZE); + let n_drawobj_padded = align_up(n_drawobj, DRAW_PART_SIZE as usize); let trans_offset = n_drawobj_padded * DRAWOBJ_SIZE; let n_trans = self.transform_stream.len(); - let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE); + let n_trans_padded = align_up(n_trans, TRANSFORM_PART_SIZE as usize); let linewidth_offset = trans_offset + n_trans_padded * TRANSFORM_SIZE; let n_linewidth = self.linewidth_stream.len(); let pathtag_offset = linewidth_offset + n_linewidth * LINEWIDTH_SIZE; let n_pathtag = self.tag_stream.len(); - let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE); + let n_pathtag_padded = align_up(n_pathtag, PATHSEG_PART_SIZE as usize); let pathseg_offset = pathtag_offset + n_pathtag_padded; // Layout of memory @@ -230,14 +226,14 @@ impl Encoder { pub fn write_scene(&self, buf: &mut BufWrite) { buf.extend_slice(&self.drawobj_stream); let n_drawobj = self.drawobj_stream.len() / DRAWOBJ_SIZE; - buf.fill_zero(padding(n_drawobj, DRAWOBJ_PART_SIZE) * DRAWOBJ_SIZE); + buf.fill_zero(padding(n_drawobj, DRAW_PART_SIZE as usize) * DRAWOBJ_SIZE); buf.extend_slice(&self.transform_stream); let n_trans = self.transform_stream.len(); - buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE) * TRANSFORM_SIZE); + buf.fill_zero(padding(n_trans, TRANSFORM_PART_SIZE as usize) * TRANSFORM_SIZE); buf.extend_slice(&self.linewidth_stream); buf.extend_slice(&self.tag_stream); let n_pathtag = self.tag_stream.len(); - buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE)); + buf.fill_zero(padding(n_pathtag, PATHSEG_PART_SIZE as usize)); buf.extend_slice(&self.pathseg_stream); } diff --git a/piet-gpu/src/render_ctx.rs b/piet-gpu/src/render_ctx.rs index 31fbf9e..96bbf03 100644 --- a/piet-gpu/src/render_ctx.rs +++ b/piet-gpu/src/render_ctx.rs @@ -11,7 +11,7 @@ use piet::{ use piet_gpu_hal::BufWrite; use piet_gpu_types::encoder::{Encode, Encoder}; -use piet_gpu_types::scene::{Element, SetFillMode}; +use piet_gpu_types::scene::Element; use crate::gradient::{LinearGradient, RampCache}; use crate::text::Font; @@ -25,7 +25,6 @@ pub struct PietGpuRenderContext { // Will probably need direct accesss to hal Device to create images etc. inner_text: PietGpuText, stroke_width: f32, - fill_mode: FillMode, // We're tallying these cpu-side for expedience, but will probably // move this to some kind of readback from element processing. /// The count of elements that make it through to coarse rasterization. @@ -69,14 +68,6 @@ struct ClipElement { bbox: Option, } -#[derive(Clone, Copy, PartialEq)] -pub(crate) enum FillMode { - // Fill path according to the non-zero winding rule. - Nonzero = 0, - // Fill stroked path. - Stroke = 1, -} - const TOLERANCE: f64 = 0.25; impl PietGpuRenderContext { @@ -91,7 +82,6 @@ impl PietGpuRenderContext { elements, inner_text, stroke_width, - fill_mode: FillMode::Nonzero, path_count: 0, pathseg_count: 0, trans_count: 0, @@ -160,15 +150,6 @@ impl PietGpuRenderContext { pub fn get_ramp_data(&self) -> Vec { self.ramp_cache.get_ramp_data() } - - pub(crate) fn set_fill_mode(&mut self, fill_mode: FillMode) { - if self.fill_mode != fill_mode { - self.elements.push(Element::SetFillMode(SetFillMode { - fill_mode: fill_mode as u32, - })); - self.fill_mode = fill_mode; - } - } } impl RenderContext for PietGpuRenderContext { diff --git a/piet-gpu/src/stages.rs b/piet-gpu/src/stages.rs index 1683cac..014cef4 100644 --- a/piet-gpu/src/stages.rs +++ b/piet-gpu/src/stages.rs @@ -22,10 +22,12 @@ mod transform; use bytemuck::{Pod, Zeroable}; -pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage}; -pub use path::{PathBinding, PathCode, PathEncoder, PathStage}; +pub use draw::{DrawBinding, DrawCode, DrawMonoid, DrawStage, DRAW_PART_SIZE}; +pub use path::{PathBinding, PathCode, PathEncoder, PathStage, PATHSEG_PART_SIZE}; use piet_gpu_hal::{Buffer, CmdBuf, Session}; -pub use transform::{Transform, TransformBinding, TransformCode, TransformStage}; +pub use transform::{ + Transform, TransformBinding, TransformCode, TransformStage, TRANSFORM_PART_SIZE, +}; /// The configuration block passed to piet-gpu shaders. /// diff --git a/piet-gpu/src/stages/draw.rs b/piet-gpu/src/stages/draw.rs index da773cf..5328a84 100644 --- a/piet-gpu/src/stages/draw.rs +++ b/piet-gpu/src/stages/draw.rs @@ -30,9 +30,9 @@ pub struct DrawMonoid { pub clip_ix: u32, } -const DRAW_WG: u64 = 512; +const DRAW_WG: u64 = 256; const DRAW_N_ROWS: u64 = 8; -const DRAW_PART_SIZE: u64 = DRAW_WG * DRAW_N_ROWS; +pub const DRAW_PART_SIZE: u64 = DRAW_WG * DRAW_N_ROWS; pub struct DrawCode { reduce_pipeline: Pipeline, diff --git a/piet-gpu/src/stages/path.rs b/piet-gpu/src/stages/path.rs index c9d2c60..6c524a2 100644 --- a/piet-gpu/src/stages/path.rs +++ b/piet-gpu/src/stages/path.rs @@ -39,22 +39,23 @@ pub struct PathBinding { } const REDUCE_WG: u32 = 128; -const REDUCE_N_ROWS: u32 = 4; +const REDUCE_N_ROWS: u32 = 2; const REDUCE_PART_SIZE: u32 = REDUCE_WG * REDUCE_N_ROWS; -const ROOT_WG: u32 = 512; +const ROOT_WG: u32 = 256; const ROOT_N_ROWS: u32 = 8; const ROOT_PART_SIZE: u32 = ROOT_WG * ROOT_N_ROWS; -const SCAN_WG: u32 = 512; +const SCAN_WG: u32 = 256; const SCAN_N_ROWS: u32 = 4; const SCAN_PART_SIZE: u32 = SCAN_WG * SCAN_N_ROWS; -const CLEAR_WG: u32 = 512; +pub const PATHSEG_PART_SIZE: u32 = SCAN_PART_SIZE; + +const CLEAR_WG: u32 = 256; impl PathCode { pub unsafe fn new(session: &Session) -> PathCode { - // TODO: add cross-compilation let reduce_code = include_shader!(session, "../../shader/gen/pathtag_reduce"); let reduce_pipeline = session .create_compute_pipeline( diff --git a/piet-gpu/src/stages/transform.rs b/piet-gpu/src/stages/transform.rs index 4383c14..b21712f 100644 --- a/piet-gpu/src/stages/transform.rs +++ b/piet-gpu/src/stages/transform.rs @@ -33,9 +33,9 @@ pub struct Transform { pub translate: [f32; 2], } -const TRANSFORM_WG: u64 = 512; +const TRANSFORM_WG: u64 = 256; const TRANSFORM_N_ROWS: u64 = 8; -const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS; +pub const TRANSFORM_PART_SIZE: u64 = TRANSFORM_WG * TRANSFORM_N_ROWS; pub struct TransformCode { reduce_pipeline: Pipeline, diff --git a/piet-gpu/src/text.rs b/piet-gpu/src/text.rs index a47c614..dec3ffa 100644 --- a/piet-gpu/src/text.rs +++ b/piet-gpu/src/text.rs @@ -11,7 +11,7 @@ use piet::{ }; use crate::encoder::GlyphEncoder; -use crate::render_ctx::{self, FillMode}; +use crate::render_ctx; use crate::stages::Transform; use crate::PietGpuRenderContext; @@ -172,7 +172,6 @@ impl PietGpuTextLayout { let mut inv_transform = None; // TODO: handle y offsets also let mut last_x = 0.0; - ctx.set_fill_mode(FillMode::Nonzero); for glyph in &self.glyphs { let transform = match &mut inv_transform { None => { diff --git a/tests/src/draw.rs b/tests/src/draw.rs index 916f14c..d79a9d9 100644 --- a/tests/src/draw.rs +++ b/tests/src/draw.rs @@ -38,7 +38,8 @@ struct DrawTestData { pub unsafe fn draw_test(runner: &mut Runner, config: &Config) -> TestResult { let mut result = TestResult::new("draw"); - let n_tag: u64 = config.size.choose(1 << 12, 1 << 20, 1 << 24); + // TODO: implement large scan and set large to 1 << 24 + let n_tag: u64 = config.size.choose(1 << 12, 1 << 20, 1 << 22); let data = DrawTestData::new(n_tag); let stage_config = data.get_config(); diff --git a/tests/src/path.rs b/tests/src/path.rs index 7c5388f..6f1f61a 100644 --- a/tests/src/path.rs +++ b/tests/src/path.rs @@ -62,7 +62,8 @@ struct Bbox { pub unsafe fn path_test(runner: &mut Runner, config: &Config) -> TestResult { let mut result = TestResult::new("path"); - let n_path: u64 = config.size.choose(1 << 12, 1 << 16, 1 << 18); + // TODO: implement large scans and raise limit + let n_path: u64 = config.size.choose(1 << 12, 1 << 16, 209_000); let path_data = PathData::new(n_path as u32); let stage_config = path_data.get_config(); let config_buf = runner diff --git a/tests/src/transform.rs b/tests/src/transform.rs index 1c15634..6edcc3f 100644 --- a/tests/src/transform.rs +++ b/tests/src/transform.rs @@ -30,7 +30,8 @@ struct AffineTestData { pub unsafe fn transform_test(runner: &mut Runner, config: &Config) -> TestResult { let mut result = TestResult::new("transform"); - let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 24); + // TODO: implement large scan and set large to 1 << 24 + let n_elements: u64 = config.size.choose(1 << 12, 1 << 18, 1 << 22); // Validate with real transform data. let data = AffineTestData::new(n_elements as usize); let data_buf = runner