replace branches with chained selects

This exchanges the per-pixel branching for additional ALU work and selects. My expectation is that this will be faster, but that may depend on the hardware and driver, and confirming it likely requires profiling and examining the generated code.

The original branching code is kept in comments, along with notes that explain the less readable select-based version.
This commit is contained in:
Chad Brokaw 2023-05-11 12:37:36 -04:00
parent b103a55301
commit 5e1188f968

View file

@ -115,22 +115,33 @@ fn read_end_clip(cmd_ix: u32) -> CmdEndClip {
} }
fn extend_mode(t: f32, mode: u32) -> f32 { fn extend_mode(t: f32, mode: u32) -> f32 {
// This can be replaced with two selects, exchanging the cost let EXTEND_PAD = 0u;
// of a branch for additional ALU let EXTEND_REPEAT = 1u;
switch mode { let EXTEND_REFLECT = 2u;
// PAD // Branching version of the code below:
case 0u: { //
return clamp(t, 0.0, 1.0); // switch mode {
} // // EXTEND_PAD
// REPEAT // case 0u: {
case 1u: { // return clamp(t, 0.0, 1.0);
return fract(t); // }
} // // EXTEND_REPEAT
// REFLECT (2) // case 1u: {
default: { // return fract(t);
return abs(t - 2.0 * round(0.5 * t)); // }
} // // EXTEND_REFLECT
} // default: {
// return abs(t - 2.0 * round(0.5 * t));
// }
// }
let pad = clamp(t, 0.0, 1.0);
let repeat = fract(t);
let reflect = abs(t - 2.0 * round(0.5 * t));
return select(
select(pad, repeat, mode == EXTEND_REPEAT),
reflect,
mode == EXTEND_REFLECT
);
} }
#else #else
@ -304,9 +315,9 @@ fn main(
let is_circular = rad.kind == RAD_GRAD_KIND_CIRCULAR; let is_circular = rad.kind == RAD_GRAD_KIND_CIRCULAR;
let is_focal_on_circle = rad.kind == RAD_GRAD_KIND_FOCAL_ON_CIRCLE; let is_focal_on_circle = rad.kind == RAD_GRAD_KIND_FOCAL_ON_CIRCLE;
let is_swapped = (rad.flags & RAD_GRAD_SWAPPED) != 0u; let is_swapped = (rad.flags & RAD_GRAD_SWAPPED) != 0u;
let is_greater = radius > 1.0;
let inv_r1 = select(1.0 / radius, 0.0, is_circular); let inv_r1 = select(1.0 / radius, 0.0, is_circular);
let root_f = select(1.0, -1.0, is_swapped || one_minus_focal_x < 0.0); let less_scale = select(1.0, -1.0, is_swapped || one_minus_focal_x < 0.0);
let t_base_scale = select(vec2(0.0, -1.0), vec2(1.0, 1.0), is_swapped);
let t_sign = sign(one_minus_focal_x); let t_sign = sign(one_minus_focal_x);
for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) { for (var i = 0u; i < PIXELS_PER_THREAD; i += 1u) {
let my_xy = vec2(xy.x + f32(i), xy.y); let my_xy = vec2(xy.x + f32(i), xy.y);
@ -316,25 +327,67 @@ fn main(
let xx = x * x; let xx = x * x;
let yy = y * y; let yy = y * y;
let x_inv_r1 = x * inv_r1; let x_inv_r1 = x * inv_r1;
var t = 0.0; // This is the branching version of the code implemented
var valid = true; // by the chained selects below:
if is_strip { //
let a = radius - yy; // var t = 0.0;
t = sqrt(a) + x; // var is_valid = true;
valid = a >= 0.0; // if is_strip {
} else if is_focal_on_circle { // let a = radius - yy;
t = (xx + yy) / x; // t = sqrt(a) + x;
valid = t >= 0.0; // is_valid = a >= 0.0;
} else if radius > 1.0 { // } else if is_focal_on_circle {
t = sqrt(xx + yy) - x_inv_r1; // t = (xx + yy) / x;
} else { // is_valid = t >= 0.0;
let a = xx - yy; // } else if radius > 1.0 {
t = root_f * sqrt(a) - x_inv_r1; // t = sqrt(xx + yy) - x_inv_r1;
valid = a >= 0.0 && t >= 0.0; // } else {
} // let a = xx - yy;
if valid { // t = root_f * sqrt(a) - x_inv_r1;
// is_valid = a >= 0.0 && t >= 0.0;
// }
//
// The pattern is that these can all be computed with
// the expression: a * sqrt(b) + c
//
// The parameters to the expression are computed up front
// and chosen with chained selects based on their
// respective conditions. The same process is done
// for determining the validity of the resulting value.
var strip_params = vec3(1.0, radius - yy, x);
var foc_params = vec3(1.0, 0.0, (xx + yy) / x);
var greater_params = vec3(1.0, xx + yy, -x_inv_r1);
var less_params = vec3(less_scale, xx - yy, -x_inv_r1);
var params = select(
select(
select(
less_params,
greater_params,
is_greater,
),
foc_params,
is_focal_on_circle,
),
strip_params,
is_strip,
);
var t = params.x * sqrt(params.y) + params.z;
let is_valid = select(
select(
select(
params.y >= 0.0 && t >= 0.0,
true,
is_greater
),
t >= 0.0 && x != 0.0,
is_focal_on_circle,
),
params.y >= 0.0,
is_strip,
);
if is_valid {
t = extend_mode(focal_x + t_sign * t, rad.extend_mode); t = extend_mode(focal_x + t_sign * t, rad.extend_mode);
t = (t_base_scale.x - t) * t_base_scale.y; t = select(t, 1.0 - t, is_swapped);
let x = i32(round(t * f32(GRADIENT_WIDTH - 1))); let x = i32(round(t * f32(GRADIENT_WIDTH - 1)));
let fg_rgba = textureLoad(gradients, vec2(x, i32(rad.index)), 0); let fg_rgba = textureLoad(gradients, vec2(x, i32(rad.index)), 0);
let fg_i = fg_rgba * area[i]; let fg_i = fg_rgba * area[i];