diff --git a/blurs/blur11fast-horizontal-gamma-encode-every-fbo.slang b/blurs/blur11fast-horizontal-gamma-encode-every-fbo.slang index d910887..4e8a32c 100644 --- a/blurs/blur11fast-horizontal-gamma-encode-every-fbo.slang +++ b/blurs/blur11fast-horizontal-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11fast-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur11fast-horizontal-last-pass-gamma-encode-every-fbo.slang index 36997c4..d5a77a3 100644 --- a/blurs/blur11fast-horizontal-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur11fast-horizontal-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex 
#include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11fast-horizontal-last-pass.slang b/blurs/blur11fast-horizontal-last-pass.slang index 1220392..95ae5bb 100644 --- a/blurs/blur11fast-horizontal-last-pass.slang +++ b/blurs/blur11fast-horizontal-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11fast-horizontal.slang b/blurs/blur11fast-horizontal.slang index 72bdcf0..5ecf46f 100644 --- a/blurs/blur11fast-horizontal.slang +++ b/blurs/blur11fast-horizontal.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// 
-////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11fast-vertical-gamma-encode-every-fbo.slang b/blurs/blur11fast-vertical-gamma-encode-every-fbo.slang index 843c0b1..4dcad35 100644 --- a/blurs/blur11fast-vertical-gamma-encode-every-fbo.slang +++ b/blurs/blur11fast-vertical-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11fast-vertical.slang b/blurs/blur11fast-vertical.slang index 1cb691e..b4d96c2 
100644 --- a/blurs/blur11fast-vertical.slang +++ b/blurs/blur11fast-vertical.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11resize-horizontal-gamma-encode-every-fbo.slang b/blurs/blur11resize-horizontal-gamma-encode-every-fbo.slang index c67e1f9..d9f57f2 100644 --- a/blurs/blur11resize-horizontal-gamma-encode-every-fbo.slang +++ b/blurs/blur11resize-horizontal-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D 
Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11resize-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur11resize-horizontal-last-pass-gamma-encode-every-fbo.slang index e14b773..2c6bfdf 100644 --- a/blurs/blur11resize-horizontal-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur11resize-horizontal-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11resize-horizontal-last-pass.slang b/blurs/blur11resize-horizontal-last-pass.slang index 3e105a1..6ee8518 100644 --- a/blurs/blur11resize-horizontal-last-pass.slang +++ b/blurs/blur11resize-horizontal-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - 
-// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11resize-horizontal.slang b/blurs/blur11resize-horizontal.slang index 066df28..861d0aa 100644 --- a/blurs/blur11resize-horizontal.slang +++ b/blurs/blur11resize-horizontal.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11resize-vertical-gamma-encode-every-fbo.slang b/blurs/blur11resize-vertical-gamma-encode-every-fbo.slang index 160729c..5902484 100644 --- a/blurs/blur11resize-vertical-gamma-encode-every-fbo.slang +++ 
b/blurs/blur11resize-vertical-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur11resize-vertical.slang b/blurs/blur11resize-vertical.slang index 90158c5..ffd46b9 100644 --- a/blurs/blur11resize-vertical.slang +++ b/blurs/blur11resize-vertical.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include 
"../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur12x12shared.slang b/blurs/blur12x12shared.slang new file mode 100644 index 0000000..cca3ed4 --- /dev/null +++ b/blurs/blur12x12shared.slang @@ -0,0 +1,87 @@ +#version 450 + +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; +} global; + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// PASS SETTINGS: +// gamma-management.h needs to know what kind of pipeline we're using and +// what pass this is in that pipeline. 
This will become obsolete if/when we +// can #define things like this in the .cgp preset file. +//#define GAMMA_ENCODE_EVERY_FBO +//#define FIRST_PASS +//#define LAST_PASS +//#define SIMULATE_CRT_ON_LCD +//#define SIMULATE_GBA_ON_LCD +//#define SIMULATE_LCD_ON_CRT +//#define SIMULATE_GBA_ON_CRT + +// blur-functions.h needs to know our profile's capabilities: +// 1.) DRIVERS_ALLOW_DERIVATIVES is mandatory for one-pass shared-sample blurs. +// 2.) DRIVERS_ALLOW_TEX2DLOD is optional, but mipmapped blurs will have awful +// artifacts without it due to funky texture sampling derivatives. +#define DRIVERS_ALLOW_DERIVATIVES +#define DRIVERS_ALLOW_TEX2DLOD + +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// + +#include "../include/compat_macros.inc" +#pragma stage vertex +#include "vertex-shader-blur-one-pass-shared-sample.h" + +#pragma stage fragment +layout(location = 0) in vec4 tex_uv; +layout(location = 1) in vec2 blur_dxdy; +layout(location = 2) in vec4 output_pixel_num; +layout(location = 0) out vec4 FragColor; +layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" + +void main() +{ + // Get the integer output pixel number from two origins (uv and screen): + float4 output_pixel_num_integer = floor(output_pixel_num); + // Get the fragment's position in the pixel quad and do a shared-sample blur: + float4 quad_vector = get_quad_vector(output_pixel_num_integer); + float3 color = tex2Dblur12x12shared(input_texture, tex_uv, + blur_dxdy, quad_vector); + // Encode and output the blurred image: + FragColor = encode_output(float4(color, 1.0)); +} \ No newline at end of file diff --git a/blurs/blur3fast-horizontal-gamma-encode-every-fbo.slang b/blurs/blur3fast-horizontal-gamma-encode-every-fbo.slang index 212ab6f..0ff0d43 100644 --- 
a/blurs/blur3fast-horizontal-gamma-encode-every-fbo.slang +++ b/blurs/blur3fast-horizontal-gamma-encode-every-fbo.slang @@ -51,12 +51,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_GBA_ON_CRT -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +64,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3fast-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur3fast-horizontal-last-pass-gamma-encode-every-fbo.slang index 188c6b4..e4d4483 100644 --- a/blurs/blur3fast-horizontal-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur3fast-horizontal-last-pass-gamma-encode-every-fbo.slang @@ -51,12 +51,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_GBA_ON_CRT -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +64,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D 
Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3fast-horizontal-last-pass.slang b/blurs/blur3fast-horizontal-last-pass.slang index d508039..432b541 100644 --- a/blurs/blur3fast-horizontal-last-pass.slang +++ b/blurs/blur3fast-horizontal-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3fast-horizontal.slang b/blurs/blur3fast-horizontal.slang index 4d9d5f4..97fffd4 100644 --- a/blurs/blur3fast-horizontal.slang +++ b/blurs/blur3fast-horizontal.slang @@ -51,12 +51,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_GBA_ON_CRT -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// +#include "../include/compat_macros.inc" #pragma stage vertex #include 
"vertex-shader-blur-fast-horizontal.h" @@ -67,6 +64,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3fast-vertical-gamma-encode-every-fbo.slang b/blurs/blur3fast-vertical-gamma-encode-every-fbo.slang index 15a9e6b..e59be4b 100644 --- a/blurs/blur3fast-vertical-gamma-encode-every-fbo.slang +++ b/blurs/blur3fast-vertical-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3fast-vertical.slang b/blurs/blur3fast-vertical.slang index a42c2b8..e9666a0 100644 --- a/blurs/blur3fast-vertical.slang +++ b/blurs/blur3fast-vertical.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES 
/////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3resize-horizontal-gamma-encode-every-fbo.slang b/blurs/blur3resize-horizontal-gamma-encode-every-fbo.slang index c497bf0..f5c7a64 100644 --- a/blurs/blur3resize-horizontal-gamma-encode-every-fbo.slang +++ b/blurs/blur3resize-horizontal-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git 
a/blurs/blur3resize-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur3resize-horizontal-last-pass-gamma-encode-every-fbo.slang index 4eadb90..daf0908 100644 --- a/blurs/blur3resize-horizontal-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur3resize-horizontal-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3resize-horizontal-last-pass.slang b/blurs/blur3resize-horizontal-last-pass.slang index d339ce5..18a2f8a 100644 --- a/blurs/blur3resize-horizontal-last-pass.slang +++ b/blurs/blur3resize-horizontal-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ 
-67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3resize-horizontal.slang b/blurs/blur3resize-horizontal.slang index 5cc3f1c..f9a4eb1 100644 --- a/blurs/blur3resize-horizontal.slang +++ b/blurs/blur3resize-horizontal.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3resize-vertical-gamma-encode-every-fbo.slang b/blurs/blur3resize-vertical-gamma-encode-every-fbo.slang index 35ecc9b..faafc5e 100644 --- a/blurs/blur3resize-vertical-gamma-encode-every-fbo.slang +++ b/blurs/blur3resize-vertical-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES 
/////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3resize-vertical.slang b/blurs/blur3resize-vertical.slang index e5b11ce..ebe89a3 100644 --- a/blurs/blur3resize-vertical.slang +++ b/blurs/blur3resize-vertical.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3x3-gamma-encode-every-fbo.slang b/blurs/blur3x3-gamma-encode-every-fbo.slang index 6af8321..0ca86e8 100644 --- 
a/blurs/blur3x3-gamma-encode-every-fbo.slang +++ b/blurs/blur3x3-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3x3-last-pass-gamma-encode-every-fbo.slang b/blurs/blur3x3-last-pass-gamma-encode-every-fbo.slang index 2567a41..8cd8ecd 100644 --- a/blurs/blur3x3-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur3x3-last-pass-gamma-encode-every-fbo.slang @@ -50,15 +50,11 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex -#include "vertex-shader-blur-one-pass-resize.h" +#include "vertex-shader-blur-one-pass.h" /////////////////////////////// FRAGMENT SHADER ////////////////////////////// @@ -67,10 +63,15 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 
blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { - vec3 color = tex2Dblur3x3resize(Source, tex_uv, blur_dxdy); + vec3 color = tex2Dblur3x3(Source, tex_uv, blur_dxdy); // Encode and output the blurred image: FragColor = encode_output(vec4(color, 1.0)); } \ No newline at end of file diff --git a/blurs/blur3x3-last-pass.slang b/blurs/blur3x3-last-pass.slang index 4290a6c..2073a26 100644 --- a/blurs/blur3x3-last-pass.slang +++ b/blurs/blur3x3-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3x3.slang b/blurs/blur3x3.slang index caffdf8..af918be 100644 --- a/blurs/blur3x3.slang +++ b/blurs/blur3x3.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// 
-////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3x3resize-gamma-encode-every-fbo.slang b/blurs/blur3x3resize-gamma-encode-every-fbo.slang index abc4b1e..7c0d356 100644 --- a/blurs/blur3x3resize-gamma-encode-every-fbo.slang +++ b/blurs/blur3x3resize-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass-resize.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3x3resize-last-pass-gamma-encode-every-fbo.slang 
b/blurs/blur3x3resize-last-pass-gamma-encode-every-fbo.slang index 2567a41..092e074 100644 --- a/blurs/blur3x3resize-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur3x3resize-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass-resize.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3x3resize-last-pass.slang b/blurs/blur3x3resize-last-pass.slang index b430f00..84ed9c1 100644 --- a/blurs/blur3x3resize-last-pass.slang +++ b/blurs/blur3x3resize-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass-resize.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; 
layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur3x3resize.slang b/blurs/blur3x3resize.slang index c72d02f..35c8f7d 100644 --- a/blurs/blur3x3resize.slang +++ b/blurs/blur3x3resize.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass-resize.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5fast-horizontal-gamma-encode-every-fbo.slang b/blurs/blur5fast-horizontal-gamma-encode-every-fbo.slang index 1644912..8f2c028 100644 --- a/blurs/blur5fast-horizontal-gamma-encode-every-fbo.slang +++ b/blurs/blur5fast-horizontal-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include 
"../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5fast-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur5fast-horizontal-last-pass-gamma-encode-every-fbo.slang index c074006..d396723 100644 --- a/blurs/blur5fast-horizontal-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur5fast-horizontal-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5fast-horizontal-last-pass.slang b/blurs/blur5fast-horizontal-last-pass.slang index 240b90e..d8c6b3d 100644 --- a/blurs/blur5fast-horizontal-last-pass.slang +++ 
b/blurs/blur5fast-horizontal-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5fast-horizontal.slang b/blurs/blur5fast-horizontal.slang index b638643..1b0b167 100644 --- a/blurs/blur5fast-horizontal.slang +++ b/blurs/blur5fast-horizontal.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include 
"../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5fast-vertical-gamma-encode-every-fbo.slang b/blurs/blur5fast-vertical-gamma-encode-every-fbo.slang index 922c71c..4a6b833 100644 --- a/blurs/blur5fast-vertical-gamma-encode-every-fbo.slang +++ b/blurs/blur5fast-vertical-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5fast-vertical.slang b/blurs/blur5fast-vertical.slang index 0750af7..0d76d57 100644 --- a/blurs/blur5fast-vertical.slang +++ b/blurs/blur5fast-vertical.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-vertical.h" @@ -67,6 +63,11 @@ 
layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5resize-horizontal-gamma-encode-every-fbo.slang b/blurs/blur5resize-horizontal-gamma-encode-every-fbo.slang index 0f4b9ff..81d640b 100644 --- a/blurs/blur5resize-horizontal-gamma-encode-every-fbo.slang +++ b/blurs/blur5resize-horizontal-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5resize-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur5resize-horizontal-last-pass-gamma-encode-every-fbo.slang index 09935ac..5e8ccc8 100644 --- a/blurs/blur5resize-horizontal-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur5resize-horizontal-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define 
SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5resize-horizontal-last-pass.slang b/blurs/blur5resize-horizontal-last-pass.slang index 929d899..14862b9 100644 --- a/blurs/blur5resize-horizontal-last-pass.slang +++ b/blurs/blur5resize-horizontal-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff 
--git a/blurs/blur5resize-horizontal.slang b/blurs/blur5resize-horizontal.slang index baf3430..c98c620 100644 --- a/blurs/blur5resize-horizontal.slang +++ b/blurs/blur5resize-horizontal.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5resize-vertical-gamma-encode-every-fbo.slang b/blurs/blur5resize-vertical-gamma-encode-every-fbo.slang index f59b54d..958c4cf 100644 --- a/blurs/blur5resize-vertical-gamma-encode-every-fbo.slang +++ b/blurs/blur5resize-vertical-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 
blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5resize-vertical.slang b/blurs/blur5resize-vertical.slang index 061ee15..290992e 100644 --- a/blurs/blur5resize-vertical.slang +++ b/blurs/blur5resize-vertical.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5x5-last-pass-gamma-encode-every-fbo.slang b/blurs/blur5x5-last-pass-gamma-encode-every-fbo.slang index 4019848..cb5f7e6 100644 --- a/blurs/blur5x5-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur5x5-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by 
vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5x5-last-pass.slang b/blurs/blur5x5-last-pass.slang index 80a958f..aeb2519 100644 --- a/blurs/blur5x5-last-pass.slang +++ b/blurs/blur5x5-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur5x5.slang b/blurs/blur5x5.slang index 39b01a3..03560d9 100644 --- a/blurs/blur5x5.slang +++ b/blurs/blur5x5.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX 
INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7fast-horizontal-gamma-encode-every-fbo.slang b/blurs/blur7fast-horizontal-gamma-encode-every-fbo.slang index 5e973be..c1602ed 100644 --- a/blurs/blur7fast-horizontal-gamma-encode-every-fbo.slang +++ b/blurs/blur7fast-horizontal-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git 
a/blurs/blur7fast-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur7fast-horizontal-last-pass-gamma-encode-every-fbo.slang index 20b09ab..49a452c 100644 --- a/blurs/blur7fast-horizontal-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur7fast-horizontal-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7fast-horizontal-last-pass.slang b/blurs/blur7fast-horizontal-last-pass.slang index dc8029b..c7079e5 100644 --- a/blurs/blur7fast-horizontal-last-pass.slang +++ b/blurs/blur7fast-horizontal-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ 
layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7fast-horizontal.slang b/blurs/blur7fast-horizontal.slang index 9f0bb91..6cb6277 100644 --- a/blurs/blur7fast-horizontal.slang +++ b/blurs/blur7fast-horizontal.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7fast-vertical-gamma-encode-every-fbo.slang b/blurs/blur7fast-vertical-gamma-encode-every-fbo.slang index 6ead23e..b757482 100644 --- a/blurs/blur7fast-vertical-gamma-encode-every-fbo.slang +++ b/blurs/blur7fast-vertical-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// 
-////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7fast-vertical.slang b/blurs/blur7fast-vertical.slang index bff459e..9da9398 100644 --- a/blurs/blur7fast-vertical.slang +++ b/blurs/blur7fast-vertical.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7resize-horizontal-gamma-encode-every-fbo.slang b/blurs/blur7resize-horizontal-gamma-encode-every-fbo.slang index afcbb86..4915ab8 100644 --- 
a/blurs/blur7resize-horizontal-gamma-encode-every-fbo.slang +++ b/blurs/blur7resize-horizontal-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7resize-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur7resize-horizontal-last-pass-gamma-encode-every-fbo.slang index f4f497b..8c71b17 100644 --- a/blurs/blur7resize-horizontal-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur7resize-horizontal-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; 
layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7resize-horizontal-last-pass.slang b/blurs/blur7resize-horizontal-last-pass.slang index 2183817..85000dc 100644 --- a/blurs/blur7resize-horizontal-last-pass.slang +++ b/blurs/blur7resize-horizontal-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7resize-horizontal.slang b/blurs/blur7resize-horizontal.slang index 4726b55..51e9fbb 100644 --- a/blurs/blur7resize-horizontal.slang +++ b/blurs/blur7resize-horizontal.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include 
"../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7resize-vertical-gamma-encode-every-fbo.slang b/blurs/blur7resize-vertical-gamma-encode-every-fbo.slang index 18cb379..1c016c4 100644 --- a/blurs/blur7resize-vertical-gamma-encode-every-fbo.slang +++ b/blurs/blur7resize-vertical-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7resize-vertical.slang b/blurs/blur7resize-vertical.slang index 250e84d..4fbc043 100644 --- a/blurs/blur7resize-vertical.slang +++ b/blurs/blur7resize-vertical.slang @@ -50,13 +50,9 @@ 
layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7x7-gamma-encode-every-fbo.slang b/blurs/blur7x7-gamma-encode-every-fbo.slang index 82aa0cb..fcaf5d3 100644 --- a/blurs/blur7x7-gamma-encode-every-fbo.slang +++ b/blurs/blur7x7-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include 
"../include/blur-functions.h" void main() { diff --git a/blurs/blur7x7-last-pass-gamma-encode-every-fbo.slang b/blurs/blur7x7-last-pass-gamma-encode-every-fbo.slang index 1d2b266..65bde8c 100644 --- a/blurs/blur7x7-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur7x7-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7x7-last-pass.slang b/blurs/blur7x7-last-pass.slang index 5967195..541a536 100644 --- a/blurs/blur7x7-last-pass.slang +++ b/blurs/blur7x7-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 
blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur7x7.slang b/blurs/blur7x7.slang index 9e5fd8a..9ef0177 100644 --- a/blurs/blur7x7.slang +++ b/blurs/blur7x7.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9fast-horizontal-gamma-encode-every-fbo.slang b/blurs/blur9fast-horizontal-gamma-encode-every-fbo.slang index a84e349..202bd8d 100755 --- a/blurs/blur9fast-horizontal-gamma-encode-every-fbo.slang +++ b/blurs/blur9fast-horizontal-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include 
"../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9fast-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur9fast-horizontal-last-pass-gamma-encode-every-fbo.slang index 645bee6..7d8d2a7 100755 --- a/blurs/blur9fast-horizontal-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur9fast-horizontal-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9fast-horizontal-last-pass.slang b/blurs/blur9fast-horizontal-last-pass.slang index 12c6207..5885f2e 100755 --- a/blurs/blur9fast-horizontal-last-pass.slang 
+++ b/blurs/blur9fast-horizontal-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9fast-horizontal.slang b/blurs/blur9fast-horizontal.slang index 17aa56d..ff679f5 100755 --- a/blurs/blur9fast-horizontal.slang +++ b/blurs/blur9fast-horizontal.slang @@ -1,18 +1,5 @@ #version 450 -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; -} params; - -layout(std140, set = 0, binding = 0) uniform UBO -{ - mat4 MVP; -} global; - ///////////////////////////////// MIT LICENSE //////////////////////////////// // Copyright (C) 2014 TroggleMonkey @@ -35,6 +22,18 @@ layout(std140, set = 0, binding = 0) uniform UBO // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. 
+layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; +} global; ///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// @@ -50,24 +49,24 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-horizontal.h" /////////////////////////////// FRAGMENT SHADER ////////////////////////////// #pragma stage fragment -#pragma format R8G8B8A8_SRGB layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9fast-vertical-gamma-encode-every-fbo.slang b/blurs/blur9fast-vertical-gamma-encode-every-fbo.slang index c351fbf..5db9055 100755 --- a/blurs/blur9fast-vertical-gamma-encode-every-fbo.slang +++ b/blurs/blur9fast-vertical-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - 
+#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9fast-vertical.slang b/blurs/blur9fast-vertical.slang index 2f39809..356df0d 100755 --- a/blurs/blur9fast-vertical.slang +++ b/blurs/blur9fast-vertical.slang @@ -1,18 +1,5 @@ #version 450 -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; -} params; - -layout(std140, set = 0, binding = 0) uniform UBO -{ - mat4 MVP; -} global; - ///////////////////////////////// MIT LICENSE //////////////////////////////// // Copyright (C) 2014 TroggleMonkey @@ -35,6 +22,18 @@ layout(std140, set = 0, binding = 0) uniform UBO // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. 
+layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; +} global; ///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// @@ -50,28 +49,28 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-fast-vertical.h" /////////////////////////////// FRAGMENT SHADER ////////////////////////////// #pragma stage fragment -#pragma format R8G8B8A8_SRGB layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { - vec3 color = tex2Dblur9fast(Source, tex_uv, blur_dxdy); + vec3 color = tex2Dblur9fast(input_texture, tex_uv, blur_dxdy); // Encode and output the blurred image: - FragColor = encode_output(vec4(color, 1.0)); + FragColor = encode_output(float4(color, 1.0)); } diff --git a/blurs/blur9resize-horizontal-gamma-encode-every-fbo.slang b/blurs/blur9resize-horizontal-gamma-encode-every-fbo.slang index e50fb4b..a7a6b3e 100644 --- a/blurs/blur9resize-horizontal-gamma-encode-every-fbo.slang +++ b/blurs/blur9resize-horizontal-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT 
+/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9resize-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur9resize-horizontal-last-pass-gamma-encode-every-fbo.slang index 0a436c8..a678730 100644 --- a/blurs/blur9resize-horizontal-last-pass-gamma-encode-every-fbo.slang +++ b/blurs/blur9resize-horizontal-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include 
"../include/blur-functions.h" void main() { diff --git a/blurs/blur9resize-horizontal-last-pass.slang b/blurs/blur9resize-horizontal-last-pass.slang index 71a5b0c..b9b5fa8 100644 --- a/blurs/blur9resize-horizontal-last-pass.slang +++ b/blurs/blur9resize-horizontal-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9resize-horizontal.slang b/blurs/blur9resize-horizontal.slang index 11248b9..b89c1d5 100644 --- a/blurs/blur9resize-horizontal.slang +++ b/blurs/blur9resize-horizontal.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-horizontal.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 
blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9resize-vertical-gamma-encode-every-fbo.slang b/blurs/blur9resize-vertical-gamma-encode-every-fbo.slang index cdf42d3..e951332 100644 --- a/blurs/blur9resize-vertical-gamma-encode-every-fbo.slang +++ b/blurs/blur9resize-vertical-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9resize-vertical.slang b/blurs/blur9resize-vertical.slang index 20426cc..beb6ef4 100644 --- a/blurs/blur9resize-vertical.slang +++ b/blurs/blur9resize-vertical.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// 
#included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-resize-vertical.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9x9-gamma-encode-every-fbo.slang b/blurs/blur9x9-gamma-encode-every-fbo.slang index e9c074b..8d69528 100644 --- a/blurs/blur9x9-gamma-encode-every-fbo.slang +++ b/blurs/blur9x9-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9x9-last-pass-gamma-encode-every-fbo.slang b/blurs/blur9x9-last-pass-gamma-encode-every-fbo.slang index d3dedf2..cf2d2ab 100644 --- a/blurs/blur9x9-last-pass-gamma-encode-every-fbo.slang +++ 
b/blurs/blur9x9-last-pass-gamma-encode-every-fbo.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9x9-last-pass.slang b/blurs/blur9x9-last-pass.slang index cb6a5ce..2f3c40d 100644 --- a/blurs/blur9x9-last-pass.slang +++ b/blurs/blur9x9-last-pass.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" 
+#include "../include/blur-functions.h" void main() { diff --git a/blurs/blur9x9.slang b/blurs/blur9x9.slang index ed1c665..d33e933 100644 --- a/blurs/blur9x9.slang +++ b/blurs/blur9x9.slang @@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO //#define SIMULATE_LCD_ON_CRT //#define SIMULATE_GBA_ON_CRT +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// -////////////////////////////////// INCLUDES ////////////////////////////////// - -// #included by vertex shader: -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" - +#include "../include/compat_macros.inc" #pragma stage vertex #include "vertex-shader-blur-one-pass.h" @@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 blur_dxdy; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +///////////////////////////// FRAGMENT INCLUDES ///////////////////////////// +#include "../include/gamma-management.h" +#include "../include/blur-functions.h" void main() { diff --git a/blurs/vertex-shader-blur-fast-horizontal.h b/blurs/vertex-shader-blur-fast-horizontal.h index 78390a6..9398b23 100644 --- a/blurs/vertex-shader-blur-fast-horizontal.h +++ b/blurs/vertex-shader-blur-fast-horizontal.h @@ -30,8 +30,8 @@ ////////////////////////////////// INCLUDES ////////////////////////////////// -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" +//#include "../include/gamma-management.h" +//#include "../include/blur-functions.h" #pragma stage vertex layout(location = 0) in vec4 Position; diff --git a/blurs/vertex-shader-blur-fast-vertical.h b/blurs/vertex-shader-blur-fast-vertical.h index cd324a0..f5d9ddb 100644 --- a/blurs/vertex-shader-blur-fast-vertical.h +++ b/blurs/vertex-shader-blur-fast-vertical.h @@ -23,17 +23,10 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. 
- -///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// - -// PASS SETTINGS: -// Pass settings should be set by the shader file that #includes this one. - - ////////////////////////////////// INCLUDES ////////////////////////////////// -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" +//#include "../include/gamma-management.h" +//#include "../include/blur-functions.h" #pragma stage vertex layout(location = 0) in vec4 Position; @@ -55,10 +48,10 @@ void main() // (not output pixels), but we avoid this and consistently blur at the // destination size. Otherwise, combining statically calculated weights // with bilinear sample exploitation would result in terrible artifacts. - const vec2 dxdy_scale = params.SourceSize.xy * params.OutputSize.zw; - const vec2 dxdy = dxdy_scale * params.SourceSize.zw; + const float2 dxdy_scale = IN.video_size/IN.output_size; + const float2 dxdy = dxdy_scale/IN.texture_size; // This blur is vertical-only, so zero out the horizontal offset: - blur_dxdy = vec2(0.0, dxdy.y); + blur_dxdy = float2(0.0, dxdy.y); } #endif // VERTEX_SHADER_BLUR_FAST_VERTICAL_H \ No newline at end of file diff --git a/blurs/vertex-shader-blur-one-pass-resize.h b/blurs/vertex-shader-blur-one-pass-resize.h index 3321625..2082630 100644 --- a/blurs/vertex-shader-blur-one-pass-resize.h +++ b/blurs/vertex-shader-blur-one-pass-resize.h @@ -32,8 +32,8 @@ ////////////////////////////////// INCLUDES ////////////////////////////////// -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" +//#include "../include/gamma-management.h" +//#include "../include/blur-functions.h" #pragma stage vertex layout(location = 0) in vec4 Position; diff --git a/blurs/vertex-shader-blur-one-pass-shared-sample.h b/blurs/vertex-shader-blur-one-pass-shared-sample.h index fd960d4..4dd8e30 100644 --- a/blurs/vertex-shader-blur-one-pass-shared-sample.h +++ b/blurs/vertex-shader-blur-one-pass-shared-sample.h @@ 
-32,20 +32,20 @@ ////////////////////////////////// INCLUDES ////////////////////////////////// -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" +//#include "../include/gamma-management.h" +//#include "../include/blur-functions.h" #pragma stage vertex layout(location = 0) in vec4 Position; layout(location = 1) in vec2 TexCoord; -layout(location = 0) out vec2 tex_uv; +layout(location = 0) out vec4 tex_uv; layout(location = 1) out vec4 output_pixel_num; layout(location = 2) out vec2 blur_dxdy; void main() { gl_Position = global.MVP * Position; - tex_uv = TexCoord; + vec2 tex_uv_ = TexCoord; // Get the uv sample distance between output pixels. Blurs are not generic // Gaussian resizers, and correct blurs require: @@ -57,21 +57,21 @@ void main() // (not output pixels), but we avoid this and consistently blur at the // destination size. Otherwise, combining statically calculated weights // with bilinear sample exploitation would result in terrible artifacts. - const vec2 dxdy_scale params.SourceSize.xy * params.OutputSize.zw; + const vec2 dxdy_scale = params.SourceSize.xy * params.OutputSize.zw; blur_dxdy = dxdy_scale * params.SourceSize.zw; // Get the output pixel number in ([0, xres), [0, yres)) with respect to // the uv origin (.xy components) and the screen origin (.zw components). // Both are useful. Don't round until the fragment shader. 
- const float2 video_uv = tex_uv; + const float2 video_uv = tex_uv_; output_pixel_num.xy = params.OutputSize.xy * vec2(video_uv.x, video_uv.y); output_pixel_num.zw = params.OutputSize.xy * - (out_position.xy * 0.5 + vec2(0.5)); + (gl_Position.xy * 0.5 + vec2(0.5)); // Set the mip level correctly for shared-sample blurs (where the // derivatives are unreliable): const float mip_level = log2(params.SourceSize.xy * params.OutputSize.zw).y; - tex_uv = vec4(tex_uv, 0.0, mip_level); + tex_uv = vec4(tex_uv_, 0.0, mip_level); } #endif // VERTEX_SHADER_BLUR_ONE_PASS_SHARED_SAMPLE_H \ No newline at end of file diff --git a/blurs/vertex-shader-blur-one-pass.h b/blurs/vertex-shader-blur-one-pass.h index d94a6df..7d62899 100644 --- a/blurs/vertex-shader-blur-one-pass.h +++ b/blurs/vertex-shader-blur-one-pass.h @@ -32,8 +32,8 @@ ////////////////////////////////// INCLUDES ////////////////////////////////// -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" +//#include "../include/gamma-management.h" +//#include "../include/blur-functions.h" #pragma stage vertex layout(location = 0) in vec4 Position; diff --git a/blurs/vertex-shader-blur-resize-horizontal.h b/blurs/vertex-shader-blur-resize-horizontal.h index 66b2179..407d47d 100644 --- a/blurs/vertex-shader-blur-resize-horizontal.h +++ b/blurs/vertex-shader-blur-resize-horizontal.h @@ -32,8 +32,8 @@ ////////////////////////////////// INCLUDES ////////////////////////////////// -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" +//#include "../include/gamma-management.h" +//#include "../include/blur-functions.h" #pragma stage vertex layout(location = 0) in vec4 Position; diff --git a/blurs/vertex-shader-blur-resize-vertical.h b/blurs/vertex-shader-blur-resize-vertical.h index 41fbe2b..6265af7 100644 --- a/blurs/vertex-shader-blur-resize-vertical.h +++ b/blurs/vertex-shader-blur-resize-vertical.h @@ -32,8 +32,8 @@ ////////////////////////////////// INCLUDES 
////////////////////////////////// -#include "../include/gamma-management.h" -#include "../include/blur-functions.h" +//#include "../include/gamma-management.h" +//#include "../include/blur-functions.h" #pragma stage vertex layout(location = 0) in vec4 Position; diff --git a/crt/crt-royale-fake-bloom-intel.slangp b/crt/crt-royale-fake-bloom-intel.slangp new file mode 100644 index 0000000..468fe6a --- /dev/null +++ b/crt/crt-royale-fake-bloom-intel.slangp @@ -0,0 +1,134 @@ +# IMPORTANT: +# Shader passes need to know details about the image in the mask_texture LUT +# files, so set the following constants in user-cgp-constants.h accordingly: +# 1.) mask_triads_per_tile = (number of horizontal triads in mask texture LUT's) +# 2.) mask_texture_small_size = (texture size of mask*texture_small LUT's) +# 3.) mask_texture_large_size = (texture size of mask*texture_large LUT's) +# 4.) mask_grille_avg_color = (avg. brightness of mask_grille_texture* LUT's, in [0, 1]) +# 5.) mask_slot_avg_color = (avg. brightness of mask_slot_texture* LUT's, in [0, 1]) +# 6.) mask_shadow_avg_color = (avg. brightness of mask_shadow_texture* LUT's, in [0, 1]) +# Shader passes also need to know certain scales set in this .slangp, but their +# compilation model doesn't currently allow the .slangp file to tell them. Make +# sure to set the following constants in user-cgp-constants.h accordingly too: +# 1.) bloom_approx_scale_x_for_fake = scale_x2 +# 2.) mask_resize_viewport_scale = float2(scale_x6, scale_y5) +# Finally, shader passes need to know the value of geom_max_aspect_ratio used to +# calculate scale_y5 (among other values): +# 1.) geom_max_aspect_ratio = (geom_max_aspect_ratio used to calculate scale_y5) + +shaders = "7" + +# Set an identifier, filename, and sampling traits for the phosphor mask texture. +# Load an aperture grille, slot mask, and an EDP shadow mask, and load a small +# non-mipmapped version and a large mipmapped version. +# TODO: Test masks in other directories. 
+textures = "mask_grille_texture_small;mask_grille_texture_large;mask_slot_texture_small;mask_slot_texture_large;mask_shadow_texture_small;mask_shadow_texture_large" +mask_grille_texture_small = "shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png" +mask_grille_texture_large = "shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5Spacing.png" +mask_slot_texture_small = "shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png" +mask_slot_texture_large = "shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png" +mask_shadow_texture_small = "shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png" +mask_shadow_texture_large = "shaders/crt-royale/TileableLinearShadowMaskEDP.png" +mask_grille_texture_small_wrap_mode = "repeat" +mask_grille_texture_large_wrap_mode = "repeat" +mask_slot_texture_small_wrap_mode = "repeat" +mask_slot_texture_large_wrap_mode = "repeat" +mask_shadow_texture_small_wrap_mode = "repeat" +mask_shadow_texture_large_wrap_mode = "repeat" +mask_grille_texture_small_linear = "true" +mask_grille_texture_large_linear = "true" +mask_slot_texture_small_linear = "true" +mask_slot_texture_large_linear = "true" +mask_shadow_texture_small_linear = "true" +mask_shadow_texture_large_linear = "true" +mask_grille_texture_small_mipmap = "false" # Mipmapping causes artifacts with manually resized masks without tex2Dlod +mask_grille_texture_large_mipmap = "true" # Essential for hardware-resized masks +mask_slot_texture_small_mipmap = "false" # Mipmapping causes artifacts with manually resized masks without tex2Dlod +mask_slot_texture_large_mipmap = "true" # Essential for hardware-resized masks +mask_shadow_texture_small_mipmap = "false" # Mipmapping causes artifacts with manually resized masks without tex2Dlod +mask_shadow_texture_large_mipmap = "true" # Essential for hardware-resized masks + + +# Pass0: Linearize the input based on CRT gamma 
and bob interlaced fields. +# (Bobbing ensures we can immediately blur without getting artifacts.) +shader0 = "shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang" +alias0 = "ORIG_LINEARIZED" +filter_linear0 = "false" +scale_type0 = "source" +scale0 = "1.0" +srgb_framebuffer0 = "true" + +# Pass1: Resample interlaced (and misconverged) scanlines vertically. +# Separating vertical/horizontal scanline sampling is faster: It lets us +# consider more scanlines while calculating weights for fewer pixels, and +# it reduces our samples from vertical*horizontal to vertical+horizontal. +# This has to come right after ORIG_LINEARIZED, because there's no +# "original_source" scale_type we can use later. +shader1 = "shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang" +alias1 = "VERTICAL_SCANLINES" +filter_linear1 = "true" +scale_type_x1 = "source" +scale_x1 = "1.0" +scale_type_y1 = "viewport" +scale_y1 = "1.0" +#float_framebuffer1 = "true" +srgb_framebuffer1 = "true" + +# Pass2: Do a small resize blur of ORIG_LINEARIZED at an absolute size, and +# account for convergence offsets. We want to blur a predictable portion of the +# screen to match the phosphor bloom, and absolute scale works best for +# reliable results with a fixed-size bloom. Picking a scale is tricky: +# a.) 400x300 is a good compromise for the "fake-bloom" version: It's low enough +# to blur high-res/interlaced sources but high enough that resampling +# doesn't smear low-res sources too much. +# b.) 320x240 works well for the "real bloom" version: It's 1-1.5% faster, and +# the only noticeable visual difference is a larger halation spread (which +# may be a good thing for people who like to crank it up). +# Note the 4:3 aspect ratio assumes the input has cropped geom_overscan (so it's +# *intended* for an ~4:3 aspect ratio). 
+shader2 = "shaders/crt-royale/src/crt-royale-bloom-approx-fake-bloom-intel.slang" +alias2 = "BLOOM_APPROX" +filter_linear2 = "true" +scale_type2 = "absolute" +scale_x2 = "400" +scale_y2 = "300" +srgb_framebuffer2 = "true" + +# Pass3: Vertically blur the input for halation and refractive diffusion. +# Base this on BLOOM_APPROX: This blur should be small and fast, and blurring +# a constant portion of the screen is probably physically correct if the +# viewport resolution is proportional to the simulated CRT size. +shader3 = "../blurs/blur9fast-vertical.slang" +filter_linear3 = "true" +scale_type3 = "source" +scale3 = "1.0" +srgb_framebuffer3 = "true" + +# Pass4: Horizontally blur the input for halation and refractive diffusion. +# Note: Using a one-pass 9x9 blur is about 1% slower. +shader4 = "../blurs/blur9fast-horizontal.slang" +alias4 = "HALATION_BLUR" +filter_linear4 = "true" +scale_type4 = "source" +scale4 = "1.0" +srgb_framebuffer4 = "true" + +# Pass5: Resample (misconverged) scanlines horizontally, apply halation, and +# apply the phosphor mask, then fake a phosphor bloom, all in one pass. +shader5 = "shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask-fake-bloom-intel.slang" +alias5 = "MASKED_SCANLINES" +filter_linear5 = "true" # This could just as easily be nearest neighbor. 
+scale_type5 = "viewport" +scale5 = "1.0" +#float_framebuffer5 = "true" +srgb_framebuffer5 = "true" + +# Pass 6: Compute curvature/AA: +shader6 = "shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-intel.slang" +filter_linear6 = "true" +scale_type6 = "viewport" +mipmap_input6 = "true" +texture_wrap_mode6 = "clamp_to_edge" + + + diff --git a/crt/crt-royale_fallback.slangp b/crt/crt-royale-intel.slangp similarity index 56% rename from crt/crt-royale_fallback.slangp rename to crt/crt-royale-intel.slangp index 931f032..b2c1d46 100644 --- a/crt/crt-royale_fallback.slangp +++ b/crt/crt-royale-intel.slangp @@ -1,22 +1,22 @@ # IMPORTANT: # Shader passes need to know details about the image in the mask_texture LUT -# files, so set the following constants in user-preset-constants.h accordingly: +# files, so set the following constants in user-cgp-constants.h accordingly: # 1.) mask_triads_per_tile = (number of horizontal triads in mask texture LUT's) # 2.) mask_texture_small_size = (texture size of mask*texture_small LUT's) # 3.) mask_texture_large_size = (texture size of mask*texture_large LUT's) # 4.) mask_grille_avg_color = (avg. brightness of mask_grille_texture* LUT's, in [0, 1]) # 5.) mask_slot_avg_color = (avg. brightness of mask_slot_texture* LUT's, in [0, 1]) # 6.) mask_shadow_avg_color = (avg. brightness of mask_shadow_texture* LUT's, in [0, 1]) -# Shader passes also need to know certain scales set in this preset, but their -# compilation model doesn't currently allow the preset file to tell them. Make -# sure to set the following constants in user-preset-constants.h accordingly too: +# Shader passes also need to know certain scales set in this .slangp, but their +# compilation model doesn't currently allow the .slangp file to tell them. Make +# sure to set the following constants in user-cgp-constants.h accordingly too: # 1.) bloom_approx_scale_x = scale_x2 -# 2.) mask_resize_viewport_scale = vec2(scale_x6, scale_y5) +# 2.) 
mask_resize_viewport_scale = float2(scale_x6, scale_y5) # Finally, shader passes need to know the value of geom_max_aspect_ratio used to # calculate scale_y5 (among other values): # 1.) geom_max_aspect_ratio = (geom_max_aspect_ratio used to calculate scale_y5) -shaders = "12" +shaders = "10" # Set an identifier, filename, and sampling traits for the phosphor mask texture. # Load an aperture grille, slot mask, and an EDP shadow mask, and load a small @@ -71,6 +71,7 @@ scale_type_x1 = "source" scale_x1 = "1.0" scale_type_y1 = "viewport" scale_y1 = "1.0" +#float_framebuffer1 = "true" srgb_framebuffer1 = "true" # Pass2: Do a small resize blur of ORIG_LINEARIZED at an absolute size, and @@ -85,7 +86,7 @@ srgb_framebuffer1 = "true" # may be a good thing for people who like to crank it up). # Note the 4:3 aspect ratio assumes the input has cropped geom_overscan (so it's # *intended* for an ~4:3 aspect ratio). -shader2 = "shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang" +shader2 = "shaders/crt-royale/src/crt-royale-bloom-approx-intel.slang" alias2 = "BLOOM_APPROX" filter_linear2 = "true" scale_type2 = "absolute" @@ -112,95 +113,42 @@ scale_type4 = "source" scale4 = "1.0" srgb_framebuffer4 = "true" -# Pass5: Lanczos-resize the phosphor mask vertically. Set the absolute -# scale_x5 == mask_texture_small_size.x (see IMPORTANT above). Larger scales -# will blur, and smaller scales could get nasty. The vertical size must be -# based on the viewport size and calculated carefully to avoid artifacts later. -# First calculate the minimum number of mask tiles we need to draw. 
-# Since curvature is computed after the scanline masking pass: -# num_resized_mask_tiles = 2.0; -# If curvature were computed in the scanline masking pass (it's not): -# max_mask_texel_border = ~3.0 * (1/3.0 + 4.0*sqrt(2.0) + 0.5 + 1.0); -# max_mask_tile_border = max_mask_texel_border/ -# (min_resized_phosphor_triad_size * mask_triads_per_tile); -# num_resized_mask_tiles = max(2.0, 1.0 + max_mask_tile_border * 2.0); -# At typical values (triad_size >= 2.0, mask_triads_per_tile == 8): -# num_resized_mask_tiles = ~3.8 -# Triad sizes are given in horizontal terms, so we need geom_max_aspect_ratio -# to relate them to vertical resolution. The widest we expect is: -# geom_max_aspect_ratio = 4.0/3.0 # Note: Shader passes need to know this! -# The fewer triads we tile across the screen, the larger each triad will be as a -# fraction of the viewport size, and the larger scale_y5 must be to draw a full -# num_resized_mask_tiles. Therefore, we must decide the smallest number of -# triads we'll guarantee can be displayed on screen. We'll set this according -# to 3-pixel triads at 768p resolution (the lowest anyone's likely to use): -# min_allowed_viewport_triads = 768.0*geom_max_aspect_ratio / 3.0 = 341.333333 -# Now calculate the viewport scale that ensures we can draw resized_mask_tiles: -# min_scale_x = resized_mask_tiles * mask_triads_per_tile / -# min_allowed_viewport_triads -# scale_y5 = geom_max_aspect_ratio * min_scale_x -# # Some code might depend on equal scales: -# scale_x6 = scale_y5 -# Given our default geom_max_aspect_ratio and min_allowed_viewport_triads: -# scale_y5 = 4.0/3.0 * 2.0/(341.33333 / 8.0) = 0.0625 -# IMPORTANT: The scales MUST be calculated in this way. If you wish to change -# geom_max_aspect_ratio, update that constant in user-preset-constants.h! 
-shader5 = "shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang" -filter_linear5 = "true" -scale_type_x5 = "absolute" -scale_x5 = "64" -scale_type_y5 = "viewport" -scale_y5 = "0.0625" # Safe for >= 341.333 horizontal triads at viewport size -#srgb_framebuffer5 = "false" # mask_texture is already assumed linear - -# Pass6: Lanczos-resize the phosphor mask horizontally. scale_x6 = scale_y5. -# TODO: Check again if the shaders actually require equal scales. -shader6 = "shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang" -alias6 = "MASK_RESIZE" -filter_linear6 = "false" -scale_type_x6 = "viewport" -scale_x6 = "0.0625" -scale_type_y6 = "source" -scale_y6 = "1.0" -#srgb_framebuffer6 = "false" # mask_texture is already assumed linear - -# Pass7: Resample (misconverged) scanlines horizontally, apply halation, and +# Pass5: Resample (misconverged) scanlines horizontally, apply halation, and # apply the phosphor mask. -shader7 = "shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang" -alias7 = "MASKED_SCANLINES" +shader5 = "shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask-intel.slang" +alias5 = "MASKED_SCANLINES" +filter_linear5 = "true" # This could just as easily be nearest neighbor. +scale_type5 = "viewport" +scale5 = "1.0" +#float_framebuffer5 = "true" +srgb_framebuffer5 = "true" + +# Pass 6: Compute a brightpass. This will require reading the final mask. +shader6 = "shaders/crt-royale/src/crt-royale-brightpass.slang" +alias6 = "BRIGHTPASS" +filter_linear6 = "true" # This could just as easily be nearest neighbor. +scale_type6 = "viewport" +scale6 = "1.0" +srgb_framebuffer6 = "true" + +# Pass 7: Blur the brightpass vertically +shader7 = "shaders/crt-royale/src/crt-royale-bloom-vertical.slang" filter_linear7 = "true" # This could just as easily be nearest neighbor. -scale_type7 = "viewport" +scale_type7 = "source" scale7 = "1.0" srgb_framebuffer7 = "true" -# Pass 8: Compute a brightpass. 
This will require reading the final mask. -shader8 = "shaders/crt-royale/src/crt-royale-brightpass.slang" -alias8 = "BRIGHTPASS" -filter_linear8 = "true" # This could just as easily be nearest neighbor. -scale_type8 = "viewport" +# Pass 8: Blur the brightpass horizontally and combine it with the dimpass: +shader8 = "shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang" +filter_linear8 = "true" +scale_type8 = "source" scale8 = "1.0" srgb_framebuffer8 = "true" -# Pass 9: Blur the brightpass vertically -shader9 = "shaders/crt-royale/src/crt-royale-bloom-vertical.slang" -filter_linear9 = "true" # This could just as easily be nearest neighbor. -scale_type9 = "source" -scale9 = "1.0" -srgb_framebuffer9 = "true" +# Pass 9: Compute curvature/AA: +shader9 = "shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-intel.slang" +filter_linear9 = "true" +scale_type9 = "viewport" +mipmap_input9 = "true" +texture_wrap_mode9 = "clamp_to_edge" -# Pass 10: Blur the brightpass horizontally and combine it with the dimpass: -shader10 = "shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang" -filter_linear10 = "true" -scale_type10 = "source" -scale10 = "1.0" -srgb_framebuffer10 = "true" - -# Pass 11: Compute curvature/AA: -shader11 = "shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang" -filter_linear11 = "true" -scale_type11 = "viewport" -mipmap_input11 = "true" -texture_wrap_mode11 = "clamp_to_edge" - -parameters = "beam_num_scanlines" -beam_num_scanlines = 3.0 \ No newline at end of file diff --git a/crt/shaders/crt-royale/LICENSE.TXT b/crt/shaders/crt-royale/LICENSE.TXT new file mode 100644 index 0000000..d8cf7d4 --- /dev/null +++ b/crt/shaders/crt-royale/LICENSE.TXT @@ -0,0 +1,280 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license 
document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS diff --git a/crt/shaders/crt-royale/README.TXT b/crt/shaders/crt-royale/README.TXT new file mode 100644 index 0000000..16636b8 --- /dev/null +++ b/crt/shaders/crt-royale/README.TXT @@ -0,0 +1,493 @@ +//////////////////////////////////////////////////////////////////////////////// +//// crt-royale, by TroggleMonkey //// +//// Last Updated: August 16, 2014 //// +//////////////////////////////////////////////////////////////////////////////// + +REQUIREMENTS: +The earliest official Retroarch version fully supporting crt-royale is 1.0.0.3 +(currently unreleased). Earlier versions lack shader parameters and proper +mipmapping and sRGB support, but the shader may still run at reduced quality. + +The earliest development version fully supporting this shader is: + commit ba40be909913c9ccc34dab5d452fba4fe61af9d0 + Author: Themaister + Date: Thu Jun 5 17:41:10 2014 +0200 +A few earlier revisions support the required features, but they may be buggier. + + +BASICS: +crt-royale is a highly customizable CRT shader for Retroarch and other programs +supporting the libretro Cg shader standard. It uses a number of nonstandardized +extensions like sRGB FBO's, mipmapping, and runtime shader parameters, but +hopefully it will run without much of a fuss on new implementations of the +standard as well. + +There are a huge number of parameters. Among the things you can customize: +* Phosphor mask type: An aperture grille, slot mask, and shadow mask are each + included, although the latter won't be seeing much usage until 1440p displays + and better become more common (4k UHD and 8k UHD are increasingly optimal). +* Phosphor mask dot pitch +* Phosphor mask resampling method: Choose between Lanczos sinc resizing, + mipmapped hardware resizing, and no resizing of the input LUT. 
+* Phosphor bloom softness and type (real or fake ;)) +* Gaussian and generalized Gaussian scanline beam properties/distribution, + including convergence offsets +* Screen geometry, including curvature (spherical, alternative spherical, or + cylindrical like Trinitrons), tilt, and borders +* Antialiasing level, resampling filter, and sharpness parameters for gracefully + combining screen curvature with high-frequency phosphor details, including + optionally resampling based on RGB subpixel positions. +* Halation (electrons bouncing under the glass and lighting random phosphors) +* Refractive diffusion (light spreading from the imperfect CRT glass face) +* Interlacing options +* etc. + +There are two major ways to customize the shader: +* Runtime shader parameters allow convenient experimentation with real-time + feedback, but they are much slower, because they prevent static evaluation of + a lot of math. Disabling them drastically speeds up the shader. +* If runtime shader parameters are disabled (partially or totally), those same + settings can be freely altered in the text of the user-settings.h file. There + are also a number of other static-only settings, including the #define macros + which indicate where and when to allow runtime shader parameters. To disable + them entirely, comment out the "#define RUNTIME_SHADER_PARAMS_ENABLE" line by + putting a double-slash ("//") at the beginning...your FPS will skyrocket. + +You may also note that there are two major versions of the shader preset: +* crt-royale.cgp is the "full" version of the shader, which blooms the light + from the brighter phosphors to maintain brightness and avoid clipping. +* crt-royale-fake-bloom.cgp is the "cheater's" version of the shader, which + only fakes the bloom based on carefully blending in a [potentially blurred] + version of the original input. This version is MUCH faster, and you have to + strain to see the difference, so people with slower GPU's will prefer it. 
+ +There's a lot to play around with, and I encourage everyone using this shader to +read through the user-settings.h file to learn about the parameters. Before +loading the shader, be sure to read the next section, entitled... + + +//////////////////////////////////////////////////////////////////////////////// +//// FREQUENTLY EXPECTED QUESTIONS: //// +//////////////////////////////////////////////////////////////////////////////// + +1.) WHY IS THE SHADER CRASHING WHEN I LOAD IT?!? +Do you get C6001 or C6002 errors with integrated graphics, like Intel HD 4000? +If so, please try one of the following .cgp presets: +* crt-royale-intel.cgp +* crt-royale-fake-bloom-intel.cgp +These load .cg wrappers that #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE +(also available in user-settings.h) before loading the main .cg shader files. + +Integrated graphics compatibility mode will disable these three features, which +currently require more registers or instructions than Intel GPU's allow: +* PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. + (This may be reenabled in a later release.) +* RUNTIME_GEOMETRY_MODE: You must change the screen geometry/curvature using + the geom_mode_static setting in user-settings.h. +* The high-quality 4x4 Gaussian resize for the bloom approximation + +Using Intel-specific .cgp files is equivalent to #defining +INTEGRATED_GRAPHICS_COMPATIBILITY_MODE in your user-settings.h. Out of the box, +user-settings.h is configured for maximum configurability and compatibility with +dedicated nVidia and AMD/ATI GPU's. Compatibility mode is disabled by default +to avoid silently degrading quality for AMD/ATI and nVidia users, so Intel- +specific .cgp's are a convenient way for Intel users to play with the shader +without editing text files. + +I've tested this solution on Intel HD 4000 graphics, and it should work for that +GPU at least, but please let me know if you're still having problems! 
+ +-------------------------------------------------------------------------------- + +2.) WHY IS EVERYTHING SO SLOW?!?: +Out of the box, this will be a problem for all but monster GPU's. The default +user-settings.h file disables any features and optimizations which might cause +compilation failure on AMD/ATI GPU's. Despite the name of the options, this is +not a problem with your card or drivers; it's a shortcoming in the Cg shader +compiler's nVidia-centric profile setups. + +Uncommenting the following #define macros at the top of user-settings.h will +help performance a good deal on compatible nVidia cards: + #define DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + #define DRIVERS_ALLOW_TEX2DLOD + #define DRIVERS_ALLOW_TEX2DBIAS +A few of these warrant some elaboration. First, derivatives: + +Derivatives allow the shader to cheaply calculate a tangent-space matrix for +correct antialiasing when curvature or overscan are used. Without them, there +are two options: + a.) Cheat, and there will be artifacts with strong cylindrical curvature + b.) Compute the correct tangent-space matrix analytically. This is used + by default, and it's controlled by this option near the bottom: + geom_force_correct_tangent_matrix = true + +Dynamic branches: +Dynamic branches allow the shader to avoid performing computations that it +doesn't need (but might have, given different runtime options). Without them, +the shader has to either let the GPU evaluate every possible codepath and select +a result, or make a "best guess" ahead of time. The full phosphor bloom suffers +most from not having dynamic branches, because the shader doesn't know how big +of a blur to use until it knows your phosphor mask dot pitch...which you set at +runtime if shader parameters are enabled. 
+ +If RUNTIME_PHOSPHOR_BLOOM_SIGMA is commented out (faster), this won't matter: +The shader will just select the blur size and standard deviation suitable for +the mask_triad_size_desired_static setting in user-settings.h. It will be +fast, but larger triads won't blur enough, and smaller triads will blur more +than they need to. However, if RUNTIME_PHOSPHOR_BLOOM_SIGMA is enabled, the +shader will calculate an optimal standard deviation and *try* to use the right +blur size for it...but using an "if standard deviation is such and such" +condition would be prohibitively slow without dynamic branches. Instead, the +shader uses the largest and slowest blur the user lets it use (to cover the +widest range of triad sizes and standard deviations), according to these macros: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS +The more you have uncommented, the larger the triads you can blur, but the +slower runtime sigmas will be if your GPU can't use dynamic branches. By +default, triads up to 6 pixels wide will be bloomed perfectly, and a little +beyond that (8 should be fine), but going too far beyond that will create +blocking artifacts in the blur due to an insufficient support size. + +tex2Dlod: +The tex2Dlod function allows the shader to disable anisotropic filtering, which +can get confused when we're manually tiling the texture coordinates for a small +resized phosphor mask tile (it creates nasty seam artifacts). There are several +ways the shader can deal with this: The cheapest is to use tex2Dlod to tile the +output of MASK_RESIZE across the screen...and the slower alternatives either +require derivatives or force the shader to draw 2 tiles to MASK_RESIZE in each +direction, thereby reducing your maximum allowed dot pitch by half. 
+ +tex2Dbias: +According to nVidia's Cg language standard library page, tex2Dbias requires the +fp30 profile, which doesn't work on ATI/AMD cards...but you might actually have +mixed results. This can be used as a substitute for tex2Dlod at times, so it's +worth trying even on ATI. + +-------------------------------------------------------------------------------- + +3.) WHY IS EVERYTHING STILL SO SLOW?!?: +For maximum quality and configurability out of the box, almost all shader +parameters are enabled by default (except for the disproportionately expensive +runtime subpixel offsets). Some are more expensive than others. Commenting +the following macro disables all shader parameters: + #define RUNTIME_SHADER_PARAMS_ENABLE +Commenting these macros disables selective shader parameters: + #define RUNTIME_PHOSPHOR_BLOOM_SIGMA + #define RUNTIME_ANTIALIAS_WEIGHTS + //#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define RUNTIME_GEOMETRY_TILT + #define RUNTIME_GEOMETRY_MODE + #define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +Note that all shader parameters will still show up in your GUI list, and the +disabled ones simply won't work. + +Finally, there are a lot of other options enabled by default that carry serious +performance penalties. For instance, the default antialiasing filter is a +cubic filter, because it's the most configurable, but it's also quite slow if +RUNTIME_ANTIALIAS_WEIGHTS is #defined. A lot of the static true/false options +have a significant influence, and the shader is faster if the red subpixel +offset (from which the blue one is calculated as well) is zero...even if it's +a static value, because RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS is commented out. +To avoid any confusion, I should also clarify now that subpixel offsets are +separate from scanline beam convergence offsets. 
+ +To quickly see how much performance you can get from other settings, you can +temporarily replace your user-settings.h with one of: +a.) crt-royale-settings-files/user-settings-fast-static-ati.h +b.) crt-royale-settings-files/user-settings-fast-static-nvidia.h +Then load crt-royale-fake-bloom.cgp. It should be far more playable. + +-------------------------------------------------------------------------------- + +4.) WHY WON'T MY SHADER BLOOM MY PHOSPHORS ENOUGH? +First, see the discussion about dynamic branching above, in 2. +If you don't have dynamic branches, you can either uncomment the lines that +let the shader pessimistically use larger blurs than it's guaranteed to need +(which is slow), or...you can just use crt-royale-fake-bloom.cgp, which +doesn't have this problem. :) + +-------------------------------------------------------------------------------- + +5.) WHY CAN'T I MAKE MY PHOSPHORS ANY BIGGER? +By default, the phosphor mask is Lanczos-resized in earlier passes to your +specified dot pitch (mask_sample_mode = 0). This gives a much sharper result +than mipmapped hardware sampling (mask_sample_mode = 1), but it can be much +slower without taking proper care: If the input mask tile (containing 8 +phosphor triads by default) is large, like 512x512, and you try to resize it +to 24x24 for 3x3 pixel triads, the resizer has to take 128 samples in each +pass/direction (the max allowed) for a 3-lobe Lanczos filter. This can be +very slow, so I made the output of MASK_RESIZE very small by default: Just +1/16th of the viewport size in each direction. The exact limit scales with +your viewport size, and it *should* be reasonable, but the restrictions can +get tighter if we can't use tex2Dlod and have to fit two whole tiles (16 +phosphor triads with default 8-triad tiles) into the MASK_RESIZE pass for +compatibility with anisotropic filtering (long story). + +If you want bigger phosphor triads, you have two options: +a.) 
Set mask_sample_mode to 1 in your shader params (if enabled) or set + mask_sample_mode_static to 1 in your user-settings.h file. This will use + hardware sampling, which is softer but has no limitations. +b.) To increase the limit with manual mask-resizing (best quality), you need to + do five things: + 1.) Go into your .cgp file and find the MASK_RESIZE pass (the horizontal + mask resizing pass) and the one before it (the vertical mask resizing + pass). Find the viewport-relative scales, which should say 0.0625, and + change them to 0.125 or even 0.25. + 2.) Still in your .cgp file, also make sure your mask_*_texture_small + filenames point to LUT textures that are larger than your final desired + onscreen size (upsizing is not currently permitted). + 3.) Go into user-cgp-constants.h and change mask_resize_viewport_scale from + 0.0625 to the new value you changed it to in step 1. This is necessary, + because we can't pass that value from the .cgp file to the shader, and + the shader can't compute the viewport size (necessary) without it. + 4.) Still in user-cgp-constants.h, update mask_texture_small_size and + mask_triads_per_tile appropriately if you changed your LUT texture in + step 2. + 5.) Reload your .cgp file. +I REALLY wish there was an easier way to do that, but my hands are tied until +.cgp files are allowed to pass more information to .cg shaders (which would +require major updates to the cg2glsl script). + +-------------------------------------------------------------------------------- + +6.) WHY CAN'T I MAKE MY PHOSPHORS ANY SMALLER THAN 2 PIXELS PER TRIAD? +This is controlled by mask_min_allowed_triad_size in your user-settings.h file. +Set it to 1.0 instead of 2.0 (anything lower than 1 is pointless), and you're +set. It defaults to 2.0 to make mask resizing twice as fast when dynamic +branches aren't allowed. 
Some people may want to be able to fade the phosphors +away entirely to get a more PVM-like scanlined image though, so change it to 1.0 +for that (or get a higher-resolution display ;)). + +Note: This setting should be obsolete soon. I have some ideas for more +sophisticated mask resampling that I just don't have a spare few hours to +implement yet. + +-------------------------------------------------------------------------------- + +7.) I AM NOT RUNNING INTEGRATED GRAPHICS. WHY AM I GETTING ERRORS? +First recheck the top of your user-settings.h to make sure incompatible driver +options are commented out (disabled). If they're all disabled and you're still +having problems, you've probably found a bug. There are bound to be a number of +them with certain setting combinations, and there might even be a few individual +settings I broke more recently than I tested them. My contact information is up +top, so let me know! + +-------------------------------------------------------------------------------- + +8.) WHY AM I GETTING BANDING IN DARK COLORS? OR, WHY WON'T MIPMAPPING WORK? +crt-royale uses features like sRGB and mipmapping, which are not available in +the latest Retroarch release (1.0.0.2) at the time of this writing. + +You may get banding in dark colors if your platform or Retroarch version doesn't +support sRGB FBO's, and mask_sample_mode 1 will look awful without mipmapping. +I expect most platforms capable of running this shader at full speed will +support sRGB FBO's, but if yours doesn't, please let me know, and I'll include +a note about it. + +Alternately, setting levels_autodim_temp too low will cause precision loss and +banding. + +-------------------------------------------------------------------------------- + +9.) HOW DO I SET GEOMETRY/CURVATURE/ETC.? 
+If RUNTIME_SHADER_PARAMS_ENABLE and RUNTIME_GEOMETRY_MODE are both #defined (not +commented out) in user-settings.h, you can find these options in your shader +parameters (in Retroarch's RGUI for instance) under e.g. geom_mode. Otherwise, +you can set the corresponding e.g. geom_mode_static options in user-settings.h. + +-------------------------------------------------------------------------------- + +10.) WHY DON'T MY SHADER PARAMETERS STICK? +This is a bit confusing, at least in the version of Retroarch I'm using. +In the Shader Options menu, Parameters (Current) controls what's on your screen +right now, whereas Parameters (RGUI) seems to control what gets saved to a +shader preset (in your base shaders directory) with Save As Shader Preset. + +-------------------------------------------------------------------------------- + +11.) WHY DID YOU SLOW THE SHADER DOWN WITH ALL OF THESE FEATURES I DON'T WANT? + WHY DIDN'T YOU MAKE THE DEFAULTS MORE TO MY LIKING? + +The default settings tend to best match flat ~13" slot mask TV's with sharp +scanlines. Real CRT's however vary a lot in their characteristics (and many are +softer in more ways than one), so it's impossible to make the default settings +look like everyone's favorite CRT. Moreover, it's impossible to decide which +of the slower features and options are superfluous: + +Some people love curvature, and some people hate it. Some people love +scanlines, and some people hate them. Some people love phosphors, and some +people hate them. Some people love interlacing support, and some people hate +it. Some people love sharpness, and some people hate it. Some people love +convergence error, and some people hate it. The one thing you hate the most is +probably someone else's most critical feature. This is why there are so many +options, why the shader is so complicated, and why it's impossible to please +everyone out of the box...unfortunately. 
+ +That said, if you spend some time tweaking the settings, you're bound to get a +picture you like. Once you've made up your mind, you can save the settings to +a user-settings.h file and disable shader parameters and other slow options to +get the kind of performance you want. + +-------------------------------------------------------------------------------- + +12.) WHY DIDN'T YOU INCLUDE A SHADER PRESET WITH NTSC SUPPORT? WHY DIDN'T YOU + INCLUDE MORE CANNED PRESETS WITH DIFFERENT OPTIONS? WHY CAN'T I SELECT + FROM ONE OF SEVERAL USER SETTINGS FILES WITHOUT MANUAL FILE RENAMING? + +I do plan on adding a version that uses the NTSC shader for the first two +passes, but it will take a bit of work, because there are several NTSC shader +versions as it is. It's easy enough to combine the HALATION_BLUR passes into a +one-pass blur from blurs/blur9x9fast.cg, but I'm not sure yet just how much +modification the NTSC shader passes themselves might need for best results. + +I originally wanted NTSC support to be included out-of-the-box, but I'd also +like to release the shader ASAP, so it'll have to wait. + +As for other canned presets, that's a little more complicated: I DO intend on +creating more canned presets, but the combinatorial explosion of major codepath +options in this shader is too overwhelming to be as exhaustive as I'd like. +When I get the time, I'll add what I can to make this more user-friendly. +In the meantime, I'll start adding a few different default versions of the +user settings file and put them in a subdirectory for people to manually +place in the main directory and rename to "user-settings.h." + +However, the libretro Cg shader specification (and the Cg to GLSL compiler) does +not currently allow .cgp files to pass any static settings to the source files. 
+This presents a huge problem, because it means that in order to create a new +preset with different options, I also have to create duplicate files for EVERY +single .cg pass for every permutation, not just the .cgp. I plan on creating +a number of skeleton wrapper .cg files in a subdirectory (which set a few +options and then include the main .cg file for the pass), but it'll be a while +yet. In the meantime, I'd rather let people play with what's already done than +keep it hidden on my hard drive. + +-------------------------------------------------------------------------------- + +13.) WHY DO SO MANY VALUES IN USER_SETTINGS.H HAVE A _STATIC SUFFIX? + +The "_static" suffix is there to prevent naming conflicts with runtime shader +parameters: The shader usually uses a version without the suffix, which is +assigned either the value of the "_static" version or the runtime shader +parameter version. If a value in user-settings.h doesn't have a "_static" +suffix, it's usually because it's a static compile-time option only, with no +corresponding runtime version. Basically, you can ignore the suffix. :) + +-------------------------------------------------------------------------------- + +14.) ARE THERE ANY BROKEN SETTINGS I SHOULD BE AWARE OF? + WHAT IF I WANT TO CHANGE SETTINGS IN THE .CGP FILE? + +As far as I know, all of the options in user-settings.h and the runtime shader +parameters are pretty robust, with a few caveats: +* As noted above, there are some tradeoffs between runtime and compile-time + options. If runtime blur sigmas are disabled for instance, the phosphor + bloom (and to a lesser extent, the fake bloom) may not blur the right amount. +* If you set your aspect ratio incorrectly, and mask_specify_num_triads == 1.0 + (i.e. true, as opposed to 0.0, which is false), the shader will misinterpret + the number of triads you want by the same proportion. +* Disabled shader parameters will do nothing, including either: + a.) mask_triad_size_desired + b.) 
mask_num_triads_desired, + depending on the value of mask_specify_num_triads. + +There is a broken and unimplemented option in derived-settings-and-constants.h, +but users shouldn't need to mess around in there anyway. (It's related to the +more efficient phosphor mask resampling I want to implement.) + +However, the .cgp files are another story: They are pretty brittle, especially +when it comes to their interaction with user-cgp-constants.h. Be aware that the +shader passes rely on scale types and sizes in your .cgp file being exactly what +they expect. Do not change any scale types from the defaults, or you'll get +artifacts under certain conditions. You can change the BLOOM_APPROX and +MASK_RESIZE scale values (not scale types), but you must update the associated +constant in user-cgp-constants.h to let the .cg shader files know about it, and +the implications may reach farther than you expect. Similarly, if you plan on +changing an LUT texture, make sure you update the associated constants in +user-cgp-constants.h. In short, if you plan on changing anything in a .cgp +file, you'll want to read it thoroughly first, especially the "IMPORTANT" +section at the top. + +-------------------------------------------------------------------------------- + +15.) WHAT ARE THE MOST COMMON DOT PITCHES FOR CRT TELEVISIONS? + WHAT KIND OF RESOLUTION WOULD I NEED FOR A REAL SHADOW MASK? + +The most demanding CRT we're ever likely to emulate is a Sony PVM-20M4U: + Width: 450mm + Aperture Grille Pitch: 0.31mm + Triads in 4:3 frame: 1451, assuming little to no overscan +For 3-pixel triads, we would need about 6k UHD resolution. A BVM-20F1U has +similar requirements. + +However, common slot masks are far more similar to the kind of image this shader +will produce at 900p, 1080p, 1200p, and 1440p: +1.) A typical 13" diagonal CRT might have a 0.60mm slot pitch, for a total of + 440.26666666666665 or so phosphor triads horizontally. +2.) 
A typical 19" diagonal CRT might have a 0.75mm slot pitch, for a total of + 514.7733333333333 or so phosphor triads horizontally. +3.) According to http://repairfaq.ece.drexel.edu/REPAIR/F_crtfaq.html, a + typical 25" diagonal CRT might have a 0.9mm slot pitch, for a total of + 564.4444444444445 or so phosphor triads horizontally. +4.) A 21" Samsung SMC210N CCTV monitor (450 TV lines) has a 0.7mm stripe + pitch, for a total of 609.6 or so phosphor triads horizontally. + +The included EDP shadow mask starts looking very good with ~6-pixel triads, so +it may take nearly 4k resolution to make it a particularly compelling option. +However, it's possible to make smaller shadow masks on a pixel-by-pixel basis +and tile them at a 1:1 ratio (mask_sample_mode = 2). I may include a mask like +this in a future update. + +-------------------------------------------------------------------------------- + +16.) IS THIS PHOSPHOR BLOOM REALISTIC? + +Probably not: + +Realistically, the "phosphor bloom" blurs bright phosphors significantly more +than your eyes would bloom the brighter phosphors on a real CRT. This extra +blurring however is necessary to distribute enough brightness to nearby pixels +that we can amplify the overall brightness to that of the original source after +applying the phosphor mask. If you're interested, there are more comments on +the subject at the top of the fragment shader in crt-royale-bloom-approx.cg. + +On the subject of the phosphor bloom: I intended to include some exposition +about the math behind the brightpass calculation (and the much more complex +and thorough calculation I originally used to blur the minimal amount necessary, +which turned out to be inferior in practice), but that document isn't release- +ready at the moment. Sorry Hyllian. ;) + +-------------------------------------------------------------------------------- + +17.) SO WHAT DO YOU PLAN ON ADDING IN THE FUTURE? + +I'd like to add these relatively soon: +1.) 
A combined ntsc-crt-royale.cgp and ntsc-crt-royale-fake-bloom.cgp. +2.) More presets, especially if maister or squarepusher find a way to make the +Cg to GLSL compiler process .cgp files (which will allow .cgp's to pass +arbitrary #defines to the .cg shader passes). +3.) More efficient and flexible phosphor mask resampling. Hopefully, this will + make it possible to manually resize the mask on Intel HD Graphics as well. +4.) Make it easier and more convenient to use and experiment with mask_sample_mode + 2 (direct 1:1 tiling of an input texture) by using a separate LUT texture + with its own parameters in user-cgp-constants.h, etc. I haven't done this + yet because it requires yet another texture sample that could hurt other + codepaths, and I'm waiting until I have time to optimize it. +5.) Refine the runtime shader parameters: Some of them are probably too fine- + grained and slow to change. + +Maybe's: +1.) I've had trouble getting LUT's from subdirectories to work consistently + across platforms, but I'd like to get around that and include more mask + textures I've made. +2.) If you're using spherical curvature with a small radius, the edges of the + sphere are blocky due to the pixel discards being done in 2x2 fragment + blocks. I'd like to fix this if it can be done without a performance hit. +3.) I have some ideas for procedural mask generation with a fast, closed-form + low-pass filter, but I don't know if I'll ever get around to it. + diff --git a/crt/shaders/crt-royale/THANKS.TXT b/crt/shaders/crt-royale/THANKS.TXT new file mode 100644 index 0000000..4966f0e --- /dev/null +++ b/crt/shaders/crt-royale/THANKS.TXT @@ -0,0 +1,43 @@ +Thank you squarepusher and maister, for hammering out the shader framework that +made this possible and being so receptive to my feedback for Retroarch and the +libretro Cg shader spec. 
Thank you especially maister, for designing the sRGB +support with me and implementing all the code for both sRGB FBO's and mipmapped +FBO's in less time than it took me to add mipmapped LUT's alone! + +I want to thank xythen and DOLLS for inspiring me with their early efforts: + http://board.byuu.org/viewtopic.php?f=10&t=147 + http://board.byuu.org/viewtopic.php?p=3820#p3834 +I've never spoken with them, but I never would have thought to make this shader +if xythen hadn't gotten the ball rolling, or if DOLLS hadn't made his point +about just how far CRT emulation could go with his phosphor mask prototypes, +convergence error images, and barrel distortion code. + +I also want to thank hunterk for his excellent blog, especially this post: + http://filthypants.blogspot.com/2011/05/ + more-emulator-pixel-shaders-crt-updated.html +Along with caligari's work, his PhosphorLUT shader provoked me to experiment +with game-style bloom as a way to reconcile shadow masks with full brightness. +Along with Pulp Fiction, he also gets credit for helping me name this shader. :D + +Thank you Hyllian for your enthusiasm: It kept me focused on actually releasing +this shader instead of refining it in perpetuity! + +Finally, I want to thank cgwg for everything he has done for CRT emulation: +He was the first to consider the effects of halation, and (in addition to +caligari?) he did the most research on the Gaussian properties of scanline +electron beams. His forum posts and links to academic research were very +helpful, and so were the few PM's we exchanged many months ago: I originally +meant to wet my feet by extending his shader with cylindrical curvature before +writing my own. I never managed to understand his curvature code (due to all of +the different algebraic/trigonometric stages being rolled into one), and I gave +up and started from scratch, but talking with him helped me piece together how +his spherical uv<=>xyz mapping worked mathematically. 
My own is subtly +different, but not on purpose. ;) A lot of the user parameters for geometry +were inspired by his own (including Euler angle tilt and a "view distance" for +controlling the field of view with a simplified near-plane). Last but not +least, my border dimming code was based more directly off of his: I did what I +could to write a fresh implementation of his algorithm with new features, but +the line between code and algorithm is pretty thin in that function, and it's +a testament to him coming up with such an elegant solution. + +TroggleMonkey diff --git a/crt/shaders/crt-royale/src/bind-shader-params.h b/crt/shaders/crt-royale/src/bind-shader-params.h index 5a1792e..08555da 100644 --- a/crt/shaders/crt-royale/src/bind-shader-params.h +++ b/crt/shaders/crt-royale/src/bind-shader-params.h @@ -27,7 +27,7 @@ // Override some parameters for gamma-management.h and tex2Dantialias.h: #define OVERRIDE_DEVICE_GAMMA -const float gba_gamma = 3.5; // Irrelevant but necessary to define. +static const float gba_gamma = 3.5; // Irrelevant but necessary to define. #define ANTIALIAS_OVERRIDE_BASICS #define ANTIALIAS_OVERRIDE_PARAMETERS @@ -38,8 +38,9 @@ const float gba_gamma = 3.5; // Irrelevant but necessary to define. #endif // Bind option names to shader parameter uniforms or static constants. +#ifdef HARDCODE_SETTINGS #ifdef PARAMETER_UNIFORM -/* uniform float crt_gamma; + uniform float crt_gamma; uniform float lcd_gamma; uniform float levels_contrast; uniform float halation_weight; @@ -57,8 +58,8 @@ const float gba_gamma = 3.5; // Irrelevant but necessary to define. 
uniform float beam_horiz_filter; uniform float beam_horiz_linear_rgb_weight; #else - const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0); - const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0); + static const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0); + static const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0); #endif uniform float convergence_offset_x_r; uniform float convergence_offset_x_g; @@ -69,7 +70,7 @@ const float gba_gamma = 3.5; // Irrelevant but necessary to define. #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT uniform float mask_type; #else - const float mask_type = clamp(mask_type_static, 0.0, 2.0); + static const float mask_type = clamp(mask_type_static, 0.0, 2.0); #endif uniform float mask_sample_mode_desired; uniform float mask_specify_num_triads; @@ -81,8 +82,8 @@ const float gba_gamma = 3.5; // Irrelevant but necessary to define. uniform float aa_cubic_c; uniform float aa_gauss_sigma; #else - const float aa_cubic_c = aa_cubic_c_static; // Clamp to [0, 4]? - const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static); // Clamp to [FIXZERO(0), 1]? + static const float aa_cubic_c = aa_cubic_c_static; // Clamp to [0, 4]? + static const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static); // Clamp to [FIXZERO(0), 1]? #endif uniform float geom_mode_runtime; uniform float geom_radius; @@ -97,113 +98,114 @@ const float gba_gamma = 3.5; // Irrelevant but necessary to define. 
uniform float border_darkness; uniform float border_compress; uniform float interlace_bff; - uniform float interlace_1080i; */ + uniform float interlace_1080i; #else // Use constants from user-settings.h, and limit ranges appropriately: - const float crt_gamma = max(0.0, crt_gamma_static); - const float lcd_gamma = max(0.0, lcd_gamma_static); - const float levels_contrast = clamp(levels_contrast_static, 0.0, 4.0); - const float halation_weight = clamp(halation_weight_static, 0.0, 1.0); - const float diffusion_weight = clamp(diffusion_weight_static, 0.0, 1.0); - const float bloom_underestimate_levels = max(FIX_ZERO(0.0), bloom_underestimate_levels_static); - const float bloom_excess = clamp(bloom_excess_static, 0.0, 1.0); - const float beam_min_sigma = max(FIX_ZERO(0.0), beam_min_sigma_static); - const float beam_max_sigma = max(beam_min_sigma, beam_max_sigma_static); - const float beam_spot_power = max(beam_spot_power_static, 0.0); - const float beam_min_shape = max(2.0, beam_min_shape_static); - const float beam_max_shape = max(beam_min_shape, beam_max_shape_static); - const float beam_shape_power = max(0.0, beam_shape_power_static); -// const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0); - const float beam_horiz_sigma = max(FIX_ZERO(0.0), beam_horiz_sigma_static); - const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0); - // Unpack vector elements to match scalar uniforms: - const float convergence_offset_x_r = clamp(convergence_offsets_r_static.x, -4.0, 4.0); - const float convergence_offset_x_g = clamp(convergence_offsets_g_static.x, -4.0, 4.0); - const float convergence_offset_x_b = clamp(convergence_offsets_b_static.x, -4.0, 4.0); - const float convergence_offset_y_r = clamp(convergence_offsets_r_static.y, -4.0, 4.0); - const float convergence_offset_y_g = clamp(convergence_offsets_g_static.y, -4.0, 4.0); - const float convergence_offset_y_b = clamp(convergence_offsets_b_static.y, -4.0, 4.0); - 
const float mask_type = clamp(mask_type_static, 0.0, 2.0); - const float mask_sample_mode_desired = clamp(mask_sample_mode_static, 0.0, 2.0); - const float mask_specify_num_triads = clamp(mask_specify_num_triads_static, 0.0, 1.0); - // const float mask_triad_size_desired = clamp(mask_triad_size_desired_static, 1.0, 18.0); - const float mask_num_triads_desired = clamp(mask_num_triads_desired_static, 342.0, 1920.0); - const float aa_subpixel_r_offset_x_runtime = clamp(aa_subpixel_r_offset_static.x, -0.5, 0.5); - const float aa_subpixel_r_offset_y_runtime = clamp(aa_subpixel_r_offset_static.y, -0.5, 0.5); - const float aa_cubic_c = aa_cubic_c_static; // Clamp to [0, 4]? - const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static); // Clamp to [FIXZERO(0), 1]? - const float geom_mode_runtime = clamp(geom_mode_static, 0.0, 3.0); - const float geom_radius = max(1.0/(2.0*pi), geom_radius_static); // Clamp to [1/(2*pi), 1024]? - const float geom_view_dist = max(0.5, geom_view_dist_static); // Clamp to [0.5, 1024]? - const float geom_tilt_angle_x = clamp(geom_tilt_angle_static.x, -pi, pi); - const float geom_tilt_angle_y = clamp(geom_tilt_angle_static.y, -pi, pi); - const float geom_aspect_ratio_x = geom_aspect_ratio_static; // Force >= 1? 
- const float geom_aspect_ratio_y = 1.0; - const float geom_overscan_x = max(FIX_ZERO(0.0), geom_overscan_static.x); - const float geom_overscan_y = max(FIX_ZERO(0.0), geom_overscan_static.y); - const float border_size = clamp(border_size_static, 0.0, 0.5); // 0.5 reaches to image center - const float border_darkness = max(0.0, border_darkness_static); - const float border_compress = max(1.0, border_compress_static); // < 1.0 darkens whole image - const float interlace_bff = float(interlace_bff_static); - const float interlace_1080i = float(interlace_1080i_static); + static const float crt_gamma = max(0.0, crt_gamma_static); + static const float lcd_gamma = max(0.0, lcd_gamma_static); + static const float levels_contrast = clamp(levels_contrast_static, 0.0, 4.0); + static const float halation_weight = clamp(halation_weight_static, 0.0, 1.0); + static const float diffusion_weight = clamp(diffusion_weight_static, 0.0, 1.0); + static const float bloom_underestimate_levels = max(FIX_ZERO(0.0), bloom_underestimate_levels_static); + static const float bloom_excess = clamp(bloom_excess_static, 0.0, 1.0); + static const float beam_min_sigma = max(FIX_ZERO(0.0), beam_min_sigma_static); + static const float beam_max_sigma = max(beam_min_sigma, beam_max_sigma_static); + static const float beam_spot_power = max(beam_spot_power_static, 0.0); + static const float beam_min_shape = max(2.0, beam_min_shape_static); + static const float beam_max_shape = max(beam_min_shape, beam_max_shape_static); + static const float beam_shape_power = max(0.0, beam_shape_power_static); + static const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0); + static const float beam_horiz_sigma = max(FIX_ZERO(0.0), beam_horiz_sigma_static); + static const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0); + // Unpack static vector elements to match scalar uniforms: + static const float convergence_offset_x_r = clamp(convergence_offsets_r_static.x, 
-4.0, 4.0); + static const float convergence_offset_x_g = clamp(convergence_offsets_g_static.x, -4.0, 4.0); + static const float convergence_offset_x_b = clamp(convergence_offsets_b_static.x, -4.0, 4.0); + static const float convergence_offset_y_r = clamp(convergence_offsets_r_static.y, -4.0, 4.0); + static const float convergence_offset_y_g = clamp(convergence_offsets_g_static.y, -4.0, 4.0); + static const float convergence_offset_y_b = clamp(convergence_offsets_b_static.y, -4.0, 4.0); + static const float mask_type = clamp(mask_type_static, 0.0, 2.0); + static const float mask_sample_mode_desired = clamp(mask_sample_mode_static, 0.0, 2.0); + static const float mask_specify_num_triads = clamp(mask_specify_num_triads_static, 0.0, 1.0); + static const float mask_triad_size_desired = clamp(mask_triad_size_desired_static, 1.0, 18.0); + static const float mask_num_triads_desired = clamp(mask_num_triads_desired_static, 342.0, 1920.0); + static const float aa_subpixel_r_offset_x_runtime = clamp(aa_subpixel_r_offset_static.x, -0.5, 0.5); + static const float aa_subpixel_r_offset_y_runtime = clamp(aa_subpixel_r_offset_static.y, -0.5, 0.5); + static const float aa_cubic_c = aa_cubic_c_static; // Clamp to [0, 4]? + static const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static); // Clamp to [FIXZERO(0), 1]? + static const float geom_mode_runtime = clamp(geom_mode_static, 0.0, 3.0); + static const float geom_radius = max(1.0/(2.0*pi), geom_radius_static); // Clamp to [1/(2*pi), 1024]? + static const float geom_view_dist = max(0.5, geom_view_dist_static); // Clamp to [0.5, 1024]? + static const float geom_tilt_angle_x = clamp(geom_tilt_angle_static.x, -pi, pi); + static const float geom_tilt_angle_y = clamp(geom_tilt_angle_static.y, -pi, pi); + static const float geom_aspect_ratio_x = geom_aspect_ratio_static; // Force >= 1? 
+ static const float geom_aspect_ratio_y = 1.0; + static const float geom_overscan_x = max(FIX_ZERO(0.0), geom_overscan_static.x); + static const float geom_overscan_y = max(FIX_ZERO(0.0), geom_overscan_static.y); + static const float border_size = clamp(border_size_static, 0.0, 0.5); // 0.5 reaches to image center + static const float border_darkness = max(0.0, border_darkness_static); + static const float border_compress = max(1.0, border_compress_static); // < 1.0 darkens whole image + static const float interlace_bff = float(interlace_bff_static); + static const float interlace_1080i = float(interlace_1080i_static); +#endif #endif // Provide accessors for vector constants that pack scalar uniforms: -vec2 get_aspect_vector(const float geom_aspect_ratio) +inline float2 get_aspect_vector(const float geom_aspect_ratio) { // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent // the absolute scale from affecting the uv-mapping for curvature: const float geom_clamped_aspect_ratio = min(geom_aspect_ratio, geom_max_aspect_ratio); - const vec2 geom_aspect = - normalize(vec2(geom_clamped_aspect_ratio, 1.0)); + const float2 geom_aspect = + normalize(float2(geom_clamped_aspect_ratio, 1.0)); return geom_aspect; } -vec2 get_geom_overscan_vector() +inline float2 get_geom_overscan_vector() { - return vec2(geom_overscan_x, geom_overscan_y); + return float2(geom_overscan_x, geom_overscan_y); } -vec2 get_geom_tilt_angle_vector() +inline float2 get_geom_tilt_angle_vector() { - return vec2(geom_tilt_angle_x, geom_tilt_angle_y); + return float2(geom_tilt_angle_x, geom_tilt_angle_y); } -vec3 get_convergence_offsets_x_vector() +inline float3 get_convergence_offsets_x_vector() { - return vec3(convergence_offset_x_r, convergence_offset_x_g, + return float3(convergence_offset_x_r, convergence_offset_x_g, convergence_offset_x_b); } -vec3 get_convergence_offsets_y_vector() +inline float3 get_convergence_offsets_y_vector() { - return vec3(convergence_offset_y_r, 
convergence_offset_y_g, + return float3(convergence_offset_y_r, convergence_offset_y_g, convergence_offset_y_b); } -vec2 get_convergence_offsets_r_vector() +inline float2 get_convergence_offsets_r_vector() { - return vec2(convergence_offset_x_r, convergence_offset_y_r); + return float2(convergence_offset_x_r, convergence_offset_y_r); } -vec2 get_convergence_offsets_g_vector() +inline float2 get_convergence_offsets_g_vector() { - return vec2(convergence_offset_x_g, convergence_offset_y_g); + return float2(convergence_offset_x_g, convergence_offset_y_g); } -vec2 get_convergence_offsets_b_vector() +inline float2 get_convergence_offsets_b_vector() { - return vec2(convergence_offset_x_b, convergence_offset_y_b); + return float2(convergence_offset_x_b, convergence_offset_y_b); } -vec2 get_aa_subpixel_r_offset() +inline float2 get_aa_subpixel_r_offset() { #ifdef RUNTIME_ANTIALIAS_WEIGHTS #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS // WARNING: THIS IS EXTREMELY EXPENSIVE. - return vec2(aa_subpixel_r_offset_x_runtime, + return float2(aa_subpixel_r_offset_x_runtime, aa_subpixel_r_offset_y_runtime); #else return aa_subpixel_r_offset_static; @@ -214,17 +216,17 @@ vec2 get_aa_subpixel_r_offset() } // Provide accessors settings which still need "cooking:" -float get_mask_amplify() +inline float get_mask_amplify() { - const float mask_grille_amplify = 1.0/mask_grille_avg_color; - const float mask_slot_amplify = 1.0/mask_slot_avg_color; - const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + static const float mask_grille_amplify = 1.0/mask_grille_avg_color; + static const float mask_slot_amplify = 1.0/mask_slot_avg_color; + static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; return mask_type < 0.5 ? mask_grille_amplify : mask_type < 1.5 ? 
mask_slot_amplify : mask_shadow_amplify; } -float get_mask_sample_mode() +inline float get_mask_sample_mode() { #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE diff --git a/crt/shaders/crt-royale/src/bloom-functions.h b/crt/shaders/crt-royale/src/bloom-functions.h index 7e00dee..8ce2d0e 100644 --- a/crt/shaders/crt-royale/src/bloom-functions.h +++ b/crt/shaders/crt-royale/src/bloom-functions.h @@ -37,14 +37,13 @@ /////////////////////////////// BLOOM CONSTANTS ////////////////////////////// // Compute constants with manual inlines of the functions below: -const float bloom_diff_thresh = 1.0/256.0; +static const float bloom_diff_thresh = 1.0/256.0; + -// Assume an extremely large viewport size for asymptotic results: -const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); /////////////////////////////////// HELPERS ////////////////////////////////// -float get_min_sigma_to_blur_triad(const float triad_size, +inline float get_min_sigma_to_blur_triad(const float triad_size, const float thresh) { // Requires: 1.) triad_size is the final phosphor triad size in pixels @@ -60,7 +59,7 @@ float get_min_sigma_to_blur_triad(const float triad_size, //return 0.5985*triad_size - triad_size*sqrt(thresh) } -float get_absolute_scale_blur_sigma(const float thresh) +inline float get_absolute_scale_blur_sigma(const float thresh) { // Requires: 1.) min_expected_triads must be a global float. The number // of horizontal phosphor triads in the final image must be @@ -93,7 +92,7 @@ float get_absolute_scale_blur_sigma(const float thresh) max_viewport_size_x/min_allowed_viewport_triads.x, thresh); } -float get_center_weight(const float sigma) +inline float get_center_weight(const float sigma) { // Given a Gaussian blur sigma, get the blur weight for the center texel. 
#ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA @@ -161,8 +160,8 @@ float get_center_weight(const float sigma) #endif } -vec3 tex2DblurNfast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv, + const float2 dxdy, const float sigma) { // If sigma is static, we can safely branch and use the smallest blur // that's big enough. Ignore #define hints, because we'll only use a @@ -186,40 +185,40 @@ vec3 tex2DblurNfast(const sampler2D tex, const vec2 tex_uv, #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE if(sigma <= blur9_std_dev) { - return tex2Dblur9fast(tex, tex_uv, dxdy, sigma); + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); } else if(sigma <= blur17_std_dev) { - return tex2Dblur17fast(tex, tex_uv, dxdy, sigma); + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); } else if(sigma <= blur25_std_dev) { - return tex2Dblur25fast(tex, tex_uv, dxdy, sigma); + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); } else if(sigma <= blur31_std_dev) { - return tex2Dblur31fast(tex, tex_uv, dxdy, sigma); + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); } else { - return tex2Dblur43fast(tex, tex_uv, dxdy, sigma); + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); } #else // If we can't afford to branch, we can only guess at what blur // size we need. Therefore, use the largest blur allowed. 
#ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS - return tex2Dblur43fast(tex, tex_uv, dxdy, sigma); + return tex2Dblur43fast(texture, tex_uv, dxdy, sigma); #else #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS - return tex2Dblur31fast(tex, tex_uv, dxdy, sigma); + return tex2Dblur31fast(texture, tex_uv, dxdy, sigma); #else #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS - return tex2Dblur25fast(tex, tex_uv, dxdy, sigma); + return tex2Dblur25fast(texture, tex_uv, dxdy, sigma); #else #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS - return tex2Dblur17fast(tex, tex_uv, dxdy, sigma); + return tex2Dblur17fast(texture, tex_uv, dxdy, sigma); #else - return tex2Dblur9fast(tex, tex_uv, dxdy, sigma); + return tex2Dblur9fast(texture, tex_uv, dxdy, sigma); #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS #endif // PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS @@ -227,7 +226,7 @@ vec3 tex2DblurNfast(const sampler2D tex, const vec2 tex_uv, #endif // PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE } -float get_bloom_approx_sigma(const float output_size_x_runtime, +inline float get_bloom_approx_sigma(const float output_size_x_runtime, const float estimated_viewport_size_x) { // Requires: 1.) output_size_x_runtime == BLOOM_APPROX.output_size.x. @@ -243,15 +242,15 @@ float get_bloom_approx_sigma(const float output_size_x_runtime, // bilinear filtering, so use static calculations. // Assume the default static value. This is a compromise that ensures // typical triads are blurred, even if unusually large ones aren't. 
- const float mask_num_triads_static = + static const float mask_num_triads_static = max(min_allowed_viewport_triads.x, mask_num_triads_desired_static); const float mask_num_triads_from_size = - estimated_viewport_size_x/params.mask_triad_size_desired; + estimated_viewport_size_x/global.mask_triad_size_desired; const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x, - mix(mask_num_triads_from_size, params.mask_num_triads_desired, - mask_specify_num_triads)); + lerp(mask_num_triads_from_size, global.mask_num_triads_desired, + global.mask_specify_num_triads)); // Assume an extremely large viewport size for asymptotic results: - const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize { // Use the runtime num triads and output size: @@ -264,7 +263,7 @@ float get_bloom_approx_sigma(const float output_size_x_runtime, // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but // account for the Gaussian scanline sigma from the last pass too. // The bloom will be too wide horizontally but tall enough vertically. - return length(vec2(bloom_approx_sigma, beam_max_sigma)); + return length(float2(bloom_approx_sigma, beam_max_sigma)); } else // 3x3 blur resize (the bilinear resize doesn't need a sigma) { @@ -272,12 +271,12 @@ float get_bloom_approx_sigma(const float output_size_x_runtime, // reason to choose blur3x3 is to avoid dynamic weights, so use a // static calculation. 
#ifdef PHOSPHOR_BLOOM_FAKE - const float output_size_x_static = + static const float output_size_x_static = bloom_approx_size_x_for_fake; #else - const float output_size_x_static = bloom_approx_size_x; + static const float output_size_x_static = bloom_approx_size_x; #endif - const float asymptotic_triad_size = + static const float asymptotic_triad_size = max_viewport_size_x/mask_num_triads_static; const float asymptotic_sigma = get_min_sigma_to_blur_triad( asymptotic_triad_size, bloom_diff_thresh); @@ -286,11 +285,11 @@ float get_bloom_approx_sigma(const float output_size_x_runtime, // The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but // try accounting for the Gaussian scanline sigma from the last pass // too; use the static default value: - return length(vec2(bloom_approx_sigma, beam_max_sigma_static)); + return length(float2(bloom_approx_sigma, beam_max_sigma_static)); } } -float get_final_bloom_sigma(const float bloom_sigma_runtime) +inline float get_final_bloom_sigma(const float bloom_sigma_runtime) { // Requires: 1.) bloom_sigma_runtime is a precalculated sigma that's // optimal for the [known] triad size. @@ -303,7 +302,7 @@ float get_final_bloom_sigma(const float bloom_sigma_runtime) // Notes: Call this from the fragment shader, NOT the vertex shader, // so static sigmas can be constant-folded! 
const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad( - params.mask_triad_size_desired, bloom_diff_thresh); + mask_triad_size_desired_static, bloom_diff_thresh); #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA return bloom_sigma_runtime; #else @@ -313,4 +312,6 @@ float get_final_bloom_sigma(const float bloom_sigma_runtime) #endif } -#endif // BLOOM_FUNCTIONS_H \ No newline at end of file + +#endif // BLOOM_FUNCTIONS_H + diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-approx-intel.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-approx-intel.slang new file mode 100644 index 0000000..e242024 --- /dev/null +++ b/crt/shaders/crt-royale/src/crt-royale-bloom-approx-intel.slang @@ -0,0 +1,3 @@ +#version 450 +#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE +#include "crt-royale-bloom-approx.h" \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-approx.h similarity index 56% rename from crt/shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang rename to crt/shaders/crt-royale/src/crt-royale-bloom-approx.h index 0fd6d24..984819e 100644 --- a/crt/shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang +++ b/crt/shaders/crt-royale/src/crt-royale-bloom-approx.h @@ -1,16 +1,3 @@ -#version 450 - -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; - vec4 ORIG_LINEARIZEDSize; -} registers; - -#include "params.inc" - ///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// // crt-royale: A full-featured CRT shader, with cheese. 
@@ -29,32 +16,170 @@ layout(push_constant) uniform Push // this program; if not, write to the Free Software Foundation, Inc., 59 Temple // Place, Suite 330, Boston, MA 02111-1307 USA +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; + vec4 ORIG_LINEARIZEDSize; +} params; -////////////////////////////////// INCLUDES ////////////////////////////////// +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float mask_sample_mode_desired; + float mask_num_triads_desired; + float mask_triad_size_desired; + float mask_specify_num_triads; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; +} global; +#define ORIG_LINEARIZEDvideo_size params.SourceSize.xy +#define ORIG_LINEARIZEDtexture_size params.SourceSize.xy + +float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + 
+/////////////////////////////// VERTEX INCLUDES /////////////////////////////// + +#include "params.inc" +#include "../../../../include/compat_macros.inc" #include "../user-settings.h" -#include "derived-settings-and-constants.h" #include "bind-shader-params.h" #include "../../../../include/gamma-management.h" -#include "../../../../include/blur-functions.h" +#include "derived-settings-and-constants.h" #include "scanline-functions.h" + +#pragma stage vertex +layout(location = 0) in vec4 Position; +layout(location = 1) in vec2 TexCoord; +layout(location = 0) out vec2 tex_uv; +layout(location = 1) out vec2 blur_dxdy; +layout(location = 2) out vec2 uv_scanline_step; +layout(location = 3) out float estimated_viewport_size_x; +layout(location = 4) out vec2 texture_size_inv; +layout(location = 5) out vec2 tex_uv_to_pixel_scale; + +void main() +{ + gl_Position = global.MVP * Position; + float2 vTexCoord = TexCoord; + const float2 video_uv = vTexCoord * IN.texture_size/IN.video_size; + tex_uv = video_uv * ORIG_LINEARIZEDvideo_size / + ORIG_LINEARIZEDtexture_size; + // The last pass (vertical scanlines) had a viewport y scale, so we can + // use it to calculate a better runtime sigma: + estimated_viewport_size_x = + IN.video_size.y * geom_aspect_ratio_x/geom_aspect_ratio_y; + + // Get the uv sample distance between output pixels. We're using a resize + // blur, so arbitrary upsizing will be acceptable if filter_linearN = + // "true," and arbitrary downsizing will be acceptable if mipmap_inputN = + // "true" too. The blur will be much more accurate if a true 4x4 Gaussian + // resize is used instead of tex2Dblur3x3_resize (which samples between + // texels even for upsizing). + const float2 dxdy_min_scale = ORIG_LINEARIZEDvideo_size/IN.output_size; + const float2 texture_size_inv = float2(1.0)/ORIG_LINEARIZEDtexture_size; + if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize + { + // For upsizing, we'll snap to texels and sample the nearest 4. 
+ const float2 dxdy_scale = max(dxdy_min_scale, float2(1.0)); + blur_dxdy = dxdy_scale * texture_size_inv; + } + else + { + const float2 dxdy_scale = dxdy_min_scale; + blur_dxdy = dxdy_scale * texture_size_inv; + } + // tex2Dresize_gaussian4x4 needs to know a bit more than the other filters: + tex_uv_to_pixel_scale = IN.output_size * + ORIG_LINEARIZEDtexture_size / ORIG_LINEARIZEDvideo_size; + //texture_size_inv = texture_size_inv; + + // Detecting interlacing again here lets us apply convergence offsets in + // this pass. il_step_multiple contains the (texel, scanline) step + // multiple: 1 for progressive, 2 for interlaced. + const float2 orig_video_size = ORIG_LINEARIZEDvideo_size; + const float y_step = 1.0 + float(is_interlaced(orig_video_size.y)); + const float2 il_step_multiple = float2(1.0, y_step); + // Get the uv distance between (texels, same-field scanlines): + uv_scanline_step = il_step_multiple / ORIG_LINEARIZEDtexture_size; +} + +#pragma stage fragment +#pragma format R8G8B8A8_SRGB +layout(location = 0) in vec2 tex_uv; +layout(location = 1) in vec2 blur_dxdy; +layout(location = 2) in vec2 uv_scanline_step; +layout(location = 3) in float estimated_viewport_size_x; +layout(location = 4) in vec2 texture_size_inv; +layout(location = 5) in vec2 tex_uv_to_pixel_scale; +layout(location = 0) out vec4 FragColor; +layout(set = 0, binding = 2) uniform sampler2D Source; +layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED; +layout(set = 0, binding = 4) uniform sampler2D Original; + +////////////////////////////// FRAGMENT INCLUDES ////////////////////////////// + +#include "../../../../include/blur-functions.h" #include "bloom-functions.h" +#include "../../../../include/gamma-management.h" + /////////////////////////////////// HELPERS ////////////////////////////////// -vec3 tex2Dresize_gaussian4x4(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const vec2 texture_size, const vec2 texture_size_inv, - const vec2 tex_uv_to_pixel_scale, const 
float sigma) +float3 tex2Dresize_gaussian4x4(sampler2D tex, float2 tex_uv, float2 dxdy, float2 tex_size, float2 texture_size_inv, float2 tex_uv_to_pixel_scale, float sigma) { // Requires: 1.) All requirements of gamma-management.h must be satisfied! // 2.) filter_linearN must == "true" in your .cgp preset. // 3.) mipmap_inputN must == "true" in your .cgp preset if // IN.output_size << SRC.video_size. // 4.) dxdy should contain the uv pixel spacing: - // dxdy = max(vec2(1.0), + // dxdy = max(float2(1.0), // SRC.video_size/IN.output_size)/SRC.texture_size; // 5.) texture_size == SRC.texture_size - // 6.) texture_size_inv == vec2(1.0)/SRC.texture_size + // 6.) texture_size_inv == float2(1.0)/SRC.texture_size // 7.) tex_uv_to_pixel_scale == IN.output_size * // SRC.texture_size / SRC.video_size; // 8.) sigma is the desired Gaussian standard deviation, in @@ -72,65 +197,65 @@ vec3 tex2Dresize_gaussian4x4(const sampler2D tex, const vec2 tex_uv, const float denom_inv = 0.5/(sigma*sigma); // We're taking 4x4 samples, and we're snapping to texels for upsizing. 
// Find texture coords for sample 5 (second row, second column): - const vec2 curr_texel = tex_uv * texture_size; - const vec2 prev_texel = - floor(curr_texel - vec2(under_half)) + vec2(0.5); - const vec2 prev_texel_uv = prev_texel * texture_size_inv; - const bvec2 snap = lessThanEqual(dxdy , texture_size_inv); - const vec2 sample5_downsize_uv = tex_uv - 0.5 * dxdy; - const vec2 sample5_uv = mix(sample5_downsize_uv, prev_texel_uv, snap); + const float2 curr_texel = tex_uv * tex_size; + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_uv = prev_texel * texture_size_inv; + const float2 snap = float2((dxdy.x <= texture_size_inv.x), (dxdy.y <= texture_size_inv.y)); + const float2 sample5_downsize_uv = tex_uv - 0.5 * dxdy; + const float2 sample5_uv = lerp(sample5_downsize_uv, prev_texel_uv, snap); // Compute texture coords for other samples: - const vec2 dx = vec2(dxdy.x, 0.0); - const vec2 sample0_uv = sample5_uv - dxdy; - const vec2 sample10_uv = sample5_uv + dxdy; - const vec2 sample15_uv = sample5_uv + 2.0 * dxdy; - const vec2 sample1_uv = sample0_uv + dx; - const vec2 sample2_uv = sample0_uv + 2.0 * dx; - const vec2 sample3_uv = sample0_uv + 3.0 * dx; - const vec2 sample4_uv = sample5_uv - dx; - const vec2 sample6_uv = sample5_uv + dx; - const vec2 sample7_uv = sample5_uv + 2.0 * dx; - const vec2 sample8_uv = sample10_uv - 2.0 * dx; - const vec2 sample9_uv = sample10_uv - dx; - const vec2 sample11_uv = sample10_uv + dx; - const vec2 sample12_uv = sample15_uv - 3.0 * dx; - const vec2 sample13_uv = sample15_uv - 2.0 * dx; - const vec2 sample14_uv = sample15_uv - dx; + const float2 dx = float2(dxdy.x, 0.0); + const float2 sample0_uv = sample5_uv - dxdy; + const float2 sample10_uv = sample5_uv + dxdy; + const float2 sample15_uv = sample5_uv + 2.0 * dxdy; + const float2 sample1_uv = sample0_uv + dx; + const float2 sample2_uv = sample0_uv + 2.0 * dx; + const float2 sample3_uv = sample0_uv + 3.0 * dx; + const 
float2 sample4_uv = sample5_uv - dx; + const float2 sample6_uv = sample5_uv + dx; + const float2 sample7_uv = sample5_uv + 2.0 * dx; + const float2 sample8_uv = sample10_uv - 2.0 * dx; + const float2 sample9_uv = sample10_uv - dx; + const float2 sample11_uv = sample10_uv + dx; + const float2 sample12_uv = sample15_uv - 3.0 * dx; + const float2 sample13_uv = sample15_uv - 2.0 * dx; + const float2 sample14_uv = sample15_uv - dx; // Load each sample: - const vec3 sample0 = tex2D_linearize(tex, sample0_uv).rgb; - const vec3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; - const vec3 sample2 = tex2D_linearize(tex, sample2_uv).rgb; - const vec3 sample3 = tex2D_linearize(tex, sample3_uv).rgb; - const vec3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; - const vec3 sample5 = tex2D_linearize(tex, sample5_uv).rgb; - const vec3 sample6 = tex2D_linearize(tex, sample6_uv).rgb; - const vec3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; - const vec3 sample8 = tex2D_linearize(tex, sample8_uv).rgb; - const vec3 sample9 = tex2D_linearize(tex, sample9_uv).rgb; - const vec3 sample10 = tex2D_linearize(tex, sample10_uv).rgb; - const vec3 sample11 = tex2D_linearize(tex, sample11_uv).rgb; - const vec3 sample12 = tex2D_linearize(tex, sample12_uv).rgb; - const vec3 sample13 = tex2D_linearize(tex, sample13_uv).rgb; - const vec3 sample14 = tex2D_linearize(tex, sample14_uv).rgb; - const vec3 sample15 = tex2D_linearize(tex, sample15_uv).rgb; + float3 sample0 = tex2D_linearize(tex, sample0_uv).rgb; + float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + float3 sample2 = tex2D_linearize(tex, sample2_uv).rgb; + float3 sample3 = tex2D_linearize(tex, sample3_uv).rgb; + float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + float3 sample5 = tex2D_linearize(tex, sample5_uv).rgb; + float3 sample6 = tex2D_linearize(tex, sample6_uv).rgb; + float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + float3 sample8 = tex2D_linearize(tex, sample8_uv).rgb; + float3 sample9 = tex2D_linearize(tex,
sample9_uv).rgb; + float3 sample10 = tex2D_linearize(tex, sample10_uv).rgb; + float3 sample11 = tex2D_linearize(tex, sample11_uv).rgb; + float3 sample12 = tex2D_linearize(tex, sample12_uv).rgb; + float3 sample13 = tex2D_linearize(tex, sample13_uv).rgb; + float3 sample14 = tex2D_linearize(tex, sample14_uv).rgb; + float3 sample15 = tex2D_linearize(tex, sample15_uv).rgb; // Compute destination pixel offsets for each sample: - const vec2 dest_pixel = tex_uv * tex_uv_to_pixel_scale; - const vec2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 dest_pixel = tex_uv * tex_uv_to_pixel_scale; + const float2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 
sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel; + const float2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel; // Compute Gaussian sample weights: const float w0 = exp(-LENGTH_SQ(sample0_offset) * denom_inv); const float w1 = exp(-LENGTH_SQ(sample1_offset) * denom_inv); @@ -152,81 +277,13 @@ vec3 tex2Dresize_gaussian4x4(const sampler2D tex, const vec2 tex_uv, w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 +w9 + w10 + w11 + w12 + w13 + w14 + w15); // Weight and sum the samples: - const vec3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + const float3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15; return sum * weight_sum_inv; } -#pragma stage vertex -layout(location = 0) in vec4 Position; -layout(location = 1) in vec2 TexCoord; -layout(location 
= 0) out vec2 tex_uv; -layout(location = 1) out float estimated_viewport_size_x; -layout(location = 2) out vec2 blur_dxdy; -layout(location = 3) out vec2 uv_scanline_step; -layout(location = 4) out vec2 texture_size_inv; -layout(location = 5) out vec2 tex_uv_to_pixel_scale; - -void main() -{ - // This vertex shader copies blurs/vertex-shader-blur-one-pass-resize.h, - // except we're using a different source image. - gl_Position = params.MVP * Position; - const vec2 video_uv = TexCoord; - tex_uv = video_uv; - // The last pass (vertical scanlines) had a viewport y scale, so we can - // use it to calculate a better runtime sigma: - estimated_viewport_size_x = registers.SourceSize.y * params.geom_aspect_ratio_x / params.geom_aspect_ratio_y; - - // Get the uv sample distance between output pixels. We're using a resize - // blur, so arbitrary upsizing will be acceptable if filter_linearN = - // "true," and arbitrary downsizing will be acceptable if mipmap_inputN = - // "true" too. The blur will be much more accurate if a true 4x4 Gaussian - // resize is used instead of tex2Dblur3x3_resize (which samples between - // texels even for upsizing). - const vec2 dxdy_min_scale = registers.ORIG_LINEARIZEDSize.xy * registers.OutputSize.zw; - texture_size_inv = registers.ORIG_LINEARIZEDSize.zw; - if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize - { - // For upsizing, we'll snap to texels and sample the nearest 4. - const vec2 dxdy_scale = max(dxdy_min_scale, vec2(1.0)); - blur_dxdy = dxdy_scale * texture_size_inv; - } - else - { - const vec2 dxdy_scale = dxdy_min_scale; - blur_dxdy = dxdy_scale * texture_size_inv; - } - - tex_uv_to_pixel_scale = registers.OutputSize.xy; -// texture_size_inv = texture_size_inv; <- commented out because it's pointless in slang - - // Detecting interlacing again here lets us apply convergence offsets in - // this pass. il_step_multiple contains the (texel, scanline) step - // multiple: 1 for progressive, 2 for interlaced. 
- const vec2 orig_video_size = registers.ORIG_LINEARIZEDSize.xy; - float interlace_check = 0.0; - if (is_interlaced(orig_video_size.y) == true) interlace_check = 1.0; - const float y_step = 1.0 + interlace_check; - const vec2 il_step_multiple = vec2(1.0, y_step); - // Get the uv distance between (texels, same-field scanlines): - uv_scanline_step = il_step_multiple * registers.ORIG_LINEARIZEDSize.zw; -} - -#pragma stage fragment -#pragma format R8G8B8A8_SRGB -layout(location = 0) in vec2 tex_uv; -layout(location = 1) in float estimated_viewport_size_x; -layout(location = 2) in vec2 blur_dxdy; -layout(location = 3) in vec2 uv_scanline_step; -layout(location = 4) in vec2 texture_size_inv; -layout(location = 5) in vec2 tex_uv_to_pixel_scale; -layout(location = 0) out vec4 FragColor; -layout(set = 0, binding = 2) uniform sampler2D Source; -layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED; - void main() { // Would a viewport-relative size work better for this pass? (No.) @@ -264,45 +321,52 @@ void main() // bandwidth if it's done at a small constant scale. 
// Get the constants we need to sample: - const vec2 texture_size = registers.ORIG_LINEARIZEDSize.xy; - vec2 tex_uv_r, tex_uv_g, tex_uv_b; - - if(beam_misconvergence == true) +// const sampler2D texture = ORIG_LINEARIZED.texture; +// const float2 tex_uv = tex_uv; +// const float2 blur_dxdy = blur_dxdy; + const float2 texture_size_ = ORIG_LINEARIZEDtexture_size; +// const float2 texture_size_inv = texture_size_inv; +// const float2 tex_uv_to_pixel_scale = tex_uv_to_pixel_scale; + float2 tex_uv_r, tex_uv_g, tex_uv_b; + + if(beam_misconvergence) { - const vec2 convergence_offsets_r = vec2(params.convergence_offset_x_r, params.convergence_offset_y_r);//get_convergence_offsets_r_vector(); - const vec2 convergence_offsets_g = vec2(params.convergence_offset_x_g, params.convergence_offset_y_g);//get_convergence_offsets_g_vector(); - const vec2 convergence_offsets_b = vec2(params.convergence_offset_x_b, params.convergence_offset_y_b);//get_convergence_offsets_b_vector(); - tex_uv_r = tex_uv - vec2(params.convergence_offset_x_r, params.convergence_offset_y_r) * uv_scanline_step; - tex_uv_g = tex_uv - vec2(params.convergence_offset_x_g, params.convergence_offset_y_g) * uv_scanline_step; - tex_uv_b = tex_uv - vec2(params.convergence_offset_x_b, params.convergence_offset_y_b) * uv_scanline_step; + const float2 uv_scanline_step = uv_scanline_step; + const float2 convergence_offsets_r = get_convergence_offsets_r_vector(); + const float2 convergence_offsets_g = get_convergence_offsets_g_vector(); + const float2 convergence_offsets_b = get_convergence_offsets_b_vector(); + tex_uv_r = tex_uv - convergence_offsets_r * uv_scanline_step; + tex_uv_g = tex_uv - convergence_offsets_g * uv_scanline_step; + tex_uv_b = tex_uv - convergence_offsets_b * uv_scanline_step; } - // Get the blur sigma: - const float bloom_approx_sigma = get_bloom_approx_sigma(registers.OutputSize.x, estimated_viewport_size_x); - - // Sample the resized and blurred texture, and apply convergence offsets if + // Get 
the blur sigma: + const float bloom_approx_sigma = get_bloom_approx_sigma(IN.output_size.x, + estimated_viewport_size_x); + + // Sample the resized and blurred texture, and apply convergence offsets if // necessary. Applying convergence offsets here triples our samples from // 16/9/1 to 48/27/3, but faster and easier than sampling BLOOM_APPROX and // HALATION_BLUR 3 times at full resolution every time they're used. - vec3 color_r, color_g, color_b, color; - if(bloom_approx_filter > 1.5) + float3 color_r, color_g, color_b, color; + if(bloom_approx_filter > 1.5) { // Use a 4x4 Gaussian resize. This is slower but technically correct. - if(beam_misconvergence == true) + if(beam_misconvergence) { color_r = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_r, - blur_dxdy, texture_size, texture_size_inv, + blur_dxdy, texture_size_, texture_size_inv, tex_uv_to_pixel_scale, bloom_approx_sigma); color_g = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_g, - blur_dxdy, texture_size, texture_size_inv, + blur_dxdy, texture_size_, texture_size_inv, tex_uv_to_pixel_scale, bloom_approx_sigma); color_b = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_b, - blur_dxdy, texture_size, texture_size_inv, + blur_dxdy, texture_size_, texture_size_inv, tex_uv_to_pixel_scale, bloom_approx_sigma); } else { color = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv, - blur_dxdy, texture_size, texture_size_inv, + blur_dxdy, texture_size_, texture_size_inv, tex_uv_to_pixel_scale, bloom_approx_sigma); } } @@ -311,7 +375,7 @@ void main() // Use a 3x3 resize blur. This is the softest option, because we're // blurring already blurry bilinear samples. It doesn't play quite as // nicely with convergence offsets, but it has its charms. 
- if(beam_misconvergence == true) + if(beam_misconvergence) { color_r = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_r, blur_dxdy, bloom_approx_sigma); @@ -333,7 +397,7 @@ void main() // too sharp above ~400x300, but the blurs break down above that // resolution too, unless min_allowed_viewport_triads is high enough to // keep bloom_approx_scale_x/min_allowed_viewport_triads < ~1.1658025.) - if(beam_misconvergence == true) + if(beam_misconvergence) { color_r = tex2D_linearize(ORIG_LINEARIZED, tex_uv_r).rgb; color_g = tex2D_linearize(ORIG_LINEARIZED, tex_uv_g).rgb; @@ -344,11 +408,11 @@ void main() color = tex2D_linearize(ORIG_LINEARIZED, tex_uv).rgb; } } - // Pack the colors from the red/green/blue beams into a single vector: - if(beam_misconvergence == true) + // Pack the colors from the red/green/blue beams into a single vector: + if(beam_misconvergence) { - color = vec3(color_r.r, color_g.g, color_b.b); + color = float3(color_r.r, color_g.g, color_b.b); } // Encode and output the blurred image: - FragColor = vec4(texture(ORIG_LINEARIZED, tex_uv));//vec4(color, 1.0);// -} + FragColor = encode_output(float4(tex2D_linearize(ORIG_LINEARIZED, tex_uv))); +} \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-approx.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-approx.slang index e07e58a..14d4e76 100755 --- a/crt/shaders/crt-royale/src/crt-royale-bloom-approx.slang +++ b/crt/shaders/crt-royale/src/crt-royale-bloom-approx.slang @@ -1,354 +1,2 @@ #version 450 - -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; - vec4 ORIG_LINEARIZEDSize; -} registers; - -#include "params.inc" - -///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// - -// crt-royale: A full-featured CRT shader, with cheese. 
-// Copyright (C) 2014 TroggleMonkey -// -// This program is free software; you can redistribute it and/or modify it -// under the terms of the GNU General Public License as published by the Free -// Software Foundation; either version 2 of the License, or any later version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -// more details. -// -// You should have received a copy of the GNU General Public License along with -// this program; if not, write to the Free Software Foundation, Inc., 59 Temple -// Place, Suite 330, Boston, MA 02111-1307 USA - - -////////////////////////////////// INCLUDES ////////////////////////////////// - -#include "../user-settings.h" -#include "derived-settings-and-constants.h" -#include "bind-shader-params.h" -#include "../../../../include/gamma-management.h" -#include "../../../../include/blur-functions.h" -#include "scanline-functions.h" -#include "bloom-functions.h" - -/////////////////////////////////// HELPERS ////////////////////////////////// - -vec3 tex2Dresize_gaussian4x4(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const vec2 texture_size, const vec2 texture_size_inv, - const vec2 tex_uv_to_pixel_scale, const float sigma) -{ - // Requires: 1.) All requirements of gamma-management.h must be satisfied! - // 2.) filter_linearN must == "true" in your .cgp preset. - // 3.) mipmap_inputN must == "true" in your .cgp preset if - // IN.output_size << SRC.video_size. - // 4.) dxdy should contain the uv pixel spacing: - // dxdy = max(vec2(1.0), - // SRC.video_size/IN.output_size)/SRC.texture_size; - // 5.) texture_size == SRC.texture_size - // 6.) texture_size_inv == vec2(1.0)/SRC.texture_size - // 7.) tex_uv_to_pixel_scale == IN.output_size * - // SRC.texture_size / SRC.video_size; - // 8.) 
sigma is the desired Gaussian standard deviation, in - // terms of output pixels. It should be < ~0.66171875 to - // ensure the first unused sample (outside the 4x4 box) has - // a weight < 1.0/256.0. - // Returns: A true 4x4 Gaussian resize of the input. - // Description: - // Given correct inputs, this Gaussian resizer samples 4 pixel locations - // along each downsized dimension and/or 4 texel locations along each - // upsized dimension. It computes dynamic weights based on the pixel-space - // distance of each sample from the destination pixel. It is arbitrarily - // resizable and higher quality than tex2Dblur3x3_resize, but it's slower. - // TODO: Move this to a more suitable file once there are others like it. - const float denom_inv = 0.5/(sigma*sigma); - // We're taking 4x4 samples, and we're snapping to texels for upsizing. - // Find texture coords for sample 5 (second row, second column): - const vec2 curr_texel = tex_uv * texture_size; - const vec2 prev_texel = - floor(curr_texel - vec2(under_half)) + vec2(0.5); - const vec2 prev_texel_uv = prev_texel * texture_size_inv; - const bvec2 snap = lessThanEqual(dxdy , texture_size_inv); - const vec2 sample5_downsize_uv = tex_uv - 0.5 * dxdy; - const vec2 sample5_uv = mix(sample5_downsize_uv, prev_texel_uv, snap); - // Compute texture coords for other samples: - const vec2 dx = vec2(dxdy.x, 0.0); - const vec2 sample0_uv = sample5_uv - dxdy; - const vec2 sample10_uv = sample5_uv + dxdy; - const vec2 sample15_uv = sample5_uv + 2.0 * dxdy; - const vec2 sample1_uv = sample0_uv + dx; - const vec2 sample2_uv = sample0_uv + 2.0 * dx; - const vec2 sample3_uv = sample0_uv + 3.0 * dx; - const vec2 sample4_uv = sample5_uv - dx; - const vec2 sample6_uv = sample5_uv + dx; - const vec2 sample7_uv = sample5_uv + 2.0 * dx; - const vec2 sample8_uv = sample10_uv - 2.0 * dx; - const vec2 sample9_uv = sample10_uv - dx; - const vec2 sample11_uv = sample10_uv + dx; - const vec2 sample12_uv = sample15_uv - 3.0 * dx; - const vec2 
sample13_uv = sample15_uv - 2.0 * dx; - const vec2 sample14_uv = sample15_uv - dx; - // Load each sample: - const vec3 sample0 = tex2D_linearize(tex, sample0_uv).rgb; - const vec3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; - const vec3 sample2 = tex2D_linearize(tex, sample2_uv).rgb; - const vec3 sample3 = tex2D_linearize(tex, sample3_uv).rgb; - const vec3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; - const vec3 sample5 = tex2D_linearize(tex, sample5_uv).rgb; - const vec3 sample6 = tex2D_linearize(tex, sample6_uv).rgb; - const vec3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; - const vec3 sample8 = tex2D_linearize(tex, sample8_uv).rgb; - const vec3 sample9 = tex2D_linearize(tex, sample9_uv).rgb; - const vec3 sample10 = tex2D_linearize(tex, sample10_uv).rgb; - const vec3 sample11 = tex2D_linearize(tex, sample11_uv).rgb; - const vec3 sample12 = tex2D_linearize(tex, sample12_uv).rgb; - const vec3 sample13 = tex2D_linearize(tex, sample13_uv).rgb; - const vec3 sample14 = tex2D_linearize(tex, sample14_uv).rgb; - const vec3 sample15 = tex2D_linearize(tex, sample15_uv).rgb; - // Compute destination pixel offsets for each sample: - const vec2 dest_pixel = tex_uv * tex_uv_to_pixel_scale; - const vec2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel; - 
const vec2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel; - const vec2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel; - // Compute Gaussian sample weights: - const float w0 = exp(-LENGTH_SQ(sample0_offset) * denom_inv); - const float w1 = exp(-LENGTH_SQ(sample1_offset) * denom_inv); - const float w2 = exp(-LENGTH_SQ(sample2_offset) * denom_inv); - const float w3 = exp(-LENGTH_SQ(sample3_offset) * denom_inv); - const float w4 = exp(-LENGTH_SQ(sample4_offset) * denom_inv); - const float w5 = exp(-LENGTH_SQ(sample5_offset) * denom_inv); - const float w6 = exp(-LENGTH_SQ(sample6_offset) * denom_inv); - const float w7 = exp(-LENGTH_SQ(sample7_offset) * denom_inv); - const float w8 = exp(-LENGTH_SQ(sample8_offset) * denom_inv); - const float w9 = exp(-LENGTH_SQ(sample9_offset) * denom_inv); - const float w10 = exp(-LENGTH_SQ(sample10_offset) * denom_inv); - const float w11 = exp(-LENGTH_SQ(sample11_offset) * denom_inv); - const float w12 = exp(-LENGTH_SQ(sample12_offset) * denom_inv); - const float w13 = exp(-LENGTH_SQ(sample13_offset) * denom_inv); - const float w14 = exp(-LENGTH_SQ(sample14_offset) * denom_inv); - const float w15 = exp(-LENGTH_SQ(sample15_offset) * denom_inv); - const float weight_sum_inv = 1.0/( - w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + - w8 +w9 + w10 + w11 + w12 + w13 + w14 + w15); - // Weight and sum the samples: - const vec3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + - w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + - w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + - w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15; - 
return sum * weight_sum_inv; -} - -#pragma stage vertex -layout(location = 0) in vec4 Position; -layout(location = 1) in vec2 TexCoord; -layout(location = 0) out vec2 tex_uv; -layout(location = 1) out float estimated_viewport_size_x; -layout(location = 2) out vec2 blur_dxdy; -layout(location = 3) out vec2 uv_scanline_step; -layout(location = 4) out vec2 texture_size_inv; -layout(location = 5) out vec2 tex_uv_to_pixel_scale; - -void main() -{ - // This vertex shader copies blurs/vertex-shader-blur-one-pass-resize.h, - // except we're using a different source image. - gl_Position = params.MVP * Position; - const vec2 video_uv = TexCoord; - tex_uv = video_uv; - // The last pass (vertical scanlines) had a viewport y scale, so we can - // use it to calculate a better runtime sigma: - estimated_viewport_size_x = registers.SourceSize.y * params.geom_aspect_ratio_x / params.geom_aspect_ratio_y; - - // Get the uv sample distance between output pixels. We're using a resize - // blur, so arbitrary upsizing will be acceptable if filter_linearN = - // "true," and arbitrary downsizing will be acceptable if mipmap_inputN = - // "true" too. The blur will be much more accurate if a true 4x4 Gaussian - // resize is used instead of tex2Dblur3x3_resize (which samples between - // texels even for upsizing). - const vec2 dxdy_min_scale = registers.ORIG_LINEARIZEDSize.xy * registers.OutputSize.zw; - texture_size_inv = registers.ORIG_LINEARIZEDSize.zw; - if(bloom_approx_filter > 1.5) // 4x4 true Gaussian resize - { - // For upsizing, we'll snap to texels and sample the nearest 4. 
- const vec2 dxdy_scale = max(dxdy_min_scale, vec2(1.0)); - blur_dxdy = dxdy_scale * texture_size_inv; - } - else - { - const vec2 dxdy_scale = dxdy_min_scale; - blur_dxdy = dxdy_scale * texture_size_inv; - } - - tex_uv_to_pixel_scale = registers.OutputSize.xy; -// texture_size_inv = texture_size_inv; <- commented out because it's pointless in slang - - // Detecting interlacing again here lets us apply convergence offsets in - // this pass. il_step_multiple contains the (texel, scanline) step - // multiple: 1 for progressive, 2 for interlaced. - const vec2 orig_video_size = registers.ORIG_LINEARIZEDSize.xy; - float interlace_check = 0.0; - if (is_interlaced(orig_video_size.y) == true) interlace_check = 1.0; - const float y_step = 1.0 + interlace_check; - const vec2 il_step_multiple = vec2(1.0, y_step); - // Get the uv distance between (texels, same-field scanlines): - uv_scanline_step = il_step_multiple * registers.ORIG_LINEARIZEDSize.zw; -} - -#pragma stage fragment -#pragma format R8G8B8A8_SRGB -layout(location = 0) in vec2 tex_uv; -layout(location = 1) in float estimated_viewport_size_x; -layout(location = 2) in vec2 blur_dxdy; -layout(location = 3) in vec2 uv_scanline_step; -layout(location = 4) in vec2 texture_size_inv; -layout(location = 5) in vec2 tex_uv_to_pixel_scale; -layout(location = 0) out vec4 FragColor; -layout(set = 0, binding = 2) uniform sampler2D Source; -layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED; - -void main() -{ - // Would a viewport-relative size work better for this pass? (No.) - // PROS: - // 1.) Instead of writing an absolute size to user-cgp-constants.h, we'd - // write a viewport scale. That number could be used to directly scale - // the viewport-resolution bloom sigma and/or triad size to a smaller - // scale. This way, we could calculate an optimal dynamic sigma no - // matter how the dot pitch is specified. - // CONS: - // 1.) 
Texel smearing would be much worse at small viewport sizes, but - // performance would be much worse at large viewport sizes, so there - // would be no easy way to calculate a decent scale. - // 2.) Worse, we could no longer get away with using a constant-size blur! - // Instead, we'd have to face all the same difficulties as the real - // phosphor bloom, which requires static #ifdefs to decide the blur - // size based on the expected triad size...a dynamic value. - // 3.) Like the phosphor bloom, we'd have less control over making the blur - // size correct for an optical blur. That said, we likely overblur (to - // maintain brightness) more than the eye would do by itself: 20/20 - // human vision distinguishes ~1 arc minute, or 1/60 of a degree. The - // highest viewing angle recommendation I know of is THX's 40.04 degree - // recommendation, at which 20/20 vision can distinguish about 2402.4 - // lines. Assuming the "TV lines" definition, that means 1201.2 - // distinct light lines and 1201.2 distinct dark lines can be told - // apart, i.e. 1201.2 pairs of lines. This would correspond to 1201.2 - // pairs of alternating lit/unlit phosphors, so 2402.4 phosphors total - // (if they're alternately lit). That's a max of 800.8 triads. Using - // a more popular 30 degree viewing angle recommendation, 20/20 vision - // can distinguish 1800 lines, or 600 triads of alternately lit - // phosphors. In contrast, we currently blur phosphors all the way - // down to 341.3 triads to ensure full brightness. - // 4.) Realistically speaking, we're usually just going to use bilinear - // filtering in this pass anyway, but it only works well to limit - // bandwidth if it's done at a small constant scale. 
- - // Get the constants we need to sample: - const vec2 texture_size = registers.ORIG_LINEARIZEDSize.xy; - vec2 tex_uv_r, tex_uv_g, tex_uv_b; - - if(beam_misconvergence == true) - { - const vec2 convergence_offsets_r = vec2(params.convergence_offset_x_r, params.convergence_offset_y_r);//get_convergence_offsets_r_vector(); - const vec2 convergence_offsets_g = vec2(params.convergence_offset_x_g, params.convergence_offset_y_g);//get_convergence_offsets_g_vector(); - const vec2 convergence_offsets_b = vec2(params.convergence_offset_x_b, params.convergence_offset_y_b);//get_convergence_offsets_b_vector(); - tex_uv_r = tex_uv - vec2(params.convergence_offset_x_r, params.convergence_offset_y_r) * uv_scanline_step; - tex_uv_g = tex_uv - vec2(params.convergence_offset_x_g, params.convergence_offset_y_g) * uv_scanline_step; - tex_uv_b = tex_uv - vec2(params.convergence_offset_x_b, params.convergence_offset_y_b) * uv_scanline_step; - } - // Get the blur sigma: - const float bloom_approx_sigma = get_bloom_approx_sigma(registers.OutputSize.x, estimated_viewport_size_x); - - // Sample the resized and blurred texture, and apply convergence offsets if - // necessary. Applying convergence offsets here triples our samples from - // 16/9/1 to 48/27/3, but faster and easier than sampling BLOOM_APPROX and - // HALATION_BLUR 3 times at full resolution every time they're used. - vec3 color_r, color_g, color_b, color; - if(bloom_approx_filter > 1.5) - { - // Use a 4x4 Gaussian resize. This is slower but technically correct. 
- if(beam_misconvergence == true) - { - color_r = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_r, - blur_dxdy, texture_size, texture_size_inv, - tex_uv_to_pixel_scale, bloom_approx_sigma); - color_g = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_g, - blur_dxdy, texture_size, texture_size_inv, - tex_uv_to_pixel_scale, bloom_approx_sigma); - color_b = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_b, - blur_dxdy, texture_size, texture_size_inv, - tex_uv_to_pixel_scale, bloom_approx_sigma); - } - else - { - color = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv, - blur_dxdy, texture_size, texture_size_inv, - tex_uv_to_pixel_scale, bloom_approx_sigma); - } - } - else if(bloom_approx_filter > 0.5) - { - // Use a 3x3 resize blur. This is the softest option, because we're - // blurring already blurry bilinear samples. It doesn't play quite as - // nicely with convergence offsets, but it has its charms. - if(beam_misconvergence == true) - { - color_r = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_r, - blur_dxdy, bloom_approx_sigma); - color_g = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_g, - blur_dxdy, bloom_approx_sigma); - color_b = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_b, - blur_dxdy, bloom_approx_sigma); - } - else - { - color = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv, blur_dxdy); - } - } - else - { - // Use bilinear sampling. This approximates a 4x4 Gaussian resize MUCH - // better than tex2Dblur3x3_resize for the very small sigmas we're - // likely to use at small output resolutions. (This estimate becomes - // too sharp above ~400x300, but the blurs break down above that - // resolution too, unless min_allowed_viewport_triads is high enough to - // keep bloom_approx_scale_x/min_allowed_viewport_triads < ~1.1658025.) 
- if(beam_misconvergence == true) - { - color_r = tex2D_linearize(ORIG_LINEARIZED, tex_uv_r).rgb; - color_g = tex2D_linearize(ORIG_LINEARIZED, tex_uv_g).rgb; - color_b = tex2D_linearize(ORIG_LINEARIZED, tex_uv_b).rgb; - } - else - { - color = tex2D_linearize(ORIG_LINEARIZED, tex_uv).rgb; - } - } - // Pack the colors from the red/green/blue beams into a single vector: - if(beam_misconvergence == true) - { - color = vec3(color_r.r, color_g.g, color_b.b); - } - // Encode and output the blurred image: - FragColor = vec4(color, 1.0);//vec4(texture(ORIG_LINEARIZED, tex_uv));// -} +#include "crt-royale-bloom-approx.h" \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang index 8708b7b..e74cb02 100755 --- a/crt/shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang +++ b/crt/shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang @@ -1,17 +1,5 @@ #version 450 -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OutputSize; - vec4 ORIG_LINEARIZEDSize; - vec4 HALATION_BLURSize; - vec4 MASKED_SCANLINESSize; - vec4 BRIGHTPASSSize; -} registers; - -#include "params.inc" - ///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// // crt-royale: A full-featured CRT shader, with cheese. 
@@ -30,18 +18,93 @@ layout(push_constant) uniform Push // this program; if not, write to the Free Software Foundation, Inc., 59 Temple // Place, Suite 330, Boston, MA 02111-1307 USA +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float mask_sample_mode_desired; + float mask_num_triads_desired; + float mask_triad_size_desired; + float mask_specify_num_triads; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; + vec4 MASKED_SCANLINESSize; + vec4 HALATION_BLURSize; + vec4 BRIGHTPASSSize; +} global; + +#define MASKED_SCANLINEStexture MASKED_SCANLINES +#define MASKED_SCANLINEStexture_size global.MASKED_SCANLINESSize.xy +#define MASKED_SCANLINESvideo_size global.MASKED_SCANLINESSize.xy +#define HALATION_BLURtexture HALATION_BLUR +#define HALATION_BLURtexture_size global.HALATION_BLURSize.xy 
+#define HALATION_BLURvideo_size global.HALATION_BLURSize.xy +#define BRIGHTPASStexture BRIGHTPASS +#define BRIGHTPASStexture_size global.BRIGHTPASSSize.xy +#define BRIGHTPASSvideo_size global.BRIGHTPASSSize.xy + +float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); +const float bloom_diff_thresh_ = 1.0/256.0; ///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// +#include "params.inc" +#include "../../../../include/compat_macros.inc" #include "../user-settings.h" #include "derived-settings-and-constants.h" #include "bind-shader-params.h" -////////////////////////////////// INCLUDES ////////////////////////////////// +/////////////////////////////// VERTEX INCLUDES ////////////////////////////// #include "../../../../include/gamma-management.h" -#include "bloom-functions.h" #include "phosphor-mask-resizing.h" #include "scanline-functions.h" @@ -56,34 +119,52 @@ layout(location = 4) out vec2 bloom_tex_uv; layout(location = 5) out vec2 bloom_dxdy; layout(location = 6) out float bloom_sigma_runtime; +// copied from bloom-functions.h +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. + // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. 
error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh) +} + void main() { - gl_Position = params.MVP * Position; - video_uv = TexCoord; + gl_Position = global.MVP * Position; + float2 tex_uv = TexCoord; // Our various input textures use different coords: - scanline_tex_uv = video_uv * registers.MASKED_SCANLINESSize.xy * - registers.MASKED_SCANLINESSize.zw; - halation_tex_uv = video_uv * registers.HALATION_BLURSize.xy * - registers.HALATION_BLURSize.zw; - brightpass_tex_uv = video_uv * registers.BRIGHTPASSSize.xy * - registers.BRIGHTPASSSize.zw; - bloom_tex_uv = TexCoord; + const float2 video_uv = tex_uv * IN.texture_size/IN.video_size; +// video_uv = video_uv; + scanline_tex_uv = video_uv * MASKED_SCANLINESvideo_size / + MASKED_SCANLINEStexture_size; + halation_tex_uv = video_uv * HALATION_BLURvideo_size / + HALATION_BLURtexture_size; + brightpass_tex_uv = video_uv * BRIGHTPASSvideo_size / + BRIGHTPASStexture_size; + bloom_tex_uv = tex_uv; // We're horizontally blurring the bloom input (vertically blurred // brightpass). 
Get the uv distance between output pixels / input texels // in the horizontal direction (this pass must NOT resize): - bloom_dxdy = vec2(registers.SourceSize.z, 0.0); + bloom_dxdy = float2(1.0/IN.texture_size.x, 0.0); // Calculate a runtime bloom_sigma in case it's needed: const float mask_tile_size_x = get_resized_mask_tile_size( - registers.OutputSize.xy, registers.OutputSize.xy * mask_resize_viewport_scale, false).x; + IN.output_size, IN.output_size * mask_resize_viewport_scale, false).x; bloom_sigma_runtime = get_min_sigma_to_blur_triad( - mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh); + mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_); } #pragma stage fragment -#pragma format R8G8B8A8_SRGB layout(location = 0) in vec2 video_uv; layout(location = 1) in vec2 scanline_tex_uv; layout(location = 2) in vec2 halation_tex_uv; @@ -93,40 +174,45 @@ layout(location = 5) in vec2 bloom_dxdy; layout(location = 6) in float bloom_sigma_runtime; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; -layout(set = 0, binding = 3) uniform sampler2D MASKED_SCANLINES; -layout(set = 0, binding = 4) uniform sampler2D HALATION_BLUR; -layout(set = 0, binding = 5) uniform sampler2D BRIGHTPASS; +layout(set = 0, binding = 3) uniform sampler2D HALATION_BLUR; +layout(set = 0, binding = 4) uniform sampler2D BRIGHTPASS; +layout(set = 0, binding = 5) uniform sampler2D MASKED_SCANLINES; +#define bloom_texture Source + +////////////////////////////// FRAGMENT INCLUDES ////////////////////////////// + +#include "bloom-functions.h" void main() { -// Blur the vertically blurred brightpass horizontally by 9/17/25/43x: + // Blur the vertically blurred brightpass horizontally by 9/17/25/43x: const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime); - const vec3 blurred_brightpass = tex2DblurNfast(Source, + const float3 blurred_brightpass = tex2DblurNfast(bloom_texture, bloom_tex_uv, bloom_dxdy, bloom_sigma); -// Sample the 
masked scanlines. Alpha contains the auto-dim factor: - const vec3 intensity_dim = - tex2D_linearize(MASKED_SCANLINES, scanline_tex_uv).rgb; + // Sample the masked scanlines. Alpha contains the auto-dim factor: + const float3 intensity_dim = + tex2D_linearize(MASKED_SCANLINEStexture, scanline_tex_uv).rgb; const float auto_dim_factor = levels_autodim_temp; const float undim_factor = 1.0/auto_dim_factor; - - // Calculate the mask dimpass, add it to the blurred brightpass, and + + // Calculate the mask dimpass, add it to the blurred brightpass, and // undim (from scanline auto-dim) and amplify (from mask dim) the result: const float mask_amplify = get_mask_amplify(); - const vec3 brightpass = tex2D_linearize(BRIGHTPASS, + const float3 brightpass = tex2D_linearize(BRIGHTPASStexture, brightpass_tex_uv).rgb; - const vec3 dimpass = intensity_dim - brightpass; - const vec3 phosphor_bloom = (dimpass + blurred_brightpass) * - mask_amplify * undim_factor * params.levels_contrast; - - // Sample the halation texture, and let some light bleed into refractive + const float3 dimpass = intensity_dim - brightpass; + const float3 phosphor_bloom = (dimpass + blurred_brightpass) * + mask_amplify * undim_factor * levels_contrast; + + // Sample the halation texture, and let some light bleed into refractive // diffusion. Conceptually this occurs before the phosphor bloom, but // adding it in earlier passes causes black crush in the diffusion colors. 
- const vec3 diffusion_color = params.levels_contrast * tex2D_linearize( - HALATION_BLUR, halation_tex_uv).rgb; - const vec3 final_bloom = mix(phosphor_bloom, - diffusion_color, params.diffusion_weight); - - // Encode and output the bloomed image: - FragColor = encode_output(vec4(final_bloom, 1.0)); -} + const float3 diffusion_color = levels_contrast * tex2D_linearize( + HALATION_BLURtexture, halation_tex_uv).rgb; + const float3 final_bloom = lerp(phosphor_bloom, + diffusion_color, global.diffusion_weight); + + // Encode and output the bloomed image: + FragColor = encode_output(float4(final_bloom, 1.0)); +} \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-vertical.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-vertical.slang index 06b5a99..c0e95f8 100755 --- a/crt/shaders/crt-royale/src/crt-royale-bloom-vertical.slang +++ b/crt/shaders/crt-royale/src/crt-royale-bloom-vertical.slang @@ -1,15 +1,5 @@ #version 450 -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; -} registers; - -#include "params.inc" - ///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// // crt-royale: A full-featured CRT shader, with cheese. 
@@ -28,20 +18,84 @@ layout(push_constant) uniform Push // this program; if not, write to the Free Software Foundation, Inc., 59 Temple // Place, Suite 330, Boston, MA 02111-1307 USA +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float mask_sample_mode_desired; + float mask_num_triads_desired; + float mask_triad_size_desired; + float mask_specify_num_triads; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; + vec4 MASKED_SCANLINESSize; + vec4 BLOOM_APPROXSize; +} global; ///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// +#include "params.inc" +#include "../../../../include/compat_macros.inc" #include "../user-settings.h" #include "derived-settings-and-constants.h" #include "bind-shader-params.h" -////////////////////////////////// INCLUDES 
////////////////////////////////// +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// #include "../../../../include/gamma-management.h" -#include "bloom-functions.h" #include "phosphor-mask-resizing.h" +float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); +const float bloom_diff_thresh_ = 1.0/256.0; + #pragma stage vertex layout(location = 0) in vec4 Position; layout(location = 1) in vec2 TexCoord; @@ -49,23 +103,40 @@ layout(location = 0) out vec2 tex_uv; layout(location = 1) out vec2 bloom_dxdy; layout(location = 2) out float bloom_sigma_runtime; +// copied from bloom-functions.h +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. + // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh) +} + void main() { - gl_Position = params.MVP * Position; - tex_uv = TexCoord; + gl_Position = global.MVP * Position; + tex_uv = TexCoord * 1.0001; - // Get the uv sample distance between output pixels. Calculate dxdy like + // Get the uv sample distance between output pixels. Calculate dxdy like // blurs/vertex-shader-blur-fast-vertical.h. 
- const vec2 dxdy_scale = registers.SourceSize.xy * registers.OutputSize.zw; - const vec2 dxdy = dxdy_scale * registers.SourceSize.zw; + const float2 dxdy_scale = IN.video_size/IN.output_size; + const float2 dxdy = dxdy_scale/IN.texture_size; // This blur is vertical-only, so zero out the vertical offset: - bloom_dxdy = vec2(0.0, dxdy.y); + bloom_dxdy = float2(0.0, dxdy.y); // Calculate a runtime bloom_sigma in case it's needed: const float mask_tile_size_x = get_resized_mask_tile_size( - registers.OutputSize.xy, registers.OutputSize.xy * mask_resize_viewport_scale, false).x; + IN.output_size, IN.output_size * mask_resize_viewport_scale, false).x; bloom_sigma_runtime = get_min_sigma_to_blur_triad( - mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh); + mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_); } #pragma stage fragment @@ -75,13 +146,18 @@ layout(location = 1) in vec2 bloom_dxdy; layout(location = 2) in float bloom_sigma_runtime; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +////////////////////////////// FRAGMENT INCLUDES ////////////////////////////// + +#include "bloom-functions.h" void main() { // Blur the brightpass horizontally with a 9/17/25/43x blur: const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime); - const vec3 color = tex2DblurNfast(Source, tex_uv, + const float3 color = tex2DblurNfast(input_texture, tex_uv, bloom_dxdy, bloom_sigma); // Encode and output the blurred image: - FragColor = encode_output(vec4(color, 1.0)); -} + FragColor = encode_output(float4(color, 1.0)); +} \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-brightpass.slang b/crt/shaders/crt-royale/src/crt-royale-brightpass.slang index 806b717..bac816c 100755 --- a/crt/shaders/crt-royale/src/crt-royale-brightpass.slang +++ b/crt/shaders/crt-royale/src/crt-royale-brightpass.slang @@ -1,17 +1,5 @@ #version 450 -layout(push_constant) 
uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; - vec4 MASKED_SCANLINESSize; - vec4 BLOOM_APPROXSize; -} registers; - -#include "params.inc" - ///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// // crt-royale: A full-featured CRT shader, with cheese. @@ -30,114 +18,198 @@ layout(push_constant) uniform Push // this program; if not, write to the Free Software Foundation, Inc., 59 Temple // Place, Suite 330, Boston, MA 02111-1307 USA +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float mask_sample_mode_desired; + float mask_num_triads_desired; + float mask_triad_size_desired; + float mask_specify_num_triads; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; + vec4 MASKED_SCANLINESSize; + vec4 BLOOM_APPROXSize; +} global; 
+ +#define MASKED_SCANLINEStexture MASKED_SCANLINES +#define MASKED_SCANLINEStexture_size global.MASKED_SCANLINESSize.xy +#define MASKED_SCANLINESvideo_size global.MASKED_SCANLINESSize.xy +#define BLOOM_APPROXtexture BLOOM_APPROX +#define BLOOM_APPROXtexture_size global.BLOOM_APPROXSize.xy +#define BLOOM_APPROXvideo_size global.BLOOM_APPROXSize.xy + +float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); +const float bloom_diff_thresh_ = 1.0/256.0; ///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// +#include "params.inc" +#include "../../../../include/compat_macros.inc" #include "../user-settings.h" #include "derived-settings-and-constants.h" #include "bind-shader-params.h" -////////////////////////////////// INCLUDES ////////////////////////////////// +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// #include "../../../../include/gamma-management.h" -#include "../../../../include/blur-functions.h" #include "phosphor-mask-resizing.h" #include "scanline-functions.h" -#include "bloom-functions.h" - #pragma stage vertex layout(location = 0) in vec4 Position; layout(location = 1) in vec2 TexCoord; -layout(location = 0) out vec2 video_uv; -layout(location = 1) out vec2 scanline_tex_uv; +layout(location = 0) out vec2 scanline_tex_uv; +layout(location = 1) out vec2 blur3x3_tex_uv; layout(location = 2) out float bloom_sigma_runtime; -layout(location = 3) out vec2 blur3x3_tex_uv; + +// copied from bloom-functions.h +inline float get_min_sigma_to_blur_triad(const float triad_size, + const float thresh) +{ + // Requires: 1.) triad_size is the final phosphor triad size in pixels + // 2.) thresh is the max desired pixel difference in the + // blurred triad (e.g. 1.0/256.0). + // Returns: Return the minimum sigma that will fully blur a phosphor + // triad on the screen to an even color, within thresh. 
+ // This closed-form function was found by curve-fitting data. + // Estimate: max error = ~0.086036, mean sq. error = ~0.0013387: + return -0.05168 + 0.6113*triad_size - + 1.122*triad_size*sqrt(0.000416 + thresh); + // Estimate: max error = ~0.16486, mean sq. error = ~0.0041041: + //return 0.5985*triad_size - triad_size*sqrt(thresh) +} void main() { - gl_Position = params.MVP * Position; - const vec2 tex_uv = TexCoord; + gl_Position = global.MVP * Position; + float2 tex_uv = TexCoord; // Our various input textures use different coords: - video_uv = tex_uv; - scanline_tex_uv = video_uv * registers.MASKED_SCANLINESSize.xy * - registers.MASKED_SCANLINESSize.zw; - blur3x3_tex_uv = video_uv * registers.BLOOM_APPROXSize.xy * registers.BLOOM_APPROXSize.zw; + float2 video_uv = tex_uv * IN.texture_size/IN.video_size; + //video_uv = video_uv; + scanline_tex_uv = video_uv * MASKED_SCANLINESvideo_size / + MASKED_SCANLINEStexture_size; + blur3x3_tex_uv = video_uv * BLOOM_APPROXvideo_size / BLOOM_APPROXtexture_size; // Calculate a runtime bloom_sigma in case it's needed: const float mask_tile_size_x = get_resized_mask_tile_size( - registers.OutputSize.xy, registers.OutputSize.xy * mask_resize_viewport_scale, false).x; + IN.output_size, IN.output_size * mask_resize_viewport_scale, false).x; bloom_sigma_runtime = get_min_sigma_to_blur_triad( - mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh); + mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_); } #pragma stage fragment -#pragma format R8G8B8A8_SRGB -layout(location = 0) in vec2 video_uv; -layout(location = 1) in vec2 scanline_tex_uv; +layout(location = 0) in vec2 scanline_tex_uv; +layout(location = 1) in vec2 blur3x3_tex_uv; layout(location = 2) in float bloom_sigma_runtime; -layout(location = 3) in vec2 blur3x3_tex_uv; layout(location = 0) out vec4 FragColor; -layout(set = 0, binding = 2) uniform sampler2D Source; -layout(set = 0, binding = 3) uniform sampler2D MASKED_SCANLINES; -layout(set = 0, binding = 
4) uniform sampler2D BLOOM_APPROX; +layout(set = 0, binding = 2) uniform sampler2D MASKED_SCANLINES; +layout(set = 0, binding = 3) uniform sampler2D BLOOM_APPROX; + +////////////////////////////// FRAGMENT INCLUDES ////////////////////////////// + +#include "bloom-functions.h" +#include "../../../../include/blur-functions.h" void main() { - // Sample the masked scanlines: - const vec3 intensity_dim = - tex2D_linearize(MASKED_SCANLINES, scanline_tex_uv).rgb; + // Sample the masked scanlines: + const float3 intensity_dim = + tex2D_linearize(MASKED_SCANLINEStexture, scanline_tex_uv).rgb; // Get the full intensity, including auto-undimming, and mask compensation: const float auto_dim_factor = levels_autodim_temp; const float undim_factor = 1.0/auto_dim_factor; const float mask_amplify = get_mask_amplify(); - const vec3 intensity = intensity_dim * undim_factor * mask_amplify * - params.levels_contrast; - - // Sample BLOOM_APPROX to estimate what a straight blur of masked scanlines + const float3 intensity = intensity_dim * undim_factor * mask_amplify * + levels_contrast; + + // Sample BLOOM_APPROX to estimate what a straight blur of masked scanlines // would look like, so we can estimate how much energy we'll receive from // blooming neighbors: - const vec3 phosphor_blur_approx = params.levels_contrast * tex2D_linearize( - BLOOM_APPROX, blur3x3_tex_uv).rgb; - - // Compute the blur weight for the center texel and the maximum energy we + const float3 phosphor_blur_approx = levels_contrast * tex2D_linearize( + BLOOM_APPROXtexture, blur3x3_tex_uv).rgb; + + // Compute the blur weight for the center texel and the maximum energy we // expect to receive from neighbors: const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime); const float center_weight = get_center_weight(bloom_sigma); - const vec3 max_area_contribution_approx = - max(vec3(0.0), phosphor_blur_approx - center_weight * intensity); - - // Assume neighbors will blur 100% of their intensity (blur_ratio = 
1.0), + const float3 max_area_contribution_approx = + max(float3(0.0, 0.0, 0.0), phosphor_blur_approx - center_weight * intensity); + // Assume neighbors will blur 100% of their intensity (blur_ratio = 1.0), // because it actually gets better results (on top of being very simple), // but adjust all intensities for the user's desired underestimate factor: - const vec3 area_contrib_underestimate = - params.bloom_underestimate_levels * max_area_contribution_approx; - const vec3 intensity_underestimate = - params.bloom_underestimate_levels * intensity; - - // Calculate the blur_ratio, the ratio of intensity we want to blur: + const float3 area_contrib_underestimate = + bloom_underestimate_levels * max_area_contribution_approx; + const float3 intensity_underestimate = + bloom_underestimate_levels * intensity; + // Calculate the blur_ratio, the ratio of intensity we want to blur: #ifdef BRIGHTPASS_AREA_BASED // This area-based version changes blur_ratio more smoothly and blurs // more, clipping less but offering less phosphor differentiation: - const vec3 phosphor_blur_underestimate = params.bloom_underestimate_levels * + const float3 phosphor_blur_underestimate = bloom_underestimate_levels * phosphor_blur_approx; - const vec3 soft_intensity = max(intensity_underestimate, + const float3 soft_intensity = max(intensity_underestimate, phosphor_blur_underestimate * mask_amplify); - const vec3 blur_ratio_temp = - ((vec3(1.0) - area_contrib_underestimate) / - soft_intensity - vec3(1.0)) / (center_weight - 1.0); + const float3 blur_ratio_temp = + ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) / + soft_intensity - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0); #else - const vec3 blur_ratio_temp = - ((vec3(1.0) - area_contrib_underestimate) / - intensity_underestimate - vec3(1.0)) / (center_weight - 1.0); + const float3 blur_ratio_temp = + ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) / + intensity_underestimate - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0); 
#endif - - const vec3 blur_ratio = clamp(blur_ratio_temp, 0.0, 1.0); + const float3 blur_ratio = clamp(blur_ratio_temp, 0.0, 1.0); // Calculate the brightpass based on the auto-dimmed, unamplified, masked // scanlines, encode if necessary, and return! - const vec3 brightpass = intensity_dim * - mix(blur_ratio, vec3(1.0), params.bloom_excess); - - FragColor = encode_output(vec4(brightpass, 1.0)); -} + const float3 brightpass = intensity_dim * + lerp(blur_ratio, float3(1.0, 1.0, 1.0), global.bloom_excess); + FragColor = encode_output(float4(brightpass, 1.0)); +} \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang b/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang index 02ec577..24d9c76 100755 --- a/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang +++ b/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang @@ -1,21 +1,5 @@ #version 450 -layout(push_constant) uniform Push -{ - vec4 SourceSize; - uint FrameCount; -} registers; - -layout(std140, set = 0, binding = 0) uniform UBO -{ - mat4 MVP; - float interlace_bff; - float beam_horiz_filter; -} params; - -#pragma parameter interlace_bff "interlace_bff" 1.0 0.0 1.0 1.0 -#pragma parameter beam_horiz_filter "beam_horiz_filter" 0.0 0.0 2.0 1.0 - ///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// // crt-royale: A full-featured CRT shader, with cheese. 
@@ -34,18 +18,75 @@ layout(std140, set = 0, binding = 0) uniform UBO // this program; if not, write to the Free Software Foundation, Inc., 59 Temple // Place, Suite 330, Boston, MA 02111-1307 USA +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float mask_sample_mode_desired; + float mask_num_triads_desired; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; +} global; ///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// // PASS SETTINGS: // gamma-management.h needs to know what kind of pipeline we're using and // what pass this is in that pipeline. This will become obsolete if/when we -// can #define things like this in the preset file. +// can #define things like this in the .cgp preset file. 
#define FIRST_PASS #define SIMULATE_CRT_ON_LCD ////////////////////////////////// INCLUDES ////////////////////////////////// +#include "params.inc" +#include "../../../../include/compat_macros.inc" #include "../user-settings.h" #include "bind-shader-params.h" #include "../../../../include/gamma-management.h" @@ -56,59 +97,60 @@ layout(location = 0) in vec4 Position; layout(location = 1) in vec2 TexCoord; layout(location = 0) out vec2 tex_uv; layout(location = 1) out vec2 uv_step; +layout(location = 2) out float interlaced; void main() { - gl_Position = params.MVP * Position; - tex_uv = TexCoord; + gl_Position = global.MVP * Position; + tex_uv = TexCoord * 1.00001; + uv_step = float2(1.0)/IN.texture_size; - // Save the uv distance between texels: - uv_step = vec2(1.0) * registers.SourceSize.zw; + // Detect interlacing: 1.0 = true, 0.0 = false. + const float2 _video_size = IN.video_size; + interlaced = float(is_interlaced(_video_size.y)); } #pragma stage fragment #pragma format R8G8B8A8_SRGB layout(location = 0) in vec2 tex_uv; layout(location = 1) in vec2 uv_step; +layout(location = 2) in float interlaced; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source void main() { - // Detect interlacing: 1.0 = true, 0.0 = false. - const vec2 video_size = registers.SourceSize.xy; - bool interlaced = is_interlaced(video_size.y); - -// Linearize the input based on CRT gamma and bob interlaced fields. -// Bobbing ensures we can immediately blur without getting artifacts. -// Note: TFF/BFF won't matter for sources that double-weave or similar. -if(interlace_detect == true) + // Linearize the input based on CRT gamma and bob interlaced fields. + // Bobbing ensures we can immediately blur without getting artifacts. + // Note: TFF/BFF won't matter for sources that double-weave or similar. 
+ if(bool(interlace_detect)) { // Sample the current line and an average of the previous/next line; // tex2D_linearize will decode CRT gamma. Don't bother branching: -// const vec2 tex_uv = tex_uv; - const vec2 v_step = vec2(0.0, uv_step.y); - const vec3 curr_line = tex2D_linearize( - Source, tex_uv).rgb; - const vec3 last_line = tex2D_linearize( - Source, tex_uv - v_step).rgb; - const vec3 next_line = tex2D_linearize( - Source, tex_uv + v_step).rgb; - const vec3 interpolated_line = 0.5 * (last_line + next_line); +// const float2 tex_uv = tex_uv; + const float2 v_step = float2(0.0, uv_step.y); + const float3 curr_line = tex2D_linearize( + input_texture, tex_uv).rgb; + const float3 last_line = tex2D_linearize( + input_texture, tex_uv - v_step).rgb; + const float3 next_line = tex2D_linearize( + input_texture, tex_uv + v_step).rgb; + const float3 interpolated_line = 0.5 * (last_line + next_line); // If we're interlacing, determine which field curr_line is in: - const float modulus = float(interlaced) + 1.0; + const float modulus = interlaced + 1.0; const float field_offset = - mod(registers.FrameCount + float(params.interlace_bff), modulus); - const float curr_line_texel = tex_uv.y * registers.SourceSize.y; + fmod(params.frame_count + global.interlace_bff, modulus); + const float curr_line_texel = tex_uv.y * IN.texture_size.y; // Use under_half to fix a rounding bug around exact texel locations. 
const float line_num_last = floor(curr_line_texel - under_half); - const float wrong_field = mod(line_num_last + field_offset, modulus); + const float wrong_field = fmod(line_num_last + field_offset, modulus); // Select the correct color, and output the result: - const vec3 color = mix(curr_line, interpolated_line, wrong_field); - FragColor = encode_output(vec4(color, 1.0)); + const float3 color = lerp(curr_line, interpolated_line, wrong_field); + FragColor = encode_output(float4(color, 1.0)); } else { - FragColor = encode_output(tex2D_linearize(Source, tex_uv)); + FragColor = encode_output(tex2D_linearize(input_texture, tex_uv)); } -} +} \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-backup.slang b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-backup.slang deleted file mode 100755 index 2e4466e..0000000 --- a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-backup.slang +++ /dev/null @@ -1,245 +0,0 @@ -#version 450 - -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; -} registers; - -#include "params.inc" - -///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// - -// crt-royale: A full-featured CRT shader, with cheese. -// Copyright (C) 2014 TroggleMonkey -// -// This program is free software; you can redistribute it and/or modify it -// under the terms of the GNU General Public License as published by the Free -// Software Foundation; either version 2 of the License, or any later version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -// more details. 
-// -// You should have received a copy of the GNU General Public License along with -// this program; if not, write to the Free Software Foundation, Inc., 59 Temple -// Place, Suite 330, Boston, MA 02111-1307 USA - - -///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// - -#define LAST_PASS -#define SIMULATE_CRT_ON_LCD -#include "../user-settings.h" -#include "derived-settings-and-constants.h" -#include "bind-shader-params.h" - -#ifndef DONT_DEFINE //RUNTIME_GEOMETRY_TILT - // Create a local-to-global rotation matrix for the CRT's coordinate frame - // and its global-to-local inverse. See the vertex shader for details. - // It's faster to compute these statically if possible. - const vec2 sin_tilt = sin(geom_tilt_angle_static); - const vec2 cos_tilt = cos(geom_tilt_angle_static); - const mat3x3 geom_local_to_global_static = mat3x3( - cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x, - 0.0, cos_tilt.y, -sin_tilt.y, - -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x); - const mat3x3 geom_global_to_local_static = mat3x3( - cos_tilt.x, 0.0, -sin_tilt.x, - sin_tilt.y*sin_tilt.x, cos_tilt.y, sin_tilt.y*cos_tilt.x, - cos_tilt.y*sin_tilt.x, -sin_tilt.y, cos_tilt.y*cos_tilt.x); -#endif - -////////////////////////////////// INCLUDES ////////////////////////////////// - -#include "../../../../include/gamma-management.h" -#include "tex2Dantialias.h" -#include "geometry-functions.h" - -/////////////////////////////////// HELPERS ////////////////////////////////// - -mat2x2 mul_scale(vec2 scale, mat2x2 matrix) -{ - //mat2x2 scale_matrix = mat2x2(scale.x, 0.0, 0.0, scale.y); - //return (matrix * scale_matrix); - return mat2x2(vec4(matrix[0].xy, matrix[1].xy) * scale.xxyy); -} - -#pragma stage vertex -layout(location = 0) in vec4 Position; -layout(location = 1) in vec2 TexCoord; -layout(location = 0) out vec2 tex_uv; -layout(location = 1) out vec4 video_and_texture_size_inv; -layout(location = 2) out vec2 output_size_inv; 
-layout(location = 3) out vec3 eye_pos_local; -layout(location = 4) out vec4 geom_aspect_and_overscan; -#ifdef RUNTIME_GEOMETRY_TILT -layout(location = 5) out vec3 global_to_local_row0; -layout(location = 6) out vec3 global_to_local_row1; -layout(location = 7) out vec3 global_to_local_row2; -#endif - -void main() -{ - gl_Position = params.MVP * Position; - tex_uv = TexCoord; - - video_and_texture_size_inv = vec4(registers.SourceSize.zw, registers.SourceSize.zw); - output_size_inv = registers.OutputSize.zw; - - // Get aspect/overscan vectors from scalar parameters (likely uniforms): - const float viewport_aspect_ratio = registers.OutputSize.x * registers.OutputSize.w; - const vec2 geom_aspect = get_aspect_vector(viewport_aspect_ratio); - const vec2 geom_overscan = get_geom_overscan_vector(); - geom_aspect_and_overscan = vec4(geom_aspect, geom_overscan); - - #ifdef DONT_DEFINE //RUNTIME_GEOMETRY_TILT - // Create a local-to-global rotation matrix for the CRT's coordinate - // frame and its global-to-local inverse. Rotate around the x axis - // first (pitch) and then the y axis (yaw) with yucky Euler angles. - // Positive angles go clockwise around the right-vec and up-vec. - // Runtime shader parameters prevent us from computing these globally, - // but we can still combine the pitch/yaw matrices by hand to cut a - // few instructions. Note that cg matrices fill row1 first, then row2, - // etc. (row-major order). 
- const vec2 geom_tilt_angle = get_geom_tilt_angle_vector(); - const vec2 sin_tilt = sin(geom_tilt_angle); - const vec2 cos_tilt = cos(geom_tilt_angle); - // Conceptual breakdown: - // const mat3x3 rot_x_matrix = mat3x3( - // 1.0, 0.0, 0.0, - // 0.0, cos_tilt.y, -sin_tilt.y, - // 0.0, sin_tilt.y, cos_tilt.y); - // const mat3x3 rot_y_matrix = mat3x3( - // cos_tilt.x, 0.0, sin_tilt.x, - // 0.0, 1.0, 0.0, - // -sin_tilt.x, 0.0, cos_tilt.x); - // const mat3x3 local_to_global = - // rot_x_matrix * rot_y_matrix; - // const mat3x3 global_to_local = - // transpose(local_to_global); - mat3x3 local_to_global = mat3x3( - cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x, - 0.0, cos_tilt.y, -sin_tilt.y, - -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x); - // This is a pure rotation, so transpose = inverse: - mat3x3 global_to_local = transpose(local_to_global); - // Decompose the matrix into 3 vec3's for output: - global_to_local_row0 = vec3(global_to_local[0].xyz); - global_to_local_row1 = vec3(global_to_local[1].xyz); - global_to_local_row2 = vec3(global_to_local[2].xyz); - #else - const mat3x3 global_to_local = geom_global_to_local_static; - const mat3x3 local_to_global = geom_local_to_global_static; - #endif - - // Get an optimal eye position based on geom_view_dist, viewport_aspect, - // and CRT radius/rotation: - #ifdef RUNTIME_GEOMETRY_MODE - geom_mode = params.geom_mode_runtime; - #else - const float geom_mode = geom_mode_static; - #endif - - const vec3 eye_pos_global = get_ideal_global_eye_pos(local_to_global, geom_aspect, geom_mode); - eye_pos_local = eye_pos_global, global_to_local; -} - -#pragma stage fragment -layout(location = 0) in vec2 tex_uv; -layout(location = 1) in vec4 video_and_texture_size_inv; -layout(location = 2) in vec2 output_size_inv; -layout(location = 3) in vec3 eye_pos_local; -layout(location = 4) in vec4 geom_aspect_and_overscan; -#ifdef RUNTIME_GEOMETRY_TILT -layout(location = 5) in vec3 global_to_local_row0; -layout(location = 
6) in vec3 global_to_local_row1; -layout(location = 7) in vec3 global_to_local_row2; -#endif -layout(location = 0) out vec4 FragColor; -layout(set = 0, binding = 2) uniform sampler2D Source; - -void main() -{ - // Localize some parameters: - const vec2 geom_aspect = geom_aspect_and_overscan.xy; - const vec2 geom_overscan = geom_aspect_and_overscan.zw; - const vec2 video_size_inv = video_and_texture_size_inv.xy; - const vec2 texture_size_inv = video_and_texture_size_inv.zw; - - #ifdef RUNTIME_GEOMETRY_TILT - const mat3x3 global_to_local = mat3x3(global_to_local_row0, - global_to_local_row1, global_to_local_row2); - #else - const mat3x3 global_to_local = geom_global_to_local_static; - #endif - #ifdef RUNTIME_GEOMETRY_MODE - geom_mode = params.geom_mode_runtime; - #else - const float geom_mode = geom_mode_static; - #endif - - // Get flat and curved texture coords for the current fragment point sample - // and a pixel_to_tangent_video_uv matrix for transforming pixel offsets: - // video_uv = relative position in video frame, mapped to [0.0, 1.0] range - // tex_uv = relative position in padded texture, mapped to [0.0, 1.0] range - const vec2 flat_video_uv = tex_uv * (registers.SourceSize.xy * video_size_inv); - mat2x2 pixel_to_video_uv; - vec2 video_uv_no_geom_overscan; - if(geom_mode > 0.5) - { - video_uv_no_geom_overscan = - get_curved_video_uv_coords_and_tangent_matrix(flat_video_uv, - eye_pos_local, output_size_inv, geom_aspect, - geom_mode, global_to_local, pixel_to_video_uv); - } - else - { - video_uv_no_geom_overscan = flat_video_uv; - pixel_to_video_uv = mat2x2( - output_size_inv.x, 0.0, 0.0, output_size_inv.y); - } - - // Correct for overscan here (not in curvature code): - const vec2 video_uv = - (video_uv_no_geom_overscan - vec2(0.5))/geom_overscan + vec2(0.5); - const vec2 tex_uv = video_uv * (registers.SourceSize.xy * texture_size_inv); - - // Get a matrix transforming pixel vectors to tex_uv vectors: - const mat2x2 pixel_to_tex_uv = - 
mul_scale(registers.SourceSize.xy * texture_size_inv / - geom_aspect_and_overscan.zw, pixel_to_video_uv); - - // Sample! Skip antialiasing if aa_level < 0.5 or both of these hold: - // 1.) Geometry/curvature isn't used - // 2.) Overscan == vec2(1.0) - // Skipping AA is sharper, but it's only faster with dynamic branches. - const vec2 abs_aa_r_offset = abs(get_aa_subpixel_r_offset()); - bool need_subpixel_aa = true; - if(abs_aa_r_offset.x + abs_aa_r_offset.y < 0.0) need_subpixel_aa = false; - vec3 color; - if(aa_level > 0.5 && (geom_mode > 0.5 || any(notEqual(geom_overscan , vec2(1.0))))) - { - // Sample the input with antialiasing (due to sharp phosphors, etc.): - color = tex2Daa(Source, tex_uv, pixel_to_tex_uv, registers.FrameCount); - } - else if(aa_level > 0.5 && need_subpixel_aa = true) - { - // Sample at each subpixel location: - color = tex2Daa_subpixel_weights_only( - Source, tex_uv, pixel_to_tex_uv); - } - else - { - color = tex2D_linearize(Source, tex_uv).rgb; - } - - // Dim borders and output the final result: - const float border_dim_factor = get_border_dim_factor(video_uv, geom_aspect); - const vec3 final_color = color * border_dim_factor; - - FragColor = encode_output(vec4(final_color, 1.0)); -} \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-intel.slang b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-intel.slang new file mode 100644 index 0000000..46a53e4 --- /dev/null +++ b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-intel.slang @@ -0,0 +1,4 @@ +#version 450 + +#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE +#include "crt-royale-geometry-aa-last-pass.h" \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.h b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.h new file mode 100644 index 0000000..ef99b01 --- /dev/null +++ b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.h @@ -0,0 +1,293 @@ 
+///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float mask_sample_mode_desired; + float mask_num_triads_desired; + float mask_triad_size_desired; + float mask_specify_num_triads; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float 
geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; + vec4 MASKED_SCANLINESSize; + vec4 HALATION_BLURSize; + vec4 BRIGHTPASSSize; +} global; + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +#define LAST_PASS +#define SIMULATE_CRT_ON_LCD +#include "params.inc" +#include "../../../../include/compat_macros.inc" +#include "../user-settings.h" +#include "derived-settings-and-constants.h" +#include "bind-shader-params.h" + +#ifndef RUNTIME_GEOMETRY_TILT + // Create a local-to-global rotation matrix for the CRT's coordinate frame + // and its global-to-local inverse. See the vertex shader for details. + // It's faster to compute these statically if possible. + static const float2 sin_tilt = sin(geom_tilt_angle_static); + static const float2 cos_tilt = cos(geom_tilt_angle_static); + static const float3x3 geom_local_to_global_static = float3x3( + cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x, + 0.0, cos_tilt.y, -sin_tilt.y, + -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x); + static const float3x3 geom_global_to_local_static = float3x3( + cos_tilt.x, 0.0, -sin_tilt.x, + sin_tilt.y*sin_tilt.x, cos_tilt.y, sin_tilt.y*cos_tilt.x, + cos_tilt.y*sin_tilt.x, -sin_tilt.y, cos_tilt.y*cos_tilt.x); +#endif + +////////////////////////////////// INCLUDES ////////////////////////////////// + +#include "../../../../include/gamma-management.h" +#include "tex2Dantialias.h" +#include "geometry-functions.h" + + +/////////////////////////////////// HELPERS ////////////////////////////////// + +float2x2 mul_scale(float2 scale, float2x2 matrix) +{ + //float2x2 scale_matrix = float2x2(scale.x, 0.0, 0.0, scale.y); + //return mul(scale_matrix, matrix); + return 
float2x2(float4(matrix[0][0],matrix[0][1],matrix[1][0],matrix[1][1]) * scale.xxyy); +} + +#pragma stage vertex +layout(location = 0) in vec4 Position; +layout(location = 1) in vec2 TexCoord; +layout(location = 0) out vec2 tex_uv; +layout(location = 1) out vec4 video_and_texture_size_inv; +layout(location = 2) out vec2 output_size_inv; +layout(location = 3) out vec3 eye_pos_local; +layout(location = 4) out vec4 geom_aspect_and_overscan; +layout(location = 5) out vec3 global_to_local_row0; +layout(location = 6) out vec3 global_to_local_row1; +layout(location = 7) out vec3 global_to_local_row2; + +void main() +{ + gl_Position = global.MVP * Position; + tex_uv = TexCoord; + video_and_texture_size_inv = + float4(1.0, 1.0, 1.0, 1.0) / float4(IN.video_size, IN.texture_size); + output_size_inv = float2(1.0, 1.0)/IN.output_size; + + // Get aspect/overscan vectors from scalar parameters (likely uniforms): + const float viewport_aspect_ratio = IN.output_size.x/IN.output_size.y; + const float2 geom_aspect = get_aspect_vector(viewport_aspect_ratio); + const float2 geom_overscan = get_geom_overscan_vector(); + geom_aspect_and_overscan = float4(geom_aspect, geom_overscan); + + #ifdef RUNTIME_GEOMETRY_TILT + // Create a local-to-global rotation matrix for the CRT's coordinate + // frame and its global-to-local inverse. Rotate around the x axis + // first (pitch) and then the y axis (yaw) with yucky Euler angles. + // Positive angles go clockwise around the right-vec and up-vec. + // Runtime shader parameters prevent us from computing these globally, + // but we can still combine the pitch/yaw matrices by hand to cut a + // few instructions. Note that cg matrices fill row1 first, then row2, + // etc. (row-major order). 
+ const float2 geom_tilt_angle = get_geom_tilt_angle_vector(); + const float2 sin_tilt = sin(geom_tilt_angle); + const float2 cos_tilt = cos(geom_tilt_angle); + // Conceptual breakdown: + // static const float3x3 rot_x_matrix = float3x3( + // 1.0, 0.0, 0.0, + // 0.0, cos_tilt.y, -sin_tilt.y, + // 0.0, sin_tilt.y, cos_tilt.y); + // static const float3x3 rot_y_matrix = float3x3( + // cos_tilt.x, 0.0, sin_tilt.x, + // 0.0, 1.0, 0.0, + // -sin_tilt.x, 0.0, cos_tilt.x); + // static const float3x3 local_to_global = + // mul(rot_y_matrix, rot_x_matrix); + // static const float3x3 global_to_local = + // transpose(local_to_global); + const float3x3 local_to_global = float3x3( + cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x, + 0.0, cos_tilt.y, -sin_tilt.y, + -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x); + // This is a pure rotation, so transpose = inverse: + const float3x3 global_to_local = transpose(local_to_global); + // Decompose the matrix into 3 float3's for output: + global_to_local_row0 = float3(global_to_local[0][0], global_to_local[0][1], global_to_local[0][2]);//._m00_m01_m02); + global_to_local_row1 = float3(global_to_local[1][0], global_to_local[1][1], global_to_local[1][2]);//._m10_m11_m12); + global_to_local_row2 = float3(global_to_local[2][0], global_to_local[2][1], global_to_local[2][2]);//._m20_m21_m22); + #else + static const float3x3 global_to_local = geom_global_to_local_static; + static const float3x3 local_to_global = geom_local_to_global_static; + #endif + + // Get an optimal eye position based on geom_view_dist, viewport_aspect, + // and CRT radius/rotation: + #ifdef RUNTIME_GEOMETRY_MODE + const float geom_mode = geom_mode_runtime; + #else + static const float geom_mode = geom_mode_static; + #endif + const float3 eye_pos_global = + get_ideal_global_eye_pos(local_to_global, geom_aspect, geom_mode); + eye_pos_local = mul(global_to_local, eye_pos_global); +} + +#pragma stage fragment +layout(location = 0) in vec2 tex_uv; 
+layout(location = 1) in vec4 video_and_texture_size_inv; +layout(location = 2) in vec2 output_size_inv; +layout(location = 3) in vec3 eye_pos_local; +layout(location = 4) in vec4 geom_aspect_and_overscan; +layout(location = 5) in vec3 global_to_local_row0; +layout(location = 6) in vec3 global_to_local_row1; +layout(location = 7) in vec3 global_to_local_row2; +layout(location = 0) out vec4 FragColor; +layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source + +void main() +{ + // Localize some parameters: + const float2 geom_aspect = geom_aspect_and_overscan.xy; + const float2 geom_overscan = geom_aspect_and_overscan.zw; + const float2 video_size_inv = video_and_texture_size_inv.xy; + const float2 texture_size_inv = video_and_texture_size_inv.zw; + //const float2 output_size_inv = output_size_inv; + #ifdef RUNTIME_GEOMETRY_TILT + const float3x3 global_to_local = float3x3(global_to_local_row0, + global_to_local_row1, global_to_local_row2); + #else + static const float3x3 global_to_local = geom_global_to_local_static; + #endif + #ifdef RUNTIME_GEOMETRY_MODE + const float geom_mode = geom_mode_runtime; + #else + static const float geom_mode = geom_mode_static; + #endif + + // Get flat and curved texture coords for the current fragment point sample + // and a pixel_to_tangent_video_uv matrix for transforming pixel offsets: + // video_uv = relative position in video frame, mapped to [0.0, 1.0] range + // tex_uv = relative position in padded texture, mapped to [0.0, 1.0] range + const float2 flat_video_uv = tex_uv * (IN.texture_size * video_size_inv); + float2x2 pixel_to_video_uv; + float2 video_uv_no_geom_overscan; + if(geom_mode > 0.5) + { + video_uv_no_geom_overscan = + get_curved_video_uv_coords_and_tangent_matrix(flat_video_uv, + eye_pos_local, output_size_inv, geom_aspect, + geom_mode, global_to_local, pixel_to_video_uv); + } + else + { + video_uv_no_geom_overscan = flat_video_uv; + pixel_to_video_uv = float2x2( + output_size_inv.x, 0.0, 
0.0, output_size_inv.y); + } + // Correct for overscan here (not in curvature code): + const float2 video_uv = + (video_uv_no_geom_overscan - float2(0.5, 0.5))/geom_overscan + float2(0.5, 0.5); + const float2 tex_uv = video_uv * (IN.video_size * texture_size_inv); + + // Get a matrix transforming pixel vectors to tex_uv vectors: + const float2x2 pixel_to_tex_uv = + mul_scale(IN.video_size * texture_size_inv / + geom_aspect_and_overscan.zw, pixel_to_video_uv); + + // Sample! Skip antialiasing if aa_level < 0.5 or both of these hold: + // 1.) Geometry/curvature isn't used + // 2.) Overscan == float2(1.0, 1.0) + // Skipping AA is sharper, but it's only faster with dynamic branches. + const float2 abs_aa_r_offset = abs(get_aa_subpixel_r_offset()); + const bool need_subpixel_aa = abs_aa_r_offset.x + abs_aa_r_offset.y > 0.0; + float3 color; +/* //TODO/FIXME: This block is what causes the black screen when geom_mode >= 1.0 + if(aa_level > 0.5 && (geom_mode > 0.5 || any(bool2((geom_overscan.x != 1.0), (geom_overscan.y != 1.0))))) + { + // Sample the input with antialiasing (due to sharp phosphors, etc.): + color = tex2Daa(input_texture, tex_uv, pixel_to_tex_uv, float(IN.frame_count)); + } + + else */if(aa_level > 0.5 && need_subpixel_aa) + { + // Sample at each subpixel location: + color = tex2Daa_subpixel_weights_only( + input_texture, tex_uv, pixel_to_tex_uv); + } + else + { + color = tex2D_linearize(input_texture, tex_uv).rgb; + } + + // Dim borders and output the final result: + const float border_dim_factor = get_border_dim_factor(video_uv, geom_aspect); + const float3 final_color = color * border_dim_factor; + + FragColor = encode_output(float4(final_color, 1.0)); +} \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang index 1a0fef1..18cd6e3 100755 --- a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang +++ 
b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang @@ -1,43 +1,3 @@ #version 450 -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; -} params; - -layout(std140, set = 0, binding = 0) uniform UBO -{ - mat4 MVP; -} global; - -#define LAST_PASS -#define SIMULATE_CRT_ON_LCD -#include "../user-settings.h" -#include "derived-settings-and-constants.h" -#include "bind-shader-params.h" - -#include "../../../../include/gamma-management.h" - -#pragma stage vertex -layout(location = 0) in vec4 Position; -layout(location = 1) in vec2 TexCoord; -layout(location = 0) out vec2 vTexCoord; - -void main() -{ - gl_Position = global.MVP * Position; - vTexCoord = TexCoord; -} - -#pragma stage fragment -layout(location = 0) in vec2 vTexCoord; -layout(location = 0) out vec4 FragColor; -layout(set = 0, binding = 2) uniform sampler2D Source; - -void main() -{ - FragColor = encode_output(vec4(texture(Source, vTexCoord).rgb, 1.0)); -} \ No newline at end of file +#include "crt-royale-geometry-aa-last-pass.h" \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang b/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang index 0fcdc2f..10f6235 100755 --- a/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang +++ b/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang @@ -1,15 +1,5 @@ #version 450 -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; -} registers; - -#include "params.inc" - ///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// // crt-royale: A full-featured CRT shader, with cheese. 
@@ -28,9 +18,68 @@ layout(push_constant) uniform Push // this program; if not, write to the Free Software Foundation, Inc., 59 Temple // Place, Suite 330, Boston, MA 02111-1307 USA +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float mask_sample_mode_desired; + float mask_num_triads_desired; + float mask_triad_size_desired; + float mask_specify_num_triads; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; +} global; ///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// +#include "params.inc" +#include "../../../../include/compat_macros.inc" #include "../user-settings.h" #include "derived-settings-and-constants.h" #include "bind-shader-params.h" @@ -49,42 +98,45 @@ layout(location = 2) out vec2 resize_magnification_scale; layout(location = 3) out vec2 src_dxdy; 
layout(location = 4) out vec2 tile_size_uv; layout(location = 5) out vec2 input_tiles_per_texture; -layout(location = 6) out vec2 tex_uv; void main() { - gl_Position = params.MVP * Position; - tex_uv = TexCoord; - - // First estimate the viewport size (the user will get the wrong number of + gl_Position = global.MVP * Position; + float2 tex_uv = TexCoord.xy; + // First estimate the viewport size (the user will get the wrong number of // triads if it's wrong and mask_specify_num_triads is 1.0/true). - const vec2 estimated_viewport_size = - registers.OutputSize.xy / mask_resize_viewport_scale; - // Find the final size of our resized phosphor mask tiles. We probably + const float2 estimated_viewport_size = + IN.output_size / mask_resize_viewport_scale; + // Find the final size of our resized phosphor mask tiles. We probably // estimated the viewport size and MASK_RESIZE output size differently last // pass, so do not swear they were the same. ;) - const vec2 mask_resize_tile_size = get_resized_mask_tile_size( - estimated_viewport_size, registers.OutputSize.xy, false); - - // We'll render resized tiles until filling the output FBO or meeting a + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + estimated_viewport_size, IN.output_size, false); + + // We'll render resized tiles until filling the output FBO or meeting a // limit, so compute [wrapped] tile uv coords based on the output uv coords // and the number of tiles that will fit in the FBO.
- const vec2 output_tiles_this_pass = registers.OutputSize.xy / mask_resize_tile_size; - const vec2 output_video_uv = tex_uv; - tile_uv_wrap = output_video_uv * output_tiles_this_pass; - - // Get the texel size of an input tile and related values: - const vec2 input_tile_size = vec2(min( - mask_resize_src_lut_size.x, registers.SourceSize.x), mask_resize_tile_size.y); - tile_size_uv = input_tile_size * registers.SourceSize.zw; - input_tiles_per_texture = registers.SourceSize.xy / input_tile_size; - - // Derive [wrapped] texture uv coords from [wrapped] tile uv coords and + const float2 output_tiles_this_pass = IN.output_size / mask_resize_tile_size; + const float2 output_video_uv = tex_uv * IN.texture_size / IN.video_size; + tile_uv_wrap = output_video_uv * output_tiles_this_pass; // assign the output varying read by the fragment stage; declaring a local here would shadow it + + // Get the texel size of an input tile and related values: + const float2 input_tile_size = float2(min( + mask_resize_src_lut_size.x, IN.video_size.x), mask_resize_tile_size.y); + tile_size_uv = input_tile_size / IN.texture_size; + input_tiles_per_texture = IN.texture_size / input_tile_size; + + // Derive [wrapped] texture uv coords from [wrapped] tile uv coords and // the tile size in uv coords, and save frac() for the fragment shader.
src_tex_uv_wrap = tile_uv_wrap * tile_size_uv; - - resize_magnification_scale = mask_resize_tile_size / input_tile_size; - src_dxdy = vec2(registers.SourceSize.z, 0.0); + + // Output the values we need, including the magnification scale and step: + //tile_uv_wrap = tile_uv_wrap; + //src_tex_uv_wrap = src_tex_uv_wrap; + resize_magnification_scale = mask_resize_tile_size / input_tile_size; + src_dxdy = float2(1.0/IN.texture_size.x, 0.0); + //tile_size_uv = tile_size_uv; + //input_tiles_per_texture = input_tiles_per_texture; } #pragma stage fragment @@ -94,9 +146,9 @@ layout(location = 2) in vec2 resize_magnification_scale; layout(location = 3) in vec2 src_dxdy; layout(location = 4) in vec2 tile_size_uv; layout(location = 5) in vec2 input_tiles_per_texture; -layout(location = 6) in vec2 tex_uv; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source void main() { @@ -108,17 +160,17 @@ void main() // easier tiled sampling later. #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE // Discard unneeded fragments in case our profile allows real branches.
-// const vec2 tile_uv_wrap = tile_uv_wrap; - if(params.mask_sample_mode_desired < 0.5 && + const float2 tile_uv_wrap = tile_uv_wrap; + if(get_mask_sample_mode() < 0.5 && max(tile_uv_wrap.x, tile_uv_wrap.y) <= mask_resize_num_tiles) { const float src_dx = src_dxdy.x; - const vec2 src_tex_uv = fract(src_tex_uv_wrap); - const vec3 pixel_color = downsample_horizontal_sinc_tiled(Source, - src_tex_uv, registers.SourceSize.xy, src_dxdy.x, + const float2 src_tex_uv = frac(src_tex_uv_wrap); + const float3 pixel_color = downsample_horizontal_sinc_tiled(input_texture, + src_tex_uv, IN.texture_size, src_dxdy.x, resize_magnification_scale.x, tile_size_uv.x); // The input LUT was linear RGB, and so is our output: - FragColor = vec4(pixel_color, 1.0); + FragColor = float4(pixel_color, 1.0); } else { @@ -126,6 +178,6 @@ void main() } #else discard; - FragColor = vec4(1.0); + FragColor = float4(1.0); #endif } \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang b/crt/shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang index 4946536..5b57779 100755 --- a/crt/shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang +++ b/crt/shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang @@ -1,15 +1,5 @@ #version 450 -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; -} registers; - -#include "params.inc" - ///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// // crt-royale: A full-featured CRT shader, with cheese. 
@@ -28,9 +18,68 @@ layout(push_constant) uniform Push // this program; if not, write to the Free Software Foundation, Inc., 59 Temple // Place, Suite 330, Boston, MA 02111-1307 USA +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float mask_sample_mode_desired; + float mask_num_triads_desired; + float mask_triad_size_desired; + float mask_specify_num_triads; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; +} global; ///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// +#include "params.inc" +#include "../../../../include/compat_macros.inc" #include "../user-settings.h" #include "derived-settings-and-constants.h" #include "bind-shader-params.h" @@ -42,43 +91,41 @@ layout(push_constant) uniform Push #pragma stage vertex layout(location = 0) in vec4 Position; 
layout(location = 1) in vec2 TexCoord; -layout(location = 0) out vec2 tex_uv; -layout(location = 1) out vec2 src_tex_uv_wrap; -layout(location = 2) out vec2 resize_magnification_scale; +layout(location = 0) out vec2 src_tex_uv_wrap; +layout(location = 1) out vec2 resize_magnification_scale; void main() { - gl_Position = params.MVP * Position; - tex_uv = TexCoord; - - // First estimate the viewport size (the user will get the wrong number of + gl_Position = global.MVP * Position; + float2 tex_uv = TexCoord; + // First estimate the viewport size (the user will get the wrong number of // triads if it's wrong and mask_specify_num_triads is 1.0/true). - const float viewport_y = registers.OutputSize.y / mask_resize_viewport_scale.y; + const float viewport_y = IN.output_size.y / mask_resize_viewport_scale.y; const float aspect_ratio = geom_aspect_ratio_x / geom_aspect_ratio_y; - const vec2 estimated_viewport_size = - vec2(viewport_y * aspect_ratio, viewport_y); + const float2 estimated_viewport_size = + float2(viewport_y * aspect_ratio, viewport_y); // Estimate the output size of MASK_RESIZE (the next pass). The estimated // x component shouldn't matter, because we're not using the x result, and // we're not swearing it's correct (if we did, the x result would influence // the y result to maintain the tile aspect ratio). 
- const vec2 estimated_mask_resize_output_size = - vec2(registers.OutputSize.y * aspect_ratio, registers.OutputSize.y); + const float2 estimated_mask_resize_output_size = + float2(IN.output_size.y * aspect_ratio, IN.output_size.y); // Find the final intended [y] size of our resized phosphor mask tiles, // then the tile size for the current pass (resize y only): - const vec2 mask_resize_tile_size = get_resized_mask_tile_size( + float2 mask_resize_tile_size = get_resized_mask_tile_size( estimated_viewport_size, estimated_mask_resize_output_size, false); - const vec2 pass_output_tile_size = vec2(min( - mask_resize_src_lut_size.x, registers.OutputSize.x), mask_resize_tile_size.y); + float2 pass_output_tile_size = float2(min( + mask_resize_src_lut_size.x, IN.output_size.x), mask_resize_tile_size.y); // We'll render resized tiles until filling the output FBO or meeting a // limit, so compute [wrapped] tile uv coords based on the output uv coords // and the number of tiles that will fit in the FBO. - const vec2 output_tiles_this_pass = registers.OutputSize.xy / pass_output_tile_size; - const vec2 output_video_uv = tex_uv; - const vec2 tile_uv_wrap = output_video_uv * output_tiles_this_pass; + const float2 output_tiles_this_pass = IN.output_size / pass_output_tile_size; + const float2 output_video_uv = tex_uv * IN.texture_size / IN.video_size; + const float2 tile_uv_wrap = output_video_uv * output_tiles_this_pass; // The input LUT is just a single mask tile, so texture uv coords are the - // same as tile uv coords (save fract() for the fragment shader). The + // same as tile uv coords (save frac() for the fragment shader). 
The // magnification scale is also straightforward: src_tex_uv_wrap = tile_uv_wrap; resize_magnification_scale = @@ -86,69 +133,19 @@ void main() } #pragma stage fragment -layout(location = 0) in vec2 tex_uv; -layout(location = 1) in vec2 src_tex_uv_wrap; -layout(location = 2) in vec2 resize_magnification_scale; +layout(location = 0) in vec2 src_tex_uv_wrap; +layout(location = 1) in vec2 resize_magnification_scale; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; #ifdef PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT -layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_large; -layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_large; -layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_large; - -void main() -{ - // Resize the input phosphor mask tile to the final vertical size it will - // appear on screen. Keep 1x horizontal size if possible (IN.output_size - // >= mask_resize_src_lut_size), and otherwise linearly sample horizontally - // to fit exactly one tile. Lanczos-resizing the phosphor mask achieves - // much sharper results than mipmapping, and vertically resizing first - // minimizes the total number of taps required. We output a number of - // resized tiles >= mask_resize_num_tiles for easier tiled sampling later. - #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE - // Discard unneeded fragments in case our profile allows real branches. - const vec2 tile_uv_wrap = src_tex_uv_wrap; - if(params.mask_sample_mode_desired < 0.5 && - tile_uv_wrap.y <= mask_resize_num_tiles) - { - const float src_dy = 1.0/mask_resize_src_lut_size.y; - const vec2 src_tex_uv = fract(src_tex_uv_wrap); - vec3 pixel_color; - // If mask_type is static, this branch will be resolved statically. 
- if(params.mask_type < 0.5) - { - pixel_color = downsample_vertical_sinc_tiled( - mask_grille_texture_large, src_tex_uv, mask_resize_src_lut_size, - src_dy, resize_magnification_scale.y, 1.0); - } - else if(params.mask_type < 1.5) - { - pixel_color = downsample_vertical_sinc_tiled( - mask_slot_texture_large, src_tex_uv, mask_resize_src_lut_size, - src_dy, resize_magnification_scale.y, 1.0); - } - else - { - pixel_color = downsample_vertical_sinc_tiled( - mask_shadow_texture_large, src_tex_uv, mask_resize_src_lut_size, - src_dy, resize_magnification_scale.y, 1.0); - } - // The input LUT was linear RGB, and so is our output: - FragColor = vec4(pixel_color, 1.0); - } - else - { - discard; - } - #else - discard; - FragColor = vec4(1.0); - #endif -} + layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_large; + layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_large; + layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_large; #else -layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_small; -layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_small; -layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_small; + layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_small; + layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_small; + layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_small; +#endif void main() { @@ -159,36 +156,58 @@ void main() // much sharper results than mipmapping, and vertically resizing first // minimizes the total number of taps required. We output a number of // resized tiles >= mask_resize_num_tiles for easier tiled sampling later. + //const float2 src_tex_uv_wrap = src_tex_uv_wrap; #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE // Discard unneeded fragments in case our profile allows real branches. 
- const vec2 tile_uv_wrap = src_tex_uv_wrap; - if(params.mask_sample_mode_desired < 0.5 && + const float2 tile_uv_wrap = src_tex_uv_wrap; + if(get_mask_sample_mode() < 0.5 && tile_uv_wrap.y <= mask_resize_num_tiles) { - const float src_dy = 1.0/mask_resize_src_lut_size.y; - const vec2 src_tex_uv = fract(src_tex_uv_wrap); - vec3 pixel_color; + static const float src_dy = 1.0/mask_resize_src_lut_size.y; + const float2 src_tex_uv = frac(src_tex_uv_wrap); + float3 pixel_color; // If mask_type is static, this branch will be resolved statically. - if(params.mask_type < 0.5) - { - pixel_color = downsample_vertical_sinc_tiled( - mask_grille_texture_small, src_tex_uv, mask_resize_src_lut_size, - src_dy, resize_magnification_scale.y, 1.0); - } - else if(params.mask_type < 1.5) - { - pixel_color = downsample_vertical_sinc_tiled( - mask_slot_texture_small, src_tex_uv, mask_resize_src_lut_size, - src_dy, resize_magnification_scale.y, 1.0); - } - else - { - pixel_color = downsample_vertical_sinc_tiled( - mask_shadow_texture_small, src_tex_uv, mask_resize_src_lut_size, - src_dy, resize_magnification_scale.y, 1.0); - } + #ifdef PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + if(mask_type < 0.5) + { + pixel_color = downsample_vertical_sinc_tiled( + mask_grille_texture_large, src_tex_uv, mask_resize_src_lut_size, + src_dy, resize_magnification_scale.y, 1.0); + } + else if(mask_type < 1.5) + { + pixel_color = downsample_vertical_sinc_tiled( + mask_slot_texture_large, src_tex_uv, mask_resize_src_lut_size, + src_dy, resize_magnification_scale.y, 1.0); + } + else + { + pixel_color = downsample_vertical_sinc_tiled( + mask_shadow_texture_large, src_tex_uv, mask_resize_src_lut_size, + src_dy, resize_magnification_scale.y, 1.0); + } + #else + if(mask_type < 0.5) + { + pixel_color = downsample_vertical_sinc_tiled( + mask_grille_texture_small, src_tex_uv, mask_resize_src_lut_size, + src_dy, resize_magnification_scale.y, 1.0); + } + else if(mask_type < 1.5) + { + pixel_color = 
downsample_vertical_sinc_tiled( + mask_slot_texture_small, src_tex_uv, mask_resize_src_lut_size, + src_dy, resize_magnification_scale.y, 1.0); + } + else + { + pixel_color = downsample_vertical_sinc_tiled( + mask_shadow_texture_small, src_tex_uv, mask_resize_src_lut_size, + src_dy, resize_magnification_scale.y, 1.0); + } + #endif // The input LUT was linear RGB, and so is our output: - FragColor = vec4(pixel_color, 1.0); + FragColor = float4(pixel_color, 1.0); } else { @@ -196,7 +215,6 @@ void main() } #else discard; - FragColor = vec4(1.0); - #endif -} -#endif \ No newline at end of file + FragColor = float4(1.0); + #endif +} \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask-intel.slang b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask-intel.slang new file mode 100644 index 0000000..b38d831 --- /dev/null +++ b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask-intel.slang @@ -0,0 +1,4 @@ +#version 450 + +#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE +#include "crt-royale-scanlines-horizontal-apply-mask.h" diff --git a/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.h b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.h new file mode 100644 index 0000000..459a80a --- /dev/null +++ b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.h @@ -0,0 +1,364 @@ +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float mask_sample_mode_desired; + float mask_num_triads_desired; + float mask_triad_size_desired; + float mask_specify_num_triads; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; + vec4 VERTICAL_SCANLINESSize; + vec4 BLOOM_APPROXSize; + vec4 HALATION_BLURSize; + vec4 
MASK_RESIZESize; +} global; + +#define VERTICAL_SCANLINEStexture VERTICAL_SCANLINES +#define VERTICAL_SCANLINEStexture_size global.VERTICAL_SCANLINESSize.xy +#define VERTICAL_SCANLINESvideo_size global.VERTICAL_SCANLINESSize.xy +#define BLOOM_APPROXtexture BLOOM_APPROX +#define BLOOM_APPROXtexture_size global.BLOOM_APPROXSize.xy +#define BLOOM_APPROXvideo_size global.BLOOM_APPROXSize.xy +#define HALATION_BLURtexture HALATION_BLUR +#define HALATION_BLURtexture_size global.HALATION_BLURSize.xy +#define HALATION_BLURvideo_size global.HALATION_BLURSize.xy +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #define MASK_RESIZEtexture Source +#else + #define MASK_RESIZEtexture MASK_RESIZE +#endif +#define MASK_RESIZEtexture_size global.MASK_RESIZESize.xy +#define MASK_RESIZEvideo_size global.MASK_RESIZESize.xy + +float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y; +const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0); + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +#include "params.inc" +#include "../../../../include/compat_macros.inc" +#include "../user-settings.h" +#include "derived-settings-and-constants.h" +#include "bind-shader-params.h" + + +/////////////////////////////// VERTEX INCLUDES /////////////////////////////// + +#include "scanline-functions.h" +#include "phosphor-mask-resizing.h" +#include "../../../../include/gamma-management.h" + +/////////////////////////////////// HELPERS ////////////////////////////////// + +inline float4 tex2Dtiled_mask_linearize(const sampler2D tex, + const float2 tex_uv) +{ + // If we're manually tiling a texture, anisotropic filtering can get + // confused. One workaround is to just select the lowest mip level: + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + // TODO: Use tex2Dlod_linearize with a calculated mip level. 
+ return tex2Dlod_linearize(tex, float4(tex_uv, 0.0, 0.0)); + #else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + return tex2Dbias_linearize(tex, float4(tex_uv, 0.0, -16.0)); + #else + return tex2D_linearize(tex, tex_uv); + #endif + #endif + #else + return tex2D_linearize(tex, tex_uv); + #endif +} + +#pragma stage vertex +layout(location = 0) in vec4 Position; +layout(location = 1) in vec2 TexCoord; +layout(location = 0) out vec2 video_uv; +layout(location = 1) out vec2 scanline_tex_uv; +layout(location = 2) out vec2 blur3x3_tex_uv; +layout(location = 3) out vec2 halation_tex_uv; +layout(location = 4) out vec2 scanline_texture_size_inv; +layout(location = 5) out vec4 mask_tile_start_uv_and_size; +layout(location = 6) out vec2 mask_tiles_per_screen; + +void main() +{ + gl_Position = global.MVP * Position; + float2 tex_uv = TexCoord; + // Our various input textures use different coords. + video_uv = tex_uv * IN.texture_size/IN.video_size; + scanline_texture_size_inv = + float2(1.0, 1.0)/VERTICAL_SCANLINEStexture_size; + //video_uv = video_uv; + scanline_tex_uv = video_uv * VERTICAL_SCANLINESvideo_size * + scanline_texture_size_inv; + blur3x3_tex_uv = video_uv * BLOOM_APPROXvideo_size / + BLOOM_APPROXtexture_size; + halation_tex_uv = video_uv * HALATION_BLURvideo_size / + HALATION_BLURtexture_size; + //scanline_texture_size_inv = scanline_texture_size_inv; + + // Get a consistent name for the final mask texture size. Sample mode 0 + // uses the manually resized mask, but ignore it if we never resized. + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_texture_size = mask_sample_mode < 0.5 ? + MASK_RESIZEtexture_size : mask_texture_large_size; + const float2 mask_resize_video_size = mask_sample_mode < 0.5 ? 
+ MASK_RESIZEvideo_size : mask_texture_large_size; + #else + const float2 mask_resize_texture_size = mask_texture_large_size; + const float2 mask_resize_video_size = mask_texture_large_size; + #endif + // Compute mask tile dimensions, starting points, etc.: + //float2 mask_tiles_per_screen; + mask_tile_start_uv_and_size = get_mask_sampling_parameters( + mask_resize_texture_size, mask_resize_video_size, IN.output_size, + mask_tiles_per_screen); + //mask_tiles_per_screen = mask_tiles_per_screen; +} + +#pragma stage fragment +layout(location = 0) in vec2 video_uv; +layout(location = 1) in vec2 scanline_tex_uv; +layout(location = 2) in vec2 blur3x3_tex_uv; +layout(location = 3) in vec2 halation_tex_uv; +layout(location = 4) in vec2 scanline_texture_size_inv; +layout(location = 5) in vec4 mask_tile_start_uv_and_size; +layout(location = 6) in vec2 mask_tiles_per_screen; +layout(location = 0) out vec4 FragColor; +layout(set = 0, binding = 2) uniform sampler2D Source; +layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_large; +layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_large; +layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_large; +layout(set = 0, binding = 6) uniform sampler2D VERTICAL_SCANLINES; +layout(set = 0, binding = 7) uniform sampler2D BLOOM_APPROX; +layout(set = 0, binding = 8) uniform sampler2D HALATION_BLUR; +#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE +layout(set = 0, binding = 9) uniform sampler2D MASK_RESIZE; +#endif + +////////////////////////////// FRAGMENT INCLUDES ////////////////////////////// + +#include "bloom-functions.h" + +void main() +{ + // This pass: Sample (misconverged?) scanlines to the final horizontal + // resolution, apply halation (bouncing electrons), and apply the phosphor + // mask. Fake a bloom if requested. Unless we fake a bloom, the output + // will be dim from the scanline auto-dim, mask dimming, and low gamma. 
+ + // Horizontally sample the current row (a vertically interpolated scanline) + // and account for horizontal convergence offsets, given in units of texels. + const float3 scanline_color_dim = sample_rgb_scanline_horizontal( + VERTICAL_SCANLINEStexture, scanline_tex_uv, + VERTICAL_SCANLINEStexture_size, scanline_texture_size_inv); + const float auto_dim_factor = levels_autodim_temp; + + // Sample the phosphor mask: + const float2 tile_uv_wrap = video_uv * mask_tiles_per_screen; + const float2 mask_tex_uv = convert_phosphor_tile_uv_wrap_to_tex_uv( + tile_uv_wrap, mask_tile_start_uv_and_size); + float3 phosphor_mask_sample; + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + const bool sample_orig_luts = get_mask_sample_mode() > 0.5; + #else + static const bool sample_orig_luts = true; + #endif + if(sample_orig_luts) + { + // If mask_type is static, this branch will be resolved statically. + if(mask_type < 0.5) + { + phosphor_mask_sample = tex2D_linearize( + mask_grille_texture_large, mask_tex_uv).rgb; + } + else if(mask_type < 1.5) + { + phosphor_mask_sample = tex2D_linearize( + mask_slot_texture_large, mask_tex_uv).rgb; + } + else + { + phosphor_mask_sample = tex2D_linearize( + mask_shadow_texture_large, mask_tex_uv).rgb; + } + } + else + { + // Sample the resized mask, and avoid tiling artifacts: + phosphor_mask_sample = tex2Dtiled_mask_linearize( + MASK_RESIZEtexture, mask_tex_uv).rgb; + } + + // Sample the halation texture (auto-dim to match the scanlines), and + // account for both horizontal and vertical convergence offsets, given + // in units of texels horizontally and same-field scanlines vertically: + const float3 halation_color = tex2D_linearize( + HALATION_BLURtexture, halation_tex_uv).rgb; + + // Apply halation: Halation models electrons flying around under the glass + // and hitting the wrong phosphors (of any color). It desaturates, so + // average the halation electrons to a scalar. Reduce the local scanline + // intensity accordingly to conserve energy. 
+ const float3 halation_intensity_dim = + float3(dot(halation_color, float3(auto_dim_factor/3.0))); + const float3 electron_intensity_dim = lerp(scanline_color_dim, + halation_intensity_dim, global.halation_weight); + + // Apply the phosphor mask: + const float3 phosphor_emission_dim = electron_intensity_dim * + phosphor_mask_sample; + + #ifdef PHOSPHOR_BLOOM_FAKE + // The BLOOM_APPROX pass approximates a blurred version of a masked + // and scanlined image. It's usually used to compute the brightpass, + // but we can also use it to fake the bloom stage entirely. Caveats: + // 1.) A fake bloom is conceptually different, since we're mixing in a + // fully blurred low-res image, and the biggest implication are: + // 2.) If mask_amplify is incorrect, results deteriorate more quickly. + // 3.) The inaccurate blurring hurts quality in high-contrast areas. + // 4.) The bloom_underestimate_levels parameter seems less sensitive. + // Reverse the auto-dimming and amplify to compensate for mask dimming: + #define PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND + #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND + static const float blur_contrast = 1.05; + #else + static const float blur_contrast = 1.0; + #endif + const float mask_amplify = get_mask_amplify(); + const float undim_factor = 1.0/auto_dim_factor; + const float3 phosphor_emission = + phosphor_emission_dim * undim_factor * mask_amplify; + // Get a phosphor blur estimate, accounting for convergence offsets: + const float3 electron_intensity = electron_intensity_dim * undim_factor; + const float3 phosphor_blur_approx_soft = tex2D_linearize( + BLOOM_APPROXtexture, blur3x3_tex_uv).rgb; + const float3 phosphor_blur_approx = lerp(phosphor_blur_approx_soft, + electron_intensity, 0.1) * blur_contrast; + // We could blend between phosphor_emission and phosphor_blur_approx, + // solving for the minimum blend_ratio that avoids clipping past 1.0: + // 1.0 >= total_intensity + // 1.0 >= phosphor_emission * (1.0 - blend_ratio) + + // 
phosphor_blur_approx * blend_ratio + // blend_ratio = (phosphor_emission - 1.0)/ + // (phosphor_emission - phosphor_blur_approx); + // However, this blurs far more than necessary, because it aims for + // full brightness, not minimal blurring. To fix it, base blend_ratio + // on a max area intensity only so it varies more smoothly: + const float3 phosphor_blur_underestimate = + phosphor_blur_approx * bloom_underestimate_levels; + const float3 area_max_underestimate = + phosphor_blur_underestimate * mask_amplify; + #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND + const float3 blend_ratio_temp = + (area_max_underestimate - float3(1.0, 1.0, 1.0)) / + (area_max_underestimate - phosphor_blur_underestimate); + #else + // Try doing it like an area-based brightpass. This is nearly + // identical, but it's worth toying with the code in case I ever + // find a way to make it look more like a real bloom. (I've had + // some promising textures from combining an area-based blend ratio + // for the phosphor blur and a more brightpass-like blend-ratio for + // the phosphor emission, but I haven't found a way to make the + // brightness correct across the whole color range, especially with + // different bloom_underestimate_levels values.) 
+ const float desired_triad_size = lerp(global.mask_triad_size_desired, + IN.output_size.x/global.mask_num_triads_desired, + global.mask_specify_num_triads); + const float bloom_sigma = get_min_sigma_to_blur_triad( + desired_triad_size, bloom_diff_thresh); + const float center_weight = get_center_weight(bloom_sigma); + const float3 max_area_contribution_approx = + max(float3(0.0, 0.0, 0.0), phosphor_blur_approx - + center_weight * phosphor_emission); + const float3 area_contrib_underestimate = + bloom_underestimate_levels * max_area_contribution_approx; + const float3 blend_ratio_temp = + ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) / + area_max_underestimate - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0); + #endif + // Clamp blend_ratio in case it's out-of-range, but be SUPER careful: + // min/max/clamp are BIZARRELY broken with lerp (optimization bug?), + // and this redundant sequence avoids bugs, at least on nVidia cards: + const float3 blend_ratio_clamped = max(clamp(blend_ratio_temp, 0.0, 1.0), 0.0); + const float3 blend_ratio = lerp(blend_ratio_clamped, float3(1.0,1.0,1.0), global.bloom_excess); + // Blend the blurred and unblurred images: + const float3 phosphor_emission_unclipped = + lerp(phosphor_emission, phosphor_blur_approx, blend_ratio); + // Simulate refractive diffusion by reusing the halation sample. + const float3 pixel_color = lerp(phosphor_emission_unclipped, + halation_color, global.diffusion_weight); + #else + const float3 pixel_color = phosphor_emission_dim; + #endif + // Encode if necessary, and output. 
+ FragColor = encode_output(float4(pixel_color, 1.0)); +} \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang index fb2afa8..5303e71 100755 --- a/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang +++ b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang @@ -1,294 +1,2 @@ #version 450 - -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OutputSize; - vec4 MASK_RESIZESize; - vec4 ORIG_LINEARIZEDSize; - vec4 VERTICAL_SCANLINESSize; - vec4 BLOOM_APPROXSize; - vec4 HALATION_BLURSize; -} registers; - -#include "params.inc" - -///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// - -// crt-royale: A full-featured CRT shader, with cheese. -// Copyright (C) 2014 TroggleMonkey -// -// This program is free software; you can redistribute it and/or modify it -// under the terms of the GNU General Public License as published by the Free -// Software Foundation; either version 2 of the License, or any later version. -// -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -// more details. 
-// -// You should have received a copy of the GNU General Public License along with -// this program; if not, write to the Free Software Foundation, Inc., 59 Temple -// Place, Suite 330, Boston, MA 02111-1307 USA - - -///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// - -#include "../user-settings.h" -#include "derived-settings-and-constants.h" -#include "bind-shader-params.h" - - -////////////////////////////////// INCLUDES ////////////////////////////////// - -#include "scanline-functions.h" -#include "phosphor-mask-resizing.h" -#include "bloom-functions.h"//"bloom-functions.h" -#include "../../../../include/gamma-management.h" - -/////////////////////////////////// HELPERS ////////////////////////////////// - -vec4 tex2Dtiled_mask_linearize(const sampler2D tex, - const vec2 tex_uv) -{ - // If we're manually tiling a texture, anisotropic filtering can get - // confused. One workaround is to just select the lowest mip level: - #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE - #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD - // TODO: Use tex2Dlod_linearize with a calculated mip level. - return tex2Dlod_linearize(tex, vec4(tex_uv, 0.0, 0.0)); - #else - #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS - return tex2Dbias_linearize(tex, float4(tex_uv, 0.0, -16.0)); - #else - return tex2D_linearize(tex, tex_uv); - #endif - #endif - #else - return tex2D_linearize(tex, tex_uv); - #endif -} - -#pragma stage vertex -layout(location = 0) in vec4 Position; -layout(location = 1) in vec2 TexCoord; -layout(location = 0) out vec2 video_uv; -layout(location = 1) out vec2 scanline_tex_uv; -layout(location = 2) out vec2 blur3x3_tex_uv; -layout(location = 3) out vec2 halation_tex_uv; -layout(location = 4) out vec2 scanline_texture_size_inv; -layout(location = 5) out vec4 mask_tile_start_uv_and_size; -layout(location = 6) out vec2 mask_tiles_per_screen; - -void main() -{ - gl_Position = params.MVP * Position; - - // Our various input textures use different coords. 
- video_uv = TexCoord; - scanline_texture_size_inv = - registers.VERTICAL_SCANLINESSize.zw; - scanline_tex_uv = video_uv;// * registers.VERTICAL_SCANLINESSize.xy * - scanline_texture_size_inv; - blur3x3_tex_uv = video_uv;// * registers.BLOOM_APPROXSize.xy * - registers.BLOOM_APPROXSize.zw; - halation_tex_uv = video_uv;// * registers.HALATION_BLURSize.xy * - registers.HALATION_BLURSize.zw; - - // Get a consistent name for the final mask texture size. Sample mode 0 - // uses the manually resized mask, but ignore it if we never resized. - #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE - const float mask_sample_mode = params.mask_sample_mode_desired;//get_mask_sample_mode(); - vec2 mask_resize_texture_size = registers.MASK_RESIZESize.xy; - if(mask_sample_mode > 0.5) mask_resize_texture_size = mask_texture_large_size; - vec2 mask_resize_video_size = registers.MASK_RESIZESize.xy; - if(mask_sample_mode > 0.5) mask_resize_video_size = mask_texture_large_size; - #else - const vec2 mask_resize_texture_size = mask_texture_large_size; - const vec2 mask_resize_video_size = mask_texture_large_size; - #endif -// mask_tiles_per_screen = vec2(1280.0, 480.0); - - // Compute mask tile dimensions, starting points, etc.: - mask_tile_start_uv_and_size = get_mask_sampling_parameters( - mask_resize_texture_size, mask_resize_video_size, registers.OutputSize.xy, - mask_tiles_per_screen); -} - -#pragma stage fragment -#pragma format R8G8B8A8_SRGB -layout(location = 0) in vec2 video_uv; -layout(location = 1) in vec2 scanline_tex_uv; -layout(location = 2) in vec2 blur3x3_tex_uv; -layout(location = 3) in vec2 halation_tex_uv; -layout(location = 4) in vec2 scanline_texture_size_inv; -layout(location = 5) in vec4 mask_tile_start_uv_and_size; -layout(location = 6) in vec2 mask_tiles_per_screen; -layout(location = 0) out vec4 FragColor; -layout(set = 0, binding = 2) uniform sampler2D Source; -layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_large; -layout(set = 0, binding = 4) uniform 
sampler2D mask_slot_texture_large; -layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_large; -layout(set = 0, binding = 6) uniform sampler2D VERTICAL_SCANLINES; -layout(set = 0, binding = 7) uniform sampler2D BLOOM_APPROX; -layout(set = 0, binding = 8) uniform sampler2D HALATION_BLUR; -#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE -layout(set = 0, binding = 9) uniform sampler2D MASK_RESIZE; -#endif - -void main() -{ - // This pass: Sample (misconverged?) scanlines to the final horizontal - // resolution, apply halation (bouncing electrons), and apply the phosphor - // mask. Fake a bloom if requested. Unless we fake a bloom, the output - // will be dim from the scanline auto-dim, mask dimming, and low gamma. - - // Horizontally sample the current row (a vertically interpolated scanline) - // and account for horizontal convergence offsets, given in units of texels. - const vec3 scanline_color_dim = sample_rgb_scanline_horizontal( - VERTICAL_SCANLINES, scanline_tex_uv, - registers.VERTICAL_SCANLINESSize.xy, scanline_texture_size_inv); - const float auto_dim_factor = levels_autodim_temp; - - // Sample the phosphor mask: - const vec2 tile_uv_wrap = video_uv * mask_tiles_per_screen; - const vec2 mask_tex_uv = convert_phosphor_tile_uv_wrap_to_tex_uv( - tile_uv_wrap, mask_tile_start_uv_and_size); - vec3 phosphor_mask_sample; - #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE - bool sample_orig_luts = true; - if (params.mask_sample_mode_desired < 0.5) sample_orig_luts = false; - #else - const bool sample_orig_luts = true; - #endif - - if(sample_orig_luts == true) - { - // If mask_type is static, this branch will be resolved statically. 
- if(params.mask_type < 0.5) - { - phosphor_mask_sample = tex2D_linearize( - mask_grille_texture_large, mask_tex_uv).rgb; - } - else if(params.mask_type < 1.5) - { - phosphor_mask_sample = tex2D_linearize( - mask_slot_texture_large, mask_tex_uv).rgb; - } - else - { - phosphor_mask_sample = tex2D_linearize( - mask_shadow_texture_large, mask_tex_uv).rgb; - } - } - else - { - // Sample the resized mask, and avoid tiling artifacts: - phosphor_mask_sample = tex2Dtiled_mask_linearize( - MASK_RESIZE, mask_tex_uv).rgb; - } - - // Sample the halation texture (auto-dim to match the scanlines), and - // account for both horizontal and vertical convergence offsets, given - // in units of texels horizontally and same-field scanlines vertically: - const vec3 halation_color = tex2D_linearize( - HALATION_BLUR, halation_tex_uv).rgb; - - // Apply halation: Halation models electrons flying around under the glass - // and hitting the wrong phosphors (of any color). It desaturates, so - // average the halation electrons to a scalar. Reduce the local scanline - // intensity accordingly to conserve energy. - const vec3 halation_intensity_dim = - vec3(dot(halation_color, vec3(auto_dim_factor/3.0))); - const vec3 electron_intensity_dim = mix(scanline_color_dim, - halation_intensity_dim, params.halation_weight); - - // Apply the phosphor mask: - const vec3 phosphor_emission_dim = electron_intensity_dim * - phosphor_mask_sample; -// #define PHOSPHOR_BLOOM_FAKE // TODO/FIXME: something seems wrong with the non-FAKE path - #ifdef PHOSPHOR_BLOOM_FAKE - // The BLOOM_APPROX pass approximates a blurred version of a masked - // and scanlined image. It's usually used to compute the brightpass, - // but we can also use it to fake the bloom stage entirely. Caveats: - // 1.) A fake bloom is conceptually different, since we're mixing in a - // fully blurred low-res image, and the biggest implication are: - // 2.) If mask_amplify is incorrect, results deteriorate more quickly. - // 3.) 
The inaccurate blurring hurts quality in high-contrast areas. - // 4.) The bloom_underestimate_levels parameter seems less sensitive. - // Reverse the auto-dimming and amplify to compensate for mask dimming: - #define PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND - #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND - const float blur_contrast = 1.05; - #else - const float blur_contrast = 1.0; - #endif - const float mask_amplify = get_mask_amplify(); - const float undim_factor = 1.0/auto_dim_factor; - const vec3 phosphor_emission = - phosphor_emission_dim * undim_factor * mask_amplify; - // Get a phosphor blur estimate, accounting for convergence offsets: - const vec3 electron_intensity = electron_intensity_dim * undim_factor; - const vec3 phosphor_blur_approx_soft = tex2D_linearize( - BLOOM_APPROX, blur3x3_tex_uv).rgb; - const vec3 phosphor_blur_approx = mix(phosphor_blur_approx_soft, - electron_intensity, 0.1) * blur_contrast; - // We could blend between phosphor_emission and phosphor_blur_approx, - // solving for the minimum blend_ratio that avoids clipping past 1.0: - // 1.0 >= total_intensity - // 1.0 >= phosphor_emission * (1.0 - blend_ratio) + - // phosphor_blur_approx * blend_ratio - // blend_ratio = (phosphor_emission - 1.0)/ - // (phosphor_emission - phosphor_blur_approx); - // However, this blurs far more than necessary, because it aims for - // full brightness, not minimal blurring. To fix it, base blend_ratio - // on a max area intensity only so it varies more smoothly: - const vec3 phosphor_blur_underestimate = - phosphor_blur_approx * params.bloom_underestimate_levels; - const vec3 area_max_underestimate = - phosphor_blur_underestimate * mask_amplify; - #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND - const vec3 blend_ratio_temp = - (area_max_underestimate - vec3(1.0)) / - (area_max_underestimate - phosphor_blur_underestimate); - #else - // Try doing it like an area-based brightpass. 
This is nearly - // identical, but it's worth toying with the code in case I ever - // find a way to make it look more like a real bloom. (I've had - // some promising textures from combining an area-based blend ratio - // for the phosphor blur and a more brightpass-like blend-ratio for - // the phosphor emission, but I haven't found a way to make the - // brightness correct across the whole color range, especially with - // different bloom_underestimate_levels values.) - const float desired_triad_size = mix(params.mask_triad_size_desired, - registers.OutputSize.x/params.mask_num_triads_desired, - params.mask_specify_num_triads); - const float bloom_sigma = get_min_sigma_to_blur_triad( - desired_triad_size, bloom_diff_thresh); - const float center_weight = get_center_weight(bloom_sigma); - const vec3 max_area_contribution_approx = - max(vec3(0.0), phosphor_blur_approx - - center_weight * phosphor_emission); - const vec3 area_contrib_underestimate = - params.bloom_underestimate_levels * max_area_contribution_approx; - const vec3 blend_ratio_temp = - ((vec3(1.0) - area_contrib_underestimate) / - area_max_underestimate - vec3(1.0)) / (center_weight - 1.0); - #endif - // Clamp blend_ratio in case it's out-of-range, but be SUPER careful: - // min/max/clamp are BIZARRELY broken with lerp (optimization bug?), - // and this redundant sequence avoids bugs, at least on nVidia cards: - const vec3 blend_ratio_clamped = max(clamp(blend_ratio_temp, 0.0, 1.0), 0.0); - const vec3 blend_ratio = mix(blend_ratio_clamped, vec3(1.0), params.bloom_excess); - // Blend the blurred and unblurred images: - const vec3 phosphor_emission_unclipped = - mix(phosphor_emission, phosphor_blur_approx, blend_ratio); - // Simulate refractive diffusion by reusing the halation sample. 
- const vec3 pixel_color = mix(phosphor_emission_unclipped, - halation_color, params.diffusion_weight); - #else - const vec3 pixel_color = phosphor_emission_dim; - #endif - - FragColor = encode_output(vec4(pixel_color, 1.0)); -} +#include "crt-royale-scanlines-horizontal-apply-mask.h" \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang b/crt/shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang index cfc0e64..423ed38 100755 --- a/crt/shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang +++ b/crt/shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang @@ -1,15 +1,5 @@ #version 450 -layout(push_constant) uniform Push -{ - vec4 SourceSize; - vec4 OriginalSize; - vec4 OutputSize; - uint FrameCount; -} registers; - -#include "params.inc" - ///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// // crt-royale: A full-featured CRT shader, with cheese. @@ -28,10 +18,67 @@ layout(push_constant) uniform Push // this program; if not, write to the Free Software Foundation, Inc., 59 Temple // Place, Suite 330, Boston, MA 02111-1307 USA +layout(push_constant) uniform Push +{ + vec4 SourceSize; + vec4 OriginalSize; + vec4 OutputSize; + uint FrameCount; +} params; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float 
mask_sample_mode_desired; + float mask_num_triads_desired; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; +} global; ////////////////////////////////// INCLUDES ////////////////////////////////// -//#include "../user-settings.h" +#include "params.inc" +#include "../../../../include/compat_macros.inc" +#include "../user-settings.h" #include "derived-settings-and-constants.h" #include "bind-shader-params.h" #include "scanline-functions.h" @@ -41,46 +88,51 @@ layout(push_constant) uniform Push layout(location = 0) in vec4 Position; layout(location = 1) in vec2 TexCoord; layout(location = 0) out vec2 tex_uv; -layout(location = 1) out vec2 uv_step; -layout(location = 2) out vec2 il_step_multiple; -layout(location = 3) out float pixel_height_in_scanlines; +layout(location = 1) out vec2 uv_step; // uv size of a texel (x) and scanline (y) +layout(location = 2) out vec2 il_step_multiple; // (1, 1) = progressive, (1, 2) = interlaced +layout(location = 3) out float pixel_height_in_scanlines; // Height of an output pixel in scanlines +layout(location = 4) out float sigma_range; +layout(location = 5) out float shape_range; void main() { - gl_Position = params.MVP * Position; - tex_uv = TexCoord; + gl_Position = global.MVP * Position; + tex_uv = TexCoord * 1.00001; - // Detect interlacing: il_step_multiple indicates the step multiple between + // Detect interlacing: il_step_multiple indicates the step multiple between // lines: 1 is for progressive sources, and 2 is for interlaced sources. 
- const vec2 video_size = registers.SourceSize.xy; - float interlace_check = is_interlaced(video_size.y) ? 1.0 : 0.0; - const float y_step = 1.0 + interlace_check; - il_step_multiple = vec2(1.0, y_step); + float2 video_size_ = IN.video_size.xy; + const float y_step = 1.0 + float(is_interlaced(video_size_.y)); + il_step_multiple = float2(1.0, y_step); // Get the uv tex coords step between one texel (x) and scanline (y): - uv_step = il_step_multiple * registers.SourceSize.zw; - - // If shader parameters are used, {min, max}_{sigma, shape} are runtime + uv_step = il_step_multiple / IN.texture_size; + + // If shader parameters are used, {min, max}_{sigma, shape} are runtime // values. Compute {sigma, shape}_range outside of scanline_contrib() so // they aren't computed once per scanline (6 times per fragment and up to // 18 times per vertex): - const float sigma_range = max(params.beam_max_sigma, params.beam_min_sigma) - - params.beam_min_sigma; - const float shape_range = max(params.beam_max_shape, params.beam_min_shape) - - params.beam_min_shape; - - // We need the pixel height in scanlines for antialiased/integral sampling: - pixel_height_in_scanlines = (video_size.y * registers.OutputSize.w) / + sigma_range = max(beam_max_sigma, beam_min_sigma) - + beam_min_sigma; // writes the location-4 stage output; a local declaration here would shadow it + shape_range = max(beam_max_shape, beam_min_shape) - + beam_min_shape; // writes the location-5 stage output read by the fragment stage + + // We need the pixel height in scanlines for antialiased/integral sampling: + const float ph = (video_size_.y / IN.output_size.y) / il_step_multiple.y; + pixel_height_in_scanlines = ph; } #pragma stage fragment #pragma format R8G8B8A8_SRGB layout(location = 0) in vec2 tex_uv; -layout(location = 1) in vec2 uv_step; -layout(location = 2) in vec2 il_step_multiple; -layout(location = 3) in float pixel_height_in_scanlines; +layout(location = 1) in vec2 uv_step; // uv size of a texel (x) and scanline (y) +layout(location = 2) in vec2 il_step_multiple; // (1, 1) = progressive, (1, 2) = interlaced +layout(location
= 3) in float pixel_height_in_scanlines; // Height of an output pixel in scanlines +layout(location = 4) in float sigma_range; +layout(location = 5) in float shape_range; layout(location = 0) out vec4 FragColor; layout(set = 0, binding = 2) uniform sampler2D Source; +#define input_texture Source void main() { @@ -88,155 +140,157 @@ void main() // vertical resolution. Temporarily auto-dim the output to avoid clipping. // Read some attributes into local variables: - const vec2 texture_size = registers.SourceSize.xy; - const vec2 texture_size_inv = registers.SourceSize.zw; - const float frame_count = vec2(registers.FrameCount, registers.FrameCount).x; + float2 texture_size_ = IN.texture_size; + float2 texture_size_inv = 1.0/texture_size_; + //const float2 uv_step = uv_step; + //const float2 il_step_multiple = il_step_multiple; + float frame_count = float(IN.frame_count); const float ph = pixel_height_in_scanlines; - - // Get the uv coords of the previous scanline (in this field), and the + + // Get the uv coords of the previous scanline (in this field), and the // scanline's distance from this sample, in scanlines. float dist; - const vec2 scanline_uv = get_last_scanline_uv(tex_uv, texture_size, + const float2 scanline_uv = get_last_scanline_uv(tex_uv, texture_size_, texture_size_inv, il_step_multiple, frame_count, dist); // Consider 2, 3, 4, or 6 scanlines numbered 0-5: The previous and next // scanlines are numbered 2 and 3. Get scanline colors colors (ignore - // horizontal sampling, since registers.OutputSize.x = video_size.x). + // horizontal sampling, since IN.output_size.x = video_size.x). // NOTE: Anisotropic filtering creates interlacing artifacts, which is why // ORIG_LINEARIZED bobbed any interlaced input before this pass.
- const vec2 v_step = vec2(0.0, uv_step.y); - const vec3 scanline2_color = tex2D_linearize(Source, scanline_uv).rgb; - const vec3 scanline3_color = - tex2D_linearize(Source, scanline_uv + v_step).rgb; - vec3 scanline0_color, scanline1_color, scanline4_color, scanline5_color, + const float2 v_step = float2(0.0, uv_step.y); + const float3 scanline2_color = tex2D_linearize(input_texture, scanline_uv).rgb; + const float3 scanline3_color = + tex2D_linearize(input_texture, scanline_uv + v_step).rgb; + float3 scanline0_color, scanline1_color, scanline4_color, scanline5_color, scanline_outside_color; float dist_round; // Use scanlines 0, 1, 4, and 5 for a total of 6 scanlines: - if(params.beam_num_scanlines > 5.5) + if(beam_num_scanlines > 5.5) { scanline1_color = - tex2D_linearize(Source, scanline_uv - v_step).rgb; + tex2D_linearize(input_texture, scanline_uv - v_step).rgb; scanline4_color = - tex2D_linearize(Source, scanline_uv + 2.0 * v_step).rgb; + tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb; scanline0_color = - tex2D_linearize(Source, scanline_uv - 2.0 * v_step).rgb; + tex2D_linearize(input_texture, scanline_uv - 2.0 * v_step).rgb; scanline5_color = - tex2D_linearize(Source, scanline_uv + 3.0 * v_step).rgb; + tex2D_linearize(input_texture, scanline_uv + 3.0 * v_step).rgb; } - // Use scanlines 1, 4, and either 0 or 5 for a total of 5 scanlines: - else if(params.beam_num_scanlines > 4.5) + // Use scanlines 1, 4, and either 0 or 5 for a total of 5 scanlines: + else if(beam_num_scanlines > 4.5) { scanline1_color = - tex2D_linearize(Source, scanline_uv - v_step).rgb; + tex2D_linearize(input_texture, scanline_uv - v_step).rgb; scanline4_color = - tex2D_linearize(Source, scanline_uv + 2.0 * v_step).rgb; + tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb; // dist is in [0, 1] dist_round = round(dist); - const vec2 sample_0_or_5_uv_off = - mix(-2.0 * v_step, 3.0 * v_step, dist_round); + const float2 sample_0_or_5_uv_off = + lerp(-2.0 * v_step, 
3.0 * v_step, dist_round); // Call this "scanline_outside_color" to cope with the conditional // scanline number: scanline_outside_color = tex2D_linearize( - Source, scanline_uv + sample_0_or_5_uv_off).rgb; + input_texture, scanline_uv + sample_0_or_5_uv_off).rgb; } - // Use scanlines 1 and 4 for a total of 4 scanlines: - else if(params.beam_num_scanlines > 3.5) + // Use scanlines 1 and 4 for a total of 4 scanlines: + else if(beam_num_scanlines > 3.5) { scanline1_color = - tex2D_linearize(Source, scanline_uv - v_step).rgb; + tex2D_linearize(input_texture, scanline_uv - v_step).rgb; scanline4_color = - tex2D_linearize(Source, scanline_uv + 2.0 * v_step).rgb; + tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb; } // Use scanline 1 or 4 for a total of 3 scanlines: - else if(params.beam_num_scanlines > 2.5) + else if(beam_num_scanlines > 2.5) { // dist is in [0, 1] dist_round = round(dist); - const vec2 sample_1or4_uv_off = - mix(-v_step, 2.0 * v_step, dist_round); + const float2 sample_1or4_uv_off = + lerp(-v_step, 2.0 * v_step, dist_round); scanline_outside_color = tex2D_linearize( - Source, scanline_uv + sample_1or4_uv_off).rgb; + input_texture, scanline_uv + sample_1or4_uv_off).rgb; } - - // Compute scanline contributions, accounting for vertical convergence. + + // Compute scanline contributions, accounting for vertical convergence. // Vertical convergence offsets are in units of current-field scanlines. 
// dist2 means "positive sample distance from scanline 2, in scanlines:" - vec3 dist2 = vec3(dist); - if(beam_misconvergence == true) + float3 dist2 = float3(dist); + if(beam_misconvergence) { - const vec3 convergence_offsets_vert_rgb = - vec3(params.convergence_offset_y_r, params.convergence_offset_y_g, params.convergence_offset_y_b);//get_convergence_offsets_y_vector(); - dist2 = vec3(dist) - convergence_offsets_vert_rgb; + const float3 convergence_offsets_vert_rgb = + get_convergence_offsets_y_vector(); + dist2 = float3(dist) - convergence_offsets_vert_rgb; } - // Calculate {sigma, shape}_range outside of scanline_contrib so it's only + // Calculate {sigma, shape}_range outside of scanline_contrib so it's only // done once per pixel (not 6 times) with runtime params. Don't reuse the // vertex shader calculations, so static versions can be constant-folded. - const float sigma_range = max(params.beam_max_sigma, params.beam_min_sigma) - - params.beam_min_sigma; - const float shape_range = max(params.beam_max_shape, params.beam_min_shape) - - params.beam_min_shape; - // Calculate and sum final scanline contributions, starting with lines 2/3. + // TODO/FIXME: nvm, use the ones from the vertex /shrug +/* const float sigma_range = max(beam_max_sigma, beam_min_sigma) - + beam_min_sigma; + const float shape_range = max(beam_max_shape, beam_min_shape) - + beam_min_shape;*/ + // Calculate and sum final scanline contributions, starting with lines 2/3. // There is no normalization step, because we're not interpolating a // continuous signal. Instead, each scanline is an additive light source. 
- const vec3 scanline2_contrib = scanline_contrib(dist2, + const float3 scanline2_contrib = scanline_contrib(dist2, scanline2_color, ph, sigma_range, shape_range); - const vec3 scanline3_contrib = scanline_contrib(abs(vec3(1.0) - dist2), + const float3 scanline3_contrib = scanline_contrib(abs(float3(1.0) - dist2), scanline3_color, ph, sigma_range, shape_range); - vec3 scanline_intensity = scanline2_contrib + scanline3_contrib; - - if(params.beam_num_scanlines > 5.5) + float3 scanline_intensity = scanline2_contrib + scanline3_contrib; + if(beam_num_scanlines > 5.5) { - vec3 scanline0_contrib = - scanline_contrib(dist2 + vec3(2.0), scanline0_color, + const float3 scanline0_contrib = + scanline_contrib(dist2 + float3(2.0), scanline0_color, ph, sigma_range, shape_range); - vec3 scanline1_contrib = - scanline_contrib(dist2 + vec3(1.0), scanline1_color, + const float3 scanline1_contrib = + scanline_contrib(dist2 + float3(1.0), scanline1_color, ph, sigma_range, shape_range); - vec3 scanline4_contrib = - scanline_contrib(abs(vec3(2.0) - dist2), scanline4_color, + const float3 scanline4_contrib = + scanline_contrib(abs(float3(2.0) - dist2), scanline4_color, ph, sigma_range, shape_range); - vec3 scanline5_contrib = - scanline_contrib(abs(vec3(3.0) - dist2), scanline5_color, + const float3 scanline5_contrib = + scanline_contrib(abs(float3(3.0) - dist2), scanline5_color, ph, sigma_range, shape_range); scanline_intensity += scanline0_contrib + scanline1_contrib + scanline4_contrib + scanline5_contrib; } - else if(params.beam_num_scanlines > 4.5) + else if(beam_num_scanlines > 4.5) { - vec3 scanline1_contrib = - scanline_contrib(dist2 + vec3(1.0), scanline1_color, + const float3 scanline1_contrib = + scanline_contrib(dist2 + float3(1.0), scanline1_color, ph, sigma_range, shape_range); - vec3 scanline4_contrib = - scanline_contrib(abs(vec3(2.0) - dist2), scanline4_color, + const float3 scanline4_contrib = + scanline_contrib(abs(float3(2.0) - dist2), scanline4_color, ph, 
sigma_range, shape_range); - vec3 dist0or5 = mix( - dist2 + vec3(2.0), vec3(3.0) - dist2, dist_round); - vec3 scanline0or5_contrib = scanline_contrib( + const float3 dist0or5 = lerp( + dist2 + float3(2.0), float3(3.0) - dist2, dist_round); + const float3 scanline0or5_contrib = scanline_contrib( dist0or5, scanline_outside_color, ph, sigma_range, shape_range); scanline_intensity += scanline1_contrib + scanline4_contrib + scanline0or5_contrib; } - else if(params.beam_num_scanlines > 3.5) + else if(beam_num_scanlines > 3.5) { - vec3 scanline1_contrib = - scanline_contrib(dist2 + vec3(1.0), scanline1_color, + const float3 scanline1_contrib = + scanline_contrib(dist2 + float3(1.0), scanline1_color, ph, sigma_range, shape_range); - vec3 scanline4_contrib = - scanline_contrib(abs(vec3(2.0) - dist2), scanline4_color, + const float3 scanline4_contrib = + scanline_contrib(abs(float3(2.0) - dist2), scanline4_color, ph, sigma_range, shape_range); scanline_intensity += scanline1_contrib + scanline4_contrib; } - else if(params.beam_num_scanlines > 2.5) + else if(beam_num_scanlines > 2.5) { - vec3 dist1or4 = mix( - dist2 + vec3(1.0), vec3(2.0) - dist2, dist_round); - vec3 scanline1or4_contrib = scanline_contrib( + const float3 dist1or4 = lerp( + dist2 + float3(1.0), float3(2.0) - dist2, dist_round); + const float3 scanline1or4_contrib = scanline_contrib( dist1or4, scanline_outside_color, ph, sigma_range, shape_range); scanline_intensity += scanline1or4_contrib; } - - // Auto-dim the image to avoid clipping, encode if necessary, and output. + + // Auto-dim the image to avoid clipping, encode if necessary, and output. // My original idea was to compute a minimal auto-dim factor and put it in // the alpha channel, but it wasn't working, at least not reliably. This // is faster anyway, levels_autodim_temp = 0.5 isn't causing banding. 
- FragColor = vec4(encode_output(vec4(scanline_intensity * levels_autodim_temp, 1.0))); -} + FragColor = encode_output(float4(scanline_intensity * levels_autodim_temp, 1.0)); +} \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/derived-settings-and-constants.h b/crt/shaders/crt-royale/src/derived-settings-and-constants.h index 356eea3..1c39a97 100644 --- a/crt/shaders/crt-royale/src/derived-settings-and-constants.h +++ b/crt/shaders/crt-royale/src/derived-settings-and-constants.h @@ -29,12 +29,12 @@ ////////////////////////////////// INCLUDES ////////////////////////////////// #include "../user-settings.h" -#include "user-preset-constants.h" +#include "user-cgp-constants.h" /////////////////////////////// FIXED SETTINGS /////////////////////////////// -// Avoid dividing by zero; using a macro overloads for float, vec2, etc.: +// Avoid dividing by zero; using a macro overloads for float, float2, etc.: #define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 // Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. @@ -81,10 +81,10 @@ #endif // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is // inferior in most cases, so replace 2.0 with 0.0: - const float bloom_approx_filter = + static const float bloom_approx_filter = bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; #else - const float bloom_approx_filter = bloom_approx_filter_static; + static const float bloom_approx_filter = bloom_approx_filter_static; #endif // Disable slow runtime paths if static parameters are used. Most of these @@ -199,12 +199,12 @@ #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD // TODO: Take advantage of this! 
#define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT - const vec2 mask_resize_src_lut_size = mask_texture_large_size; + static const float2 mask_resize_src_lut_size = mask_texture_large_size; #else - const vec2 mask_resize_src_lut_size = mask_texture_small_size; + static const float2 mask_resize_src_lut_size = mask_texture_small_size; #endif #else - const vec2 mask_resize_src_lut_size = mask_texture_small_size; + static const float2 mask_resize_src_lut_size = mask_texture_small_size; #endif @@ -237,35 +237,35 @@ // determine how many border texels and tiles we need, based on how the result // will be sampled: #ifdef GEOMETRY_EARLY - const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + static const float max_subpixel_offset = aa_subpixel_r_offset_static.x; // Most antialiasing filters have a base radius of 4.0 pixels: - const float max_aa_base_pixel_border = 4.0 + + static const float max_aa_base_pixel_border = 4.0 + max_subpixel_offset; #else - const float max_aa_base_pixel_border = 0.0; + static const float max_aa_base_pixel_border = 0.0; #endif // Anisotropic filtering adds about 0.5 to the pixel border: #ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY - const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; + static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; #else - const float max_aniso_pixel_border = max_aa_base_pixel_border; + static const float max_aniso_pixel_border = max_aa_base_pixel_border; #endif // Fixing discontinuities adds 1.0 more to the pixel border: #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES - const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; + static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; #else - const float max_tiled_pixel_border = max_aniso_pixel_border; + static const float max_tiled_pixel_border = max_aniso_pixel_border; #endif // Convert the pixel border to an integer texel border. 
Assume same-pass // curvature about triples the texel frequency: #ifdef GEOMETRY_EARLY - const float max_mask_texel_border = + static const float max_mask_texel_border = ceil(max_tiled_pixel_border * 3.0); #else - const float max_mask_texel_border = ceil(max_tiled_pixel_border); + static const float max_mask_texel_border = ceil(max_tiled_pixel_border); #endif // Convert the texel border to a tile border using worst-case assumptions: - const float max_mask_tile_border = max_mask_texel_border/ +static const float max_mask_tile_border = max_mask_texel_border/ (mask_min_allowed_triad_size * mask_triads_per_tile); // Finally, set the number of resized tiles to render to MASK_RESIZE, and set @@ -274,41 +274,41 @@ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // Special case: Render two tiles without borders. Anisotropic // filtering doesn't seem to be a problem here. - const float mask_resize_num_tiles = 1.0 + 1.0; - const float mask_start_texels = 0.0; + static const float mask_resize_num_tiles = 1.0 + 1.0; + static const float mask_start_texels = 0.0; #else - const float mask_resize_num_tiles = 1.0 + + static const float mask_resize_num_tiles = 1.0 + 2.0 * max_mask_tile_border; - const float mask_start_texels = max_mask_texel_border; + static const float mask_start_texels = max_mask_texel_border; #endif #else - const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; - const float mask_start_texels = max_mask_texel_border; + static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + static const float mask_start_texels = max_mask_texel_border; #endif // We have to fit mask_resize_num_tiles into an FBO with a viewport scale of // mask_resize_viewport_scale. This limits the maximum final triad size. 
// Estimate the minimum number of triads we can split the screen into in each // dimension (we'll be as correct as mask_resize_viewport_scale is): - const float mask_resize_num_triads = +static const float mask_resize_num_triads = mask_resize_num_tiles * mask_triads_per_tile; - const vec2 min_allowed_viewport_triads = - vec2(mask_resize_num_triads) / mask_resize_viewport_scale; +static const float2 min_allowed_viewport_triads = + float2(mask_resize_num_triads) / mask_resize_viewport_scale; //////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// - const float pi = 3.141592653589; +static const float pi = 3.141592653589; // We often want to find the location of the previous texel, e.g.: -// const vec2 curr_texel = uv * texture_size; -// const vec2 prev_texel = floor(curr_texel - vec2(0.5)) + vec2(0.5); -// const vec2 prev_texel_uv = prev_texel / texture_size; +// const float2 curr_texel = uv * texture_size; +// const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5); +// const float2 prev_texel_uv = prev_texel / texture_size; // However, many GPU drivers round incorrectly around exact texel locations. // We need to subtract a little less than 0.5 before flooring, and some GPU's // require this value to be farther from 0.5 than others; define it here. 
-// const vec2 prev_texel = -// floor(curr_texel - vec2(under_half)) + vec2(0.5); - const float under_half = 0.4995; +// const float2 prev_texel = +// floor(curr_texel - float2(under_half)) + float2(0.5); +static const float under_half = 0.4995; #endif // DERIVED_SETTINGS_AND_CONSTANTS_H diff --git a/crt/shaders/crt-royale/src/geometry-functions.h b/crt/shaders/crt-royale/src/geometry-functions.h index de8036a..ed5a7f8 100644 --- a/crt/shaders/crt-royale/src/geometry-functions.h +++ b/crt/shaders/crt-royale/src/geometry-functions.h @@ -32,22 +32,23 @@ // Curvature-related constants: #define MAX_POINT_CLOUD_SIZE 9 + ///////////////////////////// CURVATURE FUNCTIONS ///////////////////////////// -vec2 quadratic_solve(const float a, const float b_over_2, const float c) +float2 quadratic_solve(const float a, const float b_over_2, const float c) { // Requires: 1.) a, b, and c are quadratic formula coefficients // 2.) b_over_2 = b/2.0 (simplifies terms to factor 2 out) // 3.) b_over_2 must be guaranteed < 0.0 (avoids a branch) - // Returns: Returns vec2(first_solution, discriminant), so the caller + // Returns: Returns float2(first_solution, discriminant), so the caller // can choose how to handle the "no intersection" case. The // Kahan or Citardauq formula is used for numerical robustness. const float discriminant = b_over_2*b_over_2 - a*c; const float solution0 = c/(-b_over_2 + sqrt(discriminant)); - return vec2(solution0, discriminant); + return float2(solution0, discriminant); } -vec2 intersect_sphere(const vec3 view_vec, const vec3 eye_pos_vec) +float2 intersect_sphere(const float3 view_vec, const float3 eye_pos_vec) { // Requires: 1.) view_vec and eye_pos_vec are 3D vectors in the sphere's // local coordinate frame (eye_pos_vec is a position, i.e. 
@@ -60,11 +61,11 @@ vec2 intersect_sphere(const vec3 view_vec, const vec3 eye_pos_vec) // Quadratic formula coefficients (b_over_2 is guaranteed negative): const float a = dot(view_vec, view_vec); const float b_over_2 = dot(view_vec, eye_pos_vec); // * 2.0 factored out - const float c = dot(eye_pos_vec, eye_pos_vec) - params.geom_radius*params.geom_radius; + const float c = dot(eye_pos_vec, eye_pos_vec) - geom_radius*geom_radius; return quadratic_solve(a, b_over_2, c); } -vec2 intersect_cylinder(const vec3 view_vec, const vec3 eye_pos_vec) +float2 intersect_cylinder(const float3 view_vec, const float3 eye_pos_vec) { // Requires: 1.) view_vec and eye_pos_vec are 3D vectors in the sphere's // local coordinate frame (eye_pos_vec is a position, i.e. @@ -77,57 +78,57 @@ vec2 intersect_cylinder(const vec3 view_vec, const vec3 eye_pos_vec) // Real-Time Collision Detection, p. 195-196, and this version // uses LaGrange's identity to reduce operations. // Arbitrary "cylinder top" reference point for an infinite cylinder: - const vec3 cylinder_top_vec = vec3(0.0, params.geom_radius, 0.0); - const vec3 cylinder_axis_vec = vec3(0.0, 1.0, 0.0);//vec3(0.0, 2.0*geom_radius, 0.0); - const vec3 top_to_eye_vec = eye_pos_vec - cylinder_top_vec; - const vec3 axis_x_view = cross(cylinder_axis_vec, view_vec); - const vec3 axis_x_top_to_eye = cross(cylinder_axis_vec, top_to_eye_vec); + const float3 cylinder_top_vec = float3(0.0, geom_radius, 0.0); + const float3 cylinder_axis_vec = float3(0.0, 1.0, 0.0);//float3(0.0, 2.0*geom_radius, 0.0); + const float3 top_to_eye_vec = eye_pos_vec - cylinder_top_vec; + const float3 axis_x_view = cross(cylinder_axis_vec, view_vec); + const float3 axis_x_top_to_eye = cross(cylinder_axis_vec, top_to_eye_vec); // Quadratic formula coefficients (b_over_2 is guaranteed negative): const float a = dot(axis_x_view, axis_x_view); const float b_over_2 = dot(axis_x_top_to_eye, axis_x_view); const float c = dot(axis_x_top_to_eye, axis_x_top_to_eye) - - 
params.geom_radius*params.geom_radius;//*dot(cylinder_axis_vec, cylinder_axis_vec); + geom_radius*geom_radius;//*dot(cylinder_axis_vec, cylinder_axis_vec); return quadratic_solve(a, b_over_2, c); } -vec2 cylinder_xyz_to_uv(const vec3 intersection_pos_local, - const vec2 geom_aspect) +float2 cylinder_xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect) { // Requires: An xyz intersection position on a cylinder. // Returns: video_uv coords mapped to range [-0.5, 0.5] // Mapping: Define square_uv.x to be the signed arc length in xz-space, // and define square_uv.y = -intersection_pos_local.y (+v = -y). // Start with a numerically robust arc length calculation. - const float angle_from_image_center = atan(intersection_pos_local.z, - intersection_pos_local.x); - const float signed_arc_len = angle_from_image_center * params.geom_radius; + const float angle_from_image_center = atan2(intersection_pos_local.x, + intersection_pos_local.z); + const float signed_arc_len = angle_from_image_center * geom_radius; // Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide // by the aspect ratio to stretch the mapping appropriately: - const vec2 square_uv = vec2(signed_arc_len, -intersection_pos_local.y); - const vec2 video_uv = square_uv / geom_aspect; + const float2 square_uv = float2(signed_arc_len, -intersection_pos_local.y); + const float2 video_uv = square_uv / geom_aspect; return video_uv; } -vec3 cylinder_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect) +float3 cylinder_uv_to_xyz(const float2 video_uv, const float2 geom_aspect) { // Requires: video_uv coords mapped to range [-0.5, 0.5] // Returns: An xyz intersection position on a cylinder. This is the // inverse of cylinder_xyz_to_uv(). // Expand video_uv by the aspect ratio to get proportionate x/y lengths, // then calculate an xyz position for the cylindrical mapping above. 
- const vec2 square_uv = video_uv * geom_aspect; + const float2 square_uv = video_uv * geom_aspect; const float arc_len = square_uv.x; - const float angle_from_image_center = arc_len / params.geom_radius; - const float x_pos = sin(angle_from_image_center) * params.geom_radius; - const float z_pos = cos(angle_from_image_center) * params.geom_radius; + const float angle_from_image_center = arc_len / geom_radius; + const float x_pos = sin(angle_from_image_center) * geom_radius; + const float z_pos = cos(angle_from_image_center) * geom_radius; // Or: z = sqrt(geom_radius**2 - x**2) // Or: z = geom_radius/sqrt(1.0 + tan(angle)**2), x = z * tan(angle) - const vec3 intersection_pos_local = vec3(x_pos, -square_uv.y, z_pos); + const float3 intersection_pos_local = float3(x_pos, -square_uv.y, z_pos); return intersection_pos_local; } -vec2 sphere_xyz_to_uv(const vec3 intersection_pos_local, - const vec2 geom_aspect) +float2 sphere_xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect) { // Requires: An xyz intersection position on a sphere. 
// Returns: video_uv coords mapped to range [-0.5, 0.5] @@ -143,116 +144,119 @@ vec2 sphere_xyz_to_uv(const vec3 intersection_pos_local, // sphere intersection point and the image center using a method posted by // Roger Stafford on comp.soft-sys.matlab: // https://groups.google.com/d/msg/comp.soft-sys.matlab/zNbUui3bjcA/c0HV_bHSx9cJ - const vec3 image_center_pos_local = vec3(0.0, 0.0, params.geom_radius); + const float3 image_center_pos_local = float3(0.0, 0.0, geom_radius); const float cp_len = length(cross(intersection_pos_local, image_center_pos_local)); const float dp = dot(intersection_pos_local, image_center_pos_local); - const float angle_from_image_center = atan(dp, cp_len); - const float arc_len = angle_from_image_center * params.geom_radius; + const float angle_from_image_center = atan2(cp_len, dp); + const float arc_len = angle_from_image_center * geom_radius; // Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide // by the aspect ratio to stretch the mapping appropriately: - const vec2 square_uv_unit = normalize(vec2(intersection_pos_local.x, + const float2 square_uv_unit = normalize(float2(intersection_pos_local.x, -intersection_pos_local.y)); - const vec2 square_uv = arc_len * square_uv_unit; - const vec2 video_uv = square_uv / geom_aspect; + const float2 square_uv = arc_len * square_uv_unit; + const float2 video_uv = square_uv / geom_aspect; return video_uv; } -vec3 sphere_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect) +float3 sphere_uv_to_xyz(const float2 video_uv, const float2 geom_aspect) { // Requires: video_uv coords mapped to range [-0.5, 0.5] // Returns: An xyz intersection position on a sphere. This is the // inverse of sphere_xyz_to_uv(). // Expand video_uv by the aspect ratio to get proportionate x/y lengths, // then calculate an xyz position for the spherical mapping above. 
- const vec2 square_uv = video_uv * geom_aspect; + const float2 square_uv = video_uv * geom_aspect; // Using length or sqrt here butchers the framerate on my 8800GTS if // this function is called too many times, and so does taking the max // component of square_uv/square_uv_unit (program length threshold?). //float arc_len = length(square_uv); - const vec2 square_uv_unit = normalize(square_uv); + const float2 square_uv_unit = normalize(square_uv); const float arc_len = square_uv.y/square_uv_unit.y; - const float angle_from_image_center = arc_len / params.geom_radius; + const float angle_from_image_center = arc_len / geom_radius; const float xy_dist_from_sphere_center = - sin(angle_from_image_center) * params.geom_radius; - //vec2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len)); - const vec2 xy_pos = xy_dist_from_sphere_center * square_uv_unit; - const float z_pos = cos(angle_from_image_center) * params.geom_radius; - const vec3 intersection_pos_local = vec3(xy_pos.x, -xy_pos.y, z_pos); + sin(angle_from_image_center) * geom_radius; + //float2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len)); + const float2 xy_pos = xy_dist_from_sphere_center * square_uv_unit; + const float z_pos = cos(angle_from_image_center) * geom_radius; + const float3 intersection_pos_local = float3(xy_pos.x, -xy_pos.y, z_pos); return intersection_pos_local; } -vec2 sphere_alt_xyz_to_uv(const vec3 intersection_pos_local, - const vec2 geom_aspect) +float2 sphere_alt_xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect) { // Requires: An xyz intersection position on a cylinder. // Returns: video_uv coords mapped to range [-0.5, 0.5] // Mapping: Define square_uv.x to be the signed arc length in xz-space, // and define square_uv.y == signed arc length in yz-space. // See cylinder_xyz_to_uv() for implementation details (very similar). 
- const vec2 angle_from_image_center = atan((intersection_pos_local.zz), - vec2(intersection_pos_local.x, -intersection_pos_local.y)); - const vec2 signed_arc_len = angle_from_image_center * params.geom_radius; - const vec2 video_uv = signed_arc_len / geom_aspect; + const float2 angle_from_image_center = atan2( + float2(intersection_pos_local.x, -intersection_pos_local.y), + intersection_pos_local.zz); + const float2 signed_arc_len = angle_from_image_center * geom_radius; + const float2 video_uv = signed_arc_len / geom_aspect; return video_uv; } -vec3 sphere_alt_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect) +float3 sphere_alt_uv_to_xyz(const float2 video_uv, const float2 geom_aspect) { // Requires: video_uv coords mapped to range [-0.5, 0.5] // Returns: An xyz intersection position on a sphere. This is the // inverse of sphere_alt_xyz_to_uv(). // See cylinder_uv_to_xyz() for implementation details (very similar). - const vec2 square_uv = video_uv * geom_aspect; - const vec2 arc_len = square_uv; - const vec2 angle_from_image_center = arc_len / params.geom_radius; - const vec2 xy_pos = sin(angle_from_image_center) * params.geom_radius; - const float z_pos = sqrt(params.geom_radius*params.geom_radius - dot(xy_pos, xy_pos)); - return vec3(xy_pos.x, -xy_pos.y, z_pos); + const float2 square_uv = video_uv * geom_aspect; + const float2 arc_len = square_uv; + const float2 angle_from_image_center = arc_len / geom_radius; + const float2 xy_pos = sin(angle_from_image_center) * geom_radius; + const float z_pos = sqrt(geom_radius*geom_radius - dot(xy_pos, xy_pos)); + return float3(xy_pos.x, -xy_pos.y, z_pos); } -vec2 intersect(const vec3 view_vec_local, const vec3 eye_pos_local, +inline float2 intersect(const float3 view_vec_local, const float3 eye_pos_local, const float geom_mode) { - if (geom_mode < 2.5) return intersect_sphere(view_vec_local, eye_pos_local); - else return intersect_cylinder(view_vec_local, eye_pos_local); + return geom_mode < 2.5 ? 
intersect_sphere(view_vec_local, eye_pos_local) : + intersect_cylinder(view_vec_local, eye_pos_local); } -vec2 xyz_to_uv(const vec3 intersection_pos_local, - const vec2 geom_aspect, const float geom_mode) +inline float2 xyz_to_uv(const float3 intersection_pos_local, + const float2 geom_aspect, const float geom_mode) { - if (geom_mode < 1.5) return sphere_xyz_to_uv(intersection_pos_local, geom_aspect); - else if (geom_mode < 2.5) return sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect); - else return cylinder_xyz_to_uv(intersection_pos_local, geom_aspect); + return geom_mode < 1.5 ? + sphere_xyz_to_uv(intersection_pos_local, geom_aspect) : + geom_mode < 2.5 ? + sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect) : + cylinder_xyz_to_uv(intersection_pos_local, geom_aspect); } -vec3 uv_to_xyz(const vec2 uv, const vec2 geom_aspect, +inline float3 uv_to_xyz(const float2 uv, const float2 geom_aspect, const float geom_mode) { - if (geom_mode < 1.5) return sphere_uv_to_xyz(uv, geom_aspect); - else if (geom_mode < 2.5) return sphere_alt_uv_to_xyz(uv, geom_aspect); - else return cylinder_uv_to_xyz(uv, geom_aspect); + return geom_mode < 1.5 ? sphere_uv_to_xyz(uv, geom_aspect) : + geom_mode < 2.5 ? 
sphere_alt_uv_to_xyz(uv, geom_aspect) : + cylinder_uv_to_xyz(uv, geom_aspect); } -vec2 view_vec_to_uv(const vec3 view_vec_local, const vec3 eye_pos_local, - const vec2 geom_aspect, const float geom_mode, out vec3 intersection_pos) +float2 view_vec_to_uv(const float3 view_vec_local, const float3 eye_pos_local, + const float2 geom_aspect, const float geom_mode, out float3 intersection_pos) { // Get the intersection point on the primitive, given an eye position // and view vector already in its local coordinate frame: - const vec2 intersect_dist_and_discriminant = intersect(view_vec_local, + const float2 intersect_dist_and_discriminant = intersect(view_vec_local, eye_pos_local, geom_mode); - const vec3 intersection_pos_local = eye_pos_local + + const float3 intersection_pos_local = eye_pos_local + view_vec_local * intersect_dist_and_discriminant.x; // Save the intersection position to an output parameter: intersection_pos = intersection_pos_local; // Transform into uv coords, but give out-of-range coords if the // view ray doesn't intersect the primitive in the first place: - if (intersect_dist_and_discriminant.y > 0.005) return xyz_to_uv(intersection_pos_local, geom_aspect, geom_mode); - else return vec2(1.0); + return intersect_dist_and_discriminant.y > 0.005 ? 
+ xyz_to_uv(intersection_pos_local, geom_aspect, geom_mode) : float2(1.0); } -vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos, - const vec2 geom_aspect, const vec3 global_coords[MAX_POINT_CLOUD_SIZE], +float3 get_ideal_global_eye_pos_for_points(float3 eye_pos, + const float2 geom_aspect, const float3 global_coords[MAX_POINT_CLOUD_SIZE], const int num_points) { // Requires: Parameters: @@ -278,7 +282,7 @@ vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos, // that result in each point being projected to a screen edge/corner in // pseudo-normalized device coords (where xy ranges from [-0.5, 0.5] // and z = eyespace z): - // pndc_coord = vec3(vec2(eyespace_xyz.x, -eyespace_xyz.y)* + // pndc_coord = float3(float2(eyespace_xyz.x, -eyespace_xyz.y)* // geom_view_dist / (geom_aspect * -eyespace_xyz.z), eyespace_xyz.z); // Notes: // The field of view is controlled by geom_view_dist's magnitude relative to @@ -288,11 +292,11 @@ vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos, // But for the purposes of perspective divide, it should be considered: // view_vec.xy ranges from [-0.5, 0.5] * geom_aspect / geom_view_dist // view_vec.z = -1.0 - const int max_centering_iters = 1; // Keep for easy testing. + static const int max_centering_iters = 1; // Keep for easy testing. for(int iter = 0; iter < max_centering_iters; iter++) { // 0.) Get the eyespace coordinates of our point cloud: - vec3 eyespace_coords[MAX_POINT_CLOUD_SIZE]; + float3 eyespace_coords[MAX_POINT_CLOUD_SIZE]; for(int i = 0; i < num_points; i++) { eyespace_coords[i] = global_coords[i] - eye_pos; @@ -302,31 +306,31 @@ vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos, // Eyespace +y = up, screenspace +y = down, so flip y after // applying the eyespace offset (on the way to "clip space"). 
// Solve for two offsets per point based on: - // (eyespace_xyz.xy - offset_dr) * vec2(1.0, -1.0) * - // geom_view_dist / (geom_aspect * -eyespace_xyz.z) = vec2(-0.5) - // (eyespace_xyz.xy - offset_dr) * vec2(1.0, -1.0) * - // geom_view_dist / (geom_aspect * -eyespace_xyz.z) = vec2(0.5) + // (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) * + // geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(-0.5) + // (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) * + // geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(0.5) // offset_ul and offset_dr represent the farthest we can move the // eye_pos up-left and down-right. Save the min of all offset_dr's // and the max of all offset_ul's (since it's negative). - float abs_radius = abs(params.geom_radius); // In case anyone gets ideas. ;) - vec2 offset_dr_min = vec2(10.0 * abs_radius, 10.0 * abs_radius); - vec2 offset_ul_max = vec2(-10.0 * abs_radius, -10.0 * abs_radius); + float abs_radius = abs(geom_radius); // In case anyone gets ideas. 
;) + float2 offset_dr_min = float2(10.0 * abs_radius, 10.0 * abs_radius); + float2 offset_ul_max = float2(-10.0 * abs_radius, -10.0 * abs_radius); for(int i = 0; i < num_points; i++) { - const vec2 flipy = vec2(1.0, -1.0); - vec3 eyespace_xyz = eyespace_coords[i]; - vec2 offset_dr = eyespace_xyz.xy - vec2(-0.5) * - (geom_aspect * -eyespace_xyz.z) / (params.geom_view_dist * flipy); - vec2 offset_ul = eyespace_xyz.xy - vec2(0.5) * - (geom_aspect * -eyespace_xyz.z) / (params.geom_view_dist * flipy); + static const float2 flipy = float2(1.0, -1.0); + float3 eyespace_xyz = eyespace_coords[i]; + float2 offset_dr = eyespace_xyz.xy - float2(-0.5) * + (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy); + float2 offset_ul = eyespace_xyz.xy - float2(0.5) * + (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy); offset_dr_min = min(offset_dr_min, offset_dr); offset_ul_max = max(offset_ul_max, offset_ul); } // 1b.)Update eye_pos: Adding the average of offset_ul_max and // offset_dr_min gives it equal leeway on the top vs. bottom // and left vs. right. Recalculate eyespace_coords accordingly. - vec2 center_offset = 0.5 * (offset_ul_max + offset_dr_min); + float2 center_offset = 0.5 * (offset_ul_max + offset_dr_min); eye_pos.xy += center_offset; for(int i = 0; i < num_points; i++) { @@ -347,14 +351,14 @@ vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos, // We'll vectorize the actual computation. Take the maximum of // these four for a single offset, and continue taking the max // for every point (use max because offset.z is negative). 
- float offset_z_max = -10.0 * params.geom_radius * params.geom_view_dist; + float offset_z_max = -10.0 * geom_radius * geom_view_dist; for(int i = 0; i < num_points; i++) { - vec3 eyespace_xyz_flipy = eyespace_coords[i] * - vec3(1.0, -1.0, 1.0); - vec4 offset_zzzz = eyespace_xyz_flipy.zzzz + - (eyespace_xyz_flipy.xyxy * params.geom_view_dist) / - (vec4(-0.5, -0.5, 0.5, 0.5) * vec4(geom_aspect, geom_aspect)); + float3 eyespace_xyz_flipy = eyespace_coords[i] * + float3(1.0, -1.0, 1.0); + float4 offset_zzzz = eyespace_xyz_flipy.zzzz + + (eyespace_xyz_flipy.xyxy * geom_view_dist) / + (float4(-0.5, -0.5, 0.5, 0.5) * float4(geom_aspect, geom_aspect)); // Ignore offsets that push positive x/y values to opposite // boundaries, and vice versa, and don't let the camera move // past a point in the dead center of the screen: @@ -374,19 +378,20 @@ vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos, return eye_pos; } -vec3 get_ideal_global_eye_pos(const mat3x3 local_to_global, - const vec2 geom_aspect, const float geom_mode) +float3 get_ideal_global_eye_pos(const float3x3 local_to_global, + const float2 geom_aspect, const float geom_mode) { // Start with an initial eye_pos that includes the entire primitive // (sphere or cylinder) in its field-of-view: - const vec3 high_view = vec3(0.0, geom_aspect.y, -params.geom_view_dist); - const vec3 low_view = high_view * vec3(1.0, -1.0, 1.0); + const float3 high_view = float3(0.0, geom_aspect.y, -geom_view_dist); + const float3 low_view = high_view * float3(1.0, -1.0, 1.0); const float len_sq = dot(high_view, high_view); const float fov = abs(acos(dot(high_view, low_view)/len_sq)); // Trigonometry/similar triangles say distance = geom_radius/sin(fov/2): - const float eye_z_spherical = params.geom_radius/sin(fov*0.5); - vec3 eye_pos = vec3(0.0, 0.0, eye_z_spherical); - if (geom_mode < 2.5) eye_pos = vec3(0.0, 0.0, max(params.geom_view_dist, eye_z_spherical)); + const float eye_z_spherical = geom_radius/sin(fov*0.5); + const float3 
eye_pos = geom_mode < 2.5 ? + float3(0.0, 0.0, eye_z_spherical) : + float3(0.0, 0.0, max(geom_view_dist, eye_z_spherical)); // Get global xyz coords of extreme sample points on the simulated CRT // screen. Start with the center, edge centers, and corners of the @@ -394,37 +399,35 @@ vec3 get_ideal_global_eye_pos(const mat3x3 local_to_global, // by closer points on the primitive, but they may NOT be occluded by // the convex hull of the remaining samples (i.e. the remaining convex // hull might not envelope points that do occlude a back-facing point.) - const int num_points = MAX_POINT_CLOUD_SIZE; - vec3 global_coords[MAX_POINT_CLOUD_SIZE]; - global_coords[0] = (uv_to_xyz(vec2(0.0, 0.0), geom_aspect, geom_mode) * local_to_global); - global_coords[1] = (uv_to_xyz(vec2(0.0, -0.5), geom_aspect, geom_mode) * local_to_global); - global_coords[2] = (uv_to_xyz(vec2(0.0, 0.5), geom_aspect, geom_mode) * local_to_global); - global_coords[3] = (uv_to_xyz(vec2(-0.5, 0.0), geom_aspect, geom_mode) * local_to_global); - global_coords[4] = (uv_to_xyz(vec2(0.5, 0.0), geom_aspect, geom_mode) * local_to_global); - global_coords[5] = (uv_to_xyz(vec2(-0.5, -0.5), geom_aspect, geom_mode) * local_to_global); - global_coords[6] = (uv_to_xyz(vec2(0.5, -0.5), geom_aspect, geom_mode) * local_to_global); - global_coords[7] = (uv_to_xyz(vec2(-0.5, 0.5), geom_aspect, geom_mode) * local_to_global); - global_coords[8] = (uv_to_xyz(vec2(0.5, 0.5), geom_aspect, geom_mode) * local_to_global); + static const int num_points = MAX_POINT_CLOUD_SIZE; + float3 global_coords[MAX_POINT_CLOUD_SIZE]; + global_coords[0] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.0), geom_aspect, geom_mode)); + global_coords[1] = mul(local_to_global, uv_to_xyz(float2(0.0, -0.5), geom_aspect, geom_mode)); + global_coords[2] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.5), geom_aspect, geom_mode)); + global_coords[3] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.0), geom_aspect, geom_mode)); + global_coords[4] = 
mul(local_to_global, uv_to_xyz(float2(0.5, 0.0), geom_aspect, geom_mode)); + global_coords[5] = mul(local_to_global, uv_to_xyz(float2(-0.5, -0.5), geom_aspect, geom_mode)); + global_coords[6] = mul(local_to_global, uv_to_xyz(float2(0.5, -0.5), geom_aspect, geom_mode)); + global_coords[7] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.5), geom_aspect, geom_mode)); + global_coords[8] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.5), geom_aspect, geom_mode)); // Adding more inner image points could help in extreme cases, but too many // points will kille the framerate. For safety, default to the initial // eye_pos if any z coords are negative: float num_negative_z_coords = 0.0; for(int i = 0; i < num_points; i++) { - if (global_coords[0].z < 0.0) - {num_negative_z_coords += float(global_coords[0].z);} + num_negative_z_coords += float(global_coords[0].z < 0.0); } // Outsource the optimized eye_pos calculation: - if (num_negative_z_coords > 0.5) - return eye_pos; - else - return get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect, global_coords, num_points); + return num_negative_z_coords > 0.5 ? eye_pos : + get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect, + global_coords, num_points); } -mat3x3 get_pixel_to_object_matrix(const mat3x3 global_to_local, - const vec3 eye_pos_local, const vec3 view_vec_global, - const vec3 intersection_pos_local, const vec3 normal, - const vec2 output_size_inv) +float3x3 get_pixel_to_object_matrix(const float3x3 global_to_local, + const float3 eye_pos_local, const float3 view_vec_global, + const float3 intersection_pos_local, const float3 normal, + const float2 output_size_inv) { // Requires: See get_curved_video_uv_coords_and_tangent_matrix for // descriptions of each parameter. @@ -437,26 +440,26 @@ mat3x3 get_pixel_to_object_matrix(const mat3x3 global_to_local, // vectors to 3D vectors along the CRT's surface, for later // conversion to uv vectors.) 
// Shorthand inputs: - const vec3 pos = intersection_pos_local; - const vec3 eye_pos = eye_pos_local; + const float3 pos = intersection_pos_local; + const float3 eye_pos = eye_pos_local; // Get a piecewise-linear matrix transforming from "pixelspace" offset // vectors (1.0 = one pixel) to object space vectors in the tangent // plane (faster than finding 3 view-object intersections). // 1.) Get the local view vecs for the pixels to the right and down: - const vec3 view_vec_right_global = view_vec_global + - vec3(output_size_inv.x, 0.0, 0.0); - const vec3 view_vec_down_global = view_vec_global + - vec3(0.0, -output_size_inv.y, 0.0); - const vec3 view_vec_right_local = - (view_vec_right_global * global_to_local); - const vec3 view_vec_down_local = - (view_vec_down_global * global_to_local); + const float3 view_vec_right_global = view_vec_global + + float3(output_size_inv.x, 0.0, 0.0); + const float3 view_vec_down_global = view_vec_global + + float3(0.0, -output_size_inv.y, 0.0); + const float3 view_vec_right_local = + mul(global_to_local, view_vec_right_global); + const float3 view_vec_down_local = + mul(global_to_local, view_vec_down_global); // 2.) Using the true intersection point, intersect the neighboring // view vectors with the tangent plane: - const vec3 intersection_vec_dot_normal = vec3(dot(pos - eye_pos, normal)); - const vec3 right_pos = eye_pos + (intersection_vec_dot_normal / + const float3 intersection_vec_dot_normal = float3(dot(pos - eye_pos, normal), dot(pos - eye_pos, normal), dot(pos - eye_pos, normal)); + const float3 right_pos = eye_pos + (intersection_vec_dot_normal / dot(view_vec_right_local, normal))*view_vec_right_local; - const vec3 down_pos = eye_pos + (intersection_vec_dot_normal / + const float3 down_pos = eye_pos + (intersection_vec_dot_normal / dot(view_vec_down_local, normal))*view_vec_down_local; // 3.) Subtract the original intersection pos from its neighbors; the // resulting vectors are object-space vectors tangent to the plane. 
@@ -464,17 +467,17 @@ mat3x3 get_pixel_to_object_matrix(const mat3x3 global_to_local, // and (0.0, 1.0) pixel offsets, so they form the first two basis // vectors of a pixelspace to object space transformation. This // transformation is 2D to 3D, so use (0, 0, 0) for the third vector. - const vec3 object_right_vec = right_pos - pos; - const vec3 object_down_vec = down_pos - pos; - const mat3x3 pixel_to_object = mat3x3( + const float3 object_right_vec = right_pos - pos; + const float3 object_down_vec = down_pos - pos; + const float3x3 pixel_to_object = float3x3( object_right_vec.x, object_down_vec.x, 0.0, object_right_vec.y, object_down_vec.y, 0.0, object_right_vec.z, object_down_vec.z, 0.0); return pixel_to_object; } -mat3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local, - const vec3 normal, const vec2 geom_aspect, const float geom_mode) +float3x3 get_object_to_tangent_matrix(const float3 intersection_pos_local, + const float3 normal, const float2 geom_aspect, const float geom_mode) { // Requires: See get_curved_video_uv_coords_and_tangent_matrix for // descriptions of each parameter. @@ -490,7 +493,7 @@ mat3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local, // We want the inverse of the TBN matrix (transpose of the cotangent // matrix), which transforms ordinary vectors from object->tangent space. // Start by calculating the relevant basis vectors in accordance with - // Christian Schüler's blog post "Followup: Normal Mapping Without + // Christian Schüler's blog post "Followup: Normal Mapping Without // Precomputed Tangents": http://www.thetenthplanet.de/archives/1180 // With our particular uv mapping, the scale of the u and v directions // is determined entirely by the aspect ratio for cylindrical and ordinary @@ -498,13 +501,13 @@ mat3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local, // determined by it (the alternate mapping is more complex). 
Therefore, we // must ensure appropriate cotangent and cobitangent lengths as well. // Base these off the uv<=>xyz mappings for each primitive. - const vec3 pos = intersection_pos_local; - const vec3 x_vec = vec3(1.0, 0.0, 0.0); - const vec3 y_vec = vec3(0.0, 1.0, 0.0); + const float3 pos = intersection_pos_local; + static const float3 x_vec = float3(1.0, 0.0, 0.0); + static const float3 y_vec = float3(0.0, 1.0, 0.0); // The tangent and bitangent vectors correspond with increasing u and v, // respectively. Mathematically we'd base the cotangent/cobitangent on // those, but we'll compute the cotangent/cobitangent directly when we can. - vec3 cotangent_unscaled, cobitangent_unscaled; + float3 cotangent_unscaled, cobitangent_unscaled; // geom_mode should be constant-folded without RUNTIME_GEOMETRY_MODE. if(geom_mode < 1.5) { @@ -526,10 +529,10 @@ mat3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local, // This mapping works a bit like the cylindrical mapping in two // directions, which makes the lengths and directions more complex. 
// Unfortunately, I can't find much of a shortcut: - const vec3 tangent = normalize( - cross(y_vec, vec3(pos.x, 0.0, pos.z))) * geom_aspect.x; - const vec3 bitangent = normalize( - cross(x_vec, vec3(0.0, pos.yz))) * geom_aspect.y; + const float3 tangent = normalize( + cross(y_vec, float3(pos.x, 0.0, pos.z))) * geom_aspect.x; + const float3 bitangent = normalize( + cross(x_vec, float3(0.0, pos.yz))) * geom_aspect.y; cotangent_unscaled = cross(normal, bitangent); cobitangent_unscaled = cross(tangent, normal); } @@ -537,31 +540,31 @@ mat3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local, { // Cylinder: // tangent = normalize(cross(y_vec, normal)) * geom_aspect.x; - // bitangent = vec3(0.0, -geom_aspect.y, 0.0); + // bitangent = float3(0.0, -geom_aspect.y, 0.0); // inv_determinant = 1.0/length(cross(bitangent, tangent)) // cotangent = cross(normal, bitangent) * inv_determinant // == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant // cobitangent = cross(tangent, normal) * inv_determinant - // == vec3(0.0, -geom_aspect.x, 0.0) * inv_determinant + // == float3(0.0, -geom_aspect.x, 0.0) * inv_determinant cotangent_unscaled = cross(y_vec, normal) * geom_aspect.y; - cobitangent_unscaled = vec3(0.0, -geom_aspect.x, 0.0); + cobitangent_unscaled = float3(0.0, -geom_aspect.x, 0.0); } - const vec3 computed_normal = + const float3 computed_normal = cross(cobitangent_unscaled, cotangent_unscaled); - const float inv_determinant = inversesqrt(dot(computed_normal, computed_normal)); - const vec3 cotangent = cotangent_unscaled * inv_determinant; - const vec3 cobitangent = cobitangent_unscaled * inv_determinant; + const float inv_determinant = rsqrt(dot(computed_normal, computed_normal)); + const float3 cotangent = cotangent_unscaled * inv_determinant; + const float3 cobitangent = cobitangent_unscaled * inv_determinant; // The [cotangent, cobitangent, normal] column vecs form the cotangent // frame, i.e. the inverse-transpose TBN matrix. 
Get its transpose: - const mat3x3 object_to_tangent = mat3x3(cotangent, cobitangent, normal); + const float3x3 object_to_tangent = float3x3(cotangent, cobitangent, normal); return object_to_tangent; } -vec2 get_curved_video_uv_coords_and_tangent_matrix( - const vec2 flat_video_uv, const vec3 eye_pos_local, - const vec2 output_size_inv, const vec2 geom_aspect, - const float geom_mode, const mat3x3 global_to_local, - out mat2x2 pixel_to_tangent_video_uv) +float2 get_curved_video_uv_coords_and_tangent_matrix( + const float2 flat_video_uv, const float3 eye_pos_local, + const float2 output_size_inv, const float2 geom_aspect, + const float geom_mode, const float3x3 global_to_local, + out float2x2 pixel_to_tangent_video_uv) { // Requires: Parameters: // 1.) flat_video_uv coords are in range [0.0, 1.0], where @@ -570,7 +573,7 @@ vec2 get_curved_video_uv_coords_and_tangent_matrix( // 2.) eye_pos_local is the 3D camera position in the simulated // CRT's local coordinate frame. For best results, it must // be computed based on the same geom_view_dist used here. - // 3.) output_size_inv = vec2(1.0)/IN.output_size + // 3.) output_size_inv = float2(1.0)/IN.output_size // 4.) geom_aspect = get_aspect_vector( // IN.output_size.x / IN.output_size.y); // 5.) geom_mode is a static or runtime mode setting: @@ -600,66 +603,66 @@ vec2 get_curved_video_uv_coords_and_tangent_matrix( // For the effect of "looking through a window" at a CRT, it should be // set equal to the user's distance from their physical screen, in // units of the viewport's physical diagonal size. 
- const vec2 view_uv = (flat_video_uv - vec2(0.5)) * geom_aspect; - const vec3 view_vec_global = - vec3(view_uv.x, -view_uv.y, -params.geom_view_dist); + const float2 view_uv = (flat_video_uv - float2(0.5)) * geom_aspect; + const float3 view_vec_global = + float3(view_uv.x, -view_uv.y, -geom_view_dist); // Transform the view vector into the CRT's local coordinate frame, convert // to video_uv coords, and get the local 3D intersection position: - const vec3 view_vec_local = (view_vec_global * global_to_local); - vec3 pos; - const vec2 centered_uv = view_vec_to_uv( + const float3 view_vec_local = mul(global_to_local, view_vec_global); + float3 pos; + const float2 centered_uv = view_vec_to_uv( view_vec_local, eye_pos_local, geom_aspect, geom_mode, pos); - const vec2 video_uv = centered_uv + vec2(0.5); + const float2 video_uv = centered_uv + float2(0.5); // Get a pixel-to-tangent-video-uv matrix. The caller could deal with // all but one of these cases, but that would be more complicated. #ifdef DRIVERS_ALLOW_DERIVATIVES // Derivatives obtain a matrix very fast, but the direction of pixel- // space +y seems to depend on the pass. Enforce the correct direction // on a best-effort basis (but it shouldn't matter for antialiasing). - const vec2 duv_dx = ddx(video_uv); - const vec2 duv_dy = ddy(video_uv); + const float2 duv_dx = ddx(video_uv); + const float2 duv_dy = ddy(video_uv); #ifdef LAST_PASS - pixel_to_tangent_video_uv = mat2x2( + pixel_to_tangent_video_uv = float2x2( duv_dx.x, duv_dy.x, -duv_dx.y, -duv_dy.y); #else - pixel_to_tangent_video_uv = mat2x2( + pixel_to_tangent_video_uv = float2x2( duv_dx.x, duv_dy.x, duv_dx.y, duv_dy.y); #endif #else // Manually define a transformation matrix. We'll assume pixel-space // +y = down, just like +v = down. 
- if(geom_force_correct_tangent_matrix == true) + if(geom_force_correct_tangent_matrix) { // Get the surface normal based on the local intersection position: - vec3 normal_base = pos; - if (geom_mode > 2.5) normal_base = vec3(pos.x, 0.0, pos.z); - const vec3 normal = normalize(normal_base); + const float3 normal_base = geom_mode < 2.5 ? pos : + float3(pos.x, 0.0, pos.z); + const float3 normal = normalize(normal_base); // Get pixel-to-object and object-to-tangent matrices and combine // them into a 2x2 pixel-to-tangent matrix for video_uv offsets: - const mat3x3 pixel_to_object = get_pixel_to_object_matrix( + const float3x3 pixel_to_object = get_pixel_to_object_matrix( global_to_local, eye_pos_local, view_vec_global, pos, normal, output_size_inv); - const mat3x3 object_to_tangent = get_object_to_tangent_matrix( + const float3x3 object_to_tangent = get_object_to_tangent_matrix( pos, normal, geom_aspect, geom_mode); - const mat3x3 pixel_to_tangent3x3 = - (pixel_to_object * object_to_tangent); - pixel_to_tangent_video_uv = mat2x2( - pixel_to_tangent3x3[0].xyz, pixel_to_tangent3x3[1].x); + const float3x3 pixel_to_tangent3x3 = + mul(object_to_tangent, pixel_to_object); + pixel_to_tangent_video_uv = float2x2( + pixel_to_tangent3x3[0][0], pixel_to_tangent3x3[0][1], pixel_to_tangent3x3[1][0], pixel_to_tangent3x3[1][1]);//._m00_m01_m10_m11); } else { // Ignore curvature, and just consider flat scaling. 
The // difference is only apparent with strong curvature: - pixel_to_tangent_video_uv = mat2x2( + pixel_to_tangent_video_uv = float2x2( output_size_inv.x, 0.0, 0.0, output_size_inv.y); } #endif return video_uv; } -float get_border_dim_factor(const vec2 video_uv, const vec2 geom_aspect) +float get_border_dim_factor(const float2 video_uv, const float2 geom_aspect) { // COPYRIGHT NOTE FOR THIS FUNCTION: // Copyright (C) 2010-2012 cgwg, 2014 TroggleMonkey @@ -671,15 +674,20 @@ float get_border_dim_factor(const vec2 video_uv, const vec2 geom_aspect) // Calculate border_dim_factor from the proximity to uv-space image // borders; geom_aspect/border_size/border/darkness/border_compress are globals: - const vec2 edge_dists = min(video_uv, vec2(1.0) - video_uv) * + const float2 edge_dists = min(video_uv, float2(1.0) - video_uv) * geom_aspect; - const vec2 border_penetration = - max(vec2(params.border_size) - edge_dists, vec2(0.0)); - const float penetration_ratio = length(border_penetration)/params.border_size; + const float2 border_penetration = + max(float2(border_size) - edge_dists, float2(0.0)); + const float penetration_ratio = length(border_penetration)/border_size; const float border_escape_ratio = max(1.0 - penetration_ratio, 0.0); const float border_dim_factor = - pow(border_escape_ratio, params.border_darkness) * max(1.0, params.border_compress); + pow(border_escape_ratio, border_darkness) * max(1.0, border_compress); return min(border_dim_factor, 1.0); } -#endif // GEOMETRY_FUNCTIONS_H \ No newline at end of file + + +#endif // GEOMETRY_FUNCTIONS_H + + + diff --git a/crt/shaders/crt-royale/src/params.inc b/crt/shaders/crt-royale/src/params.inc index e442b5e..46044ec 100644 --- a/crt/shaders/crt-royale/src/params.inc +++ b/crt/shaders/crt-royale/src/params.inc @@ -1,101 +1,88 @@ -#ifndef PARAMS_INC -#define PARAMS_INC - -layout(std140, set = 0, binding = 0) uniform UBO -{ - mat4 MVP; -// float crt_gamma; -// float lcd_gamma; - float levels_contrast; - float 
halation_weight; - float diffusion_weight; - float bloom_underestimate_levels; - float bloom_excess; - float beam_min_sigma; - float beam_max_sigma; - float beam_spot_power; - float beam_min_shape; - float beam_max_shape; - float beam_shape_power; - float beam_horiz_filter; - float beam_horiz_sigma; -// float beam_horiz_linear_rgb_weight; - float convergence_offset_x_r; - float convergence_offset_x_g; - float convergence_offset_x_b; - float convergence_offset_y_r; - float convergence_offset_y_g; - float convergence_offset_y_b; - float mask_type; - float mask_sample_mode_desired; - float mask_specify_num_triads; - float mask_triad_size_desired; - float mask_num_triads_desired; -// float aa_subpixel_r_offset_x_runtime; -// float aa_subpixel_r_offset_y_runtime; -// float aa_cubic_c; -// float aa_gauss_sigma; -// float geom_mode_runtime; -// float geom_radius; -// float geom_view_dist; -// float geom_tilt_angle_x; -// float geom_tilt_angle_y; - float geom_aspect_ratio_x; - float geom_aspect_ratio_y; -// float geom_overscan_x; -// float geom_overscan_y; -// float border_size; -// float border_darkness; -// float border_compress; - float interlace_1080i; - float beam_num_scanlines; -} params; +//#define HARDCODE_SETTINGS +#ifndef HARDCODE_SETTINGS // Set shader params for all passes here: -//#pragma parameter crt_gamma "crt_gamma" 2.5 1.0 5.0 0.025 -//#pragma parameter lcd_gamma "lcd_gamma" 2.2 1.0 5.0 0.025 -#pragma parameter levels_contrast "levels_contrast" 1.0 0.0 4.0 0.015625 -#pragma parameter halation_weight "halation_weight" 0.0 0.0 1.0 0.005 -#pragma parameter diffusion_weight "diffusion_weight" 0.075 0.0 1.0 0.005 -#pragma parameter bloom_underestimate_levels "bloom_underestimate_levels" 0.8 0.0 5.0 0.01 -#pragma parameter bloom_excess "bloom_excess" 0.0 0.0 1.0 0.005 -#pragma parameter beam_min_sigma "beam_min_sigma" 0.02 0.005 1.0 0.005 -#pragma parameter beam_max_sigma "beam_max_sigma" 0.3 0.005 1.0 0.005 -#pragma parameter beam_spot_power "beam_spot_power" 
0.33 0.01 16.0 0.01 -#pragma parameter beam_min_shape "beam_min_shape" 2.0 2.0 32.0 0.1 -#pragma parameter beam_max_shape "beam_max_shape" 4.0 2.0 32.0 0.1 -#pragma parameter beam_shape_power "beam_shape_power" 0.25 0.01 16.0 0.01 -#pragma parameter beam_horiz_filter "beam_horiz_filter" 0.0 0.0 2.0 1.0 -#pragma parameter beam_horiz_sigma "beam_horiz_sigma" 0.35 0.0 0.67 0.005 -//#pragma parameter beam_horiz_linear_rgb_weight "beam_horiz_linear_rgb_weight" 1.0 0.0 1.0 0.01 -#pragma parameter convergence_offset_x_r "convergence_offset_x_r" 0.0 -4.0 4.0 0.05 -#pragma parameter convergence_offset_x_g "convergence_offset_x_g" 0.0 -4.0 4.0 0.05 -#pragma parameter convergence_offset_x_b "convergence_offset_x_b" 0.0 -4.0 4.0 0.05 -#pragma parameter convergence_offset_y_r "convergence_offset_y_r" 0.0 -2.0 2.0 0.05 -#pragma parameter convergence_offset_y_g "convergence_offset_y_g" 0.0 -2.0 2.0 0.05 -#pragma parameter convergence_offset_y_b "convergence_offset_y_b" 0.0 -2.0 2.0 0.05 -#pragma parameter mask_type "mask_type" 1.0 0.0 2.0 1.0 -#pragma parameter mask_sample_mode_desired "mask_sample_mode" 1.0 0.0 2.0 1.0 // Consider blocking mode 2. 
-#pragma parameter mask_specify_num_triads "mask_specify_num_triads" 0.0 0.0 1.0 1.0 -#pragma parameter mask_triad_size_desired "mask_triad_size_desired" 3.0 1.0 18.0 0.125 -#pragma parameter mask_num_triads_desired "mask_num_triads_desired" 480.0 342.0 1920.0 1.0 -//#pragma parameter aa_subpixel_r_offset_x_runtime "aa_subpixel_r_offset_x" -0.333333333 -0.333333333 0.333333333 0.333333333 -//#pragma parameter aa_subpixel_r_offset_y_runtime "aa_subpixel_r_offset_y" 0.0 -0.333333333 0.333333333 0.333333333 -//#pragma parameter aa_cubic_c "antialias_cubic_sharpness" 0.5 0.0 4.0 0.015625 -//#pragma parameter aa_gauss_sigma "antialias_gauss_sigma" 0.5 0.0625 1.0 0.015625 -//#pragma parameter geom_mode_runtime "geom_mode" 0.0 0.0 3.0 1.0 -//#pragma parameter geom_radius "geom_radius" 2.0 0.16 1024.0 0.1 -//#pragma parameter geom_view_dist "geom_view_dist" 2.0 0.5 1024.0 0.25 -//#pragma parameter geom_tilt_angle_x "geom_tilt_angle_x" 0.0 -3.14159265 3.14159265 0.017453292519943295 -//#pragma parameter geom_tilt_angle_y "geom_tilt_angle_y" 0.0 -3.14159265 3.14159265 0.017453292519943295 -#pragma parameter geom_aspect_ratio_x "geom_aspect_ratio_x" 432.0 1.0 512.0 1.0 -#pragma parameter geom_aspect_ratio_y "geom_aspect_ratio_y" 329.0 1.0 512.0 1.0 -//#pragma parameter geom_overscan_x "geom_overscan_x" 1.0 0.00390625 4.0 0.00390625 -//#pragma parameter geom_overscan_y "geom_overscan_y" 1.0 0.00390625 4.0 0.00390625 -//#pragma parameter border_size "border_size" 0.015 0.0000001 0.5 0.005 -//#pragma parameter border_darkness "border_darkness" 2.0 0.0 16.0 0.0625 -//#pragma parameter border_compress "border_compress" 2.5 1.0 64.0 0.0625 -#pragma parameter interlace_1080i "interlace_1080i" 0.0 0.0 1.0 1.0 -#pragma parameter beam_num_scanlines "beam_num_scanlines" 4.0 2.0 6.0 1.0 - +#pragma parameter crt_gamma "Simulated CRT Gamma" 2.5 1.0 5.0 0.025 +#define crt_gamma global.crt_gamma +#pragma parameter lcd_gamma "Your Display Gamma" 2.2 1.0 5.0 0.025 +#define lcd_gamma 
global.lcd_gamma +#pragma parameter levels_contrast "Contrast" 1.0 0.0 4.0 0.015625 +#define levels_contrast global.levels_contrast +#pragma parameter halation_weight "Halation Weight" 0.0 0.0 1.0 0.005 +#pragma parameter diffusion_weight "Diffusion Weight" 0.075 0.0 1.0 0.005 +#pragma parameter bloom_underestimate_levels "Bloom - Underestimate Levels" 0.8 0.0 5.0 0.01 +#define bloom_underestimate_levels global.bloom_underestimate_levels +#pragma parameter bloom_excess "Bloom - Excess" 0.0 0.0 1.0 0.005 +#pragma parameter beam_min_sigma "Beam - Min Sigma" 0.02 0.005 1.0 0.005 +#define beam_min_sigma global.beam_min_sigma +#pragma parameter beam_max_sigma "Beam - Max Sigma" 0.3 0.005 1.0 0.005 +#define beam_max_sigma global.beam_max_sigma +#pragma parameter beam_spot_power "Beam - Spot Power" 0.33 0.01 16.0 0.01 +#define beam_spot_power global.beam_spot_power +#pragma parameter beam_min_shape "Beam - Min Shape" 2.0 2.0 32.0 0.1 +#define beam_min_shape global.beam_min_shape +#pragma parameter beam_max_shape "Beam - Max Shape" 4.0 2.0 32.0 0.1 +#define beam_max_shape global.beam_max_shape +#pragma parameter beam_shape_power "Beam - Shape Power" 0.25 0.01 16.0 0.01 +#define beam_shape_power global.beam_shape_power +#pragma parameter beam_horiz_filter "Beam - Horiz Filter" 0.0 0.0 2.0 1.0 +#define beam_horiz_filter global.beam_horiz_filter +#pragma parameter beam_horiz_sigma "Beam - Horiz Sigma" 0.35 0.0 0.67 0.005 +#define beam_horiz_sigma global.beam_horiz_sigma +#pragma parameter beam_horiz_linear_rgb_weight "Beam - Horiz Linear RGB Weight" 1.0 0.0 1.0 0.01 +#pragma parameter convergence_offset_x_r "Convergence - Offset X Red" 0.0 -4.0 4.0 0.05 +#define convergence_offset_x_r global.convergence_offset_x_r +#pragma parameter convergence_offset_x_g "Convergence - Offset X Green" 0.0 -4.0 4.0 0.05 +#define convergence_offset_x_g global.convergence_offset_x_g +#pragma parameter convergence_offset_x_b "Convergence - Offset X Blue" 0.0 -4.0 4.0 0.05 +#define 
convergence_offset_x_b global.convergence_offset_x_b +#pragma parameter convergence_offset_y_r "Convergence - Offset Y Red" 0.0 -2.0 2.0 0.05 +#define convergence_offset_y_r global.convergence_offset_y_r +#pragma parameter convergence_offset_y_g "Convergence - Offset Y Green" 0.0 -2.0 2.0 0.05 +#define convergence_offset_y_g global.convergence_offset_y_g +#pragma parameter convergence_offset_y_b "Convergence - Offset Y Blue" 0.0 -2.0 2.0 0.05 +#define convergence_offset_y_b global.convergence_offset_y_b +#pragma parameter mask_type "Mask - Type" 1.0 0.0 2.0 1.0 +#define mask_type global.mask_type +#pragma parameter mask_sample_mode_desired "Mask - Sample Mode" 0.0 0.0 2.0 1.0 // Consider blocking mode 2. +#define mask_sample_mode_desired global.mask_sample_mode_desired +#pragma parameter mask_specify_num_triads "Mask - Specify Number of Triads" 0.0 0.0 1.0 1.0 +#pragma parameter mask_triad_size_desired "Mask - Triad Size Desired" 3.0 1.0 18.0 0.125 +#pragma parameter mask_num_triads_desired "Mask - Number of Triads Desired" 480.0 342.0 1920.0 1.0 +#pragma parameter aa_subpixel_r_offset_x_runtime "AA - Subpixel R Offset X" -0.333333333 -0.333333333 0.333333333 0.333333333 +#define aa_subpixel_r_offset_x_runtime global.aa_subpixel_r_offset_x_runtime +#pragma parameter aa_subpixel_r_offset_y_runtime "AA - Subpixel R Offset Y" 0.0 -0.333333333 0.333333333 0.333333333 +#define aa_subpixel_r_offset_y_runtime global.aa_subpixel_r_offset_y_runtime +#pragma parameter aa_cubic_c "AA - Cubic Sharpness" 0.5 0.0 4.0 0.015625 +#define aa_cubic_c global.aa_cubic_c +#pragma parameter aa_gauss_sigma "AA - Gaussian Sigma" 0.5 0.0625 1.0 0.015625 +#define aa_gauss_sigma global.aa_gauss_sigma +#pragma parameter geom_mode_runtime "Geometry - Mode" 0.0 0.0 3.0 1.0 +#define geom_mode_runtime global.geom_mode_runtime +#pragma parameter geom_radius "Geometry - Radius" 2.0 0.16 1024.0 0.1 +#define geom_radius global.geom_radius +#pragma parameter geom_view_dist "Geometry - View Distance" 
2.0 0.5 1024.0 0.25 +#define geom_view_dist global.geom_view_dist +#pragma parameter geom_tilt_angle_x "Geometry - Tilt Angle X" 0.0 -3.14159265 3.14159265 0.017453292519943295 +#define geom_tilt_angle_x global.geom_tilt_angle_x +#pragma parameter geom_tilt_angle_y "Geometry - Tilt Angle Y" 0.0 -3.14159265 3.14159265 0.017453292519943295 +#define geom_tilt_angle_y global.geom_tilt_angle_y +#pragma parameter geom_aspect_ratio_x "Geometry - Aspect Ratio X" 432.0 1.0 512.0 1.0 +#define geom_aspect_ratio_x global.geom_aspect_ratio_x +#pragma parameter geom_aspect_ratio_y "Geometry - Aspect Ratio Y" 329.0 1.0 512.0 1.0 +#define geom_aspect_ratio_y global.geom_aspect_ratio_y +#pragma parameter geom_overscan_x "Geometry - Overscan X" 1.0 0.00390625 4.0 0.00390625 +#define geom_overscan_x global.geom_overscan_x +#pragma parameter geom_overscan_y "Geometry - Overscan Y" 1.0 0.00390625 4.0 0.00390625 +#define geom_overscan_y global.geom_overscan_y +#pragma parameter border_size "Border - Size" 0.015 0.0000001 0.5 0.005 +#define border_size global.border_size +#pragma parameter border_darkness "Border - Darkness" 2.0 0.0 16.0 0.0625 +#define border_darkness global.border_darkness +#pragma parameter border_compress "Border - Compression" 2.5 1.0 64.0 0.0625 +#define border_compress global.border_compress +#pragma parameter interlace_bff "Interlacing - Bottom Field First" 0.0 0.0 1.0 1.0 +//#define interlace_bff global.interlace_bff +#pragma parameter interlace_1080i "Interlace - Detect 1080i" 0.0 0.0 1.0 1.0 +#define interlace_1080i global.interlace_1080i #endif \ No newline at end of file diff --git a/crt/shaders/crt-royale/src/phosphor-mask-resizing.h b/crt/shaders/crt-royale/src/phosphor-mask-resizing.h index 8508688..dc82562 100644 --- a/crt/shaders/crt-royale/src/phosphor-mask-resizing.h +++ b/crt/shaders/crt-royale/src/phosphor-mask-resizing.h @@ -40,76 +40,26 @@ #endif // No else needed: Dynamic loops assumed. 
- #define CALCULATE_R_COORD_FOR_4_SAMPLES \ - const vec4 true_i = vec4(i_base + i) + vec4(0.0, 1.0, 2.0, 3.0); \ - const vec4 tile_uv_r = fract( \ - first_texel_tile_uv_rrrr + true_i * tile_dr); \ - const vec4 tex_uv_r = tile_uv_r * tile_size_uv_r; - - #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ - CALCULATE_R_COORD_FOR_4_SAMPLES; \ - const vec3 new_sample0 = tex2Dlod0try(texture, \ - vec2(tex_uv.x, tex_uv_r.x)).rgb; \ - const vec3 new_sample1 = tex2Dlod0try(texture, \ - vec2(tex_uv.x, tex_uv_r.y)).rgb; \ - const vec3 new_sample2 = tex2Dlod0try(texture, \ - vec2(tex_uv.x, tex_uv_r.z)).rgb; \ - const vec3 new_sample3 = tex2Dlod0try(texture, \ - vec2(tex_uv.x, tex_uv_r.w)).rgb; \ - UPDATE_COLOR_AND_WEIGHT_SUMS; - - #define UPDATE_COLOR_AND_WEIGHT_SUMS \ - const vec4 dist = magnification_scale * \ - abs(first_dist_unscaled - true_i); \ - const vec4 pi_dist = pi * dist; \ - CALCULATE_SINC_RESAMPLE_WEIGHTS; \ - pixel_color += new_sample0 * weights.xxx; \ - pixel_color += new_sample1 * weights.yyy; \ - pixel_color += new_sample2 * weights.zzz; \ - pixel_color += new_sample3 * weights.www; \ - weight_sum += weights; - - #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW - #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ - const vec4 pi_dist_over_lobes = pi_over_lobes * dist; \ - const vec4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ - (pi_dist*pi_dist_over_lobes), vec4(1.0)); - #else - #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ - const vec4 weights = min(sin(pi_dist)/pi_dist, vec4(1.0)); - #endif - - #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ - CALCULATE_R_COORD_FOR_4_SAMPLES; \ - const vec3 new_sample0 = tex2Dlod0try(texture, \ - vec2(tex_uv_r.x, tex_uv.y)).rgb; \ - const vec3 new_sample1 = tex2Dlod0try(texture, \ - vec2(tex_uv_r.y, tex_uv.y)).rgb; \ - const vec3 new_sample2 = tex2Dlod0try(texture, \ - vec2(tex_uv_r.z, tex_uv.y)).rgb; \ - const vec3 new_sample3 = tex2Dlod0try(texture, \ - vec2(tex_uv_r.w, tex_uv.y)).rgb; \ - UPDATE_COLOR_AND_WEIGHT_SUMS; - 
////////////////////////////////// CONSTANTS ///////////////////////////////// // The larger the resized tile, the fewer samples we'll need for downsizing. // See if we can get a static min tile size > mask_min_allowed_tile_size: -const float mask_min_allowed_tile_size = ceil( +static const float mask_min_allowed_tile_size = ceil( mask_min_allowed_triad_size * mask_triads_per_tile); -const float mask_min_expected_tile_size = +static const float mask_min_expected_tile_size = mask_min_allowed_tile_size; // Limit the number of sinc resize taps by the maximum minification factor: -const float pi_over_lobes = pi/mask_sinc_lobes; -const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * +static const float pi_over_lobes = pi/mask_sinc_lobes; +static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes * mask_resize_src_lut_size.x/mask_min_expected_tile_size; // Vectorized loops sample in multiples of 4. Round up to be safe: -const float max_sinc_resize_samples_m4 = ceil( +static const float max_sinc_resize_samples_m4 = ceil( max_sinc_resize_samples_float * 0.25) * 4.0; - - ///////////////////////// RESAMPLING FUNCTION HELPERS //////////////////////// -float get_dynamic_loop_size(const float magnification_scale) + +///////////////////////// RESAMPLING FUNCTION HELPERS //////////////////////// + +inline float get_dynamic_loop_size(const float magnification_scale) { // Requires: The following global constants must be defined: // 1.) mask_sinc_lobes @@ -130,10 +80,10 @@ float get_dynamic_loop_size(const float magnification_scale) return min(min_samples_m4, max_samples_m4); } -vec2 get_first_texel_tile_uv_and_dist(const vec2 tex_uv, - const vec2 texture_size, const float dr, +float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, + const float2 tex_size, const float dr, const float input_tiles_per_texture_r, const float samples, - const bool vertical) + static const bool vertical) { // Requires: 1.) 
dr == du == 1.0/texture_size.x or // dr == dv == 1.0/texture_size.y @@ -151,216 +101,122 @@ vec2 get_first_texel_tile_uv_and_dist(const vec2 tex_uv, // so get the first sample location and distance. Modify both dimensions // as if we're doing a one-pass 2D resize; we'll throw away the unneeded // (and incorrect) dimension at the end. - const vec2 curr_texel = tex_uv * texture_size; - const vec2 prev_texel = - floor(curr_texel - vec2(under_half)) + vec2(0.5); - const vec2 first_texel = prev_texel - vec2(samples/2.0 - 1.0); - const vec2 first_texel_uv_wrap_2D = first_texel * dr; - const vec2 first_texel_dist_2D = curr_texel - first_texel; - // Convert from tex_uv to tile_uv coords so we can sub fracts for fmods. - const vec2 first_texel_tile_uv_wrap_2D = + const float2 curr_texel = tex_uv * tex_size; + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0); + const float2 first_texel_uv_wrap_2D = first_texel * dr; + const float2 first_texel_dist_2D = curr_texel - first_texel; + // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods. + const float2 first_texel_tile_uv_wrap_2D = first_texel_uv_wrap_2D * input_tiles_per_texture_r; // Project wrapped coordinates to the [0, 1] range. We'll do this with all // samples,but the first texel is special, since it might be negative. 
- vec2 coord_negative = vec2(0.0); - if(first_texel_tile_uv_wrap_2D.x < 0.0) coord_negative.x = first_texel_tile_uv_wrap_2D.x; - if(first_texel_tile_uv_wrap_2D.x < 0.0) coord_negative.y = first_texel_tile_uv_wrap_2D.y; - const vec2 first_texel_tile_uv_2D = - fract(first_texel_tile_uv_wrap_2D) + coord_negative; + const float2 coord_negative = + float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.)); + const float2 first_texel_tile_uv_2D = + frac(first_texel_tile_uv_wrap_2D) + coord_negative; // Pack the first texel's tile_uv coord and texel distance in 1D: - const vec2 tile_u_and_dist = - vec2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x); - const vec2 tile_v_and_dist = - vec2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y); + const float2 tile_u_and_dist = + float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x); + const float2 tile_v_and_dist = + float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y); return vertical ? tile_v_and_dist : tile_u_and_dist; - //return mix(tile_u_and_dist, tile_v_and_dist, float(vertical)); + //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical)); } -vec4 tex2Dlod0try(const sampler2D tex, const vec2 tex_uv) +inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv) { // Mipmapping and anisotropic filtering get confused by sinc-resampling. 
// One [slow] workaround is to select the lowest mip level: #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD - return tex2Dlod(tex, vec4(tex_uv, 0.0, 0.0)); + return textureLod(tex, float4(tex_uv, 0.0, 0.0).xy); #else #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS - return tex2Dbias(tex, vec4(tex_uv, 0.0, -16.0)); + return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0)); #else return texture(tex, tex_uv); #endif #endif } - -//////////////////////////// TILE SIZE CALCULATION /////////////////////////// -vec2 get_resized_mask_tile_size(const vec2 estimated_viewport_size, - const vec2 estimated_mask_resize_output_size, - const bool solemnly_swear_same_inputs_for_every_pass) -{ - // Requires: The following global constants must be defined according to - // certain constraints: - // 1.) mask_resize_num_triads: Must be high enough that our - // mask sampling method won't have artifacts later - // (long story; see derived-settings-and-constants.h) - // 2.) mask_resize_src_lut_size: Texel size of our mask LUT - // 3.) mask_triads_per_tile: Num horizontal triads in our LUT - // 4.) mask_min_allowed_triad_size: User setting (the more - // restrictive it is, the faster the resize will go) - // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x - // 6.) mask_triad_size_desired_{runtime, static} - // 7.) mask_num_triads_desired_{runtime, static} - // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) - // The function parameters must be defined as follows: - // 1.) estimated_viewport_size == (final viewport size); - // If mask_specify_num_triads is 1.0/true and the viewport - // estimate is wrong, the number of triads will differ from - // the user's preference by about the same factor. - // 2.) estimated_mask_resize_output_size: Must equal the - // output size of the MASK_RESIZE pass. - // Exception: The x component may be estimated garbage if - // and only if the caller throws away the x result. - // 3.) 
solemnly_swear_same_inputs_for_every_pass: Set to false, - // unless you can guarantee that every call across every - // pass will use the same sizes for the other parameters. - // When calling this across multiple passes, always use the - // same y viewport size/scale, and always use the same x - // viewport size/scale when using the x result. - // Returns: Return the final size of a manually resized mask tile, after - // constraining the desired size to avoid artifacts. Under - // unusual circumstances, tiles may become stretched vertically - // (see wall of text below). - // Stated tile properties must be correct: - const float tile_aspect_ratio_inv = - mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; - const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; - const vec2 tile_aspect = vec2(1.0, tile_aspect_ratio_inv); - // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is - // wrong, the user preference will be misinterpreted: - const float desired_tile_size_x = mask_triads_per_tile * mix( - params.mask_triad_size_desired, - estimated_viewport_size.x / params.mask_num_triads_desired, - params.mask_specify_num_triads); - if(params.mask_sample_mode_desired > 0.5) - { - // We don't need constraints unless we're sampling MASK_RESIZE. - return desired_tile_size_x * tile_aspect; - } - // Make sure we're not upsizing: - const float temp_tile_size_x = - min(desired_tile_size_x, mask_resize_src_lut_size.x); - // Enforce min_tile_size and max_tile_size in both dimensions: - const vec2 temp_tile_size = temp_tile_size_x * tile_aspect; - const vec2 min_tile_size = - mask_min_allowed_tile_size * tile_aspect; - const vec2 max_tile_size = - estimated_mask_resize_output_size / mask_resize_num_tiles; - const vec2 clamped_tile_size = - clamp(temp_tile_size, min_tile_size, max_tile_size); - // Try to maintain tile_aspect_ratio. This is the tricky part: - // If we're currently resizing in the y dimension, the x components - // could be MEANINGLESS. 
(If estimated_mask_resize_output_size.x is - // bogus, then so is max_tile_size.x and clamped_tile_size.x.) - // We can't adjust the y size based on clamped_tile_size.x. If it - // clamps when it shouldn't, it won't clamp again when later passes - // call this function with the correct sizes, and the discrepancy will - // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit - // the x size based on the y size, but not vice versa, unless the - // caller swears the parameters were the same (correct) in every pass. - // As a result, triads could appear vertically stretched if: - // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide - // LUT's might clamp x more than y (all provided LUT's are square) - // b.) true_viewport_size.x < true_viewport_size.y: The user is playing - // with a vertically oriented screen (not accounted for anyway) - // c.) mask_resize_viewport_scale.x < masked_resize_viewport_scale.y: - // Viewport scales are equal by default. - // If any of these are the case, you can fix the stretching by setting: - // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * - // (1.0 / min_expected_aspect_ratio) * - // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) - const float x_tile_size_from_y = - clamped_tile_size.y * tile_aspect_ratio; - const float y_tile_size_from_x = mix(clamped_tile_size.y, - clamped_tile_size.x * tile_aspect_ratio_inv, - float(solemnly_swear_same_inputs_for_every_pass)); - const vec2 reclamped_tile_size = vec2( - min(clamped_tile_size.x, x_tile_size_from_y), - min(clamped_tile_size.y, y_tile_size_from_x)); - // We need integer tile sizes in both directions for tiled sampling to - // work correctly. 
Use floor (to make sure we don't round up), but be - // careful to avoid a rounding bug where floor decreases whole numbers: - const vec2 final_resized_tile_size = - floor(reclamped_tile_size + vec2(FIX_ZERO(0.0))); - return final_resized_tile_size; -} -///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// +////////////////////////////// LOOP BODY MACROS ////////////////////////////// + +// Using inline functions can exceed the temporary register limit, so we're +// stuck with #define macros (I'm TRULY sorry). They're declared here instead +// of above to be closer to the actual invocation sites. Steps: +// 1.) Get the exact texel location. +// 2.) Sample the phosphor mask (already assumed encoded in linear RGB). +// 3.) Get the distance from the current pixel and sinc weight: +// sinc(dist) = sin(pi * dist)/(pi * dist) +// We can also use the slower/smoother Lanczos instead: +// L(x) = sinc(dist) * sinc(dist / lobes) +// 4.) Accumulate the weight sum in weights, and accumulate the weighted texels +// in pixel_color (we'll normalize outside the loop at the end). +// We vectorize the loop to help reduce the Lanczos window's cost. + + // The r coord is the coord in the dimension we're resizing along (u or v), + // and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v + // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord + // for four new texel samples. 
+ #define CALCULATE_R_COORD_FOR_4_SAMPLES \ + const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \ + const float4 tile_uv_r = frac( \ + first_texel_tile_uv_rrrr + true_i * tile_dr); \ + const float4 tex_uv_r = tile_uv_r * tile_size_uv_r; + + #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 pi_dist_over_lobes = pi_over_lobes * dist; \ + const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\ + (pi_dist*pi_dist_over_lobes), float4(1.0)); + #else + #define CALCULATE_SINC_RESAMPLE_WEIGHTS \ + const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0)); + #endif + + #define UPDATE_COLOR_AND_WEIGHT_SUMS \ + const float4 dist = magnification_scale * \ + abs(first_dist_unscaled - true_i); \ + const float4 pi_dist = pi * dist; \ + CALCULATE_SINC_RESAMPLE_WEIGHTS; \ + pixel_color += new_sample0 * weights.xxx; \ + pixel_color += new_sample1 * weights.yyy; \ + pixel_color += new_sample2 * weights.zzz; \ + pixel_color += new_sample3 * weights.www; \ + weight_sum += weights; + + #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.x)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.z)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv.x, tex_uv_r.w)).rgb; \ + UPDATE_COLOR_AND_WEIGHT_SUMS; + + #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \ + CALCULATE_R_COORD_FOR_4_SAMPLES; \ + const float3 new_sample0 = tex2Dlod0try(tex, \ + float2(tex_uv_r.x, tex_uv.y)).rgb; \ + const float3 new_sample1 = tex2Dlod0try(tex, \ + float2(tex_uv_r.y, tex_uv.y)).rgb; \ + const float3 new_sample2 = tex2Dlod0try(tex, \ + float2(tex_uv_r.z, tex_uv.y)).rgb; \ + const float3 new_sample3 = tex2Dlod0try(tex, \ + float2(tex_uv_r.w, tex_uv.y)).rgb; \ + 
UPDATE_COLOR_AND_WEIGHT_SUMS; -vec4 get_mask_sampling_parameters(const vec2 mask_resize_texture_size, - const vec2 mask_resize_video_size, const vec2 true_viewport_size, - out vec2 mask_tiles_per_screen) -{ - // Requires: 1.) Requirements of get_resized_mask_tile_size() must be - // met, particularly regarding global constants. - // The function parameters must be defined as follows: - // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size - // if get_mask_sample_mode() is 0 (otherwise anything) - // 2.) mask_resize_video_size == MASK_RESIZE.video_size - // if get_mask_sample_mode() is 0 (otherwise anything) - // 3.) true_viewport_size == IN.output_size for a pass set to - // 1.0 viewport scale (i.e. it must be correct) - // Returns: Return a vec4 containing: - // xy: tex_uv coords for the start of the mask tile - // zw: tex_uv size of the mask tile from start to end - // mask_tiles_per_screen is an out parameter containing the - // number of mask tiles that will fit on the screen. - // First get the final resized tile size. The viewport size and mask - // resize viewport scale must be correct, but don't solemnly swear they - // were correct in both mask resize passes unless you know it's true. - // (We can better ensure a correct tile aspect ratio if the parameters are - // guaranteed correct in all passes...but if we lie, we'll get inconsistent - // sizes across passes, resulting in broken texture coordinates.) 
- const float mask_sample_mode = params.mask_sample_mode_desired;//get_mask_sample_mode(); - const vec2 mask_resize_tile_size = get_resized_mask_tile_size( - true_viewport_size, mask_resize_video_size, false); - if(mask_sample_mode < 0.5) - { - // Sample MASK_RESIZE: The resized tile is a fracttion of the texture - // size and starts at a nonzero offset to allow for border texels: - const vec2 mask_tile_uv_size = mask_resize_tile_size / - mask_resize_texture_size; - const vec2 skipped_tiles = mask_start_texels/mask_resize_tile_size; - const vec2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; - // mask_tiles_per_screen must be based on the *true* viewport size: - mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; - return vec4(mask_tile_start_uv, mask_tile_uv_size); - } - else - { - // If we're tiling at the original size (1:1 pixel:texel), redefine a - // "tile" to be the full texture containing many triads. Otherwise, - // we're hardware-resampling an LUT, and the texture truly contains a - // single unresized phosphor mask tile anyway. 
- const vec2 mask_tile_uv_size = vec2(1.0); - const vec2 mask_tile_start_uv = vec2(0.0); - if(mask_sample_mode > 1.5) - { - // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: - mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; - } - else - { - // Hardware-resize the original LUT: - mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; - } - return vec4(mask_tile_start_uv, mask_tile_uv_size); - } -} //////////////////////////// RESAMPLING FUNCTIONS //////////////////////////// -vec3 downsample_vertical_sinc_tiled(const sampler2D texture, - const vec2 tex_uv, const vec2 texture_size, const float dr, - const float magnification_scale, const float tile_size_uv_r) +float3 downsample_vertical_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, static const float dr, + const float magnification_scale, static const float tile_size_uv_r) { // Requires: 1.) dr == du == 1.0/texture_size.x or // dr == dv == 1.0/texture_size.y @@ -381,29 +237,29 @@ vec3 downsample_vertical_sinc_tiled(const sampler2D texture, #ifdef USE_SINGLE_STATIC_LOOP // A static loop can be faster, but it might blur too much from using // more samples than it should. 
- const int samples = int(max_sinc_resize_samples_m4); + static const int samples = int(max_sinc_resize_samples_m4); #else const int samples = int(get_dynamic_loop_size(magnification_scale)); #endif // Get the first sample location (scalar tile uv coord along the resized // dimension) and distance from the output location (in texels): - const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; + static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; // true = vertical resize: - const vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( - tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, true); - const vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; - const vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; // Get the tile sample offset: - const float tile_dr = dr * input_tiles_per_texture_r; + static const float tile_dr = dr * input_tiles_per_texture_r; // Sum up each weight and weighted sample color, varying the looping // strategy based on our expected dynamic loop capabilities. See the // loop body macros above. 
int i_base = 0; - vec4 weight_sum = vec4(0.0); - vec3 pixel_color = vec3(0.0); - const int i_step = 4; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; #ifdef BREAK_LOOPS_INTO_PIECES if(samples - i_base >= 64) { @@ -460,14 +316,14 @@ vec3 downsample_vertical_sinc_tiled(const sampler2D texture, } #endif // Normalize so the weight_sum == 1.0, and return: - const vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; - const vec3 scalar_weight_sum = vec3(weight_sum_reduce.x + + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + weight_sum_reduce.y); return (pixel_color/scalar_weight_sum); } -vec3 downsample_horizontal_sinc_tiled(const sampler2D texture, - const vec2 tex_uv, const vec2 texture_size, const float dr, +float3 downsample_horizontal_sinc_tiled(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, const float dr, const float magnification_scale, const float tile_size_uv_r) { // Differences from downsample_horizontal_sinc_tiled: @@ -486,7 +342,7 @@ vec3 downsample_horizontal_sinc_tiled(const sampler2D texture, // we're resizing along, e.g. "dx" in this case. #ifdef USE_SINGLE_STATIC_LOOP // If we have to load all samples, we might as well use them. 
- const int samples = int(max_sinc_resize_samples_m4); + static const int samples = int(max_sinc_resize_samples_m4); #else const int samples = int(get_dynamic_loop_size(magnification_scale)); #endif @@ -495,10 +351,10 @@ vec3 downsample_horizontal_sinc_tiled(const sampler2D texture, // dimension) and distance from the output location (in texels): const float input_tiles_per_texture_r = 1.0/tile_size_uv_r; // false = horizontal resize: - const vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( - tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, false); - const vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; - const vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; + const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist( + tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false); + const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx; + const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy; // Get the tile sample offset: const float tile_dr = dr * input_tiles_per_texture_r; @@ -506,9 +362,9 @@ vec3 downsample_horizontal_sinc_tiled(const sampler2D texture, // strategy based on our expected dynamic loop capabilities. See the // loop body macros above. 
int i_base = 0; - vec4 weight_sum = vec4(0.0); - vec3 pixel_color = vec3(0.0); - const int i_step = 4; + float4 weight_sum = float4(0.0); + float3 pixel_color = float3(0.0); + static const int i_step = 4; #ifdef BREAK_LOOPS_INTO_PIECES if(samples - i_base >= 64) { @@ -565,47 +421,243 @@ vec3 downsample_horizontal_sinc_tiled(const sampler2D texture, } #endif // Normalize so the weight_sum == 1.0, and return: - const vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; - const vec3 scalar_weight_sum = vec3(weight_sum_reduce.x + + const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw; + const float3 scalar_weight_sum = float3(weight_sum_reduce.x + weight_sum_reduce.y); return (pixel_color/scalar_weight_sum); } -vec2 convert_phosphor_tile_uv_wrap_to_tex_uv(const vec2 tile_uv_wrap, - const vec4 mask_tile_start_uv_and_size) + +//////////////////////////// TILE SIZE CALCULATION /////////////////////////// + +float2 get_resized_mask_tile_size(const float2 estimated_viewport_size, + const float2 estimated_mask_resize_output_size, + const bool solemnly_swear_same_inputs_for_every_pass) +{ + // Requires: The following global constants must be defined according to + // certain constraints: + // 1.) mask_resize_num_triads: Must be high enough that our + // mask sampling method won't have artifacts later + // (long story; see derived-settings-and-constants.h) + // 2.) mask_resize_src_lut_size: Texel size of our mask LUT + // 3.) mask_triads_per_tile: Num horizontal triads in our LUT + // 4.) mask_min_allowed_triad_size: User setting (the more + // restrictive it is, the faster the resize will go) + // 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x + // 6.) mask_triad_size_desired_{runtime, static} + // 7.) mask_num_triads_desired_{runtime, static} + // 8.) mask_specify_num_triads must be 0.0/1.0 (false/true) + // The function parameters must be defined as follows: + // 1.) 
estimated_viewport_size == (final viewport size); + // If mask_specify_num_triads is 1.0/true and the viewport + // estimate is wrong, the number of triads will differ from + // the user's preference by about the same factor. + // 2.) estimated_mask_resize_output_size: Must equal the + // output size of the MASK_RESIZE pass. + // Exception: The x component may be estimated garbage if + // and only if the caller throws away the x result. + // 3.) solemnly_swear_same_inputs_for_every_pass: Set to false, + // unless you can guarantee that every call across every + // pass will use the same sizes for the other parameters. + // When calling this across multiple passes, always use the + // same y viewport size/scale, and always use the same x + // viewport size/scale when using the x result. + // Returns: Return the final size of a manually resized mask tile, after + // constraining the desired size to avoid artifacts. Under + // unusual circumstances, tiles may become stretched vertically + // (see wall of text below). + // Stated tile properties must be correct: + static const float tile_aspect_ratio_inv = + mask_resize_src_lut_size.y/mask_resize_src_lut_size.x; + static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv; + static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv); + // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is + // wrong, the user preference will be misinterpreted: + const float desired_tile_size_x = mask_triads_per_tile * lerp( + global.mask_triad_size_desired, + estimated_viewport_size.x / global.mask_num_triads_desired, + global.mask_specify_num_triads); + if(get_mask_sample_mode() > 0.5) + { + // We don't need constraints unless we're sampling MASK_RESIZE. 
+ return desired_tile_size_x * tile_aspect; + } + // Make sure we're not upsizing: + const float temp_tile_size_x = + min(desired_tile_size_x, mask_resize_src_lut_size.x); + // Enforce min_tile_size and max_tile_size in both dimensions: + const float2 temp_tile_size = temp_tile_size_x * tile_aspect; + static const float2 min_tile_size = + mask_min_allowed_tile_size * tile_aspect; + const float2 max_tile_size = + estimated_mask_resize_output_size / mask_resize_num_tiles; + const float2 clamped_tile_size = + clamp(temp_tile_size, min_tile_size, max_tile_size); + // Try to maintain tile_aspect_ratio. This is the tricky part: + // If we're currently resizing in the y dimension, the x components + // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is + // bogus, then so is max_tile_size.x and clamped_tile_size.x.) + // We can't adjust the y size based on clamped_tile_size.x. If it + // clamps when it shouldn't, it won't clamp again when later passes + // call this function with the correct sizes, and the discrepancy will + // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit + // the x size based on the y size, but not vice versa, unless the + // caller swears the parameters were the same (correct) in every pass. + // As a result, triads could appear vertically stretched if: + // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide + // LUT's might clamp x more than y (all provided LUT's are square) + // b.) true_viewport_size.x < true_viewport_size.y: The user is playing + // with a vertically oriented screen (not accounted for anyway) + // c.) mask_resize_viewport_scale.x < masked_resize_viewport_scale.y: + // Viewport scales are equal by default. 
+ // If any of these are the case, you can fix the stretching by setting: + // mask_resize_viewport_scale.x = mask_resize_viewport_scale.y * + // (1.0 / min_expected_aspect_ratio) * + // (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y) + const float x_tile_size_from_y = + clamped_tile_size.y * tile_aspect_ratio; + const float y_tile_size_from_x = lerp(clamped_tile_size.y, + clamped_tile_size.x * tile_aspect_ratio_inv, + float(solemnly_swear_same_inputs_for_every_pass)); + const float2 reclamped_tile_size = float2( + min(clamped_tile_size.x, x_tile_size_from_y), + min(clamped_tile_size.y, y_tile_size_from_x)); + // We need integer tile sizes in both directions for tiled sampling to + // work correctly. Use floor (to make sure we don't round up), but be + // careful to avoid a rounding bug where floor decreases whole numbers: + const float2 final_resized_tile_size = + floor(reclamped_tile_size + float2(FIX_ZERO(0.0))); + return final_resized_tile_size; +} + + +///////////////////////// FINAL MASK SAMPLING HELPERS //////////////////////// + +float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size, + const float2 mask_resize_video_size, const float2 true_viewport_size, + out float2 mask_tiles_per_screen) +{ + // Requires: 1.) Requirements of get_resized_mask_tile_size() must be + // met, particularly regarding global constants. + // The function parameters must be defined as follows: + // 1.) mask_resize_texture_size == MASK_RESIZE.texture_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 2.) mask_resize_video_size == MASK_RESIZE.video_size + // if get_mask_sample_mode() is 0 (otherwise anything) + // 3.) true_viewport_size == IN.output_size for a pass set to + // 1.0 viewport scale (i.e. 
it must be correct) + // Returns: Return a float4 containing: + // xy: tex_uv coords for the start of the mask tile + // zw: tex_uv size of the mask tile from start to end + // mask_tiles_per_screen is an out parameter containing the + // number of mask tiles that will fit on the screen. + // First get the final resized tile size. The viewport size and mask + // resize viewport scale must be correct, but don't solemnly swear they + // were correct in both mask resize passes unless you know it's true. + // (We can better ensure a correct tile aspect ratio if the parameters are + // guaranteed correct in all passes...but if we lie, we'll get inconsistent + // sizes across passes, resulting in broken texture coordinates.) + const float mask_sample_mode = get_mask_sample_mode(); + const float2 mask_resize_tile_size = get_resized_mask_tile_size( + true_viewport_size, mask_resize_video_size, false); + if(mask_sample_mode < 0.5) + { + // Sample MASK_RESIZE: The resized tile is a fraction of the texture + // size and starts at a nonzero offset to allow for border texels: + const float2 mask_tile_uv_size = mask_resize_tile_size / + mask_resize_texture_size; + const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size; + const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size; + // mask_tiles_per_screen must be based on the *true* viewport size: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + return float4(mask_tile_start_uv, mask_tile_uv_size); + } + else + { + // If we're tiling at the original size (1:1 pixel:texel), redefine a + // "tile" to be the full texture containing many triads. Otherwise, + // we're hardware-resampling an LUT, and the texture truly contains a + // single unresized phosphor mask tile anyway. 
+ static const float2 mask_tile_uv_size = float2(1.0); + static const float2 mask_tile_start_uv = float2(0.0); + if(mask_sample_mode > 1.5) + { + // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing: + mask_tiles_per_screen = true_viewport_size/mask_texture_large_size; + } + else + { + // Hardware-resize the original LUT: + mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size; + } + return float4(mask_tile_start_uv, mask_tile_uv_size); + } +} +/* +float2 fix_tiling_discontinuities_normalized(const float2 tile_uv, + float2 duv_dx, float2 duv_dy) +{ + // Requires: 1.) duv_dx == ddx(tile_uv) + // 2.) duv_dy == ddy(tile_uv) + // 3.) tile_uv contains tile-relative uv coords in [0, 1], + // such that (0.5, 0.5) is the center of a tile, etc. + // ("Tile" can mean texture, the video embedded in the + // texture, or some other "tile" embedded in a texture.) + // Returns: Return new tile_uv coords that contain no discontinuities + // across a 2x2 pixel quad. + // Description: + // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the + // derivatives, which we assume happened if the absolute difference between + // any fragment in a 2x2 block is > ~half a tile. If the current block has + // a u or v discontinuity and the current fragment is in the first half of + // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile + // to that coord to make the 2x2 block continuous. (It will now have a + // coord > 1.0 in the padding area beyond the tile.) This function takes + // derivatives as parameters so the caller can reuse them. 
+ // In case we're using high-quality (nVidia-style) derivatives, ensure + // diagonically opposite fragments see each other for correctness: + duv_dx = abs(duv_dx) + abs(ddy(duv_dx)); + duv_dy = abs(duv_dy) + abs(ddx(duv_dy)); + const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5)); + const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5)); + return tile_uv + jump_exists * pixel_in_first_half_tile; +} +*/ +float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap, + const float4 mask_tile_start_uv_and_size) { // Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the // tile spans from [0, 1], such that (0.5, 0.5) is at the // tile center. The input coords can range from [0, inf], - // and their fracttional parts map to a repeated tile. + // and their fractional parts map to a repeated tile. // ("Tile" can mean texture, the video embedded in the // texture, or some other "tile" embedded in a texture.) // 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords // for the start of the embedded tile in the full texture. - // 3.) mask_tile_start_uv_and_size.zw contains the [fracttional] + // 3.) mask_tile_start_uv_and_size.zw contains the [fractional] // tex_uv size of the embedded tile in the full texture. // Returns: Return tex_uv coords (used for texture sampling) // corresponding to tile_uv_wrap. - if(params.mask_sample_mode_desired < 0.5) + if(get_mask_sample_mode() < 0.5) { // Manually repeat the resized mask tile to fill the screen: - // First get fracttional tile_uv coords. Using fract/fmod on coords + // First get fractional tile_uv coords. Using frac/fmod on coords // confuses anisotropic filtering; fix it as user options dictate. // derived-settings-and-constants.h disables incompatible options. 
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE - vec2 tile_uv = fract(tile_uv_wrap * 0.5) * 2.0; + float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0; #else - vec2 tile_uv = fract(tile_uv_wrap); + float2 tile_uv = frac(tile_uv_wrap); #endif #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES - const vec2 tile_uv_dx = ddx(tile_uv); - const vec2 tile_uv_dy = ddy(tile_uv); + const float2 tile_uv_dx = ddx(tile_uv); + const float2 tile_uv_dy = ddy(tile_uv); tile_uv = fix_tiling_discontinuities_normalized(tile_uv, tile_uv_dx, tile_uv_dy); #endif // The tile is embedded in a padded FBO, and it may start at a // nonzero offset if border texels are used to avoid artifacts: - const vec2 mask_tex_uv = mask_tile_start_uv_and_size.xy + + const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy + tile_uv * mask_tile_start_uv_and_size.zw; return mask_tex_uv; } @@ -620,5 +672,6 @@ vec2 convert_phosphor_tile_uv_wrap_to_tex_uv(const vec2 tile_uv_wrap, } } + #endif // PHOSPHOR_MASK_RESIZING_H diff --git a/crt/shaders/crt-royale/src/scanline-functions.h b/crt/shaders/crt-royale/src/scanline-functions.h index 5169b3d..9c4f9e5 100644 --- a/crt/shaders/crt-royale/src/scanline-functions.h +++ b/crt/shaders/crt-royale/src/scanline-functions.h @@ -27,284 +27,10 @@ #include "../../../../include/special-functions.h" #include "../../../../include/gamma-management.h" + ///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// -vec3 get_raw_interpolated_color(const vec3 color0, - const vec3 color1, const vec3 color2, const vec3 color3, - const vec4 weights) -{ - // Use max to avoid bizarre artifacts from negative colors: - return max(mat4x3(color0, color1, color2, color3) * weights, 0.0); -} - -vec3 get_interpolated_linear_color(const vec3 color0, const vec3 color1, - const vec3 color2, const vec3 color3, const vec4 weights) -{ - // Requires: 1.) 
Requirements of include/gamma-management.h must be met: - // intermediate_gamma must be globally defined, and input - // colors are interpreted as linear RGB unless you #define - // GAMMA_ENCODE_EVERY_FBO (in which case they are - // interpreted as gamma-encoded with intermediate_gamma). - // 2.) color0-3 are colors sampled from a texture with tex2D(). - // They are interpreted as defined in requirement 1. - // 3.) weights contains weights for each color, summing to 1.0. - // 4.) beam_horiz_linear_rgb_weight must be defined as a global - // float in [0.0, 1.0] describing how much blending should - // be done in linear RGB (rest is gamma-corrected RGB). - // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined - // if beam_horiz_linear_rgb_weight is anything other than a - // static constant, or we may try branching at runtime - // without dynamic branches allowed (slow). - // Returns: Return an interpolated color lookup between the four input - // colors based on the weights in weights. The final color will - // be a linear RGB value, but the blending will be done as - // indicated above. 
- const float intermediate_gamma = get_intermediate_gamma(); - // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the - // profile allows dynamic branches (faster than computing extra pows): - #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE - #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT - #else - #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES - #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT - #endif - #endif - #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT - // beam_horiz_linear_rgb_weight is static, so we can branch: - #ifdef GAMMA_ENCODE_EVERY_FBO - const vec3 gamma_mixed_color = pow(get_raw_interpolated_color( - color0, color1, color2, color3, weights), vec3(intermediate_gamma)); - if(beam_horiz_linear_rgb_weight > 0.0) - { - const vec3 linear_mixed_color = get_raw_interpolated_color( - pow(color0, vec3(intermediate_gamma)), - pow(color1, vec3(intermediate_gamma)), - pow(color2, vec3(intermediate_gamma)), - pow(color3, vec3(intermediate_gamma)), - weights); - return mix(gamma_mixed_color, linear_mixed_color, - beam_horiz_linear_rgb_weight); - } - else - { - return gamma_mixed_color; - } - #else - const vec3 linear_mixed_color = get_raw_interpolated_color( - color0, color1, color2, color3, weights); - if(beam_horiz_linear_rgb_weight < 1.0) - { - const vec3 gamma_mixed_color = get_raw_interpolated_color( - pow(color0, vec3(1.0/intermediate_gamma)), - pow(color1, vec3(1.0/intermediate_gamma)), - pow(color2, vec3(1.0/intermediate_gamma)), - pow(color3, vec3(1.0/intermediate_gamma)), - weights); - return mix(gamma_mixed_color, linear_mixed_color, - beam_horiz_linear_rgb_weight); - } - else - { - return linear_mixed_color; - } - #endif // GAMMA_ENCODE_EVERY_FBO - #else - #ifdef GAMMA_ENCODE_EVERY_FBO - // Inputs: color0-3 are colors in gamma-encoded RGB. 
- const vec3 gamma_mixed_color = pow(get_raw_interpolated_color( - color0, color1, color2, color3, weights), vec3(intermediate_gamma)); - const vec3 linear_mixed_color = get_raw_interpolated_color( - pow(color0, vec3(intermediate_gamma)), - pow(color1, vec3(intermediate_gamma)), - pow(color2, vec3(intermediate_gamma)), - pow(color3, vec3(intermediate_gamma)), - weights); - return mix(gamma_mixed_color, linear_mixed_color, - beam_horiz_linear_rgb_weight); - #else - // Inputs: color0-3 are colors in linear RGB. - const vec3 linear_mixed_color = get_raw_interpolated_color( - color0, color1, color2, color3, weights); - const vec3 gamma_mixed_color = get_raw_interpolated_color( - pow(color0, vec3(1.0/intermediate_gamma)), - pow(color1, vec3(1.0/intermediate_gamma)), - pow(color2, vec3(1.0/intermediate_gamma)), - pow(color3, vec3(1.0/intermediate_gamma)), - weights); - return mix(gamma_mixed_color, linear_mixed_color, - beam_horiz_linear_rgb_weight); - #endif // GAMMA_ENCODE_EVERY_FBO - #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT -} - -vec3 get_scanline_color(const sampler2D tex, const vec2 scanline_uv, - const vec2 uv_step_x, const vec4 weights) -{ - // Requires: 1.) scanline_uv must be vertically snapped to the caller's - // desired line or scanline and horizontally snapped to the - // texel just left of the output pixel (color1) - // 2.) uv_step_x must contain the horizontal uv distance - // between texels. - // 3.) weights must contain interpolation filter weights for - // color0, color1, color2, and color3, where color1 is just - // left of the output pixel. - // Returns: Return a horizontally interpolated texture lookup using 2-4 - // nearby texels, according to weights and the conventions of - // get_interpolated_linear_color(). - // We can ignore the outside texture lookups for Quilez resampling. 
- const vec3 color1 = texture(tex, scanline_uv).rgb; - const vec3 color2 = texture(tex, scanline_uv + uv_step_x).rgb; - vec3 color0 = vec3(0.0); - vec3 color3 = vec3(0.0); - if(params.beam_horiz_filter > 0.5) - { - color0 = texture(tex, scanline_uv - uv_step_x).rgb; - color3 = texture(tex, scanline_uv + 2.0 * uv_step_x).rgb; - } - // Sample the texture as-is, whether it's linear or gamma-encoded: - // get_interpolated_linear_color() will handle the difference. - return get_interpolated_linear_color(color0, color1, color2, color3, weights); -} - -vec3 sample_single_scanline_horizontal(const sampler2D texture, - const vec2 tex_uv, const vec2 texture_size, - const vec2 texture_size_inv) -{ - // TODO: Add function requirements. - // Snap to the previous texel and get sample dists from 2/4 nearby texels: - const vec2 curr_texel = tex_uv * texture_size; - // Use under_half to fix a rounding bug right around exact texel locations. - const vec2 prev_texel = - floor(curr_texel - vec2(under_half)) + vec2(0.5); - const vec2 prev_texel_hor = vec2(prev_texel.x, curr_texel.y); - const vec2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; - const float prev_dist = curr_texel.x - prev_texel_hor.x; - const vec4 sample_dists = vec4(1.0 + prev_dist, prev_dist, - 1.0 - prev_dist, 2.0 - prev_dist); - // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: - vec4 weights; - if(params.beam_horiz_filter < 0.5) - { - // Quilez: - const float x = sample_dists.y; - const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); - weights = vec4(0.0, 1.0 - w2, w2, 0.0); - } - else if(params.beam_horiz_filter < 1.5) - { - // Gaussian: - float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); - weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); - } - else - { - // Lanczos2: - const vec4 pi_dists = FIX_ZERO(sample_dists * pi); - weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / - (pi_dists * pi_dists); - } - // Ensure the weight sum == 1.0: - const vec4 
final_weights = weights/dot(weights, vec4(1.0)); - // Get the interpolated horizontal scanline color: - const vec2 uv_step_x = vec2(texture_size_inv.x, 0.0); - return get_scanline_color( - texture, prev_texel_hor_uv, uv_step_x, final_weights); -} - -bool is_interlaced(float num_lines) -{ - // Detect interlacing based on the number of lines in the source. - if(interlace_detect == true) - { - // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field - // NTSC Emulators: Typically 224 or 240 lines - // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field - // PAL Emulators: ? - // ATSC: 720p, 1080i, 1080p - // Where do we place our cutoffs? Assumptions: - // 1.) We only need to care about active lines. - // 2.) Anything > 288 and <= 576 lines is probably interlaced. - // 3.) Anything > 576 lines is probably not interlaced... - // 4.) ...except 1080 lines, which is a crapshoot (user decision). - // 5.) Just in case the main program uses calculated video sizes, - // we should nudge the float thresholds a bit. - bool sd_interlace; - if (num_lines > 288.5 && num_lines < 576.5) - {sd_interlace = true;} - else - {sd_interlace = false;} - bool hd_interlace; - if (num_lines > 1079.5 && num_lines < 1080.5) - {hd_interlace = true;} - else - {hd_interlace = false;} - return (sd_interlace || hd_interlace); - } - else - { - return false; - } -} - -vec3 sample_rgb_scanline_horizontal(const sampler2D tex, - const vec2 tex_uv, const vec2 texture_size, - const vec2 texture_size_inv) -{ - // TODO: Add function requirements. - // Rely on a helper to make convergence easier. 
- if(beam_misconvergence == true) - { - const vec3 convergence_offsets_rgb = - get_convergence_offsets_x_vector(); - const vec3 offset_u_rgb = - convergence_offsets_rgb * texture_size_inv.xxx; - const vec2 scanline_uv_r = tex_uv - vec2(offset_u_rgb.r, 0.0); - const vec2 scanline_uv_g = tex_uv - vec2(offset_u_rgb.g, 0.0); - const vec2 scanline_uv_b = tex_uv - vec2(offset_u_rgb.b, 0.0); - const vec3 sample_r = sample_single_scanline_horizontal( - tex, scanline_uv_r, texture_size, texture_size_inv); - const vec3 sample_g = sample_single_scanline_horizontal( - tex, scanline_uv_g, texture_size, texture_size_inv); - const vec3 sample_b = sample_single_scanline_horizontal( - tex, scanline_uv_b, texture_size, texture_size_inv); - return vec3(sample_r.r, sample_g.g, sample_b.b); - } - else - { - return sample_single_scanline_horizontal(tex, tex_uv, texture_size, - texture_size_inv); - } -} - -vec2 get_last_scanline_uv(const vec2 tex_uv, const vec2 texture_size, - const vec2 texture_size_inv, const vec2 il_step_multiple, - const float frame_count, out float dist) -{ - // Compute texture coords for the last/upper scanline, accounting for - // interlacing: With interlacing, only consider even/odd scanlines every - // other frame. Top-field first (TFF) order puts even scanlines on even - // frames, and BFF order puts them on odd frames. Texels are centered at: - // frac(tex_uv * texture_size) == x.5 - // Caution: If these coordinates ever seem incorrect, first make sure it's - // not because anisotropic filtering is blurring across field boundaries. - // Note: TFF/BFF won't matter for sources that double-weave or similar. - const float field_offset = floor(il_step_multiple.y * 0.75) * - mod(frame_count + float(interlace_bff), 2.0); - const vec2 curr_texel = tex_uv * texture_size; - // Use under_half to fix a rounding bug right around exact texel locations. 
- const vec2 prev_texel_num = floor(curr_texel - vec2(under_half)); - const float wrong_field = mod( - prev_texel_num.y + field_offset, il_step_multiple.y); - const vec2 scanline_texel_num = prev_texel_num - vec2(0.0, wrong_field); - // Snap to the center of the previous scanline in the current field: - const vec2 scanline_texel = scanline_texel_num + vec2(0.5); - const vec2 scanline_uv = scanline_texel * texture_size_inv; - // Save the sample's distance from the scanline, in units of scanlines: - dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; - return scanline_uv; -} - -vec3 get_gaussian_sigma(const vec3 color, const float sigma_range) +inline float3 get_gaussian_sigma(const float3 color, const float sigma_range) { // Requires: Globals: // 1.) beam_min_sigma and beam_max_sigma are global floats @@ -356,19 +82,19 @@ vec3 get_gaussian_sigma(const vec3 color, const float sigma_range) if(beam_spot_shape_function < 0.5) { // Use a power function: - return vec3(beam_min_sigma) + sigma_range * - pow(color, vec3(beam_spot_power)); + return float3(beam_min_sigma) + sigma_range * + pow(color, float3(beam_spot_power)); } else { // Use a spherical function: - const vec3 color_minus_1 = color - vec3(1.0); - return vec3(beam_min_sigma) + sigma_range * - sqrt(vec3(1.0) - color_minus_1*color_minus_1); + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); } } -vec3 get_generalized_gaussian_beta(const vec3 color, +inline float3 get_generalized_gaussian_beta(const float3 color, const float shape_range) { // Requires: Globals: @@ -394,11 +120,11 @@ vec3 get_generalized_gaussian_beta(const vec3 color, // beta widen and sharpen peaks at the risk of aliasing. // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape // transitions, whereas lower ones sharpen them (at the risk of aliasing). 
- return beam_min_shape + shape_range * pow(color, vec3(beam_shape_power)); + return beam_min_shape + shape_range * pow(color, float3(beam_shape_power)); } -vec3 scanline_gaussian_integral_contrib(const vec3 dist, - const vec3 color, const float pixel_height, const float sigma_range) +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) { // Requires: 1.) dist is the distance of the [potentially separate R/G/B] // point(s) from a scanline in units of scanlines, where @@ -419,16 +145,16 @@ vec3 scanline_gaussian_integral_contrib(const vec3 dist, // average brightness over a given pixel area. Even if curved coords were // used in this pass, a flat scalar pixel height works almost as well as a // pixel height computed from a full pixel-space to scanline-space matrix. - const vec3 sigma = get_gaussian_sigma(color, sigma_range); - const vec3 ph_offset = vec3(pixel_height * 0.5); - const vec3 denom_inv = 1.0/(sigma*sqrt(2.0)); - const vec3 integral_high = erf((dist + ph_offset)*denom_inv); - const vec3 integral_low = erf((dist - ph_offset)*denom_inv); + const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); return color * 0.5*(integral_high - integral_low)/pixel_height; } -vec3 scanline_generalized_gaussian_integral_contrib(const vec3 dist, - const vec3 color, const float pixel_height, const float sigma_range, +float3 scanline_generalized_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range, const float shape_range) { // Requires: 1.) 
Requirements of scanline_gaussian_integral_contrib() @@ -450,44 +176,44 @@ vec3 scanline_generalized_gaussian_integral_contrib(const vec3 dist, // models models standard deviation at beta == 2, because the standard // deviation depends on both alpha and beta (keeping alpha independent is // faster and preserves intuitive behavior and a full spectrum of results). - const vec3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); - const vec3 beta = get_generalized_gaussian_beta(color, shape_range); - const vec3 alpha_inv = vec3(1.0)/alpha; - const vec3 s = vec3(1.0)/beta; - const vec3 ph_offset = vec3(pixel_height * 0.5); + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); // Pass beta to gamma_impl to avoid repeated divides. Similarly pass // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. 
- const vec3 gamma_s_inv = vec3(1.0)/gamma_impl(s, beta); - const vec3 dist1 = dist + ph_offset; - const vec3 dist0 = dist - ph_offset; - const vec3 integral_high = sign(dist1) * normalized_ligamma_impl( + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); - const vec3 integral_low = sign(dist0) * normalized_ligamma_impl( + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); return color * 0.5*(integral_high - integral_low)/pixel_height; } -vec3 scanline_gaussian_sampled_contrib(const vec3 dist, const vec3 color, +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, const float pixel_height, const float sigma_range) { // See scanline_gaussian integral_contrib() for detailed comments! 
// gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) - const vec3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 sigma = get_gaussian_sigma(color, sigma_range); // Avoid repeated divides: - const vec3 sigma_inv = vec3(1.0)/sigma; - const vec3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; - const vec3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); if(beam_antialias_level > 0.5) { // Sample 1/3 pixel away in each direction as well: - const vec3 sample_offset = vec3(pixel_height/3.0); - const vec3 dist2 = dist + sample_offset; - const vec3 dist3 = abs(dist - sample_offset); + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); // Average three pure Gaussian samples: - const vec3 scale = color/3.0 * outer_denom_inv; - const vec3 weight1 = exp(-(dist*dist)*inner_denom_inv); - const vec3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); - const vec3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); return scale * (weight1 + weight2 + weight3); } else @@ -496,30 +222,30 @@ vec3 scanline_gaussian_sampled_contrib(const vec3 dist, const vec3 color, } } -vec3 scanline_generalized_gaussian_sampled_contrib(const vec3 dist, - const vec3 color, const float pixel_height, const float sigma_range, +float3 scanline_generalized_gaussian_sampled_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range, const float shape_range) { // See scanline_generalized_gaussian_integral_contrib() for details! 
// generalized sample = // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) - const vec3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); - const vec3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); // Avoid repeated divides: - const vec3 alpha_inv = vec3(1.0)/alpha; - const vec3 beta_inv = vec3(1.0)/beta; - const vec3 scale = color * beta * 0.5 * alpha_inv / + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / gamma_impl(beta_inv, beta); if(beam_antialias_level > 0.5) { // Sample 1/3 pixel closer to and farther from the scanline too. - const vec3 sample_offset = vec3(pixel_height/3.0); - const vec3 dist2 = dist + sample_offset; - const vec3 dist3 = abs(dist - sample_offset); + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); // Average three generalized Gaussian samples: - const vec3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); - const vec3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); - const vec3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); return scale/3.0 * (weight1 + weight2 + weight3); } else @@ -528,7 +254,7 @@ vec3 scanline_generalized_gaussian_sampled_contrib(const vec3 dist, } } -vec3 scanline_contrib(vec3 dist, vec3 color, +inline float3 scanline_contrib(float3 dist, float3 color, float pixel_height, const float sigma_range, const float shape_range) { // Requires: 1.) 
Requirements of scanline_gaussian_integral_contrib() @@ -539,7 +265,7 @@ vec3 scanline_contrib(vec3 dist, vec3 color, // Returns: Return a scanline's light output over a given pixel, using // a generalized or pure Gaussian distribution and sampling or // integrals as desired by user codepath choices. - if(beam_generalized_gaussian == true) + if(beam_generalized_gaussian) { if(beam_antialias_level > 1.5) { @@ -567,4 +293,279 @@ vec3 scanline_contrib(vec3 dist, vec3 color, } } -#endif // SCANLINE_FUNCTIONS_H \ No newline at end of file +inline float3 get_raw_interpolated_color(const float3 color0, + const float3 color1, const float3 color2, const float3 color3, + const float4 weights) +{ + // Use max to avoid bizarre artifacts from negative colors: + return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0); +} + +float3 get_interpolated_linear_color(const float3 color0, const float3 color1, + const float3 color2, const float3 color3, const float4 weights) +{ + // Requires: 1.) Requirements of include/gamma-management.h must be met: + // intermediate_gamma must be globally defined, and input + // colors are interpreted as linear RGB unless you #define + // GAMMA_ENCODE_EVERY_FBO (in which case they are + // interpreted as gamma-encoded with intermediate_gamma). + // 2.) color0-3 are colors sampled from a texture with tex2D(). + // They are interpreted as defined in requirement 1. + // 3.) weights contains weights for each color, summing to 1.0. + // 4.) beam_horiz_linear_rgb_weight must be defined as a global + // float in [0.0, 1.0] describing how much blending should + // be done in linear RGB (rest is gamma-corrected RGB). + // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined + // if beam_horiz_linear_rgb_weight is anything other than a + // static constant, or we may try branching at runtime + // without dynamic branches allowed (slow). 
+ // Returns: Return an interpolated color lookup between the four input + // colors based on the weights in weights. The final color will + // be a linear RGB value, but the blending will be done as + // indicated above. + const float intermediate_gamma = get_intermediate_gamma(); + // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the + // profile allows dynamic branches (faster than computing extra pows): + #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #else + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #endif + #endif + #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + // beam_horiz_linear_rgb_weight is static, so we can branch: + #ifdef GAMMA_ENCODE_EVERY_FBO + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + if(beam_horiz_linear_rgb_weight > 0.0) + { + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return gamma_mixed_color; + } + #else + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + if(beam_horiz_linear_rgb_weight < 1.0) + { + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return linear_mixed_color; + } + #endif // GAMMA_ENCODE_EVERY_FBO + #else + #ifdef GAMMA_ENCODE_EVERY_FBO + // 
Inputs: color0-3 are colors in gamma-encoded RGB. + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), float3(intermediate_gamma)); + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, float3(intermediate_gamma)), + pow(color1, float3(intermediate_gamma)), + pow(color2, float3(intermediate_gamma)), + pow(color3, float3(intermediate_gamma)), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #else + // Inputs: color0-3 are colors in linear RGB. + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, float3(1.0/intermediate_gamma)), + pow(color1, float3(1.0/intermediate_gamma)), + pow(color2, float3(1.0/intermediate_gamma)), + pow(color3, float3(1.0/intermediate_gamma)), + weights); + // wtf fixme +// const float beam_horiz_linear_rgb_weight1 = 1.0; + return lerp(gamma_mixed_color, linear_mixed_color, + global.beam_horiz_linear_rgb_weight); + #endif // GAMMA_ENCODE_EVERY_FBO + #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT +} + +float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv, + const float2 uv_step_x, const float4 weights) +{ + // Requires: 1.) scanline_uv must be vertically snapped to the caller's + // desired line or scanline and horizontally snapped to the + // texel just left of the output pixel (color1) + // 2.) uv_step_x must contain the horizontal uv distance + // between texels. + // 3.) weights must contain interpolation filter weights for + // color0, color1, color2, and color3, where color1 is just + // left of the output pixel. + // Returns: Return a horizontally interpolated texture lookup using 2-4 + // nearby texels, according to weights and the conventions of + // get_interpolated_linear_color(). + // We can ignore the outside texture lookups for Quilez resampling.
+ const float3 color1 = texture(tex, scanline_uv).rgb; + const float3 color2 = texture(tex, scanline_uv + uv_step_x).rgb; + float3 color0 = float3(0.0); + float3 color3 = float3(0.0); + if(beam_horiz_filter > 0.5) + { + color0 = texture(tex, scanline_uv - uv_step_x).rgb; + color3 = texture(tex, scanline_uv + 2.0 * uv_step_x).rgb; + } + // Sample the texture as-is, whether it's linear or gamma-encoded: + // get_interpolated_linear_color() will handle the difference. + return get_interpolated_linear_color(color0, color1, color2, color3, weights); +} + +float3 sample_single_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Snap to the previous texel and get sample dists from 2/4 nearby texels: + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y); + const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; + const float prev_dist = curr_texel.x - prev_texel_hor.x; + const float4 sample_dists = float4(1.0 + prev_dist, prev_dist, + 1.0 - prev_dist, 2.0 - prev_dist); + // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: + float4 weights; + if(beam_horiz_filter < 0.5) + { + // Quilez: + const float x = sample_dists.y; + const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); + weights = float4(0.0, 1.0 - w2, w2, 0.0); + } + else if(beam_horiz_filter < 1.5) + { + // Gaussian: + float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); + weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); + } + else + { + // Lanczos2: + const float4 pi_dists = FIX_ZERO(sample_dists * pi); + weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / + (pi_dists * pi_dists); + } + // Ensure the weight sum == 1.0: + 
const float4 final_weights = weights/dot(weights, float4(1.0)); + // Get the interpolated horizontal scanline color: + const float2 uv_step_x = float2(texture_size_inv.x, 0.0); + return get_scanline_color( + tex, prev_texel_hor_uv, uv_step_x, final_weights); +} + +float3 sample_rgb_scanline_horizontal(const sampler2D tex, + const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Rely on a helper to make convergence easier. + if(beam_misconvergence) + { + const float3 convergence_offsets_rgb = + get_convergence_offsets_x_vector(); + const float3 offset_u_rgb = + convergence_offsets_rgb * texture_size_inv.xxx; + const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0); + const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0); + const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0); + const float3 sample_r = sample_single_scanline_horizontal( + tex, scanline_uv_r, tex_size, texture_size_inv); + const float3 sample_g = sample_single_scanline_horizontal( + tex, scanline_uv_g, tex_size, texture_size_inv); + const float3 sample_b = sample_single_scanline_horizontal( + tex, scanline_uv_b, tex_size, texture_size_inv); + return float3(sample_r.r, sample_g.g, sample_b.b); + } + else + { + return sample_single_scanline_horizontal(tex, tex_uv, tex_size, + texture_size_inv); + } +} + +float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size, + const float2 texture_size_inv, const float2 il_step_multiple, + const float frame_count, out float dist) +{ + // Compute texture coords for the last/upper scanline, accounting for + // interlacing: With interlacing, only consider even/odd scanlines every + // other frame. Top-field first (TFF) order puts even scanlines on even + // frames, and BFF order puts them on odd frames. 
Texels are centered at: + // frac(tex_uv * tex_size) == x.5 + // Caution: If these coordinates ever seem incorrect, first make sure it's + // not because anisotropic filtering is blurring across field boundaries. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + // wtf fixme +// const float interlace_bff1 = 1.0; + const float field_offset = floor(il_step_multiple.y * 0.75) * + fmod(frame_count + float(global.interlace_bff), 2.0); + const float2 curr_texel = tex_uv * tex_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel_num = floor(curr_texel - float2(under_half)); + const float wrong_field = fmod( + prev_texel_num.y + field_offset, il_step_multiple.y); + const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field); + // Snap to the center of the previous scanline in the current field: + const float2 scanline_texel = scanline_texel_num + float2(0.5); + const float2 scanline_uv = scanline_texel * texture_size_inv; + // Save the sample's distance from the scanline, in units of scanlines: + dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; + return scanline_uv; +} + +inline bool is_interlaced(float num_lines) +{ + // Detect interlacing based on the number of lines in the source. + if(interlace_detect) + { + // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field + // NTSC Emulators: Typically 224 or 240 lines + // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field + // PAL Emulators: ? + // ATSC: 720p, 1080i, 1080p + // Where do we place our cutoffs? Assumptions: + // 1.) We only need to care about active lines. + // 2.) Anything > 288 and <= 576 lines is probably interlaced. + // 3.) Anything > 576 lines is probably not interlaced... + // 4.) ...except 1080 lines, which is a crapshoot (user decision). + // 5.) Just in case the main program uses calculated video sizes, + // we should nudge the float thresholds a bit. 
+ const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5)); + const bool hd_interlace = bool(interlace_1080i) ? + ((num_lines > 1079.5) && (num_lines < 1080.5)) : + false; + return (sd_interlace || hd_interlace); + } + else + { + return false; + } +} + + +#endif // SCANLINE_FUNCTIONS_H + diff --git a/crt/shaders/crt-royale/src/tex2Dantialias.h b/crt/shaders/crt-royale/src/tex2Dantialias.h index af7dd59..93fe7d4 100644 --- a/crt/shaders/crt-royale/src/tex2Dantialias.h +++ b/crt/shaders/crt-royale/src/tex2Dantialias.h @@ -27,18 +27,18 @@ // 1.) All requirements of gamma-management.h must be satisfied! // 2.) pixel_to_tex_uv must be a 2x2 matrix that transforms pixe- // space offsets to texture uv offsets. You can get this with: -// const vec2 duv_dx = ddx(tex_uv); -// const vec2 duv_dy = ddy(tex_uv); -// const mat2x2 pixel_to_tex_uv = mat2x2( +// const float2 duv_dx = ddx(tex_uv); +// const float2 duv_dy = ddy(tex_uv); +// const float2x2 pixel_to_tex_uv = float2x2( // duv_dx.x, duv_dy.x, // duv_dx.y, duv_dy.y); // This is left to the user in case the current Cg profile // doesn't support ddx()/ddy(). Ideally, the user could find // calculate a distorted tangent-space mapping analytically. // If not, a simple flat mapping can be obtained with: -// const vec2 xy_to_uv_scale = IN.output_size * +// const float2 xy_to_uv_scale = IN.output_size * // IN.video_size/IN.texture_size; -// const mat2x2 pixel_to_tex_uv = mat2x2( +// const float2x2 pixel_to_tex_uv = float2x2( // xy_to_uv_scale.x, 0.0, // 0.0, xy_to_uv_scale.y); // Optional: To set basic AA settings, #define ANTIALIAS_OVERRIDE_BASICS and: @@ -70,8 +70,8 @@ // 0.5/aa_pixel_diameter; // 3.) Set subpixel offsets. This requires an accessor function // for compatibility with scalar runtime shader params. 
Return -// a vec2 pixel offset in [-0.5, 0.5] for the red subpixel: -// vec2 get_aa_subpixel_r_offset() +// a float2 pixel offset in [-0.5, 0.5] for the red subpixel: +// float2 get_aa_subpixel_r_offset() // The user may also #define ANTIALIAS_OVERRIDE_STATIC_CONSTANTS to // override (all of) the following default static values. However, // the file's structure requires them to be declared static const: @@ -84,7 +84,7 @@ // values; much larger gauss_sigmas ironically prefer slightly // smaller support given sparse sampling, and vice versa.) // 3.) static const float aa_tent_support = 1.0 / aa_pixel_diameter; -// 4.) static const vec2 aa_xy_axis_importance: +// 4.) static const float2 aa_xy_axis_importance: // The sparse N-queens sampling grid interacts poorly with // negative-lobed 2D filters. However, if aliasing is much // stronger in one direction (e.g. horizontally with a phosphor @@ -93,11 +93,11 @@ // aa_xy_axis_importance down to a minimum of 0.5 (box support), // after which point only the offsets used for calculating // weights continue to scale downward. This works as follows: -// If aa_xy_axis_importance = vec2(1.0, 1.0/support_radius), +// If aa_xy_axis_importance = float2(1.0, 1.0/support_radius), // the vertical support radius will drop to 1.0, and we'll just // filter vertical offsets with the first filter lobe, while // horizontal offsets go through the full multi-lobe filter. -// If aa_xy_axis_importance = vec2(1.0, 0.0), the vertical +// If aa_xy_axis_importance = float2(1.0, 0.0), the vertical // support radius will drop to box support, and the vertical // offsets will be ignored entirely (essentially giving us a // box filter vertically). The former is potentially smoother @@ -141,7 +141,7 @@ // 2.) For decent results, negative-lobed filters must be computed based on // separable weights, not radial distances, because the sparse sampling // makes no guarantees about radial distributions. 
Even then, it's much -// better to set aa_xy_axis_importance to e.g. vec2(1.0, 0.0) to use e.g. +// better to set aa_xy_axis_importance to e.g. float2(1.0, 0.0) to use e.g. // Lanczos2 horizontally and a box filter vertically. This is mainly due // to the sparse N-queens sampling and a statistically enormous positive or // negative covariance between horizontal and vertical weights. @@ -154,32 +154,33 @@ // exploit temporal AA better, but it would require a dynamic branch or a lot // of conditional moves, so it's prohibitively slow for the minor benefit. + ///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// #ifndef ANTIALIAS_OVERRIDE_BASICS // The following settings must be static constants: - const float aa_level = 12.0; - const float aa_filter = 0.0; - const bool aa_temporal = false; + static const float aa_level = 12.0; + static const float aa_filter = 0.0; + static const bool aa_temporal = false; #endif #ifndef ANTIALIAS_OVERRIDE_STATIC_CONSTANTS // Users may override these parameters, but the file structure requires // them to be static constants; see the descriptions above. - const float aa_pixel_diameter = 1.0; - const float aa_lanczos_lobes = 3.0; - const float aa_gauss_support = 1.0 / aa_pixel_diameter; - const float aa_tent_support = 1.0 / aa_pixel_diameter; + static const float aa_pixel_diameter = 1.0; + static const float aa_lanczos_lobes = 3.0; + static const float aa_gauss_support = 1.0 / aa_pixel_diameter; + static const float aa_tent_support = 1.0 / aa_pixel_diameter; // If we're using a negative-lobed filter, default to using it horizontally // only, and use only the first lobe vertically or a box filter, over a // correspondingly smaller range. This compensates for the sparse sampling // grid's typically large positive/negative x/y covariance. - vec2 aa_xy_axis_importance = - aa_filter < 5.5 ? vec2(1.0) : // Box, tent, Gaussian - aa_filter < 8.5 ? vec2(1.0, 0.0) : // Cubic and Lanczos sinc - aa_filter < 9.5 ? 
vec2(1.0, 1.0/aa_lanczos_lobes) : // Lanczos jinc - vec2(1.0); // Default to box + static const float2 aa_xy_axis_importance = + aa_filter < 5.5 ? float2(1.0) : // Box, tent, Gaussian + aa_filter < 8.5 ? float2(1.0, 0.0) : // Cubic and Lanczos sinc + aa_filter < 9.5 ? float2(1.0, 1.0/aa_lanczos_lobes) : // Lanczos jinc + float2(1.0); // Default to box #endif #ifndef ANTIALIAS_OVERRIDE_PARAMETERS @@ -189,39 +190,40 @@ // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. // 4.) C = 0.0 is a soft spline filter. -// const float aa_cubic_c = 0.5; -// const float aa_gauss_sigma = 0.5 / aa_pixel_diameter; + static const float aa_cubic_c = 0.5; + static const float aa_gauss_sigma = 0.5 / aa_pixel_diameter; // Users may override the subpixel offset accessor function with their own. // A function is used for compatibility with scalar runtime shader params. - vec2 get_aa_subpixel_r_offset() + inline float2 get_aa_subpixel_r_offset() { - return vec2(0.0, 0.0); + return float2(0.0, 0.0); } #endif + ////////////////////////////////// INCLUDES ////////////////////////////////// -//#include "../../../../include/gamma-management.h" -#include "gamma-management.h" +#include "../../../../include/gamma-management.h" + ////////////////////////////////// CONSTANTS ///////////////////////////////// -const float aa_box_support = 0.5; -const float aa_cubic_support = 2.0; +static const float aa_box_support = 0.5; +static const float aa_cubic_support = 2.0; //////////////////////////// GLOBAL NON-CONSTANTS //////////////////////////// // We'll want to define these only once per fragment at most. 
#ifdef RUNTIME_ANTIALIAS_WEIGHTS - float aa_cubic_b; - float cubic_branch1_x3_coeff; - float cubic_branch1_x2_coeff; - float cubic_branch1_x0_coeff; - float cubic_branch2_x3_coeff; - float cubic_branch2_x2_coeff; - float cubic_branch2_x1_coeff; - float cubic_branch2_x0_coeff; + float aa_cubic_b; + float cubic_branch1_x3_coeff; + float cubic_branch1_x2_coeff; + float cubic_branch1_x0_coeff; + float cubic_branch2_x3_coeff; + float cubic_branch2_x2_coeff; + float cubic_branch2_x1_coeff; + float cubic_branch2_x0_coeff; #endif @@ -235,38 +237,38 @@ void assign_aa_cubic_constants() #ifdef RUNTIME_ANTIALIAS_WEIGHTS if(aa_filter > 5.5 && aa_filter < 7.5) { - aa_cubic_b = 1.0 - 2.0*params.aa_cubic_c; - cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*params.aa_cubic_c; - cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*params.aa_cubic_c; + aa_cubic_b = 1.0 - 2.0*aa_cubic_c; + cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c; + cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c; cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b; - cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * params.aa_cubic_c; - cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*params.aa_cubic_c; - cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*params.aa_cubic_c; - cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*params.aa_cubic_c; + cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c; + cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c; + cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c; + cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c; } #endif } -vec4 get_subpixel_support_diam_and_final_axis_importance() +inline float4 get_subpixel_support_diam_and_final_axis_importance() { // Statically select the base support radius: - float base_support_radius; - if(aa_filter < 1.5) base_support_radius = aa_box_support; - else if(aa_filter < 3.5) base_support_radius = aa_tent_support; - else if(aa_filter < 5.5) base_support_radius = aa_gauss_support; 
- else if(aa_filter < 7.5) base_support_radius = aa_cubic_support; - else if(aa_filter < 9.5) base_support_radius = aa_lanczos_lobes; - else base_support_radius = aa_box_support; // Default to box + static const float base_support_radius = + aa_filter < 1.5 ? aa_box_support : + aa_filter < 3.5 ? aa_tent_support : + aa_filter < 5.5 ? aa_gauss_support : + aa_filter < 7.5 ? aa_cubic_support : + aa_filter < 9.5 ? aa_lanczos_lobes : + aa_box_support; // Default to box // Expand the filter support for subpixel filtering. - const vec2 subpixel_support_radius_raw = - vec2(base_support_radius) + abs(get_aa_subpixel_r_offset()); + const float2 subpixel_support_radius_raw = + float2(base_support_radius) + abs(get_aa_subpixel_r_offset()); if(aa_filter < 1.5) { // Ignore aa_xy_axis_importance for box filtering. - const vec2 subpixel_support_diam = + const float2 subpixel_support_diam = 2.0 * subpixel_support_radius_raw; - const vec2 final_axis_importance = vec2(1.0); - return vec4(subpixel_support_diam, final_axis_importance); + const float2 final_axis_importance = float2(1.0); + return float4(subpixel_support_diam, final_axis_importance); } else { @@ -274,55 +276,54 @@ vec4 get_subpixel_support_diam_and_final_axis_importance() // it further than box support. This allows decent vertical AA without // messing up horizontal weights or using something silly like Lanczos4 // horizontally with a huge vertical average over an 8-pixel radius. 
- const vec2 subpixel_support_radius = max(vec2(aa_box_support), + const float2 subpixel_support_radius = max(float2(aa_box_support, aa_box_support), subpixel_support_radius_raw * aa_xy_axis_importance); // Adjust aa_xy_axis_importance to compensate for what's already done: - const vec2 final_axis_importance = aa_xy_axis_importance * + const float2 final_axis_importance = aa_xy_axis_importance * subpixel_support_radius_raw/subpixel_support_radius; - const vec2 subpixel_support_diam = 2.0 * subpixel_support_radius; - return vec4(subpixel_support_diam, final_axis_importance); + const float2 subpixel_support_diam = 2.0 * subpixel_support_radius; + return float4(subpixel_support_diam, final_axis_importance); } } + /////////////////////////// FILTER WEIGHT FUNCTIONS ////////////////////////// -float eval_box_filter(const float dist) +inline float eval_box_filter(const float dist) { -if(abs(dist) <= aa_box_support) return 1.0;//abs(dist); -else return 0.0; + return float(abs(dist) <= aa_box_support); } -float eval_separable_box_filter(const vec2 offset) +inline float eval_separable_box_filter(const float2 offset) { - if(all(lessThanEqual(abs(offset) , vec2(aa_box_support)))) return 1.0;//float(abs(offset)); - else return 0.0; + return float(all(bool2((abs(offset.x) <= aa_box_support), (abs(offset.y) <= aa_box_support)))); } -float eval_tent_filter(const float dist) +inline float eval_tent_filter(const float dist) { return clamp((aa_tent_support - dist)/ aa_tent_support, 0.0, 1.0); } -float eval_gaussian_filter(const float dist) +inline float eval_gaussian_filter(const float dist) { - return exp(-(dist*dist) / (2.0*params.aa_gauss_sigma*params.aa_gauss_sigma)); + return exp(-(dist*dist) / (2.0*aa_gauss_sigma*aa_gauss_sigma)); } -float eval_cubic_filter(const float dist) +inline float eval_cubic_filter(const float dist) { // Compute coefficients like assign_aa_cubic_constants(), but statically. 
#ifndef RUNTIME_ANTIALIAS_WEIGHTS // When runtime weights are used, these values are instead written to // global uniforms at the beginning of each tex2Daa* call. - const float aa_cubic_b = 1.0 - 2.0*params.aa_cubic_c; - const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*params.aa_cubic_c; - const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*params.aa_cubic_c; + const float aa_cubic_b = 1.0 - 2.0*aa_cubic_c; + const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c; + const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c; const float cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b; - const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * params.aa_cubic_c; - const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*params.aa_cubic_c; - const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*params.aa_cubic_c; - const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*params.aa_cubic_c; + const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c; + const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c; + const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c; + const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c; #endif const float abs_dist = abs(dist); // Compute the cubic based on the Horner's method formula in: @@ -338,32 +339,32 @@ float eval_cubic_filter(const float dist) 0.0)/6.0; } -float eval_separable_cubic_filter(const vec2 offset) +inline float eval_separable_cubic_filter(const float2 offset) { - // This is faster than using a specific vec2 version: + // This is faster than using a specific float2 version: return eval_cubic_filter(offset.x) * eval_cubic_filter(offset.y); } -vec2 eval_sinc_filter(const vec2 offset) +inline float2 eval_sinc_filter(const float2 offset) { // It's faster to let the caller handle the zero case, or at least it // was when I used macros and the shader preset took a full minute to load. 
- const vec2 pi_offset = pi * offset; + const float2 pi_offset = pi * offset; return sin(pi_offset)/pi_offset; } -float eval_separable_lanczos_sinc_filter(const vec2 offset_unsafe) +inline float eval_separable_lanczos_sinc_filter(const float2 offset_unsafe) { // Note: For sparse sampling, you really need to pick an axis to use - // Lanczos along (e.g. set aa_xy_axis_importance = vec2(1.0, 0.0)). - const vec2 offset = FIX_ZERO(offset_unsafe); - const vec2 xy_weights = eval_sinc_filter(offset) * + // Lanczos along (e.g. set aa_xy_axis_importance = float2(1.0, 0.0)). + const float2 offset = FIX_ZERO(offset_unsafe); + const float2 xy_weights = eval_sinc_filter(offset) * eval_sinc_filter(offset/aa_lanczos_lobes); return xy_weights.x * xy_weights.y; } -float eval_jinc_filter_unorm(const float x) +inline float eval_jinc_filter_unorm(const float x) { // This is a Jinc approximation for x in [0, 45). We'll use x in range // [0, 4*pi) or so. There are faster/closer approximations based on @@ -383,19 +384,19 @@ float eval_jinc_filter_unorm(const float x) 0.180837503591406); } -float eval_jinc_filter(const float dist) +inline float eval_jinc_filter(const float dist) { return eval_jinc_filter_unorm(pi * dist); } -float eval_lanczos_jinc_filter(const float dist) +inline float eval_lanczos_jinc_filter(const float dist) { return eval_jinc_filter(dist) * eval_jinc_filter(dist/aa_lanczos_lobes); } -vec3 eval_unorm_rgb_weights(const vec2 offset, - const vec2 final_axis_importance) +inline float3 eval_unorm_rgb_weights(const float2 offset, + const float2 final_axis_importance) { // Requires: 1.) final_axis_impportance must be computed according to // get_subpixel_support_diam_and_final_axis_importance(). @@ -407,135 +408,135 @@ vec3 eval_unorm_rgb_weights(const vec2 offset, // subpixel_support_diameter.y/2]) // Returns: Sample weights at R/G/B destination subpixels for the // given xy pixel offset. 
- const vec2 offset_g = offset * final_axis_importance; - const vec2 aa_r_offset = get_aa_subpixel_r_offset(); - const vec2 offset_r = offset_g - aa_r_offset * final_axis_importance; - const vec2 offset_b = offset_g + aa_r_offset * final_axis_importance; + const float2 offset_g = offset * final_axis_importance; + const float2 aa_r_offset = get_aa_subpixel_r_offset(); + const float2 offset_r = offset_g - aa_r_offset * final_axis_importance; + const float2 offset_b = offset_g + aa_r_offset * final_axis_importance; // Statically select a filter: if(aa_filter < 0.5) { - return vec3(eval_separable_box_filter(offset_r), + return float3(eval_separable_box_filter(offset_r), eval_separable_box_filter(offset_g), eval_separable_box_filter(offset_b)); } else if(aa_filter < 1.5) { - return vec3(eval_box_filter(length(offset_r)), + return float3(eval_box_filter(length(offset_r)), eval_box_filter(length(offset_g)), eval_box_filter(length(offset_b))); } else if(aa_filter < 2.5) { - return vec3( + return float3( eval_tent_filter(offset_r.x) * eval_tent_filter(offset_r.y), eval_tent_filter(offset_g.x) * eval_tent_filter(offset_g.y), eval_tent_filter(offset_b.x) * eval_tent_filter(offset_b.y)); } else if(aa_filter < 3.5) { - return vec3(eval_tent_filter(length(offset_r)), + return float3(eval_tent_filter(length(offset_r)), eval_tent_filter(length(offset_g)), eval_tent_filter(length(offset_b))); } else if(aa_filter < 4.5) { - return vec3( + return float3( eval_gaussian_filter(offset_r.x) * eval_gaussian_filter(offset_r.y), eval_gaussian_filter(offset_g.x) * eval_gaussian_filter(offset_g.y), eval_gaussian_filter(offset_b.x) * eval_gaussian_filter(offset_b.y)); } else if(aa_filter < 5.5) { - return vec3(eval_gaussian_filter(length(offset_r)), + return float3(eval_gaussian_filter(length(offset_r)), eval_gaussian_filter(length(offset_g)), eval_gaussian_filter(length(offset_b))); } else if(aa_filter < 6.5) { - return vec3( + return float3( eval_cubic_filter(offset_r.x) * 
eval_cubic_filter(offset_r.y), eval_cubic_filter(offset_g.x) * eval_cubic_filter(offset_g.y), eval_cubic_filter(offset_b.x) * eval_cubic_filter(offset_b.y)); } else if(aa_filter < 7.5) { - return vec3(eval_cubic_filter(length(offset_r)), + return float3(eval_cubic_filter(length(offset_r)), eval_cubic_filter(length(offset_g)), eval_cubic_filter(length(offset_b))); } else if(aa_filter < 8.5) { - return vec3(eval_separable_lanczos_sinc_filter(offset_r), + return float3(eval_separable_lanczos_sinc_filter(offset_r), eval_separable_lanczos_sinc_filter(offset_g), eval_separable_lanczos_sinc_filter(offset_b)); } else if(aa_filter < 9.5) { - return vec3(eval_lanczos_jinc_filter(length(offset_r)), + return float3(eval_lanczos_jinc_filter(length(offset_r)), eval_lanczos_jinc_filter(length(offset_g)), eval_lanczos_jinc_filter(length(offset_b))); } else { // Default to a box, because Lanczos Jinc is so bad. ;) - return vec3(eval_separable_box_filter(offset_r), + return float3(eval_separable_box_filter(offset_r), eval_separable_box_filter(offset_g), eval_separable_box_filter(offset_b)); } } + ////////////////////////////// HELPER FUNCTIONS ////////////////////////////// -vec4 tex2Daa_tiled_linearize(const sampler2D samp, const vec2 s) +inline float4 tex2Daa_tiled_linearize(const sampler2D samp, const float2 s) { // If we're manually tiling a texture, anisotropic filtering can get // confused. This is one workaround: #ifdef ANTIALIAS_DISABLE_ANISOTROPIC // TODO: Use tex2Dlod_linearize with a calculated mip level. 
- return tex2Dlod_linearize(samp, vec4(s, 0.0, 0.0)); + return tex2Dlod_linearize(samp, float4(s, 0.0, 0.0)); #else return tex2D_linearize(samp, s); #endif } -vec2 get_frame_sign(const float frame) +inline float2 get_frame_sign(const float frame) { - if(aa_temporal == true) + if(aa_temporal) { // Mirror the sampling pattern for odd frames in a direction that // lets us keep the same subpixel sample weights: - float frame_odd = float(mod(frame, 2.0) > 0.5); - const vec2 aa_r_offset = get_aa_subpixel_r_offset(); - vec2 mirror = vec2(FIX_ZERO(0.0)); - if ( abs(aa_r_offset.x) < FIX_ZERO(0.0)) mirror.x = abs(aa_r_offset.x); - if ( abs(aa_r_offset.y) < FIX_ZERO(0.0)) mirror.y = abs(aa_r_offset.y); - return vec2(-1.0) * mirror; + const float frame_odd = float(fmod(frame, 2.0) > 0.5); + const float2 aa_r_offset = get_aa_subpixel_r_offset(); + const float2 mirror = -float2(abs(aa_r_offset.x) < (FIX_ZERO(0.0)), abs(aa_r_offset.y) < (FIX_ZERO(0.0))); + return mirror; } else { - return vec2(1.0); + return float2(1.0, 1.0); } } + ///////////////////////// ANTIALIASED TEXTURE LOOKUPS //////////////////////// -vec3 tex2Daa_subpixel_weights_only(const sampler2D tex, - const vec2 tex_uv, const mat2x2 pixel_to_tex_uv) +float3 tex2Daa_subpixel_weights_only(const sampler2D tex, + const float2 tex_uv, const float2x2 pixel_to_tex_uv) { // This function is unlike the others: Just perform a single independent // lookup for each subpixel. It may be very aliased. 
- const vec2 aa_r_offset = get_aa_subpixel_r_offset(); - const vec2 aa_r_offset_uv_offset = (aa_r_offset * pixel_to_tex_uv); + const float2 aa_r_offset = get_aa_subpixel_r_offset(); + const float2 aa_r_offset_uv_offset = mul(pixel_to_tex_uv, aa_r_offset); const float color_g = tex2D_linearize(tex, tex_uv).g; const float color_r = tex2D_linearize(tex, tex_uv + aa_r_offset_uv_offset).r; const float color_b = tex2D_linearize(tex, tex_uv - aa_r_offset_uv_offset).b; - return vec3(color_r, color_g, color_b); + return float3(color_r, color_g, color_b); } // The tex2Daa* functions compile very slowly due to all the macros and // compile-time math, so only include the ones we'll actually use! -vec3 tex2Daa4x(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa4x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Use an RGMS4 pattern (4-queens): // . . Q . : off =(-1.5, -1.5)/4 + (2.0, 0.0)/4 @@ -543,45 +544,45 @@ vec3 tex2Daa4x(const sampler2D tex, const vec2 tex_uv, // . . . Q : off =(-1.5, -1.5)/4 + (3.0, 2.0)/4 // . Q . . 
: off =(-1.5, -1.5)/4 + (1.0, 3.0)/4 // Static screenspace sample offsets (compute some implicitly): - const float grid_size = 4.0; + static const float grid_size = 4.0; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; - const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; - const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0,1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5,0.5 - grid_size*0.5) * xy_step; // Get the xy offset of each sample. Exploit diagonal symmetry: - const vec2 xy_offset0 = xy_start_offset + vec2(2.0, 0.0) * xy_step; - const vec2 xy_offset1 = xy_start_offset + vec2(0.0, 1.0) * xy_step; + const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(0.0, 1.0) * xy_step; // Compute subpixel weights, and exploit diagonal symmetry for speed. 
- const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); - const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); - const vec3 w2 = w1.bgr; - const vec3 w3 = w0.bgr; + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = w1.bgr; + const float3 w3 = w0.bgr; // Get the weight sum to normalize the total to 1.0 later: - const vec3 half_sum = w0 + w1; - const vec3 w_sum = half_sum + half_sum.bgr; - const vec3 w_sum_inv = vec3(1.0)/(w_sum); + const float3 half_sum = w0 + w1; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0,1.0,1.0)/(w_sum); // Scale the pixel-space to texture offset matrix by the pixel diameter. - const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); // Get uv sample offsets, mirror on odd frames if directed, and exploit // diagonal symmetry: - const vec2 frame_sign = get_frame_sign(frame); - const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv); + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); // Load samples, linearizing if necessary, etc.: - const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; - const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; - const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; - const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = 
tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; // Sum weighted samples (weight sum must equal 1.0 for each channel): return w_sum_inv * (w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3); } -vec3 tex2Daa5x(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa5x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Use a diagonally symmetric 5-queens pattern: // . Q . . . : off =(-2.0, -2.0)/5 + (1.0, 0.0)/5 @@ -590,46 +591,46 @@ vec3 tex2Daa5x(const sampler2D tex, const vec2 tex_uv, // Q . . . . : off =(-2.0, -2.0)/5 + (0.0, 3.0)/5 // . . . Q . : off =(-2.0, -2.0)/5 + (3.0, 4.0)/5 // Static screenspace sample offsets (compute some implicitly): - const float grid_size = 5.0; + static const float grid_size = 5.0; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; - const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; - const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; // Get the xy offset of each sample. 
Exploit diagonal symmetry: - const vec2 xy_offset0 = xy_start_offset + vec2(1.0, 0.0) * xy_step; - const vec2 xy_offset1 = xy_start_offset + vec2(4.0, 1.0) * xy_step; - const vec2 xy_offset2 = xy_start_offset + vec2(2.0, 2.0) * xy_step; + const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(2.0, 2.0) * xy_step; // Compute subpixel weights, and exploit diagonal symmetry for speed. - const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); - const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); - const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); - const vec3 w3 = w1.bgr; - const vec3 w4 = w0.bgr; + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = w1.bgr; + const float3 w4 = w0.bgr; // Get the weight sum to normalize the total to 1.0 later: - const vec3 w_sum_inv = vec3(1.0)/(w0 + w1 + w2 + w3 + w4); + const float3 w_sum_inv = float3(1.0)/(w0 + w1 + w2 + w3 + w4); // Scale the pixel-space to texture offset matrix by the pixel diameter. 
- const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); // Get uv sample offsets, mirror on odd frames if directed, and exploit // diagonal symmetry: - const vec2 frame_sign = get_frame_sign(frame); - const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv); + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); // Load samples, linearizing if necessary, etc.: - const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; - const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; - const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv).rgb; - const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; - const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; // Sum weighted samples (weight sum must equal 1.0 for each channel): return w_sum_inv * (w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + w4 * sample4); } -vec3 tex2Daa6x(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa6x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Use a diagonally symmetric 6-queens pattern with a stronger horizontal // than vertical slant: @@ 
-640,51 +641,51 @@ vec3 tex2Daa6x(const sampler2D tex, const vec2 tex_uv, // . . . Q . . : off =(-2.5, -2.5)/6 + (3.0, 4.0)/6 // . Q . . . . : off =(-2.5, -2.5)/6 + (1.0, 5.0)/6 // Static screenspace sample offsets (compute some implicitly): - const float grid_size = 6.0; + static const float grid_size = 6.0; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; - const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; - const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; // Get the xy offset of each sample. Exploit diagonal symmetry: - const vec2 xy_offset0 = xy_start_offset + vec2(4.0, 0.0) * xy_step; - const vec2 xy_offset1 = xy_start_offset + vec2(2.0, 1.0) * xy_step; - const vec2 xy_offset2 = xy_start_offset + vec2(0.0, 2.0) * xy_step; + const float2 xy_offset0 = xy_start_offset + float2(4.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(2.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step; // Compute subpixel weights, and exploit diagonal symmetry for speed. 
- const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); - const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); - const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); - const vec3 w3 = w2.bgr; - const vec3 w4 = w1.bgr; - const vec3 w5 = w0.bgr; + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = w2.bgr; + const float3 w4 = w1.bgr; + const float3 w5 = w0.bgr; // Get the weight sum to normalize the total to 1.0 later: - const vec3 half_sum = w0 + w1 + w2; - const vec3 w_sum = half_sum + half_sum.bgr; - const vec3 w_sum_inv = vec3(1.0)/(w_sum); + const float3 half_sum = w0 + w1 + w2; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); // Scale the pixel-space to texture offset matrix by the pixel diameter. 
- const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); // Get uv sample offsets, mirror on odd frames if directed, and exploit // diagonal symmetry: - const vec2 frame_sign = get_frame_sign(frame); - const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv); + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); // Load samples, linearizing if necessary, etc.: - const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; - const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; - const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; - const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; - const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; - const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; // Sum weighted samples (weight sum must equal 1.0 for each channel): return w_sum_inv * (w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + w4 
* sample4 + w5 * sample5); } -vec3 tex2Daa7x(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa7x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Use a diagonally symmetric 7-queens pattern with a queen in the center: // . Q . . . . . : off =(-3.0, -3.0)/7 + (1.0, 0.0)/7 @@ -694,55 +695,55 @@ vec3 tex2Daa7x(const sampler2D tex, const vec2 tex_uv, // . . . . . . Q : off =(-3.0, -3.0)/7 + (6.0, 4.0)/7 // . . Q . . . . : off =(-3.0, -3.0)/7 + (2.0, 5.0)/7 // . . . . . Q . : off =(-3.0, -3.0)/7 + (5.0, 6.0)/7 - const float grid_size = 7.0; + static const float grid_size = 7.0; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; - const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; - const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; // Get the xy offset of each sample. 
Exploit diagonal symmetry: - const vec2 xy_offset0 = xy_start_offset + vec2(1.0, 0.0) * xy_step; - const vec2 xy_offset1 = xy_start_offset + vec2(4.0, 1.0) * xy_step; - const vec2 xy_offset2 = xy_start_offset + vec2(0.0, 2.0) * xy_step; - const vec2 xy_offset3 = xy_start_offset + vec2(3.0, 3.0) * xy_step; + const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(3.0, 3.0) * xy_step; // Compute subpixel weights, and exploit diagonal symmetry for speed. - const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); - const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); - const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); - const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); - const vec3 w4 = w2.bgr; - const vec3 w5 = w1.bgr; - const vec3 w6 = w0.bgr; + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = w2.bgr; + const float3 w5 = w1.bgr; + const float3 w6 = w0.bgr; // Get the weight sum to normalize the total to 1.0 later: - const vec3 half_sum = w0 + w1 + w2; - const vec3 w_sum = half_sum + half_sum.bgr + w3; - const vec3 w_sum_inv = vec3(1.0)/(w_sum); + const float3 half_sum = w0 + w1 + w2; + const float3 w_sum = half_sum + half_sum.bgr + w3; + const float3 w_sum_inv = float3(1.0)/(w_sum); // Scale the pixel-space to texture offset matrix by the pixel diameter. 
- const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); // Get uv sample offsets, mirror on odd frames if directed, and exploit // diagonal symmetry: - const vec2 frame_sign = get_frame_sign(frame); - const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv); + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); // Load samples, linearizing if necessary, etc.: - const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; - const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; - const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; - const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv).rgb; - const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; - const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; - const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; // Sum weighted 
samples (weight sum must equal 1.0 for each channel): return w_sum_inv * ( w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + w4 * sample4 + w5 * sample5 + w6 * sample6); } -vec3 tex2Daa8x(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa8x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Use a diagonally symmetric 8-queens pattern. // . . Q . . . . . : off =(-3.5, -3.5)/8 + (2.0, 0.0)/8 @@ -753,57 +754,57 @@ vec3 tex2Daa8x(const sampler2D tex, const vec2 tex_uv, // . . . . . . Q . : off =(-3.5, -3.5)/8 + (6.0, 5.0)/8 // . . . Q . . . . : off =(-3.5, -3.5)/8 + (3.0, 6.0)/8 // . . . . . Q . . : off =(-3.5, -3.5)/8 + (5.0, 7.0)/8 - const float grid_size = 8.0; + static const float grid_size = 8.0; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; - const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; - const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; // Get the xy offset of each sample. 
Exploit diagonal symmetry: - const vec2 xy_offset0 = xy_start_offset + vec2(2.0, 0.0) * xy_step; - const vec2 xy_offset1 = xy_start_offset + vec2(4.0, 1.0) * xy_step; - const vec2 xy_offset2 = xy_start_offset + vec2(1.0, 2.0) * xy_step; - const vec2 xy_offset3 = xy_start_offset + vec2(7.0, 3.0) * xy_step; + const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(1.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(7.0, 3.0) * xy_step; // Compute subpixel weights, and exploit diagonal symmetry for speed. - const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); - const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); - const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); - const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); - const vec3 w4 = w3.bgr; - const vec3 w5 = w2.bgr; - const vec3 w6 = w1.bgr; - const vec3 w7 = w0.bgr; + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = w3.bgr; + const float3 w5 = w2.bgr; + const float3 w6 = w1.bgr; + const float3 w7 = w0.bgr; // Get the weight sum to normalize the total to 1.0 later: - const vec3 half_sum = w0 + w1 + w2 + w3; - const vec3 w_sum = half_sum + half_sum.bgr; - const vec3 w_sum_inv = vec3(1.0)/(w_sum); + const float3 half_sum = w0 + w1 + w2 + w3; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); // Scale the pixel-space to texture offset matrix by the pixel diameter. 
- const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); // Get uv sample offsets, and mirror on odd frames if directed: - const vec2 frame_sign = get_frame_sign(frame); - const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset3 = (xy_offset3 * frame_sign * true_pixel_to_tex_uv); + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); // Load samples, linearizing if necessary, etc.: - const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; - const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; - const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; - const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; - const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; - const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; - const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; - const vec3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = 
tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; // Sum weighted samples (weight sum must equal 1.0 for each channel): return w_sum_inv * ( w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7); } -vec3 tex2Daa12x(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa12x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Use a diagonally symmetric 12-superqueens pattern where no 3 points are // exactly collinear. @@ -819,62 +820,62 @@ vec3 tex2Daa12x(const sampler2D tex, const vec2 tex_uv, // . . . . . Q . . . . . . : off =(-5.5, -5.5)/12 + (5.0, 9.0)/12 // . . Q . . . . . . . . . : off =(-5.5, -5.5)/12 + (2.0, 10.0)/12 // . . . . . . . . Q . . . : off =(-5.5, -5.5)/12 + (8.0, 11.0)/12 - const float grid_size = 12.0; + static const float grid_size = 12.0; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; - const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; - const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; // Get the xy offset of each sample. 
Exploit diagonal symmetry: - const vec2 xy_offset0 = xy_start_offset + vec2(3.0, 0.0) * xy_step; - const vec2 xy_offset1 = xy_start_offset + vec2(9.0, 1.0) * xy_step; - const vec2 xy_offset2 = xy_start_offset + vec2(6.0, 2.0) * xy_step; - const vec2 xy_offset3 = xy_start_offset + vec2(1.0, 3.0) * xy_step; - const vec2 xy_offset4 = xy_start_offset + vec2(11.0, 4.0) * xy_step; - const vec2 xy_offset5 = xy_start_offset + vec2(4.0, 5.0) * xy_step; + const float2 xy_offset0 = xy_start_offset + float2(3.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(6.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(11.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(4.0, 5.0) * xy_step; // Compute subpixel weights, and exploit diagonal symmetry for speed. - const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); - const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); - const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); - const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); - const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); - const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); - const vec3 w6 = w5.bgr; - const vec3 w7 = w4.bgr; - const vec3 w8 = w3.bgr; - const vec3 w9 = w2.bgr; - const vec3 w10 = w1.bgr; - const vec3 w11 = w0.bgr; + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = 
eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = w5.bgr; + const float3 w7 = w4.bgr; + const float3 w8 = w3.bgr; + const float3 w9 = w2.bgr; + const float3 w10 = w1.bgr; + const float3 w11 = w0.bgr; // Get the weight sum to normalize the total to 1.0 later: - const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5; - const vec3 w_sum = half_sum + half_sum.bgr; - const vec3 w_sum_inv = vec3(1.0)/w_sum; + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/w_sum; // Scale the pixel-space to texture offset matrix by the pixel diameter. - const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); // Get uv sample offsets, mirror on odd frames if directed, and exploit // diagonal symmetry: - const vec2 frame_sign = get_frame_sign(frame); - const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset3 = (xy_offset3 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset4 = (xy_offset4 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset5 = (xy_offset5 * frame_sign * true_pixel_to_tex_uv); + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); // Load samples, linearizing 
if necessary, etc.: - const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; - const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; - const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; - const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; - const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; - const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; - const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; - const vec3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; - const vec3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; - const vec3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; - const vec3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; - const vec3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; // Sum weighted samples (weight sum must equal 1.0 for each channel): return 
w_sum_inv * ( w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + @@ -882,8 +883,8 @@ vec3 tex2Daa12x(const sampler2D tex, const vec2 tex_uv, w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11); } -vec3 tex2Daa16x(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa16x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Use a diagonally symmetric 16-superqueens pattern where no 3 points are // exactly collinear. @@ -903,74 +904,74 @@ vec3 tex2Daa16x(const sampler2D tex, const vec2 tex_uv, // . . . Q . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (3.0, 13.0)/16 // . . . . . . Q . . . . . . . . . : off =(-7.5, -7.5)/16 + (6.0, 14.0)/16 // . . . . . . . . . . . . . Q . . : off =(-7.5, -7.5)/16 + (13.0, 15.0)/16 - const float grid_size = 16.0; + static const float grid_size = 16.0; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; - const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; - const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; // Get the xy offset of each sample. 
Exploit diagonal symmetry: - const vec2 xy_offset0 = xy_start_offset + vec2(2.0, 0.0) * xy_step; - const vec2 xy_offset1 = xy_start_offset + vec2(9.0, 1.0) * xy_step; - const vec2 xy_offset2 = xy_start_offset + vec2(12.0, 2.0) * xy_step; - const vec2 xy_offset3 = xy_start_offset + vec2(4.0, 3.0) * xy_step; - const vec2 xy_offset4 = xy_start_offset + vec2(8.0, 4.0) * xy_step; - const vec2 xy_offset5 = xy_start_offset + vec2(14.0, 5.0) * xy_step; - const vec2 xy_offset6 = xy_start_offset + vec2(0.0, 6.0) * xy_step; - const vec2 xy_offset7 = xy_start_offset + vec2(10.0, 7.0) * xy_step; + const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(12.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(4.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(8.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(14.0, 5.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(0.0, 6.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(10.0, 7.0) * xy_step; // Compute subpixel weights, and exploit diagonal symmetry for speed. 
- const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); - const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); - const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); - const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); - const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); - const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); - const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); - const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); - const vec3 w8 = w7.bgr; - const vec3 w9 = w6.bgr; - const vec3 w10 = w5.bgr; - const vec3 w11 = w4.bgr; - const vec3 w12 = w3.bgr; - const vec3 w13 = w2.bgr; - const vec3 w14 = w1.bgr; - const vec3 w15 = w0.bgr; + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = w7.bgr; + const float3 w9 = w6.bgr; + const float3 w10 = w5.bgr; + const float3 w11 = w4.bgr; + const float3 w12 = w3.bgr; + const float3 w13 = w2.bgr; + const float3 w14 = w1.bgr; + const float3 w15 = w0.bgr; // Get the weight sum to normalize the total to 1.0 later: - const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7; - const vec3 w_sum = half_sum + half_sum.bgr; - const vec3 w_sum_inv = vec3(1.0)/(w_sum); + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7; + const float3 w_sum = 
half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); // Scale the pixel-space to texture offset matrix by the pixel diameter. - const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); // Get uv sample offsets, mirror on odd frames if directed, and exploit // diagonal symmetry: - const vec2 frame_sign = get_frame_sign(frame); - const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset3 = (xy_offset3 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset4 = (xy_offset4 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset5 = (xy_offset5 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset6 = (xy_offset6 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset7 = (xy_offset7 * frame_sign * true_pixel_to_tex_uv); + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); // Load samples, linearizing if necessary, etc.: - const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; - const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; - const vec3 
sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; - const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; - const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; - const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; - const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; - const vec3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; - const vec3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; - const vec3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; - const vec3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; - const vec3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; - const vec3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; - const vec3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; - const vec3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; - const vec3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample11 = 
tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; // Sum weighted samples (weight sum must equal 1.0 for each channel): return w_sum_inv * ( w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + @@ -979,8 +980,8 @@ vec3 tex2Daa16x(const sampler2D tex, const vec2 tex_uv, w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15); } -vec3 tex2Daa20x(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa20x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Use a diagonally symmetric 20-superqueens pattern where no 3 points are // exactly collinear and superqueens have a squared attack radius of 13. @@ -1004,86 +1005,86 @@ vec3 tex2Daa20x(const sampler2D tex, const vec2 tex_uv, // . . . . . . . . Q . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (8.0, 17.0)/20 // . . . Q . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (3.0, 18.0)/20 // . . . . . . . . . . . . Q . . . . . . . 
: off =(-9.5, -9.5)/20 + (12.0, 19.0)/20 - const float grid_size = 20.0; + static const float grid_size = 20.0; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; - const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; - const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; // Get the xy offset of each sample. Exploit diagonal symmetry: - const vec2 xy_offset0 = xy_start_offset + vec2(7.0, 0.0) * xy_step; - const vec2 xy_offset1 = xy_start_offset + vec2(16.0, 1.0) * xy_step; - const vec2 xy_offset2 = xy_start_offset + vec2(11.0, 2.0) * xy_step; - const vec2 xy_offset3 = xy_start_offset + vec2(1.0, 3.0) * xy_step; - const vec2 xy_offset4 = xy_start_offset + vec2(5.0, 4.0) * xy_step; - const vec2 xy_offset5 = xy_start_offset + vec2(15.0, 5.0) * xy_step; - const vec2 xy_offset6 = xy_start_offset + vec2(10.0, 6.0) * xy_step; - const vec2 xy_offset7 = xy_start_offset + vec2(19.0, 7.0) * xy_step; - const vec2 xy_offset8 = xy_start_offset + vec2(2.0, 8.0) * xy_step; - const vec2 xy_offset9 = xy_start_offset + vec2(6.0, 9.0) * xy_step; + const float2 xy_offset0 = xy_start_offset + float2(7.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(11.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) 
* xy_step; + const float2 xy_offset6 = xy_start_offset + float2(10.0, 6.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(19.0, 7.0) * xy_step; + const float2 xy_offset8 = xy_start_offset + float2(2.0, 8.0) * xy_step; + const float2 xy_offset9 = xy_start_offset + float2(6.0, 9.0) * xy_step; // Compute subpixel weights, and exploit diagonal symmetry for speed. - const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); - const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); - const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); - const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); - const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); - const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); - const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); - const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); - const vec3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance); - const vec3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance); - const vec3 w10 = w9.bgr; - const vec3 w11 = w8.bgr; - const vec3 w12 = w7.bgr; - const vec3 w13 = w6.bgr; - const vec3 w14 = w5.bgr; - const vec3 w15 = w4.bgr; - const vec3 w16 = w3.bgr; - const vec3 w17 = w2.bgr; - const vec3 w18 = w1.bgr; - const vec3 w19 = w0.bgr; + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = 
eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance); + const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance); + const float3 w10 = w9.bgr; + const float3 w11 = w8.bgr; + const float3 w12 = w7.bgr; + const float3 w13 = w6.bgr; + const float3 w14 = w5.bgr; + const float3 w15 = w4.bgr; + const float3 w16 = w3.bgr; + const float3 w17 = w2.bgr; + const float3 w18 = w1.bgr; + const float3 w19 = w0.bgr; // Get the weight sum to normalize the total to 1.0 later: - const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9; - const vec3 w_sum = half_sum + half_sum.bgr; - const vec3 w_sum_inv = vec3(1.0)/(w_sum); + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); // Scale the pixel-space to texture offset matrix by the pixel diameter. - const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); // Get uv sample offsets, mirror on odd frames if directed, and exploit // diagonal symmetry: - const vec2 frame_sign = get_frame_sign(frame); - const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset3 = (xy_offset3 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset4 = (xy_offset4 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset5 = (xy_offset5 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset6 = (xy_offset6 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset7 = (xy_offset7 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset8 = (xy_offset8 * frame_sign * true_pixel_to_tex_uv); - const 
vec2 uv_offset9 = (xy_offset9 * frame_sign * true_pixel_to_tex_uv); + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign); + const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign); // Load samples, linearizing if necessary, etc.: - const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; - const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; - const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; - const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; - const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; - const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; - const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; - const vec3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; - const vec3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb; - const vec3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb; - const vec3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb; - const vec3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb; - const vec3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; - const vec3 
sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; - const vec3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; - const vec3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; - const vec3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; - const vec3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; - const vec3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; - const vec3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; + const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const 
float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; // Sum weighted samples (weight sum must equal 1.0 for each channel): return w_sum_inv * ( w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + @@ -1093,8 +1094,8 @@ vec3 tex2Daa20x(const sampler2D tex, const vec2 tex_uv, w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19); } -vec3 tex2Daa24x(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa24x(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Use a diagonally symmetric 24-superqueens pattern where no 3 points are // exactly collinear and superqueens have a squared attack radius of 13. @@ -1122,99 +1123,99 @@ vec3 tex2Daa24x(const sampler2D tex, const vec2 tex_uv, // . . . . . . . . . . . . . Q . . . . . . . . . . : off =(-11.5, -11.5)/24 + (13.0, 21.0)/24 // . . . . . . . Q . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (7.0, 22.0)/24 // . . . . . . . . . . . . . . . . . Q . . . . . . : off =(-11.5, -11.5)/24 + (17.0, 23.0)/24 - const float grid_size = 24.0; + static const float grid_size = 24.0; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; - const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; - const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; // Get the xy offset of each sample. 
Exploit diagonal symmetry: - const vec2 xy_offset0 = xy_start_offset + vec2(6.0, 0.0) * xy_step; - const vec2 xy_offset1 = xy_start_offset + vec2(16.0, 1.0) * xy_step; - const vec2 xy_offset2 = xy_start_offset + vec2(10.0, 2.0) * xy_step; - const vec2 xy_offset3 = xy_start_offset + vec2(21.0, 3.0) * xy_step; - const vec2 xy_offset4 = xy_start_offset + vec2(5.0, 4.0) * xy_step; - const vec2 xy_offset5 = xy_start_offset + vec2(15.0, 5.0) * xy_step; - const vec2 xy_offset6 = xy_start_offset + vec2(1.0, 6.0) * xy_step; - const vec2 xy_offset7 = xy_start_offset + vec2(11.0, 7.0) * xy_step; - const vec2 xy_offset8 = xy_start_offset + vec2(19.0, 8.0) * xy_step; - const vec2 xy_offset9 = xy_start_offset + vec2(23.0, 9.0) * xy_step; - const vec2 xy_offset10 = xy_start_offset + vec2(3.0, 10.0) * xy_step; - const vec2 xy_offset11 = xy_start_offset + vec2(14.0, 11.0) * xy_step; + const float2 xy_offset0 = xy_start_offset + float2(6.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(10.0, 2.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(21.0, 3.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(1.0, 6.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(11.0, 7.0) * xy_step; + const float2 xy_offset8 = xy_start_offset + float2(19.0, 8.0) * xy_step; + const float2 xy_offset9 = xy_start_offset + float2(23.0, 9.0) * xy_step; + const float2 xy_offset10 = xy_start_offset + float2(3.0, 10.0) * xy_step; + const float2 xy_offset11 = xy_start_offset + float2(14.0, 11.0) * xy_step; // Compute subpixel weights, and exploit diagonal symmetry for speed. 
- const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); - const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); - const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); - const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); - const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); - const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); - const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); - const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); - const vec3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance); - const vec3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance); - const vec3 w10 = eval_unorm_rgb_weights(xy_offset10, final_axis_importance); - const vec3 w11 = eval_unorm_rgb_weights(xy_offset11, final_axis_importance); - const vec3 w12 = w11.bgr; - const vec3 w13 = w10.bgr; - const vec3 w14 = w9.bgr; - const vec3 w15 = w8.bgr; - const vec3 w16 = w7.bgr; - const vec3 w17 = w6.bgr; - const vec3 w18 = w5.bgr; - const vec3 w19 = w4.bgr; - const vec3 w20 = w3.bgr; - const vec3 w21 = w2.bgr; - const vec3 w22 = w1.bgr; - const vec3 w23 = w0.bgr; + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance); + const float3 w9 = 
eval_unorm_rgb_weights(xy_offset9, final_axis_importance); + const float3 w10 = eval_unorm_rgb_weights(xy_offset10, final_axis_importance); + const float3 w11 = eval_unorm_rgb_weights(xy_offset11, final_axis_importance); + const float3 w12 = w11.bgr; + const float3 w13 = w10.bgr; + const float3 w14 = w9.bgr; + const float3 w15 = w8.bgr; + const float3 w16 = w7.bgr; + const float3 w17 = w6.bgr; + const float3 w18 = w5.bgr; + const float3 w19 = w4.bgr; + const float3 w20 = w3.bgr; + const float3 w21 = w2.bgr; + const float3 w22 = w1.bgr; + const float3 w23 = w0.bgr; // Get the weight sum to normalize the total to 1.0 later: - const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11; - const vec3 w_sum = half_sum + half_sum.bgr; - const vec3 w_sum_inv = vec3(1.0)/(w_sum); + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); // Scale the pixel-space to texture offset matrix by the pixel diameter. 
- const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); // Get uv sample offsets, mirror on odd frames if directed, and exploit // diagonal symmetry: - const vec2 frame_sign = get_frame_sign(frame); - const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset3 = (xy_offset3 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset4 = (xy_offset4 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset5 = (xy_offset5 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset6 = (xy_offset6 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset7 = (xy_offset7 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset8 = (xy_offset8 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset9 = (xy_offset9 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset10 = (xy_offset10 * frame_sign * true_pixel_to_tex_uv); - const vec2 uv_offset11 = (xy_offset11 * frame_sign * true_pixel_to_tex_uv); + const float2 frame_sign = get_frame_sign(frame); + const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + const float2 uv_offset8 = mul(true_pixel_to_tex_uv, 
xy_offset8 * frame_sign); + const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign); + const float2 uv_offset10 = mul(true_pixel_to_tex_uv, xy_offset10 * frame_sign); + const float2 uv_offset11 = mul(true_pixel_to_tex_uv, xy_offset11 * frame_sign); // Load samples, linearizing if necessary, etc.: - const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; - const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; - const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; - const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; - const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; - const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; - const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; - const vec3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; - const vec3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb; - const vec3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb; - const vec3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset10).rgb; - const vec3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset11).rgb; - const vec3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset11).rgb; - const vec3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset10).rgb; - const vec3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb; - const vec3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb; - const vec3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; - const vec3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; - const vec3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; - const vec3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; - const vec3 sample20 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; - const vec3 sample21 = 
tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; - const vec3 sample22 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; - const vec3 sample23 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; + const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset10).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset11).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset11).rgb; + const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset10).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb; + const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb; + const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb; + const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb; + const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb; + const float3 sample20 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb; + const float3 sample21 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb; + const 
float3 sample22 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb; + const float3 sample23 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb; // Sum weighted samples (weight sum must equal 1.0 for each channel): return w_sum_inv * ( w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + @@ -1225,78 +1226,78 @@ vec3 tex2Daa24x(const sampler2D tex, const vec2 tex_uv, w20 * sample20 + w21 * sample21 + w22 * sample22 + w23 * sample23); } -vec3 tex2Daa_debug_16x_regular(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa_debug_16x_regular(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Sample on a regular 4x4 grid. This is mainly for testing. - const float grid_size = 4.0; + static const float grid_size = 4.0; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; - const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; - const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; + const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter; + const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step; // Get the xy offset of each sample: - const vec2 xy_offset0 = xy_start_offset + vec2(0.0, 0.0) * xy_step; - const vec2 xy_offset1 = xy_start_offset + vec2(1.0, 0.0) * xy_step; - const vec2 xy_offset2 = xy_start_offset + vec2(2.0, 0.0) * xy_step; - const vec2 xy_offset3 = xy_start_offset + vec2(3.0, 0.0) * xy_step; - const vec2 xy_offset4 = xy_start_offset + vec2(0.0, 1.0) * xy_step; - const vec2 xy_offset5 = xy_start_offset + vec2(1.0, 1.0) * xy_step; - const vec2 
xy_offset6 = xy_start_offset + vec2(2.0, 1.0) * xy_step; - const vec2 xy_offset7 = xy_start_offset + vec2(3.0, 1.0) * xy_step; + const float2 xy_offset0 = xy_start_offset + float2(0.0, 0.0) * xy_step; + const float2 xy_offset1 = xy_start_offset + float2(1.0, 0.0) * xy_step; + const float2 xy_offset2 = xy_start_offset + float2(2.0, 0.0) * xy_step; + const float2 xy_offset3 = xy_start_offset + float2(3.0, 0.0) * xy_step; + const float2 xy_offset4 = xy_start_offset + float2(0.0, 1.0) * xy_step; + const float2 xy_offset5 = xy_start_offset + float2(1.0, 1.0) * xy_step; + const float2 xy_offset6 = xy_start_offset + float2(2.0, 1.0) * xy_step; + const float2 xy_offset7 = xy_start_offset + float2(3.0, 1.0) * xy_step; // Compute subpixel weights, and exploit diagonal symmetry for speed. // (We can't exploit vertical or horizontal symmetry due to uncertain // subpixel offsets. We could fix that by rotating xy offsets with the // subpixel structure, but...no.) - const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); - const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); - const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); - const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); - const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); - const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); - const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); - const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); - const vec3 w8 = w7.bgr; - const vec3 w9 = w6.bgr; - const vec3 w10 = w5.bgr; - const vec3 w11 = w4.bgr; - const vec3 w12 = w3.bgr; - const vec3 w13 = w2.bgr; - const vec3 w14 = w1.bgr; - const vec3 w15 = w0.bgr; + const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const float3 w2 = 
eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const float3 w8 = w7.bgr; + const float3 w9 = w6.bgr; + const float3 w10 = w5.bgr; + const float3 w11 = w4.bgr; + const float3 w12 = w3.bgr; + const float3 w13 = w2.bgr; + const float3 w14 = w1.bgr; + const float3 w15 = w0.bgr; // Get the weight sum to normalize the total to 1.0 later: - const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7; - const vec3 w_sum = half_sum + half_sum.bgr; - const vec3 w_sum_inv = vec3(1.0)/(w_sum); + const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7; + const float3 w_sum = half_sum + half_sum.bgr; + const float3 w_sum_inv = float3(1.0)/(w_sum); // Scale the pixel-space to texture offset matrix by the pixel diameter. 
- const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); // Get uv sample offsets, taking advantage of row alignment: - const vec2 uv_step_x = (vec2(xy_step.x, 0.0) * true_pixel_to_tex_uv); - const vec2 uv_step_y = (vec2(0.0, xy_step.y) * true_pixel_to_tex_uv); - const vec2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y); - const vec2 sample0_uv = tex_uv + uv_offset0; - const vec2 sample4_uv = sample0_uv + uv_step_y; - const vec2 sample8_uv = sample0_uv + uv_step_y * 2.0; - const vec2 sample12_uv = sample0_uv + uv_step_y * 3.0; + const float2 uv_step_x = mul(true_pixel_to_tex_uv, float2(xy_step.x, 0.0)); + const float2 uv_step_y = mul(true_pixel_to_tex_uv, float2(0.0, xy_step.y)); + const float2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y); + const float2 sample0_uv = tex_uv + uv_offset0; + const float2 sample4_uv = sample0_uv + uv_step_y; + const float2 sample8_uv = sample0_uv + uv_step_y * 2.0; + const float2 sample12_uv = sample0_uv + uv_step_y * 3.0; // Load samples, linearizing if necessary, etc.: - const vec3 sample0 = tex2Daa_tiled_linearize(tex, sample0_uv).rgb; - const vec3 sample1 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x).rgb; - const vec3 sample2 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 2.0).rgb; - const vec3 sample3 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 3.0).rgb; - const vec3 sample4 = tex2Daa_tiled_linearize(tex, sample4_uv).rgb; - const vec3 sample5 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x).rgb; - const vec3 sample6 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 2.0).rgb; - const vec3 sample7 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 3.0).rgb; - const vec3 sample8 = tex2Daa_tiled_linearize(tex, sample8_uv).rgb; - const vec3 sample9 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x).rgb; - const vec3 sample10 = tex2Daa_tiled_linearize(tex, 
sample8_uv + uv_step_x * 2.0).rgb; - const vec3 sample11 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 3.0).rgb; - const vec3 sample12 = tex2Daa_tiled_linearize(tex, sample12_uv).rgb; - const vec3 sample13 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x).rgb; - const vec3 sample14 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 2.0).rgb; - const vec3 sample15 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 3.0).rgb; + const float3 sample0 = tex2Daa_tiled_linearize(tex, sample0_uv).rgb; + const float3 sample1 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x).rgb; + const float3 sample2 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 2.0).rgb; + const float3 sample3 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 3.0).rgb; + const float3 sample4 = tex2Daa_tiled_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x).rgb; + const float3 sample6 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 2.0).rgb; + const float3 sample7 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 3.0).rgb; + const float3 sample8 = tex2Daa_tiled_linearize(tex, sample8_uv).rgb; + const float3 sample9 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x).rgb; + const float3 sample10 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 2.0).rgb; + const float3 sample11 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 3.0).rgb; + const float3 sample12 = tex2Daa_tiled_linearize(tex, sample12_uv).rgb; + const float3 sample13 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x).rgb; + const float3 sample14 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 2.0).rgb; + const float3 sample15 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 3.0).rgb; // Sum weighted samples (weight sum must equal 1.0 for each channel): return w_sum_inv * ( w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + @@ -1305,54 +1306,56 @@ vec3 
tex2Daa_debug_16x_regular(const sampler2D tex, const vec2 tex_uv, w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15); } -vec3 tex2Daa_debug_dynamic(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +float3 tex2Daa_debug_dynamic(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // This function is for testing only: Use an NxN grid with dynamic weights. - const int grid_size = 8; + static const int grid_size = 8; assign_aa_cubic_constants(); - const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); - const vec2 subpixel_support_diameter = ssd_fai.xy; - const vec2 final_axis_importance = ssd_fai.zw; + const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const float2 subpixel_support_diameter = ssd_fai.xy; + const float2 final_axis_importance = ssd_fai.zw; const float grid_radius_in_samples = (float(grid_size) - 1.0)/2.0; - const vec2 filter_space_offset_step = - subpixel_support_diameter/vec2(grid_size); - const vec2 sample0_filter_space_offset = + const float2 filter_space_offset_step = + subpixel_support_diameter/float2(grid_size); + const float2 sample0_filter_space_offset = -grid_radius_in_samples * filter_space_offset_step; // Compute xy sample offsets and subpixel weights: - vec3 weights[grid_size * grid_size]; - vec3 weight_sum = vec3(0.0); + float3 weights[grid_size * grid_size]; + float3 weight_sum = float3(0.0, 0.0, 0.0); for(int i = 0; i < grid_size; ++i) { for(int j = 0; j < grid_size; ++j) { // Weights based on xy distances: - const vec2 offset = sample0_filter_space_offset + - vec2(j, i) * filter_space_offset_step; - const vec3 weight = eval_unorm_rgb_weights(offset, final_axis_importance); + const float2 offset = sample0_filter_space_offset + + float2(j, i) * filter_space_offset_step; + const float3 weight = eval_unorm_rgb_weights(offset, final_axis_importance); weights[i*grid_size + j] = weight; 
weight_sum += weight; } } // Get uv offset vectors along x and y directions: - const mat2x2 true_pixel_to_tex_uv = - mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); - const vec2 uv_offset_step_x = (vec2(filter_space_offset_step.x, 0.0) * true_pixel_to_tex_uv); - const vec2 uv_offset_step_y = (vec2(0.0, filter_space_offset_step.y) * true_pixel_to_tex_uv); + const float2x2 true_pixel_to_tex_uv = + float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter)); + const float2 uv_offset_step_x = mul(true_pixel_to_tex_uv, + float2(filter_space_offset_step.x, 0.0)); + const float2 uv_offset_step_y = mul(true_pixel_to_tex_uv, + float2(0.0, filter_space_offset_step.y)); // Get a starting sample location: - const vec2 sample0_uv_offset = -grid_radius_in_samples * + const float2 sample0_uv_offset = -grid_radius_in_samples * (uv_offset_step_x + uv_offset_step_y); - const vec2 sample0_uv = tex_uv + sample0_uv_offset; + const float2 sample0_uv = tex_uv + sample0_uv_offset; // Load, weight, and sum [linearized] samples: - vec3 sum = vec3(0.0); - const vec3 weight_sum_inv = vec3(1.0)/vec3(weight_sum); + float3 sum = float3(0.0, 0.0, 0.0); + const float3 weight_sum_inv = float3(1.0)/weight_sum; for(int i = 0; i < grid_size; ++i) { - const vec2 row_i_first_sample_uv = + const float2 row_i_first_sample_uv = sample0_uv + i * uv_offset_step_y; for(int j = 0; j < grid_size; ++j) { - const vec2 sample_uv = + const float2 sample_uv = row_i_first_sample_uv + j * uv_offset_step_x; sum += weights[i*grid_size + j] * tex2Daa_tiled_linearize(tex, sample_uv).rgb; @@ -1361,26 +1364,30 @@ vec3 tex2Daa_debug_dynamic(const sampler2D tex, const vec2 tex_uv, return sum * weight_sum_inv; } + /////////////////////// ANTIALIASING CODEPATH SELECTION ////////////////////// -vec3 tex2Daa(const sampler2D tex, const vec2 tex_uv, - const mat2x2 pixel_to_tex_uv, const float frame) +inline float3 tex2Daa(const sampler2D tex, const float2 tex_uv, + const float2x2 pixel_to_tex_uv, const float frame) { // Statically 
switch between antialiasing modes/levels: - if (aa_level < 0.5) return tex2D_linearize(tex, tex_uv).rgb; - else if (aa_level < 3.5) return tex2Daa_subpixel_weights_only( - tex, tex_uv, pixel_to_tex_uv); - else if (aa_level < 4.5) return tex2Daa4x(tex, tex_uv, pixel_to_tex_uv, frame); - else if (aa_level < 5.5) return tex2Daa5x(tex, tex_uv, pixel_to_tex_uv, frame); - else if (aa_level < 6.5) return tex2Daa6x(tex, tex_uv, pixel_to_tex_uv, frame); - else if (aa_level < 7.5) return tex2Daa7x(tex, tex_uv, pixel_to_tex_uv, frame); - else if (aa_level < 11.5) return tex2Daa8x(tex, tex_uv, pixel_to_tex_uv, frame); - else if (aa_level < 15.5) return tex2Daa12x(tex, tex_uv, pixel_to_tex_uv, frame); - else if (aa_level < 19.5) return tex2Daa16x(tex, tex_uv, pixel_to_tex_uv, frame); - else if (aa_level < 23.5) return tex2Daa20x(tex, tex_uv, pixel_to_tex_uv, frame); - else if (aa_level < 253.5) return tex2Daa24x(tex, tex_uv, pixel_to_tex_uv, frame); - else if (aa_level < 254.5) return tex2Daa_debug_16x_regular(tex, tex_uv, pixel_to_tex_uv, frame); - else return tex2Daa_debug_dynamic(tex, tex_uv, pixel_to_tex_uv, frame); + return (aa_level < 0.5) ? tex2D_linearize(tex, tex_uv).rgb : + (aa_level < 3.5) ? tex2Daa_subpixel_weights_only( + tex, tex_uv, pixel_to_tex_uv) : + (aa_level < 4.5) ? tex2Daa4x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 5.5) ? tex2Daa5x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 6.5) ? tex2Daa6x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 7.5) ? tex2Daa7x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 11.5) ? tex2Daa8x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 15.5) ? tex2Daa12x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 19.5) ? tex2Daa16x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 23.5) ? tex2Daa20x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 253.5) ? tex2Daa24x(tex, tex_uv, pixel_to_tex_uv, frame) : + (aa_level < 254.5) ? 
tex2Daa_debug_16x_regular( + tex, tex_uv, pixel_to_tex_uv, frame) : + tex2Daa_debug_dynamic(tex, tex_uv, pixel_to_tex_uv, frame); } -#endif // TEX2DANTIALIAS_H \ No newline at end of file + +#endif // TEX2DANTIALIAS_H + diff --git a/crt/shaders/crt-royale/src/user-preset-constants.h b/crt/shaders/crt-royale/src/user-cgp-constants.h similarity index 74% rename from crt/shaders/crt-royale/src/user-preset-constants.h rename to crt/shaders/crt-royale/src/user-cgp-constants.h index 93a77d5..25578cb 100644 --- a/crt/shaders/crt-royale/src/user-preset-constants.h +++ b/crt/shaders/crt-royale/src/user-cgp-constants.h @@ -11,14 +11,13 @@ // Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of // this shader: One does a viewport-scale bloom, and the other skips it. The // latter benefits from a higher bloom_approx_scale_x, so save both separately: -const float bloom_approx_size_x = 320.0; -const float bloom_approx_scale_x = 320.0; //dunno why this is necessary -const float bloom_approx_size_x_for_fake = 400.0; +static const float bloom_approx_size_x = 320.0; +static const float bloom_approx_size_x_for_fake = 400.0; // Copy the viewport-relative scales of the phosphor mask resize passes // (MASK_RESIZE and the pass immediately preceding it): -const vec2 mask_resize_viewport_scale = vec2(0.0625, 0.0625); +static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625); // Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: -const float geom_max_aspect_ratio = 4.0/3.0; +static const float geom_max_aspect_ratio = 4.0/3.0; // PHOSPHOR MASK TEXTURE CONSTANTS: // Set the following constants to reflect the properties of the phosphor mask @@ -26,32 +25,32 @@ const float geom_max_aspect_ratio = 4.0/3.0; // based on user settings, then repeats a single tile until filling the screen. 
// The shader must know the input texture size (default 64x64), and to manually // resize, it must also know the horizontal triads per tile (default 8). -const vec2 mask_texture_small_size = vec2(64.0); -const vec2 mask_texture_large_size = vec2(512.0); -const float mask_triads_per_tile = 8.0; +static const float2 mask_texture_small_size = float2(64.0, 64.0); +static const float2 mask_texture_large_size = float2(512.0, 512.0); +static const float mask_triads_per_tile = 8.0; // We need the average brightness of the phosphor mask to compensate for the // dimming it causes. The following four values are roughly correct for the // masks included with the shader. Update the value for any LUT texture you // change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether // the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). //#define PHOSPHOR_MASK_GRILLE14 -const float mask_grille14_avg_color = 50.6666666/255.0; +static const float mask_grille14_avg_color = 50.6666666/255.0; // TileableLinearApertureGrille14Wide7d33Spacing*.png // TileableLinearApertureGrille14Wide10And6Spacing*.png -const float mask_grille15_avg_color = 53.0/255.0; +static const float mask_grille15_avg_color = 53.0/255.0; // TileableLinearApertureGrille15Wide6d33Spacing*.png // TileableLinearApertureGrille15Wide8And5d5Spacing*.png -const float mask_slot_avg_color = 46.0/255.0; +static const float mask_slot_avg_color = 46.0/255.0; // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png -const float mask_shadow_avg_color = 41.0/255.0; +static const float mask_shadow_avg_color = 41.0/255.0; // TileableLinearShadowMask*.png // TileableLinearShadowMaskEDP*.png #ifdef PHOSPHOR_MASK_GRILLE14 - const float mask_grille_avg_color = mask_grille14_avg_color; + static const float mask_grille_avg_color = mask_grille14_avg_color; #else - const float mask_grille_avg_color = 
mask_grille15_avg_color; + static const float mask_grille_avg_color = mask_grille15_avg_color; #endif diff --git a/crt/shaders/crt-royale/user-settings.h b/crt/shaders/crt-royale/user-settings.h index cc375df..211d624 100644 --- a/crt/shaders/crt-royale/user-settings.h +++ b/crt/shaders/crt-royale/user-settings.h @@ -15,8 +15,8 @@ // Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. // Among other things, derivatives help us fix anisotropic filtering artifacts // with curved manually tiled phosphor mask coords. Related errors: -// error C3004: function "vec2 ddx(vec2);" not supported in this profile -// error C3004: function "vec2 ddy(vec2);" not supported in this profile +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile //#define DRIVERS_ALLOW_DERIVATIVES // Fine derivatives: Unsupported on older ATI cards. @@ -43,13 +43,13 @@ // tex2Dlod: Requires an fp40 or newer profile. This can be used to disable // anisotropic filtering, thereby fixing related artifacts. Related errors: -// error C3004: function "vec4 tex2Dlod(sampler2D, vec4);" not supported in +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in // this profile //#define DRIVERS_ALLOW_TEX2DLOD // tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate // artifacts from anisotropic filtering and mipmapping. Related errors: -// error C3004: function "vec4 tex2Dbias(sampler2D, vec4);" not supported +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported // in this profile //#define DRIVERS_ALLOW_TEX2DBIAS @@ -124,30 +124,30 @@ // options that were cleaner or more convert to code as static constants. 
// GAMMA: - const float crt_gamma_static = 2.5; // range [1, 5] - const float lcd_gamma_static = 2.2; // range [1, 5] + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] // LEVELS MANAGEMENT: // Control the final multiplicative image contrast: - const float levels_contrast_static = 1.0; // range [0, 4) + static const float levels_contrast_static = 1.0; // range [0, 4) // We auto-dim to avoid clipping between passes and restore brightness // later. Control the dim factor here: Lower values clip less but crush // blacks more (static only for now). - const float levels_autodim_temp = 0.5; // range (0, 1] + static const float levels_autodim_temp = 0.5; // range (0, 1] // HALATION/DIFFUSION/BLOOM: // Halation weight: How much energy should be lost to electrons bounding // around under the CRT glass and exciting random phosphors? - const float halation_weight_static = 0.0; // range [0, 1] + static const float halation_weight_static = 0.0; // range [0, 1] // Refractive diffusion weight: How much light should spread/diffuse from // refracting through the CRT glass? - const float diffusion_weight_static = 0.075; // range [0, 1] + static const float diffusion_weight_static = 0.075; // range [0, 1] // Underestimate brightness: Bright areas bloom more, but we can base the // bloom brightpass on a lower brightness to sharpen phosphors, or a higher // brightness to soften them. Low values clip, but >= 0.8 looks okay. - const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] // Blur all colors more than necessary for a softer phosphor bloom? - const float bloom_excess_static = 0.0; // range [0, 1] + static const float bloom_excess_static = 0.0; // range [0, 1] // The BLOOM_APPROX pass approximates a phosphor blur early on with a small // blurred resize of the input (convergence offsets are applied as well). 
// There are three filter options (static option only for now): @@ -159,7 +159,11 @@ // mask_num_triads_desired. // 2.) True 4x4 Gaussian resize: Slowest, technically correct. // These options are more pronounced for the fast, unbloomed shader version. - const float bloom_approx_filter_static = 2.0; +#ifndef RADEON_FIX + static const float bloom_approx_filter_static = 2.0; +#else + static const float bloom_approx_filter_static = 1.0; +#endif // ELECTRON BEAM SCANLINE DISTRIBUTION: // How many scanlines should contribute light to each pixel? Using more @@ -172,68 +176,68 @@ // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized - const float beam_num_scanlines = 3.0; // range [2, 6] + static const float beam_num_scanlines = 3.0; // range [2, 6] // A generalized Gaussian beam varies shape with color too, now just width. // It's slower but more flexible (static option only for now). - bool beam_generalized_gaussian = true; + static const bool beam_generalized_gaussian = true; // What kind of scanline antialiasing do you want? // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral // Integrals are slow (especially for generalized Gaussians) and rarely any // better than 3x antialiasing (static option only for now). - const float beam_antialias_level = 1.0; // range [0, 2] + static const float beam_antialias_level = 1.0; // range [0, 2] // Min/max standard deviations for scanline beams: Higher values widen and // soften scanlines. Depending on other options, low min sigmas can alias. 
- const float beam_min_sigma_static = 0.02; // range (0, 1] - const float beam_max_sigma_static = 0.3; // range (0, 1] + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] // Beam width varies as a function of color: A power function (0) is more // configurable, but a spherical function (1) gives the widest beam // variability without aliasing (static option only for now). - const float beam_spot_shape_function = 0.0; + static const float beam_spot_shape_function = 0.0; // Spot shape power: Powers <= 1 give smoother spot shapes but lower // sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close. - const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] // Generalized Gaussian max shape parameters: Higher values give flatter // scanline plateaus and steeper dropoffs, simultaneously widening and // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and // values > ~40.0 cause artifacts with integrals. - const float beam_min_shape_static = 2.0; // range [2, 32] - const float beam_max_shape_static = 4.0; // range [2, 32] + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] // Generalized Gaussian shape power: Affects how quickly the distribution // changes shape from Gaussian to steep/plateaued as color increases from 0 // to 1.0. Higher powers appear softer for most colors, and lower powers // appear sharper for most colors. - const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] // What filter should be used to sample scanlines horizontally? 
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) - const float beam_horiz_filter_static = 0.0; + static const float beam_horiz_filter_static = 0.0; // Standard deviation for horizontal Gaussian resampling: - const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] // Do horizontal scanline sampling in linear RGB (correct light mixing), // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- // limiting circuitry in some CRT's), or a weighted avg.? - const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] // Simulate scanline misconvergence? This needs 3x horizontal texture // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in // later passes (static option only for now). - bool beam_misconvergence = true; + static const bool beam_misconvergence = true; // Convergence offsets in x/y directions for R/G/B scanline beams in units // of scanlines. Positive offsets go right/down; ranges [-2, 2] - const vec2 convergence_offsets_r_static = vec2(0.0, 0.0); - const vec2 convergence_offsets_g_static = vec2(0.0, 0.0); - const vec2 convergence_offsets_b_static = vec2(0.0, 0.0); + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); // Detect interlacing (static option only for now)? - bool interlace_detect = true; + static const bool interlace_detect = true; // Assume 1080-line sources are interlaced? - const bool interlace_1080i_static = false; + static const bool interlace_1080i_static = false; // For interlaced sources, assume TFF (top-field first) or BFF order? // (Whether this matters depends on the nature of the interlaced input.) 
- const bool interlace_bff_static = false; + static const bool interlace_bff_static = false; // ANTIALIASING: // What AA level do you want for curvature/overscan/subpixels? Options: // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x // (Static option only for now) - const float aa_level = 12.0; // range [0, 24] + static const float aa_level = 12.0; // range [0, 24] // What antialiasing filter do you want (static option only)? Options: // 0: Box (separable), 1: Box (cylindrical), // 2: Tent (separable), 3: Tent (cylindrical), @@ -241,24 +245,24 @@ // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS - const float aa_filter = 6.0; // range [0, 9] + static const float aa_filter = 6.0; // range [0, 9] // Flip the sample grid on odd/even frames (static option only for now)? - const bool aa_temporal = false; + static const bool aa_temporal = false; // Use RGB subpixel offsets for antialiasing? The pixel is at green, and // the blue offset is the negative r offset; range [0, 0.5] - const vec2 aa_subpixel_r_offset_static = vec2(-1.0/3.0, 0.0);//vec2(0.0); + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. // 4.) C = 0.0 is a soft spline filter. - const float aa_cubic_c_static = 0.5; // range [0, 4] + static const float aa_cubic_c_static = 0.5; // range [0, 4] // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. 
- const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] // PHOSPHOR MASK: // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask - const float mask_type_static = 1.0; // range [0, 2] + static const float mask_type_static = 1.0; // range [0, 2] // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. @@ -268,11 +272,11 @@ // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. // This mode reuses the same masks, so triads will be enormous unless // you change the mask LUT filenames in your .cgp file. - const float mask_sample_mode_static = 0.0; // range [0, 2] + static const float mask_sample_mode_static = 0.0; // range [0, 2] // Prefer setting the triad size (0.0) or number on the screen (1.0)? // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size // will always be used to calculate the full bloom sigma statically. - const float mask_specify_num_triads_static = 0.0; // range [0, 1] + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] // Specify the phosphor triad size, in pixels. Each tile (usually with 8 // triads) will be rounded to the nearest integer tile size and clamped to // obey minimum size constraints (imposed to reduce downsize taps) and @@ -280,14 +284,14 @@ // To increase the size limit, double the viewport-relative scales for the // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h. 
// range [1, mask_texture_small_size/mask_triads_per_tile] -// const float mask_triad_size_desired_static = 24.0 / 8.0; + static const float mask_triad_size_desired_static = 24.0 / 8.0; // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the // final size will be rounded and constrained as above); default 480.0 - const float mask_num_triads_desired_static = 480.0; + static const float mask_num_triads_desired_static = 480.0; // How many lobes should the sinc/Lanczos resizer use? More lobes require // more samples and avoid moire a bit better, but some is unavoidable // depending on the destination size (static option for now). - const float mask_sinc_lobes = 3.0; // range [2, 4] + static const float mask_sinc_lobes = 3.0; // range [2, 4] // The mask is resized using a variable number of taps in each dimension, // but some Cg profiles always fetch a constant number of taps no matter // what (no dynamic branching). We can limit the maximum number of taps if @@ -295,27 +299,27 @@ // faster, but the limit IS enforced (static option only, forever); // range [1, mask_texture_small_size/mask_triads_per_tile] // TODO: Make this 1.0 and compensate with smarter sampling! - const float mask_min_allowed_triad_size = 2.0; + static const float mask_min_allowed_triad_size = 2.0; // GEOMETRY: // Geometry mode: // 0: Off (default), 1: Spherical mapping (like cgwg's), // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron - const float geom_mode_static = 0.0; // range [0, 3] + static const float geom_mode_static = 0.0; // range [0, 3] // Radius of curvature: Measured in units of your viewport's diagonal size. - const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] // View dist is the distance from the player to their physical screen, in // units of the viewport's diagonal size. It controls the field of view. 
- const float geom_view_dist_static = 2.0; // range [0.5, 1024] + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] // Tilt angle in radians (clockwise around up and right vectors): - const vec2 geom_tilt_angle_static = vec2(0.0, 0.0); // range [-pi, pi] + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] // Aspect ratio: When the true viewport size is unknown, this value is used // to help convert between the phosphor triad size and count, along with // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set // this equal to Retroarch's display aspect ratio (DAR) for best results; // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; // default (256/224)*(54/47) = 1.313069909 (see below) - const float geom_aspect_ratio_static = 1.313069909; + static const float geom_aspect_ratio_static = 1.313069909; // Before getting into overscan, here's some general aspect ratio info: // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR @@ -338,21 +342,21 @@ // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly // or adjust x/y independently to e.g. readd horizontal padding, as noted // above: Values < 1.0 zoom out; range (0, inf) - const vec2 geom_overscan_static = vec2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) // Compute a proper pixel-space to texture-space matrix even without ddx()/ // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering // with strong curvature (static option only for now). 
- const bool geom_force_correct_tangent_matrix = true; + static const bool geom_force_correct_tangent_matrix = true; // BORDERS: // Rounded border size in texture uv coords: - const float border_size_static = 0.015; // range [0, 0.5] + static const float border_size_static = 0.015; // range [0, 0.5] // Border darkness: Moderate values darken the border smoothly, and high // values make the image very dark just inside the border: - const float border_darkness_static = 2.0; // range [0, inf) + static const float border_darkness_static = 2.0; // range [0, inf) // Border compression: High numbers compress border transitions, narrowing // the dark border area. - const float border_compress_static = 2.5; // range [1, inf) + static const float border_compress_static = 2.5; // range [1, inf) #endif // USER_SETTINGS_H diff --git a/include/blur-functions.h b/include/blur-functions.h index bfef056..517a8cc 100644 --- a/include/blur-functions.h +++ b/include/blur-functions.h @@ -41,7 +41,7 @@ // dxdy = (IN.video_size/IN.output_size)/IN.texture_size // 6.) For separable blurs (tex2DblurNresize and tex2DblurNfast), // zero out the dxdy component in the unblurred dimension: -// dxdy = vec2(dxdy.x, 0.0) or vec2(0.0, dxdy.y) +// dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y) // Many blurs share these requirements: // 1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0, // or they will blur more in the lower-scaled dimension. @@ -145,6 +145,7 @@ // tex2Dblur43fast // tex2Dblur3x3resize + ///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// // Set static standard deviations, but allow users to override them with their @@ -157,70 +158,69 @@ // This distribution works such that blurring multiple times should // have the same result as a single larger blur. These values are // larger than default for blurs up to 6x and smaller thereafter. 
- const float blur3_std_dev = 0.84931640625; - const float blur4_std_dev = 0.84931640625; - const float blur5_std_dev = 1.0595703125; - const float blur6_std_dev = 1.06591796875; - const float blur7_std_dev = 1.17041015625; - const float blur8_std_dev = 1.1720703125; - const float blur9_std_dev = 1.2259765625; - const float blur10_std_dev = 1.21982421875; - const float blur11_std_dev = 1.25361328125; - const float blur12_std_dev = 1.2423828125; - const float blur17_std_dev = 1.27783203125; - const float blur25_std_dev = 1.2810546875; - const float blur31_std_dev = 1.28125; - const float blur43_std_dev = 1.28125; + static const float blur3_std_dev = 0.84931640625; + static const float blur4_std_dev = 0.84931640625; + static const float blur5_std_dev = 1.0595703125; + static const float blur6_std_dev = 1.06591796875; + static const float blur7_std_dev = 1.17041015625; + static const float blur8_std_dev = 1.1720703125; + static const float blur9_std_dev = 1.2259765625; + static const float blur10_std_dev = 1.21982421875; + static const float blur11_std_dev = 1.25361328125; + static const float blur12_std_dev = 1.2423828125; + static const float blur17_std_dev = 1.27783203125; + static const float blur25_std_dev = 1.2810546875; + static const float blur31_std_dev = 1.28125; + static const float blur43_std_dev = 1.28125; #else // The defaults are the largest values that keep the largest unused // blur term on each side <= 1.0/256.0. (We could get away with more // or be more conservative, but this compromise is pretty reasonable.) 
- const float blur3_std_dev = 0.62666015625; - const float blur4_std_dev = 0.66171875; - const float blur5_std_dev = 0.9845703125; - const float blur6_std_dev = 1.02626953125; - const float blur7_std_dev = 1.36103515625; - const float blur8_std_dev = 1.4080078125; - const float blur9_std_dev = 1.7533203125; - const float blur10_std_dev = 1.80478515625; - const float blur11_std_dev = 2.15986328125; - const float blur12_std_dev = 2.215234375; - const float blur17_std_dev = 3.45535583496; - const float blur25_std_dev = 5.3409576416; - const float blur31_std_dev = 6.86488037109; - const float blur43_std_dev = 10.1852050781; + static const float blur3_std_dev = 0.62666015625; + static const float blur4_std_dev = 0.66171875; + static const float blur5_std_dev = 0.9845703125; + static const float blur6_std_dev = 1.02626953125; + static const float blur7_std_dev = 1.36103515625; + static const float blur8_std_dev = 1.4080078125; + static const float blur9_std_dev = 1.7533203125; + static const float blur10_std_dev = 1.80478515625; + static const float blur11_std_dev = 2.15986328125; + static const float blur12_std_dev = 2.215234375; + static const float blur17_std_dev = 3.45535583496; + static const float blur25_std_dev = 5.3409576416; + static const float blur31_std_dev = 6.86488037109; + static const float blur43_std_dev = 10.1852050781; #endif // USE_BINOMIAL_BLUR_STD_DEVS #endif // OVERRIDE_BLUR_STD_DEVS #ifndef OVERRIDE_ERROR_BLURRING // error_blurring should be in [0.0, 1.0]. Higher values reduce ringing // in shared-sample blurs but increase blurring and feature shifting. 
- const float error_blurring = 0.5; + static const float error_blurring = 0.5; #endif -// Make a length squared helper macro (for usage with static constants): -#define LENGTH_SQ(vec) (dot(vec, vec)) ////////////////////////////////// INCLUDES ////////////////////////////////// // gamma-management.h relies on pass-specific settings to guide its behavior: // FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc. See it for details. #include "gamma-management.h" -//#include "quad-pixel-communication.h" +#include "quad-pixel-communication.h" #include "special-functions.h" + /////////////////////////////////// HELPERS ////////////////////////////////// -vec4 uv2_to_uv4(vec2 tex_uv) +inline float4 uv2_to_uv4(float2 tex_uv) { - // Make a vec2 uv offset safe for adding to vec4 tex2Dlod coords: - return vec4(tex_uv, 0.0, 0.0); + // Make a float2 uv offset safe for adding to float4 tex2Dlod coords: + return float4(tex_uv, 0.0, 0.0); } // Make a length squared helper macro (for usage with static constants): #define LENGTH_SQ(vec) (dot(vec, vec)) -float get_fast_gaussian_weight_sum_inv(const float sigma) +inline float get_fast_gaussian_weight_sum_inv(const float sigma) { // We can use the Gaussian integral to calculate the asymptotic weight for // the center pixel. Since the unnormalized center pixel weight is 1.0, @@ -241,10 +241,11 @@ float get_fast_gaussian_weight_sum_inv(const float sigma) (sigma - 0.0860587260734721))), 0.399334576340352/sigma); } + //////////////////// ARBITRARILY RESIZABLE SEPARABLE BLURS /////////////////// -vec3 tex2Dblur11resize(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Requires: Global requirements must be met (see file description). // Returns: A 1D 11x Gaussian blurred texture lookup using a 11-tap blur. 
@@ -262,7 +263,7 @@ vec3 tex2Dblur11resize(const sampler2D tex, const vec2 tex_uv, (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5)); // Statically normalize weights, sum weighted samples, and return. Blurs are // currently optimized for dynamic weights. - vec3 sum = vec3(0.0); + float3 sum = float3(0.0,0.0,0.0); sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb; sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; @@ -277,8 +278,8 @@ vec3 tex2Dblur11resize(const sampler2D tex, const vec2 tex_uv, return sum * weight_sum_inv; } -vec3 tex2Dblur9resize(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Requires: Global requirements must be met (see file description). // Returns: A 1D 9x Gaussian blurred texture lookup using a 9-tap blur. @@ -292,7 +293,7 @@ vec3 tex2Dblur9resize(const sampler2D tex, const vec2 tex_uv, const float w4 = exp(-16.0 * denom_inv); const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); + float3 sum = float3(0.0,0.0,0.0); sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb; sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; @@ -305,8 +306,8 @@ vec3 tex2Dblur9resize(const sampler2D tex, const vec2 tex_uv, return sum * weight_sum_inv; } -vec3 tex2Dblur7resize(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Requires: Global requirements must be met (see file description). // Returns: A 1D 7x Gaussian blurred texture lookup using a 7-tap blur. 
@@ -319,7 +320,7 @@ vec3 tex2Dblur7resize(const sampler2D tex, const vec2 tex_uv, const float w3 = exp(-9.0 * denom_inv); const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); + float3 sum = float3(0.0,0.0,0.0); sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb; sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; @@ -330,8 +331,8 @@ vec3 tex2Dblur7resize(const sampler2D tex, const vec2 tex_uv, return sum * weight_sum_inv; } -vec3 tex2Dblur5resize(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Requires: Global requirements must be met (see file description). // Returns: A 1D 5x Gaussian blurred texture lookup using a 5-tap blur. @@ -343,7 +344,7 @@ vec3 tex2Dblur5resize(const sampler2D tex, const vec2 tex_uv, const float w2 = exp(-4.0 * denom_inv); const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); + float3 sum = float3(0.0,0.0,0.0); sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb; sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; sum += w0 * tex2D_linearize(tex, tex_uv).rgb; @@ -352,8 +353,8 @@ vec3 tex2Dblur5resize(const sampler2D tex, const vec2 tex_uv, return sum * weight_sum_inv; } -vec3 tex2Dblur3resize(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Requires: Global requirements must be met (see file description). // Returns: A 1D 3x Gaussian blurred texture lookup using a 3-tap blur. 
@@ -364,17 +365,18 @@ vec3 tex2Dblur3resize(const sampler2D tex, const vec2 tex_uv, const float w1 = exp(-1.0 * denom_inv); const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); + float3 sum = float3(0.0,0.0,0.0); sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb; sum += w0 * tex2D_linearize(tex, tex_uv).rgb; sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb; return sum * weight_sum_inv; } + /////////////////////////// FAST SEPARABLE BLURS /////////////////////////// -vec3 tex2Dblur11fast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Requires: 1.) Global requirements must be met (see file description). // 2.) filter_linearN must = "true" in your .cgp file. @@ -401,7 +403,7 @@ vec3 tex2Dblur11fast(const sampler2D tex, const vec2 tex_uv, const float w23_ratio = w3/w23; const float w45_ratio = w5/w45; // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); + float3 sum = float3(0.0,0.0,0.0); sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb; sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; @@ -411,12 +413,12 @@ vec3 tex2Dblur11fast(const sampler2D tex, const vec2 tex_uv, return sum * weight_sum_inv; } -vec3 tex2Dblur17fast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Requires: Same as tex2Dblur11() - // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest - // neighbor and 8 linear taps. It may be mipmapped depending + // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest + // neighbor and 4 linear taps. 
It may be mipmapped depending // on settings and dxdy. // First get the texel weights and normalization factor as above. const float denom_inv = 0.5/(sigma*sigma); @@ -425,97 +427,27 @@ vec3 tex2Dblur17fast(const sampler2D tex, const vec2 tex_uv, const float w2 = exp(-4.0 * denom_inv); const float w3 = exp(-9.0 * denom_inv); const float w4 = exp(-16.0 * denom_inv); - const float w5 = exp(-25.0 * denom_inv); - const float w6 = exp(-36.0 * denom_inv); - const float w7 = exp(-49.0 * denom_inv); - const float w8 = exp(-64.0 * denom_inv); - //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( - // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); - const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); // Calculate combined weights and linear sample ratios between texel pairs. - const float w1_2 = w1 + w2; - const float w3_4 = w3 + w4; - const float w5_6 = w5 + w6; - const float w7_8 = w7 + w8; - const float w1_2_ratio = w2/w1_2; - const float w3_4_ratio = w4/w3_4; - const float w5_6_ratio = w6/w5_6; - const float w7_8_ratio = w8/w7_8; + const float w12 = w1 + w2; + const float w34 = w3 + w4; + const float w12_ratio = w2/w12; + const float w34_ratio = w4/w34; // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); - sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; - sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; - sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; - sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + float3 sum = float3(0.0,0.0,0.0); + sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; sum += w0 * tex2D_linearize(tex, tex_uv).rgb; - sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; - sum += w3_4 * tex2D_linearize(tex, 
tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; - sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; - sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; return sum * weight_sum_inv; } -vec3 tex2Dblur25fast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Requires: Same as tex2Dblur11() - // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest - // neighbor and 12 linear taps. It may be mipmapped depending - // on settings and dxdy. - // First get the texel weights and normalization factor as above. - const float denom_inv = 0.5/(sigma*sigma); - const float w0 = 1.0; - const float w1 = exp(-1.0 * denom_inv); - const float w2 = exp(-4.0 * denom_inv); - const float w3 = exp(-9.0 * denom_inv); - const float w4 = exp(-16.0 * denom_inv); - const float w5 = exp(-25.0 * denom_inv); - const float w6 = exp(-36.0 * denom_inv); - const float w7 = exp(-49.0 * denom_inv); - const float w8 = exp(-64.0 * denom_inv); - const float w9 = exp(-81.0 * denom_inv); - const float w10 = exp(-100.0 * denom_inv); - const float w11 = exp(-121.0 * denom_inv); - const float w12 = exp(-144.0 * denom_inv); - //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( - // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); - const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); - // Calculate combined weights and linear sample ratios between texel pairs. 
- const float w1_2 = w1 + w2; - const float w3_4 = w3 + w4; - const float w5_6 = w5 + w6; - const float w7_8 = w7 + w8; - const float w9_10 = w9 + w10; - const float w11_12 = w11 + w12; - const float w1_2_ratio = w2/w1_2; - const float w3_4_ratio = w4/w3_4; - const float w5_6_ratio = w6/w5_6; - const float w7_8_ratio = w8/w7_8; - const float w9_10_ratio = w10/w9_10; - const float w11_12_ratio = w12/w11_12; - // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); - sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; - sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; - sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; - sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; - sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; - sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; - sum += w0 * tex2D_linearize(tex, tex_uv).rgb; - sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; - sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; - sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; - sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; - sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; - sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; - return sum * weight_sum_inv; -} - -vec3 tex2Dblur31fast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) -{ - // Requires: Same as tex2Dblur11() - // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear + // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear // taps. It may be mipmapped depending on settings and dxdy. // First get the texel weights and normalization factor as above. 
const float denom_inv = 0.5/(sigma*sigma); @@ -523,63 +455,73 @@ vec3 tex2Dblur31fast(const sampler2D tex, const vec2 tex_uv, const float w1 = exp(-1.0 * denom_inv); const float w2 = exp(-4.0 * denom_inv); const float w3 = exp(-9.0 * denom_inv); - const float w4 = exp(-16.0 * denom_inv); - const float w5 = exp(-25.0 * denom_inv); - const float w6 = exp(-36.0 * denom_inv); - const float w7 = exp(-49.0 * denom_inv); - const float w8 = exp(-64.0 * denom_inv); - const float w9 = exp(-81.0 * denom_inv); - const float w10 = exp(-100.0 * denom_inv); - const float w11 = exp(-121.0 * denom_inv); - const float w12 = exp(-144.0 * denom_inv); - const float w13 = exp(-169.0 * denom_inv); - const float w14 = exp(-196.0 * denom_inv); - const float w15 = exp(-225.0 * denom_inv); - //const float weight_sum_inv = 1.0 / - // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + - // w9 + w10 + w11 + w12 + w13 + w14 + w15)); - const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); // Calculate combined weights and linear sample ratios between texel pairs. // The center texel (with weight w0) is used twice, so halve its weight. 
- const float w0_1 = w0 * 0.5 + w1; - const float w2_3 = w2 + w3; - const float w4_5 = w4 + w5; - const float w6_7 = w6 + w7; - const float w8_9 = w8 + w9; - const float w10_11 = w10 + w11; - const float w12_13 = w12 + w13; - const float w14_15 = w14 + w15; - const float w0_1_ratio = w1/w0_1; - const float w2_3_ratio = w3/w2_3; - const float w4_5_ratio = w5/w4_5; - const float w6_7_ratio = w7/w6_7; - const float w8_9_ratio = w9/w8_9; - const float w10_11_ratio = w11/w10_11; - const float w12_13_ratio = w13/w12_13; - const float w14_15_ratio = w15/w14_15; + const float w01 = w0 * 0.5 + w1; + const float w23 = w2 + w3; + const float w01_ratio = w1/w01; + const float w23_ratio = w3/w23; // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); - sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; - sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; - sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; - sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; - sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; - sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; - sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; - sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; - sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; - sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; - sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; - sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; - sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; - sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; - sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; - sum += w14_15 * 
tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; + float3 sum = float3(0.0,0.0,0.0); + sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; + sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; + sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; return sum * weight_sum_inv; } -vec3 tex2Dblur43fast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest + // neighbor and 2 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w12 = w1 + w2; + const float w12_ratio = w2/w12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear + // taps. It may be mipmapped depending on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); + // Calculate combined weights and linear sample ratios between texel pairs. + // The center texel (with weight w0) is used twice, so halve its weight. + const float w01 = w0 * 0.5 + w1; + const float w01_ratio = w1/w01; + // Weights for all samples are the same, so just average them: + return 0.5 * ( + tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + + tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); +} + + +//////////////////////////// HUGE SEPARABLE BLURS //////////////////////////// + +// Huge separable blurs come only in "fast" versions. +float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Requires: Same as tex2Dblur11() // Returns: A 1D 43x Gaussian blurred texture lookup using 22 linear @@ -637,7 +579,7 @@ vec3 tex2Dblur43fast(const sampler2D tex, const vec2 tex_uv, const float w18_19_ratio = w19/w18_19; const float w20_21_ratio = w21/w20_21; // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); + float3 sum = float3(0.0,0.0,0.0); sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb; sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb; sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb; @@ -663,56 +605,11 @@ vec3 tex2Dblur43fast(const sampler2D tex, const vec2 tex_uv, return sum * weight_sum_inv; } -vec3 tex2Dblur3fast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Requires: Same as tex2Dblur11() - // Returns: A 1D 3x Gaussian blurred texture lookup using 2 linear - // taps. It may be mipmapped depending on settings and dxdy. 
- // First get the texel weights and normalization factor as above. - const float denom_inv = 0.5/(sigma*sigma); - const float w0 = 1.0; - const float w1 = exp(-1.0 * denom_inv); - const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1); - // Calculate combined weights and linear sample ratios between texel pairs. - // The center texel (with weight w0) is used twice, so halve its weight. - const float w01 = w0 * 0.5 + w1; - const float w01_ratio = w1/w01; - // Weights for all samples are the same, so just average them: - return 0.5 * ( - tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb + - tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb); -} - -vec3 tex2Dblur5fast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) -{ - // Requires: Same as tex2Dblur11() - // Returns: A 1D 5x Gaussian blurred texture lookup using 1 nearest - // neighbor and 2 linear taps. It may be mipmapped depending - // on settings and dxdy. - // First get the texel weights and normalization factor as above. - const float denom_inv = 0.5/(sigma*sigma); - const float w0 = 1.0; - const float w1 = exp(-1.0 * denom_inv); - const float w2 = exp(-4.0 * denom_inv); - const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2)); - // Calculate combined weights and linear sample ratios between texel pairs. - const float w12 = w1 + w2; - const float w12_ratio = w2/w12; - // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); - sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; - sum += w0 * tex2D_linearize(tex, tex_uv).rgb; - sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; - return sum * weight_sum_inv; -} - -vec3 tex2Dblur7fast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) -{ - // Requires: Same as tex2Dblur11() - // Returns: A 1D 7x Gaussian blurred texture lookup using 4 linear + // Returns: A 1D 31x Gaussian blurred texture lookup using 16 linear // taps. 
It may be mipmapped depending on settings and dxdy. // First get the texel weights and normalization factor as above. const float denom_inv = 0.5/(sigma*sigma); @@ -720,26 +617,166 @@ vec3 tex2Dblur7fast(const sampler2D tex, const vec2 tex_uv, const float w1 = exp(-1.0 * denom_inv); const float w2 = exp(-4.0 * denom_inv); const float w3 = exp(-9.0 * denom_inv); - const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3)); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + const float w13 = exp(-169.0 * denom_inv); + const float w14 = exp(-196.0 * denom_inv); + const float w15 = exp(-225.0 * denom_inv); + //const float weight_sum_inv = 1.0 / + // (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + + // w9 + w10 + w11 + w12 + w13 + w14 + w15)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); // Calculate combined weights and linear sample ratios between texel pairs. // The center texel (with weight w0) is used twice, so halve its weight. 
- const float w01 = w0 * 0.5 + w1; - const float w23 = w2 + w3; - const float w01_ratio = w1/w01; - const float w23_ratio = w3/w23; + const float w0_1 = w0 * 0.5 + w1; + const float w2_3 = w2 + w3; + const float w4_5 = w4 + w5; + const float w6_7 = w6 + w7; + const float w8_9 = w8 + w9; + const float w10_11 = w10 + w11; + const float w12_13 = w12 + w13; + const float w14_15 = w14 + w15; + const float w0_1_ratio = w1/w0_1; + const float w2_3_ratio = w3/w2_3; + const float w4_5_ratio = w5/w4_5; + const float w6_7_ratio = w7/w6_7; + const float w8_9_ratio = w9/w8_9; + const float w10_11_ratio = w11/w10_11; + const float w12_13_ratio = w13/w12_13; + const float w14_15_ratio = w15/w14_15; // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); - sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb; - sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb; - sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb; - sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb; + float3 sum = float3(0.0,0.0,0.0); + sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb; + sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb; + sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb; + sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb; + sum += w6_7 * 
tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb; + sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb; + sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb; + sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb; + sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb; return sum * weight_sum_inv; } +float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 25x Gaussian blurred texture lookup using 1 nearest + // neighbor and 12 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. + const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + const float w9 = exp(-81.0 * denom_inv); + const float w10 = exp(-100.0 * denom_inv); + const float w11 = exp(-121.0 * denom_inv); + const float w12 = exp(-144.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. 
+ const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w9_10 = w9 + w10; + const float w11_12 = w11 + w12; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + const float w9_10_ratio = w10/w9_10; + const float w11_12_ratio = w12/w11_12; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb; + sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + +float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) +{ + // Requires: Same as tex2Dblur11() + // Returns: A 1D 17x Gaussian blurred texture lookup using 1 nearest + // neighbor and 8 linear taps. It may be mipmapped depending + // on settings and dxdy. + // First get the texel weights and normalization factor as above. 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0 = 1.0; + const float w1 = exp(-1.0 * denom_inv); + const float w2 = exp(-4.0 * denom_inv); + const float w3 = exp(-9.0 * denom_inv); + const float w4 = exp(-16.0 * denom_inv); + const float w5 = exp(-25.0 * denom_inv); + const float w6 = exp(-36.0 * denom_inv); + const float w7 = exp(-49.0 * denom_inv); + const float w8 = exp(-64.0 * denom_inv); + //const float weight_sum_inv = 1.0 / (w0 + 2.0 * ( + // w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8)); + const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma); + // Calculate combined weights and linear sample ratios between texel pairs. + const float w1_2 = w1 + w2; + const float w3_4 = w3 + w4; + const float w5_6 = w5 + w6; + const float w7_8 = w7 + w8; + const float w1_2_ratio = w2/w1_2; + const float w3_4_ratio = w4/w3_4; + const float w5_6_ratio = w6/w5_6; + const float w7_8_ratio = w8/w7_8; + // Statically normalize weights, sum weighted samples, and return: + float3 sum = float3(0.0,0.0,0.0); + sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w0 * tex2D_linearize(tex, tex_uv).rgb; + sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb; + sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb; + sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb; + sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb; + return sum * weight_sum_inv; +} + + //////////////////// ARBITRARILY RESIZABLE ONE-PASS BLURS //////////////////// -vec3 tex2Dblur3x3resize(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const 
float2 dxdy, const float sigma) { // Requires: Global requirements must be met (see file description). // Returns: A 3x3 Gaussian blurred mipmapped texture lookup of the @@ -751,71 +788,37 @@ vec3 tex2Dblur3x3resize(const sampler2D tex, const vec2 tex_uv, // Load each sample. We need all 3x3 samples. Quad-pixel communication // won't help either: This should perform like tex2Dblur5x5, but sharing a // 4x4 sample field would perform more like tex2Dblur8x8shared (worse). - const vec2 sample4_uv = tex_uv; - const vec2 dx = vec2(dxdy.x, 0.0); - const vec2 dy = vec2(0.0, dxdy.y); - const vec2 sample1_uv = sample4_uv - dy; - const vec2 sample7_uv = sample4_uv + dy; - const vec3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; - const vec3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; - const vec3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; - const vec3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; - const vec3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; - const vec3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; - const vec3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; - const vec3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; - const vec3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; + const float2 sample4_uv = tex_uv; + const float2 dx = float2(dxdy.x, 0.0); + const float2 dy = float2(0.0, dxdy.y); + const float2 sample1_uv = sample4_uv - dy; + const float2 sample7_uv = sample4_uv + dy; + const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb; + const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb; + const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb; + const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb; + const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb; + const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb; + const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb; + const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb; + const float3 
sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb; // Statically compute Gaussian sample weights: const float w4 = 1.0; - const float w1_3_5_7 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv); - const float w0_2_6_8 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); + const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8)); // Weight and sum the samples: - const vec3 sum = w4 * sample4 + + const float3 sum = w4 * sample4 + w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) + w0_2_6_8 * (sample0 + sample2 + sample6 + sample8); return sum * weight_sum_inv; } -// Resizable one-pass blurs: -vec3 tex2Dblur3x3resize(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur3x3resize(texture, tex_uv, dxdy, blur3_std_dev); -} -vec3 tex2Dblur9fast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) -{ - // Requires: Same as tex2Dblur11() - // Returns: A 1D 9x Gaussian blurred texture lookup using 1 nearest - // neighbor and 4 linear taps. It may be mipmapped depending - // on settings and dxdy. - // First get the texel weights and normalization factor as above. - const float denom_inv = 0.5/(sigma*sigma); - const float w0 = 1.0; - const float w1 = exp(-1.0 * denom_inv); - const float w2 = exp(-4.0 * denom_inv); - const float w3 = exp(-9.0 * denom_inv); - const float w4 = exp(-16.0 * denom_inv); - const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4)); - // Calculate combined weights and linear sample ratios between texel pairs. 
- const float w12 = w1 + w2; - const float w34 = w3 + w4; - const float w12_ratio = w2/w12; - const float w34_ratio = w4/w34; - // Statically normalize weights, sum weighted samples, and return: - vec3 sum = vec3(0.0); - sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb; - sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb; - sum += w0 * tex2D_linearize(tex, tex_uv).rgb; - sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb; - sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb; - return sum * weight_sum_inv; -} +//////////////////////////// FASTER ONE-PASS BLURS /////////////////////////// -vec3 tex2Dblur9x9(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Perform a 1-pass 9x9 blur with 5x5 bilinear samples. // Requires: Same as tex2Dblur9() @@ -867,12 +870,12 @@ vec3 tex2Dblur9x9(const sampler2D tex, const vec2 tex_uv, const float texel3to4ratio = w4off/(w3off + w4off); // Statically compute texel offsets from the fragment center to each // bilinear sample in the bottom-right quadrant, including x-axis-aligned: - const vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0); - const vec2 sample2R_texel_offset = vec2(3.0, 0.0) + vec2(texel3to4ratio, 0.0); - const vec2 sample3d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio); - const vec2 sample4d_texel_offset = vec2(3.0, 1.0) + vec2(texel3to4ratio, texel1to2ratio); - const vec2 sample5d_texel_offset = vec2(1.0, 3.0) + vec2(texel1to2ratio, texel3to4ratio); - const vec2 sample6d_texel_offset = vec2(3.0, 3.0) + vec2(texel3to4ratio, texel3to4ratio); + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0); + const float2 sample3d_texel_offset = 
float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); + const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio); + const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio); + const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio); // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: // Statically compute Gaussian texel weights for the bottom-right quadrant. @@ -881,16 +884,16 @@ vec3 tex2Dblur9x9(const sampler2D tex, const vec2 tex_uv, const float w1R2 = w2off; const float w2R1 = w3off; const float w2R2 = w4off; - const float w3d1 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); - const float w3d2_3d3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); - const float w3d4 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); - const float w4d1_5d1 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv); - const float w4d2_5d3 = exp(-LENGTH_SQ(vec2(4.0, 1.0)) * denom_inv); - const float w4d3_5d2 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv); - const float w4d4_5d4 = exp(-LENGTH_SQ(vec2(4.0, 2.0)) * denom_inv); - const float w6d1 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv); - const float w6d2_6d3 = exp(-LENGTH_SQ(vec2(4.0, 3.0)) * denom_inv); - const float w6d4 = exp(-LENGTH_SQ(vec2(4.0, 4.0)) * denom_inv); + const float w3d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w3d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv); + const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv); + const float w6d1 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); + const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv); + const float w6d4 = exp(-LENGTH_SQ(float2(4.0, 
4.0)) * denom_inv); // Statically add texel weights in each sample to get sample weights: const float w0 = 1.0; const float w1 = w1R1 + w1R2; @@ -905,42 +908,42 @@ vec3 tex2Dblur9x9(const sampler2D tex, const vec2 tex_uv, // LOAD TEXTURE SAMPLES: // Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry: - const vec2 mirror_x = vec2(-1.0, 1.0); - const vec2 mirror_y = vec2(1.0, -1.0); - const vec2 mirror_xy = vec2(-1.0, -1.0); - const vec2 dxdy_mirror_x = dxdy * mirror_x; - const vec2 dxdy_mirror_y = dxdy * mirror_y; - const vec2 dxdy_mirror_xy = dxdy * mirror_xy; + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; // Sampling order doesn't seem to affect performance, so just be clear: - const vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb; - const vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; - const vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; - const vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; - const vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; - const vec3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb; - const vec3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; - const vec3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; - const vec3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; - const vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; - const vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; - const vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * 
sample3d_texel_offset).rgb; - const vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; - const vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; - const vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; - const vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; - const vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; - const vec3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; - const vec3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; - const vec3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; - const vec3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; - const vec3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; - const vec3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; - const vec3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; - const vec3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb; + const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb; + const float3 
sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb; + const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb; + const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb; + const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb; + const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb; + const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb; + const float3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb; + const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb; // SUM WEIGHTED SAMPLES: // Statically normalize weights (so total = 1.0), and sum weighted samples. 
- vec3 sum = w0 * sample0C; + float3 sum = w0 * sample0C; sum += w1 * (sample1R + sample1D + sample1L + sample1U); sum += w2 * (sample2R + sample2D + sample2L + sample2U); sum += w3 * (sample3d + sample3c + sample3b + sample3a); @@ -950,8 +953,8 @@ vec3 tex2Dblur9x9(const sampler2D tex, const vec2 tex_uv, return sum * weight_sum_inv; } -vec3 tex2Dblur7x7(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Perform a 1-pass 7x7 blur with 5x5 bilinear samples. // Requires: Same as tex2Dblur9() @@ -987,24 +990,24 @@ vec3 tex2Dblur7x7(const sampler2D tex, const vec2 tex_uv, const float texel2to3ratio = w3off/(w2off + w3off); // Statically compute texel offsets from the fragment center to each // bilinear sample in the bottom-right quadrant, including axis-aligned: - const vec2 sample1d_texel_offset = vec2(texel0to1ratio, texel0to1ratio); - const vec2 sample2d_texel_offset = vec2(2.0, 0.0) + vec2(texel2to3ratio, texel0to1ratio); - const vec2 sample3d_texel_offset = vec2(0.0, 2.0) + vec2(texel0to1ratio, texel2to3ratio); - const vec2 sample4d_texel_offset = vec2(2.0, 2.0) + vec2(texel2to3ratio, texel2to3ratio); + const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio); + const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: // Statically compute Gaussian texel weights for the bottom-right quadrant. // Read underscores as "and." 
const float w1abcd = 1.0; - const float w1bd2_1cd3 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv); - const float w2bd1_3cd1 = exp(-LENGTH_SQ(vec2(2.0, 0.0)) * denom_inv); - const float w2bd2_3cd2 = exp(-LENGTH_SQ(vec2(3.0, 0.0)) * denom_inv); - const float w1d4 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); - const float w2d3_3d2 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); - const float w2d4_3d4 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv); - const float w4d1 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); - const float w4d2_4d3 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv); - const float w4d4 = exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv); + const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv); + const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv); + const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv); + const float w1d4 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d3_3d2 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4_3d4 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv); + const float w4d1 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); + const float w4d2_4d3 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv); + const float w4d4 = exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv); // Statically add texel weights in each sample to get sample weights. 
// Split weights for shared texels between samples sharing them: const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4; @@ -1016,32 +1019,32 @@ vec3 tex2Dblur7x7(const sampler2D tex, const vec2 tex_uv, // LOAD TEXTURE SAMPLES: // Load all 16 samples using symmetry: - const vec2 mirror_x = vec2(-1.0, 1.0); - const vec2 mirror_y = vec2(1.0, -1.0); - const vec2 mirror_xy = vec2(-1.0, -1.0); - const vec2 dxdy_mirror_x = dxdy * mirror_x; - const vec2 dxdy_mirror_y = dxdy * mirror_y; - const vec2 dxdy_mirror_xy = dxdy * mirror_xy; - const vec3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; - const vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; - const vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; - const vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; - const vec3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; - const vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; - const vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; - const vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; - const vec3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; - const vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; - const vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; - const vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; - const vec3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb; - const vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; - const vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; - const vec3 sample4d = 
tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb; + const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb; + const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb; + const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb; + const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb; + const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb; + const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb; + const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb; // SUM WEIGHTED SAMPLES: // Statically normalize weights (so total = 1.0), and sum 
weighted samples. - vec3 sum = vec3(0.0); + float3 sum = float3(0.0,0.0,0.0); sum += w1 * (sample1a + sample1b + sample1c + sample1d); sum += w2_3 * (sample2a + sample2b + sample2c + sample2d); sum += w2_3 * (sample3a + sample3b + sample3c + sample3d); @@ -1049,8 +1052,8 @@ vec3 tex2Dblur7x7(const sampler2D tex, const vec2 tex_uv, return sum * weight_sum_inv; } -vec3 tex2Dblur5x5(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Perform a 1-pass 5x5 blur with 3x3 bilinear samples. // Requires: Same as tex2Dblur9() @@ -1077,17 +1080,17 @@ vec3 tex2Dblur5x5(const sampler2D tex, const vec2 tex_uv, const float texel1to2ratio = w2off/(w1off + w2off); // Statically compute texel offsets from the fragment center to each // bilinear sample in the bottom-right quadrant, including x-axis-aligned: - const vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0); - const vec2 sample2d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio); + const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0); + const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio); // CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES: // Statically compute Gaussian texel weights for the bottom-right quadrant. // Read underscores as "and." 
const float w1R1 = w1off; const float w1R2 = w2off; - const float w2d1 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv); - const float w2d2_3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv); - const float w2d4 = exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv); + const float w2d1 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv); + const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv); + const float w2d4 = exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv); // Statically add texel weights in each sample to get sample weights: const float w0 = 1.0; const float w1 = w1R1 + w1R2; @@ -1097,32 +1100,32 @@ vec3 tex2Dblur5x5(const sampler2D tex, const vec2 tex_uv, // LOAD TEXTURE SAMPLES: // Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry: - const vec2 mirror_x = vec2(-1.0, 1.0); - const vec2 mirror_y = vec2(1.0, -1.0); - const vec2 mirror_xy = vec2(-1.0, -1.0); - const vec2 dxdy_mirror_x = dxdy * mirror_x; - const vec2 dxdy_mirror_y = dxdy * mirror_y; - const vec2 dxdy_mirror_xy = dxdy * mirror_xy; - const vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb; - const vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; - const vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; - const vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; - const vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; - const vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; - const vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; - const vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; - const vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = 
dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb; + const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb; + const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb; + const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb; + const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb; + const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb; + const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb; + const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb; // SUM WEIGHTED SAMPLES: // Statically normalize weights (so total = 1.0), and sum weighted samples. - vec3 sum = w0 * sample0C; + float3 sum = w0 * sample0C; sum += w1 * (sample1R + sample1D + sample1L + sample1U); sum += w2 * (sample2a + sample2b + sample2c + sample2d); return sum * weight_sum_inv; } -vec3 tex2Dblur3x3(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy, const float sigma) +float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy, const float sigma) { // Perform a 1-pass 3x3 blur with 5x5 bilinear samples. 
// Requires: Same as tex2Dblur9() @@ -1148,130 +1151,766 @@ vec3 tex2Dblur3x3(const sampler2D tex, const vec2 tex_uv, const float texel0to1ratio = w1off/(w0off * 0.5 + w1off); // Statically compute texel offsets from the fragment center to each // bilinear sample in the bottom-right quadrant, including axis-aligned: - const vec2 sample0d_texel_offset = vec2(texel0to1ratio, texel0to1ratio); + const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio); // LOAD TEXTURE SAMPLES: // Load all 4 samples using symmetry: - const vec2 mirror_x = vec2(-1.0, 1.0); - const vec2 mirror_y = vec2(1.0, -1.0); - const vec2 mirror_xy = vec2(-1.0, -1.0); - const vec2 dxdy_mirror_x = dxdy * mirror_x; - const vec2 dxdy_mirror_y = dxdy * mirror_y; - const vec2 dxdy_mirror_xy = dxdy * mirror_xy; - const vec3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; - const vec3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; - const vec3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; - const vec3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; + const float2 mirror_x = float2(-1.0, 1.0); + const float2 mirror_y = float2(1.0, -1.0); + const float2 mirror_xy = float2(-1.0, -1.0); + const float2 dxdy_mirror_x = dxdy * mirror_x; + const float2 dxdy_mirror_y = dxdy * mirror_y; + const float2 dxdy_mirror_xy = dxdy * mirror_xy; + const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb; + const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb; + const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb; + const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb; // SUM WEIGHTED SAMPLES: // Weights for all samples are the same, so just average them: return 0.25 * (sample0a + sample0b + sample0c + 
sample0d); } -vec3 tex2Dblur9fast(const sampler2D tex, const vec2 tex_uv, - const vec2 dxdy) + +////////////////// LINEAR ONE-PASS BLURS WITH SHARED SAMPLES ///////////////// + +float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: 1.) Same as tex2Dblur9() + // 2.) ddx() and ddy() are present in the current Cg profile. + // 3.) The GPU driver is using fine/high-quality derivatives. + // 4.) quad_vector *correctly* describes the current fragment's + // location in its pixel quad, by the conventions noted in + // get_quad_vector[_naive]. + // 5.) tex_uv.w = log2(IN.video_size/IN.output_size).y + // 6.) tex2Dlod() is present in the current Cg profile. + // Optional: Tune artifacts vs. excessive blurriness with the global + // float error_blurring. + // Returns: A blurred texture lookup using a "virtual" 12x12 Gaussian + // blur (a 6x6 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // Perform a 1-pass blur with shared texture lookups across a pixel quad. + // We'll get neighboring samples with high-quality ddx/ddy derivatives, as + // in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad + // Message Passing" by Eric Penner. + // + // Our "virtual" 12x12 blur will be comprised of ((6 - 1)^2)/4 + 3 = 12 + // bilinear samples, where bilinear sampling positions are computed from + // the relative Gaussian weights of the 4 surrounding texels. The catch is + // that the appropriate texel weights and sample coords differ for each + // fragment, but we're reusing most of the same samples across a quad of + // destination fragments. (We do use unique coords for the four nearest + // samples at each fragment.) 
Mixing bilinear filtering and sample-sharing + // therefore introduces some error into the weights, and this can get nasty + // when the source image is small or high-frequency. Computing bilinear + // ratios based on weights at the sample field center results in sharpening + // and ringing artifacts, but we can move samples closer to halfway between + // texels to try blurring away the error (which can move features around by + // a texel or so). Tune this with the global float "error_blurring". + // + // The pixel quad's sample field covers 12x12 texels, accessed through 6x6 + // bilinear (2x2 texel) taps. Each fragment depends on a window of 10x10 + // texels (5x5 bilinear taps), and each fragment is responsible for loading + // a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps + // to use unique bilinear coords for sample0* for each fragment. This + // diagram illustrates the relative locations of bilinear samples 1-9 for + // each quadrant a, b, c, d (note samples will not be equally spaced): + // 8a 7a 6a 6b 7b 8b + // 5a 4a 3a 3b 4b 5b + // 2a 1a 0a 0b 1b 2b + // 2c 1c 0c 0d 1d 2d + // 5c 4c 3c 3d 4d 5d + // 8c 7c 6c 6d 7d 8d + // The following diagram illustrates the underlying equally spaced texels, + // named after the sample that accesses them and subnamed by their location + // within their 2x2 texel block: + // 8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3 + // 8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1 + // 5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3 + // 5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1 + // 2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3 + // 2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1 + // 2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1 + // 2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3 + // 5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1 + // 5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3 + // 8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1 + // 8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3 + 
// With this symmetric arrangement, we don't have to know which absolute + // quadrant a sample lies in to assign kernel weights; it's enough to know + // the sample number and the relative quadrant of the sample (relative to + // the current quadrant): + // {current, adjacent x, adjacent y, diagonal} + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute sampling offsets within each 2x2 texel block, based + // on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3], + // and [4, 5] away from the fragment, and reuse them independently for both + // dimensions. Use the sample field center as the estimated destination, + // but nudge the result closer to halfway between texels to blur error. + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = 
float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // based on the sum of their 4 underlying texel weights. Assume a same- + // resolution blur, so each symmetrically named sample weight will compute + // the same at every fragment in the pixel quad: We can therefore compute + // texel weights based only on the bottom-right quadrant (fragment at 0d0). 
+ // Too avoid too much boilerplate code, use a macro to get all 4 texel + // weights for a bilinear sample based on the offset of its top-left texel: + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0); + const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0); + const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0); + const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0); + const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0); + const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0); + const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0); + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = 
GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag); + const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag); + const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag); + const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag); + // Get the weight sum inverse (normalization factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * 
sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + float3 sample8adjx, sample8adjy, sample8diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, 
sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag)); + sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag)); + sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag)); + sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 10x10 Gaussian + // blur (a 5x5 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). 
This + // function shares the same concept and sample placement, but each fragment + // only uses 25 of the 36 samples taken across the pixel quad (to cover a + // 5x5 sample area, or 10x10 texel area), and it uses a lower standard + // deviation to compensate. Thanks to symmetry, the 11 omitted samples + // are always the "same:" + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float w4_5off = exp(-(4.5*4.5) * denom_inv); + const float w5_5off = exp(-(5.5*5.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, 
-texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio); + const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio); + const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio); + const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio); + const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 25 of the 36 sample weights. 
Skip the following weights: + // 8adjx, 2adjx, 5adjx, + // 6adjy, 7adjy, 8adjy, + // 2diag, 5diag, 6diag, 7diag, 8diag + const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0); + const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0); + const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0); + const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0); + const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0); + const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0); + const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w4curr + w5curr + w6curr + w7curr + w8curr + + w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx + + w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy + + w0diag + w1diag + w3diag + w4diag); + // Statically pack most weights for runtime. 
Note the mixed packing: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag); + const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy); + const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb; + const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb; + const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb; + const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb; + const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb; + + // GATHER NEIGHBORING 
SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad in order of need: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + float3 sample4adjx, sample4adjy, sample4diag; + float3 sample5adjx, sample5adjy, sample5diag; + float3 sample6adjx, sample6adjy, sample6diag; + float3 sample7adjx, sample7adjy, sample7diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag); + quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag); + quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag); + quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result. 
First do the simple ones: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag)); + // Now do the mixed-sample ones (w2and5, w6and7), then the lone w8curr term: + sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy)); + sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx)); + sum += w8curr * sample8curr; + // Normalize the sum (so the weights add to 1.0) and return: + return sum * weight_sum_inv; +} + +float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 8x8 Gaussian + // blur (a 4x4 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur12x12shared(). This function + // shares the same concept and a similar sample placement, except each + // quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3 + // respectively. There could be a total of 16 samples, 4 of which each + // fragment is responsible for, but each fragment loads 0a/0b/0c/0d with + // its own offset to reduce shared sample artifacts, bringing the sample + // count for each fragment to 7. 
Sample placement: + // 3a 2a 2b 3b + // 1a 0a 0b 1b + // 1c 0c 0d 1d + // 3c 2c 2d 3d + // Texel placement: + // 3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3 + // 3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1 + // 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 + // 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 + // 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 + // 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 + // 3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1 + // 3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3 + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). + const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + 
float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). + #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0); + const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0); + const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0); + const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0); + const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0); + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Statically pack weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag); + const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag); + const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag); + // Get the weight sum inverse (normalization 
factor): + const float4 weight_sum4 = w0 + w1 + w2 + w3; + const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw; + const float weight_sum = weight_sum2.x + weight_sum2.y; + const float weight_sum_inv = 1.0/(weight_sum); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + float3 sample3adjx, sample3adjy, sample3diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. 
+ // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag)); + sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag)); + sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag)); + return sum * weight_sum_inv; +} + +float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector, + const float sigma) +{ + // Perform a 1-pass mipmapped blur with shared samples across a pixel quad. + // Requires: Same as tex2Dblur12x12shared() + // Returns: A blurred texture lookup using a "virtual" 6x6 Gaussian + // blur (a 3x3 blur of carefully selected bilinear samples) + // of the given mip level. There will be subtle inaccuracies, + // especially for small or high-frequency detailed sources. + // Description: + // First see the description for tex2Dblur8x8shared(). This + // function shares the same concept and sample placement, but each fragment + // only uses 9 of the 16 samples taken across the pixel quad (to cover a + // 3x3 sample area, or 6x6 texel area), and it uses a lower standard + // deviation to compensate. Thanks to symmetry, the 7 omitted samples + // are always the "same:" + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + + // COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared). 
+ const float denom_inv = 0.5/(sigma*sigma); + const float w0off = 1.0; + const float w0_5off = exp(-(0.5*0.5) * denom_inv); + const float w1off = exp(-(1.0*1.0) * denom_inv); + const float w1_5off = exp(-(1.5*1.5) * denom_inv); + const float w2off = exp(-(2.0*2.0) * denom_inv); + const float w2_5off = exp(-(2.5*2.5) * denom_inv); + const float w3_5off = exp(-(3.5*3.5) * denom_inv); + const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring); + const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring); + // We don't share sample0*, so use the nearest destination fragment: + const float texel0to1ratio_nearest = w1off/(w0off + w1off); + const float texel1to2ratio_nearest = w2off/(w1off + w2off); + // Statically compute texel offsets from the bottom-right fragment to each + // bilinear sample in the bottom-right quadrant: + const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest); + const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest); + const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest); + const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio); + const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio); + const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio); + + // CALCULATE KERNEL WEIGHTS: + // Statically compute bilinear sample weights at each destination fragment + // from the sum of their 4 texel weights (details in tex2Dblur12x12shared). 
+ #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \ + (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \ + exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv)) + // We only need 9 of the 16 sample weights. Skip the following weights: + // 1adjx, 3adjx + // 2adjy, 3adjy + // 1diag, 2diag, 3diag + const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0); + const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0); + const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0); + const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0); + const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0); + const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0); + const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0); + const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0); + const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0); + #undef GET_TEXEL_QUAD_WEIGHTS + // Get the weight sum inverse (normalization factor): + const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr + + w0adjx + w2adjx + w0adjy + w1adjy + w0diag); + // Statically pack some weights for runtime: + const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag); + + // LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR: + // Get a uv vector from texel 0q0 of this quadrant to texel 0q3: + const float2 dxdy_curr = dxdy * quad_vector.xy; + // Load bilinear samples for the current quadrant (for this fragment): + const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb; + const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb; + const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb; + const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb; + const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + 
uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb; + const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb; + const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb; + + // GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES: + // Fetch the samples from other fragments in the 2x2 quad: + float3 sample1adjx, sample1adjy, sample1diag; + float3 sample2adjx, sample2adjy, sample2diag; + quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag); + quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag); + // Statically normalize weights (so total = 1.0), and sum weighted samples. + // Fill each row of a matrix with an rgb sample and pre-multiply by the + // weights to obtain a weighted result for sample1*, and handle the rest + // of the weights more directly/verbosely: + float3 sum = float3(0.0,0.0,0.0); + sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag)); + sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr + + w2adjx * sample2adjx + w3curr * sample3curr; + return sum * weight_sum_inv; +} + + +/////////////////////// MAX OPTIMAL SIGMA BLUR WRAPPERS ////////////////////// + +// The following blurs are static wrappers around the dynamic blurs above. +// HOPEFULLY, the compiler will be smart enough to do constant-folding. 
+ +// Resizable separable blurs: +inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// Fast separable blurs: +inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev); +} +inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) { return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev); } - -vec3 tex2Dblur17fast(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) +inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) { - return tex2Dblur17fast(texture, tex_uv, dxdy, blur17_std_dev); + return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev); +} +// Huge, "fast" separable blurs: +inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur43fast(tex, tex_uv, dxdy, 
blur43_std_dev); +} +inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev); +} +inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev); +} +inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev); +} +// Resizable one-pass blurs: +inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" one-pass blurs: +inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev); +} +inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev); +} +inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev); +} +inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv, + const float2 dxdy) +{ + return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev); +} +// "Fast" shared-sample one-pass blurs: +inline float3 tex2Dblur12x12shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev); +} +inline float3 tex2Dblur10x10shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev); +} +inline float3 tex2Dblur8x8shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, 
blur8_std_dev); +} +inline float3 tex2Dblur6x6shared(const sampler2D tex, + const float4 tex_uv, const float2 dxdy, const float4 quad_vector) +{ + return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev); } -vec3 tex2Dblur25fast(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur25fast(texture, tex_uv, dxdy, blur25_std_dev); -} -vec3 tex2Dblur43fast(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur43fast(texture, tex_uv, dxdy, blur43_std_dev); -} -vec3 tex2Dblur31fast(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur31fast(texture, tex_uv, dxdy, blur31_std_dev); -} +#endif // BLUR_FUNCTIONS_H -vec3 tex2Dblur3fast(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur3fast(texture, tex_uv, dxdy, blur3_std_dev); -} - -vec3 tex2Dblur3x3(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur3x3(texture, tex_uv, dxdy, blur3_std_dev); -} - -vec3 tex2Dblur5fast(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur5fast(texture, tex_uv, dxdy, blur5_std_dev); -} - -vec3 tex2Dblur5resize(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur5resize(texture, tex_uv, dxdy, blur5_std_dev); -} -vec3 tex2Dblur3resize(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur3resize(texture, tex_uv, dxdy, blur3_std_dev); -} - -vec3 tex2Dblur5x5(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur5x5(texture, tex_uv, dxdy, blur5_std_dev); -} - -vec3 tex2Dblur7resize(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur7resize(texture, tex_uv, dxdy, blur7_std_dev); -} - -vec3 tex2Dblur7fast(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur7fast(texture, tex_uv, dxdy, blur7_std_dev); -} - -vec3 tex2Dblur7x7(const sampler2D 
texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur7x7(texture, tex_uv, dxdy, blur7_std_dev); -} - -vec3 tex2Dblur9resize(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur9resize(texture, tex_uv, dxdy, blur9_std_dev); -} - -vec3 tex2Dblur9x9(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur9x9(texture, tex_uv, dxdy, blur9_std_dev); -} - -vec3 tex2Dblur11resize(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur11resize(texture, tex_uv, dxdy, blur11_std_dev); -} - -vec3 tex2Dblur11fast(const sampler2D texture, const vec2 tex_uv, - const vec2 dxdy) -{ - return tex2Dblur11fast(texture, tex_uv, dxdy, blur11_std_dev); -} - -#endif // BLUR_FUNCTIONS_H \ No newline at end of file diff --git a/include/compat_macros.inc b/include/compat_macros.inc new file mode 100644 index 0000000..fd9dba7 --- /dev/null +++ b/include/compat_macros.inc @@ -0,0 +1,28 @@ +// compatibility macros for transparently converting HLSLisms into GLSLisms (mul(a,b) expands to (b)*(a) with both operands parenthesized; atan2(y,x) forwards to atan(y,x) in the same argument order) +#define mul(a,b) ((b)*(a)) +#define lerp(a,b,c) mix(a,b,c) +#define saturate(c) clamp(c, 0.0, 1.0) +#define frac(x) (fract(x)) +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#define float2x2 mat2x2 +#define float3x3 mat3x3 +#define float4x4 mat4x4 +#define float4x3 mat4x3 +#define float2x4 mat2x4 +#define IN params +#define texture_size SourceSize.xy +#define video_size SourceSize.xy +#define output_size OutputSize.xy +#define frame_count FrameCount +#define static +#define inline +#define fmod(x,y) mod(x,y) +#define ddx(c) dFdx(c) +#define ddy(c) dFdy(c) +#define atan2(y,x) atan(y,x) +#define rsqrt(c) inversesqrt(c) \ No newline at end of file diff --git a/include/gamma-management.h b/include/gamma-management.h index a89bc2a..424290a 100644 --- a/include/gamma-management.h +++ b/include/gamma-management.h @@ -1,13 +1,138 @@ #ifndef GAMMA_MANAGEMENT_H
#define GAMMA_MANAGEMENT_H +///////////////////////////////// MIT LICENSE //////////////////////////////// + +// Copyright (C) 2014 TroggleMonkey +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +// IN THE SOFTWARE. + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides gamma-aware tex*D*() and encode_output() functions. +// Requires: Before #include-ing this file, the including file must #define +// the following macros when applicable and follow their rules: +// 1.) #define FIRST_PASS if this is the first pass. +// 2.) #define LAST_PASS if this is the last pass. +// 3.) If sRGB is available, set srgb_framebufferN = "true" for +// every pass except the last in your .cgp preset. +// 4.) If sRGB isn't available but you want gamma-correctness with +// no banding, #define GAMMA_ENCODE_EVERY_FBO each pass. +// 5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8) +// 6.) 
#define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8) +// 7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8) +// 8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -) +// If an option in [5, 8] is #defined in the first or last pass, it +// should be #defined for both. It shouldn't make a difference +// whether it's #defined for intermediate passes or not. +// Optional: The including file (or an earlier included file) may optionally +// #define a number of macros indicating it will override certain +// static constants with either static or uniform constants. The +// macros and associated constants are as follows: +// 1.) OVERRIDE_STANDARD_GAMMA: The user must first define: +// static const float ntsc_gamma +// static const float pal_gamma +// static const float crt_reference_gamma_high +// static const float crt_reference_gamma_low +// static const float lcd_reference_gamma +// static const float crt_office_gamma +// static const float lcd_office_gamma +// 2.) OVERRIDE_DEVICE_GAMMA: The user must first define: +// static const float crt_gamma +// static const float gba_gamma +// static const float lcd_gamma +// 3.) OVERRIDE_FINAL_GAMMA: The user must first define: +// static const float input_gamma +// static const float intermediate_gamma +// static const float output_gamma +// (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.) +// 4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define: +// static const bool assume_opaque_alpha +// The gamma constant overrides must be used in every pass or none, +// and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros. +// OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis. +// Usage: After setting macros appropriately, ignore gamma correction and +// replace all tex*D*() calls with equivalent gamma-aware +// tex*D*_linearize calls, except: +// 1.) 
When you read an LUT, use regular tex*D or a gamma-specified +// function, depending on its gamma encoding: +// tex*D*_linearize_gamma (takes a runtime gamma parameter) +// 2.) If you must read pass0's original input in a later pass, use +// tex2D_linearize_ntsc_gamma. If you want to read pass0's +// input with gamma-corrected bilinear filtering, consider +// creating a first linearizing pass and reading from the input +// of pass1 later. +// Then, return encode_output(color) from every fragment shader. +// Finally, use the global gamma_aware_bilinear boolean if you want +// to statically branch based on whether bilinear filtering is +// gamma-correct or not (e.g. for placing Gaussian blur samples). +// +// Detailed Policy: +// tex*D*_linearize() functions enforce a consistent gamma-management policy +// based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings. They assume +// their input texture has the same encoding characteristics as the input for +// the current pass (which doesn't apply to the exceptions listed above). +// Similarly, encode_output() enforces a policy based on the LAST_PASS and +// GAMMA_ENCODE_EVERY_FBO settings. Together, they result in one of the +// following two pipelines. 
+// Typical pipeline with intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = linear_color; // Automatic sRGB encoding +// linear_color = intermediate_output; // Automatic sRGB decoding +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Typical pipeline without intermediate sRGB framebuffers: +// linear_color = pow(pass0_encoded_color, input_gamma); +// intermediate_output = pow(linear_color, 1.0/intermediate_gamma); +// linear_color = pow(intermediate_output, intermediate_gamma); +// final_output = pow(intermediate_output, 1.0/output_gamma); +// Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to +// easily get gamma-correctness without banding on devices where sRGB isn't +// supported. +// +// Use This Header to Maximize Code Reuse: +// The purpose of this header is to provide a consistent interface for texture +// reads and output gamma-encoding that localizes and abstracts away all the +// annoying details. This greatly reduces the amount of code in each shader +// pass that depends on the pass number in the .cgp preset or whether sRGB +// FBO's are being used: You can trivially change the gamma behavior of your +// whole pass by commenting or uncommenting 1-3 #defines. To reuse the same +// code in your first, Nth, and last passes, you can even put it all in another +// header file and #include it from skeleton .cg files that #define the +// appropriate pass-specific settings. +// +// Rationale for Using Three Macros: +// This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like +// SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes +// a lower maintenance burden on each pass. At first glance it seems we could +// accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT. 
+// This works for simple use cases where input_gamma == output_gamma, but it +// breaks down for more complex scenarios like CRT simulation, where the pass +// number determines the gamma encoding of the input and output. + + /////////////////////////////// BASE CONSTANTS /////////////////////////////// // Set standard gamma constants, but allow users to override them: #ifndef OVERRIDE_STANDARD_GAMMA // Standard encoding gammas: - const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? - const float pal_gamma = 2.8; // Never actually 2.8 in practice + static const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too? + static const float pal_gamma = 2.8; // Never actually 2.8 in practice // Typical device decoding gammas (only use for emulating devices): // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard // gammas: The standards purposely undercorrected for an analog CRT's @@ -19,17 +144,17 @@ // (they struggle near black with 2.5 gamma anyway), especially PC/laptop // displays designed to view sRGB in bright environments. (Standards are // also in flux again with BT.1886, but it's underspecified for displays.) - const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) - const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) - const float lcd_reference_gamma = 2.5; // To match CRT - const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC - const float lcd_office_gamma = 2.2; // Approximates sRGB + static const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + static const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + static const float lcd_reference_gamma = 2.5; // To match CRT + static const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + static const float lcd_office_gamma = 2.2; // Approximates sRGB #endif // OVERRIDE_STANDARD_GAMMA // Assuming alpha == 1.0 might make it easier for users to avoid some bugs, // but only if they're aware of it. 
#ifndef OVERRIDE_ALPHA_ASSUMPTIONS - bool assume_opaque_alpha = false; + static const bool assume_opaque_alpha = false; #endif @@ -43,90 +168,99 @@ // Set device gamma constants, but allow users to override them: #ifdef OVERRIDE_DEVICE_GAMMA // The user promises to globally define the appropriate constants: - float get_crt_gamma() { return crt_gamma; } - float get_gba_gamma() { return gba_gamma; } - float get_lcd_gamma() { return lcd_gamma; } + inline float get_crt_gamma() { return crt_gamma; } + inline float get_gba_gamma() { return gba_gamma; } + inline float get_lcd_gamma() { return lcd_gamma; } #else - float get_crt_gamma() { return crt_reference_gamma_high; } - float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) - float get_lcd_gamma() { return lcd_office_gamma; } + inline float get_crt_gamma() { return crt_reference_gamma_high; } + inline float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0) + inline float get_lcd_gamma() { return lcd_office_gamma; } #endif // OVERRIDE_DEVICE_GAMMA // Set decoding/encoding gammas for the first/lass passes, but allow overrides: #ifdef OVERRIDE_FINAL_GAMMA // The user promises to globally define the appropriate constants: - float get_intermediate_gamma() { return intermediate_gamma; } - float get_input_gamma() { return input_gamma; } - float get_output_gamma() { return output_gamma; } + inline float get_intermediate_gamma() { return intermediate_gamma; } + inline float get_input_gamma() { return input_gamma; } + inline float get_output_gamma() { return output_gamma; } #else // If we gamma-correct every pass, always use ntsc_gamma between passes to // ensure middle passes don't need to care if anything is being simulated: - float get_intermediate_gamma() { return ntsc_gamma; } + inline float get_intermediate_gamma() { return ntsc_gamma; } #ifdef SIMULATE_CRT_ON_LCD - float get_input_gamma() { return get_crt_gamma(); } - float get_output_gamma() { return get_lcd_gamma(); } + inline float 
get_input_gamma() { return get_crt_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } #else #ifdef SIMULATE_GBA_ON_LCD - float get_input_gamma() { return get_gba_gamma(); } - float get_output_gamma() { return get_lcd_gamma(); } + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_lcd_gamma(); } #else #ifdef SIMULATE_LCD_ON_CRT - float get_input_gamma() { return get_lcd_gamma(); } - float get_output_gamma() { return get_crt_gamma(); } + inline float get_input_gamma() { return get_lcd_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } #else #ifdef SIMULATE_GBA_ON_CRT - float get_input_gamma() { return get_gba_gamma(); } - float get_output_gamma() { return get_crt_gamma(); } + inline float get_input_gamma() { return get_gba_gamma(); } + inline float get_output_gamma() { return get_crt_gamma(); } #else // Don't simulate anything: - float get_input_gamma() { return ntsc_gamma; } - float get_output_gamma() { return ntsc_gamma; } + inline float get_input_gamma() { return ntsc_gamma; } + inline float get_output_gamma() { return ntsc_gamma; } #endif // SIMULATE_GBA_ON_CRT #endif // SIMULATE_LCD_ON_CRT #endif // SIMULATE_GBA_ON_LCD #endif // SIMULATE_CRT_ON_LCD #endif // OVERRIDE_FINAL_GAMMA +// Set decoding/encoding gammas for the current pass. Use static constants for +// linearize_input and gamma_encode_output, because they aren't derived, and +// they let the compiler do dead-code elimination. 
#ifndef GAMMA_ENCODE_EVERY_FBO #ifdef FIRST_PASS - bool linearize_input = true; - float get_pass_input_gamma() { return get_input_gamma(); } + static const bool linearize_input = true; + inline float get_pass_input_gamma() { return get_input_gamma(); } #else - bool linearize_input = false; - float get_pass_input_gamma() { return 1.0; } + static const bool linearize_input = false; + inline float get_pass_input_gamma() { return 1.0; } #endif #ifdef LAST_PASS - bool gamma_encode_output = true; - float get_pass_output_gamma() { return get_output_gamma(); } + static const bool gamma_encode_output = true; + inline float get_pass_output_gamma() { return get_output_gamma(); } #else - bool gamma_encode_output = false; - float get_pass_output_gamma() { return 1.0; } + static const bool gamma_encode_output = false; + inline float get_pass_output_gamma() { return 1.0; } #endif #else - bool linearize_input = true; - bool gamma_encode_output = true; + static const bool linearize_input = true; + static const bool gamma_encode_output = true; #ifdef FIRST_PASS - float get_pass_input_gamma() { return get_input_gamma(); } + inline float get_pass_input_gamma() { return get_input_gamma(); } #else - float get_pass_input_gamma() { return get_intermediate_gamma(); } + inline float get_pass_input_gamma() { return get_intermediate_gamma(); } #endif #ifdef LAST_PASS - float get_pass_output_gamma() { return get_output_gamma(); } + inline float get_pass_output_gamma() { return get_output_gamma(); } #else - float get_pass_output_gamma() { return get_intermediate_gamma(); } + inline float get_pass_output_gamma() { return get_intermediate_gamma(); } #endif #endif -vec4 decode_input(const vec4 color) +// Users might want to know if bilinear filtering will be gamma-correct: +static const bool gamma_aware_bilinear = !linearize_input; + + +////////////////////// COLOR ENCODING/DECODING FUNCTIONS ///////////////////// + +inline float4 encode_output(const float4 color) { - if(linearize_input = true) + 
if(gamma_encode_output) { - if(assume_opaque_alpha = true) + if(assume_opaque_alpha) { - return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), 1.0); + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0); } else { - return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), color.a); + return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a); } } else @@ -135,17 +269,17 @@ vec4 decode_input(const vec4 color) } } -vec4 encode_output(const vec4 color) +inline float4 decode_input(const float4 color) { - if(gamma_encode_output = true) + if(linearize_input) { - if(assume_opaque_alpha = true) + if(assume_opaque_alpha) { - return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), 1.0); + return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0); } else { - return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), color.a); + return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a); } } else @@ -154,12 +288,259 @@ vec4 encode_output(const vec4 color) } } +inline float4 decode_gamma_input(const float4 color, const float3 gamma) +{ + if(assume_opaque_alpha) + { + return float4(pow(color.rgb, gamma), 1.0); + } + else + { + return float4(pow(color.rgb, gamma), color.a); + } +} + +//TODO/FIXME: I have no idea why replacing the lookup wrappers with this macro fixes the blurs being offset ¯\_(ツ)_/¯ #define tex2D_linearize(C, D) decode_input(vec4(texture(C, D))) -//vec4 tex2D_linearize(const sampler2D tex, const vec2 tex_coords) -//{ return decode_input(vec4(texture(tex, tex_coords))); } -//#define tex2D_linearize(C, D, E) decode_input(vec4(texture(C, D, E))) -//vec4 tex2D_linearize(const sampler2D tex, const vec2 tex_coords, const int texel_off) -//{ return decode_input(vec4(texture(tex, tex_coords, texel_off))); } +/////////////////////////// TEXTURE LOOKUP WRAPPERS ////////////////////////// + +// "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a wide array of linearizing texture lookup wrapper 
functions. The +// Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D +// lookups are provided for completeness in case that changes someday. Nobody +// is likely to use the *fetch and *proj functions, but they're included just +// in case. The only tex*D texture sampling functions omitted are: +// - tex*Dcmpbias +// - tex*Dcmplod +// - tex*DARRAY* +// - tex*DMS* +// - Variants returning integers +// Standard line length restrictions are ignored below for vertical brevity. +/* +// tex1D: +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1D(tex, tex_coords)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy) +{ return decode_input(tex1D(tex, tex_coords, dx, dy)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off) +{ return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off)); } + +// tex1Dbias: +inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dbias(tex, tex_coords)); } + +inline float4 
tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dbias(tex, tex_coords, texel_off)); } + +// tex1Dfetch: +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords) +{ return decode_input(tex1Dfetch(tex, tex_coords)); } + +inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex1Dfetch(tex, tex_coords, texel_off)); } + +// tex1Dlod: +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords) +{ return decode_input(tex1Dlod(tex, tex_coords)); } + +inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex1Dlod(tex, tex_coords, texel_off)); } + +// tex1Dproj: +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords) +{ return decode_input(tex1Dproj(tex, tex_coords)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex1Dproj(tex, tex_coords, texel_off)); } +/////////* +// tex2D: +inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords) +{ return decode_input(texture(tex, tex_coords)); } + +inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(texture(tex, tex_coords.xy)); } + +inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +{ return decode_input(textureLod(tex, tex_coords, texel_off)); } + +inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return 
decode_input(textureLod(tex, tex_coords.xy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(texture(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy) +//{ return decode_input(texture(tex, tex_coords, dx, dy)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(texture(tex, tex_coords, dx, dy, texel_off)); } + +//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off) +//{ return decode_input(texture(tex, tex_coords, dx, dy, texel_off)); } + +// tex2Dbias: +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords) +//{ return decode_input(tex2Dbias(tex, tex_coords)); } + +//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dbias(tex, tex_coords, texel_off)); } + +// tex2Dfetch: +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords) +//{ return decode_input(tex2Dfetch(tex, tex_coords)); } + +//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off) +//{ return decode_input(tex2Dfetch(tex, tex_coords, texel_off)); } +*/ +// tex2Dlod: +inline float4 tex2Dlod_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(textureLod(tex, tex_coords.xy, 0.0)); } + +inline float4 tex2Dlod_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(textureLod(tex, tex_coords.xy, texel_off)); } +/* +// tex2Dproj: +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 
tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords) +{ return decode_input(tex2Dproj(tex, tex_coords)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } + +inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex2Dproj(tex, tex_coords, texel_off)); } +*/ +/* +// tex3D: +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords) +{ return decode_input(tex3D(tex, tex_coords)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, texel_off)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy) +{ return decode_input(tex3D(tex, tex_coords, dx, dy)); } + +inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off) +{ return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off)); } + +// tex3Dbias: +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dbias(tex, tex_coords)); } + +inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dbias(tex, tex_coords, texel_off)); } + +// tex3Dfetch: +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords) +{ return decode_input(tex3Dfetch(tex, tex_coords)); } + +inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off) +{ return decode_input(tex3Dfetch(tex, tex_coords, texel_off)); } + +// tex3Dlod: +inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dlod(tex, tex_coords)); } + +inline float4 tex3Dlod_linearize(const sampler3D tex, const 
float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dlod(tex, tex_coords, texel_off)); } + +// tex3Dproj: +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords) +{ return decode_input(tex3Dproj(tex, tex_coords)); } + +inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off) +{ return decode_input(tex3Dproj(tex, tex_coords, texel_off)); } +/////////* + +// NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// This narrow selection of nonstandard tex2D* functions can be useful: + +// tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0. +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0))); } + +//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off) +//{ return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off)); } + + +// MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS: +// Provide a narrower selection of tex2D* wrapper functions that decode an +// input sample with a specified gamma value. These are useful for reading +// LUT's and for reading the input of pass0 in a later pass. 
+ +// tex2D: +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma) +{ return decode_gamma_input(texture(tex, tex_coords), gamma); } + +inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma) +{ return decode_gamma_input(texture(tex, tex_coords.xy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(texture(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(texture(tex, tex_coords, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(texture(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma) +//{ return decode_gamma_input(texture(tex, tex_coords, dx, dy), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(texture(tex, tex_coords, dx, dy, texel_off), gamma); } + +//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma) +//{ return decode_gamma_input(texture(tex, tex_coords, dx, dy, texel_off), gamma); } +/* +// tex2Dbias: +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma); } + +inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, 
const float3 gamma) +{ return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma); } + +// tex2Dfetch: +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma); } + +inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma); } +/////////* +// tex2Dlod: +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma); } + +inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma) +{ return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma); } +*/ + +#endif // GAMMA_MANAGEMENT_H -#endif // GAMMA_MANAGEMENT_H \ No newline at end of file diff --git a/include/quad-pixel-communication.h b/include/quad-pixel-communication.h index 4c3f1cb..c8ffca4 100644 --- a/include/quad-pixel-communication.h +++ b/include/quad-pixel-communication.h @@ -47,7 +47,7 @@ ///////////////////// QUAD-PIXEL COMMUNICATION PRIMITIVES //////////////////// -vec4 get_quad_vector_naive(const vec4 output_pixel_num_wrt_uvxy) +float4 get_quad_vector_naive(const float4 output_pixel_num_wrt_uvxy) { // Requires: Two measures of the current fragment's output pixel number // in the range ([0, IN.output_size.x), [0, IN.output_size.y)): @@ -62,33 +62,33 @@ vec4 get_quad_vector_naive(const vec4 output_pixel_num_wrt_uvxy) // 2.) The .zw components are its 2x2 placement with respect to // screen xy direction (IN.position); the origin varies. // quad_gather needs this measure to work correctly. 
- // Note: quad_vector.zw = quad_vector.xy * vec2( + // Note: quad_vector.zw = quad_vector.xy * float2( // ddx(output_pixel_num_wrt_uvxy.x), // ddy(output_pixel_num_wrt_uvxy.y)); // Caveats: This function assumes the GPU driver always starts 2x2 pixel // quads at even pixel numbers. This assumption can be wrong // for odd output resolutions (nondeterministically so). - const vec4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0; - const vec4 quad_vector = pixel_odd * 2.0 - vec4(1.0); + const float4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0; + const float4 quad_vector = pixel_odd * 2.0 - float4(1.0); return quad_vector; } -vec4 get_quad_vector(const vec4 output_pixel_num_wrt_uvxy) +float4 get_quad_vector(const float4 output_pixel_num_wrt_uvxy) { // Requires: Same as get_quad_vector_naive() (see that first). // Returns: Same as get_quad_vector_naive() (see that first), but it's // correct even if the 2x2 pixel quad starts at an odd pixel, // which can occur at odd resolutions. - const vec4 quad_vector_guess = + const float4 quad_vector_guess = get_quad_vector_naive(output_pixel_num_wrt_uvxy); // If quad_vector_guess.zw doesn't increase with screen xy, we know // the 2x2 pixel quad starts at an odd pixel: - const vec2 odd_start_mirror = 0.5 * vec2(ddx(quad_vector_guess.z), + const float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_guess.z), ddy(quad_vector_guess.w)); return quad_vector_guess * odd_start_mirror.xyxy; } -vec4 get_quad_vector(const vec2 output_pixel_num_wrt_uv) +float4 get_quad_vector(const float2 output_pixel_num_wrt_uv) { // Requires: 1.) ddx() and ddy() are present in the current Cg profile. // 2.) output_pixel_num_wrt_uv must increase with uv coords and @@ -98,25 +98,25 @@ vec4 get_quad_vector(const vec2 output_pixel_num_wrt_uv) // correct even if the 2x2 pixel quad starts at an odd pixel, // which can occur at odd resolutions. 
// Caveats: This function requires less information than the version - // taking a vec4, but it's potentially slower. + // taking a float4, but it's potentially slower. // Do screen coords increase with or against uv? Get the direction // with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}. - const vec2 screen_uv_mirror = vec2(ddx(output_pixel_num_wrt_uv.x), + const float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x), ddy(output_pixel_num_wrt_uv.y)); - const vec2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0; - const vec2 quad_vector_uv_guess = (pixel_odd_wrt_uv - vec2(0.5)) * 2.0; - const vec2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror; + const float2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0; + const float2 quad_vector_uv_guess = (pixel_odd_wrt_uv - float2(0.5)) * 2.0; + const float2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror; // If quad_vector_screen_guess doesn't increase with screen xy, we know // the 2x2 pixel quad starts at an odd pixel: - const vec2 odd_start_mirror = 0.5 * vec2(ddx(quad_vector_screen_guess.x), + const float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_screen_guess.x), ddy(quad_vector_screen_guess.y)); - const vec4 quad_vector_guess = vec4( + const float4 quad_vector_guess = float4( quad_vector_uv_guess, quad_vector_screen_guess); return quad_vector_guess * odd_start_mirror.xyxy; } -void quad_gather(const vec4 quad_vector, const vec4 curr, - out vec4 adjx, out vec4 adjy, out vec4 diag) +void quad_gather(const float4 quad_vector, const float4 curr, + out float4 adjx, out float4 adjy, out float4 diag) { // Requires: 1.) ddx() and ddy() are present in the current Cg profile. // 2.) The GPU driver is using fine/high-quality derivatives. 
@@ -130,70 +130,70 @@ void quad_gather(const vec4 quad_vector, const vec4 curr, diag = adjx - ddy(adjx) * quad_vector.w; } -void quad_gather(const vec4 quad_vector, const vec3 curr, - out vec3 adjx, out vec3 adjy, out vec3 diag) +void quad_gather(const float4 quad_vector, const float3 curr, + out float3 adjx, out float3 adjy, out float3 diag) { - // vec3 version + // Float3 version adjx = curr - ddx(curr) * quad_vector.z; adjy = curr - ddy(curr) * quad_vector.w; diag = adjx - ddy(adjx) * quad_vector.w; } -void quad_gather(const vec4 quad_vector, const vec2 curr, - out vec2 adjx, out vec2 adjy, out vec2 diag) +void quad_gather(const float4 quad_vector, const float2 curr, + out float2 adjx, out float2 adjy, out float2 diag) { - // vec2 version + // Float2 version adjx = curr - ddx(curr) * quad_vector.z; adjy = curr - ddy(curr) * quad_vector.w; diag = adjx - ddy(adjx) * quad_vector.w; } -vec4 quad_gather(const vec4 quad_vector, const float curr) +float4 quad_gather(const float4 quad_vector, const float curr) { // Float version: // Returns: return.x == current // return.y == adjacent x // return.z == adjacent y // return.w == diagonal - vec4 all = vec4(curr); + float4 all = float4(curr); all.y = all.x - ddx(all.x) * quad_vector.z; all.zw = all.xy - ddy(all.xy) * quad_vector.w; return all; } -vec4 quad_gather_sum(const vec4 quad_vector, const vec4 curr) +float4 quad_gather_sum(const float4 quad_vector, const float4 curr) { // Requires: Same as quad_gather() // Returns: Sum of an input vector (curr) at all fragments in a quad. 
- vec4 adjx, adjy, diag; + float4 adjx, adjy, diag; quad_gather(quad_vector, curr, adjx, adjy, diag); return (curr + adjx + adjy + diag); } -vec3 quad_gather_sum(const vec4 quad_vector, const vec3 curr) +float3 quad_gather_sum(const float4 quad_vector, const float3 curr) { - // vec3 version: - vec3 adjx, adjy, diag; + // Float3 version: + float3 adjx, adjy, diag; quad_gather(quad_vector, curr, adjx, adjy, diag); return (curr + adjx + adjy + diag); } -vec2 quad_gather_sum(const vec4 quad_vector, const vec2 curr) +float2 quad_gather_sum(const float4 quad_vector, const float2 curr) { - // vec2 version: - vec2 adjx, adjy, diag; + // Float2 version: + float2 adjx, adjy, diag; quad_gather(quad_vector, curr, adjx, adjy, diag); return (curr + adjx + adjy + diag); } -float quad_gather_sum(const vec4 quad_vector, const float curr) +float quad_gather_sum(const float4 quad_vector, const float curr) { // Float version: - const vec4 all_values = quad_gather(quad_vector, curr); + const float4 all_values = quad_gather(quad_vector, curr); return (all_values.x + all_values.y + all_values.z + all_values.w); } -bool fine_derivatives_working(const vec4 quad_vector, vec4 curr) +bool fine_derivatives_working(const float4 quad_vector, float4 curr) { // Requires: 1.) ddx() and ddy() are present in the current Cg profile. // 2.) quad_vector describes the current fragment's location in @@ -206,19 +206,19 @@ bool fine_derivatives_working(const vec4 quad_vector, vec4 curr) // Method: We can confirm fine derivatives are used if the following // holds (ever, for any value at any fragment): // (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy)) - // The more values we test (e.g. test a vec4 two ways), the + // The more values we test (e.g. test a float4 two ways), the // easier it is to demonstrate fine derivatives are working. // TODO: Check for floating point exact comparison issues! 
- vec4 ddx_curr = ddx(curr); - vec4 ddy_curr = ddy(curr); - vec4 adjx = curr - ddx_curr * quad_vector.z; - vec4 adjy = curr - ddy_curr * quad_vector.w; - bool ddy_different = any(ddy_curr != ddy(adjx)); - bool ddx_different = any(ddx_curr != ddx(adjy)); + float4 ddx_curr = ddx(curr); + float4 ddy_curr = ddy(curr); + float4 adjx = curr - ddx_curr * quad_vector.z; + float4 adjy = curr - ddy_curr * quad_vector.w; + bool ddy_different = any(bool4(ddy_curr.x != ddy(adjx).x, ddy_curr.y != ddy(adjx).y, ddy_curr.z != ddy(adjx).z, ddy_curr.w != ddy(adjx).w)); + bool ddx_different = any(bool4(ddx_curr.x != ddx(adjy).x, ddx_curr.y != ddx(adjy).y, ddx_curr.z != ddx(adjy).z, ddx_curr.w != ddx(adjy).w)); return any(bool2(ddy_different, ddx_different)); } -bool fine_derivatives_working_fast(const vec4 quad_vector, float curr) +bool fine_derivatives_working_fast(const float4 quad_vector, float curr) { // Requires: Same as fine_derivatives_working() // Returns: Same as fine_derivatives_working() diff --git a/include/special-functions.h b/include/special-functions.h index 2a06390..6fb3809 100644 --- a/include/special-functions.h +++ b/include/special-functions.h @@ -1,7 +1,6 @@ #ifndef SPECIAL_FUNCTIONS_H #define SPECIAL_FUNCTIONS_H - ///////////////////////////////// MIT LICENSE //////////////////////////////// // Copyright (C) 2014 TroggleMonkey @@ -38,7 +37,7 @@ // // Design Rationale: // Pretty much every line of code in this file is duplicated four times for -// different input types (vec4/vec3/vec2/float). This is unfortunate, +// different input types (float4/float3/float2/float). This is unfortunate, // but Cg doesn't allow function templates. Macros would be far less verbose, // but they would make the code harder to document and read. 
I don't expect // these functions will require a whole lot of maintenance changes unless @@ -48,7 +47,7 @@ /////////////////////////// GAUSSIAN ERROR FUNCTION ////////////////////////// -vec4 erf6(vec4 x) +float4 erf6(float4 x) { // Requires: x is the standard parameter to erf(). // Returns: Return an Abramowitz/Stegun approximation of erf(), where: @@ -56,32 +55,32 @@ vec4 erf6(vec4 x) // This approximation has a max absolute error of 2.5*10**-5 // with solid numerical robustness and efficiency. See: // https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions - const vec4 one = vec4(1.0); - const vec4 sign_x = sign(x); - const vec4 t = one/(one + 0.47047*abs(x)); - const vec4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + static const float4 one = float4(1.0); + const float4 sign_x = sign(x); + const float4 t = one/(one + 0.47047*abs(x)); + const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* exp(-(x*x)); return result * sign_x; } -vec3 erf6(const vec3 x) +float3 erf6(const float3 x) { - // vec3 version: - const vec3 one = vec3(1.0); - const vec3 sign_x = sign(x); - const vec3 t = one/(one + 0.47047*abs(x)); - const vec3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + // Float3 version: + static const float3 one = float3(1.0); + const float3 sign_x = sign(x); + const float3 t = one/(one + 0.47047*abs(x)); + const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* exp(-(x*x)); return result * sign_x; } -vec2 erf6(const vec2 x) +float2 erf6(const float2 x) { - // vec2 version: - const vec2 one = vec2(1.0); - const vec2 sign_x = sign(x); - const vec2 t = one/(one + 0.47047*abs(x)); - const vec2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* + // Float2 version: + static const float2 one = float2(1.0); + const float2 sign_x = sign(x); + const float2 t = one/(one + 0.47047*abs(x)); + const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))* 
exp(-(x*x)); return result * sign_x; } @@ -96,7 +95,7 @@ float erf6(const float x) return result * sign_x; } -vec4 erft(const vec4 x) +float4 erft(const float4 x) { // Requires: x is the standard parameter to erf(). // Returns: Approximate erf() with the hyperbolic tangent. The error is @@ -108,15 +107,15 @@ vec4 erft(const vec4 x) return tanh(1.202760580 * x); } -vec3 erft(const vec3 x) +float3 erft(const float3 x) { - // vec3 version: + // Float3 version: return tanh(1.202760580 * x); } -vec2 erft(const vec2 x) +float2 erft(const float2 x) { - // vec2 version: + // Float2 version: return tanh(1.202760580 * x); } @@ -126,7 +125,7 @@ float erft(const float x) return tanh(1.202760580 * x); } -vec4 erf(const vec4 x) +inline float4 erf(const float4 x) { // Requires: x is the standard parameter to erf(). // Returns: Some approximation of erf(x), depending on user settings. @@ -137,9 +136,9 @@ vec4 erf(const vec4 x) #endif } -vec3 erf(const vec3 x) +inline float3 erf(const float3 x) { - // vec3 version: + // Float3 version: #ifdef ERF_FAST_APPROXIMATION return erft(x); #else @@ -147,9 +146,9 @@ vec3 erf(const vec3 x) #endif } -vec2 erf(const vec2 x) +inline float2 erf(const float2 x) { - // vec2 version: + // Float2 version: #ifdef ERF_FAST_APPROXIMATION return erft(x); #else @@ -157,7 +156,7 @@ vec2 erf(const vec2 x) #endif } -float erf(const float x) +inline float erf(const float x) { // Float version: #ifdef ERF_FAST_APPROXIMATION @@ -167,9 +166,10 @@ float erf(const float x) #endif } + /////////////////////////// COMPLETE GAMMA FUNCTION ////////////////////////// -vec4 gamma_impl(const vec4 s, const vec4 s_inv) +float4 gamma_impl(const float4 s, const float4 s_inv) { // Requires: 1.) s is the standard parameter to the gamma function, and // it should lie in the [0, 36] range. @@ -185,76 +185,76 @@ vec4 gamma_impl(const vec4 s, const vec4 s_inv) // evals. 
We could use three coeffs (0.0000346 error) without // hurting latency, but this allows more parallelism with // outside instructions. - const vec4 g = vec4(1.12906830989); - const vec4 c0 = vec4(0.8109119309638332633713423362694399653724431); - const vec4 c1 = vec4(0.4808354605142681877121661197951496120000040); - const vec4 e = vec4(2.71828182845904523536028747135266249775724709); - const vec4 sph = s + vec4(0.5); - const vec4 lanczos_sum = c0 + c1/(s + vec4(1.0)); - const vec4 base = (sph + g)/e; // or (s + g + vec4(0.5))/e + static const float4 g = float4(1.12906830989); + static const float4 c0 = float4(0.8109119309638332633713423362694399653724431); + static const float4 c1 = float4(0.4808354605142681877121661197951496120000040); + static const float4 e = float4(2.71828182845904523536028747135266249775724709); + const float4 sph = s + float4(0.5); + const float4 lanczos_sum = c0 + c1/(s + float4(1.0)); + const float4 base = (sph + g)/e; // or (s + g + float4(0.5))/e // gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s). // This has less error for small s's than (s -= 1.0) at the beginning. 
return (pow(base, sph) * lanczos_sum) * s_inv; } -vec3 gamma_impl(const vec3 s, const vec3 s_inv) +float3 gamma_impl(const float3 s, const float3 s_inv) { - // vec3 version: - const vec3 g = vec3(1.12906830989); - const vec3 c0 = vec3(0.8109119309638332633713423362694399653724431); - const vec3 c1 = vec3(0.4808354605142681877121661197951496120000040); - const vec3 e = vec3(2.71828182845904523536028747135266249775724709); - const vec3 sph = s + vec3(0.5); - const vec3 lanczos_sum = c0 + c1/(s + vec3(1.0)); - const vec3 base = (sph + g)/e; + // Float3 version: + static const float3 g = float3(1.12906830989); + static const float3 c0 = float3(0.8109119309638332633713423362694399653724431); + static const float3 c1 = float3(0.4808354605142681877121661197951496120000040); + static const float3 e = float3(2.71828182845904523536028747135266249775724709); + const float3 sph = s + float3(0.5); + const float3 lanczos_sum = c0 + c1/(s + float3(1.0)); + const float3 base = (sph + g)/e; return (pow(base, sph) * lanczos_sum) * s_inv; } -vec2 gamma_impl(const vec2 s, const vec2 s_inv) +float2 gamma_impl(const float2 s, const float2 s_inv) { - // vec2 version: - const vec2 g = vec2(1.12906830989); - const vec2 c0 = vec2(0.8109119309638332633713423362694399653724431); - const vec2 c1 = vec2(0.4808354605142681877121661197951496120000040); - const vec2 e = vec2(2.71828182845904523536028747135266249775724709); - const vec2 sph = s + vec2(0.5); - const vec2 lanczos_sum = c0 + c1/(s + vec2(1.0)); - const vec2 base = (sph + g)/e; + // Float2 version: + static const float2 g = float2(1.12906830989); + static const float2 c0 = float2(0.8109119309638332633713423362694399653724431); + static const float2 c1 = float2(0.4808354605142681877121661197951496120000040); + static const float2 e = float2(2.71828182845904523536028747135266249775724709); + const float2 sph = s + float2(0.5); + const float2 lanczos_sum = c0 + c1/(s + float2(1.0)); + const float2 base = (sph + g)/e; return (pow(base, 
sph) * lanczos_sum) * s_inv; } float gamma_impl(const float s, const float s_inv) { // Float version: - const float g = 1.12906830989; - const float c0 = 0.8109119309638332633713423362694399653724431; - const float c1 = 0.4808354605142681877121661197951496120000040; - const float e = 2.71828182845904523536028747135266249775724709; + static const float g = 1.12906830989; + static const float c0 = 0.8109119309638332633713423362694399653724431; + static const float c1 = 0.4808354605142681877121661197951496120000040; + static const float e = 2.71828182845904523536028747135266249775724709; const float sph = s + 0.5; const float lanczos_sum = c0 + c1/(s + 1.0); const float base = (sph + g)/e; return (pow(base, sph) * lanczos_sum) * s_inv; } -vec4 gamma(const vec4 s) +float4 gamma(const float4 s) { // Requires: s is the standard parameter to the gamma function, and it // should lie in the [0, 36] range. // Returns: Return approximate gamma function output with a maximum // relative error of 0.000463. See gamma_impl for details. - return gamma_impl(s, vec4(1.0)/s); + return gamma_impl(s, float4(1.0)/s); } -vec3 gamma(const vec3 s) +float3 gamma(const float3 s) { - // vec3 version: - return gamma_impl(s, vec3(1.0)/s); + // Float3 version: + return gamma_impl(s, float3(1.0)/s); } -vec2 gamma(const vec2 s) +float2 gamma(const float2 s) { - // vec2 version: - return gamma_impl(s, vec2(1.0)/s); + // Float2 version: + return gamma_impl(s, float2(1.0)/s); } float gamma(const float s) @@ -263,10 +263,11 @@ float gamma(const float s) return gamma_impl(s, 1.0/s); } + //////////////// INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT) /////////////// // Lower incomplete gamma function for small s and z (implementation): -vec4 ligamma_small_z_impl(const vec4 s, const vec4 z, const vec4 s_inv) +float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv) { // Requires: 1.) s < ~0.5 // 2.) 
z <= ~0.775075 @@ -282,14 +283,14 @@ vec4 ligamma_small_z_impl(const vec4 s, const vec4 z, const vec4 s_inv) // sum += last_sign * last_pow / ((s + k) * last_factorial); // } // Unrolled, constant-unfolded and arranged for madds and parallelism: - const vec4 scale = pow(z, s); - vec4 sum = s_inv; // Summation iteration 0 result + const float4 scale = pow(z, s); + float4 sum = s_inv; // Summation iteration 0 result // Summation iterations 1, 2, and 3: - const vec4 z_sq = z*z; - const vec4 denom1 = s + vec4(1.0); - const vec4 denom2 = 2.0*s + vec4(4.0); - const vec4 denom3 = 6.0*s + vec4(18.0); - //vec4 denom4 = 24.0*s + vec4(96.0); + const float4 z_sq = z*z; + const float4 denom1 = s + float4(1.0); + const float4 denom2 = 2.0*s + float4(4.0); + const float4 denom3 = 6.0*s + float4(18.0); + //float4 denom4 = 24.0*s + float4(96.0); sum -= z/denom1; sum += z_sq/denom2; sum -= z * z_sq/denom3; @@ -298,30 +299,30 @@ vec4 ligamma_small_z_impl(const vec4 s, const vec4 z, const vec4 s_inv) return scale * sum; } -vec3 ligamma_small_z_impl(const vec3 s, const vec3 z, const vec3 s_inv) +float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv) { - // vec3 version: - const vec3 scale = pow(z, s); - vec3 sum = s_inv; - const vec3 z_sq = z*z; - const vec3 denom1 = s + vec3(1.0); - const vec3 denom2 = 2.0*s + vec3(4.0); - const vec3 denom3 = 6.0*s + vec3(18.0); + // Float3 version: + const float3 scale = pow(z, s); + float3 sum = s_inv; + const float3 z_sq = z*z; + const float3 denom1 = s + float3(1.0); + const float3 denom2 = 2.0*s + float3(4.0); + const float3 denom3 = 6.0*s + float3(18.0); sum -= z/denom1; sum += z_sq/denom2; sum -= z * z_sq/denom3; return scale * sum; } -vec2 ligamma_small_z_impl(const vec2 s, const vec2 z, const vec2 s_inv) +float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv) { - // vec2 version: - const vec2 scale = pow(z, s); - vec2 sum = s_inv; - const vec2 z_sq = z*z; - const vec2 denom1 = s + vec2(1.0); - 
const vec2 denom2 = 2.0*s + vec2(4.0); - const vec2 denom3 = 6.0*s + vec2(18.0); + // Float2 version: + const float2 scale = pow(z, s); + float2 sum = s_inv; + const float2 z_sq = z*z; + const float2 denom1 = s + float2(1.0); + const float2 denom2 = 2.0*s + float2(4.0); + const float2 denom3 = 6.0*s + float2(18.0); sum -= z/denom1; sum += z_sq/denom2; sum -= z * z_sq/denom3; @@ -344,7 +345,7 @@ float ligamma_small_z_impl(const float s, const float z, const float s_inv) } // Upper incomplete gamma function for small s and large z (implementation): -vec4 uigamma_large_z_impl(const vec4 s, const vec4 z) +float4 uigamma_large_z_impl(const float4 s, const float4 z) { // Requires: 1.) s < ~0.5 // 2.) z > ~0.775075 @@ -352,40 +353,40 @@ vec4 uigamma_large_z_impl(const vec4 s, const vec4 z) // incomplete gamma function (4 terms). // The "rolled up" continued fraction looks like this. The denominator // is truncated, and it's calculated "from the bottom up:" - // denom = vec4('inf'); - // vec4 one = vec4(1.0); + // denom = float4('inf'); + // float4 one = float4(1.0); // for(int i = 4; i > 0; --i) // { // denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom; // } // Unrolled and constant-unfolded for madds and parallelism: - const vec4 numerator = pow(z, s) * exp(-z); - vec4 denom = vec4(7.0) + z - s; - denom = vec4(5.0) + z - s + (3.0*s - vec4(9.0))/denom; - denom = vec4(3.0) + z - s + (2.0*s - vec4(4.0))/denom; - denom = vec4(1.0) + z - s + (s - vec4(1.0))/denom; + const float4 numerator = pow(z, s) * exp(-z); + float4 denom = float4(7.0) + z - s; + denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom; + denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom; + denom = float4(1.0) + z - s + (s - float4(1.0))/denom; return numerator / denom; } -vec3 uigamma_large_z_impl(const vec3 s, const vec3 z) +float3 uigamma_large_z_impl(const float3 s, const float3 z) { - // vec3 version: - const vec3 numerator = pow(z, s) * exp(-z); - vec3 denom = vec3(7.0) + z - s; - 
denom = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/denom; - denom = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/denom; - denom = vec3(1.0) + z - s + (s - vec3(1.0))/denom; + // Float3 version: + const float3 numerator = pow(z, s) * exp(-z); + float3 denom = float3(7.0) + z - s; + denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom; + denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom; + denom = float3(1.0) + z - s + (s - float3(1.0))/denom; return numerator / denom; } -vec2 uigamma_large_z_impl(const vec2 s, const vec2 z) +float2 uigamma_large_z_impl(const float2 s, const float2 z) { - // vec2 version: - const vec2 numerator = pow(z, s) * exp(-z); - vec2 denom = vec2(7.0) + z - s; - denom = vec2(5.0) + z - s + (3.0*s - vec2(9.0))/denom; - denom = vec2(3.0) + z - s + (2.0*s - vec2(4.0))/denom; - denom = vec2(1.0) + z - s + (s - vec2(1.0))/denom; + // Float2 version: + const float2 numerator = pow(z, s) * exp(-z); + float2 denom = float2(7.0) + z - s; + denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom; + denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom; + denom = float2(1.0) + z - s + (s - float2(1.0))/denom; return numerator / denom; } @@ -401,8 +402,8 @@ float uigamma_large_z_impl(const float s, const float z) } // Normalized lower incomplete gamma function for small s (implementation): -vec4 normalized_ligamma_impl(const vec4 s, const vec4 z, - const vec4 s_inv, const vec4 gamma_s_inv) +float4 normalized_ligamma_impl(const float4 s, const float4 z, + const float4 s_inv, const float4 gamma_s_inv) { // Requires: 1.) s < ~0.5 // 2.) s_inv = 1/s (precomputed for outside reuse) @@ -415,75 +416,83 @@ vec4 normalized_ligamma_impl(const vec4 s, const vec4 z, // from Gil/Segura/Temme's paper here: // http://oai.cwi.nl/oai/asset/20433/20433B.pdf // Evaluate both branches: Real branches test slower even when available. - const vec4 thresh = vec4(0.775075); - bvec4 z_is_large = greaterThan(z , thresh); - vec4 z_size_check = vec4(z_is_large.x ? 
1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0, z_is_large.w ? 1.0 : 0.0); - const vec4 large_z = vec4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; - const vec4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + static const float4 thresh = float4(0.775075); + bool4 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + z_is_large.w = z.w > thresh.w; + const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; // Combine the results from both branches: - return large_z * vec4(z_size_check) + small_z * vec4(z_size_check); + bool4 inverse_z_is_large = not(z_is_large); + return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large); } -vec3 normalized_ligamma_impl(const vec3 s, const vec3 z, - const vec3 s_inv, const vec3 gamma_s_inv) +float3 normalized_ligamma_impl(const float3 s, const float3 z, + const float3 s_inv, const float3 gamma_s_inv) { - // vec3 version: - const vec3 thresh = vec3(0.775075); - bvec3 z_is_large = greaterThan(z , thresh); - vec3 z_size_check = vec3(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 
1.0 : 0.0); - const vec3 large_z = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; - const vec3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; - return large_z * vec3(z_size_check) + small_z * vec3(z_size_check); + // Float3 version: + static const float3 thresh = float3(0.775075); + bool3 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + z_is_large.z = z.z > thresh.z; + const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool3 inverse_z_is_large = not(z_is_large); + return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large); } -vec2 normalized_ligamma_impl(const vec2 s, const vec2 z, - const vec2 s_inv, const vec2 gamma_s_inv) +float2 normalized_ligamma_impl(const float2 s, const float2 z, + const float2 s_inv, const float2 gamma_s_inv) { - // vec2 version: - const vec2 thresh = vec2(0.775075); - bvec2 z_is_large = greaterThan(z , thresh); - vec2 z_size_check = vec2(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 
1.0 : 0.0); - const vec2 large_z = vec2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; - const vec2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; - return large_z * vec2(z_size_check) + small_z * vec2(z_size_check); + // Float2 version: + static const float2 thresh = float2(0.775075); + bool2 z_is_large; + z_is_large.x = z.x > thresh.x; + z_is_large.y = z.y > thresh.y; + const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv; + const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; + bool2 inverse_z_is_large = not(z_is_large); + return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large); } float normalized_ligamma_impl(const float s, const float z, const float s_inv, const float gamma_s_inv) { // Float version: - const float thresh = 0.775075; - float z_size_check = 0.0; - if (z > thresh) z_size_check = 1.0; + static const float thresh = 0.775075; + const bool z_is_large = z > thresh; const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv; const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv; - return large_z * float(z_size_check) + small_z * float(z_size_check); + return large_z * float(z_is_large) + small_z * float(!z_is_large); } // Normalized lower incomplete gamma function for small s: -vec4 normalized_ligamma(const vec4 s, const vec4 z) +float4 normalized_ligamma(const float4 s, const float4 z) { // Requires: s < ~0.5 // Returns: Approximate the normalized lower incomplete gamma function // for s < 0.5. See normalized_ligamma_impl() for details. 
- const vec4 s_inv = vec4(1.0)/s; - const vec4 gamma_s_inv = vec4(1.0)/gamma_impl(s, s_inv); + const float4 s_inv = float4(1.0)/s; + const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv); return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); } -vec3 normalized_ligamma(const vec3 s, const vec3 z) +float3 normalized_ligamma(const float3 s, const float3 z) { - // vec3 version: - const vec3 s_inv = vec3(1.0)/s; - const vec3 gamma_s_inv = vec3(1.0)/gamma_impl(s, s_inv); + // Float3 version: + const float3 s_inv = float3(1.0)/s; + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv); return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); } -vec2 normalized_ligamma(const vec2 s, const vec2 z) +float2 normalized_ligamma(const float2 s, const float2 z) { - // vec2 version: - const vec2 s_inv = vec2(1.0)/s; - const vec2 gamma_s_inv = vec2(1.0)/gamma_impl(s, s_inv); + // Float2 version: + const float2 s_inv = float2(1.0)/s; + const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv); return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); } @@ -495,4 +504,7 @@ float normalized_ligamma(const float s, const float z) return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv); } -#endif // SPECIAL_FUNCTIONS_H \ No newline at end of file + +#endif // SPECIAL_FUNCTIONS_H + +