diff --git a/blurs/blur11fast-horizontal-gamma-encode-every-fbo.slang b/blurs/blur11fast-horizontal-gamma-encode-every-fbo.slang
index d910887..4e8a32c 100644
--- a/blurs/blur11fast-horizontal-gamma-encode-every-fbo.slang
+++ b/blurs/blur11fast-horizontal-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11fast-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur11fast-horizontal-last-pass-gamma-encode-every-fbo.slang
index 36997c4..d5a77a3 100644
--- a/blurs/blur11fast-horizontal-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur11fast-horizontal-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11fast-horizontal-last-pass.slang b/blurs/blur11fast-horizontal-last-pass.slang
index 1220392..95ae5bb 100644
--- a/blurs/blur11fast-horizontal-last-pass.slang
+++ b/blurs/blur11fast-horizontal-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11fast-horizontal.slang b/blurs/blur11fast-horizontal.slang
index 72bdcf0..5ecf46f 100644
--- a/blurs/blur11fast-horizontal.slang
+++ b/blurs/blur11fast-horizontal.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11fast-vertical-gamma-encode-every-fbo.slang b/blurs/blur11fast-vertical-gamma-encode-every-fbo.slang
index 843c0b1..4dcad35 100644
--- a/blurs/blur11fast-vertical-gamma-encode-every-fbo.slang
+++ b/blurs/blur11fast-vertical-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11fast-vertical.slang b/blurs/blur11fast-vertical.slang
index 1cb691e..b4d96c2 100644
--- a/blurs/blur11fast-vertical.slang
+++ b/blurs/blur11fast-vertical.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11resize-horizontal-gamma-encode-every-fbo.slang b/blurs/blur11resize-horizontal-gamma-encode-every-fbo.slang
index c67e1f9..d9f57f2 100644
--- a/blurs/blur11resize-horizontal-gamma-encode-every-fbo.slang
+++ b/blurs/blur11resize-horizontal-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11resize-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur11resize-horizontal-last-pass-gamma-encode-every-fbo.slang
index e14b773..2c6bfdf 100644
--- a/blurs/blur11resize-horizontal-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur11resize-horizontal-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11resize-horizontal-last-pass.slang b/blurs/blur11resize-horizontal-last-pass.slang
index 3e105a1..6ee8518 100644
--- a/blurs/blur11resize-horizontal-last-pass.slang
+++ b/blurs/blur11resize-horizontal-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11resize-horizontal.slang b/blurs/blur11resize-horizontal.slang
index 066df28..861d0aa 100644
--- a/blurs/blur11resize-horizontal.slang
+++ b/blurs/blur11resize-horizontal.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11resize-vertical-gamma-encode-every-fbo.slang b/blurs/blur11resize-vertical-gamma-encode-every-fbo.slang
index 160729c..5902484 100644
--- a/blurs/blur11resize-vertical-gamma-encode-every-fbo.slang
+++ b/blurs/blur11resize-vertical-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur11resize-vertical.slang b/blurs/blur11resize-vertical.slang
index 90158c5..ffd46b9 100644
--- a/blurs/blur11resize-vertical.slang
+++ b/blurs/blur11resize-vertical.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur12x12shared.slang b/blurs/blur12x12shared.slang
new file mode 100644
index 0000000..cca3ed4
--- /dev/null
+++ b/blurs/blur12x12shared.slang
@@ -0,0 +1,87 @@
+#version 450
+
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+layout(push_constant) uniform Push   //  push-constant block, instance "params"
+{
+	vec4 SourceSize;    //  NOTE(review): presumably input size of this pass - confirm
+	vec4 OriginalSize;  //  NOTE(review): presumably size of the original frame - confirm
+	vec4 OutputSize;    //  NOTE(review): presumably output size of this pass - confirm
+	uint FrameCount;    //  NOTE(review): presumably a frame counter - confirm
+} params;             //  main() below never reads params.* directly
+
+layout(std140, set = 0, binding = 0) uniform UBO  //  std140 block: set 0, binding 0
+{
+	mat4 MVP;  //  NOTE(review): presumably the transform applied by the included vertex shader - confirm
+} global;      //  instance name; main() below does not reference it
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  PASS SETTINGS:
+//  gamma-management.h needs to know what kind of pipeline we're using and
+//  what pass this is in that pipeline.  This will become obsolete if/when we
+//  can #define things like this in the preset file (.slangp; .cgp for Cg).
+//#define GAMMA_ENCODE_EVERY_FBO
+//#define FIRST_PASS
+//#define LAST_PASS
+//#define SIMULATE_CRT_ON_LCD
+//#define SIMULATE_GBA_ON_LCD
+//#define SIMULATE_LCD_ON_CRT
+//#define SIMULATE_GBA_ON_CRT
+
+//  blur-functions.h needs to know our profile's capabilities:
+//  1.) DRIVERS_ALLOW_DERIVATIVES is mandatory for one-pass shared-sample blurs.
+//  2.) DRIVERS_ALLOW_TEX2DLOD is optional, but mipmapped blurs will have awful
+//      artifacts without it due to funky texture sampling derivatives.
+#define DRIVERS_ALLOW_DERIVATIVES
+#define DRIVERS_ALLOW_TEX2DLOD
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+//  (compat_macros.inc precedes the first stage pragma: shared by both stages.)
+#include "../include/compat_macros.inc"
+#pragma stage vertex
+#include "vertex-shader-blur-one-pass-shared-sample.h"
+
+#pragma stage fragment
+layout(location = 0) in vec4 tex_uv;            //  vec4 in this pass; forwarded to tex2Dblur12x12shared
+layout(location = 1) in vec2 blur_dxdy;         //  forwarded to tex2Dblur12x12shared
+layout(location = 2) in vec4 output_pixel_num;  //  floored in main() before get_quad_vector
+layout(location = 0) out vec4 FragColor;        //  receives encode_output(...) in main()
+layout(set = 0, binding = 2) uniform sampler2D Source;  //  handed to the blur as input_texture
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
+
+void main()
+{
+    //  Floor the interpolated pixel number (both the uv and screen origins):
+    float4 pixel_index = floor(output_pixel_num);
+    //  Find this fragment's quad vector, then run the shared-sample blur:
+    float4 quad_vec = get_quad_vector(pixel_index);
+    float3 blurred_rgb = tex2Dblur12x12shared(input_texture,
+        tex_uv, blur_dxdy, quad_vec);
+    //  Encode the blurred color with alpha 1.0 and write it out:
+    FragColor = encode_output(float4(blurred_rgb, 1.0));
+}
\ No newline at end of file
diff --git a/blurs/blur3fast-horizontal-gamma-encode-every-fbo.slang b/blurs/blur3fast-horizontal-gamma-encode-every-fbo.slang
index 212ab6f..0ff0d43 100644
--- a/blurs/blur3fast-horizontal-gamma-encode-every-fbo.slang
+++ b/blurs/blur3fast-horizontal-gamma-encode-every-fbo.slang
@@ -51,12 +51,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_GBA_ON_CRT
 
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +64,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3fast-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur3fast-horizontal-last-pass-gamma-encode-every-fbo.slang
index 188c6b4..e4d4483 100644
--- a/blurs/blur3fast-horizontal-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur3fast-horizontal-last-pass-gamma-encode-every-fbo.slang
@@ -51,12 +51,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_GBA_ON_CRT
 
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +64,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3fast-horizontal-last-pass.slang b/blurs/blur3fast-horizontal-last-pass.slang
index d508039..432b541 100644
--- a/blurs/blur3fast-horizontal-last-pass.slang
+++ b/blurs/blur3fast-horizontal-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3fast-horizontal.slang b/blurs/blur3fast-horizontal.slang
index 4d9d5f4..97fffd4 100644
--- a/blurs/blur3fast-horizontal.slang
+++ b/blurs/blur3fast-horizontal.slang
@@ -51,12 +51,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_GBA_ON_CRT
 
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +64,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3fast-vertical-gamma-encode-every-fbo.slang b/blurs/blur3fast-vertical-gamma-encode-every-fbo.slang
index 15a9e6b..e59be4b 100644
--- a/blurs/blur3fast-vertical-gamma-encode-every-fbo.slang
+++ b/blurs/blur3fast-vertical-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3fast-vertical.slang b/blurs/blur3fast-vertical.slang
index a42c2b8..e9666a0 100644
--- a/blurs/blur3fast-vertical.slang
+++ b/blurs/blur3fast-vertical.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3resize-horizontal-gamma-encode-every-fbo.slang b/blurs/blur3resize-horizontal-gamma-encode-every-fbo.slang
index c497bf0..f5c7a64 100644
--- a/blurs/blur3resize-horizontal-gamma-encode-every-fbo.slang
+++ b/blurs/blur3resize-horizontal-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3resize-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur3resize-horizontal-last-pass-gamma-encode-every-fbo.slang
index 4eadb90..daf0908 100644
--- a/blurs/blur3resize-horizontal-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur3resize-horizontal-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3resize-horizontal-last-pass.slang b/blurs/blur3resize-horizontal-last-pass.slang
index d339ce5..18a2f8a 100644
--- a/blurs/blur3resize-horizontal-last-pass.slang
+++ b/blurs/blur3resize-horizontal-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3resize-horizontal.slang b/blurs/blur3resize-horizontal.slang
index 5cc3f1c..f9a4eb1 100644
--- a/blurs/blur3resize-horizontal.slang
+++ b/blurs/blur3resize-horizontal.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3resize-vertical-gamma-encode-every-fbo.slang b/blurs/blur3resize-vertical-gamma-encode-every-fbo.slang
index 35ecc9b..faafc5e 100644
--- a/blurs/blur3resize-vertical-gamma-encode-every-fbo.slang
+++ b/blurs/blur3resize-vertical-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3resize-vertical.slang b/blurs/blur3resize-vertical.slang
index e5b11ce..ebe89a3 100644
--- a/blurs/blur3resize-vertical.slang
+++ b/blurs/blur3resize-vertical.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3x3-gamma-encode-every-fbo.slang b/blurs/blur3x3-gamma-encode-every-fbo.slang
index 6af8321..0ca86e8 100644
--- a/blurs/blur3x3-gamma-encode-every-fbo.slang
+++ b/blurs/blur3x3-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3x3-last-pass-gamma-encode-every-fbo.slang b/blurs/blur3x3-last-pass-gamma-encode-every-fbo.slang
index 2567a41..8cd8ecd 100644
--- a/blurs/blur3x3-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur3x3-last-pass-gamma-encode-every-fbo.slang
@@ -50,15 +50,11 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
-#include "vertex-shader-blur-one-pass-resize.h"
+#include "vertex-shader-blur-one-pass.h"
 
 ///////////////////////////////  FRAGMENT SHADER  //////////////////////////////
 
@@ -67,10 +63,15 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
-	vec3 color = tex2Dblur3x3resize(Source, tex_uv, blur_dxdy);
+	vec3 color = tex2Dblur3x3(Source, tex_uv, blur_dxdy);
     //  Encode and output the blurred image:
    FragColor = encode_output(vec4(color, 1.0));
 }
\ No newline at end of file
diff --git a/blurs/blur3x3-last-pass.slang b/blurs/blur3x3-last-pass.slang
index 4290a6c..2073a26 100644
--- a/blurs/blur3x3-last-pass.slang
+++ b/blurs/blur3x3-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3x3.slang b/blurs/blur3x3.slang
index caffdf8..af918be 100644
--- a/blurs/blur3x3.slang
+++ b/blurs/blur3x3.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3x3resize-gamma-encode-every-fbo.slang b/blurs/blur3x3resize-gamma-encode-every-fbo.slang
index abc4b1e..7c0d356 100644
--- a/blurs/blur3x3resize-gamma-encode-every-fbo.slang
+++ b/blurs/blur3x3resize-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-resize.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3x3resize-last-pass-gamma-encode-every-fbo.slang b/blurs/blur3x3resize-last-pass-gamma-encode-every-fbo.slang
index 2567a41..092e074 100644
--- a/blurs/blur3x3resize-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur3x3resize-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-resize.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3x3resize-last-pass.slang b/blurs/blur3x3resize-last-pass.slang
index b430f00..84ed9c1 100644
--- a/blurs/blur3x3resize-last-pass.slang
+++ b/blurs/blur3x3resize-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-resize.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur3x3resize.slang b/blurs/blur3x3resize.slang
index c72d02f..35c8f7d 100644
--- a/blurs/blur3x3resize.slang
+++ b/blurs/blur3x3resize.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-resize.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5fast-horizontal-gamma-encode-every-fbo.slang b/blurs/blur5fast-horizontal-gamma-encode-every-fbo.slang
index 1644912..8f2c028 100644
--- a/blurs/blur5fast-horizontal-gamma-encode-every-fbo.slang
+++ b/blurs/blur5fast-horizontal-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5fast-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur5fast-horizontal-last-pass-gamma-encode-every-fbo.slang
index c074006..d396723 100644
--- a/blurs/blur5fast-horizontal-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur5fast-horizontal-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5fast-horizontal-last-pass.slang b/blurs/blur5fast-horizontal-last-pass.slang
index 240b90e..d8c6b3d 100644
--- a/blurs/blur5fast-horizontal-last-pass.slang
+++ b/blurs/blur5fast-horizontal-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5fast-horizontal.slang b/blurs/blur5fast-horizontal.slang
index b638643..1b0b167 100644
--- a/blurs/blur5fast-horizontal.slang
+++ b/blurs/blur5fast-horizontal.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5fast-vertical-gamma-encode-every-fbo.slang b/blurs/blur5fast-vertical-gamma-encode-every-fbo.slang
index 922c71c..4a6b833 100644
--- a/blurs/blur5fast-vertical-gamma-encode-every-fbo.slang
+++ b/blurs/blur5fast-vertical-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5fast-vertical.slang b/blurs/blur5fast-vertical.slang
index 0750af7..0d76d57 100644
--- a/blurs/blur5fast-vertical.slang
+++ b/blurs/blur5fast-vertical.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5resize-horizontal-gamma-encode-every-fbo.slang b/blurs/blur5resize-horizontal-gamma-encode-every-fbo.slang
index 0f4b9ff..81d640b 100644
--- a/blurs/blur5resize-horizontal-gamma-encode-every-fbo.slang
+++ b/blurs/blur5resize-horizontal-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5resize-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur5resize-horizontal-last-pass-gamma-encode-every-fbo.slang
index 09935ac..5e8ccc8 100644
--- a/blurs/blur5resize-horizontal-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur5resize-horizontal-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5resize-horizontal-last-pass.slang b/blurs/blur5resize-horizontal-last-pass.slang
index 929d899..14862b9 100644
--- a/blurs/blur5resize-horizontal-last-pass.slang
+++ b/blurs/blur5resize-horizontal-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5resize-horizontal.slang b/blurs/blur5resize-horizontal.slang
index baf3430..c98c620 100644
--- a/blurs/blur5resize-horizontal.slang
+++ b/blurs/blur5resize-horizontal.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5resize-vertical-gamma-encode-every-fbo.slang b/blurs/blur5resize-vertical-gamma-encode-every-fbo.slang
index f59b54d..958c4cf 100644
--- a/blurs/blur5resize-vertical-gamma-encode-every-fbo.slang
+++ b/blurs/blur5resize-vertical-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5resize-vertical.slang b/blurs/blur5resize-vertical.slang
index 061ee15..290992e 100644
--- a/blurs/blur5resize-vertical.slang
+++ b/blurs/blur5resize-vertical.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5x5-last-pass-gamma-encode-every-fbo.slang b/blurs/blur5x5-last-pass-gamma-encode-every-fbo.slang
index 4019848..cb5f7e6 100644
--- a/blurs/blur5x5-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur5x5-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5x5-last-pass.slang b/blurs/blur5x5-last-pass.slang
index 80a958f..aeb2519 100644
--- a/blurs/blur5x5-last-pass.slang
+++ b/blurs/blur5x5-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur5x5.slang b/blurs/blur5x5.slang
index 39b01a3..03560d9 100644
--- a/blurs/blur5x5.slang
+++ b/blurs/blur5x5.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7fast-horizontal-gamma-encode-every-fbo.slang b/blurs/blur7fast-horizontal-gamma-encode-every-fbo.slang
index 5e973be..c1602ed 100644
--- a/blurs/blur7fast-horizontal-gamma-encode-every-fbo.slang
+++ b/blurs/blur7fast-horizontal-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7fast-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur7fast-horizontal-last-pass-gamma-encode-every-fbo.slang
index 20b09ab..49a452c 100644
--- a/blurs/blur7fast-horizontal-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur7fast-horizontal-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7fast-horizontal-last-pass.slang b/blurs/blur7fast-horizontal-last-pass.slang
index dc8029b..c7079e5 100644
--- a/blurs/blur7fast-horizontal-last-pass.slang
+++ b/blurs/blur7fast-horizontal-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7fast-horizontal.slang b/blurs/blur7fast-horizontal.slang
index 9f0bb91..6cb6277 100644
--- a/blurs/blur7fast-horizontal.slang
+++ b/blurs/blur7fast-horizontal.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7fast-vertical-gamma-encode-every-fbo.slang b/blurs/blur7fast-vertical-gamma-encode-every-fbo.slang
index 6ead23e..b757482 100644
--- a/blurs/blur7fast-vertical-gamma-encode-every-fbo.slang
+++ b/blurs/blur7fast-vertical-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7fast-vertical.slang b/blurs/blur7fast-vertical.slang
index bff459e..9da9398 100644
--- a/blurs/blur7fast-vertical.slang
+++ b/blurs/blur7fast-vertical.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7resize-horizontal-gamma-encode-every-fbo.slang b/blurs/blur7resize-horizontal-gamma-encode-every-fbo.slang
index afcbb86..4915ab8 100644
--- a/blurs/blur7resize-horizontal-gamma-encode-every-fbo.slang
+++ b/blurs/blur7resize-horizontal-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7resize-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur7resize-horizontal-last-pass-gamma-encode-every-fbo.slang
index f4f497b..8c71b17 100644
--- a/blurs/blur7resize-horizontal-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur7resize-horizontal-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7resize-horizontal-last-pass.slang b/blurs/blur7resize-horizontal-last-pass.slang
index 2183817..85000dc 100644
--- a/blurs/blur7resize-horizontal-last-pass.slang
+++ b/blurs/blur7resize-horizontal-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7resize-horizontal.slang b/blurs/blur7resize-horizontal.slang
index 4726b55..51e9fbb 100644
--- a/blurs/blur7resize-horizontal.slang
+++ b/blurs/blur7resize-horizontal.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7resize-vertical-gamma-encode-every-fbo.slang b/blurs/blur7resize-vertical-gamma-encode-every-fbo.slang
index 18cb379..1c016c4 100644
--- a/blurs/blur7resize-vertical-gamma-encode-every-fbo.slang
+++ b/blurs/blur7resize-vertical-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7resize-vertical.slang b/blurs/blur7resize-vertical.slang
index 250e84d..4fbc043 100644
--- a/blurs/blur7resize-vertical.slang
+++ b/blurs/blur7resize-vertical.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7x7-gamma-encode-every-fbo.slang b/blurs/blur7x7-gamma-encode-every-fbo.slang
index 82aa0cb..fcaf5d3 100644
--- a/blurs/blur7x7-gamma-encode-every-fbo.slang
+++ b/blurs/blur7x7-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7x7-last-pass-gamma-encode-every-fbo.slang b/blurs/blur7x7-last-pass-gamma-encode-every-fbo.slang
index 1d2b266..65bde8c 100644
--- a/blurs/blur7x7-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur7x7-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7x7-last-pass.slang b/blurs/blur7x7-last-pass.slang
index 5967195..541a536 100644
--- a/blurs/blur7x7-last-pass.slang
+++ b/blurs/blur7x7-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur7x7.slang b/blurs/blur7x7.slang
index 9e5fd8a..9ef0177 100644
--- a/blurs/blur7x7.slang
+++ b/blurs/blur7x7.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9fast-horizontal-gamma-encode-every-fbo.slang b/blurs/blur9fast-horizontal-gamma-encode-every-fbo.slang
index a84e349..202bd8d 100755
--- a/blurs/blur9fast-horizontal-gamma-encode-every-fbo.slang
+++ b/blurs/blur9fast-horizontal-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9fast-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur9fast-horizontal-last-pass-gamma-encode-every-fbo.slang
index 645bee6..7d8d2a7 100755
--- a/blurs/blur9fast-horizontal-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur9fast-horizontal-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9fast-horizontal-last-pass.slang b/blurs/blur9fast-horizontal-last-pass.slang
index 12c6207..5885f2e 100755
--- a/blurs/blur9fast-horizontal-last-pass.slang
+++ b/blurs/blur9fast-horizontal-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9fast-horizontal.slang b/blurs/blur9fast-horizontal.slang
index 17aa56d..ff679f5 100755
--- a/blurs/blur9fast-horizontal.slang
+++ b/blurs/blur9fast-horizontal.slang
@@ -1,18 +1,5 @@
 #version 450
 
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-} params;
-
-layout(std140, set = 0, binding = 0) uniform UBO
-{
-	mat4 MVP;
-} global;
-
 /////////////////////////////////  MIT LICENSE  ////////////////////////////////
 
 //  Copyright (C) 2014 TroggleMonkey
@@ -35,6 +22,18 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 //  IN THE SOFTWARE.
 
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO
+{
+	mat4 MVP;
+} global;
 
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 
@@ -50,24 +49,24 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-horizontal.h"
 
 ///////////////////////////////  FRAGMENT SHADER  //////////////////////////////
 
 #pragma stage fragment
-#pragma format R8G8B8A8_SRGB
 layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9fast-vertical-gamma-encode-every-fbo.slang b/blurs/blur9fast-vertical-gamma-encode-every-fbo.slang
index c351fbf..5db9055 100755
--- a/blurs/blur9fast-vertical-gamma-encode-every-fbo.slang
+++ b/blurs/blur9fast-vertical-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9fast-vertical.slang b/blurs/blur9fast-vertical.slang
index 2f39809..356df0d 100755
--- a/blurs/blur9fast-vertical.slang
+++ b/blurs/blur9fast-vertical.slang
@@ -1,18 +1,5 @@
 #version 450
 
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-} params;
-
-layout(std140, set = 0, binding = 0) uniform UBO
-{
-	mat4 MVP;
-} global;
-
 /////////////////////////////////  MIT LICENSE  ////////////////////////////////
 
 //  Copyright (C) 2014 TroggleMonkey
@@ -35,6 +22,18 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 //  IN THE SOFTWARE.
 
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO
+{
+	mat4 MVP;
+} global;
 
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 
@@ -50,28 +49,28 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-fast-vertical.h"
 
 ///////////////////////////////  FRAGMENT SHADER  //////////////////////////////
 
 #pragma stage fragment
-#pragma format R8G8B8A8_SRGB
 layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
-	vec3 color = tex2Dblur9fast(Source, tex_uv, blur_dxdy);
+	vec3 color = tex2Dblur9fast(input_texture, tex_uv, blur_dxdy);
     //  Encode and output the blurred image:
-   FragColor = encode_output(vec4(color, 1.0));
+   FragColor = encode_output(float4(color, 1.0));
 }
diff --git a/blurs/blur9resize-horizontal-gamma-encode-every-fbo.slang b/blurs/blur9resize-horizontal-gamma-encode-every-fbo.slang
index e50fb4b..a7a6b3e 100644
--- a/blurs/blur9resize-horizontal-gamma-encode-every-fbo.slang
+++ b/blurs/blur9resize-horizontal-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9resize-horizontal-last-pass-gamma-encode-every-fbo.slang b/blurs/blur9resize-horizontal-last-pass-gamma-encode-every-fbo.slang
index 0a436c8..a678730 100644
--- a/blurs/blur9resize-horizontal-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur9resize-horizontal-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9resize-horizontal-last-pass.slang b/blurs/blur9resize-horizontal-last-pass.slang
index 71a5b0c..b9b5fa8 100644
--- a/blurs/blur9resize-horizontal-last-pass.slang
+++ b/blurs/blur9resize-horizontal-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9resize-horizontal.slang b/blurs/blur9resize-horizontal.slang
index 11248b9..b89c1d5 100644
--- a/blurs/blur9resize-horizontal.slang
+++ b/blurs/blur9resize-horizontal.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-horizontal.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9resize-vertical-gamma-encode-every-fbo.slang b/blurs/blur9resize-vertical-gamma-encode-every-fbo.slang
index cdf42d3..e951332 100644
--- a/blurs/blur9resize-vertical-gamma-encode-every-fbo.slang
+++ b/blurs/blur9resize-vertical-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9resize-vertical.slang b/blurs/blur9resize-vertical.slang
index 20426cc..beb6ef4 100644
--- a/blurs/blur9resize-vertical.slang
+++ b/blurs/blur9resize-vertical.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-resize-vertical.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9x9-gamma-encode-every-fbo.slang b/blurs/blur9x9-gamma-encode-every-fbo.slang
index e9c074b..8d69528 100644
--- a/blurs/blur9x9-gamma-encode-every-fbo.slang
+++ b/blurs/blur9x9-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9x9-last-pass-gamma-encode-every-fbo.slang b/blurs/blur9x9-last-pass-gamma-encode-every-fbo.slang
index d3dedf2..cf2d2ab 100644
--- a/blurs/blur9x9-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur9x9-last-pass-gamma-encode-every-fbo.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9x9-last-pass.slang b/blurs/blur9x9-last-pass.slang
index cb6a5ce..2f3c40d 100644
--- a/blurs/blur9x9-last-pass.slang
+++ b/blurs/blur9x9-last-pass.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/blur9x9.slang b/blurs/blur9x9.slang
index ed1c665..d33e933 100644
--- a/blurs/blur9x9.slang
+++ b/blurs/blur9x9.slang
@@ -50,13 +50,9 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-//  #included by vertex shader:
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
-
+#include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass.h"
 
@@ -67,6 +63,11 @@ layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+/////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
+#include "../include/gamma-management.h"
+#include "../include/blur-functions.h"
 
 void main()
 {
diff --git a/blurs/vertex-shader-blur-fast-horizontal.h b/blurs/vertex-shader-blur-fast-horizontal.h
index 78390a6..9398b23 100644
--- a/blurs/vertex-shader-blur-fast-horizontal.h
+++ b/blurs/vertex-shader-blur-fast-horizontal.h
@@ -30,8 +30,8 @@
 
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
+//#include "../include/gamma-management.h"
+//#include "../include/blur-functions.h"
 
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
diff --git a/blurs/vertex-shader-blur-fast-vertical.h b/blurs/vertex-shader-blur-fast-vertical.h
index cd324a0..f5d9ddb 100644
--- a/blurs/vertex-shader-blur-fast-vertical.h
+++ b/blurs/vertex-shader-blur-fast-vertical.h
@@ -23,17 +23,10 @@
 //  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 //  IN THE SOFTWARE.
 
-
-/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
-
-//  PASS SETTINGS:
-//  Pass settings should be set by the shader file that #includes this one.
-
-
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
+//#include "../include/gamma-management.h"
+//#include "../include/blur-functions.h"
 
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
@@ -55,10 +48,10 @@ void main()
     //  (not output pixels), but we avoid this and consistently blur at the
     //  destination size.  Otherwise, combining statically calculated weights
     //  with bilinear sample exploitation would result in terrible artifacts.   
-    const vec2 dxdy_scale = params.SourceSize.xy * params.OutputSize.zw;
-	const vec2 dxdy = dxdy_scale * params.SourceSize.zw;
+    const float2 dxdy_scale = IN.video_size/IN.output_size;
+	const float2 dxdy = dxdy_scale/IN.texture_size;
     //  This blur is vertical-only, so zero out the horizontal offset:
-	blur_dxdy = vec2(0.0, dxdy.y);
+	blur_dxdy = float2(0.0, dxdy.y);
 }
 
 #endif  //  VERTEX_SHADER_BLUR_FAST_VERTICAL_H
\ No newline at end of file
diff --git a/blurs/vertex-shader-blur-one-pass-resize.h b/blurs/vertex-shader-blur-one-pass-resize.h
index 3321625..2082630 100644
--- a/blurs/vertex-shader-blur-one-pass-resize.h
+++ b/blurs/vertex-shader-blur-one-pass-resize.h
@@ -32,8 +32,8 @@
 
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
+//#include "../include/gamma-management.h"
+//#include "../include/blur-functions.h"
 
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
diff --git a/blurs/vertex-shader-blur-one-pass-shared-sample.h b/blurs/vertex-shader-blur-one-pass-shared-sample.h
index fd960d4..4dd8e30 100644
--- a/blurs/vertex-shader-blur-one-pass-shared-sample.h
+++ b/blurs/vertex-shader-blur-one-pass-shared-sample.h
@@ -32,20 +32,20 @@
 
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
+//#include "../include/gamma-management.h"
+//#include "../include/blur-functions.h"
 
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
 layout(location = 1) in vec2 TexCoord;
-layout(location = 0) out vec2 tex_uv;
+layout(location = 0) out vec4 tex_uv;
 layout(location = 1) out vec4 output_pixel_num;
 layout(location = 2) out vec2 blur_dxdy;
 
 void main()
 {
    gl_Position = global.MVP * Position;
-   tex_uv = TexCoord;
+   vec2 tex_uv_ = TexCoord;
 
 	//  Get the uv sample distance between output pixels.  Blurs are not generic
     //  Gaussian resizers, and correct blurs require:
@@ -57,21 +57,21 @@ void main()
     //  (not output pixels), but we avoid this and consistently blur at the
     //  destination size.  Otherwise, combining statically calculated weights
     //  with bilinear sample exploitation would result in terrible artifacts.
-    const vec2 dxdy_scale params.SourceSize.xy * params.OutputSize.zw;
+    const vec2 dxdy_scale = params.SourceSize.xy * params.OutputSize.zw;
     blur_dxdy = dxdy_scale * params.SourceSize.zw;
 
     //  Get the output pixel number in ([0, xres), [0, yres)) with respect to
     //  the uv origin (.xy components) and the screen origin (.zw components).
     //  Both are useful.  Don't round until the fragment shader.
-    const float2 video_uv = tex_uv;
+    const float2 video_uv = tex_uv_;
     output_pixel_num.xy = params.OutputSize.xy * vec2(video_uv.x, video_uv.y);
     output_pixel_num.zw = params.OutputSize.xy *
-        (out_position.xy * 0.5 + vec2(0.5));
+        (gl_Position.xy * 0.5 + vec2(0.5));
 
     //  Set the mip level correctly for shared-sample blurs (where the
     //  derivatives are unreliable):
     const float mip_level = log2(params.SourceSize.xy * params.OutputSize.zw).y;
-    tex_uv = vec4(tex_uv, 0.0, mip_level);
+    tex_uv = vec4(tex_uv_, 0.0, mip_level);
 }
 
 #endif  //  VERTEX_SHADER_BLUR_ONE_PASS_SHARED_SAMPLE_H
\ No newline at end of file
diff --git a/blurs/vertex-shader-blur-one-pass.h b/blurs/vertex-shader-blur-one-pass.h
index d94a6df..7d62899 100644
--- a/blurs/vertex-shader-blur-one-pass.h
+++ b/blurs/vertex-shader-blur-one-pass.h
@@ -32,8 +32,8 @@
 
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
+//#include "../include/gamma-management.h"
+//#include "../include/blur-functions.h"
 
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
diff --git a/blurs/vertex-shader-blur-resize-horizontal.h b/blurs/vertex-shader-blur-resize-horizontal.h
index 66b2179..407d47d 100644
--- a/blurs/vertex-shader-blur-resize-horizontal.h
+++ b/blurs/vertex-shader-blur-resize-horizontal.h
@@ -32,8 +32,8 @@
 
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
+//#include "../include/gamma-management.h"
+//#include "../include/blur-functions.h"
 
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
diff --git a/blurs/vertex-shader-blur-resize-vertical.h b/blurs/vertex-shader-blur-resize-vertical.h
index 41fbe2b..6265af7 100644
--- a/blurs/vertex-shader-blur-resize-vertical.h
+++ b/blurs/vertex-shader-blur-resize-vertical.h
@@ -32,8 +32,8 @@
 
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
-#include "../include/gamma-management.h"
-#include "../include/blur-functions.h"
+//#include "../include/gamma-management.h"
+//#include "../include/blur-functions.h"
 
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
diff --git a/crt/crt-royale-fake-bloom-intel.slangp b/crt/crt-royale-fake-bloom-intel.slangp
new file mode 100644
index 0000000..468fe6a
--- /dev/null
+++ b/crt/crt-royale-fake-bloom-intel.slangp
@@ -0,0 +1,134 @@
+# IMPORTANT:
+# Shader passes need to know details about the image in the mask_texture LUT
+# files, so set the following constants in user-cgp-constants.h accordingly:
+# 1.) mask_triads_per_tile = (number of horizontal triads in mask texture LUT's)
+# 2.) mask_texture_small_size = (texture size of mask*texture_small LUT's)
+# 3.) mask_texture_large_size = (texture size of mask*texture_large LUT's)
+# 4.) mask_grille_avg_color = (avg. brightness of mask_grille_texture* LUT's, in [0, 1])
+# 5.) mask_slot_avg_color = (avg. brightness of mask_slot_texture* LUT's, in [0, 1])
+# 6.) mask_shadow_avg_color = (avg. brightness of mask_shadow_texture* LUT's, in [0, 1])
+# Shader passes also need to know certain scales set in this .slangp, but their
+# compilation model doesn't currently allow the .slangp file to tell them.  Make
+# sure to set the following constants in user-cgp-constants.h accordingly too:
+# 1.) bloom_approx_scale_x_for_fake = scale_x2
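+# 2.) mask_resize_viewport_scale = float2(scale_x6, scale_y5)  # NOTE: this preset defines no mask-resize passes; scale_x6/scale_y5 refer to presets that include them.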
+# Finally, shader passes need to know the value of geom_max_aspect_ratio used to
+# calculate scale_y5 (among other values):
+# 1.) geom_max_aspect_ratio = (geom_max_aspect_ratio used to calculate scale_y5)
+
+shaders = "7"
+
+# Set an identifier, filename, and sampling traits for the phosphor mask texture.
+# Load an aperture grille, slot mask, and an EDP shadow mask, and load a small
+# non-mipmapped version and a large mipmapped version.
+# TODO: Test masks in other directories.
+textures = "mask_grille_texture_small;mask_grille_texture_large;mask_slot_texture_small;mask_slot_texture_large;mask_shadow_texture_small;mask_shadow_texture_large"
+mask_grille_texture_small = "shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png"
+mask_grille_texture_large = "shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5Spacing.png"
+mask_slot_texture_small = "shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png"
+mask_slot_texture_large = "shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png"
+mask_shadow_texture_small = "shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png"
+mask_shadow_texture_large = "shaders/crt-royale/TileableLinearShadowMaskEDP.png"
+mask_grille_texture_small_wrap_mode = "repeat"
+mask_grille_texture_large_wrap_mode = "repeat"
+mask_slot_texture_small_wrap_mode = "repeat"
+mask_slot_texture_large_wrap_mode = "repeat"
+mask_shadow_texture_small_wrap_mode = "repeat"
+mask_shadow_texture_large_wrap_mode = "repeat"
+mask_grille_texture_small_linear = "true"
+mask_grille_texture_large_linear = "true"
+mask_slot_texture_small_linear = "true"
+mask_slot_texture_large_linear = "true"
+mask_shadow_texture_small_linear = "true"
+mask_shadow_texture_large_linear = "true"
+mask_grille_texture_small_mipmap = "false"  # Mipmapping causes artifacts with manually resized masks without tex2Dlod
+mask_grille_texture_large_mipmap = "true"   # Essential for hardware-resized masks
+mask_slot_texture_small_mipmap = "false"    # Mipmapping causes artifacts with manually resized masks without tex2Dlod
+mask_slot_texture_large_mipmap = "true"     # Essential for hardware-resized masks
+mask_shadow_texture_small_mipmap = "false"  # Mipmapping causes artifacts with manually resized masks without tex2Dlod
+mask_shadow_texture_large_mipmap = "true"   # Essential for hardware-resized masks
+
+
+# Pass0: Linearize the input based on CRT gamma and bob interlaced fields.
+# (Bobbing ensures we can immediately blur without getting artifacts.)
+shader0 = "shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang"
+alias0 = "ORIG_LINEARIZED"
+filter_linear0 = "false"
+scale_type0 = "source"
+scale0 = "1.0"
+srgb_framebuffer0 = "true"
+
+# Pass1: Resample interlaced (and misconverged) scanlines vertically.
+# Separating vertical/horizontal scanline sampling is faster: It lets us
+# consider more scanlines while calculating weights for fewer pixels, and
+# it reduces our samples from vertical*horizontal to vertical+horizontal.
+# This has to come right after ORIG_LINEARIZED, because there's no
+# "original_source" scale_type we can use later.
+shader1 = "shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang"
+alias1 = "VERTICAL_SCANLINES"
+filter_linear1 = "true"
+scale_type_x1 = "source"
+scale_x1 = "1.0"
+scale_type_y1 = "viewport"
+scale_y1 = "1.0"
+#float_framebuffer1 = "true"
+srgb_framebuffer1 = "true"
+
+# Pass2: Do a small resize blur of ORIG_LINEARIZED at an absolute size, and
+# account for convergence offsets.  We want to blur a predictable portion of the
+# screen to match the phosphor bloom, and absolute scale works best for
+# reliable results with a fixed-size bloom.  Picking a scale is tricky:
+# a.) 400x300 is a good compromise for the "fake-bloom" version: It's low enough
+#     to blur high-res/interlaced sources but high enough that resampling
+#     doesn't smear low-res sources too much.
+# b.) 320x240 works well for the "real bloom" version: It's 1-1.5% faster, and
+#     the only noticeable visual difference is a larger halation spread (which
+#     may be a good thing for people who like to crank it up).
+# Note the 4:3 aspect ratio assumes the input has cropped geom_overscan (so it's
+# *intended* for an ~4:3 aspect ratio).
+shader2 = "shaders/crt-royale/src/crt-royale-bloom-approx-fake-bloom-intel.slang"
+alias2 = "BLOOM_APPROX"
+filter_linear2 = "true"
+scale_type2 = "absolute"
+scale_x2 = "400"
+scale_y2 = "300"
+srgb_framebuffer2 = "true"
+
+# Pass3: Vertically blur the input for halation and refractive diffusion.
+# Base this on BLOOM_APPROX: This blur should be small and fast, and blurring
+# a constant portion of the screen is probably physically correct if the
+# viewport resolution is proportional to the simulated CRT size.
+shader3 = "../blurs/blur9fast-vertical.slang"
+filter_linear3 = "true"
+scale_type3 = "source"
+scale3 = "1.0"
+srgb_framebuffer3 = "true"
+
+# Pass4: Horizontally blur the input for halation and refractive diffusion.
+# Note: Using a one-pass 9x9 blur is about 1% slower.
+shader4 = "../blurs/blur9fast-horizontal.slang"
+alias4 = "HALATION_BLUR"
+filter_linear4 = "true"
+scale_type4 = "source"
+scale4 = "1.0"
+srgb_framebuffer4 = "true"
+
+# Pass5: Resample (misconverged) scanlines horizontally, apply halation, and
+# apply the phosphor mask, then fake a phosphor bloom, all in one pass.
+shader5 = "shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask-fake-bloom-intel.slang"
+alias5 = "MASKED_SCANLINES"
+filter_linear5 = "true" # This could just as easily be nearest neighbor.
+scale_type5 = "viewport"
+scale5 = "1.0"
+#float_framebuffer5 = "true"
+srgb_framebuffer5 = "true"
+
+# Pass6: Compute curvature/AA:
+shader6 = "shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-intel.slang"
+filter_linear6 = "true"
+scale_type6 = "viewport"
+mipmap_input6 = "true"
+texture_wrap_mode6 = "clamp_to_edge"
+
+
+
diff --git a/crt/crt-royale_fallback.slangp b/crt/crt-royale-intel.slangp
similarity index 56%
rename from crt/crt-royale_fallback.slangp
rename to crt/crt-royale-intel.slangp
index 931f032..b2c1d46 100644
--- a/crt/crt-royale_fallback.slangp
+++ b/crt/crt-royale-intel.slangp
@@ -1,22 +1,22 @@
 # IMPORTANT:
 # Shader passes need to know details about the image in the mask_texture LUT
-# files, so set the following constants in user-preset-constants.h accordingly:
+# files, so set the following constants in user-cgp-constants.h accordingly:
 # 1.) mask_triads_per_tile = (number of horizontal triads in mask texture LUT's)
 # 2.) mask_texture_small_size = (texture size of mask*texture_small LUT's)
 # 3.) mask_texture_large_size = (texture size of mask*texture_large LUT's)
 # 4.) mask_grille_avg_color = (avg. brightness of mask_grille_texture* LUT's, in [0, 1])
 # 5.) mask_slot_avg_color = (avg. brightness of mask_slot_texture* LUT's, in [0, 1])
 # 6.) mask_shadow_avg_color = (avg. brightness of mask_shadow_texture* LUT's, in [0, 1])
-# Shader passes also need to know certain scales set in this preset, but their
-# compilation model doesn't currently allow the preset file to tell them.  Make
-# sure to set the following constants in user-preset-constants.h accordingly too:
+# Shader passes also need to know certain scales set in this .slangp, but their
+# compilation model doesn't currently allow the .slangp file to tell them.  Make
+# sure to set the following constants in user-cgp-constants.h accordingly too:
 # 1.) bloom_approx_scale_x = scale_x2
-# 2.) mask_resize_viewport_scale = vec2(scale_x6, scale_y5)
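+# 2.) mask_resize_viewport_scale = float2(scale_x6, scale_y5)  # NOTE: this preset has no mask-resize passes (pass5/pass6 are now scanlines/brightpass); scale_x6/scale_y5 refer to presets that include them.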
 # Finally, shader passes need to know the value of geom_max_aspect_ratio used to
 # calculate scale_y5 (among other values):
 # 1.) geom_max_aspect_ratio = (geom_max_aspect_ratio used to calculate scale_y5)
 
-shaders = "12"
+shaders = "10"
 
 # Set an identifier, filename, and sampling traits for the phosphor mask texture.
 # Load an aperture grille, slot mask, and an EDP shadow mask, and load a small
@@ -71,6 +71,7 @@ scale_type_x1 = "source"
 scale_x1 = "1.0"
 scale_type_y1 = "viewport"
 scale_y1 = "1.0"
+#float_framebuffer1 = "true"
 srgb_framebuffer1 = "true"
 
 # Pass2: Do a small resize blur of ORIG_LINEARIZED at an absolute size, and
@@ -85,7 +86,7 @@ srgb_framebuffer1 = "true"
 #     may be a good thing for people who like to crank it up).
 # Note the 4:3 aspect ratio assumes the input has cropped geom_overscan (so it's
 # *intended* for an ~4:3 aspect ratio).
-shader2 = "shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang"
+shader2 = "shaders/crt-royale/src/crt-royale-bloom-approx-intel.slang"
 alias2 = "BLOOM_APPROX"
 filter_linear2 = "true"
 scale_type2 = "absolute"
@@ -112,95 +113,42 @@ scale_type4 = "source"
 scale4 = "1.0"
 srgb_framebuffer4 = "true"
 
-# Pass5: Lanczos-resize the phosphor mask vertically.  Set the absolute
-# scale_x5 == mask_texture_small_size.x (see IMPORTANT above).  Larger scales
-# will blur, and smaller scales could get nasty.  The vertical size must be
-# based on the viewport size and calculated carefully to avoid artifacts later.
-# First calculate the minimum number of mask tiles we need to draw.
-# Since curvature is computed after the scanline masking pass:
-#   num_resized_mask_tiles = 2.0;
-# If curvature were computed in the scanline masking pass (it's not):
-#   max_mask_texel_border = ~3.0 * (1/3.0 + 4.0*sqrt(2.0) + 0.5 + 1.0);
-#   max_mask_tile_border = max_mask_texel_border/
-#       (min_resized_phosphor_triad_size * mask_triads_per_tile);
-#   num_resized_mask_tiles = max(2.0, 1.0 + max_mask_tile_border * 2.0);
-#   At typical values (triad_size >= 2.0, mask_triads_per_tile == 8):
-#       num_resized_mask_tiles = ~3.8
-# Triad sizes are given in horizontal terms, so we need geom_max_aspect_ratio
-# to relate them to vertical resolution.  The widest we expect is:
-#   geom_max_aspect_ratio = 4.0/3.0  # Note: Shader passes need to know this!
-# The fewer triads we tile across the screen, the larger each triad will be as a
-# fraction of the viewport size, and the larger scale_y5 must be to draw a full
-# num_resized_mask_tiles.  Therefore, we must decide the smallest number of
-# triads we'll guarantee can be displayed on screen.  We'll set this according
-# to 3-pixel triads at 768p resolution (the lowest anyone's likely to use):
-#   min_allowed_viewport_triads = 768.0*geom_max_aspect_ratio / 3.0 = 341.333333
-# Now calculate the viewport scale that ensures we can draw resized_mask_tiles:
-#   min_scale_x = resized_mask_tiles * mask_triads_per_tile /
-#       min_allowed_viewport_triads
-#   scale_y5 = geom_max_aspect_ratio * min_scale_x
-#   # Some code might depend on equal scales:
-#   scale_x6 = scale_y5
-# Given our default geom_max_aspect_ratio and min_allowed_viewport_triads:
-#   scale_y5 = 4.0/3.0 * 2.0/(341.33333 / 8.0) = 0.0625
-# IMPORTANT: The scales MUST be calculated in this way.  If you wish to change
-# geom_max_aspect_ratio, update that constant in user-preset-constants.h!
-shader5 = "shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang"
-filter_linear5 = "true"
-scale_type_x5 = "absolute"
-scale_x5 = "64"
-scale_type_y5 = "viewport"
-scale_y5 = "0.0625" # Safe for >= 341.333 horizontal triads at viewport size
-#srgb_framebuffer5 = "false" # mask_texture is already assumed linear
-
-# Pass6: Lanczos-resize the phosphor mask horizontally.  scale_x6 = scale_y5.
-# TODO: Check again if the shaders actually require equal scales.
-shader6 = "shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang"
-alias6 = "MASK_RESIZE"
-filter_linear6 = "false"
-scale_type_x6 = "viewport"
-scale_x6 = "0.0625"
-scale_type_y6 = "source"
-scale_y6 = "1.0"
-#srgb_framebuffer6 = "false" # mask_texture is already assumed linear
-
-# Pass7: Resample (misconverged) scanlines horizontally, apply halation, and
+# Pass5: Resample (misconverged) scanlines horizontally, apply halation, and
 # apply the phosphor mask.
-shader7 = "shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang"
-alias7 = "MASKED_SCANLINES"
+shader5 = "shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask-intel.slang"
+alias5 = "MASKED_SCANLINES"
+filter_linear5 = "true" # This could just as easily be nearest neighbor.
+scale_type5 = "viewport"
+scale5 = "1.0"
+#float_framebuffer5 = "true"
+srgb_framebuffer5 = "true"
+
+# Pass 6: Compute a brightpass.  This will require reading the final mask.
+shader6 = "shaders/crt-royale/src/crt-royale-brightpass.slang"
+alias6 = "BRIGHTPASS"
+filter_linear6 = "true" # This could just as easily be nearest neighbor.
+scale_type6 = "viewport"
+scale6 = "1.0"
+srgb_framebuffer6 = "true"
+
+# Pass 7: Blur the brightpass vertically
+shader7 = "shaders/crt-royale/src/crt-royale-bloom-vertical.slang"
 filter_linear7 = "true" # This could just as easily be nearest neighbor.
-scale_type7 = "viewport"
+scale_type7 = "source"
 scale7 = "1.0"
 srgb_framebuffer7 = "true"
 
-# Pass 8: Compute a brightpass.  This will require reading the final mask.
-shader8 = "shaders/crt-royale/src/crt-royale-brightpass.slang"
-alias8 = "BRIGHTPASS"
-filter_linear8 = "true" # This could just as easily be nearest neighbor.
-scale_type8 = "viewport"
+# Pass 8: Blur the brightpass horizontally and combine it with the dimpass:
+shader8 = "shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang"
+filter_linear8 = "true"
+scale_type8 = "source"
 scale8 = "1.0"
 srgb_framebuffer8 = "true"
 
-# Pass 9: Blur the brightpass vertically
-shader9 = "shaders/crt-royale/src/crt-royale-bloom-vertical.slang"
-filter_linear9 = "true" # This could just as easily be nearest neighbor.
-scale_type9 = "source"
-scale9 = "1.0"
-srgb_framebuffer9 = "true"
+# Pass 9: Compute curvature/AA:
+shader9 = "shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-intel.slang"
+filter_linear9 = "true"
+scale_type9 = "viewport"
+mipmap_input9 = "true"
+texture_wrap_mode9 = "clamp_to_edge"
 
-# Pass 10: Blur the brightpass horizontally and combine it with the dimpass:
-shader10 = "shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang"
-filter_linear10 = "true"
-scale_type10 = "source"
-scale10 = "1.0"
-srgb_framebuffer10 = "true"
-
-# Pass 11: Compute curvature/AA:
-shader11 = "shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang"
-filter_linear11 = "true"
-scale_type11 = "viewport"
-mipmap_input11 = "true"
-texture_wrap_mode11 = "clamp_to_edge"
-
-parameters = "beam_num_scanlines"
-beam_num_scanlines = 3.0
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/LICENSE.TXT b/crt/shaders/crt-royale/LICENSE.TXT
new file mode 100644
index 0000000..d8cf7d4
--- /dev/null
+++ b/crt/shaders/crt-royale/LICENSE.TXT
@@ -0,0 +1,280 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
diff --git a/crt/shaders/crt-royale/README.TXT b/crt/shaders/crt-royale/README.TXT
new file mode 100644
index 0000000..16636b8
--- /dev/null
+++ b/crt/shaders/crt-royale/README.TXT
@@ -0,0 +1,493 @@
+////////////////////////////////////////////////////////////////////////////////
+////          crt-royale, by TroggleMonkey <trogglemonkey@gmx.com>          ////
+////                     Last Updated: August 16, 2014                      ////
+////////////////////////////////////////////////////////////////////////////////
+
+REQUIREMENTS:
+The earliest official Retroarch version fully supporting crt-royale is 1.0.0.3
+(currently unreleased).  Earlier versions lack shader parameters and proper
+mipmapping and sRGB support, but the shader may still run at reduced quality.
+
+The earliest development version fully supporting this shader is:
+    commit ba40be909913c9ccc34dab5d452fba4fe61af9d0
+    Author: Themaister <maister@archlinux.us>
+    Date:   Thu Jun 5 17:41:10 2014 +0200
+A few earlier revisions support the required features, but they may be buggier.
+
+
+BASICS:
+crt-royale is a highly customizable CRT shader for Retroarch and other programs
+supporting the libretro Cg shader standard.  It uses a number of nonstandardized
+extensions like sRGB FBO's, mipmapping, and runtime shader parameters, but
+hopefully it will run without much of a fuss on new implementations of the
+standard as well.
+
+There are a huge number of parameters.  Among the things you can customize:
+* Phosphor mask type: An aperture grille, slot mask, and shadow mask are each
+  included, although the latter won't be seeing much usage until 1440p displays
+  and better become more common (4k UHD and 8k UHD are increasingly optimal).
+* Phosphor mask dot pitch
+* Phosphor mask resampling method: Choose between Lanczos sinc resizing,
+  mipmapped hardware resizing, and no resizing of the input LUT.
+* Phosphor bloom softness and type (real or fake ;))
+* Gaussian and generalized Gaussian scanline beam properties/distribution,
+  including convergence offsets
+* Screen geometry, including curvature (spherical, alternative spherical, or
+  cylindrical like Trinitrons), tilt, and borders
+* Antialiasing level, resampling filter, and sharpness parameters for gracefully
+  combining screen curvature with high-frequency phosphor details, including
+  optionally resampling based on RGB subpixel positions.
+* Halation (electrons bouncing under the glass and lighting
+  random phosphors)
+* Refractive diffusion (light spreading from the imperfect CRT glass face)
+* Interlacing options
+* etc.
+
+There are two major ways to customize the shader:
+* Runtime shader parameters allow convenient experimentation with real-time
+  feedback, but they are much slower, because they prevent static evaluation of
+  a lot of math.  Disabling them drastically speeds up the shader.
+* If runtime shader parameters are disabled (partially or totally), those same
+  settings can be freely altered in the text of the user-settings.h file.  There
+  are also a number of other static-only settings, including the #define macros
+  which indicate where and when to allow runtime shader parameters.  To disable
+  them entirely, comment out the "#define RUNTIME_SHADER_PARAMS_ENABLE" line by
+  putting a double-slash ("//") at the beginning...your FPS will skyrocket.
+
+You may also note that there are two major versions of the shader preset:
+* crt-royale.cgp is the "full" version of the shader, which blooms the light
+  from the brighter phosphors to maintain brightness and avoid clipping.
+* crt-royale-fake-bloom.cgp is the "cheater's" version of the shader, which
+  only fakes the bloom based on carefully blending in a [potentially blurred]
+  version of the original input.  This version is MUCH faster, and you have to
+  strain to see the difference, so people with slower GPU's will prefer it.
+
+There's a lot to play around with, and I encourage everyone using this shader to
+read through the user-settings.h file to learn about the parameters.  Before
+loading the shader, be sure to read the next section, entitled...
+
+
+////////////////////////////////////////////////////////////////////////////////
+////                    FREQUENTLY EXPECTED QUESTIONS:                      ////
+////////////////////////////////////////////////////////////////////////////////
+
+1.) WHY IS THE SHADER CRASHING WHEN I LOAD IT?!?
+Do you get C6001 or C6002 errors with integrated graphics, like Intel HD 4000?
+If so, please try one of the following .cgp presets:
+* crt-royale-intel.cgp
+* crt-royale-fake-bloom-intel.cgp
+These load .cg wrappers that #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+(also available in user-settings.h) before loading the main .cg shader files.
+
+Integrated graphics compatibility mode will disable these three features, which
+currently require more registers or instructions than Intel GPU's allow:
+* PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+  (This may be reenabled in a later release.)
+* RUNTIME_GEOMETRY_MODE: You must change the screen geometry/curvature using
+  the geom_mode_static setting in user-settings.h.
+* The high-quality 4x4 Gaussian resize for the bloom approximation
+
+Using Intel-specific .cgp files is equivalent to #defining
+INTEGRATED_GRAPHICS_COMPATIBILITY_MODE in your user-settings.h.  Out of the box,
+user-settings.h is configured for maximum configurability and compatibility with
+dedicated nVidia and AMD/ATI GPU's.  Compatibility mode is disabled by default
+to avoid silently degrading quality for AMD/ATI and nVidia users, so Intel-
+specific .cgp's are a convenient way for Intel users to play with the shader
+without editing text files.
+
+I've tested this solution on Intel HD 4000 graphics, and it should work for that
+GPU at least, but please let me know if you're still having problems!
+
+--------------------------------------------------------------------------------
+
+2.) WHY IS EVERYTHING SO SLOW?!?:
+Out of the box, this will be a problem for all but monster GPU's.  The default
+user-settings.h file disables any features and optimizations which might cause
+compilation failure on AMD/ATI GPU's.  Despite the name of the options, this is
+not a problem with your card or drivers; it's a shortcoming in the Cg shader
+compiler's nVidia-centric profile setups.
+
+Uncommenting the following #define macros at the top of user-settings.h will
+help performance a good deal on compatible nVidia cards:
+    #define DRIVERS_ALLOW_DERIVATIVES
+    #define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+    #define DRIVERS_ALLOW_TEX2DLOD
+    #define DRIVERS_ALLOW_TEX2DBIAS
+A few of these warrant some elaboration.  First, derivatives:
+
+Derivatives allow the shader to cheaply calculate a tangent-space matrix for
+correct antialiasing when curvature or overscan are used.  Without them, there
+are two options:
+    a.) Cheat, and there will be artifacts with strong cylindrical curvature
+    b.) Compute the correct tangent-space matrix analytically.  This is used
+        by default, and it's controlled by this option near the bottom:
+            geom_force_correct_tangent_matrix = true
+
+Dynamic branches:
+Dynamic branches allow the shader to avoid performing computations that it
+doesn't need (but might have, given different runtime options).  Without them,
+the shader has to either let the GPU evaluate every possible codepath and select
+a result, or make a "best guess" ahead of time.  The full phosphor bloom suffers
+most from not having dynamic branches, because the shader doesn't know how big
+of a blur to use until it knows your phosphor mask dot pitch...which you set at
+runtime if shader parameters are enabled.
+
+If RUNTIME_PHOSPHOR_BLOOM_SIGMA is commented out (faster), this won't matter:
+The shader will just select the blur size and standard deviation suitable for
+the mask_triad_size_desired_static setting in user-settings.h.  It will be
+fast, but larger triads won't blur enough, and smaller triads will blur more
+than they need to.  However, if RUNTIME_PHOSPHOR_BLOOM_SIGMA is enabled, the
+shader will calculate an optimal standard deviation and *try* to use the right
+blur size for it...but using an "if standard deviation is such and such"
+condition would be prohibitively slow without dynamic branches.  Instead, the
+shader uses the largest and slowest blur the user lets it use (to cover the
+widest range of triad sizes and standard deviations), according to these macros:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+The more you have uncommented, the larger the triads you can blur, but the
+slower runtime sigmas will be if your GPU can't use dynamic branches.  By
+default, triads up to 6 pixels wide will be bloomed perfectly, and a little
+beyond that (8 should be fine), but going too far beyond that will create
+blocking artifacts in the blur due to an insufficient support size.
+
+tex2Dlod:
+The tex2Dlod function allows the shader to disable anisotropic filtering, which
+can get confused when we're manually tiling the texture coordinates for a small
+resized phosphor mask tile (it creates nasty seam artifacts).  There are several
+ways the shader can deal with this: The cheapest is to use tex2Dlod to tile the
+output of MASK_RESIZE across the screen...and the slower alternatives either
+require derivatives or force the shader to draw 2 tiles to MASK_RESIZE in each
+direction, thereby reducing your maximum allowed dot pitch by half.
+
+tex2Dbias:
+According to nVidia's Cg language standard library page, tex2Dbias requires the
+fp30 profile, which doesn't work on ATI/AMD cards...but you might actually have
+mixed results.  This can be used as a substitute for tex2Dlod at times, so it's
+worth trying even on ATI.
+
+--------------------------------------------------------------------------------
+
+3.) WHY IS EVERYTHING STILL SO SLOW?!?:
+For maximum quality and configurability out of the box, almost all shader
+parameters are enabled by default (except for the disproportionately expensive
+runtime subpixel offsets).  Some are more expensive than others.  Commenting
+the following macro disables all shader parameters:
+    #define RUNTIME_SHADER_PARAMS_ENABLE
+Commenting these macros disables selective shader parameters:
+    #define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #define RUNTIME_ANTIALIAS_WEIGHTS
+    //#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #define RUNTIME_GEOMETRY_TILT
+    #define RUNTIME_GEOMETRY_MODE
+    #define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+Note that all shader parameters will still show up in your GUI list, and the
+disabled ones simply won't work.
+
+Finally, there are a lot of other options enabled by default that carry serious
+performance penalties.  For instance, the default antialiasing filter is a
+cubic filter, because it's the most configurable, but it's also quite slow if
+RUNTIME_ANTIALIAS_WEIGHTS is #defined.  A lot of the static true/false options
+have a significant influence, and the shader is faster if the red subpixel
+offset (from which the blue one is calculated as well) is zero...even if it's
+a static value, because RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS is commented out.
+To avoid any confusion, I should also clarify now that subpixel offsets are
+separate from scanline beam convergence offsets.
+
+To quickly see how much performance you can get from other settings, you can
+temporarily replace your user-settings.h with one of:
+a.) crt-royale-settings-files/user-settings-fast-static-ati.h
+b.) crt-royale-settings-files/user-settings-fast-static-nvidia.h
+Then load crt-royale-fake-bloom.cgp.  It should be far more playable.
+
+--------------------------------------------------------------------------------
+
+4.) WHY WON'T MY SHADER BLOOM MY PHOSPHORS ENOUGH?
+First, see the discussion about dynamic branching above, in 2.
+If you don't have dynamic branches, you can either uncomment the lines that
+let the shader pessimistically use larger blurs than it's guaranteed to need
+(which is slow), or...you can just use crt-royale-fake-bloom.cgp, which
+doesn't have this problem. :)
+
+--------------------------------------------------------------------------------
+
+5.) WHY CAN'T I MAKE MY PHOSPHORS ANY BIGGER?
+By default, the phosphor mask is Lanczos-resized in earlier passes to your
+specified dot pitch (mask_sample_mode = 0).  This gives a much sharper result
+than mipmapped hardware sampling (mask_sample_mode = 1), but it can be much
+slower without taking proper care: If the input mask tile (containing 8
+phosphor triads by default) is large, like 512x512, and you try to resize it
+to 24x24 for 3x3 pixel triads, the resizer has to take 128 samples in each
+pass/direction (the max allowed) for a 3-lobe Lanczos filter.  This can be
+very slow, so I made the output of MASK_RESIZE very small by default: Just
+1/16th of the viewport size in each direction.  The exact limit scales with
+your viewport size, and it *should* be reasonable, but the restrictions can
+get tighter if we can't use tex2Dlod and have to fit two whole tiles (16
+phosphor triads with default 8-triad tiles) into the MASK_RESIZE pass for
+compatibility with anisotropic filtering (long story).
+
+If you want bigger phosphor triads, you have two options:
+a.) Set mask_sample_mode to 1 in your shader params (if enabled) or set
+    mask_sample_mode_static to 1 in your user-settings.h file.  This will use
+    hardware sampling, which is softer but has no limitations.
+b.) To increase the limit with manual mask-resizing (best quality), you need to
+    do five things:
+    1.) Go into your .cgp file and find the MASK_RESIZE pass (the horizontal
+        mask resizing pass) and the one before it (the vertical mask resizing
+        pass).  Find the viewport-relative scales, which should say 0.0625, and
+        change them to 0.125 or even 0.25.
+    2.) Still in your .cgp file, also make sure your mask_*_texture_small
+        filenames point to LUT textures that are larger than your final desired
+        onscreen size (upsizing is not currently permitted).
+    3.) Go into user-cgp-constants.h and change mask_resize_viewport_scale from
+        0.0625 to the new value you changed it to in step 1.  This is necessary,
+        because we can't pass that value from the .cgp file to the shader, and
+        the shader can't compute the viewport size (necessary) without it.
+    4.) Still in user-cgp-constants.h, update mask_texture_small_size and
+        mask_triads_per_tile appropriately if you changed your LUT texture in
+        step 2.
+    5.) Reload your .cgp file.
+I REALLY wish there was an easier way to do that, but my hands are tied until
+.cgp files are allowed to pass more information to .cg shaders (which would
+require major updates to the cg2glsl script).
+
+--------------------------------------------------------------------------------
+
+6.) WHY CAN'T I MAKE MY PHOSPHORS ANY SMALLER THAN 2 PIXELS PER TRIAD?
+This is controlled by mask_min_allowed_triad_size in your user-settings.h file.
+Set it to 1.0 instead of 2.0 (anything lower than 1 is pointless), and you're
+set.  It defaults to 2.0 to make mask resizing twice as fast when dynamic
+branches aren't allowed.  Some people may want to be able to fade the phosphors
+away entirely to get a more PVM-like scanlined image though, so change it to 1.0
+for that (or get a higher-resolution display ;)).
+
+Note: This setting should be obsolete soon.  I have some ideas for more
+sophisticated mask resampling that I just don't have a spare few hours to
+implement yet.
+
+--------------------------------------------------------------------------------
+
+7.) I AM NOT RUNNING INTEGRATED GRAPHICS.  WHY AM I GETTING ERRORS?
+First recheck the top of your user-settings.h to make sure incompatible driver
+options are commented out (disabled).  If they're all disabled and you're still
+having problems, you've probably found a bug.  There are bound to be a number of
+them with certain setting combinations, and there might even be a few individual
+settings I broke more recently than I tested them.  My contact information is up
+top, so let me know!
+
+--------------------------------------------------------------------------------
+
+8.) WHY AM I GETTING BANDING IN DARK COLORS?  OR, WHY WON'T MIPMAPPING WORK?
+crt-royale uses features like sRGB and mipmapping, which are not available in
+the latest Retroarch release (1.0.0.2) at the time of this writing.
+
+You may get banding in dark colors if your platform or Retroarch version doesn't
+support sRGB FBO's, and mask_sample_mode 1 will look awful without mipmapping.
+I expect most platforms capable of running this shader at full speed will
+support sRGB FBO's, but if yours doesn't, please let me know, and I'll include
+a note about it.
+
+Alternately, setting levels_autodim_temp too low will cause precision loss and
+banding.
+
+--------------------------------------------------------------------------------
+
+9.) HOW DO I SET GEOMETRY/CURVATURE/ETC.?
+If RUNTIME_SHADER_PARAMS_ENABLE and RUNTIME_GEOMETRY_MODE are both #defined (not
+commented out) in user-settings.h, you can find these options in your shader
+parameters (in Retroarch's RGUI for instance) under e.g. geom_mode.  Otherwise,
+you can set the corresponding e.g. geom_mode_static options in user-settings.h.
+
+--------------------------------------------------------------------------------
+
+10.) WHY DON'T MY SHADER PARAMETERS STICK?
+This is a bit confusing, at least in the version of Retroarch I'm using.
+In the Shader Options menu, Parameters (Current) controls what's on your screen
+right now, whereas Parameters (RGUI) seems to control what gets saved to a
+shader preset (in your base shaders directory) with Save As Shader Preset.
+
+--------------------------------------------------------------------------------
+
+11.) WHY DID YOU SLOW THE SHADER DOWN WITH ALL OF THESE FEATURES I DON'T WANT?
+     WHY DIDN'T YOU MAKE THE DEFAULTS MORE TO MY LIKING?
+
+The default settings tend to best match flat ~13" slot mask TV's with sharp
+scanlines.  Real CRT's however vary a lot in their characteristics (and many are
+softer in more ways than one), so it's impossible to make the default settings
+look like everyone's favorite CRT.  Moreover, it's impossible to decide which
+of the slower features and options are superfluous:
+
+Some people love curvature, and some people hate it.  Some people love
+scanlines, and some people hate them.  Some people love phosphors, and some
+people hate them.  Some people love interlacing support, and some people hate
+it.  Some people love sharpness, and some people hate it.  Some people love
+convergence error, and some people hate it.  The one thing you hate the most is
+probably someone else's most critical feature.  This is why there are so many
+options, why the shader is so complicated, and why it's impossible to please
+everyone out of the box...unfortunately.
+
+That said, if you spend some time tweaking the settings, you're bound to get a
+picture you like.  Once you've made up your mind, you can save the settings to
+a user-settings.h file and disable shader parameters and other slow options to
+get the kind of performance you want.
+
+--------------------------------------------------------------------------------
+
+12.) WHY DIDN'T YOU INCLUDE A SHADER PRESET WITH NTSC SUPPORT?  WHY DIDN'T YOU
+     INCLUDE MORE CANNED PRESETS WITH DIFFERENT OPTIONS?  WHY CAN'T I SELECT
+     FROM ONE OF SEVERAL USER SETTINGS FILES WITHOUT MANUAL FILE RENAMING?
+
+I do plan on adding a version that uses the NTSC shader for the first two
+passes, but it will take a bit of work, because there are several NTSC shader
+versions as it is.  It's easy enough to combine the HALATION_BLUR passes into a
+one-pass blur from blurs/blur9x9fast.cg, but I'm not sure yet just how much
+modification the NTSC shader passes themselves might need for best results.
+
+I originally wanted NTSC support to be included out-of-the-box, but I'd also
+like to release the shader ASAP, so it'll have to wait.
+
+As for other canned presets, that's a little more complicated: I DO intend on
+creating more canned presets, but the combinatorial explosion of major codepath
+options in this shader is too overwhelming to be as exhaustive as I'd like.
+When I get the time, I'll add what I can to make this more user-friendly.
+In the meantime, I'll start adding a few different default versions of the
+user settings file and put them in a subdirectory for people to manually
+place in the main directory and rename to "user-settings.h."
+
+However, the libretro Cg shader specification (and the Cg to GLSL compiler) does
+not currently allow .cgp files to pass any static settings to the source files.
+This presents a huge problem, because it means that in order to create a new
+preset with different options, I also have to create duplicate files for EVERY
+single .cg pass for every permutation, not just the .cgp.  I plan on creating
+a number of skeleton wrapper .cg files in a subdirectory (which set a few
+options and then include the main .cg file for the pass), but it'll be a while
+yet.  In the meantime, I'd rather let people play with what's already done than
+keep it hidden on my hard drive.
+
+--------------------------------------------------------------------------------
+
+13.) WHY DO SO MANY VALUES IN USER-SETTINGS.H HAVE A _STATIC SUFFIX?
+
+The "_static" suffix is there to prevent naming conflicts with runtime shader
+parameters: The shader usually uses a version without the suffix, which is
+assigned either the value of the "_static" version or the runtime shader
+parameter version.  If a value in user-settings.h doesn't have a "_static"
+suffix, it's usually because it's a static compile-time option only, with no
+corresponding runtime version.  Basically, you can ignore the suffix. :)
+
+--------------------------------------------------------------------------------
+
+14.) ARE THERE ANY BROKEN SETTINGS I SHOULD BE AWARE OF?
+     WHAT IF I WANT TO CHANGE SETTINGS IN THE .CGP FILE?
+
+As far as I know, all of the options in user-settings.h and the runtime shader
+parameters are pretty robust, with a few caveats:
+* As noted above, there are some tradeoffs between runtime and compile-time
+  options.  If runtime blur sigmas are disabled for instance, the phosphor
+  bloom (and to a lesser extent, the fake bloom) may not blur the right amount.
+* If you set your aspect ratio incorrectly, and mask_specify_num_triads == 1.0
+  (i.e. true, as opposed to 0.0, which is false), the shader will misinterpret
+  the number of triads you want by the same proportion.
+* Disabled shader parameters will do nothing, including either:
+    a.) mask_triad_size_desired
+    b.) mask_num_triads_desired,
+  depending on the value of mask_specify_num_triads.
+
+There is a broken and unimplemented option in derived-settings-and-constants.h,
+but users shouldn't need to mess around in there anyway.  (It's related to the
+more efficient phosphor mask resampling I want to implement.)
+
+However, the .cgp files are another story: They are pretty brittle, especially
+when it comes to their interaction with user-cgp-constants.h.  Be aware that the
+shader passes rely on scale types and sizes in your .cgp file being exactly what
+they expect.  Do not change any scale types from the defaults, or you'll get
+artifacts under certain conditions.  You can change the BLOOM_APPROX and
+MASK_RESIZE scale values (not scale types), but you must update the associated
+constant in user-cgp-constants.h to let the .cg shader files know about it, and
+the implications may reach farther than you expect.  Similarly, if you plan on
+changing an LUT texture, make sure you update the associated constants in
+user-cgp-constants.h.  In short, if you plan on changing anything in a .cgp
+file, you'll want to read it thoroughly first, especially the "IMPORTANT"
+section at the top.
+
+--------------------------------------------------------------------------------
+
+15.) WHAT ARE THE MOST COMMON DOT PITCHES FOR CRT TELEVISIONS?
+     WHAT KIND OF RESOLUTION WOULD I NEED FOR A REAL SHADOW MASK?
+
+The most demanding CRT we're ever likely to emulate is a Sony PVM-20M4U:
+    Width: 450mm
+    Aperture Grille Pitch: 0.31mm
+    Triads in 4:3 frame: 1451, assuming little to no overscan
+For 3-pixel triads, we would need about 6k UHD resolution.  A BVM-20F1U has
+similar requirements.
+
+However, common slot masks are far more similar to the kind of image this shader
+will produce at 900p, 1080p, 1200p, and 1440p:
+1.) A typical 13" diagonal CRT might have a 0.60mm slot pitch, for a total of
+    440.26666666666665 or so phosphor triads horizontally.
+2.) A typical 19" diagonal CRT might have a 0.75mm slot pitch, for a total of
+    514.7733333333333 or so phosphor triads horizontally.
+3.) According to http://repairfaq.ece.drexel.edu/REPAIR/F_crtfaq.html, a
+    typical 25" diagonal CRT might have a 0.9mm slot pitch, for a total of
+    564.4444444444445 or so phosphor triads horizontally.
+4.) A 21" Samsung SMC210N CCTV monitor (450 TV lines) has a 0.7mm stripe
+    pitch, for a total of 609.6 or so phosphor triads horizontally.
+
+The included EDP shadow mask starts looking very good with ~6-pixel triads, so
+it may take nearly 4k resolution to make it a particularly compelling option.
+However, it's possible to make smaller shadow masks on a pixel-by-pixel basis
+and tile them at a 1:1 ratio (mask_sample_mode = 2).  I may include a mask like
+this in a future update.
+
+--------------------------------------------------------------------------------
+
+16.) IS THIS PHOSPHOR BLOOM REALISTIC?
+
+Probably not:
+
+Realistically, the "phosphor bloom" blurs bright phosphors significantly more
+than your eyes would bloom the brighter phosphors on a real CRT.  This extra
+blurring however is necessary to distribute enough brightness to nearby pixels
+that we can amplify the overall brightness to that of the original source after
+applying the phosphor mask.  If you're interested, there are more comments on
+the subject at the top of the fragment shader in crt-royale-bloom-approx.cg.
+
+On the subject of the phosphor bloom: I intended to include some exposition
+about the math behind the brightpass calculation (and the much more complex
+and thorough calculation I originally used to blur the minimal amount necessary,
+which turned out to be inferior in practice), but that document isn't release-
+ready at the moment.  Sorry Hyllian. ;)
+
+--------------------------------------------------------------------------------
+
+17.) SO WHAT DO YOU PLAN ON ADDING IN THE FUTURE?
+
+I'd like to add these relatively soon:
+1.) A combined ntsc-crt-royale.cgp and ntsc-crt-royale-fake-bloom.cgp.
+2.) More presets, especially if maister or squarepusher find a way to make the
+Cg to GLSL compiler process .cgp files (which will allow .cgp's to pass
+arbitrary #defines to the .cg shader passes).
+3.) More efficient and flexible phosphor mask resampling.  Hopefully, this will
+    make it possible to manually resize the mask on Intel HD Graphics as well.
+4.) Make it easier and more convenient to use and experiment with mask_sample_mode
+    2 (direct 1:1 tiling of an input texture) by using a separate LUT texture
+    with its own parameters in user-cgp-constants.h, etc.  I haven't done this
+    yet because it requires yet another texture sample that could hurt other
+    codepaths, and I'm waiting until I have time to optimize it.
+5.) Refine the runtime shader parameters: Some of them are probably too fine-
+    grained and slow to change.
+
+Maybe's:
+1.) I've had trouble getting LUT's from subdirectories to work consistently
+    across platforms, but I'd like to get around that and include more mask
+    textures I've made.
+2.) If you're using spherical curvature with a small radius, the edges of the
+    sphere are blocky due to the pixel discards being done in 2x2 fragment
+    blocks.  I'd like to fix this if it can be done without a performance hit.
+3.) I have some ideas for procedural mask generation with a fast, closed-form
+    low-pass filter, but I don't know if I'll ever get around to it.
+
diff --git a/crt/shaders/crt-royale/THANKS.TXT b/crt/shaders/crt-royale/THANKS.TXT
new file mode 100644
index 0000000..4966f0e
--- /dev/null
+++ b/crt/shaders/crt-royale/THANKS.TXT
@@ -0,0 +1,43 @@
+Thank you squarepusher and maister, for hammering out the shader framework that
+made this possible and being so receptive to my feedback for Retroarch and the
+libretro Cg shader spec.  Thank you especially maister, for designing the sRGB
+support with me and implementing all the code for both sRGB FBO's and mipmapped
+FBO's in less time than it took me to add mipmapped LUT's alone!
+
+I want to thank xythen and DOLLS for inspiring me with their early efforts:
+    http://board.byuu.org/viewtopic.php?f=10&t=147
+    http://board.byuu.org/viewtopic.php?p=3820#p3834
+I've never spoken with them, but I never would have thought to make this shader
+if xythen hadn't gotten the ball rolling, or if DOLLS hadn't made his point
+about just how far CRT emulation could go with his phosphor mask prototypes,
+convergence error images, and barrel distortion code.
+
+I also want to thank hunterk for his excellent blog, especially this post:
+    http://filthypants.blogspot.com/2011/05/
+        more-emulator-pixel-shaders-crt-updated.html
+Along with caligari's work, his PhosphorLUT shader provoked me to experiment
+with game-style bloom as a way to reconcile shadow masks with full brightness.
+Along with Pulp Fiction, he also gets credit for helping me name this shader. :D
+
+Thank you Hyllian for your enthusiasm: It kept me focused on actually releasing
+this shader instead of refining it in perpetuity!
+
+Finally, I want to thank cgwg for everything he has done for CRT emulation:
+He was the first to consider the effects of halation, and (in addition to
+caligari?) he did the most research on the Gaussian properties of scanline
+electron beams.  His forum posts and links to academic research were very
+helpful, and so were the few PM's we exchanged many months ago: I originally
+meant to wet my feet by extending his shader with cylindrical curvature before
+writing my own.  I never managed to understand his curvature code (due to all of
+the different algebraic/trigonometric stages being rolled into one), and I gave
+up and started from scratch, but talking with him helped me piece together how
+his spherical uv<=>xyz mapping worked mathematically.  My own is subtly
+different, but not on purpose. ;)  A lot of the user parameters for geometry
+were inspired by his own (including Euler angle tilt and a "view distance" for
+controlling the field of view with a simplified near-plane).  Last but not
+least, my border dimming code was based more directly off of his: I did what I
+could to write a fresh implementation of his algorithm with new features, but
+the line between code and algorithm is pretty thin in that function, and it's
+a testament to him coming up with such an elegant solution.
+
+TroggleMonkey
diff --git a/crt/shaders/crt-royale/src/bind-shader-params.h b/crt/shaders/crt-royale/src/bind-shader-params.h
index 5a1792e..08555da 100644
--- a/crt/shaders/crt-royale/src/bind-shader-params.h
+++ b/crt/shaders/crt-royale/src/bind-shader-params.h
@@ -27,7 +27,7 @@
 
 //  Override some parameters for gamma-management.h and tex2Dantialias.h:
 #define OVERRIDE_DEVICE_GAMMA
-const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+static const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
 #define ANTIALIAS_OVERRIDE_BASICS
 #define ANTIALIAS_OVERRIDE_PARAMETERS
 
@@ -38,8 +38,9 @@ const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
 #endif
 
 //  Bind option names to shader parameter uniforms or static constants.
+#ifdef HARDCODE_SETTINGS
 #ifdef PARAMETER_UNIFORM
-/*    uniform float crt_gamma;
+    uniform float crt_gamma;
     uniform float lcd_gamma;
     uniform float levels_contrast;
     uniform float halation_weight;
@@ -57,8 +58,8 @@ const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
         uniform float beam_horiz_filter;
         uniform float beam_horiz_linear_rgb_weight;
     #else
-        const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
-        const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
+        static const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
+        static const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
     #endif
     uniform float convergence_offset_x_r;
     uniform float convergence_offset_x_g;
@@ -69,7 +70,7 @@ const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
     #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
         uniform float mask_type;
     #else
-        const float mask_type = clamp(mask_type_static, 0.0, 2.0);
+        static const float mask_type = clamp(mask_type_static, 0.0, 2.0);
     #endif
     uniform float mask_sample_mode_desired;
     uniform float mask_specify_num_triads;
@@ -81,8 +82,8 @@ const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
         uniform float aa_cubic_c;
         uniform float aa_gauss_sigma;
     #else
-        const float aa_cubic_c = aa_cubic_c_static;                              //  Clamp to [0, 4]?
-        const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static);  //  Clamp to [FIXZERO(0), 1]?
+        static const float aa_cubic_c = aa_cubic_c_static;                              //  Clamp to [0, 4]?
+        static const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static);  //  Clamp to [FIXZERO(0), 1]?
     #endif
     uniform float geom_mode_runtime;
     uniform float geom_radius;
@@ -97,113 +98,114 @@ const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
     uniform float border_darkness;
     uniform float border_compress;
     uniform float interlace_bff;
-    uniform float interlace_1080i; */
+    uniform float interlace_1080i;
 #else
     //  Use constants from user-settings.h, and limit ranges appropriately:
-    const float crt_gamma = max(0.0, crt_gamma_static);
-    const float lcd_gamma = max(0.0, lcd_gamma_static);
-    const float levels_contrast = clamp(levels_contrast_static, 0.0, 4.0);
-    const float halation_weight = clamp(halation_weight_static, 0.0, 1.0);
-    const float diffusion_weight = clamp(diffusion_weight_static, 0.0, 1.0);
-    const float bloom_underestimate_levels = max(FIX_ZERO(0.0), bloom_underestimate_levels_static);
-    const float bloom_excess = clamp(bloom_excess_static, 0.0, 1.0);
-    const float beam_min_sigma = max(FIX_ZERO(0.0), beam_min_sigma_static);
-    const float beam_max_sigma = max(beam_min_sigma, beam_max_sigma_static);
-    const float beam_spot_power = max(beam_spot_power_static, 0.0);
-    const float beam_min_shape = max(2.0, beam_min_shape_static);
-    const float beam_max_shape = max(beam_min_shape, beam_max_shape_static);
-    const float beam_shape_power = max(0.0, beam_shape_power_static);
-//    const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
-    const float beam_horiz_sigma = max(FIX_ZERO(0.0), beam_horiz_sigma_static);
-    const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
-    //  Unpack vector elements to match scalar uniforms:
-    const float convergence_offset_x_r = clamp(convergence_offsets_r_static.x, -4.0, 4.0);
-    const float convergence_offset_x_g = clamp(convergence_offsets_g_static.x, -4.0, 4.0);
-    const float convergence_offset_x_b = clamp(convergence_offsets_b_static.x, -4.0, 4.0);
-    const float convergence_offset_y_r = clamp(convergence_offsets_r_static.y, -4.0, 4.0);
-    const float convergence_offset_y_g = clamp(convergence_offsets_g_static.y, -4.0, 4.0);
-    const float convergence_offset_y_b = clamp(convergence_offsets_b_static.y, -4.0, 4.0);
-    const float mask_type = clamp(mask_type_static, 0.0, 2.0);
-    const float mask_sample_mode_desired = clamp(mask_sample_mode_static, 0.0, 2.0);
-    const float mask_specify_num_triads = clamp(mask_specify_num_triads_static, 0.0, 1.0);
- //   const float mask_triad_size_desired = clamp(mask_triad_size_desired_static, 1.0, 18.0);
-    const float mask_num_triads_desired = clamp(mask_num_triads_desired_static, 342.0, 1920.0);
-    const float aa_subpixel_r_offset_x_runtime = clamp(aa_subpixel_r_offset_static.x, -0.5, 0.5);
-    const float aa_subpixel_r_offset_y_runtime = clamp(aa_subpixel_r_offset_static.y, -0.5, 0.5);
-    const float aa_cubic_c = aa_cubic_c_static;                              //  Clamp to [0, 4]?
-    const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static);  //  Clamp to [FIXZERO(0), 1]?
-    const float geom_mode_runtime = clamp(geom_mode_static, 0.0, 3.0);
-    const float geom_radius = max(1.0/(2.0*pi), geom_radius_static);         //  Clamp to [1/(2*pi), 1024]?
-    const float geom_view_dist = max(0.5, geom_view_dist_static);            //  Clamp to [0.5, 1024]?
-    const float geom_tilt_angle_x = clamp(geom_tilt_angle_static.x, -pi, pi);
-    const float geom_tilt_angle_y = clamp(geom_tilt_angle_static.y, -pi, pi);
-    const float geom_aspect_ratio_x = geom_aspect_ratio_static;              //  Force >= 1?
-    const float geom_aspect_ratio_y = 1.0;
-    const float geom_overscan_x = max(FIX_ZERO(0.0), geom_overscan_static.x);
-    const float geom_overscan_y = max(FIX_ZERO(0.0), geom_overscan_static.y);
-    const float border_size = clamp(border_size_static, 0.0, 0.5);           //  0.5 reaches to image center
-    const float border_darkness = max(0.0, border_darkness_static);
-    const float border_compress = max(1.0, border_compress_static);          //  < 1.0 darkens whole image
-    const float interlace_bff = float(interlace_bff_static);
-    const float interlace_1080i = float(interlace_1080i_static);
+    static const float crt_gamma = max(0.0, crt_gamma_static);
+    static const float lcd_gamma = max(0.0, lcd_gamma_static);
+    static const float levels_contrast = clamp(levels_contrast_static, 0.0, 4.0);
+    static const float halation_weight = clamp(halation_weight_static, 0.0, 1.0);
+    static const float diffusion_weight = clamp(diffusion_weight_static, 0.0, 1.0);
+    static const float bloom_underestimate_levels = max(FIX_ZERO(0.0), bloom_underestimate_levels_static);
+    static const float bloom_excess = clamp(bloom_excess_static, 0.0, 1.0);
+    static const float beam_min_sigma = max(FIX_ZERO(0.0), beam_min_sigma_static);
+    static const float beam_max_sigma = max(beam_min_sigma, beam_max_sigma_static);
+    static const float beam_spot_power = max(beam_spot_power_static, 0.0);
+    static const float beam_min_shape = max(2.0, beam_min_shape_static);
+    static const float beam_max_shape = max(beam_min_shape, beam_max_shape_static);
+    static const float beam_shape_power = max(0.0, beam_shape_power_static);
+    static const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
+    static const float beam_horiz_sigma = max(FIX_ZERO(0.0), beam_horiz_sigma_static);
+    static const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
+    //  Unpack static vector elements to match scalar uniforms:
+    static const float convergence_offset_x_r = clamp(convergence_offsets_r_static.x, -4.0, 4.0);
+    static const float convergence_offset_x_g = clamp(convergence_offsets_g_static.x, -4.0, 4.0);
+    static const float convergence_offset_x_b = clamp(convergence_offsets_b_static.x, -4.0, 4.0);
+    static const float convergence_offset_y_r = clamp(convergence_offsets_r_static.y, -4.0, 4.0);
+    static const float convergence_offset_y_g = clamp(convergence_offsets_g_static.y, -4.0, 4.0);
+    static const float convergence_offset_y_b = clamp(convergence_offsets_b_static.y, -4.0, 4.0);
+    static const float mask_type = clamp(mask_type_static, 0.0, 2.0);
+    static const float mask_sample_mode_desired = clamp(mask_sample_mode_static, 0.0, 2.0);
+    static const float mask_specify_num_triads = clamp(mask_specify_num_triads_static, 0.0, 1.0);
+    static const float mask_triad_size_desired = clamp(mask_triad_size_desired_static, 1.0, 18.0);
+    static const float mask_num_triads_desired = clamp(mask_num_triads_desired_static, 342.0, 1920.0);
+    static const float aa_subpixel_r_offset_x_runtime = clamp(aa_subpixel_r_offset_static.x, -0.5, 0.5);
+    static const float aa_subpixel_r_offset_y_runtime = clamp(aa_subpixel_r_offset_static.y, -0.5, 0.5);
+    static const float aa_cubic_c = aa_cubic_c_static;                              //  Clamp to [0, 4]?
+    static const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static);  //  Clamp to [FIXZERO(0), 1]?
+    static const float geom_mode_runtime = clamp(geom_mode_static, 0.0, 3.0);
+    static const float geom_radius = max(1.0/(2.0*pi), geom_radius_static);         //  Clamp to [1/(2*pi), 1024]?
+    static const float geom_view_dist = max(0.5, geom_view_dist_static);            //  Clamp to [0.5, 1024]?
+    static const float geom_tilt_angle_x = clamp(geom_tilt_angle_static.x, -pi, pi);
+    static const float geom_tilt_angle_y = clamp(geom_tilt_angle_static.y, -pi, pi);
+    static const float geom_aspect_ratio_x = geom_aspect_ratio_static;              //  Force >= 1?
+    static const float geom_aspect_ratio_y = 1.0;
+    static const float geom_overscan_x = max(FIX_ZERO(0.0), geom_overscan_static.x);
+    static const float geom_overscan_y = max(FIX_ZERO(0.0), geom_overscan_static.y);
+    static const float border_size = clamp(border_size_static, 0.0, 0.5);           //  0.5 reaches to image center
+    static const float border_darkness = max(0.0, border_darkness_static);
+    static const float border_compress = max(1.0, border_compress_static);          //  < 1.0 darkens whole image
+    static const float interlace_bff = float(interlace_bff_static);
+    static const float interlace_1080i = float(interlace_1080i_static);
+#endif
 #endif
 
 
 //  Provide accessors for vector constants that pack scalar uniforms:
-vec2 get_aspect_vector(const float geom_aspect_ratio)
+inline float2 get_aspect_vector(const float geom_aspect_ratio)
 {
     //  Get an aspect ratio vector.  Enforce geom_max_aspect_ratio, and prevent
     //  the absolute scale from affecting the uv-mapping for curvature:
     const float geom_clamped_aspect_ratio =
         min(geom_aspect_ratio, geom_max_aspect_ratio);
-    const vec2 geom_aspect =
-        normalize(vec2(geom_clamped_aspect_ratio, 1.0));
+    const float2 geom_aspect =
+        normalize(float2(geom_clamped_aspect_ratio, 1.0));
     return geom_aspect;
 }
 
-vec2 get_geom_overscan_vector()
+inline float2 get_geom_overscan_vector()
 {
-    return vec2(geom_overscan_x, geom_overscan_y);
+    return float2(geom_overscan_x, geom_overscan_y);
 }
 
-vec2 get_geom_tilt_angle_vector()
+inline float2 get_geom_tilt_angle_vector()
 {
-    return vec2(geom_tilt_angle_x, geom_tilt_angle_y);
+    return float2(geom_tilt_angle_x, geom_tilt_angle_y);
 }
 
-vec3 get_convergence_offsets_x_vector()
+inline float3 get_convergence_offsets_x_vector()
 {
-    return vec3(convergence_offset_x_r, convergence_offset_x_g,
+    return float3(convergence_offset_x_r, convergence_offset_x_g,
         convergence_offset_x_b);
 }
 
-vec3 get_convergence_offsets_y_vector()
+inline float3 get_convergence_offsets_y_vector()
 {
-    return vec3(convergence_offset_y_r, convergence_offset_y_g,
+    return float3(convergence_offset_y_r, convergence_offset_y_g,
         convergence_offset_y_b);
 }
 
-vec2 get_convergence_offsets_r_vector()
+inline float2 get_convergence_offsets_r_vector()
 {
-    return vec2(convergence_offset_x_r, convergence_offset_y_r);
+    return float2(convergence_offset_x_r, convergence_offset_y_r);
 }
 
-vec2 get_convergence_offsets_g_vector()
+inline float2 get_convergence_offsets_g_vector()
 {
-    return vec2(convergence_offset_x_g, convergence_offset_y_g);
+    return float2(convergence_offset_x_g, convergence_offset_y_g);
 }
 
-vec2 get_convergence_offsets_b_vector()
+inline float2 get_convergence_offsets_b_vector()
 {
-    return vec2(convergence_offset_x_b, convergence_offset_y_b);
+    return float2(convergence_offset_x_b, convergence_offset_y_b);
 }
 
-vec2 get_aa_subpixel_r_offset()
+inline float2 get_aa_subpixel_r_offset()
 {
     #ifdef RUNTIME_ANTIALIAS_WEIGHTS
         #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
             //  WARNING: THIS IS EXTREMELY EXPENSIVE.
-            return vec2(aa_subpixel_r_offset_x_runtime,
+            return float2(aa_subpixel_r_offset_x_runtime,
                 aa_subpixel_r_offset_y_runtime);
         #else
             return aa_subpixel_r_offset_static;
@@ -214,17 +216,17 @@ vec2 get_aa_subpixel_r_offset()
 }
 
 //  Provide accessors settings which still need "cooking:"
-float get_mask_amplify()
+inline float get_mask_amplify()
 {
-    const float mask_grille_amplify = 1.0/mask_grille_avg_color;
-    const float mask_slot_amplify = 1.0/mask_slot_avg_color;
-    const float mask_shadow_amplify = 1.0/mask_shadow_avg_color;
+    static const float mask_grille_amplify = 1.0/mask_grille_avg_color;
+    static const float mask_slot_amplify = 1.0/mask_slot_avg_color;
+    static const float mask_shadow_amplify = 1.0/mask_shadow_avg_color;
     return mask_type < 0.5 ? mask_grille_amplify :
         mask_type < 1.5 ? mask_slot_amplify :
         mask_shadow_amplify;
 }
 
-float get_mask_sample_mode()
+inline float get_mask_sample_mode()
 {
     #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
         #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
diff --git a/crt/shaders/crt-royale/src/bloom-functions.h b/crt/shaders/crt-royale/src/bloom-functions.h
index 7e00dee..8ce2d0e 100644
--- a/crt/shaders/crt-royale/src/bloom-functions.h
+++ b/crt/shaders/crt-royale/src/bloom-functions.h
@@ -37,14 +37,13 @@
 ///////////////////////////////  BLOOM CONSTANTS  //////////////////////////////
 
 //  Compute constants with manual inlines of the functions below:
-const float bloom_diff_thresh = 1.0/256.0;
+static const float bloom_diff_thresh = 1.0/256.0;
+
 
-//  Assume an extremely large viewport size for asymptotic results:
-const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
 
 ///////////////////////////////////  HELPERS  //////////////////////////////////
 
-float get_min_sigma_to_blur_triad(const float triad_size,
+inline float get_min_sigma_to_blur_triad(const float triad_size,
     const float thresh)
 {
     //  Requires:   1.) triad_size is the final phosphor triad size in pixels
@@ -60,7 +59,7 @@ float get_min_sigma_to_blur_triad(const float triad_size,
     //return 0.5985*triad_size - triad_size*sqrt(thresh)
 }
 
-float get_absolute_scale_blur_sigma(const float thresh)
+inline float get_absolute_scale_blur_sigma(const float thresh)
 {
     //  Requires:   1.) min_expected_triads must be a global float.  The number
     //                  of horizontal phosphor triads in the final image must be
@@ -93,7 +92,7 @@ float get_absolute_scale_blur_sigma(const float thresh)
             max_viewport_size_x/min_allowed_viewport_triads.x, thresh);
 }
 
-float get_center_weight(const float sigma)
+inline float get_center_weight(const float sigma)
 {
     //  Given a Gaussian blur sigma, get the blur weight for the center texel.
     #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
@@ -161,8 +160,8 @@ float get_center_weight(const float sigma)
     #endif
 }
 
-vec3 tex2DblurNfast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+inline float3 tex2DblurNfast(const sampler2D texture, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  If sigma is static, we can safely branch and use the smallest blur
     //  that's big enough.  Ignore #define hints, because we'll only use a
@@ -186,40 +185,40 @@ vec3 tex2DblurNfast(const sampler2D tex, const vec2 tex_uv,
     #ifdef PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
         if(sigma <= blur9_std_dev)
         {
-            return tex2Dblur9fast(tex, tex_uv, dxdy, sigma);
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
         }
         else if(sigma <= blur17_std_dev)
         {
-            return tex2Dblur17fast(tex, tex_uv, dxdy, sigma);
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
         }
         else if(sigma <= blur25_std_dev)
         {
-            return tex2Dblur25fast(tex, tex_uv, dxdy, sigma);
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
         }
         else if(sigma <= blur31_std_dev)
         {
-            return tex2Dblur31fast(tex, tex_uv, dxdy, sigma);
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
         }
         else
         {
-            return tex2Dblur43fast(tex, tex_uv, dxdy, sigma);
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
         }
     #else
         //  If we can't afford to branch, we can only guess at what blur
         //  size we need.  Therefore, use the largest blur allowed.
         #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
-            return tex2Dblur43fast(tex, tex_uv, dxdy, sigma);
+            return tex2Dblur43fast(texture, tex_uv, dxdy, sigma);
         #else
         #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
-            return tex2Dblur31fast(tex, tex_uv, dxdy, sigma);
+            return tex2Dblur31fast(texture, tex_uv, dxdy, sigma);
         #else
         #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
-            return tex2Dblur25fast(tex, tex_uv, dxdy, sigma);
+            return tex2Dblur25fast(texture, tex_uv, dxdy, sigma);
         #else
         #ifdef PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
-            return tex2Dblur17fast(tex, tex_uv, dxdy, sigma);
+            return tex2Dblur17fast(texture, tex_uv, dxdy, sigma);
         #else
-            return tex2Dblur9fast(tex, tex_uv, dxdy, sigma);
+            return tex2Dblur9fast(texture, tex_uv, dxdy, sigma);
         #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
         #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
         #endif  //  PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
@@ -227,7 +226,7 @@ vec3 tex2DblurNfast(const sampler2D tex, const vec2 tex_uv,
     #endif  //  PHOSPHOR_BLOOM_BRANCH_FOR_BLUR_SIZE
 }
 
-float get_bloom_approx_sigma(const float output_size_x_runtime,
+inline float get_bloom_approx_sigma(const float output_size_x_runtime,
     const float estimated_viewport_size_x)
 {
     //  Requires:   1.) output_size_x_runtime == BLOOM_APPROX.output_size.x.
@@ -243,15 +242,15 @@ float get_bloom_approx_sigma(const float output_size_x_runtime,
     //              bilinear filtering, so use static calculations.
     //  Assume the default static value.  This is a compromise that ensures
     //  typical triads are blurred, even if unusually large ones aren't.
-    const float mask_num_triads_static =
+    static const float mask_num_triads_static =
         max(min_allowed_viewport_triads.x, mask_num_triads_desired_static);
     const float mask_num_triads_from_size =
-        estimated_viewport_size_x/params.mask_triad_size_desired;
+        estimated_viewport_size_x/global.mask_triad_size_desired;
     const float mask_num_triads_runtime = max(min_allowed_viewport_triads.x,
-        mix(mask_num_triads_from_size, params.mask_num_triads_desired,
-            mask_specify_num_triads));
+        lerp(mask_num_triads_from_size, global.mask_num_triads_desired,
+            global.mask_specify_num_triads));
     //  Assume an extremely large viewport size for asymptotic results:
-     const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+    static const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
     if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
     {
         //  Use the runtime num triads and output size:
@@ -264,7 +263,7 @@ float get_bloom_approx_sigma(const float output_size_x_runtime,
         //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
         //  account for the Gaussian scanline sigma from the last pass too.
         //  The bloom will be too wide horizontally but tall enough vertically.
-        return length(vec2(bloom_approx_sigma, beam_max_sigma));
+        return length(float2(bloom_approx_sigma, beam_max_sigma));
     }
     else    //  3x3 blur resize (the bilinear resize doesn't need a sigma)
     {
@@ -272,12 +271,12 @@ float get_bloom_approx_sigma(const float output_size_x_runtime,
         //  reason to choose blur3x3 is to avoid dynamic weights, so use a
         //  static calculation.
         #ifdef PHOSPHOR_BLOOM_FAKE
-            const float output_size_x_static =
+            static const float output_size_x_static =
                 bloom_approx_size_x_for_fake;
         #else
-            const float output_size_x_static = bloom_approx_size_x;
+            static const float output_size_x_static = bloom_approx_size_x;
         #endif
-        const float asymptotic_triad_size =
+        static const float asymptotic_triad_size =
             max_viewport_size_x/mask_num_triads_static;
         const float asymptotic_sigma = get_min_sigma_to_blur_triad(
             asymptotic_triad_size, bloom_diff_thresh);
@@ -286,11 +285,11 @@ float get_bloom_approx_sigma(const float output_size_x_runtime,
         //  The BLOOM_APPROX input has to be ORIG_LINEARIZED to avoid moire, but
         //  try accounting for the Gaussian scanline sigma from the last pass
         //  too; use the static default value:
-        return length(vec2(bloom_approx_sigma, beam_max_sigma_static));
+        return length(float2(bloom_approx_sigma, beam_max_sigma_static));
     }
 }
 
-float get_final_bloom_sigma(const float bloom_sigma_runtime)
+inline float get_final_bloom_sigma(const float bloom_sigma_runtime)
 {
     //  Requires:   1.) bloom_sigma_runtime is a precalculated sigma that's
     //                  optimal for the [known] triad size.
@@ -303,7 +302,7 @@ float get_final_bloom_sigma(const float bloom_sigma_runtime)
     //  Notes:      Call this from the fragment shader, NOT the vertex shader,
     //              so static sigmas can be constant-folded!
     const float bloom_sigma_optimistic = get_min_sigma_to_blur_triad(
-        params.mask_triad_size_desired, bloom_diff_thresh);
+        mask_triad_size_desired_static, bloom_diff_thresh);
     #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
         return bloom_sigma_runtime;
     #else
@@ -313,4 +312,6 @@ float get_final_bloom_sigma(const float bloom_sigma_runtime)
     #endif
 }
 
-#endif  //  BLOOM_FUNCTIONS_H
\ No newline at end of file
+
+#endif  //  BLOOM_FUNCTIONS_H
+
diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-approx-intel.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-approx-intel.slang
new file mode 100644
index 0000000..e242024
--- /dev/null
+++ b/crt/shaders/crt-royale/src/crt-royale-bloom-approx-intel.slang
@@ -0,0 +1,3 @@
+#version 450
+#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+#include "crt-royale-bloom-approx.h"
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-approx.h
similarity index 56%
rename from crt/shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang
rename to crt/shaders/crt-royale/src/crt-royale-bloom-approx.h
index 0fd6d24..984819e 100644
--- a/crt/shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-bloom-approx.h
@@ -1,16 +1,3 @@
-#version 450
-
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-	vec4 ORIG_LINEARIZEDSize;
-} registers;
-
-#include "params.inc"
-
 /////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
 
 //  crt-royale: A full-featured CRT shader, with cheese.
@@ -29,32 +16,170 @@ layout(push_constant) uniform Push
 //  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 //  Place, Suite 330, Boston, MA 02111-1307 USA
 
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+	vec4 ORIG_LINEARIZEDSize;
+} params;
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
+layout(std140, set = 0, binding = 0) uniform UBO
+{
+	mat4 MVP;
+	float crt_gamma;
+	float lcd_gamma;
+	float levels_contrast;
+	float halation_weight;
+	float diffusion_weight;
+	float bloom_underestimate_levels;
+	float bloom_excess;
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;
+	float convergence_offset_x_r;
+	float convergence_offset_x_g;
+	float convergence_offset_x_b;
+	float convergence_offset_y_r;
+	float convergence_offset_y_g;
+	float convergence_offset_y_b;
+	float mask_type;
+	float mask_sample_mode_desired;
+	float mask_num_triads_desired;
+	float mask_triad_size_desired;
+	float mask_specify_num_triads;
+	float aa_subpixel_r_offset_x_runtime;
+	float aa_subpixel_r_offset_y_runtime;
+	float aa_cubic_c;
+	float aa_gauss_sigma;
+	float geom_mode_runtime;
+	float geom_radius;
+	float geom_view_dist;
+	float geom_tilt_angle_x;
+	float geom_tilt_angle_y;
+	float geom_aspect_ratio_x;
+	float geom_aspect_ratio_y;
+	float geom_overscan_x;
+	float geom_overscan_y;
+	float border_size;
+	float border_darkness;
+	float border_compress;
+	float interlace_bff;
+	float interlace_1080i;
+} global;
 
+#define ORIG_LINEARIZEDvideo_size params.SourceSize.xy
+#define ORIG_LINEARIZEDtexture_size params.SourceSize.xy
+
+float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y;
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+#include "params.inc"
+#include "../../../../include/compat_macros.inc"
 #include "../user-settings.h"
-#include "derived-settings-and-constants.h"
 #include "bind-shader-params.h"
 #include "../../../../include/gamma-management.h"
-#include "../../../../include/blur-functions.h"
+#include "derived-settings-and-constants.h"
 #include "scanline-functions.h"
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 tex_uv;
+layout(location = 1) out vec2 blur_dxdy;
+layout(location = 2) out vec2 uv_scanline_step;
+layout(location = 3) out float estimated_viewport_size_x;
+layout(location = 4) out vec2 texture_size_inv;
+layout(location = 5) out vec2 tex_uv_to_pixel_scale;
+
+void main()
+{
+   gl_Position = global.MVP * Position;
+   float2 vTexCoord = TexCoord;
+    const float2 video_uv = vTexCoord * IN.texture_size/IN.video_size;
+    tex_uv = video_uv * ORIG_LINEARIZEDvideo_size /
+        ORIG_LINEARIZEDtexture_size;
+    //  The last pass (vertical scanlines) had a viewport y scale, so we can
+    //  use it to calculate a better runtime sigma:
+    estimated_viewport_size_x =
+        IN.video_size.y * geom_aspect_ratio_x/geom_aspect_ratio_y;
+
+    //  Get the uv sample distance between output pixels.  We're using a resize
+    //  blur, so arbitrary upsizing will be acceptable if filter_linearN =
+    //  "true," and arbitrary downsizing will be acceptable if mipmap_inputN =
+    //  "true" too.  The blur will be much more accurate if a true 4x4 Gaussian
+    //  resize is used instead of tex2Dblur3x3_resize (which samples between
+    //  texels even for upsizing).
+    const float2 dxdy_min_scale = ORIG_LINEARIZEDvideo_size/IN.output_size;
+    texture_size_inv = float2(1.0)/ORIG_LINEARIZEDtexture_size;  //  assign the out varying; a local here would shadow it
+    if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
+    {
+        //  For upsizing, we'll snap to texels and sample the nearest 4.
+        const float2 dxdy_scale = max(dxdy_min_scale, float2(1.0));
+        blur_dxdy = dxdy_scale * texture_size_inv;
+    }
+    else
+    {
+        const float2 dxdy_scale = dxdy_min_scale;
+        blur_dxdy = dxdy_scale * texture_size_inv;
+    }
+    //  tex2Dresize_gaussian4x4 needs to know a bit more than the other filters:
+    tex_uv_to_pixel_scale = IN.output_size *
+        ORIG_LINEARIZEDtexture_size / ORIG_LINEARIZEDvideo_size;
+    //texture_size_inv = texture_size_inv;
+
+    //  Detecting interlacing again here lets us apply convergence offsets in
+    //  this pass.  il_step_multiple contains the (texel, scanline) step
+    //  multiple: 1 for progressive, 2 for interlaced.
+    const float2 orig_video_size = ORIG_LINEARIZEDvideo_size;
+    const float y_step = 1.0 + float(is_interlaced(orig_video_size.y));
+    const float2 il_step_multiple = float2(1.0, y_step);
+    //  Get the uv distance between (texels, same-field scanlines):
+    uv_scanline_step = il_step_multiple / ORIG_LINEARIZEDtexture_size;
+}
+
+#pragma stage fragment
+#pragma format R8G8B8A8_SRGB
+layout(location = 0) in vec2 tex_uv;
+layout(location = 1) in vec2 blur_dxdy;
+layout(location = 2) in vec2 uv_scanline_step;
+layout(location = 3) in float estimated_viewport_size_x;
+layout(location = 4) in vec2 texture_size_inv;
+layout(location = 5) in vec2 tex_uv_to_pixel_scale;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED;
+layout(set = 0, binding = 4) uniform sampler2D Original;
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+#include "../../../../include/blur-functions.h"
 #include "bloom-functions.h"
+#include "../../../../include/gamma-management.h"
+
 
 ///////////////////////////////////  HELPERS  //////////////////////////////////
 
-vec3 tex2Dresize_gaussian4x4(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const vec2 texture_size, const vec2 texture_size_inv,
-    const vec2 tex_uv_to_pixel_scale, const float sigma)
+float3 tex2Dresize_gaussian4x4(sampler2D tex, float2 tex_uv, float2 dxdy, float2 tex_size, float2 texture_size_inv, float2 tex_uv_to_pixel_scale, float sigma)
 {
     //  Requires:   1.) All requirements of gamma-management.h must be satisfied!
     //              2.) filter_linearN must == "true" in your .cgp preset.
     //              3.) mipmap_inputN must == "true" in your .cgp preset if
     //                  IN.output_size << SRC.video_size.
     //              4.) dxdy should contain the uv pixel spacing:
-    //                      dxdy = max(vec2(1.0),
+    //                      dxdy = max(float2(1.0),
     //                          SRC.video_size/IN.output_size)/SRC.texture_size;
     //              5.) texture_size == SRC.texture_size
-    //              6.) texture_size_inv == vec2(1.0)/SRC.texture_size
+    //              6.) texture_size_inv == float2(1.0)/SRC.texture_size
     //              7.) tex_uv_to_pixel_scale == IN.output_size *
     //                      SRC.texture_size / SRC.video_size;
     //              8.) sigma is the desired Gaussian standard deviation, in
@@ -72,65 +197,65 @@ vec3 tex2Dresize_gaussian4x4(const sampler2D tex, const vec2 tex_uv,
     const float denom_inv = 0.5/(sigma*sigma);
     //  We're taking 4x4 samples, and we're snapping to texels for upsizing.
     //  Find texture coords for sample 5 (second row, second column):
-    const vec2 curr_texel = tex_uv * texture_size;
-    const vec2 prev_texel =
-        floor(curr_texel - vec2(under_half)) + vec2(0.5);
-    const vec2 prev_texel_uv = prev_texel * texture_size_inv;
-    const bvec2 snap = lessThanEqual(dxdy , texture_size_inv);
-    const vec2 sample5_downsize_uv = tex_uv - 0.5 * dxdy;
-    const vec2 sample5_uv = mix(sample5_downsize_uv, prev_texel_uv, snap);
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 prev_texel_uv = prev_texel * texture_size_inv;
+    const float2 snap = float2((dxdy.x <= texture_size_inv.x), (dxdy.y <= texture_size_inv.y));
+    const float2 sample5_downsize_uv = tex_uv - 0.5 * dxdy;
+    const float2 sample5_uv = lerp(sample5_downsize_uv, prev_texel_uv, snap);
     //  Compute texture coords for other samples:
-    const vec2 dx = vec2(dxdy.x, 0.0);
-    const vec2 sample0_uv = sample5_uv - dxdy;
-    const vec2 sample10_uv = sample5_uv + dxdy;
-    const vec2 sample15_uv = sample5_uv + 2.0 * dxdy;
-    const vec2 sample1_uv = sample0_uv + dx;
-    const vec2 sample2_uv = sample0_uv + 2.0 * dx;
-    const vec2 sample3_uv = sample0_uv + 3.0 * dx;
-    const vec2 sample4_uv = sample5_uv - dx;
-    const vec2 sample6_uv = sample5_uv + dx;
-    const vec2 sample7_uv = sample5_uv + 2.0 * dx;
-    const vec2 sample8_uv = sample10_uv - 2.0 * dx;
-    const vec2 sample9_uv = sample10_uv - dx;
-    const vec2 sample11_uv = sample10_uv + dx;
-    const vec2 sample12_uv = sample15_uv - 3.0 * dx;
-    const vec2 sample13_uv = sample15_uv - 2.0 * dx;
-    const vec2 sample14_uv = sample15_uv - dx;
+    const float2 dx = float2(dxdy.x, 0.0);
+    const float2 sample0_uv = sample5_uv - dxdy;
+    const float2 sample10_uv = sample5_uv + dxdy;
+    const float2 sample15_uv = sample5_uv + 2.0 * dxdy;
+    const float2 sample1_uv = sample0_uv + dx;
+    const float2 sample2_uv = sample0_uv + 2.0 * dx;
+    const float2 sample3_uv = sample0_uv + 3.0 * dx;
+    const float2 sample4_uv = sample5_uv - dx;
+    const float2 sample6_uv = sample5_uv + dx;
+    const float2 sample7_uv = sample5_uv + 2.0 * dx;
+    const float2 sample8_uv = sample10_uv - 2.0 * dx;
+    const float2 sample9_uv = sample10_uv - dx;
+    const float2 sample11_uv = sample10_uv + dx;
+    const float2 sample12_uv = sample15_uv - 3.0 * dx;
+    const float2 sample13_uv = sample15_uv - 2.0 * dx;
+    const float2 sample14_uv = sample15_uv - dx;
     //  Load each sample:
-    const vec3 sample0 = tex2D_linearize(tex, sample0_uv).rgb;
-    const vec3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
-    const vec3 sample2 = tex2D_linearize(tex, sample2_uv).rgb;
-    const vec3 sample3 = tex2D_linearize(tex, sample3_uv).rgb;
-    const vec3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
-    const vec3 sample5 = tex2D_linearize(tex, sample5_uv).rgb;
-    const vec3 sample6 = tex2D_linearize(tex, sample6_uv).rgb;
-    const vec3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
-    const vec3 sample8 = tex2D_linearize(tex, sample8_uv).rgb;
-    const vec3 sample9 = tex2D_linearize(tex, sample9_uv).rgb;
-    const vec3 sample10 = tex2D_linearize(tex, sample10_uv).rgb;
-    const vec3 sample11 = tex2D_linearize(tex, sample11_uv).rgb;
-    const vec3 sample12 = tex2D_linearize(tex, sample12_uv).rgb;
-    const vec3 sample13 = tex2D_linearize(tex, sample13_uv).rgb;
-    const vec3 sample14 = tex2D_linearize(tex, sample14_uv).rgb;
-    const vec3 sample15 = tex2D_linearize(tex, sample15_uv).rgb;
+    float3 sample0 = tex2D_linearize(tex, sample0_uv).rgb;
+    float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
+    float3 sample2 = tex2D_linearize(tex, sample2_uv).rgb;
+    float3 sample3 = tex2D_linearize(tex, sample3_uv).rgb;
+    float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
+    float3 sample5 = tex2D_linearize(tex, sample5_uv).rgb;
+    float3 sample6 = tex2D_linearize(tex, sample6_uv).rgb;
+    float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
+    float3 sample8 = tex2D_linearize(tex, sample8_uv).rgb;
+    float3 sample9 = tex2D_linearize(tex, sample9_uv).rgb;
+    float3 sample10 = tex2D_linearize(tex, sample10_uv).rgb;
+    float3 sample11 = tex2D_linearize(tex, sample11_uv).rgb;
+    float3 sample12 = tex2D_linearize(tex, sample12_uv).rgb;
+    float3 sample13 = tex2D_linearize(tex, sample13_uv).rgb;
+    float3 sample14 = tex2D_linearize(tex, sample14_uv).rgb;
+    float3 sample15 = tex2D_linearize(tex, sample15_uv).rgb;
     //  Compute destination pixel offsets for each sample:
-    const vec2 dest_pixel = tex_uv * tex_uv_to_pixel_scale;
-    const vec2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 dest_pixel = tex_uv * tex_uv_to_pixel_scale;
+    const float2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const float2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel;
     //  Compute Gaussian sample weights:
     const float w0 = exp(-LENGTH_SQ(sample0_offset) * denom_inv);
     const float w1 = exp(-LENGTH_SQ(sample1_offset) * denom_inv);
@@ -152,81 +277,13 @@ vec3 tex2Dresize_gaussian4x4(const sampler2D tex, const vec2 tex_uv,
         w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 +
         w8 +w9 + w10 + w11 + w12 + w13 + w14 + w15);
     //  Weight and sum the samples:
-    const vec3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+    const float3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
         w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
         w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
         w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15;
     return sum * weight_sum_inv;
 }
 
-#pragma stage vertex
-layout(location = 0) in vec4 Position;
-layout(location = 1) in vec2 TexCoord;
-layout(location = 0) out vec2 tex_uv;
-layout(location = 1) out float estimated_viewport_size_x;
-layout(location = 2) out vec2 blur_dxdy;
-layout(location = 3) out vec2 uv_scanline_step;
-layout(location = 4) out vec2 texture_size_inv;
-layout(location = 5) out vec2 tex_uv_to_pixel_scale;
-
-void main()
-{
-    //  This vertex shader copies blurs/vertex-shader-blur-one-pass-resize.h,
-    //  except we're using a different source image.
-   gl_Position = params.MVP * Position;
-   const vec2 video_uv = TexCoord;
-   tex_uv = video_uv;
-    //  The last pass (vertical scanlines) had a viewport y scale, so we can
-    //  use it to calculate a better runtime sigma:
-	estimated_viewport_size_x = registers.SourceSize.y * params.geom_aspect_ratio_x / params.geom_aspect_ratio_y;
-   
-    //  Get the uv sample distance between output pixels.  We're using a resize
-    //  blur, so arbitrary upsizing will be acceptable if filter_linearN =
-    //  "true," and arbitrary downsizing will be acceptable if mipmap_inputN =
-    //  "true" too.  The blur will be much more accurate if a true 4x4 Gaussian
-    //  resize is used instead of tex2Dblur3x3_resize (which samples between
-    //  texels even for upsizing).
-	const vec2 dxdy_min_scale = registers.ORIG_LINEARIZEDSize.xy * registers.OutputSize.zw;
-    texture_size_inv = registers.ORIG_LINEARIZEDSize.zw;
-    if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
-    {
-        //  For upsizing, we'll snap to texels and sample the nearest 4.
-        const vec2 dxdy_scale = max(dxdy_min_scale, vec2(1.0));
-        blur_dxdy = dxdy_scale * texture_size_inv;
-    }
-    else
-    {
-        const vec2 dxdy_scale = dxdy_min_scale;
-        blur_dxdy = dxdy_scale * texture_size_inv;
-	}
-	
-	tex_uv_to_pixel_scale = registers.OutputSize.xy;
-//  texture_size_inv = texture_size_inv; <- commented out because it's pointless in slang
-
-    //  Detecting interlacing again here lets us apply convergence offsets in
-    //  this pass.  il_step_multiple contains the (texel, scanline) step
-    //  multiple: 1 for progressive, 2 for interlaced.
-    const vec2 orig_video_size = registers.ORIG_LINEARIZEDSize.xy;
-	float interlace_check = 0.0;
-	if (is_interlaced(orig_video_size.y) == true) interlace_check = 1.0;
-    const float y_step = 1.0 + interlace_check;
-    const vec2 il_step_multiple = vec2(1.0, y_step);
-    //  Get the uv distance between (texels, same-field scanlines):
-    uv_scanline_step = il_step_multiple * registers.ORIG_LINEARIZEDSize.zw;
-}
-
-#pragma stage fragment
-#pragma format R8G8B8A8_SRGB
-layout(location = 0) in vec2 tex_uv;
-layout(location = 1) in float estimated_viewport_size_x;
-layout(location = 2) in vec2 blur_dxdy;
-layout(location = 3) in vec2 uv_scanline_step;
-layout(location = 4) in vec2 texture_size_inv;
-layout(location = 5) in vec2 tex_uv_to_pixel_scale;
-layout(location = 0) out vec4 FragColor;
-layout(set = 0, binding = 2) uniform sampler2D Source;
-layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED;
-
 void main()
 {
     //  Would a viewport-relative size work better for this pass?  (No.)
@@ -264,45 +321,52 @@ void main()
     //      bandwidth if it's done at a small constant scale.
     
     //  Get the constants we need to sample:
-	const vec2 texture_size = registers.ORIG_LINEARIZEDSize.xy;
-	vec2 tex_uv_r, tex_uv_g, tex_uv_b;
-	
-	if(beam_misconvergence == true)
+//    const sampler2D texture = ORIG_LINEARIZED.texture;
+//    const float2 tex_uv = tex_uv;
+//    const float2 blur_dxdy = blur_dxdy;
+    const float2 texture_size_ = ORIG_LINEARIZEDtexture_size;
+//    const float2 texture_size_inv = texture_size_inv;
+//    const float2 tex_uv_to_pixel_scale = tex_uv_to_pixel_scale;
+    float2 tex_uv_r, tex_uv_g, tex_uv_b;
+
+    if(beam_misconvergence)
     {
-        const vec2 convergence_offsets_r = vec2(params.convergence_offset_x_r, params.convergence_offset_y_r);//get_convergence_offsets_r_vector();
-        const vec2 convergence_offsets_g = vec2(params.convergence_offset_x_g, params.convergence_offset_y_g);//get_convergence_offsets_g_vector();
-        const vec2 convergence_offsets_b = vec2(params.convergence_offset_x_b, params.convergence_offset_y_b);//get_convergence_offsets_b_vector();
-        tex_uv_r = tex_uv - vec2(params.convergence_offset_x_r, params.convergence_offset_y_r) * uv_scanline_step;
-        tex_uv_g = tex_uv - vec2(params.convergence_offset_x_g, params.convergence_offset_y_g) * uv_scanline_step;
-        tex_uv_b = tex_uv - vec2(params.convergence_offset_x_b, params.convergence_offset_y_b) * uv_scanline_step;
+        const float2 uv_scanline_step = uv_scanline_step;
+        const float2 convergence_offsets_r = get_convergence_offsets_r_vector();
+        const float2 convergence_offsets_g = get_convergence_offsets_g_vector();
+        const float2 convergence_offsets_b = get_convergence_offsets_b_vector();
+        tex_uv_r = tex_uv - convergence_offsets_r * uv_scanline_step;
+        tex_uv_g = tex_uv - convergence_offsets_g * uv_scanline_step;
+        tex_uv_b = tex_uv - convergence_offsets_b * uv_scanline_step;
     }
-	//  Get the blur sigma:
-    const float bloom_approx_sigma = get_bloom_approx_sigma(registers.OutputSize.x, estimated_viewport_size_x);
-	
-	//  Sample the resized and blurred texture, and apply convergence offsets if
+    //  Get the blur sigma:
+    const float bloom_approx_sigma = get_bloom_approx_sigma(IN.output_size.x,
+        estimated_viewport_size_x);
+
+    //  Sample the resized and blurred texture, and apply convergence offsets if
     //  necessary.  Applying convergence offsets here triples our samples from
     //  16/9/1 to 48/27/3, but faster and easier than sampling BLOOM_APPROX and
     //  HALATION_BLUR 3 times at full resolution every time they're used.
-    vec3 color_r, color_g, color_b, color;
-	if(bloom_approx_filter > 1.5)
+    float3 color_r, color_g, color_b, color;
+    if(bloom_approx_filter > 1.5)
     {
         //  Use a 4x4 Gaussian resize.  This is slower but technically correct.
-        if(beam_misconvergence == true)
+        if(beam_misconvergence)
         {
             color_r = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_r,
-                blur_dxdy, texture_size, texture_size_inv,
+                blur_dxdy, texture_size_, texture_size_inv,
                 tex_uv_to_pixel_scale, bloom_approx_sigma);
             color_g = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_g,
-                blur_dxdy, texture_size, texture_size_inv,
+                blur_dxdy, texture_size_, texture_size_inv,
                 tex_uv_to_pixel_scale, bloom_approx_sigma);
             color_b = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_b,
-                blur_dxdy, texture_size, texture_size_inv,
+                blur_dxdy, texture_size_, texture_size_inv,
                 tex_uv_to_pixel_scale, bloom_approx_sigma);
         }
         else
         {
             color = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv,
-                blur_dxdy, texture_size, texture_size_inv,
+                blur_dxdy, texture_size_, texture_size_inv,
                 tex_uv_to_pixel_scale, bloom_approx_sigma);
         }
     }
@@ -311,7 +375,7 @@ void main()
         //  Use a 3x3 resize blur.  This is the softest option, because we're
         //  blurring already blurry bilinear samples.  It doesn't play quite as
         //  nicely with convergence offsets, but it has its charms.
-        if(beam_misconvergence == true)
+        if(beam_misconvergence)
         {
             color_r = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_r,
                 blur_dxdy, bloom_approx_sigma);
@@ -333,7 +397,7 @@ void main()
         //  too sharp above ~400x300, but the blurs break down above that
         //  resolution too, unless min_allowed_viewport_triads is high enough to
         //  keep bloom_approx_scale_x/min_allowed_viewport_triads < ~1.1658025.)
-        if(beam_misconvergence == true)
+        if(beam_misconvergence)
         {
             color_r = tex2D_linearize(ORIG_LINEARIZED, tex_uv_r).rgb;
             color_g = tex2D_linearize(ORIG_LINEARIZED, tex_uv_g).rgb;
@@ -344,11 +408,11 @@ void main()
             color = tex2D_linearize(ORIG_LINEARIZED, tex_uv).rgb;
         }
     }
-	//  Pack the colors from the red/green/blue beams into a single vector:
-    if(beam_misconvergence == true)
+    //  Pack the colors from the red/green/blue beams into a single vector:
+    if(beam_misconvergence)
     {
-        color = vec3(color_r.r, color_g.g, color_b.b);
+        color = float3(color_r.r, color_g.g, color_b.b);
     }
     //  Encode and output the blurred image:
-   FragColor = vec4(texture(ORIG_LINEARIZED, tex_uv));//vec4(color, 1.0);//
-}
+		FragColor = encode_output(float4(tex2D_linearize(ORIG_LINEARIZED, tex_uv)));
+}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-approx.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-approx.slang
index e07e58a..14d4e76 100755
--- a/crt/shaders/crt-royale/src/crt-royale-bloom-approx.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-bloom-approx.slang
@@ -1,354 +1,2 @@
 #version 450
-
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-	vec4 ORIG_LINEARIZEDSize;
-} registers;
-
-#include "params.inc"
-
-/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
-
-//  crt-royale: A full-featured CRT shader, with cheese.
-//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
-//
-//  This program is free software; you can redistribute it and/or modify it
-//  under the terms of the GNU General Public License as published by the Free
-//  Software Foundation; either version 2 of the License, or any later version.
-//
-//  This program is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
-//  more details.
-//
-//  You should have received a copy of the GNU General Public License along with
-//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-//  Place, Suite 330, Boston, MA 02111-1307 USA
-
-
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-#include "../user-settings.h"
-#include "derived-settings-and-constants.h"
-#include "bind-shader-params.h"
-#include "../../../../include/gamma-management.h"
-#include "../../../../include/blur-functions.h"
-#include "scanline-functions.h"
-#include "bloom-functions.h"
-
-///////////////////////////////////  HELPERS  //////////////////////////////////
-
-vec3 tex2Dresize_gaussian4x4(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const vec2 texture_size, const vec2 texture_size_inv,
-    const vec2 tex_uv_to_pixel_scale, const float sigma)
-{
-    //  Requires:   1.) All requirements of gamma-management.h must be satisfied!
-    //              2.) filter_linearN must == "true" in your .cgp preset.
-    //              3.) mipmap_inputN must == "true" in your .cgp preset if
-    //                  IN.output_size << SRC.video_size.
-    //              4.) dxdy should contain the uv pixel spacing:
-    //                      dxdy = max(vec2(1.0),
-    //                          SRC.video_size/IN.output_size)/SRC.texture_size;
-    //              5.) texture_size == SRC.texture_size
-    //              6.) texture_size_inv == vec2(1.0)/SRC.texture_size
-    //              7.) tex_uv_to_pixel_scale == IN.output_size *
-    //                      SRC.texture_size / SRC.video_size;
-    //              8.) sigma is the desired Gaussian standard deviation, in
-    //                  terms of output pixels.  It should be < ~0.66171875 to
-    //                  ensure the first unused sample (outside the 4x4 box) has
-    //                  a weight < 1.0/256.0.
-    //  Returns:    A true 4x4 Gaussian resize of the input.
-    //  Description:
-    //  Given correct inputs, this Gaussian resizer samples 4 pixel locations
-    //  along each downsized dimension and/or 4 texel locations along each
-    //  upsized dimension.  It computes dynamic weights based on the pixel-space
-    //  distance of each sample from the destination pixel.  It is arbitrarily
-    //  resizable and higher quality than tex2Dblur3x3_resize, but it's slower.
-    //  TODO: Move this to a more suitable file once there are others like it.
-    const float denom_inv = 0.5/(sigma*sigma);
-    //  We're taking 4x4 samples, and we're snapping to texels for upsizing.
-    //  Find texture coords for sample 5 (second row, second column):
-    const vec2 curr_texel = tex_uv * texture_size;
-    const vec2 prev_texel =
-        floor(curr_texel - vec2(under_half)) + vec2(0.5);
-    const vec2 prev_texel_uv = prev_texel * texture_size_inv;
-    const bvec2 snap = lessThanEqual(dxdy , texture_size_inv);
-    const vec2 sample5_downsize_uv = tex_uv - 0.5 * dxdy;
-    const vec2 sample5_uv = mix(sample5_downsize_uv, prev_texel_uv, snap);
-    //  Compute texture coords for other samples:
-    const vec2 dx = vec2(dxdy.x, 0.0);
-    const vec2 sample0_uv = sample5_uv - dxdy;
-    const vec2 sample10_uv = sample5_uv + dxdy;
-    const vec2 sample15_uv = sample5_uv + 2.0 * dxdy;
-    const vec2 sample1_uv = sample0_uv + dx;
-    const vec2 sample2_uv = sample0_uv + 2.0 * dx;
-    const vec2 sample3_uv = sample0_uv + 3.0 * dx;
-    const vec2 sample4_uv = sample5_uv - dx;
-    const vec2 sample6_uv = sample5_uv + dx;
-    const vec2 sample7_uv = sample5_uv + 2.0 * dx;
-    const vec2 sample8_uv = sample10_uv - 2.0 * dx;
-    const vec2 sample9_uv = sample10_uv - dx;
-    const vec2 sample11_uv = sample10_uv + dx;
-    const vec2 sample12_uv = sample15_uv - 3.0 * dx;
-    const vec2 sample13_uv = sample15_uv - 2.0 * dx;
-    const vec2 sample14_uv = sample15_uv - dx;
-    //  Load each sample:
-    const vec3 sample0 = tex2D_linearize(tex, sample0_uv).rgb;
-    const vec3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
-    const vec3 sample2 = tex2D_linearize(tex, sample2_uv).rgb;
-    const vec3 sample3 = tex2D_linearize(tex, sample3_uv).rgb;
-    const vec3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
-    const vec3 sample5 = tex2D_linearize(tex, sample5_uv).rgb;
-    const vec3 sample6 = tex2D_linearize(tex, sample6_uv).rgb;
-    const vec3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
-    const vec3 sample8 = tex2D_linearize(tex, sample8_uv).rgb;
-    const vec3 sample9 = tex2D_linearize(tex, sample9_uv).rgb;
-    const vec3 sample10 = tex2D_linearize(tex, sample10_uv).rgb;
-    const vec3 sample11 = tex2D_linearize(tex, sample11_uv).rgb;
-    const vec3 sample12 = tex2D_linearize(tex, sample12_uv).rgb;
-    const vec3 sample13 = tex2D_linearize(tex, sample13_uv).rgb;
-    const vec3 sample14 = tex2D_linearize(tex, sample14_uv).rgb;
-    const vec3 sample15 = tex2D_linearize(tex, sample15_uv).rgb;
-    //  Compute destination pixel offsets for each sample:
-    const vec2 dest_pixel = tex_uv * tex_uv_to_pixel_scale;
-    const vec2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel;
-    const vec2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel;
-    //  Compute Gaussian sample weights:
-    const float w0 = exp(-LENGTH_SQ(sample0_offset) * denom_inv);
-    const float w1 = exp(-LENGTH_SQ(sample1_offset) * denom_inv);
-    const float w2 = exp(-LENGTH_SQ(sample2_offset) * denom_inv);
-    const float w3 = exp(-LENGTH_SQ(sample3_offset) * denom_inv);
-    const float w4 = exp(-LENGTH_SQ(sample4_offset) * denom_inv);
-    const float w5 = exp(-LENGTH_SQ(sample5_offset) * denom_inv);
-    const float w6 = exp(-LENGTH_SQ(sample6_offset) * denom_inv);
-    const float w7 = exp(-LENGTH_SQ(sample7_offset) * denom_inv);
-    const float w8 = exp(-LENGTH_SQ(sample8_offset) * denom_inv);
-    const float w9 = exp(-LENGTH_SQ(sample9_offset) * denom_inv);
-    const float w10 = exp(-LENGTH_SQ(sample10_offset) * denom_inv);
-    const float w11 = exp(-LENGTH_SQ(sample11_offset) * denom_inv);
-    const float w12 = exp(-LENGTH_SQ(sample12_offset) * denom_inv);
-    const float w13 = exp(-LENGTH_SQ(sample13_offset) * denom_inv);
-    const float w14 = exp(-LENGTH_SQ(sample14_offset) * denom_inv);
-    const float w15 = exp(-LENGTH_SQ(sample15_offset) * denom_inv);
-    const float weight_sum_inv = 1.0/(
-        w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 +
-        w8 +w9 + w10 + w11 + w12 + w13 + w14 + w15);
-    //  Weight and sum the samples:
-    const vec3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
-        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
-        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
-        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15;
-    return sum * weight_sum_inv;
-}
-
-#pragma stage vertex
-layout(location = 0) in vec4 Position;
-layout(location = 1) in vec2 TexCoord;
-layout(location = 0) out vec2 tex_uv;
-layout(location = 1) out float estimated_viewport_size_x;
-layout(location = 2) out vec2 blur_dxdy;
-layout(location = 3) out vec2 uv_scanline_step;
-layout(location = 4) out vec2 texture_size_inv;
-layout(location = 5) out vec2 tex_uv_to_pixel_scale;
-
-void main()
-{
-    //  This vertex shader copies blurs/vertex-shader-blur-one-pass-resize.h,
-    //  except we're using a different source image.
-   gl_Position = params.MVP * Position;
-   const vec2 video_uv = TexCoord;
-   tex_uv = video_uv;
-    //  The last pass (vertical scanlines) had a viewport y scale, so we can
-    //  use it to calculate a better runtime sigma:
-	estimated_viewport_size_x = registers.SourceSize.y * params.geom_aspect_ratio_x / params.geom_aspect_ratio_y;
-   
-    //  Get the uv sample distance between output pixels.  We're using a resize
-    //  blur, so arbitrary upsizing will be acceptable if filter_linearN =
-    //  "true," and arbitrary downsizing will be acceptable if mipmap_inputN =
-    //  "true" too.  The blur will be much more accurate if a true 4x4 Gaussian
-    //  resize is used instead of tex2Dblur3x3_resize (which samples between
-    //  texels even for upsizing).
-	const vec2 dxdy_min_scale = registers.ORIG_LINEARIZEDSize.xy * registers.OutputSize.zw;
-    texture_size_inv = registers.ORIG_LINEARIZEDSize.zw;
-    if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
-    {
-        //  For upsizing, we'll snap to texels and sample the nearest 4.
-        const vec2 dxdy_scale = max(dxdy_min_scale, vec2(1.0));
-        blur_dxdy = dxdy_scale * texture_size_inv;
-    }
-    else
-    {
-        const vec2 dxdy_scale = dxdy_min_scale;
-        blur_dxdy = dxdy_scale * texture_size_inv;
-	}
-	
-	tex_uv_to_pixel_scale = registers.OutputSize.xy;
-//  texture_size_inv = texture_size_inv; <- commented out because it's pointless in slang
-
-    //  Detecting interlacing again here lets us apply convergence offsets in
-    //  this pass.  il_step_multiple contains the (texel, scanline) step
-    //  multiple: 1 for progressive, 2 for interlaced.
-    const vec2 orig_video_size = registers.ORIG_LINEARIZEDSize.xy;
-	float interlace_check = 0.0;
-	if (is_interlaced(orig_video_size.y) == true) interlace_check = 1.0;
-    const float y_step = 1.0 + interlace_check;
-    const vec2 il_step_multiple = vec2(1.0, y_step);
-    //  Get the uv distance between (texels, same-field scanlines):
-    uv_scanline_step = il_step_multiple * registers.ORIG_LINEARIZEDSize.zw;
-}
-
-#pragma stage fragment
-#pragma format R8G8B8A8_SRGB
-layout(location = 0) in vec2 tex_uv;
-layout(location = 1) in float estimated_viewport_size_x;
-layout(location = 2) in vec2 blur_dxdy;
-layout(location = 3) in vec2 uv_scanline_step;
-layout(location = 4) in vec2 texture_size_inv;
-layout(location = 5) in vec2 tex_uv_to_pixel_scale;
-layout(location = 0) out vec4 FragColor;
-layout(set = 0, binding = 2) uniform sampler2D Source;
-layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED;
-
-void main()
-{
-    //  Would a viewport-relative size work better for this pass?  (No.)
-    //  PROS:
-    //  1.) Instead of writing an absolute size to user-cgp-constants.h, we'd
-    //      write a viewport scale.  That number could be used to directly scale
-    //      the viewport-resolution bloom sigma and/or triad size to a smaller
-    //      scale.  This way, we could calculate an optimal dynamic sigma no
-    //      matter how the dot pitch is specified.
-    //  CONS:
-    //  1.) Texel smearing would be much worse at small viewport sizes, but
-    //      performance would be much worse at large viewport sizes, so there
-    //      would be no easy way to calculate a decent scale.
-    //  2.) Worse, we could no longer get away with using a constant-size blur!
-    //      Instead, we'd have to face all the same difficulties as the real
-    //      phosphor bloom, which requires static #ifdefs to decide the blur
-    //      size based on the expected triad size...a dynamic value.
-    //  3.) Like the phosphor bloom, we'd have less control over making the blur
-    //      size correct for an optical blur.  That said, we likely overblur (to
-    //      maintain brightness) more than the eye would do by itself: 20/20
-    //      human vision distinguishes ~1 arc minute, or 1/60 of a degree.  The
-    //      highest viewing angle recommendation I know of is THX's 40.04 degree
-    //      recommendation, at which 20/20 vision can distinguish about 2402.4
-    //      lines.  Assuming the "TV lines" definition, that means 1201.2
-    //      distinct light lines and 1201.2 distinct dark lines can be told
-    //      apart, i.e. 1201.2 pairs of lines.  This would correspond to 1201.2
-    //      pairs of alternating lit/unlit phosphors, so 2402.4 phosphors total
-    //      (if they're alternately lit).  That's a max of 800.8 triads.  Using
-    //      a more popular 30 degree viewing angle recommendation, 20/20 vision
-    //      can distinguish 1800 lines, or 600 triads of alternately lit
-    //      phosphors.  In contrast, we currently blur phosphors all the way
-    //      down to 341.3 triads to ensure full brightness.
-    //  4.) Realistically speaking, we're usually just going to use bilinear
-    //      filtering in this pass anyway, but it only works well to limit
-    //      bandwidth if it's done at a small constant scale.
-    
-    //  Get the constants we need to sample:
-	const vec2 texture_size = registers.ORIG_LINEARIZEDSize.xy;
-	vec2 tex_uv_r, tex_uv_g, tex_uv_b;
-	
-	if(beam_misconvergence == true)
-    {
-        const vec2 convergence_offsets_r = vec2(params.convergence_offset_x_r, params.convergence_offset_y_r);//get_convergence_offsets_r_vector();
-        const vec2 convergence_offsets_g = vec2(params.convergence_offset_x_g, params.convergence_offset_y_g);//get_convergence_offsets_g_vector();
-        const vec2 convergence_offsets_b = vec2(params.convergence_offset_x_b, params.convergence_offset_y_b);//get_convergence_offsets_b_vector();
-        tex_uv_r = tex_uv - vec2(params.convergence_offset_x_r, params.convergence_offset_y_r) * uv_scanline_step;
-        tex_uv_g = tex_uv - vec2(params.convergence_offset_x_g, params.convergence_offset_y_g) * uv_scanline_step;
-        tex_uv_b = tex_uv - vec2(params.convergence_offset_x_b, params.convergence_offset_y_b) * uv_scanline_step;
-    }
-	//  Get the blur sigma:
-    const float bloom_approx_sigma = get_bloom_approx_sigma(registers.OutputSize.x, estimated_viewport_size_x);
-	
-	//  Sample the resized and blurred texture, and apply convergence offsets if
-    //  necessary.  Applying convergence offsets here triples our samples from
-    //  16/9/1 to 48/27/3, but faster and easier than sampling BLOOM_APPROX and
-    //  HALATION_BLUR 3 times at full resolution every time they're used.
-    vec3 color_r, color_g, color_b, color;
-	if(bloom_approx_filter > 1.5)
-    {
-        //  Use a 4x4 Gaussian resize.  This is slower but technically correct.
-        if(beam_misconvergence == true)
-        {
-            color_r = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_r,
-                blur_dxdy, texture_size, texture_size_inv,
-                tex_uv_to_pixel_scale, bloom_approx_sigma);
-            color_g = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_g,
-                blur_dxdy, texture_size, texture_size_inv,
-                tex_uv_to_pixel_scale, bloom_approx_sigma);
-            color_b = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_b,
-                blur_dxdy, texture_size, texture_size_inv,
-                tex_uv_to_pixel_scale, bloom_approx_sigma);
-        }
-        else
-        {
-            color = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv,
-                blur_dxdy, texture_size, texture_size_inv,
-                tex_uv_to_pixel_scale, bloom_approx_sigma);
-        }
-    }
-    else if(bloom_approx_filter > 0.5)
-    {
-        //  Use a 3x3 resize blur.  This is the softest option, because we're
-        //  blurring already blurry bilinear samples.  It doesn't play quite as
-        //  nicely with convergence offsets, but it has its charms.
-        if(beam_misconvergence == true)
-        {
-            color_r = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_r,
-                blur_dxdy, bloom_approx_sigma);
-            color_g = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_g,
-                blur_dxdy, bloom_approx_sigma);
-            color_b = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_b,
-                blur_dxdy, bloom_approx_sigma);
-        }
-        else
-        {
-            color = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv, blur_dxdy);
-        }
-    }
-    else
-    {
-        //  Use bilinear sampling.  This approximates a 4x4 Gaussian resize MUCH
-        //  better than tex2Dblur3x3_resize for the very small sigmas we're
-        //  likely to use at small output resolutions.  (This estimate becomes
-        //  too sharp above ~400x300, but the blurs break down above that
-        //  resolution too, unless min_allowed_viewport_triads is high enough to
-        //  keep bloom_approx_scale_x/min_allowed_viewport_triads < ~1.1658025.)
-        if(beam_misconvergence == true)
-        {
-            color_r = tex2D_linearize(ORIG_LINEARIZED, tex_uv_r).rgb;
-            color_g = tex2D_linearize(ORIG_LINEARIZED, tex_uv_g).rgb;
-            color_b = tex2D_linearize(ORIG_LINEARIZED, tex_uv_b).rgb;
-        }
-        else
-        {
-            color = tex2D_linearize(ORIG_LINEARIZED, tex_uv).rgb;
-        }
-    }
-	//  Pack the colors from the red/green/blue beams into a single vector:
-    if(beam_misconvergence == true)
-    {
-        color = vec3(color_r.r, color_g.g, color_b.b);
-    }
-    //  Encode and output the blurred image:
-   FragColor = vec4(color, 1.0);//vec4(texture(ORIG_LINEARIZED, tex_uv));//
-}
+#include "crt-royale-bloom-approx.h"
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang
index 8708b7b..e74cb02 100755
--- a/crt/shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang
@@ -1,17 +1,5 @@
 #version 450
 
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OutputSize;
-	vec4 ORIG_LINEARIZEDSize;
-	vec4 HALATION_BLURSize;
-	vec4 MASKED_SCANLINESSize;
-	vec4 BRIGHTPASSSize;
-} registers;
-
-#include "params.inc"
-
 /////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
 
 //  crt-royale: A full-featured CRT shader, with cheese.
@@ -30,18 +18,93 @@ layout(push_constant) uniform Push
 //  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 //  Place, Suite 330, Boston, MA 02111-1307 USA
 
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO
+{
+	mat4 MVP;
+	float crt_gamma;
+	float lcd_gamma;
+	float levels_contrast;
+	float halation_weight;
+	float diffusion_weight;
+	float bloom_underestimate_levels;
+	float bloom_excess;
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;
+	float convergence_offset_x_r;
+	float convergence_offset_x_g;
+	float convergence_offset_x_b;
+	float convergence_offset_y_r;
+	float convergence_offset_y_g;
+	float convergence_offset_y_b;
+	float mask_type;
+	float mask_sample_mode_desired;
+	float mask_num_triads_desired;
+	float mask_triad_size_desired;
+	float mask_specify_num_triads;
+	float aa_subpixel_r_offset_x_runtime;
+	float aa_subpixel_r_offset_y_runtime;
+	float aa_cubic_c;
+	float aa_gauss_sigma;
+	float geom_mode_runtime;
+	float geom_radius;
+	float geom_view_dist;
+	float geom_tilt_angle_x;
+	float geom_tilt_angle_y;
+	float geom_aspect_ratio_x;
+	float geom_aspect_ratio_y;
+	float geom_overscan_x;
+	float geom_overscan_y;
+	float border_size;
+	float border_darkness;
+	float border_compress;
+	float interlace_bff;
+	float interlace_1080i;
+	vec4 MASKED_SCANLINESSize;
+	vec4 HALATION_BLURSize;
+	vec4 BRIGHTPASSSize;
+} global;
+
+#define MASKED_SCANLINEStexture MASKED_SCANLINES
+#define MASKED_SCANLINEStexture_size global.MASKED_SCANLINESSize.xy
+#define MASKED_SCANLINESvideo_size global.MASKED_SCANLINESSize.xy
+#define HALATION_BLURtexture HALATION_BLUR
+#define HALATION_BLURtexture_size global.HALATION_BLURSize.xy
+#define HALATION_BLURvideo_size global.HALATION_BLURSize.xy
+#define BRIGHTPASStexture BRIGHTPASS
+#define BRIGHTPASStexture_size global.BRIGHTPASSSize.xy
+#define BRIGHTPASSvideo_size global.BRIGHTPASSSize.xy
+
+float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y;
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+const float bloom_diff_thresh_ = 1.0/256.0;
 
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 
+#include "params.inc"
+#include "../../../../include/compat_macros.inc"
 #include "../user-settings.h"
 #include "derived-settings-and-constants.h"
 #include "bind-shader-params.h"
 
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
+///////////////////////////////  VERTEX INCLUDES  //////////////////////////////
 
 #include "../../../../include/gamma-management.h"
-#include "bloom-functions.h"
 #include "phosphor-mask-resizing.h"
 #include "scanline-functions.h"
 
@@ -56,34 +119,52 @@ layout(location = 4) out vec2 bloom_tex_uv;
 layout(location = 5) out vec2 bloom_dxdy;
 layout(location = 6) out float bloom_sigma_runtime;
 
+//  Copied from bloom-functions.h (now #included for the fragment stage only):
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Requires:   1.) triad_size is the final phosphor triad size in pixels
+    //              2.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    The minimum sigma that will fully blur a phosphor
+    //              triad on the screen to an even color, within thresh.
+    //              This closed-form function was found by curve-fitting data.
+    //  Estimate: max error = ~0.086036, mean sq. error = ~0.0013387:
+    return -0.05168 + 0.6113*triad_size -
+        1.122*triad_size*sqrt(0.000416 + thresh);
+    //  Estimate: max error = ~0.16486, mean sq. error = ~0.0041041:
+    //return 0.5985*triad_size - triad_size*sqrt(thresh)
+}
+
 void main()
 {
-   gl_Position = params.MVP * Position;
-   video_uv = TexCoord;
+   gl_Position = global.MVP * Position;
+   float2 tex_uv = TexCoord;
    
     //  Our various input textures use different coords:
-    scanline_tex_uv = video_uv * registers.MASKED_SCANLINESSize.xy *
-        registers.MASKED_SCANLINESSize.zw;
-    halation_tex_uv = video_uv * registers.HALATION_BLURSize.xy *
-        registers.HALATION_BLURSize.zw;
-    brightpass_tex_uv = video_uv * registers.BRIGHTPASSSize.xy *
-        registers.BRIGHTPASSSize.zw;
-    bloom_tex_uv = TexCoord;
+    //  Assign the video_uv stage output; a same-named local would shadow it:
+    video_uv = tex_uv * IN.texture_size/IN.video_size;
+    scanline_tex_uv = video_uv * MASKED_SCANLINESvideo_size /
+        MASKED_SCANLINEStexture_size;
+    halation_tex_uv = video_uv * HALATION_BLURvideo_size /
+        HALATION_BLURtexture_size;
+    brightpass_tex_uv = video_uv * BRIGHTPASSvideo_size /
+        BRIGHTPASStexture_size;
+    bloom_tex_uv = tex_uv;
 
     //  We're horizontally blurring the bloom input (vertically blurred
     //  brightpass).  Get the uv distance between output pixels / input texels
     //  in the horizontal direction (this pass must NOT resize):
-    bloom_dxdy = vec2(registers.SourceSize.z, 0.0);
+    bloom_dxdy = float2(1.0/IN.texture_size.x, 0.0);
 
     //  Calculate a runtime bloom_sigma in case it's needed:
     const float mask_tile_size_x = get_resized_mask_tile_size(
-        registers.OutputSize.xy, registers.OutputSize.xy * mask_resize_viewport_scale, false).x;
+        IN.output_size, IN.output_size * mask_resize_viewport_scale, false).x;
     bloom_sigma_runtime = get_min_sigma_to_blur_triad(
-        mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh);
+        mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_);
 }
 
 #pragma stage fragment
-#pragma format R8G8B8A8_SRGB
 layout(location = 0) in vec2 video_uv;
 layout(location = 1) in vec2 scanline_tex_uv;
 layout(location = 2) in vec2 halation_tex_uv;
@@ -93,40 +174,45 @@ layout(location = 5) in vec2 bloom_dxdy;
 layout(location = 6) in float bloom_sigma_runtime;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
-layout(set = 0, binding = 3) uniform sampler2D MASKED_SCANLINES;
-layout(set = 0, binding = 4) uniform sampler2D HALATION_BLUR;
-layout(set = 0, binding = 5) uniform sampler2D BRIGHTPASS;
+layout(set = 0, binding = 3) uniform sampler2D HALATION_BLUR;
+layout(set = 0, binding = 4) uniform sampler2D BRIGHTPASS;
+layout(set = 0, binding = 5) uniform sampler2D MASKED_SCANLINES;
+#define bloom_texture Source
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+#include "bloom-functions.h"
 
 void main()
 {
-//  Blur the vertically blurred brightpass horizontally by 9/17/25/43x:
+    //  Blur the vertically blurred brightpass horizontally by 9/17/25/43x:
     const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime);
-    const vec3 blurred_brightpass = tex2DblurNfast(Source,
+    const float3 blurred_brightpass = tex2DblurNfast(bloom_texture,
         bloom_tex_uv, bloom_dxdy, bloom_sigma);
 
-//  Sample the masked scanlines.  Alpha contains the auto-dim factor:
-    const vec3 intensity_dim =
-        tex2D_linearize(MASKED_SCANLINES, scanline_tex_uv).rgb;
+    //  Sample the masked scanlines.  Auto-dim comes from levels_autodim_temp:
+    const float3 intensity_dim =
+        tex2D_linearize(MASKED_SCANLINEStexture, scanline_tex_uv).rgb;
     const float auto_dim_factor = levels_autodim_temp;
     const float undim_factor = 1.0/auto_dim_factor;
-	
-	//  Calculate the mask dimpass, add it to the blurred brightpass, and
+
+    //  Calculate the mask dimpass, add it to the blurred brightpass, and
     //  undim (from scanline auto-dim) and amplify (from mask dim) the result:
     const float mask_amplify = get_mask_amplify();
-    const vec3 brightpass = tex2D_linearize(BRIGHTPASS,
+    const float3 brightpass = tex2D_linearize(BRIGHTPASStexture,
         brightpass_tex_uv).rgb;
-    const vec3 dimpass = intensity_dim - brightpass;
-    const vec3 phosphor_bloom = (dimpass + blurred_brightpass) *
-        mask_amplify * undim_factor * params.levels_contrast;
-		
-	//  Sample the halation texture, and let some light bleed into refractive
+    const float3 dimpass = intensity_dim - brightpass;
+    const float3 phosphor_bloom = (dimpass + blurred_brightpass) *
+        mask_amplify * undim_factor * levels_contrast;
+
+    //  Sample the halation texture, and let some light bleed into refractive
     //  diffusion.  Conceptually this occurs before the phosphor bloom, but
     //  adding it in earlier passes causes black crush in the diffusion colors.
-    const vec3 diffusion_color = params.levels_contrast * tex2D_linearize(
-        HALATION_BLUR, halation_tex_uv).rgb;
-    const vec3 final_bloom = mix(phosphor_bloom,
-        diffusion_color, params.diffusion_weight);
-		
-	//  Encode and output the bloomed image:
-   FragColor = encode_output(vec4(final_bloom, 1.0));
-}
+    const float3 diffusion_color = levels_contrast * tex2D_linearize(
+        HALATION_BLURtexture, halation_tex_uv).rgb;
+    const float3 final_bloom = lerp(phosphor_bloom,
+        diffusion_color, global.diffusion_weight);
+
+    //  Encode and output the bloomed image:
+    FragColor = encode_output(float4(final_bloom, 1.0));
+}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-vertical.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-vertical.slang
index 06b5a99..c0e95f8 100755
--- a/crt/shaders/crt-royale/src/crt-royale-bloom-vertical.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-bloom-vertical.slang
@@ -1,15 +1,5 @@
 #version 450
 
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-} registers;
-
-#include "params.inc"
-
 /////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
 
 //  crt-royale: A full-featured CRT shader, with cheese.
@@ -28,20 +18,84 @@ layout(push_constant) uniform Push
 //  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 //  Place, Suite 330, Boston, MA 02111-1307 USA
 
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO
+{
+	mat4 MVP;
+	float crt_gamma;
+	float lcd_gamma;
+	float levels_contrast;
+	float halation_weight;
+	float diffusion_weight;
+	float bloom_underestimate_levels;
+	float bloom_excess;
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;
+	float convergence_offset_x_r;
+	float convergence_offset_x_g;
+	float convergence_offset_x_b;
+	float convergence_offset_y_r;
+	float convergence_offset_y_g;
+	float convergence_offset_y_b;
+	float mask_type;
+	float mask_sample_mode_desired;
+	float mask_num_triads_desired;
+	float mask_triad_size_desired;
+	float mask_specify_num_triads;
+	float aa_subpixel_r_offset_x_runtime;
+	float aa_subpixel_r_offset_y_runtime;
+	float aa_cubic_c;
+	float aa_gauss_sigma;
+	float geom_mode_runtime;
+	float geom_radius;
+	float geom_view_dist;
+	float geom_tilt_angle_x;
+	float geom_tilt_angle_y;
+	float geom_aspect_ratio_x;
+	float geom_aspect_ratio_y;
+	float geom_overscan_x;
+	float geom_overscan_y;
+	float border_size;
+	float border_darkness;
+	float border_compress;
+	float interlace_bff;
+	float interlace_1080i;
+	vec4 MASKED_SCANLINESSize;
+	vec4 BLOOM_APPROXSize;
+} global;
 
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 
+#include "params.inc"
+#include "../../../../include/compat_macros.inc"
 #include "../user-settings.h"
 #include "derived-settings-and-constants.h"
 #include "bind-shader-params.h"
 
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
 #include "../../../../include/gamma-management.h"
-#include "bloom-functions.h"
 #include "phosphor-mask-resizing.h"
 
+float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y;
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+const float bloom_diff_thresh_ = 1.0/256.0;
+
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
 layout(location = 1) in vec2 TexCoord;
@@ -49,23 +103,40 @@ layout(location = 0) out vec2 tex_uv;
 layout(location = 1) out vec2 bloom_dxdy;
 layout(location = 2) out float bloom_sigma_runtime;
 
+// Copied from bloom-functions.h, which is now #included only after "#pragma stage fragment":
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Requires:   1.) triad_size is the final phosphor triad size in pixels
+    //              2.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    The minimum sigma that will fully blur a phosphor
+    //              triad on the screen to an even color, within thresh.
+    //              This closed-form function was found by curve-fitting data.
+    //  Estimate: max error = ~0.086036, mean sq. error = ~0.0013387:
+    return -0.05168 + 0.6113*triad_size -
+        1.122*triad_size*sqrt(0.000416 + thresh);
+    //  Estimate: max error = ~0.16486, mean sq. error = ~0.0041041:
+    //return 0.5985*triad_size - triad_size*sqrt(thresh)
+}
+
 void main()
 {
-   gl_Position = params.MVP * Position;
-   tex_uv = TexCoord;
+   gl_Position = global.MVP * Position;
+   tex_uv = TexCoord * 1.0001;
    
-    //  Get the uv sample distance between output pixels.  Calculate dxdy like
+	//  Get the uv sample distance between output pixels.  Calculate dxdy like
     //  blurs/vertex-shader-blur-fast-vertical.h.
-    const vec2 dxdy_scale = registers.SourceSize.xy * registers.OutputSize.zw;
-    const vec2 dxdy = dxdy_scale * registers.SourceSize.zw;
+    const float2 dxdy_scale = IN.video_size/IN.output_size;
+    const float2 dxdy = dxdy_scale/IN.texture_size;
     //  This blur is vertical-only, so zero out the vertical offset:
-    bloom_dxdy = vec2(0.0, dxdy.y);
+    bloom_dxdy = float2(0.0, dxdy.y);
 
     //  Calculate a runtime bloom_sigma in case it's needed:
     const float mask_tile_size_x = get_resized_mask_tile_size(
-        registers.OutputSize.xy, registers.OutputSize.xy * mask_resize_viewport_scale, false).x;
+        IN.output_size, IN.output_size * mask_resize_viewport_scale, false).x;
     bloom_sigma_runtime = get_min_sigma_to_blur_triad(
-        mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh);
+        mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_);
 }
 
 #pragma stage fragment
@@ -75,13 +146,18 @@ layout(location = 1) in vec2 bloom_dxdy;
 layout(location = 2) in float bloom_sigma_runtime;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+#include "bloom-functions.h"
 
 void main()
 {
     //  Blur the brightpass horizontally with a 9/17/25/43x blur:
     const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime);
-    const vec3 color = tex2DblurNfast(Source, tex_uv,
+    const float3 color = tex2DblurNfast(input_texture, tex_uv,
         bloom_dxdy, bloom_sigma);
     //  Encode and output the blurred image:
-   FragColor = encode_output(vec4(color, 1.0));
-}
+    FragColor = encode_output(float4(color, 1.0));
+}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-brightpass.slang b/crt/shaders/crt-royale/src/crt-royale-brightpass.slang
index 806b717..bac816c 100755
--- a/crt/shaders/crt-royale/src/crt-royale-brightpass.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-brightpass.slang
@@ -1,17 +1,5 @@
 #version 450
 
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-	vec4 MASKED_SCANLINESSize;
-	vec4 BLOOM_APPROXSize;
-} registers;
-
-#include "params.inc"
-
 /////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
 
 //  crt-royale: A full-featured CRT shader, with cheese.
@@ -30,114 +18,198 @@ layout(push_constant) uniform Push
 //  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 //  Place, Suite 330, Boston, MA 02111-1307 USA
 
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO
+{
+	mat4 MVP;
+	float crt_gamma;
+	float lcd_gamma;
+	float levels_contrast;
+	float halation_weight;
+	float diffusion_weight;
+	float bloom_underestimate_levels;
+	float bloom_excess;
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;
+	float convergence_offset_x_r;
+	float convergence_offset_x_g;
+	float convergence_offset_x_b;
+	float convergence_offset_y_r;
+	float convergence_offset_y_g;
+	float convergence_offset_y_b;
+	float mask_type;
+	float mask_sample_mode_desired;
+	float mask_num_triads_desired;
+	float mask_triad_size_desired;
+	float mask_specify_num_triads;
+	float aa_subpixel_r_offset_x_runtime;
+	float aa_subpixel_r_offset_y_runtime;
+	float aa_cubic_c;
+	float aa_gauss_sigma;
+	float geom_mode_runtime;
+	float geom_radius;
+	float geom_view_dist;
+	float geom_tilt_angle_x;
+	float geom_tilt_angle_y;
+	float geom_aspect_ratio_x;
+	float geom_aspect_ratio_y;
+	float geom_overscan_x;
+	float geom_overscan_y;
+	float border_size;
+	float border_darkness;
+	float border_compress;
+	float interlace_bff;
+	float interlace_1080i;
+	vec4 MASKED_SCANLINESSize;
+	vec4 BLOOM_APPROXSize;
+} global;
+
+#define MASKED_SCANLINEStexture MASKED_SCANLINES
+#define MASKED_SCANLINEStexture_size global.MASKED_SCANLINESSize.xy
+#define MASKED_SCANLINESvideo_size global.MASKED_SCANLINESSize.xy
+#define BLOOM_APPROXtexture BLOOM_APPROX
+#define BLOOM_APPROXtexture_size global.BLOOM_APPROXSize.xy
+#define BLOOM_APPROXvideo_size global.BLOOM_APPROXSize.xy
+
+float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y;
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+const float bloom_diff_thresh_ = 1.0/256.0;
 
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 
+#include "params.inc"
+#include "../../../../include/compat_macros.inc"
 #include "../user-settings.h"
 #include "derived-settings-and-constants.h"
 #include "bind-shader-params.h"
 
 
-//////////////////////////////////  INCLUDES  //////////////////////////////////
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 
 #include "../../../../include/gamma-management.h"
-#include "../../../../include/blur-functions.h"
 #include "phosphor-mask-resizing.h"
 #include "scanline-functions.h"
-#include "bloom-functions.h"
-
 
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
 layout(location = 1) in vec2 TexCoord;
-layout(location = 0) out vec2 video_uv;
-layout(location = 1) out vec2 scanline_tex_uv;
+layout(location = 0) out vec2 scanline_tex_uv;
+layout(location = 1) out vec2 blur3x3_tex_uv;
 layout(location = 2) out float bloom_sigma_runtime;
-layout(location = 3) out vec2 blur3x3_tex_uv;
+
+// Copied from bloom-functions.h, which is now #included only after "#pragma stage fragment":
+inline float get_min_sigma_to_blur_triad(const float triad_size,
+    const float thresh)
+{
+    //  Requires:   1.) triad_size is the final phosphor triad size in pixels
+    //              2.) thresh is the max desired pixel difference in the
+    //                  blurred triad (e.g. 1.0/256.0).
+    //  Returns:    The minimum sigma that will fully blur a phosphor
+    //              triad on the screen to an even color, within thresh.
+    //              This closed-form function was found by curve-fitting data.
+    //  Estimate: max error = ~0.086036, mean sq. error = ~0.0013387:
+    return -0.05168 + 0.6113*triad_size -
+        1.122*triad_size*sqrt(0.000416 + thresh);
+    //  Estimate: max error = ~0.16486, mean sq. error = ~0.0041041:
+    //return 0.5985*triad_size - triad_size*sqrt(thresh)
+}
 
 void main()
 {
-   gl_Position = params.MVP * Position;
-   const vec2 tex_uv = TexCoord;
+   gl_Position = global.MVP * Position;
+   float2 tex_uv = TexCoord;
     //  Our various input textures use different coords:
-    video_uv = tex_uv;
-    scanline_tex_uv = video_uv * registers.MASKED_SCANLINESSize.xy *
-        registers.MASKED_SCANLINESSize.zw;
-    blur3x3_tex_uv = video_uv * registers.BLOOM_APPROXSize.xy * registers.BLOOM_APPROXSize.zw;
+    float2 video_uv = tex_uv * IN.texture_size/IN.video_size;
+    //  (video_uv is a local here; only the per-texture coords below are output.)
+    scanline_tex_uv = video_uv * MASKED_SCANLINESvideo_size /
+        MASKED_SCANLINEStexture_size;
+    blur3x3_tex_uv = video_uv * BLOOM_APPROXvideo_size / BLOOM_APPROXtexture_size;
 
     //  Calculate a runtime bloom_sigma in case it's needed:
     const float mask_tile_size_x = get_resized_mask_tile_size(
-        registers.OutputSize.xy, registers.OutputSize.xy * mask_resize_viewport_scale, false).x;
+        IN.output_size, IN.output_size * mask_resize_viewport_scale, false).x;
     bloom_sigma_runtime = get_min_sigma_to_blur_triad(
-        mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh);
+        mask_tile_size_x / mask_triads_per_tile, bloom_diff_thresh_);
 }
 
 #pragma stage fragment
-#pragma format R8G8B8A8_SRGB
-layout(location = 0) in vec2 video_uv;
-layout(location = 1) in vec2 scanline_tex_uv;
+layout(location = 0) in vec2 scanline_tex_uv;
+layout(location = 1) in vec2 blur3x3_tex_uv;
 layout(location = 2) in float bloom_sigma_runtime;
-layout(location = 3) in vec2 blur3x3_tex_uv;
 layout(location = 0) out vec4 FragColor;
-layout(set = 0, binding = 2) uniform sampler2D Source;
-layout(set = 0, binding = 3) uniform sampler2D MASKED_SCANLINES;
-layout(set = 0, binding = 4) uniform sampler2D BLOOM_APPROX;
+layout(set = 0, binding = 2) uniform sampler2D MASKED_SCANLINES;
+layout(set = 0, binding = 3) uniform sampler2D BLOOM_APPROX;
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+#include "bloom-functions.h"
+#include "../../../../include/blur-functions.h"
 
 void main()
 {
-	//  Sample the masked scanlines:
-    const vec3 intensity_dim =
-        tex2D_linearize(MASKED_SCANLINES, scanline_tex_uv).rgb;
+    //  Sample the masked scanlines:
+    const float3 intensity_dim =
+        tex2D_linearize(MASKED_SCANLINEStexture, scanline_tex_uv).rgb;
     //  Get the full intensity, including auto-undimming, and mask compensation:
     const float auto_dim_factor = levels_autodim_temp;
     const float undim_factor = 1.0/auto_dim_factor;
     const float mask_amplify = get_mask_amplify();
-    const vec3 intensity = intensity_dim * undim_factor * mask_amplify *
-        params.levels_contrast;
-		
-	//  Sample BLOOM_APPROX to estimate what a straight blur of masked scanlines
+    const float3 intensity = intensity_dim * undim_factor * mask_amplify *
+        levels_contrast;
+
+    //  Sample BLOOM_APPROX to estimate what a straight blur of masked scanlines
     //  would look like, so we can estimate how much energy we'll receive from
     //  blooming neighbors:
-    const vec3 phosphor_blur_approx = params.levels_contrast * tex2D_linearize(
-        BLOOM_APPROX, blur3x3_tex_uv).rgb;
-		
-	//  Compute the blur weight for the center texel and the maximum energy we
+    const float3 phosphor_blur_approx = levels_contrast * tex2D_linearize(
+        BLOOM_APPROXtexture, blur3x3_tex_uv).rgb;
+
+    //  Compute the blur weight for the center texel and the maximum energy we
     //  expect to receive from neighbors:
     const float bloom_sigma = get_final_bloom_sigma(bloom_sigma_runtime);
     const float center_weight = get_center_weight(bloom_sigma);
-    const vec3 max_area_contribution_approx =
-        max(vec3(0.0), phosphor_blur_approx - center_weight * intensity);
-		
-	//  Assume neighbors will blur 100% of their intensity (blur_ratio = 1.0),
+    const float3 max_area_contribution_approx =
+        max(float3(0.0, 0.0, 0.0), phosphor_blur_approx - center_weight * intensity);
+    //  Assume neighbors will blur 100% of their intensity (blur_ratio = 1.0),
     //  because it actually gets better results (on top of being very simple),
     //  but adjust all intensities for the user's desired underestimate factor:
-    const vec3 area_contrib_underestimate =
-        params.bloom_underestimate_levels * max_area_contribution_approx;
-    const vec3 intensity_underestimate =
-        params.bloom_underestimate_levels * intensity;
-		
-	//  Calculate the blur_ratio, the ratio of intensity we want to blur:
+    const float3 area_contrib_underestimate =
+        bloom_underestimate_levels * max_area_contribution_approx;
+    const float3 intensity_underestimate =
+        bloom_underestimate_levels * intensity;
+    //  Calculate the blur_ratio, the ratio of intensity we want to blur:
     #ifdef BRIGHTPASS_AREA_BASED
         //  This area-based version changes blur_ratio more smoothly and blurs
         //  more, clipping less but offering less phosphor differentiation:
-        const vec3 phosphor_blur_underestimate = params.bloom_underestimate_levels *
+        const float3 phosphor_blur_underestimate = bloom_underestimate_levels *
             phosphor_blur_approx;
-        const vec3 soft_intensity = max(intensity_underestimate,
+        const float3 soft_intensity = max(intensity_underestimate,
             phosphor_blur_underestimate * mask_amplify);
-        const vec3 blur_ratio_temp =
-            ((vec3(1.0) - area_contrib_underestimate) /
-            soft_intensity - vec3(1.0)) / (center_weight - 1.0);
+        const float3 blur_ratio_temp =
+            ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) /
+            soft_intensity - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0);
     #else
-        const vec3 blur_ratio_temp =
-            ((vec3(1.0) - area_contrib_underestimate) /
-            intensity_underestimate - vec3(1.0)) / (center_weight - 1.0);
+        const float3 blur_ratio_temp =
+            ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) /
+            intensity_underestimate - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0);
     #endif
-	
-	const vec3 blur_ratio = clamp(blur_ratio_temp, 0.0, 1.0);
+    const float3 blur_ratio = clamp(blur_ratio_temp, 0.0, 1.0);
     //  Calculate the brightpass based on the auto-dimmed, unamplified, masked
     //  scanlines, encode if necessary, and return!
-    const vec3 brightpass = intensity_dim *
-        mix(blur_ratio, vec3(1.0), params.bloom_excess);
-		
-   FragColor = encode_output(vec4(brightpass, 1.0));
-}
+    const float3 brightpass = intensity_dim *
+        lerp(blur_ratio, float3(1.0, 1.0, 1.0), global.bloom_excess);
+    FragColor = encode_output(float4(brightpass, 1.0));
+}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang b/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang
index 02ec577..24d9c76 100755
--- a/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang
@@ -1,21 +1,5 @@
 #version 450
 
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	uint FrameCount;
-} registers;
-
-layout(std140, set = 0, binding = 0) uniform UBO
-{
-	mat4 MVP;
-    float interlace_bff;
-	float beam_horiz_filter;
-} params;
-
-#pragma parameter interlace_bff "interlace_bff" 1.0 0.0 1.0 1.0
-#pragma parameter beam_horiz_filter "beam_horiz_filter" 0.0 0.0 2.0 1.0
-
 /////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
 
 //  crt-royale: A full-featured CRT shader, with cheese.
@@ -34,18 +18,75 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 //  Place, Suite 330, Boston, MA 02111-1307 USA
 
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO
+{
+	mat4 MVP;
+	float crt_gamma;
+	float lcd_gamma;
+	float levels_contrast;
+	float halation_weight;
+	float diffusion_weight;
+	float bloom_underestimate_levels;
+	float bloom_excess;
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;
+	float convergence_offset_x_r;
+	float convergence_offset_x_g;
+	float convergence_offset_x_b;
+	float convergence_offset_y_r;
+	float convergence_offset_y_g;
+	float convergence_offset_y_b;
+	float mask_type;
+	float mask_sample_mode_desired;
+	float mask_num_triads_desired;
+	float aa_subpixel_r_offset_x_runtime;
+	float aa_subpixel_r_offset_y_runtime;
+	float aa_cubic_c;
+	float aa_gauss_sigma;
+	float geom_mode_runtime;
+	float geom_radius;
+	float geom_view_dist;
+	float geom_tilt_angle_x;
+	float geom_tilt_angle_y;
+	float geom_aspect_ratio_x;
+	float geom_aspect_ratio_y;
+	float geom_overscan_x;
+	float geom_overscan_y;
+	float border_size;
+	float border_darkness;
+	float border_compress;
+	float interlace_bff;
+	float interlace_1080i;
+} global;
 
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 
 //  PASS SETTINGS:
 //  gamma-management.h needs to know what kind of pipeline we're using and
 //  what pass this is in that pipeline.  This will become obsolete if/when we
-//  can #define things like this in the preset file.
+//  can #define things like this in the .slangp preset file.
 #define FIRST_PASS
 #define SIMULATE_CRT_ON_LCD
 
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
+#include "params.inc"
+#include "../../../../include/compat_macros.inc"
 #include "../user-settings.h"
 #include "bind-shader-params.h"
 #include "../../../../include/gamma-management.h"
@@ -56,59 +97,60 @@ layout(location = 0) in vec4 Position;
 layout(location = 1) in vec2 TexCoord;
 layout(location = 0) out vec2 tex_uv;
 layout(location = 1) out vec2 uv_step;
+layout(location = 2) out float interlaced;
 
 void main()
 {
-	gl_Position = params.MVP * Position;
-	tex_uv = TexCoord;
+   gl_Position = global.MVP * Position;
+   tex_uv = TexCoord * 1.00001;
+   uv_step = float2(1.0)/IN.texture_size;
    
-	//  Save the uv distance between texels:
-	uv_step = vec2(1.0) * registers.SourceSize.zw;
+    //  Detect interlacing: 1.0 = true, 0.0 = false.
+    const float2 _video_size = IN.video_size;
+    interlaced = float(is_interlaced(_video_size.y));
 }
 
 #pragma stage fragment
 #pragma format R8G8B8A8_SRGB
 layout(location = 0) in vec2 tex_uv;
 layout(location = 1) in vec2 uv_step;
+layout(location = 2) in float interlaced;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
 
 void main()
 {
-	//  Detect interlacing: 1.0 = true, 0.0 = false.
-	const vec2 video_size = registers.SourceSize.xy;
-	bool interlaced = is_interlaced(video_size.y);
-	
-//  Linearize the input based on CRT gamma and bob interlaced fields.
-//  Bobbing ensures we can immediately blur without getting artifacts.
-//  Note: TFF/BFF won't matter for sources that double-weave or similar.
-if(interlace_detect == true)
+    //  Linearize the input based on CRT gamma and bob interlaced fields.
+    //  Bobbing ensures we can immediately blur without getting artifacts.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    if(bool(interlace_detect))
     {
         //  Sample the current line and an average of the previous/next line;
         //  tex2D_linearize will decode CRT gamma.  Don't bother branching:
-//        const vec2 tex_uv = tex_uv;
-        const vec2 v_step = vec2(0.0, uv_step.y);
-        const vec3 curr_line = tex2D_linearize(
-            Source, tex_uv).rgb;
-        const vec3 last_line = tex2D_linearize(
-            Source, tex_uv - v_step).rgb;
-        const vec3 next_line = tex2D_linearize(
-            Source, tex_uv + v_step).rgb;
-        const vec3 interpolated_line = 0.5 * (last_line + next_line);
+//        const float2 tex_uv = tex_uv;
+        const float2 v_step = float2(0.0, uv_step.y);
+        const float3 curr_line = tex2D_linearize(
+            input_texture, tex_uv).rgb;
+        const float3 last_line = tex2D_linearize(
+            input_texture, tex_uv - v_step).rgb;
+        const float3 next_line = tex2D_linearize(
+            input_texture, tex_uv + v_step).rgb;
+        const float3 interpolated_line = 0.5 * (last_line + next_line);
         //  If we're interlacing, determine which field curr_line is in:
-        const float modulus = float(interlaced) + 1.0;
+        const float modulus = interlaced + 1.0;
         const float field_offset =
-            mod(registers.FrameCount + float(params.interlace_bff), modulus);
-        const float curr_line_texel = tex_uv.y * registers.SourceSize.y;
+            fmod(params.frame_count + global.interlace_bff, modulus);
+        const float curr_line_texel = tex_uv.y * IN.texture_size.y;
         //  Use under_half to fix a rounding bug around exact texel locations.
         const float line_num_last = floor(curr_line_texel - under_half);
-        const float wrong_field = mod(line_num_last + field_offset, modulus);
+        const float wrong_field = fmod(line_num_last + field_offset, modulus);
         //  Select the correct color, and output the result:
-        const vec3 color = mix(curr_line, interpolated_line, wrong_field);
-        FragColor = encode_output(vec4(color, 1.0));
+        const float3 color = lerp(curr_line, interpolated_line, wrong_field);
+        FragColor =  encode_output(float4(color, 1.0));
     }
     else
     {
-        FragColor = encode_output(tex2D_linearize(Source, tex_uv));
+        FragColor =  encode_output(tex2D_linearize(input_texture, tex_uv));
     }
-}
+}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-backup.slang b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-backup.slang
deleted file mode 100755
index 2e4466e..0000000
--- a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-backup.slang
+++ /dev/null
@@ -1,245 +0,0 @@
-#version 450
-
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-} registers;
-
-#include "params.inc"
-
-/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
-
-//  crt-royale: A full-featured CRT shader, with cheese.
-//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
-//
-//  This program is free software; you can redistribute it and/or modify it
-//  under the terms of the GNU General Public License as published by the Free
-//  Software Foundation; either version 2 of the License, or any later version.
-//
-//  This program is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
-//  more details.
-//
-//  You should have received a copy of the GNU General Public License along with
-//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-//  Place, Suite 330, Boston, MA 02111-1307 USA
-
-
-/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
-
-#define LAST_PASS
-#define SIMULATE_CRT_ON_LCD
-#include "../user-settings.h"
-#include "derived-settings-and-constants.h"
-#include "bind-shader-params.h"
-
-#ifndef DONT_DEFINE //RUNTIME_GEOMETRY_TILT
-    //  Create a local-to-global rotation matrix for the CRT's coordinate frame
-    //  and its global-to-local inverse.  See the vertex shader for details.
-    //  It's faster to compute these statically if possible.
-    const vec2 sin_tilt = sin(geom_tilt_angle_static);
-    const vec2 cos_tilt = cos(geom_tilt_angle_static);
-    const mat3x3 geom_local_to_global_static = mat3x3(
-        cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x,
-        0.0, cos_tilt.y, -sin_tilt.y,
-        -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x);
-    const mat3x3 geom_global_to_local_static = mat3x3(
-        cos_tilt.x, 0.0, -sin_tilt.x,
-        sin_tilt.y*sin_tilt.x, cos_tilt.y, sin_tilt.y*cos_tilt.x,
-        cos_tilt.y*sin_tilt.x, -sin_tilt.y, cos_tilt.y*cos_tilt.x);
-#endif
-
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-#include "../../../../include/gamma-management.h"
-#include "tex2Dantialias.h"
-#include "geometry-functions.h"
-
-///////////////////////////////////  HELPERS  //////////////////////////////////
-
-mat2x2 mul_scale(vec2 scale, mat2x2 matrix)
-{
-    //mat2x2 scale_matrix = mat2x2(scale.x, 0.0, 0.0, scale.y);
-    //return (matrix * scale_matrix);
-    return mat2x2(vec4(matrix[0].xy, matrix[1].xy) * scale.xxyy);
-}
-
-#pragma stage vertex
-layout(location = 0) in vec4 Position;
-layout(location = 1) in vec2 TexCoord;
-layout(location = 0) out vec2 tex_uv;
-layout(location = 1) out vec4 video_and_texture_size_inv;
-layout(location = 2) out vec2 output_size_inv;
-layout(location = 3) out vec3 eye_pos_local;
-layout(location = 4) out vec4 geom_aspect_and_overscan;
-#ifdef RUNTIME_GEOMETRY_TILT
-layout(location = 5) out vec3 global_to_local_row0;
-layout(location = 6) out vec3 global_to_local_row1;
-layout(location = 7) out vec3 global_to_local_row2;
-#endif
-
-void main()
-{
-   gl_Position = params.MVP * Position;
-   tex_uv = TexCoord;
-   
-   video_and_texture_size_inv = vec4(registers.SourceSize.zw, registers.SourceSize.zw);
-    output_size_inv = registers.OutputSize.zw;
-
-    //  Get aspect/overscan vectors from scalar parameters (likely uniforms):
-    const float viewport_aspect_ratio = registers.OutputSize.x * registers.OutputSize.w;
-    const vec2 geom_aspect = get_aspect_vector(viewport_aspect_ratio);
-    const vec2 geom_overscan = get_geom_overscan_vector();
-    geom_aspect_and_overscan = vec4(geom_aspect, geom_overscan);
-	
-	#ifdef DONT_DEFINE //RUNTIME_GEOMETRY_TILT
-        //  Create a local-to-global rotation matrix for the CRT's coordinate
-        //  frame and its global-to-local inverse.  Rotate around the x axis
-        //  first (pitch) and then the y axis (yaw) with yucky Euler angles.
-        //  Positive angles go clockwise around the right-vec and up-vec.
-        //  Runtime shader parameters prevent us from computing these globally,
-        //  but we can still combine the pitch/yaw matrices by hand to cut a
-        //  few instructions.  Note that cg matrices fill row1 first, then row2,
-        //  etc. (row-major order).
-        const vec2 geom_tilt_angle = get_geom_tilt_angle_vector();
-        const vec2 sin_tilt = sin(geom_tilt_angle);
-        const vec2 cos_tilt = cos(geom_tilt_angle);
-        //  Conceptual breakdown:
-        //      const mat3x3 rot_x_matrix = mat3x3(
-        //          1.0, 0.0, 0.0,
-        //          0.0, cos_tilt.y, -sin_tilt.y,
-        //          0.0, sin_tilt.y, cos_tilt.y);
-        //      const mat3x3 rot_y_matrix = mat3x3(
-        //          cos_tilt.x, 0.0, sin_tilt.x,
-        //          0.0, 1.0, 0.0,
-        //          -sin_tilt.x, 0.0, cos_tilt.x);
-        //      const mat3x3 local_to_global =
-        //          rot_x_matrix * rot_y_matrix;
-        //      const mat3x3 global_to_local =
-        //          transpose(local_to_global);
-        mat3x3 local_to_global = mat3x3(
-            cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x,
-            0.0, cos_tilt.y, -sin_tilt.y,
-            -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x);
-        //  This is a pure rotation, so transpose = inverse:
-        mat3x3 global_to_local = transpose(local_to_global);
-        //  Decompose the matrix into 3 vec3's for output:
-        global_to_local_row0 = vec3(global_to_local[0].xyz);
-        global_to_local_row1 = vec3(global_to_local[1].xyz);
-        global_to_local_row2 = vec3(global_to_local[2].xyz);
-    #else
-        const mat3x3 global_to_local = geom_global_to_local_static;
-        const mat3x3 local_to_global = geom_local_to_global_static;
-    #endif
-	
-	//  Get an optimal eye position based on geom_view_dist, viewport_aspect,
-    //  and CRT radius/rotation:
-    #ifdef RUNTIME_GEOMETRY_MODE
-        geom_mode = params.geom_mode_runtime;
-    #else
-        const float geom_mode = geom_mode_static;
-    #endif
-	
-	const vec3 eye_pos_global = get_ideal_global_eye_pos(local_to_global, geom_aspect, geom_mode);
-    eye_pos_local = eye_pos_global, global_to_local;
-}
-
-#pragma stage fragment
-layout(location = 0) in vec2 tex_uv;
-layout(location = 1) in vec4 video_and_texture_size_inv;
-layout(location = 2) in vec2 output_size_inv;
-layout(location = 3) in vec3 eye_pos_local;
-layout(location = 4) in vec4 geom_aspect_and_overscan;
-#ifdef RUNTIME_GEOMETRY_TILT
-layout(location = 5) in vec3 global_to_local_row0;
-layout(location = 6) in vec3 global_to_local_row1;
-layout(location = 7) in vec3 global_to_local_row2;
-#endif
-layout(location = 0) out vec4 FragColor;
-layout(set = 0, binding = 2) uniform sampler2D Source;
-
-void main()
-{
-    //  Localize some parameters:
-    const vec2 geom_aspect = geom_aspect_and_overscan.xy;
-    const vec2 geom_overscan = geom_aspect_and_overscan.zw;
-    const vec2 video_size_inv = video_and_texture_size_inv.xy;
-    const vec2 texture_size_inv = video_and_texture_size_inv.zw;
-	
-	#ifdef RUNTIME_GEOMETRY_TILT
-        const mat3x3 global_to_local = mat3x3(global_to_local_row0,
-            global_to_local_row1, global_to_local_row2);
-    #else
-        const mat3x3 global_to_local = geom_global_to_local_static;
-    #endif
-    #ifdef RUNTIME_GEOMETRY_MODE
-        geom_mode = params.geom_mode_runtime;
-    #else
-        const float geom_mode = geom_mode_static;
-    #endif
-	
-	//  Get flat and curved texture coords for the current fragment point sample
-    //  and a pixel_to_tangent_video_uv matrix for transforming pixel offsets:
-    //  video_uv = relative position in video frame, mapped to [0.0, 1.0] range
-    //  tex_uv = relative position in padded texture, mapped to [0.0, 1.0] range
-    const vec2 flat_video_uv = tex_uv * (registers.SourceSize.xy * video_size_inv);
-    mat2x2 pixel_to_video_uv;
-    vec2 video_uv_no_geom_overscan;
-    if(geom_mode > 0.5)
-    {
-        video_uv_no_geom_overscan =
-            get_curved_video_uv_coords_and_tangent_matrix(flat_video_uv,
-                eye_pos_local, output_size_inv, geom_aspect,
-                geom_mode, global_to_local, pixel_to_video_uv);
-    }
-    else
-    {
-        video_uv_no_geom_overscan = flat_video_uv;
-        pixel_to_video_uv = mat2x2(
-            output_size_inv.x, 0.0, 0.0, output_size_inv.y);
-    }
-	
-	//  Correct for overscan here (not in curvature code):
-    const vec2 video_uv =
-        (video_uv_no_geom_overscan - vec2(0.5))/geom_overscan + vec2(0.5);
-    const vec2 tex_uv = video_uv * (registers.SourceSize.xy * texture_size_inv);
-	
-	//  Get a matrix transforming pixel vectors to tex_uv vectors:
-    const mat2x2 pixel_to_tex_uv =
-        mul_scale(registers.SourceSize.xy * texture_size_inv /
-            geom_aspect_and_overscan.zw, pixel_to_video_uv);
-			
-	//  Sample!  Skip antialiasing if aa_level < 0.5 or both of these hold:
-    //  1.) Geometry/curvature isn't used
-    //  2.) Overscan == vec2(1.0)
-    //  Skipping AA is sharper, but it's only faster with dynamic branches.
-    const vec2 abs_aa_r_offset = abs(get_aa_subpixel_r_offset());
-    bool need_subpixel_aa = true;
-	if(abs_aa_r_offset.x + abs_aa_r_offset.y < 0.0) need_subpixel_aa = false;
-    vec3 color;
-    if(aa_level > 0.5 && (geom_mode > 0.5 || any(notEqual(geom_overscan , vec2(1.0)))))
-    {
-        //  Sample the input with antialiasing (due to sharp phosphors, etc.):
-        color = tex2Daa(Source, tex_uv, pixel_to_tex_uv, registers.FrameCount);
-    }
-    else if(aa_level > 0.5 && need_subpixel_aa = true)
-    {
-        //  Sample at each subpixel location:
-        color = tex2Daa_subpixel_weights_only(
-            Source, tex_uv, pixel_to_tex_uv);
-    }
-    else
-    {
-        color = tex2D_linearize(Source, tex_uv).rgb;
-    }
-	
-	//  Dim borders and output the final result:
-    const float border_dim_factor = get_border_dim_factor(video_uv, geom_aspect);
-    const vec3 final_color = color * border_dim_factor;
-
-   FragColor = encode_output(vec4(final_color, 1.0));
-}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-intel.slang b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-intel.slang
new file mode 100644
index 0000000..46a53e4
--- /dev/null
+++ b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass-intel.slang
@@ -0,0 +1,4 @@
+#version 450
+
+#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+#include "crt-royale-geometry-aa-last-pass.h"
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.h b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.h
new file mode 100644
index 0000000..ef99b01
--- /dev/null
+++ b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.h
@@ -0,0 +1,293 @@
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+layout(push_constant) uniform Push  //  push-constant block, instance name `params`
+{
+	vec4 SourceSize;  //  NOTE(review): presumably what IN.video_size / IN.texture_size resolve to -- confirm in the included headers
+	vec4 OriginalSize;
+	vec4 OutputSize;  //  NOTE(review): presumably what IN.output_size resolves to -- confirm
+	uint FrameCount;  //  NOTE(review): presumably what IN.frame_count resolves to -- confirm
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO  //  std140 uniform block, instance name `global`
+{
+	mat4 MVP;  //  multiplied with Position in the vertex stage
+	float crt_gamma;  //  the floats from here on are never read as global.<name> in this header; presumably params.inc maps them to bare names -- confirm
+	float lcd_gamma;
+	float levels_contrast;
+	float halation_weight;
+	float diffusion_weight;
+	float bloom_underestimate_levels;
+	float bloom_excess;
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;
+	float convergence_offset_x_r;
+	float convergence_offset_x_g;
+	float convergence_offset_x_b;
+	float convergence_offset_y_r;
+	float convergence_offset_y_g;
+	float convergence_offset_y_b;
+	float mask_type;
+	float mask_sample_mode_desired;
+	float mask_num_triads_desired;
+	float mask_triad_size_desired;
+	float mask_specify_num_triads;
+	float aa_subpixel_r_offset_x_runtime;
+	float aa_subpixel_r_offset_y_runtime;
+	float aa_cubic_c;
+	float aa_gauss_sigma;
+	float geom_mode_runtime;  //  read by bare name in both stages when RUNTIME_GEOMETRY_MODE is defined
+	float geom_radius;
+	float geom_view_dist;
+	float geom_tilt_angle_x;
+	float geom_tilt_angle_y;
+	float geom_aspect_ratio_x;
+	float geom_aspect_ratio_y;
+	float geom_overscan_x;
+	float geom_overscan_y;
+	float border_size;
+	float border_darkness;
+	float border_compress;
+	float interlace_bff;
+	float interlace_1080i;
+	vec4 MASKED_SCANLINESSize;  //  the three *Size members are not referenced directly in this header
+	vec4 HALATION_BLURSize;
+	vec4 BRIGHTPASSSize;
+} global;
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#define LAST_PASS
+#define SIMULATE_CRT_ON_LCD
+#include "params.inc"
+#include "../../../../include/compat_macros.inc"
+#include "../user-settings.h"
+#include "derived-settings-and-constants.h"
+#include "bind-shader-params.h"
+
+#ifndef RUNTIME_GEOMETRY_TILT
+    //  RUNTIME_GEOMETRY_TILT is not defined, so build the CRT frame's local-to-global
+    //  rotation and its global-to-local inverse once from geom_tilt_angle_static.
+    //  The vertex stage reuses both matrices; the fragment stage reuses the inverse.
+    static const float2 sin_tilt = sin(geom_tilt_angle_static);
+    static const float2 cos_tilt = cos(geom_tilt_angle_static);
+    static const float3x3 geom_local_to_global_static = float3x3(
+        cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x,
+        0.0, cos_tilt.y, -sin_tilt.y,
+        -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x);
+    static const float3x3 geom_global_to_local_static = float3x3(  //  entry-wise transpose of the matrix above
+        cos_tilt.x, 0.0, -sin_tilt.x,
+        sin_tilt.y*sin_tilt.x, cos_tilt.y, sin_tilt.y*cos_tilt.x,
+        cos_tilt.y*sin_tilt.x, -sin_tilt.y, cos_tilt.y*cos_tilt.x);
+#endif
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "../../../../include/gamma-management.h"
+#include "tex2Dantialias.h"
+#include "geometry-functions.h"
+
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+float2x2 mul_scale(float2 scale, float2x2 matrix)
+{
+    //  Multiplies matrix[0][*] by scale.x and matrix[1][*] by scale.y.  Written as a cheaper
+    //  stand-in for mul(float2x2(scale.x, 0.0, 0.0, scale.y), matrix), the earlier form.
+    return float2x2(float4(matrix[0][0],matrix[0][1],matrix[1][0],matrix[1][1]) * scale.xxyy);
+}
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 tex_uv;
+layout(location = 1) out vec4 video_and_texture_size_inv;
+layout(location = 2) out vec2 output_size_inv;
+layout(location = 3) out vec3 eye_pos_local;
+layout(location = 4) out vec4 geom_aspect_and_overscan;
+layout(location = 5) out vec3 global_to_local_row0;
+layout(location = 6) out vec3 global_to_local_row1;
+layout(location = 7) out vec3 global_to_local_row2;
+
+void main()  //  Vertex stage: emits position, size reciprocals, aspect/overscan, tilt rows and the local eye position.
+{
+   gl_Position = global.MVP * Position;
+   tex_uv = TexCoord;
+    video_and_texture_size_inv =  //  xy = 1/video_size, zw = 1/texture_size
+        float4(1.0, 1.0, 1.0, 1.0) / float4(IN.video_size, IN.texture_size);
+    output_size_inv = float2(1.0, 1.0)/IN.output_size;
+
+    //  Get aspect/overscan vectors from scalar parameters (likely uniforms):
+    const float viewport_aspect_ratio = IN.output_size.x/IN.output_size.y;
+    const float2 geom_aspect = get_aspect_vector(viewport_aspect_ratio);
+    const float2 geom_overscan = get_geom_overscan_vector();
+    geom_aspect_and_overscan = float4(geom_aspect, geom_overscan);
+
+    #ifdef RUNTIME_GEOMETRY_TILT
+        //  Create a local-to-global rotation matrix for the CRT's coordinate
+        //  frame and its global-to-local inverse.  Rotate around the x axis
+        //  first (pitch) and then the y axis (yaw) with yucky Euler angles.
+        //  Positive angles go clockwise around the right-vec and up-vec.
+        //  Runtime shader parameters prevent us from computing these globally,
+        //  but we can still combine the pitch/yaw matrices by hand to cut a
+        //  few instructions.  The layout below was written for Cg (row-major fill).
+        //  NOTE(review): GLSL constructors fill columns first; confirm mul() compensates.
+        const float2 geom_tilt_angle = get_geom_tilt_angle_vector();
+        const float2 sin_tilt = sin(geom_tilt_angle);
+        const float2 cos_tilt = cos(geom_tilt_angle);
+        //  Conceptual breakdown:
+        //      static const float3x3 rot_x_matrix = float3x3(
+        //          1.0, 0.0, 0.0,
+        //          0.0, cos_tilt.y, -sin_tilt.y,
+        //          0.0, sin_tilt.y, cos_tilt.y);
+        //      static const float3x3 rot_y_matrix = float3x3(
+        //          cos_tilt.x, 0.0, sin_tilt.x,
+        //          0.0, 1.0, 0.0,
+        //          -sin_tilt.x, 0.0, cos_tilt.x);
+        //      static const float3x3 local_to_global =
+        //          mul(rot_y_matrix, rot_x_matrix);
+        //      static const float3x3 global_to_local =
+        //          transpose(local_to_global);
+        const float3x3 local_to_global = float3x3(
+            cos_tilt.x, sin_tilt.y*sin_tilt.x, cos_tilt.y*sin_tilt.x,
+            0.0, cos_tilt.y, -sin_tilt.y,
+            -sin_tilt.x, sin_tilt.y*cos_tilt.x, cos_tilt.y*cos_tilt.x);
+        //  This is a pure rotation, so transpose = inverse:
+        const float3x3 global_to_local = transpose(local_to_global);
+        //  Decompose the matrix into 3 float3's for output:
+        global_to_local_row0 = float3(global_to_local[0][0], global_to_local[0][1], global_to_local[0][2]);//._m00_m01_m02);
+        global_to_local_row1 = float3(global_to_local[1][0], global_to_local[1][1], global_to_local[1][2]);//._m10_m11_m12);
+        global_to_local_row2 = float3(global_to_local[2][0], global_to_local[2][1], global_to_local[2][2]);//._m20_m21_m22);
+    #else
+        static const float3x3 global_to_local = geom_global_to_local_static;  //  global_to_local_row0..2 stay unwritten on this path
+        static const float3x3 local_to_global = geom_local_to_global_static;
+    #endif
+
+    //  Get an optimal eye position based on geom_view_dist, viewport_aspect,
+    //  and CRT radius/rotation:
+    #ifdef RUNTIME_GEOMETRY_MODE
+        const float geom_mode = geom_mode_runtime;
+    #else
+        static const float geom_mode = geom_mode_static;
+    #endif
+    const float3 eye_pos_global =
+        get_ideal_global_eye_pos(local_to_global, geom_aspect, geom_mode);
+    eye_pos_local = mul(global_to_local, eye_pos_global);
+}
+
+#pragma stage fragment
+layout(location = 0) in vec2 tex_uv;
+layout(location = 1) in vec4 video_and_texture_size_inv;
+layout(location = 2) in vec2 output_size_inv;
+layout(location = 3) in vec3 eye_pos_local;
+layout(location = 4) in vec4 geom_aspect_and_overscan;
+layout(location = 5) in vec3 global_to_local_row0;
+layout(location = 6) in vec3 global_to_local_row1;
+layout(location = 7) in vec3 global_to_local_row2;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
+
+void main()  //  Fragment stage: map the fragment to (optionally curved) video coords, sample, dim borders, encode.
+{
+    //  Localize some parameters:
+    const float2 geom_aspect = geom_aspect_and_overscan.xy;
+    const float2 geom_overscan = geom_aspect_and_overscan.zw;
+    const float2 video_size_inv = video_and_texture_size_inv.xy;
+    const float2 texture_size_inv = video_and_texture_size_inv.zw;
+    //  (output_size_inv is used straight from the stage input; no local copy is made.)
+    #ifdef RUNTIME_GEOMETRY_TILT
+        const float3x3 global_to_local = float3x3(global_to_local_row0,
+            global_to_local_row1, global_to_local_row2);
+    #else
+        static const float3x3 global_to_local = geom_global_to_local_static;
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        const float geom_mode = geom_mode_runtime;
+    #else
+        static const float geom_mode = geom_mode_static;
+    #endif
+
+    //  Get flat and curved texture coords for the current fragment point sample
+    //  and a pixel_to_tangent_video_uv matrix for transforming pixel offsets:
+    //  video_uv = relative position in video frame, mapped to [0.0, 1.0] range
+    //  tex_uv = relative position in padded texture, mapped to [0.0, 1.0] range
+    const float2 flat_video_uv = tex_uv * (IN.texture_size * video_size_inv);
+    float2x2 pixel_to_video_uv;
+    float2 video_uv_no_geom_overscan;
+    if(geom_mode > 0.5)
+    {
+        video_uv_no_geom_overscan =
+            get_curved_video_uv_coords_and_tangent_matrix(flat_video_uv,
+                eye_pos_local, output_size_inv, geom_aspect,
+                geom_mode, global_to_local, pixel_to_video_uv);
+    }
+    else
+    {
+        video_uv_no_geom_overscan = flat_video_uv;
+        pixel_to_video_uv = float2x2(
+            output_size_inv.x, 0.0, 0.0, output_size_inv.y);
+    }
+    //  Correct for overscan here (not in curvature code):
+    const float2 video_uv =
+        (video_uv_no_geom_overscan - float2(0.5, 0.5))/geom_overscan + float2(0.5, 0.5);
+    const float2 tex_uv = video_uv * (IN.video_size * texture_size_inv);  //  NOTE: shadows the tex_uv stage input from here on
+
+    //  Get a matrix transforming pixel vectors to tex_uv vectors:
+    const float2x2 pixel_to_tex_uv =
+        mul_scale(IN.video_size * texture_size_inv /
+            geom_aspect_and_overscan.zw, pixel_to_video_uv);
+
+    //  Sample!  The full tex2Daa() path below is commented out (see its TODO/FIXME),
+    //  so the live logic is: if aa_level > 0.5 and the aa subpixel r offset is
+    //  nonzero, sample with tex2Daa_subpixel_weights_only(); otherwise take one
+    //  tex2D_linearize() sample.  Geometry and overscan no longer affect this choice.
+    const float2 abs_aa_r_offset = abs(get_aa_subpixel_r_offset());
+    const bool need_subpixel_aa = abs_aa_r_offset.x + abs_aa_r_offset.y > 0.0;
+    float3 color;
+/*  //TODO/FIXME: This block is what causes the black screen when geom_mode >= 1.0
+    if(aa_level > 0.5 && (geom_mode > 0.5 || any(bool2((geom_overscan.x != 1.0), (geom_overscan.y != 1.0)))))
+    {
+        //  Sample the input with antialiasing (due to sharp phosphors, etc.):
+        color = tex2Daa(input_texture, tex_uv, pixel_to_tex_uv, float(IN.frame_count));
+    }
+
+    else */if(aa_level > 0.5 && need_subpixel_aa)
+    {
+        //  Sample at each subpixel location:
+        color = tex2Daa_subpixel_weights_only(
+            input_texture, tex_uv, pixel_to_tex_uv);
+    }
+    else
+    {
+        color = tex2D_linearize(input_texture, tex_uv).rgb;
+    }
+
+    //  Dim borders and output the final result:
+    const float border_dim_factor = get_border_dim_factor(video_uv, geom_aspect);
+    const float3 final_color = color * border_dim_factor;
+
+    FragColor = encode_output(float4(final_color, 1.0));
+}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang
index 1a0fef1..18cd6e3 100755
--- a/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang
@@ -1,43 +1,3 @@
 #version 450
 
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-} params;
-
-layout(std140, set = 0, binding = 0) uniform UBO
-{
-	mat4 MVP;
-} global;
-
-#define LAST_PASS
-#define SIMULATE_CRT_ON_LCD
-#include "../user-settings.h"
-#include "derived-settings-and-constants.h"
-#include "bind-shader-params.h"
-
-#include "../../../../include/gamma-management.h"
-
-#pragma stage vertex
-layout(location = 0) in vec4 Position;
-layout(location = 1) in vec2 TexCoord;
-layout(location = 0) out vec2 vTexCoord;
-
-void main()
-{
-   gl_Position = global.MVP * Position;
-   vTexCoord = TexCoord;
-}
-
-#pragma stage fragment
-layout(location = 0) in vec2 vTexCoord;
-layout(location = 0) out vec4 FragColor;
-layout(set = 0, binding = 2) uniform sampler2D Source;
-
-void main()
-{
-   FragColor = encode_output(vec4(texture(Source, vTexCoord).rgb, 1.0));
-}
\ No newline at end of file
+#include "crt-royale-geometry-aa-last-pass.h"
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang b/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang
index 0fcdc2f..10f6235 100755
--- a/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang
@@ -1,15 +1,5 @@
 #version 450
 
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-} registers;
-
-#include "params.inc"
-
 /////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
 
 //  crt-royale: A full-featured CRT shader, with cheese.
@@ -28,9 +18,68 @@ layout(push_constant) uniform Push
 //  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 //  Place, Suite 330, Boston, MA 02111-1307 USA
 
+layout(push_constant) uniform Push  //  push-constant block, instance name `params`
+{
+	vec4 SourceSize;  //  NOTE(review): presumably what IN.video_size / IN.texture_size resolve to -- confirm in the included headers
+	vec4 OriginalSize;
+	vec4 OutputSize;  //  NOTE(review): presumably what IN.output_size resolves to -- confirm
+	uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO  //  std140 uniform block, instance name `global`
+{
+	mat4 MVP;  //  multiplied with Position in the vertex stage
+	float crt_gamma;  //  the floats from here on are never read as global.<name> in the visible code; presumably params.inc maps them to bare names -- confirm
+	float lcd_gamma;
+	float levels_contrast;
+	float halation_weight;
+	float diffusion_weight;
+	float bloom_underestimate_levels;
+	float bloom_excess;
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;
+	float convergence_offset_x_r;
+	float convergence_offset_x_g;
+	float convergence_offset_x_b;
+	float convergence_offset_y_r;
+	float convergence_offset_y_g;
+	float convergence_offset_y_b;
+	float mask_type;
+	float mask_sample_mode_desired;  //  NOTE(review): the fragment stage now calls get_mask_sample_mode(); presumably that reads this value -- confirm
+	float mask_num_triads_desired;
+	float mask_triad_size_desired;
+	float mask_specify_num_triads;
+	float aa_subpixel_r_offset_x_runtime;
+	float aa_subpixel_r_offset_y_runtime;
+	float aa_cubic_c;
+	float aa_gauss_sigma;
+	float geom_mode_runtime;
+	float geom_radius;
+	float geom_view_dist;
+	float geom_tilt_angle_x;
+	float geom_tilt_angle_y;
+	float geom_aspect_ratio_x;
+	float geom_aspect_ratio_y;
+	float geom_overscan_x;
+	float geom_overscan_y;
+	float border_size;
+	float border_darkness;
+	float border_compress;
+	float interlace_bff;
+	float interlace_1080i;
+} global;
 
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 
+#include "params.inc"
+#include "../../../../include/compat_macros.inc"
 #include "../user-settings.h"
 #include "derived-settings-and-constants.h"
 #include "bind-shader-params.h"
@@ -49,42 +98,45 @@ layout(location = 2) out vec2 resize_magnification_scale;
 layout(location = 3) out vec2 src_dxdy;
 layout(location = 4) out vec2 tile_size_uv;
 layout(location = 5) out vec2 input_tiles_per_texture;
-layout(location = 6) out vec2 tex_uv;
 
 void main()
 {
-   gl_Position = params.MVP * Position;
-   tex_uv = TexCoord;
-   
-    //  First estimate the viewport size (the user will get the wrong number of
+   gl_Position = global.MVP * Position;
+   float2 tex_uv = TexCoord.xy;
+	//  First estimate the viewport size (the user will get the wrong number of
     //  triads if it's wrong and mask_specify_num_triads is 1.0/true).
-	const vec2 estimated_viewport_size =
-        registers.OutputSize.xy / mask_resize_viewport_scale;
-	//  Find the final size of our resized phosphor mask tiles.  We probably
+    const float2 estimated_viewport_size =
+        IN.output_size / mask_resize_viewport_scale;
+    //  Find the final size of our resized phosphor mask tiles.  We probably
     //  estimated the viewport size and MASK_RESIZE output size differently last
     //  pass, so do not swear they were the same. ;)
-	const vec2 mask_resize_tile_size = get_resized_mask_tile_size(
-        estimated_viewport_size, registers.OutputSize.xy, false);
-		
-	//  We'll render resized tiles until filling the output FBO or meeting a
+    const float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        estimated_viewport_size, IN.output_size, false);
+
+    //  We'll render resized tiles until filling the output FBO or meeting a
     //  limit, so compute [wrapped] tile uv coords based on the output uv coords
     //  and the number of tiles that will fit in the FBO.
-    const vec2 output_tiles_this_pass = registers.OutputSize.xy / mask_resize_tile_size;
-    const vec2 output_video_uv = tex_uv;
-    tile_uv_wrap = output_video_uv * output_tiles_this_pass;
-	
-	//  Get the texel size of an input tile and related values:
-    const vec2 input_tile_size = vec2(min(
-        mask_resize_src_lut_size.x, registers.SourceSize.x), mask_resize_tile_size.y);
-    tile_size_uv = input_tile_size * registers.SourceSize.zw;
-    input_tiles_per_texture = registers.SourceSize.xy / input_tile_size;
-	
-	//  Derive [wrapped] texture uv coords from [wrapped] tile uv coords and
+    const float2 output_tiles_this_pass = IN.output_size / mask_resize_tile_size;
+    const float2 output_video_uv = tex_uv * IN.texture_size / IN.video_size;
+    tile_uv_wrap = output_video_uv * output_tiles_this_pass;  //  write the stage output; declaring a local here would shadow it
+
+    //  Get the texel size of an input tile and related values:
+    const float2 input_tile_size = float2(min(
+        mask_resize_src_lut_size.x, IN.video_size.x), mask_resize_tile_size.y);
+    tile_size_uv = input_tile_size / IN.texture_size;
+    input_tiles_per_texture = IN.texture_size / input_tile_size;
+
+    //  Derive [wrapped] texture uv coords from [wrapped] tile uv coords and
     //  the tile size in uv coords, and save frac() for the fragment shader.
     src_tex_uv_wrap = tile_uv_wrap * tile_size_uv;
-	
-	resize_magnification_scale = mask_resize_tile_size / input_tile_size;
-    src_dxdy = vec2(registers.SourceSize.z, 0.0);
+
+    //  Output the values we need, including the magnification scale and step:
+    //  tile_uv_wrap and src_tex_uv_wrap were assigned above; the fragment stage
+    //  reads both, so neither may be redeclared as a local in this function.
+    resize_magnification_scale = mask_resize_tile_size / input_tile_size;
+    src_dxdy = float2(1.0/IN.texture_size.x, 0.0);
+    //  tile_size_uv and input_tiles_per_texture were also assigned above, so
+    //  nothing further is needed for them.
 }
 
 #pragma stage fragment
@@ -94,9 +146,9 @@ layout(location = 2) in vec2 resize_magnification_scale;
 layout(location = 3) in vec2 src_dxdy;
 layout(location = 4) in vec2 tile_size_uv;
 layout(location = 5) in vec2 input_tiles_per_texture;
-layout(location = 6) in vec2 tex_uv;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
 
 void main()
 {
@@ -108,17 +160,17 @@ void main()
     //  easier tiled sampling later.
     #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
         //  Discard unneeded fragments in case our profile allows real branches.
-//        const vec2 tile_uv_wrap = tile_uv_wrap;
-        if(params.mask_sample_mode_desired < 0.5 &&
+        const float2 tile_uv_wrap = tile_uv_wrap;
+        if(get_mask_sample_mode() < 0.5 &&
             max(tile_uv_wrap.x, tile_uv_wrap.y) <= mask_resize_num_tiles)
         {
             const float src_dx = src_dxdy.x;
-            const vec2 src_tex_uv = fract(src_tex_uv_wrap);
-            const vec3 pixel_color = downsample_horizontal_sinc_tiled(Source,
-                src_tex_uv, registers.SourceSize.xy, src_dxdy.x,
+            const float2 src_tex_uv = frac(src_tex_uv_wrap);
+            const float3 pixel_color = downsample_horizontal_sinc_tiled(input_texture,
+                src_tex_uv, IN.texture_size, src_dxdy.x,
                 resize_magnification_scale.x, tile_size_uv.x);
             //  The input LUT was linear RGB, and so is our output:
-            FragColor = vec4(pixel_color, 1.0);
+            FragColor = float4(pixel_color, 1.0);
         }
         else
         {
@@ -126,6 +178,6 @@ void main()
         }
     #else
         discard;
-        FragColor = vec4(1.0);
+        FragColor = float4(1.0);
     #endif
 }
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang b/crt/shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang
index 4946536..5b57779 100755
--- a/crt/shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang
@@ -1,15 +1,5 @@
 #version 450
 
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-} registers;
-
-#include "params.inc"
-
 /////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
 
 //  crt-royale: A full-featured CRT shader, with cheese.
@@ -28,9 +18,68 @@ layout(push_constant) uniform Push
 //  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 //  Place, Suite 330, Boston, MA 02111-1307 USA
 
+layout(push_constant) uniform Push  //  push-constant block, instance name `params`
+{
+	vec4 SourceSize;  //  NOTE(review): presumably what IN.video_size / IN.texture_size resolve to -- confirm in the included headers
+	vec4 OriginalSize;
+	vec4 OutputSize;  //  NOTE(review): presumably what IN.output_size resolves to -- confirm
+	uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO  //  std140 uniform block, instance name `global`
+{
+	mat4 MVP;  //  multiplied with Position in the vertex stage
+	float crt_gamma;  //  the floats from here on are never read as global.<name> in the visible code; presumably params.inc maps them to bare names -- confirm
+	float lcd_gamma;
+	float levels_contrast;
+	float halation_weight;
+	float diffusion_weight;
+	float bloom_underestimate_levels;
+	float bloom_excess;
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;
+	float convergence_offset_x_r;
+	float convergence_offset_x_g;
+	float convergence_offset_x_b;
+	float convergence_offset_y_r;
+	float convergence_offset_y_g;
+	float convergence_offset_y_b;
+	float mask_type;
+	float mask_sample_mode_desired;  //  NOTE(review): the fragment stage now calls get_mask_sample_mode(); presumably that reads this value -- confirm
+	float mask_num_triads_desired;
+	float mask_triad_size_desired;
+	float mask_specify_num_triads;
+	float aa_subpixel_r_offset_x_runtime;
+	float aa_subpixel_r_offset_y_runtime;
+	float aa_cubic_c;
+	float aa_gauss_sigma;
+	float geom_mode_runtime;
+	float geom_radius;
+	float geom_view_dist;
+	float geom_tilt_angle_x;
+	float geom_tilt_angle_y;
+	float geom_aspect_ratio_x;  //  read by bare name (with geom_aspect_ratio_y) in the vertex stage's aspect_ratio
+	float geom_aspect_ratio_y;
+	float geom_overscan_x;
+	float geom_overscan_y;
+	float border_size;
+	float border_darkness;
+	float border_compress;
+	float interlace_bff;
+	float interlace_1080i;
+} global;
 
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 
+#include "params.inc"
+#include "../../../../include/compat_macros.inc"
 #include "../user-settings.h"
 #include "derived-settings-and-constants.h"
 #include "bind-shader-params.h"
@@ -42,43 +91,41 @@ layout(push_constant) uniform Push
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
 layout(location = 1) in vec2 TexCoord;
-layout(location = 0) out vec2 tex_uv;
-layout(location = 1) out vec2 src_tex_uv_wrap;
-layout(location = 2) out vec2 resize_magnification_scale;
+layout(location = 0) out vec2 src_tex_uv_wrap;
+layout(location = 1) out vec2 resize_magnification_scale;
 
 void main()
 {
-   gl_Position = params.MVP * Position;
-   tex_uv = TexCoord;
-   
-    //  First estimate the viewport size (the user will get the wrong number of
+   gl_Position = global.MVP * Position;
+   float2 tex_uv = TexCoord;
+	//  First estimate the viewport size (the user will get the wrong number of
     //  triads if it's wrong and mask_specify_num_triads is 1.0/true).
-    const float viewport_y = registers.OutputSize.y / mask_resize_viewport_scale.y;
+    const float viewport_y = IN.output_size.y / mask_resize_viewport_scale.y;
     const float aspect_ratio = geom_aspect_ratio_x / geom_aspect_ratio_y;
-    const vec2 estimated_viewport_size =
-        vec2(viewport_y * aspect_ratio, viewport_y);
+    const float2 estimated_viewport_size =
+        float2(viewport_y * aspect_ratio, viewport_y);
     //  Estimate the output size of MASK_RESIZE (the next pass).  The estimated
     //  x component shouldn't matter, because we're not using the x result, and
     //  we're not swearing it's correct (if we did, the x result would influence
     //  the y result to maintain the tile aspect ratio).
-    const vec2 estimated_mask_resize_output_size =
-        vec2(registers.OutputSize.y * aspect_ratio, registers.OutputSize.y);
+    const float2 estimated_mask_resize_output_size =
+        float2(IN.output_size.y * aspect_ratio, IN.output_size.y);
     //  Find the final intended [y] size of our resized phosphor mask tiles,
     //  then the tile size for the current pass (resize y only):
-    const vec2 mask_resize_tile_size = get_resized_mask_tile_size(
+    float2 mask_resize_tile_size = get_resized_mask_tile_size(
         estimated_viewport_size, estimated_mask_resize_output_size, false);
-    const vec2 pass_output_tile_size = vec2(min(
-        mask_resize_src_lut_size.x, registers.OutputSize.x), mask_resize_tile_size.y);
+    float2 pass_output_tile_size = float2(min(
+        mask_resize_src_lut_size.x, IN.output_size.x), mask_resize_tile_size.y);
 
     //  We'll render resized tiles until filling the output FBO or meeting a
     //  limit, so compute [wrapped] tile uv coords based on the output uv coords
     //  and the number of tiles that will fit in the FBO.
-    const vec2 output_tiles_this_pass = registers.OutputSize.xy / pass_output_tile_size;
-    const vec2 output_video_uv = tex_uv;
-    const vec2 tile_uv_wrap = output_video_uv * output_tiles_this_pass;
+    const float2 output_tiles_this_pass = IN.output_size / pass_output_tile_size;
+    const float2 output_video_uv = tex_uv * IN.texture_size / IN.video_size;
+    const float2 tile_uv_wrap = output_video_uv * output_tiles_this_pass;
 
     //  The input LUT is just a single mask tile, so texture uv coords are the
-    //  same as tile uv coords (save fract() for the fragment shader).  The
+    //  same as tile uv coords (save frac() for the fragment shader).  The
     //  magnification scale is also straightforward:
     src_tex_uv_wrap = tile_uv_wrap;
     resize_magnification_scale =
@@ -86,69 +133,19 @@ void main()
 }
 
 #pragma stage fragment
-layout(location = 0) in vec2 tex_uv;
-layout(location = 1) in vec2 src_tex_uv_wrap;
-layout(location = 2) in vec2 resize_magnification_scale;
+layout(location = 0) in vec2 src_tex_uv_wrap;
+layout(location = 1) in vec2 resize_magnification_scale;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 #ifdef PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
-layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_large;
-layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_large;
-layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_large;
-
-void main()
-{
-    //  Resize the input phosphor mask tile to the final vertical size it will
-    //  appear on screen.  Keep 1x horizontal size if possible (IN.output_size
-    //  >= mask_resize_src_lut_size), and otherwise linearly sample horizontally
-    //  to fit exactly one tile.  Lanczos-resizing the phosphor mask achieves
-    //  much sharper results than mipmapping, and vertically resizing first
-    //  minimizes the total number of taps required.  We output a number of
-    //  resized tiles >= mask_resize_num_tiles for easier tiled sampling later.
-    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
-        //  Discard unneeded fragments in case our profile allows real branches.
-        const vec2 tile_uv_wrap = src_tex_uv_wrap;
-        if(params.mask_sample_mode_desired < 0.5 &&
-            tile_uv_wrap.y <= mask_resize_num_tiles)
-        {
-            const float src_dy = 1.0/mask_resize_src_lut_size.y;
-            const vec2 src_tex_uv = fract(src_tex_uv_wrap);
-            vec3 pixel_color;
-            //  If mask_type is static, this branch will be resolved statically.
-            if(params.mask_type < 0.5)
-            {
-                pixel_color = downsample_vertical_sinc_tiled(
-                    mask_grille_texture_large, src_tex_uv, mask_resize_src_lut_size,
-                    src_dy, resize_magnification_scale.y, 1.0);
-            }
-            else if(params.mask_type < 1.5)
-            {
-                pixel_color = downsample_vertical_sinc_tiled(
-                    mask_slot_texture_large, src_tex_uv, mask_resize_src_lut_size,
-                    src_dy, resize_magnification_scale.y, 1.0);
-            }
-            else
-            {
-                pixel_color = downsample_vertical_sinc_tiled(
-                    mask_shadow_texture_large, src_tex_uv, mask_resize_src_lut_size,
-                    src_dy, resize_magnification_scale.y, 1.0);
-            }
-            //  The input LUT was linear RGB, and so is our output:
-            FragColor = vec4(pixel_color, 1.0);
-        }
-        else
-        {
-            discard;
-        }
-    #else
-        discard;
-        FragColor = vec4(1.0);
-    #endif
-}
+	layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_large;
+	layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_large;
+	layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_large;
 #else
-layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_small;
-layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_small;
-layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_small;
+	layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_small;
+	layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_small;
+	layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_small;
+#endif
 
 void main()
 {
@@ -159,36 +156,58 @@ void main()
     //  much sharper results than mipmapping, and vertically resizing first
     //  minimizes the total number of taps required.  We output a number of
     //  resized tiles >= mask_resize_num_tiles for easier tiled sampling later.
+    //const float2 src_tex_uv_wrap = src_tex_uv_wrap;
     #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
         //  Discard unneeded fragments in case our profile allows real branches.
-        const vec2 tile_uv_wrap = src_tex_uv_wrap;
-        if(params.mask_sample_mode_desired < 0.5 &&
+        const float2 tile_uv_wrap = src_tex_uv_wrap;
+        if(get_mask_sample_mode() < 0.5 &&
             tile_uv_wrap.y <= mask_resize_num_tiles)
         {
-            const float src_dy = 1.0/mask_resize_src_lut_size.y;
-            const vec2 src_tex_uv = fract(src_tex_uv_wrap);
-            vec3 pixel_color;
+            static const float src_dy = 1.0/mask_resize_src_lut_size.y;
+            const float2 src_tex_uv = frac(src_tex_uv_wrap);
+            float3 pixel_color;
             //  If mask_type is static, this branch will be resolved statically.
-            if(params.mask_type < 0.5)
-            {
-                pixel_color = downsample_vertical_sinc_tiled(
-                    mask_grille_texture_small, src_tex_uv, mask_resize_src_lut_size,
-                    src_dy, resize_magnification_scale.y, 1.0);
-            }
-            else if(params.mask_type < 1.5)
-            {
-                pixel_color = downsample_vertical_sinc_tiled(
-                    mask_slot_texture_small, src_tex_uv, mask_resize_src_lut_size,
-                    src_dy, resize_magnification_scale.y, 1.0);
-            }
-            else
-            {
-                pixel_color = downsample_vertical_sinc_tiled(
-                    mask_shadow_texture_small, src_tex_uv, mask_resize_src_lut_size,
-                    src_dy, resize_magnification_scale.y, 1.0);
-            }
+			#ifdef PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+				if(mask_type < 0.5)
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_grille_texture_large, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+				else if(mask_type < 1.5)
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_slot_texture_large, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+				else
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_shadow_texture_large, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+			#else
+				if(mask_type < 0.5)
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_grille_texture_small, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+				else if(mask_type < 1.5)
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_slot_texture_small, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+				else
+				{
+					pixel_color = downsample_vertical_sinc_tiled(
+						mask_shadow_texture_small, src_tex_uv, mask_resize_src_lut_size,
+						src_dy, resize_magnification_scale.y, 1.0);
+				}
+			#endif
             //  The input LUT was linear RGB, and so is our output:
-            FragColor = vec4(pixel_color, 1.0);
+            FragColor = float4(pixel_color, 1.0);
         }
         else
         {
@@ -196,7 +215,6 @@ void main()
         }
     #else
         discard;
-        FragColor = vec4(1.0);
-    #endif
-}
-#endif
\ No newline at end of file
+        FragColor = float4(1.0);
+	#endif
+}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask-intel.slang b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask-intel.slang
new file mode 100644
index 0000000..b38d831
--- /dev/null
+++ b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask-intel.slang
@@ -0,0 +1,4 @@
+#version 450
+
+#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+#include "crt-royale-scanlines-horizontal-apply-mask.h"
diff --git a/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.h b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.h
new file mode 100644
index 0000000..459a80a
--- /dev/null
+++ b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.h
@@ -0,0 +1,364 @@
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+layout(push_constant) uniform Push  //  Members are read as params.<name>.
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO  //  Read as global.<name>.
+{
+	mat4 MVP;
+	float crt_gamma;
+	float lcd_gamma;
+	float levels_contrast;
+	float halation_weight;
+	float diffusion_weight;
+	float bloom_underestimate_levels;
+	float bloom_excess;
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;
+	float convergence_offset_x_r;
+	float convergence_offset_x_g;
+	float convergence_offset_x_b;
+	float convergence_offset_y_r;
+	float convergence_offset_y_g;
+	float convergence_offset_y_b;
+	float mask_type;
+	float mask_sample_mode_desired;
+	float mask_num_triads_desired;
+	float mask_triad_size_desired;
+	float mask_specify_num_triads;
+	float aa_subpixel_r_offset_x_runtime;
+	float aa_subpixel_r_offset_y_runtime;
+	float aa_cubic_c;
+	float aa_gauss_sigma;
+	float geom_mode_runtime;
+	float geom_radius;
+	float geom_view_dist;
+	float geom_tilt_angle_x;
+	float geom_tilt_angle_y;
+	float geom_aspect_ratio_x;
+	float geom_aspect_ratio_y;
+	float geom_overscan_x;
+	float geom_overscan_y;
+	float border_size;
+	float border_darkness;
+	float border_compress;
+	float interlace_bff;
+	float interlace_1080i;
+	vec4 VERTICAL_SCANLINESSize;  //  .xy read via VERTICAL_SCANLINES*_size macros.
+	vec4 BLOOM_APPROXSize;  //  .xy read via BLOOM_APPROX*_size macros.
+	vec4 HALATION_BLURSize;  //  .xy read via HALATION_BLUR*_size macros.
+	vec4 MASK_RESIZESize;  //  .xy read via MASK_RESIZE*_size macros.
+} global;
+
+#define VERTICAL_SCANLINEStexture VERTICAL_SCANLINES
+#define VERTICAL_SCANLINEStexture_size global.VERTICAL_SCANLINESSize.xy
+#define VERTICAL_SCANLINESvideo_size global.VERTICAL_SCANLINESSize.xy
+#define BLOOM_APPROXtexture BLOOM_APPROX
+#define BLOOM_APPROXtexture_size global.BLOOM_APPROXSize.xy
+#define BLOOM_APPROXvideo_size global.BLOOM_APPROXSize.xy
+#define HALATION_BLURtexture HALATION_BLUR
+#define HALATION_BLURtexture_size global.HALATION_BLURSize.xy
+#define HALATION_BLURvideo_size global.HALATION_BLURSize.xy
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE  //  Set by the -intel.slang wrapper.
+	#define MASK_RESIZEtexture Source
+#else  //  Default: sample the separately bound MASK_RESIZE sampler.
+	#define MASK_RESIZEtexture MASK_RESIZE
+#endif  //  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+#define MASK_RESIZEtexture_size global.MASK_RESIZESize.xy
+#define MASK_RESIZEvideo_size global.MASK_RESIZESize.xy
+
+float bloom_approx_scale_x = params.OutputSize.x / params.SourceSize.y;  //  NOTE(review): width/height -- confirm .y is intended.
+const float max_viewport_size_x = 1080.0*1024.0*(4.0/3.0);
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#include "params.inc"
+#include "../../../../include/compat_macros.inc"
+#include "../user-settings.h"
+#include "derived-settings-and-constants.h"
+#include "bind-shader-params.h"
+
+
+///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
+
+#include "scanline-functions.h"
+#include "phosphor-mask-resizing.h"
+#include "../../../../include/gamma-management.h"
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+inline float4 tex2Dtiled_mask_linearize(const sampler2D tex,
+    const float2 tex_uv)
+{
+    //  If manually tiling a texture, anisotropic filtering can get confused.
+    //  Workaround: force the lowest mip level (LOD 0.0, else a -16.0 bias):
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+            //  TODO: Use tex2Dlod_linearize with a calculated mip level.
+            return tex2Dlod_linearize(tex, float4(tex_uv, 0.0, 0.0));
+        #else  //  !ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+            #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+                return tex2Dbias_linearize(tex, float4(tex_uv, 0.0, -16.0));
+            #else  //  Neither compat path is defined: plain sample.
+                return tex2D_linearize(tex, tex_uv);
+            #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #endif  //  ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #else  //  !PHOSPHOR_MASK_MANUALLY_RESIZE
+        return tex2D_linearize(tex, tex_uv);
+    #endif  //  PHOSPHOR_MASK_MANUALLY_RESIZE
+}
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 video_uv;
+layout(location = 1) out vec2 scanline_tex_uv;
+layout(location = 2) out vec2 blur3x3_tex_uv;
+layout(location = 3) out vec2 halation_tex_uv;
+layout(location = 4) out vec2 scanline_texture_size_inv;
+layout(location = 5) out vec4 mask_tile_start_uv_and_size;
+layout(location = 6) out vec2 mask_tiles_per_screen;
+
+void main()
+{
+   gl_Position = global.MVP * Position;
+   float2 tex_uv = TexCoord;
+    //  Our various input textures use different coords:
+    video_uv = tex_uv * IN.texture_size/IN.video_size;
+    scanline_texture_size_inv =
+        float2(1.0, 1.0)/VERTICAL_SCANLINEStexture_size;
+    //  Each texture's uv = video_uv * its video size / its texture size:
+    scanline_tex_uv = video_uv * VERTICAL_SCANLINESvideo_size *
+        scanline_texture_size_inv;
+    blur3x3_tex_uv = video_uv * BLOOM_APPROXvideo_size /
+        BLOOM_APPROXtexture_size;
+    halation_tex_uv = video_uv * HALATION_BLURvideo_size /
+        HALATION_BLURtexture_size;
+    //  (The stage outputs above are assigned directly; no local copies.)
+
+    //  Get a consistent name for the final mask texture size.  Sample mode 0
+    //  uses the manually resized mask, but ignore it if we never resized.
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        const float mask_sample_mode = get_mask_sample_mode();
+        const float2 mask_resize_texture_size = mask_sample_mode < 0.5 ?
+            MASK_RESIZEtexture_size : mask_texture_large_size;
+        const float2 mask_resize_video_size = mask_sample_mode < 0.5 ?
+            MASK_RESIZEvideo_size : mask_texture_large_size;
+    #else
+        const float2 mask_resize_texture_size = mask_texture_large_size;
+        const float2 mask_resize_video_size = mask_texture_large_size;
+    #endif
+    //  Compute mask tile dimensions, starting points, etc.:
+    //  mask_tiles_per_screen (a stage output) is passed as the last argument:
+    mask_tile_start_uv_and_size = get_mask_sampling_parameters(
+        mask_resize_texture_size, mask_resize_video_size, IN.output_size,
+        mask_tiles_per_screen);
+    //  (Presumably filled in by the call via an out parameter -- confirm.)
+}
+
+#pragma stage fragment
+layout(location = 0) in vec2 video_uv;
+layout(location = 1) in vec2 scanline_tex_uv;
+layout(location = 2) in vec2 blur3x3_tex_uv;
+layout(location = 3) in vec2 halation_tex_uv;
+layout(location = 4) in vec2 scanline_texture_size_inv;
+layout(location = 5) in vec4 mask_tile_start_uv_and_size;
+layout(location = 6) in vec2 mask_tiles_per_screen;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_large;
+layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_large;
+layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_large;
+layout(set = 0, binding = 6) uniform sampler2D VERTICAL_SCANLINES;
+layout(set = 0, binding = 7) uniform sampler2D BLOOM_APPROX;
+layout(set = 0, binding = 8) uniform sampler2D HALATION_BLUR;
+#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+layout(set = 0, binding = 9) uniform sampler2D MASK_RESIZE;
+#endif
+
+//////////////////////////////  FRAGMENT INCLUDES  //////////////////////////////
+
+#include "bloom-functions.h"
+
+void main()
+{
+    //  This pass: Sample (misconverged?) scanlines to the final horizontal
+    //  resolution, apply halation (bouncing electrons), and apply the phosphor
+    //  mask.  Fake a bloom if requested.  Unless we fake a bloom, the output
+    //  will be dim from the scanline auto-dim, mask dimming, and low gamma.
+
+    //  Horizontally sample the current row (a vertically interpolated scanline)
+    //  and account for horizontal convergence offsets, given in units of texels.
+    const float3 scanline_color_dim = sample_rgb_scanline_horizontal(
+        VERTICAL_SCANLINEStexture, scanline_tex_uv,
+        VERTICAL_SCANLINEStexture_size, scanline_texture_size_inv);
+    const float auto_dim_factor = levels_autodim_temp;
+
+    //  Sample the phosphor mask:
+    const float2 tile_uv_wrap = video_uv * mask_tiles_per_screen;
+    const float2 mask_tex_uv = convert_phosphor_tile_uv_wrap_to_tex_uv(
+        tile_uv_wrap, mask_tile_start_uv_and_size);
+    float3 phosphor_mask_sample;
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        const bool sample_orig_luts = get_mask_sample_mode() > 0.5;
+    #else  //  !PHOSPHOR_MASK_MANUALLY_RESIZE: always sample the original LUTs.
+        static const bool sample_orig_luts = true;
+    #endif  //  PHOSPHOR_MASK_MANUALLY_RESIZE
+    if(sample_orig_luts)
+    {
+        //  If mask_type is static, this branch will be resolved statically.
+        if(mask_type < 0.5)
+        {
+            phosphor_mask_sample = tex2D_linearize(
+                mask_grille_texture_large, mask_tex_uv).rgb;
+        }
+        else if(mask_type < 1.5)
+        {
+            phosphor_mask_sample = tex2D_linearize(
+                mask_slot_texture_large, mask_tex_uv).rgb;
+        }
+        else
+        {
+            phosphor_mask_sample = tex2D_linearize(
+                mask_shadow_texture_large, mask_tex_uv).rgb;
+        }
+    }
+    else
+    {
+        //  Sample the resized mask, and avoid tiling artifacts:
+        phosphor_mask_sample = tex2Dtiled_mask_linearize(
+            MASK_RESIZEtexture, mask_tex_uv).rgb;
+    }
+
+    //  Sample the halation texture (auto-dim to match the scanlines), and
+    //  account for both horizontal and vertical convergence offsets, given
+    //  in units of texels horizontally and same-field scanlines vertically:
+    const float3 halation_color = tex2D_linearize(
+        HALATION_BLURtexture, halation_tex_uv).rgb;
+
+    //  Apply halation: Halation models electrons flying around under the glass
+    //  and hitting the wrong phosphors (of any color).  It desaturates, so
+    //  average the halation electrons to a scalar.  Reduce the local scanline
+    //  intensity accordingly to conserve energy.
+    const float3 halation_intensity_dim =
+        float3(dot(halation_color, float3(auto_dim_factor/3.0)));
+    const float3 electron_intensity_dim = lerp(scanline_color_dim,
+        halation_intensity_dim, global.halation_weight);
+
+    //  Apply the phosphor mask:
+    const float3 phosphor_emission_dim = electron_intensity_dim *
+        phosphor_mask_sample;
+
+    #ifdef PHOSPHOR_BLOOM_FAKE
+        //  The BLOOM_APPROX pass approximates a blurred version of a masked
+        //  and scanlined image.  It's usually used to compute the brightpass,
+        //  but we can also use it to fake the bloom stage entirely.  Caveats:
+        //  1.) A fake bloom is conceptually different, since we're mixing in a
+        //      fully blurred low-res image, and the biggest implications are:
+        //  2.) If mask_amplify is incorrect, results deteriorate more quickly.
+        //  3.) The inaccurate blurring hurts quality in high-contrast areas.
+        //  4.) The bloom_underestimate_levels parameter seems less sensitive.
+        //  Reverse the auto-dimming and amplify to compensate for mask dimming:
+		#define PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
+        #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
+            static const float blur_contrast = 1.05;
+        #else
+            static const float blur_contrast = 1.0;
+        #endif
+        const float mask_amplify = get_mask_amplify();
+        const float undim_factor = 1.0/auto_dim_factor;
+        const float3 phosphor_emission =
+            phosphor_emission_dim * undim_factor * mask_amplify;
+        //  Get a phosphor blur estimate, accounting for convergence offsets:
+        const float3 electron_intensity = electron_intensity_dim * undim_factor;
+        const float3 phosphor_blur_approx_soft = tex2D_linearize(
+            BLOOM_APPROXtexture, blur3x3_tex_uv).rgb;
+        const float3 phosphor_blur_approx = lerp(phosphor_blur_approx_soft,
+            electron_intensity, 0.1) * blur_contrast;
+        //  We could blend between phosphor_emission and phosphor_blur_approx,
+        //  solving for the minimum blend_ratio that avoids clipping past 1.0:
+        //      1.0 >= total_intensity
+        //      1.0 >= phosphor_emission * (1.0 - blend_ratio) +
+        //              phosphor_blur_approx * blend_ratio
+        //      blend_ratio = (phosphor_emission - 1.0)/
+        //          (phosphor_emission - phosphor_blur_approx);
+        //  However, this blurs far more than necessary, because it aims for
+        //  full brightness, not minimal blurring.  To fix it, base blend_ratio
+        //  on a max area intensity only so it varies more smoothly:
+        const float3 phosphor_blur_underestimate =
+            phosphor_blur_approx * bloom_underestimate_levels;
+        const float3 area_max_underestimate =
+            phosphor_blur_underestimate * mask_amplify;
+        #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
+            const float3 blend_ratio_temp =
+                (area_max_underestimate - float3(1.0, 1.0, 1.0)) /
+                (area_max_underestimate - phosphor_blur_underestimate);
+        #else
+            //  Try doing it like an area-based brightpass.  This is nearly
+            //  identical, but it's worth toying with the code in case I ever
+            //  find a way to make it look more like a real bloom.  (I've had
+            //  some promising textures from combining an area-based blend ratio
+            //  for the phosphor blur and a more brightpass-like blend-ratio for
+            //  the phosphor emission, but I haven't found a way to make the
+            //  brightness correct across the whole color range, especially with
+            //  different bloom_underestimate_levels values.)
+            const float desired_triad_size = lerp(global.mask_triad_size_desired,
+                IN.output_size.x/global.mask_num_triads_desired,
+                global.mask_specify_num_triads);
+            const float bloom_sigma = get_min_sigma_to_blur_triad(
+                desired_triad_size, bloom_diff_thresh);
+            const float center_weight = get_center_weight(bloom_sigma);
+            const float3 max_area_contribution_approx =
+                max(float3(0.0, 0.0, 0.0), phosphor_blur_approx -
+                center_weight * phosphor_emission);
+            const float3 area_contrib_underestimate =
+                bloom_underestimate_levels * max_area_contribution_approx;
+            const float3 blend_ratio_temp =
+                ((float3(1.0, 1.0, 1.0) - area_contrib_underestimate) /
+                area_max_underestimate - float3(1.0, 1.0, 1.0)) / (center_weight - 1.0);
+        #endif
+        //  Clamp blend_ratio in case it's out-of-range, but be SUPER careful:
+        //  min/max/clamp are BIZARRELY broken with lerp (optimization bug?),
+        //  and this redundant sequence avoids bugs, at least on nVidia cards:
+        const float3 blend_ratio_clamped = max(clamp(blend_ratio_temp, 0.0, 1.0), 0.0);
+        const float3 blend_ratio = lerp(blend_ratio_clamped, float3(1.0,1.0,1.0), global.bloom_excess);
+        //  Blend the blurred and unblurred images:
+        const float3 phosphor_emission_unclipped =
+            lerp(phosphor_emission, phosphor_blur_approx, blend_ratio);
+        //  Simulate refractive diffusion by reusing the halation sample.
+        const float3 pixel_color = lerp(phosphor_emission_unclipped,
+            halation_color, global.diffusion_weight);
+    #else  //  !PHOSPHOR_BLOOM_FAKE: pass the dim masked emission through.
+        const float3 pixel_color = phosphor_emission_dim;
+    #endif  //  PHOSPHOR_BLOOM_FAKE
+    //  Encode if necessary, and output.
+    FragColor = encode_output(float4(pixel_color, 1.0));
+}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang
index fb2afa8..5303e71 100755
--- a/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang
@@ -1,294 +1,2 @@
 #version 450
-
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OutputSize;
-	vec4 MASK_RESIZESize;
-	vec4 ORIG_LINEARIZEDSize;
-	vec4 VERTICAL_SCANLINESSize;
-	vec4 BLOOM_APPROXSize;
-	vec4 HALATION_BLURSize;
-} registers;
-
-#include "params.inc"
-
-/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
-
-//  crt-royale: A full-featured CRT shader, with cheese.
-//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
-//
-//  This program is free software; you can redistribute it and/or modify it
-//  under the terms of the GNU General Public License as published by the Free
-//  Software Foundation; either version 2 of the License, or any later version.
-//
-//  This program is distributed in the hope that it will be useful, but WITHOUT
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
-//  more details.
-//
-//  You should have received a copy of the GNU General Public License along with
-//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-//  Place, Suite 330, Boston, MA 02111-1307 USA
-
-
-/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
-
-#include "../user-settings.h"
-#include "derived-settings-and-constants.h"
-#include "bind-shader-params.h"
-
-
-//////////////////////////////////  INCLUDES  //////////////////////////////////
-
-#include "scanline-functions.h"
-#include "phosphor-mask-resizing.h"
-#include "bloom-functions.h"//"bloom-functions.h"
-#include "../../../../include/gamma-management.h"
-
-///////////////////////////////////  HELPERS  //////////////////////////////////
-
-vec4 tex2Dtiled_mask_linearize(const sampler2D tex,
-    const vec2 tex_uv)
-{
-    //  If we're manually tiling a texture, anisotropic filtering can get
-    //  confused.  One workaround is to just select the lowest mip level:
-    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
-        #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
-            //  TODO: Use tex2Dlod_linearize with a calculated mip level.
-            return tex2Dlod_linearize(tex, vec4(tex_uv, 0.0, 0.0));
-        #else
-            #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
-                return tex2Dbias_linearize(tex, float4(tex_uv, 0.0, -16.0));
-            #else
-                return tex2D_linearize(tex, tex_uv);
-            #endif
-        #endif
-    #else
-        return tex2D_linearize(tex, tex_uv);
-    #endif
-}
-
-#pragma stage vertex
-layout(location = 0) in vec4 Position;
-layout(location = 1) in vec2 TexCoord;
-layout(location = 0) out vec2 video_uv;
-layout(location = 1) out vec2 scanline_tex_uv;
-layout(location = 2) out vec2 blur3x3_tex_uv;
-layout(location = 3) out vec2 halation_tex_uv;
-layout(location = 4) out vec2 scanline_texture_size_inv;
-layout(location = 5) out vec4 mask_tile_start_uv_and_size;
-layout(location = 6) out vec2 mask_tiles_per_screen;
-
-void main()
-{
-   gl_Position = params.MVP * Position;
-   
-   //  Our various input textures use different coords.
-   video_uv = TexCoord;
-    scanline_texture_size_inv =
-        registers.VERTICAL_SCANLINESSize.zw;
-	scanline_tex_uv = video_uv;// * registers.VERTICAL_SCANLINESSize.xy *
-        scanline_texture_size_inv;
-	blur3x3_tex_uv = video_uv;// * registers.BLOOM_APPROXSize.xy *
-        registers.BLOOM_APPROXSize.zw;
-	halation_tex_uv = video_uv;// * registers.HALATION_BLURSize.xy *
-        registers.HALATION_BLURSize.zw;
-		
-	//  Get a consistent name for the final mask texture size.  Sample mode 0
-    //  uses the manually resized mask, but ignore it if we never resized.
-    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
-        const float mask_sample_mode = params.mask_sample_mode_desired;//get_mask_sample_mode();
-        vec2 mask_resize_texture_size = registers.MASK_RESIZESize.xy;
-			if(mask_sample_mode > 0.5) mask_resize_texture_size = mask_texture_large_size;
-        vec2 mask_resize_video_size = registers.MASK_RESIZESize.xy;
-			if(mask_sample_mode > 0.5) mask_resize_video_size = mask_texture_large_size;
-    #else
-        const vec2 mask_resize_texture_size = mask_texture_large_size;
-        const vec2 mask_resize_video_size = mask_texture_large_size;
-    #endif
-//		mask_tiles_per_screen = vec2(1280.0, 480.0);
-	
-	//  Compute mask tile dimensions, starting points, etc.:
-        mask_tile_start_uv_and_size = get_mask_sampling_parameters(
-        mask_resize_texture_size, mask_resize_video_size, registers.OutputSize.xy,
-        mask_tiles_per_screen);
-}
-
-#pragma stage fragment
-#pragma format R8G8B8A8_SRGB
-layout(location = 0) in vec2 video_uv;
-layout(location = 1) in vec2 scanline_tex_uv;
-layout(location = 2) in vec2 blur3x3_tex_uv;
-layout(location = 3) in vec2 halation_tex_uv;
-layout(location = 4) in vec2 scanline_texture_size_inv;
-layout(location = 5) in vec4 mask_tile_start_uv_and_size;
-layout(location = 6) in vec2 mask_tiles_per_screen;
-layout(location = 0) out vec4 FragColor;
-layout(set = 0, binding = 2) uniform sampler2D Source;
-layout(set = 0, binding = 3) uniform sampler2D mask_grille_texture_large;
-layout(set = 0, binding = 4) uniform sampler2D mask_slot_texture_large;
-layout(set = 0, binding = 5) uniform sampler2D mask_shadow_texture_large;
-layout(set = 0, binding = 6) uniform sampler2D VERTICAL_SCANLINES;
-layout(set = 0, binding = 7) uniform sampler2D BLOOM_APPROX;
-layout(set = 0, binding = 8) uniform sampler2D HALATION_BLUR;
-#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
-layout(set = 0, binding = 9) uniform sampler2D MASK_RESIZE;
-#endif
-
-void main()
-{
-    //  This pass: Sample (misconverged?) scanlines to the final horizontal
-    //  resolution, apply halation (bouncing electrons), and apply the phosphor
-    //  mask.  Fake a bloom if requested.  Unless we fake a bloom, the output
-    //  will be dim from the scanline auto-dim, mask dimming, and low gamma.
-
-    //  Horizontally sample the current row (a vertically interpolated scanline)
-    //  and account for horizontal convergence offsets, given in units of texels.
-    const vec3 scanline_color_dim = sample_rgb_scanline_horizontal(
-        VERTICAL_SCANLINES, scanline_tex_uv,
-        registers.VERTICAL_SCANLINESSize.xy, scanline_texture_size_inv);
-    const float auto_dim_factor = levels_autodim_temp;
-	
-	//  Sample the phosphor mask:
-    const vec2 tile_uv_wrap = video_uv * mask_tiles_per_screen;
-    const vec2 mask_tex_uv = convert_phosphor_tile_uv_wrap_to_tex_uv(
-        tile_uv_wrap, mask_tile_start_uv_and_size);
-    vec3 phosphor_mask_sample;
-    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
-        bool sample_orig_luts = true;
-			if (params.mask_sample_mode_desired < 0.5) sample_orig_luts = false;
-    #else
-        const bool sample_orig_luts = true;
-    #endif
-	
-	if(sample_orig_luts == true)
-    {
-        //  If mask_type is static, this branch will be resolved statically.
-        if(params.mask_type < 0.5)
-        {
-            phosphor_mask_sample = tex2D_linearize(
-                mask_grille_texture_large, mask_tex_uv).rgb;
-        }
-        else if(params.mask_type < 1.5)
-        {
-            phosphor_mask_sample = tex2D_linearize(
-                mask_slot_texture_large, mask_tex_uv).rgb;
-        }
-        else
-        {
-            phosphor_mask_sample = tex2D_linearize(
-                mask_shadow_texture_large, mask_tex_uv).rgb;
-        }
-    }
-    else
-    {
-        //  Sample the resized mask, and avoid tiling artifacts:
-        phosphor_mask_sample = tex2Dtiled_mask_linearize(
-            MASK_RESIZE, mask_tex_uv).rgb;
-    }
-	
-	//  Sample the halation texture (auto-dim to match the scanlines), and
-    //  account for both horizontal and vertical convergence offsets, given
-    //  in units of texels horizontally and same-field scanlines vertically:
-    const vec3 halation_color = tex2D_linearize(
-        HALATION_BLUR, halation_tex_uv).rgb;
-		
-	//  Apply halation: Halation models electrons flying around under the glass
-    //  and hitting the wrong phosphors (of any color).  It desaturates, so
-    //  average the halation electrons to a scalar.  Reduce the local scanline
-    //  intensity accordingly to conserve energy.
-    const vec3 halation_intensity_dim =
-        vec3(dot(halation_color, vec3(auto_dim_factor/3.0)));
-    const vec3 electron_intensity_dim = mix(scanline_color_dim,
-        halation_intensity_dim, params.halation_weight);
-		
-	//  Apply the phosphor mask:
-    const vec3 phosphor_emission_dim = electron_intensity_dim *
-        phosphor_mask_sample;
-//		#define PHOSPHOR_BLOOM_FAKE // TODO/FIXME: something seems wrong with the non-FAKE path
-	#ifdef PHOSPHOR_BLOOM_FAKE
-        //  The BLOOM_APPROX pass approximates a blurred version of a masked
-        //  and scanlined image.  It's usually used to compute the brightpass,
-        //  but we can also use it to fake the bloom stage entirely.  Caveats:
-        //  1.) A fake bloom is conceptually different, since we're mixing in a
-        //      fully blurred low-res image, and the biggest implication are:
-        //  2.) If mask_amplify is incorrect, results deteriorate more quickly.
-        //  3.) The inaccurate blurring hurts quality in high-contrast areas.
-        //  4.) The bloom_underestimate_levels parameter seems less sensitive.
-        //  Reverse the auto-dimming and amplify to compensate for mask dimming:
-        #define PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
-        #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
-            const float blur_contrast = 1.05;
-        #else
-            const float blur_contrast = 1.0;
-        #endif
-        const float mask_amplify = get_mask_amplify();
-        const float undim_factor = 1.0/auto_dim_factor;
-        const vec3 phosphor_emission =
-            phosphor_emission_dim * undim_factor * mask_amplify;
-        //  Get a phosphor blur estimate, accounting for convergence offsets:
-        const vec3 electron_intensity = electron_intensity_dim * undim_factor;
-        const vec3 phosphor_blur_approx_soft = tex2D_linearize(
-            BLOOM_APPROX, blur3x3_tex_uv).rgb;
-        const vec3 phosphor_blur_approx = mix(phosphor_blur_approx_soft,
-            electron_intensity, 0.1) * blur_contrast;
-        //  We could blend between phosphor_emission and phosphor_blur_approx,
-        //  solving for the minimum blend_ratio that avoids clipping past 1.0:
-        //      1.0 >= total_intensity
-        //      1.0 >= phosphor_emission * (1.0 - blend_ratio) +
-        //              phosphor_blur_approx * blend_ratio
-        //      blend_ratio = (phosphor_emission - 1.0)/
-        //          (phosphor_emission - phosphor_blur_approx);
-        //  However, this blurs far more than necessary, because it aims for
-        //  full brightness, not minimal blurring.  To fix it, base blend_ratio
-        //  on a max area intensity only so it varies more smoothly:
-        const vec3 phosphor_blur_underestimate =
-            phosphor_blur_approx * params.bloom_underestimate_levels;
-        const vec3 area_max_underestimate =
-            phosphor_blur_underestimate * mask_amplify;
-        #ifdef PHOSPHOR_BLOOM_FAKE_WITH_SIMPLE_BLEND
-            const vec3 blend_ratio_temp =
-                (area_max_underestimate - vec3(1.0)) /
-                (area_max_underestimate - phosphor_blur_underestimate);
-        #else
-            //  Try doing it like an area-based brightpass.  This is nearly
-            //  identical, but it's worth toying with the code in case I ever
-            //  find a way to make it look more like a real bloom.  (I've had
-            //  some promising textures from combining an area-based blend ratio
-            //  for the phosphor blur and a more brightpass-like blend-ratio for
-            //  the phosphor emission, but I haven't found a way to make the
-            //  brightness correct across the whole color range, especially with
-            //  different bloom_underestimate_levels values.)
-            const float desired_triad_size = mix(params.mask_triad_size_desired,
-                registers.OutputSize.x/params.mask_num_triads_desired,
-                params.mask_specify_num_triads);
-            const float bloom_sigma = get_min_sigma_to_blur_triad(
-                desired_triad_size, bloom_diff_thresh);
-            const float center_weight = get_center_weight(bloom_sigma);
-            const vec3 max_area_contribution_approx =
-                max(vec3(0.0), phosphor_blur_approx -
-                center_weight * phosphor_emission);
-            const vec3 area_contrib_underestimate =
-                params.bloom_underestimate_levels * max_area_contribution_approx;
-            const vec3 blend_ratio_temp =
-                ((vec3(1.0) - area_contrib_underestimate) /
-                area_max_underestimate - vec3(1.0)) / (center_weight - 1.0);
-        #endif
-		//  Clamp blend_ratio in case it's out-of-range, but be SUPER careful:
-        //  min/max/clamp are BIZARRELY broken with lerp (optimization bug?),
-        //  and this redundant sequence avoids bugs, at least on nVidia cards:
-        const vec3 blend_ratio_clamped = max(clamp(blend_ratio_temp, 0.0, 1.0), 0.0);
-        const vec3 blend_ratio = mix(blend_ratio_clamped, vec3(1.0), params.bloom_excess);
-        //  Blend the blurred and unblurred images:
-        const vec3 phosphor_emission_unclipped =
-            mix(phosphor_emission, phosphor_blur_approx, blend_ratio);
-        //  Simulate refractive diffusion by reusing the halation sample.
-        const vec3 pixel_color = mix(phosphor_emission_unclipped,
-            halation_color, params.diffusion_weight);
-    #else
-        const vec3 pixel_color = phosphor_emission_dim;
-    #endif
-	
-   FragColor = encode_output(vec4(pixel_color, 1.0));
-}
+#include "crt-royale-scanlines-horizontal-apply-mask.h"
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang b/crt/shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang
index cfc0e64..423ed38 100755
--- a/crt/shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang
@@ -1,15 +1,5 @@
 #version 450
 
-layout(push_constant) uniform Push
-{
-	vec4 SourceSize;
-	vec4 OriginalSize;
-	vec4 OutputSize;
-	uint FrameCount;
-} registers;
-
-#include "params.inc"
-
 /////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
 
 //  crt-royale: A full-featured CRT shader, with cheese.
@@ -28,10 +18,67 @@ layout(push_constant) uniform Push
 //  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 //  Place, Suite 330, Boston, MA 02111-1307 USA
 
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO
+{
+	mat4 MVP;
+	float crt_gamma;
+	float lcd_gamma;
+	float levels_contrast;
+	float halation_weight;
+	float diffusion_weight;
+	float bloom_underestimate_levels;
+	float bloom_excess;
+	float beam_min_sigma;
+	float beam_max_sigma;
+	float beam_spot_power;
+	float beam_min_shape;
+	float beam_max_shape;
+	float beam_shape_power;
+	float beam_horiz_filter;
+	float beam_horiz_sigma;
+	float beam_horiz_linear_rgb_weight;
+	float convergence_offset_x_r;
+	float convergence_offset_x_g;
+	float convergence_offset_x_b;
+	float convergence_offset_y_r;
+	float convergence_offset_y_g;
+	float convergence_offset_y_b;
+	float mask_type;
+	float mask_sample_mode_desired;
+	float mask_num_triads_desired;
+	float aa_subpixel_r_offset_x_runtime;
+	float aa_subpixel_r_offset_y_runtime;
+	float aa_cubic_c;
+	float aa_gauss_sigma;
+	float geom_mode_runtime;
+	float geom_radius;
+	float geom_view_dist;
+	float geom_tilt_angle_x;
+	float geom_tilt_angle_y;
+	float geom_aspect_ratio_x;
+	float geom_aspect_ratio_y;
+	float geom_overscan_x;
+	float geom_overscan_y;
+	float border_size;
+	float border_darkness;
+	float border_compress;
+	float interlace_bff;
+	float interlace_1080i;
+} global;
 
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
-//#include "../user-settings.h"
+#include "params.inc"
+#include "../../../../include/compat_macros.inc"
+#include "../user-settings.h"
 #include "derived-settings-and-constants.h"
 #include "bind-shader-params.h"
 #include "scanline-functions.h"
@@ -41,46 +88,51 @@ layout(push_constant) uniform Push
 layout(location = 0) in vec4 Position;
 layout(location = 1) in vec2 TexCoord;
 layout(location = 0) out vec2 tex_uv;
-layout(location = 1) out vec2 uv_step;
-layout(location = 2) out vec2 il_step_multiple;
-layout(location = 3) out float pixel_height_in_scanlines;
+layout(location = 1) out vec2 uv_step;                     //  uv size of a texel (x) and scanline (y)
+layout(location = 2) out vec2 il_step_multiple;            //  (1, 1) = progressive, (1, 2) = interlaced
+layout(location = 3) out float pixel_height_in_scanlines;  //  Height of an output pixel in scanlines
+layout(location = 4) out float sigma_range;
+layout(location = 5) out float shape_range;
 
 void main()
 {
-   gl_Position = params.MVP * Position;
-   tex_uv = TexCoord;
+   gl_Position = global.MVP * Position;
+   tex_uv = TexCoord * 1.00001;
    
-    //  Detect interlacing: il_step_multiple indicates the step multiple between
+	//  Detect interlacing: il_step_multiple indicates the step multiple between
     //  lines: 1 is for progressive sources, and 2 is for interlaced sources.
-    const vec2 video_size = registers.SourceSize.xy;
-	float interlace_check = is_interlaced(video_size.y) ? 1.0 : 0.0;
-    const float y_step = 1.0 + interlace_check;
-    il_step_multiple = vec2(1.0, y_step);
+    float2 video_size_ = IN.video_size.xy;
+    const float y_step = 1.0 + float(is_interlaced(video_size_.y));
+    il_step_multiple = float2(1.0, y_step);
     //  Get the uv tex coords step between one texel (x) and scanline (y):
-    uv_step = il_step_multiple * registers.SourceSize.zw;
-	
-	//  If shader parameters are used, {min, max}_{sigma, shape} are runtime
+    uv_step = il_step_multiple / IN.texture_size;
+
+    //  If shader parameters are used, {min, max}_{sigma, shape} are runtime
     //  values.  Compute {sigma, shape}_range outside of scanline_contrib() so
     //  they aren't computed once per scanline (6 times per fragment and up to
     //  18 times per vertex):
-	const float sigma_range = max(params.beam_max_sigma, params.beam_min_sigma) -
-        params.beam_min_sigma;
-    const float shape_range = max(params.beam_max_shape, params.beam_min_shape) -
-        params.beam_min_shape;
-		
-	//  We need the pixel height in scanlines for antialiased/integral sampling:
-    pixel_height_in_scanlines = (video_size.y * registers.OutputSize.w) / 
+    //  Assign the output varyings directly: redeclaring them as locals here
+    //  would shadow the outputs and leave them unwritten for the fragment stage.
+    sigma_range = max(beam_max_sigma, beam_min_sigma) - beam_min_sigma;
+    shape_range = max(beam_max_shape, beam_min_shape) - beam_min_shape;
+
+    //  We need the pixel height in scanlines for antialiased/integral sampling:
+    const float ph = (video_size_.y / IN.output_size.y) / 
         il_step_multiple.y;
+    pixel_height_in_scanlines = ph;
 }
 
 #pragma stage fragment
 #pragma format R8G8B8A8_SRGB
 layout(location = 0) in vec2 tex_uv;
-layout(location = 1) in vec2 uv_step;
-layout(location = 2) in vec2 il_step_multiple;
-layout(location = 3) in float pixel_height_in_scanlines;
+layout(location = 1) in vec2 uv_step;                      //  uv size of a texel (x) and scanline (y)
+layout(location = 2) in vec2 il_step_multiple;             //  (1, 1) = progressive, (1, 2) = interlaced
+layout(location = 3) in float pixel_height_in_scanlines;   //  Height of an output pixel in scanlines
+layout(location = 4) in float sigma_range;
+layout(location = 5) in float shape_range;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
+#define input_texture Source
 
 void main()
 {
@@ -88,155 +140,157 @@ void main()
     //  vertical resolution.  Temporarily auto-dim the output to avoid clipping.
 
     //  Read some attributes into local variables:
-    const vec2 texture_size = registers.SourceSize.xy;
-    const vec2 texture_size_inv = registers.SourceSize.zw;
-    const float frame_count = vec2(registers.FrameCount, registers.FrameCount).x;
+    float2 texture_size_ = IN.texture_size;
+    float2 texture_size_inv = 1.0/texture_size_;
+    //const float2 uv_step = uv_step;
+    //const float2 il_step_multiple = il_step_multiple;
+    float frame_count = float(IN.frame_count);
     const float ph = pixel_height_in_scanlines;
-	
-	//  Get the uv coords of the previous scanline (in this field), and the
+
+    //  Get the uv coords of the previous scanline (in this field), and the
     //  scanline's distance from this sample, in scanlines.
     float dist;
-    const vec2 scanline_uv = get_last_scanline_uv(tex_uv, texture_size,
+    const float2 scanline_uv = get_last_scanline_uv(tex_uv, texture_size_,
         texture_size_inv, il_step_multiple, frame_count, dist);
     //  Consider 2, 3, 4, or 6 scanlines numbered 0-5: The previous and next
     //  scanlines are numbered 2 and 3.  Get scanline colors colors (ignore
-    //  horizontal sampling, since registers.OutputSize.x = video_size.x).
+    //  horizontal sampling, since IN.output_size.x = video_size.x).
     //  NOTE: Anisotropic filtering creates interlacing artifacts, which is why
     //  ORIG_LINEARIZED bobbed any interlaced input before this pass.
-    const vec2 v_step = vec2(0.0, uv_step.y);
-    const vec3 scanline2_color = tex2D_linearize(Source, scanline_uv).rgb;
-    const vec3 scanline3_color =
-        tex2D_linearize(Source, scanline_uv + v_step).rgb;
-    vec3 scanline0_color, scanline1_color, scanline4_color, scanline5_color,
+    const float2 v_step = float2(0.0, uv_step.y);
+    const float3 scanline2_color = tex2D_linearize(input_texture, scanline_uv).rgb;
+    const float3 scanline3_color =
+        tex2D_linearize(input_texture, scanline_uv + v_step).rgb;
+    float3 scanline0_color, scanline1_color, scanline4_color, scanline5_color,
         scanline_outside_color;
     float dist_round;
     //  Use scanlines 0, 1, 4, and 5 for a total of 6 scanlines:
-	if(params.beam_num_scanlines > 5.5)
+    if(beam_num_scanlines > 5.5)
     {
         scanline1_color =
-            tex2D_linearize(Source, scanline_uv - v_step).rgb;
+            tex2D_linearize(input_texture, scanline_uv - v_step).rgb;
         scanline4_color =
-            tex2D_linearize(Source, scanline_uv + 2.0 * v_step).rgb;
+            tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb;
         scanline0_color =
-            tex2D_linearize(Source, scanline_uv - 2.0 * v_step).rgb;
+            tex2D_linearize(input_texture, scanline_uv - 2.0 * v_step).rgb;
         scanline5_color =
-            tex2D_linearize(Source, scanline_uv + 3.0 * v_step).rgb;
+            tex2D_linearize(input_texture, scanline_uv + 3.0 * v_step).rgb;
     }
-	//  Use scanlines 1, 4, and either 0 or 5 for a total of 5 scanlines:
-    else if(params.beam_num_scanlines > 4.5)
+    //  Use scanlines 1, 4, and either 0 or 5 for a total of 5 scanlines:
+    else if(beam_num_scanlines > 4.5)
     {
         scanline1_color =
-            tex2D_linearize(Source, scanline_uv - v_step).rgb;
+            tex2D_linearize(input_texture, scanline_uv - v_step).rgb;
         scanline4_color =
-            tex2D_linearize(Source, scanline_uv + 2.0 * v_step).rgb;
+            tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb;
         //  dist is in [0, 1]
         dist_round = round(dist);
-        const vec2 sample_0_or_5_uv_off =
-            mix(-2.0 * v_step, 3.0 * v_step, dist_round);
+        const float2 sample_0_or_5_uv_off =
+            lerp(-2.0 * v_step, 3.0 * v_step, dist_round);
         //  Call this "scanline_outside_color" to cope with the conditional
         //  scanline number:
         scanline_outside_color = tex2D_linearize(
-            Source, scanline_uv + sample_0_or_5_uv_off).rgb;
+            input_texture, scanline_uv + sample_0_or_5_uv_off).rgb;
     }
-	//  Use scanlines 1 and 4 for a total of 4 scanlines:
-    else if(params.beam_num_scanlines > 3.5)
+    //  Use scanlines 1 and 4 for a total of 4 scanlines:
+    else if(beam_num_scanlines > 3.5)
     {
         scanline1_color =
-            tex2D_linearize(Source, scanline_uv - v_step).rgb;
+            tex2D_linearize(input_texture, scanline_uv - v_step).rgb;
         scanline4_color =
-            tex2D_linearize(Source, scanline_uv + 2.0 * v_step).rgb;
+            tex2D_linearize(input_texture, scanline_uv + 2.0 * v_step).rgb;
     }
     //  Use scanline 1 or 4 for a total of 3 scanlines:
-    else if(params.beam_num_scanlines > 2.5)
+    else if(beam_num_scanlines > 2.5)
     {
         //  dist is in [0, 1]
         dist_round = round(dist);
-        const vec2 sample_1or4_uv_off =
-            mix(-v_step, 2.0 * v_step, dist_round);
+        const float2 sample_1or4_uv_off =
+            lerp(-v_step, 2.0 * v_step, dist_round);
         scanline_outside_color = tex2D_linearize(
-            Source, scanline_uv + sample_1or4_uv_off).rgb;
+            input_texture, scanline_uv + sample_1or4_uv_off).rgb;
     }
-	
-	//  Compute scanline contributions, accounting for vertical convergence.
+    
+    //  Compute scanline contributions, accounting for vertical convergence.
     //  Vertical convergence offsets are in units of current-field scanlines.
     //  dist2 means "positive sample distance from scanline 2, in scanlines:"
-    vec3 dist2 = vec3(dist);
-    if(beam_misconvergence == true)
+    float3 dist2 = float3(dist);
+    if(beam_misconvergence)
     {
-        const vec3 convergence_offsets_vert_rgb =
-            vec3(params.convergence_offset_y_r, params.convergence_offset_y_g, params.convergence_offset_y_b);//get_convergence_offsets_y_vector();
-        dist2 = vec3(dist) - convergence_offsets_vert_rgb;
+        const float3 convergence_offsets_vert_rgb =
+            get_convergence_offsets_y_vector();
+        dist2 = float3(dist) - convergence_offsets_vert_rgb;
     }
-	//  Calculate {sigma, shape}_range outside of scanline_contrib so it's only
+    //  Calculate {sigma, shape}_range outside of scanline_contrib so it's only
     //  done once per pixel (not 6 times) with runtime params.  Don't reuse the
     //  vertex shader calculations, so static versions can be constant-folded.
-    const float sigma_range = max(params.beam_max_sigma, params.beam_min_sigma) -
-        params.beam_min_sigma;
-    const float shape_range = max(params.beam_max_shape, params.beam_min_shape) -
-        params.beam_min_shape;
-	//  Calculate and sum final scanline contributions, starting with lines 2/3.
+    //  TODO/FIXME: Local computation disabled below; the vertex-stage {sigma, shape}_range varyings are used.
+/*    const float sigma_range = max(beam_max_sigma, beam_min_sigma) -
+        beam_min_sigma;
+    const float shape_range = max(beam_max_shape, beam_min_shape) -
+        beam_min_shape;*/
+    //  Calculate and sum final scanline contributions, starting with lines 2/3.
     //  There is no normalization step, because we're not interpolating a
     //  continuous signal.  Instead, each scanline is an additive light source.
-    const vec3 scanline2_contrib = scanline_contrib(dist2,
+    const float3 scanline2_contrib = scanline_contrib(dist2,
         scanline2_color, ph, sigma_range, shape_range);
-    const vec3 scanline3_contrib = scanline_contrib(abs(vec3(1.0) - dist2),
+    const float3 scanline3_contrib = scanline_contrib(abs(float3(1.0) - dist2),
         scanline3_color, ph, sigma_range, shape_range);
-    vec3 scanline_intensity = scanline2_contrib + scanline3_contrib;
-	
-	if(params.beam_num_scanlines > 5.5)
+    float3 scanline_intensity = scanline2_contrib + scanline3_contrib;
+    if(beam_num_scanlines > 5.5)
     {
-        vec3 scanline0_contrib =
-            scanline_contrib(dist2 + vec3(2.0), scanline0_color,
+        const float3 scanline0_contrib =
+            scanline_contrib(dist2 + float3(2.0), scanline0_color,
                 ph, sigma_range, shape_range);
-        vec3 scanline1_contrib =
-            scanline_contrib(dist2 + vec3(1.0), scanline1_color,
+        const float3 scanline1_contrib =
+            scanline_contrib(dist2 + float3(1.0), scanline1_color,
                 ph, sigma_range, shape_range);
-        vec3 scanline4_contrib =
-            scanline_contrib(abs(vec3(2.0) - dist2), scanline4_color,
+        const float3 scanline4_contrib =
+            scanline_contrib(abs(float3(2.0) - dist2), scanline4_color,
                 ph, sigma_range, shape_range);
-        vec3 scanline5_contrib =
-            scanline_contrib(abs(vec3(3.0) - dist2), scanline5_color,
+        const float3 scanline5_contrib =
+            scanline_contrib(abs(float3(3.0) - dist2), scanline5_color,
                 ph, sigma_range, shape_range);
         scanline_intensity += scanline0_contrib + scanline1_contrib +
             scanline4_contrib + scanline5_contrib;
     }
-    else if(params.beam_num_scanlines > 4.5)
+    else if(beam_num_scanlines > 4.5)
     {
-        vec3 scanline1_contrib =
-            scanline_contrib(dist2 + vec3(1.0), scanline1_color,
+        const float3 scanline1_contrib =
+            scanline_contrib(dist2 + float3(1.0), scanline1_color,
                 ph, sigma_range, shape_range);
-        vec3 scanline4_contrib =
-            scanline_contrib(abs(vec3(2.0) - dist2), scanline4_color,
+        const float3 scanline4_contrib =
+            scanline_contrib(abs(float3(2.0) - dist2), scanline4_color,
                 ph, sigma_range, shape_range);
-        vec3 dist0or5 = mix(
-            dist2 + vec3(2.0), vec3(3.0) - dist2, dist_round);
-        vec3 scanline0or5_contrib = scanline_contrib(
+        const float3 dist0or5 = lerp(
+            dist2 + float3(2.0), float3(3.0) - dist2, dist_round);
+        const float3 scanline0or5_contrib = scanline_contrib(
             dist0or5, scanline_outside_color, ph, sigma_range, shape_range);
         scanline_intensity += scanline1_contrib + scanline4_contrib +
             scanline0or5_contrib;
     }
-    else if(params.beam_num_scanlines > 3.5)
+    else if(beam_num_scanlines > 3.5)
     {
-        vec3 scanline1_contrib =
-            scanline_contrib(dist2 + vec3(1.0), scanline1_color,
+        const float3 scanline1_contrib =
+            scanline_contrib(dist2 + float3(1.0), scanline1_color,
                 ph, sigma_range, shape_range);
-        vec3 scanline4_contrib =
-            scanline_contrib(abs(vec3(2.0) - dist2), scanline4_color,
+        const float3 scanline4_contrib =
+            scanline_contrib(abs(float3(2.0) - dist2), scanline4_color,
                 ph, sigma_range, shape_range);
         scanline_intensity += scanline1_contrib + scanline4_contrib;
     }
-    else if(params.beam_num_scanlines > 2.5)
+    else if(beam_num_scanlines > 2.5)
     {
-        vec3 dist1or4 = mix(
-            dist2 + vec3(1.0), vec3(2.0) - dist2, dist_round);
-        vec3 scanline1or4_contrib = scanline_contrib(
+        const float3 dist1or4 = lerp(
+            dist2 + float3(1.0), float3(2.0) - dist2, dist_round);
+        const float3 scanline1or4_contrib = scanline_contrib(
             dist1or4, scanline_outside_color, ph, sigma_range, shape_range);
         scanline_intensity += scanline1or4_contrib;
     }
-	
-	//  Auto-dim the image to avoid clipping, encode if necessary, and output.
+
+    //  Auto-dim the image to avoid clipping, encode if necessary, and output.
     //  My original idea was to compute a minimal auto-dim factor and put it in
     //  the alpha channel, but it wasn't working, at least not reliably.  This
     //  is faster anyway, levels_autodim_temp = 0.5 isn't causing banding.
-   FragColor = vec4(encode_output(vec4(scanline_intensity * levels_autodim_temp, 1.0)));
-}
+    FragColor = encode_output(float4(scanline_intensity * levels_autodim_temp, 1.0));
+}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/derived-settings-and-constants.h b/crt/shaders/crt-royale/src/derived-settings-and-constants.h
index 356eea3..1c39a97 100644
--- a/crt/shaders/crt-royale/src/derived-settings-and-constants.h
+++ b/crt/shaders/crt-royale/src/derived-settings-and-constants.h
@@ -29,12 +29,12 @@
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
 #include "../user-settings.h"
-#include "user-preset-constants.h"
+#include "user-cgp-constants.h"
 
 
 ///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
 
-//  Avoid dividing by zero; using a macro overloads for float, vec2, etc.:
+//  Avoid dividing by zero; using a macro overloads for float, float2, etc.:
 #define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
 
 //  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
@@ -81,10 +81,10 @@
     #endif
     //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
     //  inferior in most cases, so replace 2.0 with 0.0:
-     const float bloom_approx_filter =
+    static const float bloom_approx_filter =
         bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
 #else
-     const float bloom_approx_filter = bloom_approx_filter_static;
+    static const float bloom_approx_filter = bloom_approx_filter_static;
 #endif
 
 //  Disable slow runtime paths if static parameters are used.  Most of these
@@ -199,12 +199,12 @@
     #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
         //  TODO: Take advantage of this!
         #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
-         const vec2 mask_resize_src_lut_size = mask_texture_large_size;
+        static const float2 mask_resize_src_lut_size = mask_texture_large_size;
     #else
-         const vec2 mask_resize_src_lut_size = mask_texture_small_size;
+        static const float2 mask_resize_src_lut_size = mask_texture_small_size;
     #endif
 #else
-     const vec2 mask_resize_src_lut_size = mask_texture_small_size;
+    static const float2 mask_resize_src_lut_size = mask_texture_small_size;
 #endif
 
 
@@ -237,35 +237,35 @@
 //  determine how many border texels and tiles we need, based on how the result
 //  will be sampled:
 #ifdef GEOMETRY_EARLY
-         const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        static const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
         //  Most antialiasing filters have a base radius of 4.0 pixels:
-         const float max_aa_base_pixel_border = 4.0 +
+        static const float max_aa_base_pixel_border = 4.0 +
             max_subpixel_offset;
 #else
-     const float max_aa_base_pixel_border = 0.0;
+    static const float max_aa_base_pixel_border = 0.0;
 #endif
 //  Anisotropic filtering adds about 0.5 to the pixel border:
 #ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
-     const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
 #else
-     const float max_aniso_pixel_border = max_aa_base_pixel_border;
+    static const float max_aniso_pixel_border = max_aa_base_pixel_border;
 #endif
 //  Fixing discontinuities adds 1.0 more to the pixel border:
 #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
-     const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+    static const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
 #else
-     const float max_tiled_pixel_border = max_aniso_pixel_border;
+    static const float max_tiled_pixel_border = max_aniso_pixel_border;
 #endif
 //  Convert the pixel border to an integer texel border.  Assume same-pass
 //  curvature about triples the texel frequency:
 #ifdef GEOMETRY_EARLY
-     const float max_mask_texel_border =
+    static const float max_mask_texel_border =
         ceil(max_tiled_pixel_border * 3.0);
 #else
-     const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+    static const float max_mask_texel_border = ceil(max_tiled_pixel_border);
 #endif
 //  Convert the texel border to a tile border using worst-case assumptions:
- const float max_mask_tile_border = max_mask_texel_border/
+static const float max_mask_tile_border = max_mask_texel_border/
     (mask_min_allowed_triad_size * mask_triads_per_tile);
 
 //  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
@@ -274,41 +274,41 @@
     #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
         //  Special case: Render two tiles without borders.  Anisotropic
         //  filtering doesn't seem to be a problem here.
-         const float mask_resize_num_tiles = 1.0 + 1.0;
-         const float mask_start_texels = 0.0;
+        static const float mask_resize_num_tiles = 1.0 + 1.0;
+        static const float mask_start_texels = 0.0;
     #else
-         const float mask_resize_num_tiles = 1.0 +
+        static const float mask_resize_num_tiles = 1.0 +
             2.0 * max_mask_tile_border;
-         const float mask_start_texels = max_mask_texel_border;
+        static const float mask_start_texels = max_mask_texel_border;
     #endif
 #else
-     const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
-     const float mask_start_texels = max_mask_texel_border;
+    static const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+    static const float mask_start_texels = max_mask_texel_border;
 #endif
 
 //  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
 //  mask_resize_viewport_scale.  This limits the maximum final triad size.
 //  Estimate the minimum number of triads we can split the screen into in each
 //  dimension (we'll be as correct as mask_resize_viewport_scale is):
- const float mask_resize_num_triads =
+static const float mask_resize_num_triads =
     mask_resize_num_tiles * mask_triads_per_tile;
- const vec2 min_allowed_viewport_triads =
-    vec2(mask_resize_num_triads) / mask_resize_viewport_scale;
+static const float2 min_allowed_viewport_triads =
+    float2(mask_resize_num_triads) / mask_resize_viewport_scale;
 
 
 ////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
 
- const float pi = 3.141592653589;
+static const float pi = 3.141592653589;
 //  We often want to find the location of the previous texel, e.g.:
-//      const vec2 curr_texel = uv * texture_size;
-//      const vec2 prev_texel = floor(curr_texel - vec2(0.5)) + vec2(0.5);
-//      const vec2 prev_texel_uv = prev_texel / texture_size;
+//      const float2 curr_texel = uv * texture_size;
+//      const float2 prev_texel = floor(curr_texel - float2(0.5)) + float2(0.5);
+//      const float2 prev_texel_uv = prev_texel / texture_size;
 //  However, many GPU drivers round incorrectly around exact texel locations.
 //  We need to subtract a little less than 0.5 before flooring, and some GPU's
 //  require this value to be farther from 0.5 than others; define it here.
-//      const vec2 prev_texel =
-//          floor(curr_texel - vec2(under_half)) + vec2(0.5);
- const float under_half = 0.4995;
+//      const float2 prev_texel =
+//          floor(curr_texel - float2(under_half)) + float2(0.5);
+static const float under_half = 0.4995;
 
 
 #endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
diff --git a/crt/shaders/crt-royale/src/geometry-functions.h b/crt/shaders/crt-royale/src/geometry-functions.h
index de8036a..ed5a7f8 100644
--- a/crt/shaders/crt-royale/src/geometry-functions.h
+++ b/crt/shaders/crt-royale/src/geometry-functions.h
@@ -32,22 +32,23 @@
 //  Curvature-related constants:
 #define MAX_POINT_CLOUD_SIZE 9
 
+
 /////////////////////////////  CURVATURE FUNCTIONS /////////////////////////////
 
-vec2 quadratic_solve(const float a, const float b_over_2, const float c)
+float2 quadratic_solve(const float a, const float b_over_2, const float c)
 {
     //  Requires:   1.) a, b, and c are quadratic formula coefficients
     //              2.) b_over_2 = b/2.0 (simplifies terms to factor 2 out)
     //              3.) b_over_2 must be guaranteed < 0.0 (avoids a branch)
-    //  Returns:    Returns vec2(first_solution, discriminant), so the caller
+    //  Returns:    Returns float2(first_solution, discriminant), so the caller
     //              can choose how to handle the "no intersection" case.  The
     //              Kahan or Citardauq formula is used for numerical robustness.
     const float discriminant = b_over_2*b_over_2 - a*c;
     const float solution0 = c/(-b_over_2 + sqrt(discriminant));
-    return vec2(solution0, discriminant);
+    return float2(solution0, discriminant);
 }
 
-vec2 intersect_sphere(const vec3 view_vec, const vec3 eye_pos_vec)
+float2 intersect_sphere(const float3 view_vec, const float3 eye_pos_vec)
 {
     //  Requires:   1.) view_vec and eye_pos_vec are 3D vectors in the sphere's
     //                  local coordinate frame (eye_pos_vec is a position, i.e.
@@ -60,11 +61,11 @@ vec2 intersect_sphere(const vec3 view_vec, const vec3 eye_pos_vec)
     //  Quadratic formula coefficients (b_over_2 is guaranteed negative):
     const float a = dot(view_vec, view_vec);
     const float b_over_2 = dot(view_vec, eye_pos_vec);  //  * 2.0 factored out
-    const float c = dot(eye_pos_vec, eye_pos_vec) - params.geom_radius*params.geom_radius;
+    const float c = dot(eye_pos_vec, eye_pos_vec) - geom_radius*geom_radius;
     return quadratic_solve(a, b_over_2, c);
 }
 
-vec2 intersect_cylinder(const vec3 view_vec, const vec3 eye_pos_vec)
+float2 intersect_cylinder(const float3 view_vec, const float3 eye_pos_vec)
 {
     //  Requires:   1.) view_vec and eye_pos_vec are 3D vectors in the sphere's
     //                  local coordinate frame (eye_pos_vec is a position, i.e.
@@ -77,57 +78,57 @@ vec2 intersect_cylinder(const vec3 view_vec, const vec3 eye_pos_vec)
     //              Real-Time Collision Detection, p. 195-196, and this version
     //              uses LaGrange's identity to reduce operations.
     //  Arbitrary "cylinder top" reference point for an infinite cylinder:
-    const vec3 cylinder_top_vec = vec3(0.0, params.geom_radius, 0.0);
-    const vec3 cylinder_axis_vec = vec3(0.0, 1.0, 0.0);//vec3(0.0, 2.0*geom_radius, 0.0);
-    const vec3 top_to_eye_vec = eye_pos_vec - cylinder_top_vec;
-    const vec3 axis_x_view = cross(cylinder_axis_vec, view_vec);
-    const vec3 axis_x_top_to_eye = cross(cylinder_axis_vec, top_to_eye_vec);
+    const float3 cylinder_top_vec = float3(0.0, geom_radius, 0.0);
+    const float3 cylinder_axis_vec = float3(0.0, 1.0, 0.0);//float3(0.0, 2.0*geom_radius, 0.0);
+    const float3 top_to_eye_vec = eye_pos_vec - cylinder_top_vec;
+    const float3 axis_x_view = cross(cylinder_axis_vec, view_vec);
+    const float3 axis_x_top_to_eye = cross(cylinder_axis_vec, top_to_eye_vec);
     //  Quadratic formula coefficients (b_over_2 is guaranteed negative):
     const float a = dot(axis_x_view, axis_x_view);
     const float b_over_2 = dot(axis_x_top_to_eye, axis_x_view);
     const float c = dot(axis_x_top_to_eye, axis_x_top_to_eye) -
-        params.geom_radius*params.geom_radius;//*dot(cylinder_axis_vec, cylinder_axis_vec);
+        geom_radius*geom_radius;//*dot(cylinder_axis_vec, cylinder_axis_vec);
     return quadratic_solve(a, b_over_2, c);
 }
 
-vec2 cylinder_xyz_to_uv(const vec3 intersection_pos_local,
-    const vec2 geom_aspect)
+float2 cylinder_xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect)
 {
     //  Requires:   An xyz intersection position on a cylinder.
     //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
     //  Mapping:    Define square_uv.x to be the signed arc length in xz-space,
     //              and define square_uv.y = -intersection_pos_local.y (+v = -y).
     //  Start with a numerically robust arc length calculation.
-    const float angle_from_image_center = atan(intersection_pos_local.z,
-		intersection_pos_local.x);
-    const float signed_arc_len = angle_from_image_center * params.geom_radius;
+    const float angle_from_image_center = atan2(intersection_pos_local.x,
+        intersection_pos_local.z);
+    const float signed_arc_len = angle_from_image_center * geom_radius;
     //  Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide
     //  by the aspect ratio to stretch the mapping appropriately:
-    const vec2 square_uv = vec2(signed_arc_len, -intersection_pos_local.y);
-    const vec2 video_uv = square_uv / geom_aspect;
+    const float2 square_uv = float2(signed_arc_len, -intersection_pos_local.y);
+    const float2 video_uv = square_uv / geom_aspect;
     return video_uv;
 }
 
-vec3 cylinder_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect)
+float3 cylinder_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
 {
     //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
     //  Returns:    An xyz intersection position on a cylinder.  This is the
     //              inverse of cylinder_xyz_to_uv().
     //  Expand video_uv by the aspect ratio to get proportionate x/y lengths,
     //  then calculate an xyz position for the cylindrical mapping above.
-    const vec2 square_uv = video_uv * geom_aspect;
+    const float2 square_uv = video_uv * geom_aspect;
     const float arc_len = square_uv.x;
-    const float angle_from_image_center = arc_len / params.geom_radius;
-    const float x_pos = sin(angle_from_image_center) * params.geom_radius;
-    const float z_pos = cos(angle_from_image_center) * params.geom_radius;
+    const float angle_from_image_center = arc_len / geom_radius;
+    const float x_pos = sin(angle_from_image_center) * geom_radius;
+    const float z_pos = cos(angle_from_image_center) * geom_radius;
     //  Or: z = sqrt(geom_radius**2 - x**2)
     //  Or: z = geom_radius/sqrt(1.0 + tan(angle)**2), x = z * tan(angle)
-    const vec3 intersection_pos_local = vec3(x_pos, -square_uv.y, z_pos);
+    const float3 intersection_pos_local = float3(x_pos, -square_uv.y, z_pos);
     return intersection_pos_local;
 }
 
-vec2 sphere_xyz_to_uv(const vec3 intersection_pos_local,
-    const vec2 geom_aspect)
+float2 sphere_xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect)
 {
     //  Requires:   An xyz intersection position on a sphere.
     //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
@@ -143,116 +144,119 @@ vec2 sphere_xyz_to_uv(const vec3 intersection_pos_local,
     //  sphere intersection point and the image center using a method posted by
     //  Roger Stafford on comp.soft-sys.matlab:
     //  https://groups.google.com/d/msg/comp.soft-sys.matlab/zNbUui3bjcA/c0HV_bHSx9cJ
-    const vec3 image_center_pos_local = vec3(0.0, 0.0, params.geom_radius);
+    const float3 image_center_pos_local = float3(0.0, 0.0, geom_radius);
     const float cp_len =
         length(cross(intersection_pos_local, image_center_pos_local));
     const float dp = dot(intersection_pos_local, image_center_pos_local);
-    const float angle_from_image_center = atan(dp, cp_len);
-    const float arc_len = angle_from_image_center * params.geom_radius;
+    const float angle_from_image_center = atan2(cp_len, dp);
+    const float arc_len = angle_from_image_center * geom_radius;
     //  Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide
     //  by the aspect ratio to stretch the mapping appropriately:
-    const vec2 square_uv_unit = normalize(vec2(intersection_pos_local.x,
+    const float2 square_uv_unit = normalize(float2(intersection_pos_local.x,
         -intersection_pos_local.y));
-    const vec2 square_uv = arc_len * square_uv_unit;
-    const vec2 video_uv = square_uv / geom_aspect;
+    const float2 square_uv = arc_len * square_uv_unit;
+    const float2 video_uv = square_uv / geom_aspect;
     return video_uv;
 }
 
-vec3 sphere_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect)
+float3 sphere_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
 {
     //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
     //  Returns:    An xyz intersection position on a sphere.  This is the
     //              inverse of sphere_xyz_to_uv().
     //  Expand video_uv by the aspect ratio to get proportionate x/y lengths,
     //  then calculate an xyz position for the spherical mapping above.
-    const vec2 square_uv = video_uv * geom_aspect;
+    const float2 square_uv = video_uv * geom_aspect;
     //  Using length or sqrt here butchers the framerate on my 8800GTS if
     //  this function is called too many times, and so does taking the max
     //  component of square_uv/square_uv_unit (program length threshold?).
     //float arc_len = length(square_uv);
-    const vec2 square_uv_unit = normalize(square_uv);
+    const float2 square_uv_unit = normalize(square_uv);
     const float arc_len = square_uv.y/square_uv_unit.y;
-    const float angle_from_image_center = arc_len / params.geom_radius;
+    const float angle_from_image_center = arc_len / geom_radius;
     const float xy_dist_from_sphere_center =
-        sin(angle_from_image_center) * params.geom_radius;
-    //vec2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len));
-    const vec2 xy_pos = xy_dist_from_sphere_center * square_uv_unit;
-    const float z_pos = cos(angle_from_image_center) * params.geom_radius;
-    const vec3 intersection_pos_local = vec3(xy_pos.x, -xy_pos.y, z_pos);
+        sin(angle_from_image_center) * geom_radius;
+    //float2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len));
+    const float2 xy_pos = xy_dist_from_sphere_center * square_uv_unit;
+    const float z_pos = cos(angle_from_image_center) * geom_radius;
+    const float3 intersection_pos_local = float3(xy_pos.x, -xy_pos.y, z_pos);
     return intersection_pos_local;
 }
 
-vec2 sphere_alt_xyz_to_uv(const vec3 intersection_pos_local,
-    const vec2 geom_aspect)
+float2 sphere_alt_xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect)
 {
     //  Requires:   An xyz intersection position on a cylinder.
     //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
     //  Mapping:    Define square_uv.x to be the signed arc length in xz-space,
     //              and define square_uv.y == signed arc length in yz-space.
     //  See cylinder_xyz_to_uv() for implementation details (very similar).
-    const vec2 angle_from_image_center = atan((intersection_pos_local.zz),
-        vec2(intersection_pos_local.x, -intersection_pos_local.y));
-    const vec2 signed_arc_len = angle_from_image_center * params.geom_radius;
-    const vec2 video_uv = signed_arc_len / geom_aspect;
+    const float2 angle_from_image_center = atan2(
+        float2(intersection_pos_local.x, -intersection_pos_local.y),
+        intersection_pos_local.zz);
+    const float2 signed_arc_len = angle_from_image_center * geom_radius;
+    const float2 video_uv = signed_arc_len / geom_aspect;
     return video_uv;
 }
 
-vec3 sphere_alt_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect)
+float3 sphere_alt_uv_to_xyz(const float2 video_uv, const float2 geom_aspect)
 {
     //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
     //  Returns:    An xyz intersection position on a sphere.  This is the
     //              inverse of sphere_alt_xyz_to_uv().
     //  See cylinder_uv_to_xyz() for implementation details (very similar).
-    const vec2 square_uv = video_uv * geom_aspect;
-    const vec2 arc_len = square_uv;
-    const vec2 angle_from_image_center = arc_len / params.geom_radius;
-    const vec2 xy_pos = sin(angle_from_image_center) * params.geom_radius;
-    const float z_pos = sqrt(params.geom_radius*params.geom_radius - dot(xy_pos, xy_pos));
-    return vec3(xy_pos.x, -xy_pos.y, z_pos);
+    const float2 square_uv = video_uv * geom_aspect;
+    const float2 arc_len = square_uv;
+    const float2 angle_from_image_center = arc_len / geom_radius;
+    const float2 xy_pos = sin(angle_from_image_center) * geom_radius;
+    const float z_pos = sqrt(geom_radius*geom_radius - dot(xy_pos, xy_pos));
+    return float3(xy_pos.x, -xy_pos.y, z_pos);
 }
 
-vec2 intersect(const vec3 view_vec_local, const vec3 eye_pos_local,
+inline float2 intersect(const float3 view_vec_local, const float3 eye_pos_local,
     const float geom_mode)
 {
-    if (geom_mode < 2.5) return intersect_sphere(view_vec_local, eye_pos_local);
-	else return intersect_cylinder(view_vec_local, eye_pos_local);
+    return geom_mode < 2.5 ? intersect_sphere(view_vec_local, eye_pos_local) :
+        intersect_cylinder(view_vec_local, eye_pos_local);
 }
 
-vec2 xyz_to_uv(const vec3 intersection_pos_local,
-    const vec2 geom_aspect, const float geom_mode)
+inline float2 xyz_to_uv(const float3 intersection_pos_local,
+    const float2 geom_aspect, const float geom_mode)
 {
-    if (geom_mode < 1.5) return sphere_xyz_to_uv(intersection_pos_local, geom_aspect);
-	else if (geom_mode < 2.5) return sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect);
-	else return cylinder_xyz_to_uv(intersection_pos_local, geom_aspect);
+    return geom_mode < 1.5 ?
+            sphere_xyz_to_uv(intersection_pos_local, geom_aspect) :
+        geom_mode < 2.5 ?
+            sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect) :
+            cylinder_xyz_to_uv(intersection_pos_local, geom_aspect);
 }
 
-vec3 uv_to_xyz(const vec2 uv, const vec2 geom_aspect,
+inline float3 uv_to_xyz(const float2 uv, const float2 geom_aspect,
     const float geom_mode)
 {
-	if (geom_mode < 1.5) return sphere_uv_to_xyz(uv, geom_aspect);
-	else if (geom_mode < 2.5) return sphere_alt_uv_to_xyz(uv, geom_aspect);
-	else return cylinder_uv_to_xyz(uv, geom_aspect);
+    return geom_mode < 1.5 ? sphere_uv_to_xyz(uv, geom_aspect) :
+        geom_mode < 2.5 ? sphere_alt_uv_to_xyz(uv, geom_aspect) :
+        cylinder_uv_to_xyz(uv, geom_aspect);
 }
 
-vec2 view_vec_to_uv(const vec3 view_vec_local, const vec3 eye_pos_local,
-    const vec2 geom_aspect, const float geom_mode, out vec3 intersection_pos)
+float2 view_vec_to_uv(const float3 view_vec_local, const float3 eye_pos_local,
+    const float2 geom_aspect, const float geom_mode, out float3 intersection_pos)
 {
     //  Get the intersection point on the primitive, given an eye position
     //  and view vector already in its local coordinate frame:
-    const vec2 intersect_dist_and_discriminant = intersect(view_vec_local,
+    const float2 intersect_dist_and_discriminant = intersect(view_vec_local,
         eye_pos_local, geom_mode);
-    const vec3 intersection_pos_local = eye_pos_local +
+    const float3 intersection_pos_local = eye_pos_local +
         view_vec_local * intersect_dist_and_discriminant.x;
     //  Save the intersection position to an output parameter:
     intersection_pos = intersection_pos_local;
     //  Transform into uv coords, but give out-of-range coords if the
     //  view ray doesn't intersect the primitive in the first place:
-	if (intersect_dist_and_discriminant.y > 0.005) return xyz_to_uv(intersection_pos_local, geom_aspect, geom_mode);
-	else return vec2(1.0);
+    return intersect_dist_and_discriminant.y > 0.005 ?
+        xyz_to_uv(intersection_pos_local, geom_aspect, geom_mode) : float2(1.0);
 }
 
-vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos,
-    const vec2 geom_aspect, const vec3 global_coords[MAX_POINT_CLOUD_SIZE],
+float3 get_ideal_global_eye_pos_for_points(float3 eye_pos,
+    const float2 geom_aspect, const float3 global_coords[MAX_POINT_CLOUD_SIZE],
     const int num_points)
 {
     //  Requires:   Parameters:
@@ -278,7 +282,7 @@ vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos,
     //  that result in each point being projected to a screen edge/corner in
     //  pseudo-normalized device coords (where xy ranges from [-0.5, 0.5]
     //  and z = eyespace z):
-    //      pndc_coord = vec3(vec2(eyespace_xyz.x, -eyespace_xyz.y)*
+    //      pndc_coord = float3(float2(eyespace_xyz.x, -eyespace_xyz.y)*
     //      geom_view_dist / (geom_aspect * -eyespace_xyz.z), eyespace_xyz.z);
     //  Notes:
     //  The field of view is controlled by geom_view_dist's magnitude relative to
@@ -288,11 +292,11 @@ vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos,
     //  But for the purposes of perspective divide, it should be considered:
     //      view_vec.xy ranges from [-0.5, 0.5] * geom_aspect / geom_view_dist
     //      view_vec.z = -1.0
-    const int max_centering_iters = 1;  //  Keep for easy testing.
+    static const int max_centering_iters = 1;  //  Keep for easy testing.
     for(int iter = 0; iter < max_centering_iters; iter++)
     {
         //  0.) Get the eyespace coordinates of our point cloud:
-        vec3 eyespace_coords[MAX_POINT_CLOUD_SIZE];
+        float3 eyespace_coords[MAX_POINT_CLOUD_SIZE];
         for(int i = 0; i < num_points; i++)
         {
             eyespace_coords[i] = global_coords[i] - eye_pos;
@@ -302,31 +306,31 @@ vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos,
         //      Eyespace +y = up, screenspace +y = down, so flip y after
         //      applying the eyespace offset (on the way to "clip space").
         //  Solve for two offsets per point based on:
-        //      (eyespace_xyz.xy - offset_dr) * vec2(1.0, -1.0) *
-        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = vec2(-0.5)
-        //      (eyespace_xyz.xy - offset_dr) * vec2(1.0, -1.0) *
-        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = vec2(0.5)
+        //      (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) *
+        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(-0.5)
+        //      (eyespace_xyz.xy - offset_dr) * float2(1.0, -1.0) *
+        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = float2(0.5)
         //  offset_ul and offset_dr represent the farthest we can move the
         //  eye_pos up-left and down-right.  Save the min of all offset_dr's
         //  and the max of all offset_ul's (since it's negative).
-        float abs_radius = abs(params.geom_radius);  //  In case anyone gets ideas. ;)
-        vec2 offset_dr_min = vec2(10.0 * abs_radius, 10.0 * abs_radius);
-        vec2 offset_ul_max = vec2(-10.0 * abs_radius, -10.0 * abs_radius);
+        float abs_radius = abs(geom_radius);  //  In case anyone gets ideas. ;)
+        float2 offset_dr_min = float2(10.0 * abs_radius, 10.0 * abs_radius);
+        float2 offset_ul_max = float2(-10.0 * abs_radius, -10.0 * abs_radius);
         for(int i = 0; i < num_points; i++)
         {
-            const vec2 flipy = vec2(1.0, -1.0);
-            vec3 eyespace_xyz = eyespace_coords[i];
-            vec2 offset_dr = eyespace_xyz.xy - vec2(-0.5) *
-                (geom_aspect * -eyespace_xyz.z) / (params.geom_view_dist * flipy);
-            vec2 offset_ul = eyespace_xyz.xy - vec2(0.5) *
-                (geom_aspect * -eyespace_xyz.z) / (params.geom_view_dist * flipy);
+            static const float2 flipy = float2(1.0, -1.0);
+            float3 eyespace_xyz = eyespace_coords[i];
+            float2 offset_dr = eyespace_xyz.xy - float2(-0.5) *
+                (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
+            float2 offset_ul = eyespace_xyz.xy - float2(0.5) *
+                (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
             offset_dr_min = min(offset_dr_min, offset_dr);
             offset_ul_max = max(offset_ul_max, offset_ul);
         }
         //  1b.)Update eye_pos: Adding the average of offset_ul_max and
         //      offset_dr_min gives it equal leeway on the top vs. bottom
         //      and left vs. right.  Recalculate eyespace_coords accordingly.
-        vec2 center_offset = 0.5 * (offset_ul_max + offset_dr_min);
+        float2 center_offset = 0.5 * (offset_ul_max + offset_dr_min);
         eye_pos.xy += center_offset;
         for(int i = 0; i < num_points; i++)
         {
@@ -347,14 +351,14 @@ vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos,
         //      We'll vectorize the actual computation.  Take the maximum of
         //      these four for a single offset, and continue taking the max
         //      for every point (use max because offset.z is negative).
-        float offset_z_max = -10.0 * params.geom_radius * params.geom_view_dist;
+        float offset_z_max = -10.0 * geom_radius * geom_view_dist;
         for(int i = 0; i < num_points; i++)
         {
-            vec3 eyespace_xyz_flipy = eyespace_coords[i] *
-                vec3(1.0, -1.0, 1.0);
-            vec4 offset_zzzz = eyespace_xyz_flipy.zzzz +
-                (eyespace_xyz_flipy.xyxy * params.geom_view_dist) /
-                (vec4(-0.5, -0.5, 0.5, 0.5) * vec4(geom_aspect, geom_aspect));
+            float3 eyespace_xyz_flipy = eyespace_coords[i] *
+                float3(1.0, -1.0, 1.0);
+            float4 offset_zzzz = eyespace_xyz_flipy.zzzz +
+                (eyespace_xyz_flipy.xyxy * geom_view_dist) /
+                (float4(-0.5, -0.5, 0.5, 0.5) * float4(geom_aspect, geom_aspect));
             //  Ignore offsets that push positive x/y values to opposite
             //  boundaries, and vice versa, and don't let the camera move
             //  past a point in the dead center of the screen:
@@ -374,19 +378,20 @@ vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos,
     return eye_pos;
 }
 
-vec3 get_ideal_global_eye_pos(const mat3x3 local_to_global,
-    const vec2 geom_aspect, const float geom_mode)
+float3 get_ideal_global_eye_pos(const float3x3 local_to_global,
+    const float2 geom_aspect, const float geom_mode)
 {
     //  Start with an initial eye_pos that includes the entire primitive
     //  (sphere or cylinder) in its field-of-view:
-    const vec3 high_view = vec3(0.0, geom_aspect.y, -params.geom_view_dist);
-    const vec3 low_view = high_view * vec3(1.0, -1.0, 1.0);
+    const float3 high_view = float3(0.0, geom_aspect.y, -geom_view_dist);
+    const float3 low_view = high_view * float3(1.0, -1.0, 1.0);
     const float len_sq = dot(high_view, high_view);
     const float fov = abs(acos(dot(high_view, low_view)/len_sq));
     //  Trigonometry/similar triangles say distance = geom_radius/sin(fov/2):
-    const float eye_z_spherical = params.geom_radius/sin(fov*0.5);
-    vec3 eye_pos = vec3(0.0, 0.0, eye_z_spherical);
-	if (geom_mode < 2.5) eye_pos = vec3(0.0, 0.0, max(params.geom_view_dist, eye_z_spherical));
+    const float eye_z_spherical = geom_radius/sin(fov*0.5);
+    const float3 eye_pos = geom_mode < 2.5 ?
+        float3(0.0, 0.0, eye_z_spherical) :
+        float3(0.0, 0.0, max(geom_view_dist, eye_z_spherical));
 
     //  Get global xyz coords of extreme sample points on the simulated CRT
     //  screen.  Start with the center, edge centers, and corners of the
@@ -394,37 +399,35 @@ vec3 get_ideal_global_eye_pos(const mat3x3 local_to_global,
     //  by closer points on the primitive, but they may NOT be occluded by
     //  the convex hull of the remaining samples (i.e. the remaining convex
     //  hull might not envelope points that do occlude a back-facing point.)
-    const int num_points = MAX_POINT_CLOUD_SIZE;
-    vec3 global_coords[MAX_POINT_CLOUD_SIZE];
-    global_coords[0] = (uv_to_xyz(vec2(0.0, 0.0), geom_aspect, geom_mode) * local_to_global);
-    global_coords[1] = (uv_to_xyz(vec2(0.0, -0.5), geom_aspect, geom_mode) * local_to_global);
-    global_coords[2] = (uv_to_xyz(vec2(0.0, 0.5), geom_aspect, geom_mode) * local_to_global);
-    global_coords[3] = (uv_to_xyz(vec2(-0.5, 0.0), geom_aspect, geom_mode) * local_to_global);
-    global_coords[4] = (uv_to_xyz(vec2(0.5, 0.0), geom_aspect, geom_mode) * local_to_global);
-    global_coords[5] = (uv_to_xyz(vec2(-0.5, -0.5), geom_aspect, geom_mode) * local_to_global);
-    global_coords[6] = (uv_to_xyz(vec2(0.5, -0.5), geom_aspect, geom_mode) * local_to_global);
-    global_coords[7] = (uv_to_xyz(vec2(-0.5, 0.5), geom_aspect, geom_mode) * local_to_global);
-    global_coords[8] = (uv_to_xyz(vec2(0.5, 0.5), geom_aspect, geom_mode) * local_to_global);
+    static const int num_points = MAX_POINT_CLOUD_SIZE;
+    float3 global_coords[MAX_POINT_CLOUD_SIZE];
+    global_coords[0] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.0), geom_aspect, geom_mode));
+    global_coords[1] = mul(local_to_global, uv_to_xyz(float2(0.0, -0.5), geom_aspect, geom_mode));
+    global_coords[2] = mul(local_to_global, uv_to_xyz(float2(0.0, 0.5), geom_aspect, geom_mode));
+    global_coords[3] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.0), geom_aspect, geom_mode));
+    global_coords[4] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.0), geom_aspect, geom_mode));
+    global_coords[5] = mul(local_to_global, uv_to_xyz(float2(-0.5, -0.5), geom_aspect, geom_mode));
+    global_coords[6] = mul(local_to_global, uv_to_xyz(float2(0.5, -0.5), geom_aspect, geom_mode));
+    global_coords[7] = mul(local_to_global, uv_to_xyz(float2(-0.5, 0.5), geom_aspect, geom_mode));
+    global_coords[8] = mul(local_to_global, uv_to_xyz(float2(0.5, 0.5), geom_aspect, geom_mode));
     //  Adding more inner image points could help in extreme cases, but too many
     //  points will kille the framerate.  For safety, default to the initial
     //  eye_pos if any z coords are negative:
     float num_negative_z_coords = 0.0;
     for(int i = 0; i < num_points; i++)
     {
-		if (global_coords[0].z < 0.0)
-        {num_negative_z_coords += float(global_coords[0].z);}
+        num_negative_z_coords += float(global_coords[i].z < 0.0);  //  Test every point, not just [0].
     }
     //  Outsource the optimized eye_pos calculation:
-	if (num_negative_z_coords > 0.5)
-		return eye_pos;
-	else
-        return get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect, global_coords, num_points);
+    return num_negative_z_coords > 0.5 ? eye_pos :
+        get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect,
+            global_coords, num_points);
 }
 
-mat3x3 get_pixel_to_object_matrix(const mat3x3 global_to_local,
-    const vec3 eye_pos_local, const vec3 view_vec_global,
-    const vec3 intersection_pos_local, const vec3 normal,
-    const vec2 output_size_inv)
+float3x3 get_pixel_to_object_matrix(const float3x3 global_to_local,
+    const float3 eye_pos_local, const float3 view_vec_global,
+    const float3 intersection_pos_local, const float3 normal,
+    const float2 output_size_inv)
 {
     //  Requires:   See get_curved_video_uv_coords_and_tangent_matrix for
     //              descriptions of each parameter.
@@ -437,26 +440,26 @@ mat3x3 get_pixel_to_object_matrix(const mat3x3 global_to_local,
     //              vectors to 3D vectors along the CRT's surface, for later
     //              conversion to uv vectors.)
     //  Shorthand inputs:
-    const vec3 pos = intersection_pos_local;
-    const vec3 eye_pos = eye_pos_local;
+    const float3 pos = intersection_pos_local;
+    const float3 eye_pos = eye_pos_local;
     //  Get a piecewise-linear matrix transforming from "pixelspace" offset
     //  vectors (1.0 = one pixel) to object space vectors in the tangent
     //  plane (faster than finding 3 view-object intersections).
     //  1.) Get the local view vecs for the pixels to the right and down:
-    const vec3 view_vec_right_global = view_vec_global +
-        vec3(output_size_inv.x, 0.0, 0.0);
-    const vec3 view_vec_down_global = view_vec_global +
-        vec3(0.0, -output_size_inv.y, 0.0);
-    const vec3 view_vec_right_local =
-        (view_vec_right_global * global_to_local);
-    const vec3 view_vec_down_local =
-        (view_vec_down_global * global_to_local);
+    const float3 view_vec_right_global = view_vec_global +
+        float3(output_size_inv.x, 0.0, 0.0);
+    const float3 view_vec_down_global = view_vec_global +
+        float3(0.0, -output_size_inv.y, 0.0);
+    const float3 view_vec_right_local =
+        mul(global_to_local, view_vec_right_global);
+    const float3 view_vec_down_local =
+        mul(global_to_local, view_vec_down_global);
     //  2.) Using the true intersection point, intersect the neighboring
     //      view vectors with the tangent plane:
-    const vec3 intersection_vec_dot_normal = vec3(dot(pos - eye_pos, normal));
-    const vec3 right_pos = eye_pos + (intersection_vec_dot_normal /
+    const float3 intersection_vec_dot_normal = float3(dot(pos - eye_pos, normal), dot(pos - eye_pos, normal), dot(pos - eye_pos, normal));
+    const float3 right_pos = eye_pos + (intersection_vec_dot_normal /
         dot(view_vec_right_local, normal))*view_vec_right_local;
-    const vec3 down_pos = eye_pos + (intersection_vec_dot_normal /
+    const float3 down_pos = eye_pos + (intersection_vec_dot_normal /
         dot(view_vec_down_local, normal))*view_vec_down_local;
     //  3.) Subtract the original intersection pos from its neighbors; the
     //      resulting vectors are object-space vectors tangent to the plane.
@@ -464,17 +467,17 @@ mat3x3 get_pixel_to_object_matrix(const mat3x3 global_to_local,
     //      and (0.0, 1.0) pixel offsets, so they form the first two basis
     //      vectors of a pixelspace to object space transformation.  This
     //      transformation is 2D to 3D, so use (0, 0, 0) for the third vector.
-    const vec3 object_right_vec = right_pos - pos;
-    const vec3 object_down_vec = down_pos - pos;
-    const mat3x3 pixel_to_object = mat3x3(
+    const float3 object_right_vec = right_pos - pos;
+    const float3 object_down_vec = down_pos - pos;
+    const float3x3 pixel_to_object = float3x3(
         object_right_vec.x, object_down_vec.x, 0.0,
         object_right_vec.y, object_down_vec.y, 0.0,
         object_right_vec.z, object_down_vec.z, 0.0);
     return pixel_to_object;
 }
 
-mat3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local,
-    const vec3 normal, const vec2 geom_aspect, const float geom_mode)
+float3x3 get_object_to_tangent_matrix(const float3 intersection_pos_local,
+    const float3 normal, const float2 geom_aspect, const float geom_mode)
 {
     //  Requires:   See get_curved_video_uv_coords_and_tangent_matrix for
     //              descriptions of each parameter.
@@ -490,7 +493,7 @@ mat3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local,
     //  We want the inverse of the TBN matrix (transpose of the cotangent
     //  matrix), which transforms ordinary vectors from object->tangent space.
     //  Start by calculating the relevant basis vectors in accordance with
-    //  Christian Sch�ler's blog post "Followup: Normal Mapping Without
+    //  Christian Schüler's blog post "Followup: Normal Mapping Without
     //  Precomputed Tangents":  http://www.thetenthplanet.de/archives/1180
     //  With our particular uv mapping, the scale of the u and v directions
     //  is determined entirely by the aspect ratio for cylindrical and ordinary
@@ -498,13 +501,13 @@ mat3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local,
     //  determined by it (the alternate mapping is more complex).  Therefore, we
     //  must ensure appropriate cotangent and cobitangent lengths as well.
     //  Base these off the uv<=>xyz mappings for each primitive.
-    const vec3 pos = intersection_pos_local;
-    const vec3 x_vec = vec3(1.0, 0.0, 0.0);
-    const vec3 y_vec = vec3(0.0, 1.0, 0.0);
+    const float3 pos = intersection_pos_local;
+    static const float3 x_vec = float3(1.0, 0.0, 0.0);
+    static const float3 y_vec = float3(0.0, 1.0, 0.0);
     //  The tangent and bitangent vectors correspond with increasing u and v,
     //  respectively.  Mathematically we'd base the cotangent/cobitangent on
     //  those, but we'll compute the cotangent/cobitangent directly when we can.
-    vec3 cotangent_unscaled, cobitangent_unscaled;
+    float3 cotangent_unscaled, cobitangent_unscaled;
     //  geom_mode should be constant-folded without RUNTIME_GEOMETRY_MODE.
     if(geom_mode < 1.5)
     {
@@ -526,10 +529,10 @@ mat3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local,
         //  This mapping works a bit like the cylindrical mapping in two
         //  directions, which makes the lengths and directions more complex.
         //  Unfortunately, I can't find much of a shortcut:
-        const vec3 tangent = normalize(
-            cross(y_vec, vec3(pos.x, 0.0, pos.z))) * geom_aspect.x;
-        const vec3 bitangent = normalize(
-            cross(x_vec, vec3(0.0, pos.yz))) * geom_aspect.y;
+        const float3 tangent = normalize(
+            cross(y_vec, float3(pos.x, 0.0, pos.z))) * geom_aspect.x;
+        const float3 bitangent = normalize(
+            cross(x_vec, float3(0.0, pos.yz))) * geom_aspect.y;
         cotangent_unscaled = cross(normal, bitangent);
         cobitangent_unscaled = cross(tangent, normal);
     }
@@ -537,31 +540,31 @@ mat3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local,
     {
         //  Cylinder:
         //  tangent = normalize(cross(y_vec, normal)) * geom_aspect.x;
-        //  bitangent = vec3(0.0, -geom_aspect.y, 0.0);
+        //  bitangent = float3(0.0, -geom_aspect.y, 0.0);
         //  inv_determinant = 1.0/length(cross(bitangent, tangent))
         //  cotangent = cross(normal, bitangent) * inv_determinant
         //            == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant
         //  cobitangent = cross(tangent, normal) * inv_determinant
-        //            == vec3(0.0, -geom_aspect.x, 0.0) * inv_determinant
+        //            == float3(0.0, -geom_aspect.x, 0.0) * inv_determinant
         cotangent_unscaled = cross(y_vec, normal) * geom_aspect.y;
-        cobitangent_unscaled = vec3(0.0, -geom_aspect.x, 0.0);
+        cobitangent_unscaled = float3(0.0, -geom_aspect.x, 0.0);
     }
-    const vec3 computed_normal =
+    const float3 computed_normal =
         cross(cobitangent_unscaled, cotangent_unscaled);
-    const float inv_determinant = inversesqrt(dot(computed_normal, computed_normal));
-    const vec3 cotangent = cotangent_unscaled * inv_determinant;
-    const vec3 cobitangent = cobitangent_unscaled * inv_determinant;
+    const float inv_determinant = rsqrt(dot(computed_normal, computed_normal));
+    const float3 cotangent = cotangent_unscaled * inv_determinant;
+    const float3 cobitangent = cobitangent_unscaled * inv_determinant;
     //  The [cotangent, cobitangent, normal] column vecs form the cotangent
     //  frame, i.e. the inverse-transpose TBN matrix.  Get its transpose:
-    const mat3x3 object_to_tangent = mat3x3(cotangent, cobitangent, normal);
+    const float3x3 object_to_tangent = float3x3(cotangent, cobitangent, normal);
     return object_to_tangent;
 }
 
-vec2 get_curved_video_uv_coords_and_tangent_matrix(
-    const vec2 flat_video_uv, const vec3 eye_pos_local,
-    const vec2 output_size_inv, const vec2 geom_aspect,
-    const float geom_mode, const mat3x3 global_to_local,
-    out mat2x2 pixel_to_tangent_video_uv)
+float2 get_curved_video_uv_coords_and_tangent_matrix(
+    const float2 flat_video_uv, const float3 eye_pos_local,
+    const float2 output_size_inv, const float2 geom_aspect,
+    const float geom_mode, const float3x3 global_to_local,
+    out float2x2 pixel_to_tangent_video_uv)
 {
     //  Requires:   Parameters:
     //              1.) flat_video_uv coords are in range [0.0, 1.0], where
@@ -570,7 +573,7 @@ vec2 get_curved_video_uv_coords_and_tangent_matrix(
     //              2.) eye_pos_local is the 3D camera position in the simulated
     //                  CRT's local coordinate frame.  For best results, it must
     //                  be computed based on the same geom_view_dist used here.
-    //              3.) output_size_inv = vec2(1.0)/IN.output_size
+    //              3.) output_size_inv = float2(1.0)/IN.output_size
     //              4.) geom_aspect = get_aspect_vector(
     //                      IN.output_size.x / IN.output_size.y);
     //              5.) geom_mode is a static or runtime mode setting:
@@ -600,66 +603,66 @@ vec2 get_curved_video_uv_coords_and_tangent_matrix(
     //      For the effect of "looking through a window" at a CRT, it should be
     //      set equal to the user's distance from their physical screen, in
     //      units of the viewport's physical diagonal size.
-    const vec2 view_uv = (flat_video_uv - vec2(0.5)) * geom_aspect;
-    const vec3 view_vec_global =
-        vec3(view_uv.x, -view_uv.y, -params.geom_view_dist);
+    const float2 view_uv = (flat_video_uv - float2(0.5)) * geom_aspect;
+    const float3 view_vec_global =
+        float3(view_uv.x, -view_uv.y, -geom_view_dist);
     //  Transform the view vector into the CRT's local coordinate frame, convert
     //  to video_uv coords, and get the local 3D intersection position:
-    const vec3 view_vec_local = (view_vec_global * global_to_local);
-    vec3 pos;
-    const vec2 centered_uv = view_vec_to_uv(
+    const float3 view_vec_local = mul(global_to_local, view_vec_global);
+    float3 pos;
+    const float2 centered_uv = view_vec_to_uv(
         view_vec_local, eye_pos_local, geom_aspect, geom_mode, pos);
-    const vec2 video_uv = centered_uv + vec2(0.5);
+    const float2 video_uv = centered_uv + float2(0.5);
     //  Get a pixel-to-tangent-video-uv matrix.  The caller could deal with
     //  all but one of these cases, but that would be more complicated.
     #ifdef DRIVERS_ALLOW_DERIVATIVES
         //  Derivatives obtain a matrix very fast, but the direction of pixel-
         //  space +y seems to depend on the pass.  Enforce the correct direction
         //  on a best-effort basis (but it shouldn't matter for antialiasing).
-        const vec2 duv_dx = ddx(video_uv);
-        const vec2 duv_dy = ddy(video_uv);
+        const float2 duv_dx = ddx(video_uv);
+        const float2 duv_dy = ddy(video_uv);
         #ifdef LAST_PASS
-            pixel_to_tangent_video_uv = mat2x2(
+            pixel_to_tangent_video_uv = float2x2(
                 duv_dx.x, duv_dy.x,
                 -duv_dx.y, -duv_dy.y);
         #else
-            pixel_to_tangent_video_uv = mat2x2(
+            pixel_to_tangent_video_uv = float2x2(
                 duv_dx.x, duv_dy.x,
                 duv_dx.y, duv_dy.y);
         #endif
     #else
         //  Manually define a transformation matrix.  We'll assume pixel-space
         //  +y = down, just like +v = down.
-        if(geom_force_correct_tangent_matrix == true)
+        if(geom_force_correct_tangent_matrix)
         {
             //  Get the surface normal based on the local intersection position:
-            vec3 normal_base = pos;
-			if (geom_mode > 2.5) normal_base = vec3(pos.x, 0.0, pos.z);
-            const vec3 normal = normalize(normal_base);
+            const float3 normal_base = geom_mode < 2.5 ? pos :
+                float3(pos.x, 0.0, pos.z);
+            const float3 normal = normalize(normal_base);
             //  Get pixel-to-object and object-to-tangent matrices and combine
             //  them into a 2x2 pixel-to-tangent matrix for video_uv offsets:
-            const mat3x3 pixel_to_object = get_pixel_to_object_matrix(
+            const float3x3 pixel_to_object = get_pixel_to_object_matrix(
                 global_to_local, eye_pos_local, view_vec_global, pos, normal,
                 output_size_inv);
-            const mat3x3 object_to_tangent = get_object_to_tangent_matrix(
+            const float3x3 object_to_tangent = get_object_to_tangent_matrix(
                 pos, normal, geom_aspect, geom_mode);
-            const mat3x3 pixel_to_tangent3x3 =
-                (pixel_to_object * object_to_tangent);
-            pixel_to_tangent_video_uv = mat2x2(
-                pixel_to_tangent3x3[0].xyz, pixel_to_tangent3x3[1].x);
+            const float3x3 pixel_to_tangent3x3 =
+                mul(object_to_tangent, pixel_to_object);
+            pixel_to_tangent_video_uv = float2x2(
+                pixel_to_tangent3x3[0][0], pixel_to_tangent3x3[0][1], pixel_to_tangent3x3[1][0], pixel_to_tangent3x3[1][1]);  //  Upper-left 2x2 block; presumably replaces a Cg swizzle pixel_to_tangent3x3._m00_m01_m10_m11 (TODO confirm).
         }
         else
         {
             //  Ignore curvature, and just consider flat scaling.  The
             //  difference is only apparent with strong curvature:
-            pixel_to_tangent_video_uv = mat2x2(
+            pixel_to_tangent_video_uv = float2x2(
                 output_size_inv.x, 0.0, 0.0, output_size_inv.y);
         }
     #endif
     return video_uv;
 }
 
-float get_border_dim_factor(const vec2 video_uv, const vec2 geom_aspect)
+float get_border_dim_factor(const float2 video_uv, const float2 geom_aspect)
 {
     //  COPYRIGHT NOTE FOR THIS FUNCTION:
     //  Copyright (C) 2010-2012 cgwg, 2014 TroggleMonkey
@@ -671,15 +674,20 @@ float get_border_dim_factor(const vec2 video_uv, const vec2 geom_aspect)
 
     //  Calculate border_dim_factor from the proximity to uv-space image
     //  borders; geom_aspect/border_size/border/darkness/border_compress are globals:
-    const vec2 edge_dists = min(video_uv, vec2(1.0) - video_uv) *
+    const float2 edge_dists = min(video_uv, float2(1.0) - video_uv) *
         geom_aspect;
-    const vec2 border_penetration =
-        max(vec2(params.border_size) - edge_dists, vec2(0.0));
-    const float penetration_ratio = length(border_penetration)/params.border_size;
+    const float2 border_penetration =
+        max(float2(border_size) - edge_dists, float2(0.0));
+    const float penetration_ratio = length(border_penetration)/border_size;
     const float border_escape_ratio = max(1.0 - penetration_ratio, 0.0);
     const float border_dim_factor =
-        pow(border_escape_ratio, params.border_darkness) * max(1.0, params.border_compress);
+        pow(border_escape_ratio, border_darkness) * max(1.0, border_compress);
     return min(border_dim_factor, 1.0);
 }
 
-#endif  //  GEOMETRY_FUNCTIONS_H
\ No newline at end of file
+
+
+#endif  //  GEOMETRY_FUNCTIONS_H
+
+
+
diff --git a/crt/shaders/crt-royale/src/params.inc b/crt/shaders/crt-royale/src/params.inc
index e442b5e..46044ec 100644
--- a/crt/shaders/crt-royale/src/params.inc
+++ b/crt/shaders/crt-royale/src/params.inc
@@ -1,101 +1,88 @@
-#ifndef PARAMS_INC
-#define PARAMS_INC
-
-layout(std140, set = 0, binding = 0) uniform UBO
-{
-	mat4 MVP;
-//	float crt_gamma;
-//	float lcd_gamma;
-	float levels_contrast;
-	float halation_weight;
-	float diffusion_weight;
-	float bloom_underestimate_levels;
-	float bloom_excess;
-	float beam_min_sigma;
-	float beam_max_sigma;
-	float beam_spot_power;
-	float beam_min_shape;
-	float beam_max_shape;
-	float beam_shape_power;
-	float beam_horiz_filter;
-	float beam_horiz_sigma;
-//	float beam_horiz_linear_rgb_weight;
-	float convergence_offset_x_r;
-	float convergence_offset_x_g;
-	float convergence_offset_x_b;
-	float convergence_offset_y_r;
-	float convergence_offset_y_g;
-	float convergence_offset_y_b;
-	float mask_type;
-	float mask_sample_mode_desired;
-	float mask_specify_num_triads;
-	float mask_triad_size_desired;
-	float mask_num_triads_desired;
-//	float aa_subpixel_r_offset_x_runtime;
-//	float aa_subpixel_r_offset_y_runtime;
-//	float aa_cubic_c;
-//	float aa_gauss_sigma;
-//	float geom_mode_runtime;
-//	float geom_radius;
-//	float geom_view_dist;
-//	float geom_tilt_angle_x;
-//	float geom_tilt_angle_y;
-	float geom_aspect_ratio_x;
-	float geom_aspect_ratio_y;
-//	float geom_overscan_x;
-//	float geom_overscan_y;
-//	float border_size;
-//	float border_darkness;
-//	float border_compress;
-	float interlace_1080i;
-	float beam_num_scanlines;
-} params;
+//#define HARDCODE_SETTINGS
 
+#ifndef HARDCODE_SETTINGS
 //  Set shader params for all passes here:
-//#pragma parameter crt_gamma "crt_gamma" 2.5 1.0 5.0 0.025
-//#pragma parameter lcd_gamma "lcd_gamma" 2.2 1.0 5.0 0.025
-#pragma parameter levels_contrast "levels_contrast" 1.0 0.0 4.0 0.015625
-#pragma parameter halation_weight "halation_weight" 0.0 0.0 1.0 0.005
-#pragma parameter diffusion_weight "diffusion_weight" 0.075 0.0 1.0 0.005
-#pragma parameter bloom_underestimate_levels "bloom_underestimate_levels" 0.8 0.0 5.0 0.01
-#pragma parameter bloom_excess "bloom_excess" 0.0 0.0 1.0 0.005
-#pragma parameter beam_min_sigma "beam_min_sigma" 0.02 0.005 1.0 0.005
-#pragma parameter beam_max_sigma "beam_max_sigma" 0.3 0.005 1.0 0.005
-#pragma parameter beam_spot_power "beam_spot_power" 0.33 0.01 16.0 0.01
-#pragma parameter beam_min_shape "beam_min_shape" 2.0 2.0 32.0 0.1
-#pragma parameter beam_max_shape "beam_max_shape" 4.0 2.0 32.0 0.1
-#pragma parameter beam_shape_power "beam_shape_power" 0.25 0.01 16.0 0.01
-#pragma parameter beam_horiz_filter "beam_horiz_filter" 0.0 0.0 2.0 1.0
-#pragma parameter beam_horiz_sigma "beam_horiz_sigma" 0.35 0.0 0.67 0.005
-//#pragma parameter beam_horiz_linear_rgb_weight "beam_horiz_linear_rgb_weight" 1.0 0.0 1.0 0.01
-#pragma parameter convergence_offset_x_r "convergence_offset_x_r" 0.0 -4.0 4.0 0.05
-#pragma parameter convergence_offset_x_g "convergence_offset_x_g" 0.0 -4.0 4.0 0.05
-#pragma parameter convergence_offset_x_b "convergence_offset_x_b" 0.0 -4.0 4.0 0.05
-#pragma parameter convergence_offset_y_r "convergence_offset_y_r" 0.0 -2.0 2.0 0.05
-#pragma parameter convergence_offset_y_g "convergence_offset_y_g" 0.0 -2.0 2.0 0.05
-#pragma parameter convergence_offset_y_b "convergence_offset_y_b" 0.0 -2.0 2.0 0.05
-#pragma parameter mask_type "mask_type" 1.0 0.0 2.0 1.0
-#pragma parameter mask_sample_mode_desired "mask_sample_mode" 1.0 0.0 2.0 1.0   //  Consider blocking mode 2.
-#pragma parameter mask_specify_num_triads "mask_specify_num_triads" 0.0 0.0 1.0 1.0
-#pragma parameter mask_triad_size_desired "mask_triad_size_desired" 3.0 1.0 18.0 0.125
-#pragma parameter mask_num_triads_desired "mask_num_triads_desired" 480.0 342.0 1920.0 1.0
-//#pragma parameter aa_subpixel_r_offset_x_runtime "aa_subpixel_r_offset_x" -0.333333333 -0.333333333 0.333333333 0.333333333
-//#pragma parameter aa_subpixel_r_offset_y_runtime "aa_subpixel_r_offset_y" 0.0 -0.333333333 0.333333333 0.333333333
-//#pragma parameter aa_cubic_c "antialias_cubic_sharpness" 0.5 0.0 4.0 0.015625
-//#pragma parameter aa_gauss_sigma "antialias_gauss_sigma" 0.5 0.0625 1.0 0.015625
-//#pragma parameter geom_mode_runtime "geom_mode" 0.0 0.0 3.0 1.0
-//#pragma parameter geom_radius "geom_radius" 2.0 0.16 1024.0 0.1
-//#pragma parameter geom_view_dist "geom_view_dist" 2.0 0.5 1024.0 0.25
-//#pragma parameter geom_tilt_angle_x "geom_tilt_angle_x" 0.0 -3.14159265 3.14159265 0.017453292519943295
-//#pragma parameter geom_tilt_angle_y "geom_tilt_angle_y" 0.0 -3.14159265 3.14159265 0.017453292519943295
-#pragma parameter geom_aspect_ratio_x "geom_aspect_ratio_x" 432.0 1.0 512.0 1.0
-#pragma parameter geom_aspect_ratio_y "geom_aspect_ratio_y" 329.0 1.0 512.0 1.0
-//#pragma parameter geom_overscan_x "geom_overscan_x" 1.0 0.00390625 4.0 0.00390625
-//#pragma parameter geom_overscan_y "geom_overscan_y" 1.0 0.00390625 4.0 0.00390625
-//#pragma parameter border_size "border_size" 0.015 0.0000001 0.5 0.005
-//#pragma parameter border_darkness "border_darkness" 2.0 0.0 16.0 0.0625
-//#pragma parameter border_compress "border_compress" 2.5 1.0 64.0 0.0625
-#pragma parameter interlace_1080i "interlace_1080i" 0.0 0.0 1.0 1.0
-#pragma parameter beam_num_scanlines "beam_num_scanlines" 4.0 2.0 6.0 1.0
-
+#pragma parameter crt_gamma "Simulated CRT Gamma" 2.5 1.0 5.0 0.025
+#define crt_gamma global.crt_gamma
+#pragma parameter lcd_gamma "Your Display Gamma" 2.2 1.0 5.0 0.025
+#define lcd_gamma global.lcd_gamma
+#pragma parameter levels_contrast "Contrast" 1.0 0.0 4.0 0.015625
+#define levels_contrast global.levels_contrast
+#pragma parameter halation_weight "Halation Weight" 0.0 0.0 1.0 0.005
+#pragma parameter diffusion_weight "Diffusion Weight" 0.075 0.0 1.0 0.005
+#pragma parameter bloom_underestimate_levels "Bloom - Underestimate Levels" 0.8 0.0 5.0 0.01
+#define bloom_underestimate_levels global.bloom_underestimate_levels
+#pragma parameter bloom_excess "Bloom - Excess" 0.0 0.0 1.0 0.005
+#pragma parameter beam_min_sigma "Beam - Min Sigma" 0.02 0.005 1.0 0.005
+#define beam_min_sigma global.beam_min_sigma
+#pragma parameter beam_max_sigma "Beam - Max Sigma" 0.3 0.005 1.0 0.005
+#define beam_max_sigma global.beam_max_sigma
+#pragma parameter beam_spot_power "Beam - Spot Power" 0.33 0.01 16.0 0.01
+#define beam_spot_power global.beam_spot_power
+#pragma parameter beam_min_shape "Beam - Min Shape" 2.0 2.0 32.0 0.1
+#define beam_min_shape global.beam_min_shape
+#pragma parameter beam_max_shape "Beam - Max Shape" 4.0 2.0 32.0 0.1
+#define beam_max_shape global.beam_max_shape
+#pragma parameter beam_shape_power "Beam - Shape Power" 0.25 0.01 16.0 0.01
+#define beam_shape_power global.beam_shape_power
+#pragma parameter beam_horiz_filter "Beam - Horiz Filter" 0.0 0.0 2.0 1.0
+#define beam_horiz_filter global.beam_horiz_filter
+#pragma parameter beam_horiz_sigma "Beam - Horiz Sigma" 0.35 0.0 0.67 0.005
+#define beam_horiz_sigma global.beam_horiz_sigma
+#pragma parameter beam_horiz_linear_rgb_weight "Beam - Horiz Linear RGB Weight" 1.0 0.0 1.0 0.01
+#pragma parameter convergence_offset_x_r "Convergence - Offset X Red" 0.0 -4.0 4.0 0.05
+#define convergence_offset_x_r global.convergence_offset_x_r
+#pragma parameter convergence_offset_x_g "Convergence - Offset X Green" 0.0 -4.0 4.0 0.05
+#define convergence_offset_x_g global.convergence_offset_x_g
+#pragma parameter convergence_offset_x_b "Convergence - Offset X Blue" 0.0 -4.0 4.0 0.05
+#define convergence_offset_x_b global.convergence_offset_x_b
+#pragma parameter convergence_offset_y_r "Convergence - Offset Y Red" 0.0 -2.0 2.0 0.05
+#define convergence_offset_y_r global.convergence_offset_y_r
+#pragma parameter convergence_offset_y_g "Convergence - Offset Y Green" 0.0 -2.0 2.0 0.05
+#define convergence_offset_y_g global.convergence_offset_y_g
+#pragma parameter convergence_offset_y_b "Convergence - Offset Y Blue" 0.0 -2.0 2.0 0.05
+#define convergence_offset_y_b global.convergence_offset_y_b
+#pragma parameter mask_type "Mask - Type" 1.0 0.0 2.0 1.0
+#define mask_type global.mask_type
+#pragma parameter mask_sample_mode_desired "Mask - Sample Mode" 0.0 0.0 2.0 1.0   //  Consider blocking mode 2.
+#define mask_sample_mode_desired global.mask_sample_mode_desired
+#pragma parameter mask_specify_num_triads "Mask - Specify Number of Triads" 0.0 0.0 1.0 1.0
+#pragma parameter mask_triad_size_desired "Mask - Triad Size Desired" 3.0 1.0 18.0 0.125
+#pragma parameter mask_num_triads_desired "Mask - Number of Triads Desired" 480.0 342.0 1920.0 1.0
+#pragma parameter aa_subpixel_r_offset_x_runtime "AA - Subpixel R Offset X" -0.333333333 -0.333333333 0.333333333 0.333333333
+#define aa_subpixel_r_offset_x_runtime global.aa_subpixel_r_offset_x_runtime
+#pragma parameter aa_subpixel_r_offset_y_runtime "AA - Subpixel R Offset Y" 0.0 -0.333333333 0.333333333 0.333333333
+#define aa_subpixel_r_offset_y_runtime global.aa_subpixel_r_offset_y_runtime
+#pragma parameter aa_cubic_c "AA - Cubic Sharpness" 0.5 0.0 4.0 0.015625
+#define aa_cubic_c global.aa_cubic_c
+#pragma parameter aa_gauss_sigma "AA - Gaussian Sigma" 0.5 0.0625 1.0 0.015625
+#define aa_gauss_sigma global.aa_gauss_sigma
+#pragma parameter geom_mode_runtime "Geometry - Mode" 0.0 0.0 3.0 1.0
+#define geom_mode_runtime global.geom_mode_runtime
+#pragma parameter geom_radius "Geometry - Radius" 2.0 0.16 1024.0 0.1
+#define geom_radius global.geom_radius
+#pragma parameter geom_view_dist "Geometry - View Distance" 2.0 0.5 1024.0 0.25
+#define geom_view_dist global.geom_view_dist
+#pragma parameter geom_tilt_angle_x "Geometry - Tilt Angle X" 0.0 -3.14159265 3.14159265 0.017453292519943295
+#define geom_tilt_angle_x global.geom_tilt_angle_x
+#pragma parameter geom_tilt_angle_y "Geometry - Tilt Angle Y" 0.0 -3.14159265 3.14159265 0.017453292519943295
+#define geom_tilt_angle_y global.geom_tilt_angle_y
+#pragma parameter geom_aspect_ratio_x "Geometry - Aspect Ratio X" 432.0 1.0 512.0 1.0
+#define geom_aspect_ratio_x global.geom_aspect_ratio_x
+#pragma parameter geom_aspect_ratio_y "Geometry - Aspect Ratio Y" 329.0 1.0 512.0 1.0
+#define geom_aspect_ratio_y global.geom_aspect_ratio_y
+#pragma parameter geom_overscan_x "Geometry - Overscan X" 1.0 0.00390625 4.0 0.00390625
+#define geom_overscan_x global.geom_overscan_x
+#pragma parameter geom_overscan_y "Geometry - Overscan Y" 1.0 0.00390625 4.0 0.00390625
+#define geom_overscan_y global.geom_overscan_y
+#pragma parameter border_size "Border - Size" 0.015 0.0000001 0.5 0.005
+#define border_size global.border_size
+#pragma parameter border_darkness "Border - Darkness" 2.0 0.0 16.0 0.0625
+#define border_darkness global.border_darkness
+#pragma parameter border_compress "Border - Compression" 2.5 1.0 64.0 0.0625
+#define border_compress global.border_compress
+#pragma parameter interlace_bff "Interlacing - Bottom Field First" 0.0 0.0 1.0 1.0
+//#define interlace_bff global.interlace_bff
+#pragma parameter interlace_1080i "Interlace - Detect 1080i" 0.0 0.0 1.0 1.0
+#define interlace_1080i global.interlace_1080i
 #endif
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/phosphor-mask-resizing.h b/crt/shaders/crt-royale/src/phosphor-mask-resizing.h
index 8508688..dc82562 100644
--- a/crt/shaders/crt-royale/src/phosphor-mask-resizing.h
+++ b/crt/shaders/crt-royale/src/phosphor-mask-resizing.h
@@ -40,76 +40,26 @@
 #endif  //  No else needed: Dynamic loops assumed.
 
 
-    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
-        const vec4 true_i = vec4(i_base + i) + vec4(0.0, 1.0, 2.0, 3.0); \
-        const vec4 tile_uv_r = fract(                                         \
-            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
-        const vec4 tex_uv_r = tile_uv_r * tile_size_uv_r;
-
-    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
-        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
-        const vec3 new_sample0 = tex2Dlod0try(texture,                       \
-            vec2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
-        const vec3 new_sample1 = tex2Dlod0try(texture,                       \
-            vec2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
-        const vec3 new_sample2 = tex2Dlod0try(texture,                       \
-            vec2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
-        const vec3 new_sample3 = tex2Dlod0try(texture,                       \
-            vec2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
-        UPDATE_COLOR_AND_WEIGHT_SUMS;
-		
-	#define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
-        const vec4 dist = magnification_scale *                              \
-            abs(first_dist_unscaled - true_i);                                 \
-        const vec4 pi_dist = pi * dist;                                      \
-        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
-        pixel_color += new_sample0 * weights.xxx;                              \
-        pixel_color += new_sample1 * weights.yyy;                              \
-        pixel_color += new_sample2 * weights.zzz;                              \
-        pixel_color += new_sample3 * weights.www;                              \
-        weight_sum += weights;
-		
-	#ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
-        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
-            const vec4 pi_dist_over_lobes = pi_over_lobes * dist;            \
-            const vec4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
-                (pi_dist*pi_dist_over_lobes), vec4(1.0));
-    #else
-        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
-            const vec4 weights = min(sin(pi_dist)/pi_dist, vec4(1.0));
-    #endif
-	
-	#define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
-        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
-        const vec3 new_sample0 = tex2Dlod0try(texture,                       \
-            vec2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
-        const vec3 new_sample1 = tex2Dlod0try(texture,                       \
-            vec2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
-        const vec3 new_sample2 = tex2Dlod0try(texture,                       \
-            vec2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
-        const vec3 new_sample3 = tex2Dlod0try(texture,                       \
-            vec2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
-        UPDATE_COLOR_AND_WEIGHT_SUMS;
-
 //////////////////////////////////  CONSTANTS  /////////////////////////////////
 
 //  The larger the resized tile, the fewer samples we'll need for downsizing.
 //  See if we can get a static min tile size > mask_min_allowed_tile_size:
-const float mask_min_allowed_tile_size = ceil(
+static const float mask_min_allowed_tile_size = ceil(
     mask_min_allowed_triad_size * mask_triads_per_tile);
-const float mask_min_expected_tile_size = 
+static const float mask_min_expected_tile_size = 
         mask_min_allowed_tile_size;
 //  Limit the number of sinc resize taps by the maximum minification factor:
-const float pi_over_lobes = pi/mask_sinc_lobes;
-const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+static const float pi_over_lobes = pi/mask_sinc_lobes;
+static const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
     mask_resize_src_lut_size.x/mask_min_expected_tile_size;
 //  Vectorized loops sample in multiples of 4.  Round up to be safe:
-const float max_sinc_resize_samples_m4 = ceil(
+static const float max_sinc_resize_samples_m4 = ceil(
     max_sinc_resize_samples_float * 0.25) * 4.0;
-	
-	/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
 
-float get_dynamic_loop_size(const float magnification_scale)
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
 {
     //  Requires:   The following global constants must be defined:
     //              1.) mask_sinc_lobes
@@ -130,10 +80,10 @@ float get_dynamic_loop_size(const float magnification_scale)
     return min(min_samples_m4, max_samples_m4);
 }
 
-vec2 get_first_texel_tile_uv_and_dist(const vec2 tex_uv, 
-    const vec2 texture_size, const float dr, 
+float2 get_first_texel_tile_uv_and_dist(const float2 tex_uv, 
+    const float2 tex_size, const float dr, 
     const float input_tiles_per_texture_r, const float samples,
-    const bool vertical)
+    static const bool vertical)
 {
     //  Requires:   1.) dr == du == 1.0/texture_size.x or
     //                  dr == dv == 1.0/texture_size.y
@@ -151,216 +101,122 @@ vec2 get_first_texel_tile_uv_and_dist(const vec2 tex_uv,
     //  so get the first sample location and distance.  Modify both dimensions
     //  as if we're doing a one-pass 2D resize; we'll throw away the unneeded
     //  (and incorrect) dimension at the end.
-    const vec2 curr_texel = tex_uv * texture_size;
-    const vec2 prev_texel =
-        floor(curr_texel - vec2(under_half)) + vec2(0.5);
-    const vec2 first_texel = prev_texel - vec2(samples/2.0 - 1.0);
-    const vec2 first_texel_uv_wrap_2D = first_texel * dr;
-    const vec2 first_texel_dist_2D = curr_texel - first_texel;
-    //  Convert from tex_uv to tile_uv coords so we can sub fracts for fmods.
-    const vec2 first_texel_tile_uv_wrap_2D =
+    const float2 curr_texel = tex_uv * tex_size;
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 first_texel = prev_texel - float2(samples/2.0 - 1.0);
+    const float2 first_texel_uv_wrap_2D = first_texel * dr;
+    const float2 first_texel_dist_2D = curr_texel - first_texel;
+    //  Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const float2 first_texel_tile_uv_wrap_2D =
         first_texel_uv_wrap_2D * input_tiles_per_texture_r;
     //  Project wrapped coordinates to the [0, 1] range.  We'll do this with all
     //  samples,but the first texel is special, since it might be negative.
-    vec2 coord_negative = vec2(0.0);
-        if(first_texel_tile_uv_wrap_2D.x < 0.0) coord_negative.x = first_texel_tile_uv_wrap_2D.x;
-		if(first_texel_tile_uv_wrap_2D.x < 0.0) coord_negative.y = first_texel_tile_uv_wrap_2D.y;
-    const vec2 first_texel_tile_uv_2D =
-        fract(first_texel_tile_uv_wrap_2D) + coord_negative;
+    const float2 coord_negative =
+        float2((first_texel_tile_uv_wrap_2D.x < 0.),(first_texel_tile_uv_wrap_2D.y < 0.));
+    const float2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
     //  Pack the first texel's tile_uv coord and texel distance in 1D:
-    const vec2 tile_u_and_dist =
-        vec2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
-    const vec2 tile_v_and_dist =
-        vec2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    const float2 tile_u_and_dist =
+        float2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const float2 tile_v_and_dist =
+        float2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
     return vertical ? tile_v_and_dist : tile_u_and_dist;
-    //return mix(tile_u_and_dist, tile_v_and_dist, float(vertical));
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
 }
 
-vec4 tex2Dlod0try(const sampler2D tex, const vec2 tex_uv)
+inline float4 tex2Dlod0try(const sampler2D tex, const float2 tex_uv)
 {
     //  Mipmapping and anisotropic filtering get confused by sinc-resampling.
     //  One [slow] workaround is to select the lowest mip level:
     #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
-        return tex2Dlod(tex, vec4(tex_uv, 0.0, 0.0));
+        return textureLod(tex, tex_uv, 0.0);  //  textureLod() needs an explicit LOD argument (0.0 here).
     #else
         #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
-            return tex2Dbias(tex, vec4(tex_uv, 0.0, -16.0));
+            return tex2Dbias(tex, float4(tex_uv, 0.0, -16.0));
         #else
             return texture(tex, tex_uv);
         #endif
     #endif
 }
-	
-////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
 
-vec2 get_resized_mask_tile_size(const vec2 estimated_viewport_size,
-    const vec2 estimated_mask_resize_output_size,
-    const bool solemnly_swear_same_inputs_for_every_pass)
-{
-    //  Requires:   The following global constants must be defined according to
-    //              certain constraints:
-    //              1.) mask_resize_num_triads: Must be high enough that our
-    //                  mask sampling method won't have artifacts later
-    //                  (long story; see derived-settings-and-constants.h)
-    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
-    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
-    //              4.) mask_min_allowed_triad_size: User setting (the more
-    //                  restrictive it is, the faster the resize will go)
-    //              5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
-    //              6.) mask_triad_size_desired_{runtime, static}
-    //              7.) mask_num_triads_desired_{runtime, static}
-    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
-    //              The function parameters must be defined as follows:
-    //              1.) estimated_viewport_size == (final viewport size);
-    //                  If mask_specify_num_triads is 1.0/true and the viewport
-    //                  estimate is wrong, the number of triads will differ from
-    //                  the user's preference by about the same factor.
-    //              2.) estimated_mask_resize_output_size: Must equal the
-    //                  output size of the MASK_RESIZE pass.
-    //                  Exception: The x component may be estimated garbage if
-    //                  and only if the caller throws away the x result.
-    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
-    //                  unless you can guarantee that every call across every
-    //                  pass will use the same sizes for the other parameters.
-    //              When calling this across multiple passes, always use the
-    //              same y viewport size/scale, and always use the same x
-    //              viewport size/scale when using the x result.
-    //  Returns:    Return the final size of a manually resized mask tile, after
-    //              constraining the desired size to avoid artifacts.  Under
-    //              unusual circumstances, tiles may become stretched vertically
-    //              (see wall of text below).
-    //  Stated tile properties must be correct:
-    const float tile_aspect_ratio_inv =
-        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
-    const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
-    const vec2 tile_aspect = vec2(1.0, tile_aspect_ratio_inv);
-    //  If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
-    //  wrong, the user preference will be misinterpreted:
-    const float desired_tile_size_x = mask_triads_per_tile * mix(
-        params.mask_triad_size_desired,
-        estimated_viewport_size.x / params.mask_num_triads_desired,
-        params.mask_specify_num_triads);
-    if(params.mask_sample_mode_desired > 0.5)
-    {
-        //  We don't need constraints unless we're sampling MASK_RESIZE.
-        return desired_tile_size_x * tile_aspect;
-    }
-    //  Make sure we're not upsizing:
-    const float temp_tile_size_x =
-        min(desired_tile_size_x, mask_resize_src_lut_size.x);
-    //  Enforce min_tile_size and max_tile_size in both dimensions:
-    const vec2 temp_tile_size = temp_tile_size_x * tile_aspect;
-    const vec2 min_tile_size =
-        mask_min_allowed_tile_size * tile_aspect;
-    const vec2 max_tile_size =
-        estimated_mask_resize_output_size / mask_resize_num_tiles;
-    const vec2 clamped_tile_size =
-        clamp(temp_tile_size, min_tile_size, max_tile_size);
-    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
-    //  If we're currently resizing in the y dimension, the x components
-    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
-    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
-    //  We can't adjust the y size based on clamped_tile_size.x.  If it
-    //  clamps when it shouldn't, it won't clamp again when later passes
-    //  call this function with the correct sizes, and the discrepancy will
-    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
-    //  the x size based on the y size, but not vice versa, unless the
-    //  caller swears the parameters were the same (correct) in every pass.
-    //  As a result, triads could appear vertically stretched if:
-    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
-    //      LUT's might clamp x more than y (all provided LUT's are square)
-    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
-    //      with a vertically oriented screen (not accounted for anyway)
-    //  c.) mask_resize_viewport_scale.x < masked_resize_viewport_scale.y:
-    //      Viewport scales are equal by default.
-    //  If any of these are the case, you can fix the stretching by setting:
-    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
-    //          (1.0 / min_expected_aspect_ratio) *
-    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
-    const float x_tile_size_from_y =
-        clamped_tile_size.y * tile_aspect_ratio;
-    const float y_tile_size_from_x = mix(clamped_tile_size.y,
-        clamped_tile_size.x * tile_aspect_ratio_inv,
-        float(solemnly_swear_same_inputs_for_every_pass));
-    const vec2 reclamped_tile_size = vec2(
-        min(clamped_tile_size.x, x_tile_size_from_y),
-        min(clamped_tile_size.y, y_tile_size_from_x));
-    //  We need integer tile sizes in both directions for tiled sampling to
-    //  work correctly.  Use floor (to make sure we don't round up), but be
-    //  careful to avoid a rounding bug where floor decreases whole numbers:
-    const vec2 final_resized_tile_size =
-        floor(reclamped_tile_size + vec2(FIX_ZERO(0.0)));
-    return final_resized_tile_size;
-}
 
-/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(x) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//      in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a float4 of the first texel's u or v
+    //  tile_uv coord in [0, 1].  tex_uv_r will contain the tile_uv u or v coord
+    //  for four new texel samples.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const float4 true_i = float4(i_base + i) + float4(0.0, 1.0, 2.0, 3.0); \
+        const float4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const float4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const float4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), float4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const float4 weights = min(sin(pi_dist)/pi_dist, float4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const float4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const float4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const float3 new_sample0 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const float3 new_sample1 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const float3 new_sample2 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const float3 new_sample3 = tex2Dlod0try(tex,                       \
+            float2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
 
-vec4 get_mask_sampling_parameters(const vec2 mask_resize_texture_size,
-    const vec2 mask_resize_video_size, const vec2 true_viewport_size,
-    out vec2 mask_tiles_per_screen)
-{
-    //  Requires:   1.) Requirements of get_resized_mask_tile_size() must be
-    //                  met, particularly regarding global constants.
-    //              The function parameters must be defined as follows:
-    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
-    //                  if get_mask_sample_mode() is 0 (otherwise anything)
-    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
-    //                  if get_mask_sample_mode() is 0 (otherwise anything)
-    //              3.) true_viewport_size == IN.output_size for a pass set to
-    //                  1.0 viewport scale (i.e. it must be correct)
-    //  Returns:    Return a vec4 containing:
-    //                  xy: tex_uv coords for the start of the mask tile
-    //                  zw: tex_uv size of the mask tile from start to end
-    //              mask_tiles_per_screen is an out parameter containing the
-    //              number of mask tiles that will fit on the screen.
-    //  First get the final resized tile size.  The viewport size and mask
-    //  resize viewport scale must be correct, but don't solemnly swear they
-    //  were correct in both mask resize passes unless you know it's true.
-    //  (We can better ensure a correct tile aspect ratio if the parameters are
-    //  guaranteed correct in all passes...but if we lie, we'll get inconsistent
-    //  sizes across passes, resulting in broken texture coordinates.)
-    const float mask_sample_mode = params.mask_sample_mode_desired;//get_mask_sample_mode();
-    const vec2 mask_resize_tile_size = get_resized_mask_tile_size(
-        true_viewport_size, mask_resize_video_size, false);
-    if(mask_sample_mode < 0.5)
-    {
-        //  Sample MASK_RESIZE: The resized tile is a fracttion of the texture
-        //  size and starts at a nonzero offset to allow for border texels:
-        const vec2 mask_tile_uv_size = mask_resize_tile_size /
-            mask_resize_texture_size;
-        const vec2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
-        const vec2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
-        //  mask_tiles_per_screen must be based on the *true* viewport size:
-        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
-        return vec4(mask_tile_start_uv, mask_tile_uv_size);
-    }
-    else
-    {
-        //  If we're tiling at the original size (1:1 pixel:texel), redefine a
-        //  "tile" to be the full texture containing many triads.  Otherwise,
-        //  we're hardware-resampling an LUT, and the texture truly contains a
-        //  single unresized phosphor mask tile anyway.
-        const vec2 mask_tile_uv_size = vec2(1.0);
-        const vec2 mask_tile_start_uv = vec2(0.0);
-        if(mask_sample_mode > 1.5)
-        {
-            //  Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
-            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
-        }
-        else
-        {
-            //  Hardware-resize the original LUT:
-            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
-        }
-        return vec4(mask_tile_start_uv, mask_tile_uv_size);
-    }
-}
 
 ////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
 
-vec3 downsample_vertical_sinc_tiled(const sampler2D texture,
-    const vec2 tex_uv, const vec2 texture_size, const float dr,
-    const float magnification_scale, const float tile_size_uv_r)
+float3 downsample_vertical_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, static const float dr,
+    const float magnification_scale, static const float tile_size_uv_r)
 {
     //  Requires:   1.) dr == du == 1.0/texture_size.x or
     //                  dr == dv == 1.0/texture_size.y
@@ -381,29 +237,29 @@ vec3 downsample_vertical_sinc_tiled(const sampler2D texture,
     #ifdef USE_SINGLE_STATIC_LOOP
         //  A static loop can be faster, but it might blur too much from using
         //  more samples than it should.
-        const int samples = int(max_sinc_resize_samples_m4);
+        static const int samples = int(max_sinc_resize_samples_m4);
     #else
         const int samples = int(get_dynamic_loop_size(magnification_scale));
     #endif
 
     //  Get the first sample location (scalar tile uv coord along the resized
     //  dimension) and distance from the output location (in texels):
-    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    static const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
     //  true = vertical resize:
-    const vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
-        tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, true);
-    const vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
-    const vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, true);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
     //  Get the tile sample offset:
-    const float tile_dr = dr * input_tiles_per_texture_r;
+    static const float tile_dr = dr * input_tiles_per_texture_r;
 
     //  Sum up each weight and weighted sample color, varying the looping
     //  strategy based on our expected dynamic loop capabilities.  See the
     //  loop body macros above.
     int i_base = 0;
-    vec4 weight_sum = vec4(0.0);
-    vec3 pixel_color = vec3(0.0);
-    const int i_step = 4;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
     #ifdef BREAK_LOOPS_INTO_PIECES
         if(samples - i_base >= 64)
         {
@@ -460,14 +316,14 @@ vec3 downsample_vertical_sinc_tiled(const sampler2D texture,
         }
     #endif
     //  Normalize so the weight_sum == 1.0, and return:
-    const vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
-    const vec3 scalar_weight_sum = vec3(weight_sum_reduce.x + 
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x + 
         weight_sum_reduce.y);
     return (pixel_color/scalar_weight_sum);
 }
 
-vec3 downsample_horizontal_sinc_tiled(const sampler2D texture,
-    const vec2 tex_uv, const vec2 texture_size, const float dr,
+float3 downsample_horizontal_sinc_tiled(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size, const float dr,
     const float magnification_scale, const float tile_size_uv_r)
 {
     //  Differences from downsample_horizontal_sinc_tiled:
@@ -486,7 +342,7 @@ vec3 downsample_horizontal_sinc_tiled(const sampler2D texture,
     //  we're resizing along, e.g. "dx" in this case.
     #ifdef USE_SINGLE_STATIC_LOOP
         //  If we have to load all samples, we might as well use them.
-        const int samples = int(max_sinc_resize_samples_m4);
+        static const int samples = int(max_sinc_resize_samples_m4);
     #else
         const int samples = int(get_dynamic_loop_size(magnification_scale));
     #endif
@@ -495,10 +351,10 @@ vec3 downsample_horizontal_sinc_tiled(const sampler2D texture,
     //  dimension) and distance from the output location (in texels):
     const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
     //  false = horizontal resize:
-    const vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
-        tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, false);
-    const vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
-    const vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    const float2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, tex_size, dr, input_tiles_per_texture_r, samples, false);
+    const float4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const float4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
     //  Get the tile sample offset:
     const float tile_dr = dr * input_tiles_per_texture_r;
 
@@ -506,9 +362,9 @@ vec3 downsample_horizontal_sinc_tiled(const sampler2D texture,
     //  strategy based on our expected dynamic loop capabilities.  See the
     //  loop body macros above.
     int i_base = 0;
-    vec4 weight_sum = vec4(0.0);
-    vec3 pixel_color = vec3(0.0);
-    const int i_step = 4;
+    float4 weight_sum = float4(0.0);
+    float3 pixel_color = float3(0.0);
+    static const int i_step = 4;
     #ifdef BREAK_LOOPS_INTO_PIECES
         if(samples - i_base >= 64)
         {
@@ -565,47 +421,243 @@ vec3 downsample_horizontal_sinc_tiled(const sampler2D texture,
         }
     #endif
     //  Normalize so the weight_sum == 1.0, and return:
-    const vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
-    const vec3 scalar_weight_sum = vec3(weight_sum_reduce.x +
+    const float2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const float3 scalar_weight_sum = float3(weight_sum_reduce.x +
         weight_sum_reduce.y);
     return (pixel_color/scalar_weight_sum);
 }
 
-vec2 convert_phosphor_tile_uv_wrap_to_tex_uv(const vec2 tile_uv_wrap,
-    const vec4 mask_tile_start_uv_and_size)
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+float2 get_resized_mask_tile_size(const float2 estimated_viewport_size,
+    const float2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    //  Returns:    Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts.  Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    //  Stated tile properties must be correct:
+    static const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    static const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    static const float2 tile_aspect = float2(1.0, tile_aspect_ratio_inv);
+    //  If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    //  wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        global.mask_triad_size_desired,
+        estimated_viewport_size.x / global.mask_num_triads_desired,
+        global.mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        //  We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    //  Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    const float2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    static const float2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const float2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const float2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    //  We can't adjust the y size based on clamped_tile_size.x.  If it
+    //  clamps when it shouldn't, it won't clamp again when later passes
+    //  call this function with the correct sizes, and the discrepancy will
+    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
+    //  the x size based on the y size, but not vice versa, unless the
+    //  caller swears the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, you can fix the stretching by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const float2 reclamped_tile_size = float2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    const float2 final_resized_tile_size =
+        floor(reclamped_tile_size + float2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+float4 get_mask_sampling_parameters(const float2 mask_resize_texture_size,
+    const float2 mask_resize_video_size, const float2 true_viewport_size,
+    out float2 mask_tiles_per_screen)
+{
+    //  Requires:   1.) Requirements of get_resized_mask_tile_size() must be
+    //                  met, particularly regarding global constants.
+    //              The function parameters must be defined as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              3.) true_viewport_size == IN.output_size for a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct)
+    //  Returns:    Return a float4 containing:
+    //                  xy: tex_uv coords for the start of the mask tile
+    //                  zw: tex_uv size of the mask tile from start to end
+    //              mask_tiles_per_screen is an out parameter containing the
+    //              number of mask tiles that will fit on the screen.
+    //  First get the final resized tile size.  The viewport size and mask
+    //  resize viewport scale must be correct, but don't solemnly swear they
+    //  were correct in both mask resize passes unless you know it's true.
+    //  (We can better ensure a correct tile aspect ratio if the parameters are
+    //  guaranteed correct in all passes...but if we lie, we'll get inconsistent
+    //  sizes across passes, resulting in broken texture coordinates.)
+    const float mask_sample_mode = get_mask_sample_mode();
+    const float2 mask_resize_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    if(mask_sample_mode < 0.5)
+    {
+        //  Sample MASK_RESIZE: The resized tile is a fraction of the texture
+        //  size and starts at a nonzero offset to allow for border texels:
+        const float2 mask_tile_uv_size = mask_resize_tile_size /
+            mask_resize_texture_size;
+        const float2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
+        const float2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
+        //  mask_tiles_per_screen must be based on the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+    else
+    {
+        //  If we're tiling at the original size (1:1 pixel:texel), redefine a
+        //  "tile" to be the full texture containing many triads.  Otherwise,
+        //  we're hardware-resampling an LUT, and the texture truly contains a
+        //  single unresized phosphor mask tile anyway.
+        static const float2 mask_tile_uv_size = float2(1.0);
+        static const float2 mask_tile_start_uv = float2(0.0);
+        if(mask_sample_mode > 1.5)
+        {
+            //  Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else
+        {
+            //  Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        }
+        return float4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+}
+/*
+float2 fix_tiling_discontinuities_normalized(const float2 tile_uv,
+    float2 duv_dx, float2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If the current block has
+    //  a u or v discontinuity and the current fragment is in the first half of
+    //  the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    //  to that coord to make the 2x2 block continuous.  (It will now have a
+    //  coord > 1.0 in the padding area beyond the tile.)  This function takes
+    //  derivatives as parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const float2 pixel_in_first_half_tile = float2((tile_uv.x < 0.5),(tile_uv.y < 0.5));
+    const float2 jump_exists = float2(((duv_dx + duv_dy).x > 0.5),((duv_dx + duv_dy).y > 0.5));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+*/
+float2 convert_phosphor_tile_uv_wrap_to_tex_uv(const float2 tile_uv_wrap,
+    const float4 mask_tile_start_uv_and_size)
 {
     //  Requires:   1.) tile_uv_wrap contains tile-relative uv coords, where the
     //                  tile spans from [0, 1], such that (0.5, 0.5) is at the
     //                  tile center.  The input coords can range from [0, inf],
-    //                  and their fracttional parts map to a repeated tile.
+    //                  and their fractional parts map to a repeated tile.
     //                  ("Tile" can mean texture, the video embedded in the
     //                  texture, or some other "tile" embedded in a texture.)
     //              2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
     //                  for the start of the embedded tile in the full texture.
-    //              3.) mask_tile_start_uv_and_size.zw contains the [fracttional]
+    //              3.) mask_tile_start_uv_and_size.zw contains the [fractional]
     //                  tex_uv size of the embedded tile in the full texture.
     //  Returns:    Return tex_uv coords (used for texture sampling)
     //              corresponding to tile_uv_wrap.
-    if(params.mask_sample_mode_desired < 0.5)
+    if(get_mask_sample_mode() < 0.5)
     {
         //  Manually repeat the resized mask tile to fill the screen:
-        //  First get fracttional tile_uv coords.  Using fract/fmod on coords
+        //  First get fractional tile_uv coords.  Using frac/fmod on coords
         //  confuses anisotropic filtering; fix it as user options dictate.
         //  derived-settings-and-constants.h disables incompatible options.
         #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
-            vec2 tile_uv = fract(tile_uv_wrap * 0.5) * 2.0;
+            float2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;
         #else
-            vec2 tile_uv = fract(tile_uv_wrap);
+            float2 tile_uv = frac(tile_uv_wrap);
         #endif
         #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
-            const vec2 tile_uv_dx = ddx(tile_uv);
-            const vec2 tile_uv_dy = ddy(tile_uv);
+            const float2 tile_uv_dx = ddx(tile_uv);
+            const float2 tile_uv_dy = ddy(tile_uv);
             tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
                 tile_uv_dx, tile_uv_dy);
         #endif
         //  The tile is embedded in a padded FBO, and it may start at a
         //  nonzero offset if border texels are used to avoid artifacts:
-        const vec2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
+        const float2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
             tile_uv * mask_tile_start_uv_and_size.zw;
         return mask_tex_uv;
     }
@@ -620,5 +672,6 @@ vec2 convert_phosphor_tile_uv_wrap_to_tex_uv(const vec2 tile_uv_wrap,
     }
 }
 
+
 #endif  //  PHOSPHOR_MASK_RESIZING_H
 
diff --git a/crt/shaders/crt-royale/src/scanline-functions.h b/crt/shaders/crt-royale/src/scanline-functions.h
index 5169b3d..9c4f9e5 100644
--- a/crt/shaders/crt-royale/src/scanline-functions.h
+++ b/crt/shaders/crt-royale/src/scanline-functions.h
@@ -27,284 +27,10 @@
 #include "../../../../include/special-functions.h"
 #include "../../../../include/gamma-management.h"
 
+
 /////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
 
-vec3 get_raw_interpolated_color(const vec3 color0,
-    const vec3 color1, const vec3 color2, const vec3 color3,
-    const vec4 weights)
-{
-    //  Use max to avoid bizarre artifacts from negative colors:
-    return max(mat4x3(color0, color1, color2, color3) * weights, 0.0);
-}
-
-vec3 get_interpolated_linear_color(const vec3 color0, const vec3 color1,
-    const vec3 color2, const vec3 color3, const vec4 weights)
-{
-    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
-    //                  intermediate_gamma must be globally defined, and input
-    //                  colors are interpreted as linear RGB unless you #define
-    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
-    //                  interpreted as gamma-encoded with intermediate_gamma).
-    //              2.) color0-3 are colors sampled from a texture with tex2D().
-    //                  They are interpreted as defined in requirement 1.
-    //              3.) weights contains weights for each color, summing to 1.0.
-    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
-    //                  float in [0.0, 1.0] describing how much blending should
-    //                  be done in linear RGB (rest is gamma-corrected RGB).
-    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
-    //                  if beam_horiz_linear_rgb_weight is anything other than a
-    //                  static constant, or we may try branching at runtime
-    //                  without dynamic branches allowed (slow).
-    //  Returns:    Return an interpolated color lookup between the four input
-    //              colors based on the weights in weights.  The final color will
-    //              be a linear RGB value, but the blending will be done as
-    //              indicated above.
-    const float intermediate_gamma = get_intermediate_gamma();
-    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
-    //  profile allows dynamic branches (faster than computing extra pows):
-    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
-        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
-    #else
-        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
-            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
-        #endif
-    #endif
-    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
-        //  beam_horiz_linear_rgb_weight is static, so we can branch:
-        #ifdef GAMMA_ENCODE_EVERY_FBO
-            const vec3 gamma_mixed_color = pow(get_raw_interpolated_color(
-                color0, color1, color2, color3, weights), vec3(intermediate_gamma));
-            if(beam_horiz_linear_rgb_weight > 0.0)
-            {
-                const vec3 linear_mixed_color = get_raw_interpolated_color(
-                    pow(color0, vec3(intermediate_gamma)),
-                    pow(color1, vec3(intermediate_gamma)),
-                    pow(color2, vec3(intermediate_gamma)),
-                    pow(color3, vec3(intermediate_gamma)),
-                    weights);
-                return mix(gamma_mixed_color, linear_mixed_color,
-                    beam_horiz_linear_rgb_weight);
-            }
-            else
-            {
-                return gamma_mixed_color;
-            }
-        #else
-            const vec3 linear_mixed_color = get_raw_interpolated_color(
-                color0, color1, color2, color3, weights);
-            if(beam_horiz_linear_rgb_weight < 1.0)
-            {
-                const vec3 gamma_mixed_color = get_raw_interpolated_color(
-                    pow(color0, vec3(1.0/intermediate_gamma)),
-                    pow(color1, vec3(1.0/intermediate_gamma)),
-                    pow(color2, vec3(1.0/intermediate_gamma)),
-                    pow(color3, vec3(1.0/intermediate_gamma)),
-                    weights);
-                return mix(gamma_mixed_color, linear_mixed_color,
-                    beam_horiz_linear_rgb_weight);
-            }
-            else
-            {
-                return linear_mixed_color;
-            }
-        #endif  //  GAMMA_ENCODE_EVERY_FBO
-    #else
-        #ifdef GAMMA_ENCODE_EVERY_FBO
-            //  Inputs: color0-3 are colors in gamma-encoded RGB.
-            const vec3 gamma_mixed_color = pow(get_raw_interpolated_color(
-                color0, color1, color2, color3, weights), vec3(intermediate_gamma));
-            const vec3 linear_mixed_color = get_raw_interpolated_color(
-                pow(color0, vec3(intermediate_gamma)),
-                pow(color1, vec3(intermediate_gamma)),
-                pow(color2, vec3(intermediate_gamma)),
-                pow(color3, vec3(intermediate_gamma)),
-                weights);
-            return mix(gamma_mixed_color, linear_mixed_color,
-                beam_horiz_linear_rgb_weight);
-        #else
-            //  Inputs: color0-3 are colors in linear RGB.
-            const vec3 linear_mixed_color = get_raw_interpolated_color(
-                color0, color1, color2, color3, weights);
-            const vec3 gamma_mixed_color = get_raw_interpolated_color(
-                    pow(color0, vec3(1.0/intermediate_gamma)),
-                    pow(color1, vec3(1.0/intermediate_gamma)),
-                    pow(color2, vec3(1.0/intermediate_gamma)),
-                    pow(color3, vec3(1.0/intermediate_gamma)),
-                    weights);
-            return mix(gamma_mixed_color, linear_mixed_color,
-                beam_horiz_linear_rgb_weight);
-        #endif  //  GAMMA_ENCODE_EVERY_FBO
-    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
-}
-
-vec3 get_scanline_color(const sampler2D tex, const vec2 scanline_uv,
-    const vec2 uv_step_x, const vec4 weights)
-{
-    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
-    //                  desired line or scanline and horizontally snapped to the
-    //                  texel just left of the output pixel (color1)
-    //              2.) uv_step_x must contain the horizontal uv distance
-    //                  between texels.
-    //              3.) weights must contain interpolation filter weights for
-    //                  color0, color1, color2, and color3, where color1 is just
-    //                  left of the output pixel.
-    //  Returns:    Return a horizontally interpolated texture lookup using 2-4
-    //              nearby texels, according to weights and the conventions of
-    //              get_interpolated_linear_color().
-    //  We can ignore the outside texture lookups for Quilez resampling.
-    const vec3 color1 = texture(tex, scanline_uv).rgb;
-    const vec3 color2 = texture(tex, scanline_uv + uv_step_x).rgb;
-    vec3 color0 = vec3(0.0);
-    vec3 color3 = vec3(0.0);
-    if(params.beam_horiz_filter > 0.5)
-    {
-        color0 = texture(tex, scanline_uv - uv_step_x).rgb;
-        color3 = texture(tex, scanline_uv + 2.0 * uv_step_x).rgb;
-    }
-    //  Sample the texture as-is, whether it's linear or gamma-encoded:
-    //  get_interpolated_linear_color() will handle the difference.
-    return get_interpolated_linear_color(color0, color1, color2, color3, weights);
-}
-
-vec3 sample_single_scanline_horizontal(const sampler2D texture,
-    const vec2 tex_uv, const vec2 texture_size,
-    const vec2 texture_size_inv)
-{
-    //  TODO: Add function requirements.
-    //  Snap to the previous texel and get sample dists from 2/4 nearby texels:
-    const vec2 curr_texel = tex_uv * texture_size;
-    //  Use under_half to fix a rounding bug right around exact texel locations.
-    const vec2 prev_texel =
-        floor(curr_texel - vec2(under_half)) + vec2(0.5);
-    const vec2 prev_texel_hor = vec2(prev_texel.x, curr_texel.y);
-    const vec2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv;
-    const float prev_dist = curr_texel.x - prev_texel_hor.x;
-    const vec4 sample_dists = vec4(1.0 + prev_dist, prev_dist,
-        1.0 - prev_dist, 2.0 - prev_dist);
-    //  Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
-    vec4 weights;
-    if(params.beam_horiz_filter < 0.5)
-    {
-        //  Quilez:
-        const float x = sample_dists.y;
-        const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0);
-        weights = vec4(0.0, 1.0 - w2, w2, 0.0);
-    }
-    else if(params.beam_horiz_filter < 1.5)
-    {
-        //  Gaussian:
-        float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
-        weights = exp(-(sample_dists*sample_dists)*inner_denom_inv);
-    }
-    else
-    {
-        //  Lanczos2:
-        const vec4 pi_dists = FIX_ZERO(sample_dists * pi);
-        weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) /
-            (pi_dists * pi_dists);
-    }
-    //  Ensure the weight sum == 1.0:
-    const vec4 final_weights = weights/dot(weights, vec4(1.0));
-    //  Get the interpolated horizontal scanline color:
-    const vec2 uv_step_x = vec2(texture_size_inv.x, 0.0);
-    return get_scanline_color(
-        texture, prev_texel_hor_uv, uv_step_x, final_weights);
-}
-
-bool is_interlaced(float num_lines)
-{
-    //  Detect interlacing based on the number of lines in the source.
-    if(interlace_detect == true)
-    {
-        //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
-        //  NTSC Emulators: Typically 224 or 240 lines
-        //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
-        //  PAL Emulators: ?
-        //  ATSC: 720p, 1080i, 1080p
-        //  Where do we place our cutoffs?  Assumptions:
-        //  1.) We only need to care about active lines.
-        //  2.) Anything > 288 and <= 576 lines is probably interlaced.
-        //  3.) Anything > 576 lines is probably not interlaced...
-        //  4.) ...except 1080 lines, which is a crapshoot (user decision).
-        //  5.) Just in case the main program uses calculated video sizes,
-        //      we should nudge the float thresholds a bit.
-        bool sd_interlace;
-		if (num_lines > 288.5 && num_lines < 576.5)
-			{sd_interlace = true;}
-		else
-			{sd_interlace = false;}
-        bool hd_interlace;
-        if (num_lines > 1079.5 && num_lines < 1080.5)
-			{hd_interlace = true;}
-		else
-			{hd_interlace = false;}
-		return (sd_interlace || hd_interlace);
-    }
-    else
-    {
-        return false;
-    }
-}
-
-vec3 sample_rgb_scanline_horizontal(const sampler2D tex,
-    const vec2 tex_uv, const vec2 texture_size,
-    const vec2 texture_size_inv)
-{
-    //  TODO: Add function requirements.
-    //  Rely on a helper to make convergence easier.
-    if(beam_misconvergence == true)
-    {
-        const vec3 convergence_offsets_rgb =
-            get_convergence_offsets_x_vector();
-        const vec3 offset_u_rgb =
-            convergence_offsets_rgb * texture_size_inv.xxx;
-        const vec2 scanline_uv_r = tex_uv - vec2(offset_u_rgb.r, 0.0);
-        const vec2 scanline_uv_g = tex_uv - vec2(offset_u_rgb.g, 0.0);
-        const vec2 scanline_uv_b = tex_uv - vec2(offset_u_rgb.b, 0.0);
-        const vec3 sample_r = sample_single_scanline_horizontal(
-            tex, scanline_uv_r, texture_size, texture_size_inv);
-        const vec3 sample_g = sample_single_scanline_horizontal(
-            tex, scanline_uv_g, texture_size, texture_size_inv);
-        const vec3 sample_b = sample_single_scanline_horizontal(
-            tex, scanline_uv_b, texture_size, texture_size_inv);
-        return vec3(sample_r.r, sample_g.g, sample_b.b);
-    }
-    else
-    {
-        return sample_single_scanline_horizontal(tex, tex_uv, texture_size,
-            texture_size_inv);
-    }
-}
-
-vec2 get_last_scanline_uv(const vec2 tex_uv, const vec2 texture_size,
-    const vec2 texture_size_inv, const vec2 il_step_multiple,
-    const float frame_count, out float dist)
-{
-    //  Compute texture coords for the last/upper scanline, accounting for
-    //  interlacing: With interlacing, only consider even/odd scanlines every
-    //  other frame.  Top-field first (TFF) order puts even scanlines on even
-    //  frames, and BFF order puts them on odd frames.  Texels are centered at:
-    //      frac(tex_uv * texture_size) == x.5
-    //  Caution: If these coordinates ever seem incorrect, first make sure it's
-    //  not because anisotropic filtering is blurring across field boundaries.
-    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
-    const float field_offset = floor(il_step_multiple.y * 0.75) *
-        mod(frame_count + float(interlace_bff), 2.0);
-    const vec2 curr_texel = tex_uv * texture_size;
-    //  Use under_half to fix a rounding bug right around exact texel locations.
-    const vec2 prev_texel_num = floor(curr_texel - vec2(under_half));
-    const float wrong_field = mod(
-        prev_texel_num.y + field_offset, il_step_multiple.y);
-    const vec2 scanline_texel_num = prev_texel_num - vec2(0.0, wrong_field);
-    //  Snap to the center of the previous scanline in the current field:
-    const vec2 scanline_texel = scanline_texel_num + vec2(0.5);
-    const vec2 scanline_uv = scanline_texel * texture_size_inv;
-    //  Save the sample's distance from the scanline, in units of scanlines:
-    dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
-    return scanline_uv;
-}
-
-vec3 get_gaussian_sigma(const vec3 color, const float sigma_range)
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
 {
     //  Requires:   Globals:
     //              1.) beam_min_sigma and beam_max_sigma are global floats
@@ -356,19 +82,19 @@ vec3 get_gaussian_sigma(const vec3 color, const float sigma_range)
     if(beam_spot_shape_function < 0.5)
     {
         //  Use a power function:
-        return vec3(beam_min_sigma) + sigma_range *
-            pow(color, vec3(beam_spot_power));
+        return float3(beam_min_sigma) + sigma_range *
+            pow(color, float3(beam_spot_power));
     }
     else
     {
         //  Use a spherical function:
-        const vec3 color_minus_1 = color - vec3(1.0);
-        return vec3(beam_min_sigma) + sigma_range *
-            sqrt(vec3(1.0) - color_minus_1*color_minus_1);
+        const float3 color_minus_1 = color - float3(1.0);
+        return float3(beam_min_sigma) + sigma_range *
+            sqrt(float3(1.0) - color_minus_1*color_minus_1);
     }
 }
 
-vec3 get_generalized_gaussian_beta(const vec3 color,
+inline float3 get_generalized_gaussian_beta(const float3 color,
     const float shape_range)
 {
     //  Requires:   Globals:
@@ -394,11 +120,11 @@ vec3 get_generalized_gaussian_beta(const vec3 color,
     //      beta widen and sharpen peaks at the risk of aliasing.
     //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
     //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
-    return beam_min_shape + shape_range * pow(color, vec3(beam_shape_power));
+    return beam_min_shape + shape_range * pow(color, float3(beam_shape_power));
 }
 
-vec3 scanline_gaussian_integral_contrib(const vec3 dist,
-    const vec3 color, const float pixel_height, const float sigma_range)
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
 {
     //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
     //                  point(s) from a scanline in units of scanlines, where
@@ -419,16 +145,16 @@ vec3 scanline_gaussian_integral_contrib(const vec3 dist,
     //  average brightness over a given pixel area.  Even if curved coords were
     //  used in this pass, a flat scalar pixel height works almost as well as a
     //  pixel height computed from a full pixel-space to scanline-space matrix.
-    const vec3 sigma = get_gaussian_sigma(color, sigma_range);
-    const vec3 ph_offset = vec3(pixel_height * 0.5);
-    const vec3 denom_inv = 1.0/(sigma*sqrt(2.0));
-    const vec3 integral_high = erf((dist + ph_offset)*denom_inv);
-    const vec3 integral_low = erf((dist - ph_offset)*denom_inv);
+    const float3 sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 ph_offset = float3(pixel_height * 0.5);
+    const float3 denom_inv = 1.0/(sigma*sqrt(2.0));
+    const float3 integral_high = erf((dist + ph_offset)*denom_inv);
+    const float3 integral_low = erf((dist - ph_offset)*denom_inv);
     return color * 0.5*(integral_high - integral_low)/pixel_height;
 }
 
-vec3 scanline_generalized_gaussian_integral_contrib(const vec3 dist,
-    const vec3 color, const float pixel_height, const float sigma_range,
+float3 scanline_generalized_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range,
     const float shape_range)
 {
     //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
@@ -450,44 +176,44 @@ vec3 scanline_generalized_gaussian_integral_contrib(const vec3 dist,
     //  models models standard deviation at beta == 2, because the standard
     //  deviation depends on both alpha and beta (keeping alpha independent is
     //  faster and preserves intuitive behavior and a full spectrum of results).
-    const vec3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
-    const vec3 beta = get_generalized_gaussian_beta(color, shape_range);
-    const vec3 alpha_inv = vec3(1.0)/alpha;
-    const vec3 s = vec3(1.0)/beta;
-    const vec3 ph_offset = vec3(pixel_height * 0.5);
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 s = float3(1.0)/beta;
+    const float3 ph_offset = float3(pixel_height * 0.5);
     //  Pass beta to gamma_impl to avoid repeated divides.  Similarly pass
     //  beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl.
-    const vec3 gamma_s_inv = vec3(1.0)/gamma_impl(s, beta);
-    const vec3 dist1 = dist + ph_offset;
-    const vec3 dist0 = dist - ph_offset;
-    const vec3 integral_high = sign(dist1) * normalized_ligamma_impl(
+    const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta);
+    const float3 dist1 = dist + ph_offset;
+    const float3 dist0 = dist - ph_offset;
+    const float3 integral_high = sign(dist1) * normalized_ligamma_impl(
         s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv);
-    const vec3 integral_low = sign(dist0) * normalized_ligamma_impl(
+    const float3 integral_low = sign(dist0) * normalized_ligamma_impl(
         s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv);
     return color * 0.5*(integral_high - integral_low)/pixel_height;
 }
 
-vec3 scanline_gaussian_sampled_contrib(const vec3 dist, const vec3 color,
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
     const float pixel_height, const float sigma_range)
 {
     //  See scanline_gaussian integral_contrib() for detailed comments!
     //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
-    const vec3 sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 sigma = get_gaussian_sigma(color, sigma_range);
     //  Avoid repeated divides:
-    const vec3 sigma_inv = vec3(1.0)/sigma;
-    const vec3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv;
-    const vec3 outer_denom_inv = sigma_inv/sqrt(2.0*pi);
+    const float3 sigma_inv = float3(1.0)/sigma;
+    const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv;
+    const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi);
     if(beam_antialias_level > 0.5)
     {
         //  Sample 1/3 pixel away in each direction as well:
-        const vec3 sample_offset = vec3(pixel_height/3.0);
-        const vec3 dist2 = dist + sample_offset;
-        const vec3 dist3 = abs(dist - sample_offset);
+        const float3 sample_offset = float3(pixel_height/3.0);
+        const float3 dist2 = dist + sample_offset;
+        const float3 dist3 = abs(dist - sample_offset);
         //  Average three pure Gaussian samples:
-        const vec3 scale = color/3.0 * outer_denom_inv;
-        const vec3 weight1 = exp(-(dist*dist)*inner_denom_inv);
-        const vec3 weight2 = exp(-(dist2*dist2)*inner_denom_inv);
-        const vec3 weight3 = exp(-(dist3*dist3)*inner_denom_inv);
+        const float3 scale = color/3.0 * outer_denom_inv;
+        const float3 weight1 = exp(-(dist*dist)*inner_denom_inv);
+        const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv);
+        const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv);
         return scale * (weight1 + weight2 + weight3);
     }
     else
@@ -496,30 +222,30 @@ vec3 scanline_gaussian_sampled_contrib(const vec3 dist, const vec3 color,
     }
 }
 
-vec3 scanline_generalized_gaussian_sampled_contrib(const vec3 dist,
-    const vec3 color, const float pixel_height, const float sigma_range,
+float3 scanline_generalized_gaussian_sampled_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range,
     const float shape_range)
 {
     //  See scanline_generalized_gaussian_integral_contrib() for details!
     //  generalized sample =
     //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
-    const vec3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
-    const vec3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
     //  Avoid repeated divides:
-    const vec3 alpha_inv = vec3(1.0)/alpha;
-    const vec3 beta_inv = vec3(1.0)/beta;
-    const vec3 scale = color * beta * 0.5 * alpha_inv /
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 beta_inv = float3(1.0)/beta;
+    const float3 scale = color * beta * 0.5 * alpha_inv /
         gamma_impl(beta_inv, beta);
     if(beam_antialias_level > 0.5)
     {
         //  Sample 1/3 pixel closer to and farther from the scanline too.
-        const vec3 sample_offset = vec3(pixel_height/3.0);
-        const vec3 dist2 = dist + sample_offset;
-        const vec3 dist3 = abs(dist - sample_offset);
+        const float3 sample_offset = float3(pixel_height/3.0);
+        const float3 dist2 = dist + sample_offset;
+        const float3 dist3 = abs(dist - sample_offset);
         //  Average three generalized Gaussian samples:
-        const vec3 weight1 = exp(-pow(abs(dist*alpha_inv), beta));
-        const vec3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta));
-        const vec3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta));
+        const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta));
+        const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta));
+        const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta));
         return scale/3.0 * (weight1 + weight2 + weight3);
     }
     else
@@ -528,7 +254,7 @@ vec3 scanline_generalized_gaussian_sampled_contrib(const vec3 dist,
     }
 }
 
-vec3 scanline_contrib(vec3 dist, vec3 color,
+inline float3 scanline_contrib(float3 dist, float3 color,
     float pixel_height, const float sigma_range, const float shape_range)
 {
     //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
@@ -539,7 +265,7 @@ vec3 scanline_contrib(vec3 dist, vec3 color,
     //  Returns:    Return a scanline's light output over a given pixel, using
     //              a generalized or pure Gaussian distribution and sampling or
     //              integrals as desired by user codepath choices.
-    if(beam_generalized_gaussian == true)
+    if(beam_generalized_gaussian)
     {
         if(beam_antialias_level > 1.5)
         {
@@ -567,4 +293,279 @@ vec3 scanline_contrib(vec3 dist, vec3 color,
     }
 }
 
-#endif  //  SCANLINE_FUNCTIONS_H
\ No newline at end of file
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{
+    //  Blend color0-3 by weights; max() guards against negative colors:
+    return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  beam_horiz_linear_rgb_weight is static, so we can branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(intermediate_gamma)),
+                    pow(color1, float3(intermediate_gamma)),
+                    pow(color2, float3(intermediate_gamma)),
+                    pow(color3, float3(intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are colors in gamma-encoded RGB.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), float3(intermediate_gamma));
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, float3(intermediate_gamma)),
+                pow(color1, float3(intermediate_gamma)),
+                pow(color2, float3(intermediate_gamma)),
+                pow(color3, float3(intermediate_gamma)),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, float3(1.0/intermediate_gamma)),
+                    pow(color1, float3(1.0/intermediate_gamma)),
+                    pow(color2, float3(1.0/intermediate_gamma)),
+                    pow(color3, float3(1.0/intermediate_gamma)),
+                    weights);
+            //  NOTE(review): this path reads the weight through `global.`
+            //  while the other branches use the bare name -- confirm intended.
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                global.beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D tex, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to the
+    //                  texel just left of the output pixel (color1)
+    //              2.) uv_step_x must contain the horizontal uv distance
+    //                  between texels.
+    //              3.) weights must contain interpolation filter weights for
+    //                  color0, color1, color2, and color3, where color1 is just
+    //                  left of the output pixel.
+    //  Returns:    Return a horizontally interpolated texture lookup using 2-4
+    //              nearby texels, according to weights and the conventions of
+    //              get_interpolated_linear_color().
+    //  Quilez (beam_horiz_filter <= 0.5) skips the outer texels color0/color3.
+    const float3 color1 = texture(tex, scanline_uv).rgb;
+    const float3 color2 = texture(tex, scanline_uv + uv_step_x).rgb;
+    float3 color0 = float3(0.0);
+    float3 color3 = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        color0 = texture(tex, scanline_uv - uv_step_x).rgb;
+        color3 = texture(tex, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    //  Sample the texture as-is, whether it's linear or gamma-encoded:
+    //  get_interpolated_linear_color() will handle the difference.
+    return get_interpolated_linear_color(color0, color1, color2, color3, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Resamples one scanline horizontally at tex_uv.  TODO: list requirements.
+    //  Snap to the previous texel and get sample dists from 2/4 nearby texels:
+    const float2 curr_texel = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y);
+    const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv;
+    const float prev_dist = curr_texel.x - prev_texel_hor.x;
+    const float4 sample_dists = float4(1.0 + prev_dist, prev_dist,
+        1.0 - prev_dist, 2.0 - prev_dist);
+    //  Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
+    float4 weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez: quintic smoothstep weight for color2; color0/3 get zero.
+        const float x = sample_dists.y;
+        const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0);
+        weights = float4(0.0, 1.0 - w2, w2, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian: exp(-d^2 / (2*sigma^2)) with sigma = beam_horiz_sigma.
+        float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        weights = exp(-(sample_dists*sample_dists)*inner_denom_inv);
+    }
+    else
+    {
+        //  Lanczos2: sinc(d) * sinc(d/2), expressed over pi_dists = pi*d.
+        const float4 pi_dists = FIX_ZERO(sample_dists * pi);
+        weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) /
+            (pi_dists * pi_dists);
+    }
+    //  Ensure the weight sum == 1.0:
+    const float4 final_weights = weights/dot(weights, float4(1.0));
+    //  Get the interpolated horizontal scanline color:
+    const float2 uv_step_x = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(
+        tex, prev_texel_hor_uv, uv_step_x, final_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D tex,
+    const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv)
+{
+    //  Samples R, G, and B at separately offset u coords when beam_misconvergence
+    //  is set; otherwise takes one shared sample.  TODO: list requirements.
+    if(beam_misconvergence)
+    {
+        const float3 convergence_offsets_rgb =
+            get_convergence_offsets_x_vector();
+        const float3 offset_u_rgb =
+            convergence_offsets_rgb * texture_size_inv.xxx;
+        const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0);
+        const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0);
+        const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0);
+        const float3 sample_r = sample_single_scanline_horizontal(
+            tex, scanline_uv_r, tex_size, texture_size_inv);
+        const float3 sample_g = sample_single_scanline_horizontal(
+            tex, scanline_uv_g, tex_size, texture_size_inv);
+        const float3 sample_b = sample_single_scanline_horizontal(
+            tex, scanline_uv_b, tex_size, texture_size_inv);
+        return float3(sample_r.r, sample_g.g, sample_b.b);
+    }
+    else
+    {
+        return sample_single_scanline_horizontal(tex, tex_uv, tex_size,
+            texture_size_inv);
+    }
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 tex_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Compute texture coords for the last/upper scanline, accounting for
+    //  interlacing: With interlacing, only consider even/odd scanlines every
+    //  other frame.  Top-field first (TFF) order puts even scanlines on even
+    //  frames, and BFF order puts them on odd frames.  Texels are centered at:
+    //      frac(tex_uv * tex_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    //  NOTE(review): interlace_bff is read through `global.` here, unlike
+    //  most settings used in this file -- confirm that is intended.
+    const float field_offset = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(global.interlace_bff), 2.0);
+    const float2 curr_texel = tex_uv * tex_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel_num = floor(curr_texel - float2(under_half));
+    const float wrong_field = fmod(
+        prev_texel_num.y + field_offset, il_step_multiple.y);
+    const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 scanline_texel = scanline_texel_num + float2(0.5);
+    const float2 scanline_uv = scanline_texel * texture_size_inv;
+    //  Save the sample's distance from the scanline, in units of scanlines:
+    dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
+    return scanline_uv;
+}
+
+inline bool is_interlaced(float num_lines)
+{
+    //  Detect interlacing based on the number of lines in the source.
+    if(interlace_detect)
+    {
+        //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+        //  NTSC Emulators: Typically 224 or 240 lines
+        //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+        //  PAL Emulators: ?
+        //  ATSC: 720p, 1080i, 1080p
+        //  Where do we place our cutoffs?  Assumptions:
+        //  1.) We only need to care about active lines.
+        //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+        //  3.) Anything > 576 lines is probably not interlaced...
+        //  4.) ...except 1080 lines, a crapshoot decided by interlace_1080i.
+        //  5.) Just in case the main program uses calculated video sizes,
+        //      we should nudge the float thresholds a bit.
+        const bool sd_interlace = ((num_lines > 288.5) && (num_lines < 576.5));
+        const bool hd_interlace = bool(interlace_1080i) ?
+            ((num_lines > 1079.5) && (num_lines < 1080.5)) :
+            false;
+        return (sd_interlace || hd_interlace);
+    }
+    else
+    {
+        return false;
+    }
+}
+
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
diff --git a/crt/shaders/crt-royale/src/tex2Dantialias.h b/crt/shaders/crt-royale/src/tex2Dantialias.h
index af7dd59..93fe7d4 100644
--- a/crt/shaders/crt-royale/src/tex2Dantialias.h
+++ b/crt/shaders/crt-royale/src/tex2Dantialias.h
@@ -27,18 +27,18 @@
 //              1.) All requirements of gamma-management.h must be satisfied!
 //              2.) pixel_to_tex_uv must be a 2x2 matrix that transforms pixe-
 //                  space offsets to texture uv offsets.  You can get this with:
-//                      const vec2 duv_dx = ddx(tex_uv);
-//                      const vec2 duv_dy = ddy(tex_uv);
-//                      const mat2x2 pixel_to_tex_uv = mat2x2(
+//                      const float2 duv_dx = ddx(tex_uv);
+//                      const float2 duv_dy = ddy(tex_uv);
+//                      const float2x2 pixel_to_tex_uv = float2x2(
 //                          duv_dx.x, duv_dy.x,
 //                          duv_dx.y, duv_dy.y);
 //                  This is left to the user in case the current Cg profile
 //                  doesn't support ddx()/ddy().  Ideally, the user could find
 //                  calculate a distorted tangent-space mapping analytically.
 //                  If not, a simple flat mapping can be obtained with:
-//                      const vec2 xy_to_uv_scale = IN.output_size *
+//                      const float2 xy_to_uv_scale = IN.output_size *
 //                          IN.video_size/IN.texture_size;
-//                      const mat2x2 pixel_to_tex_uv = mat2x2(
+//                      const float2x2 pixel_to_tex_uv = float2x2(
 //                          xy_to_uv_scale.x, 0.0,
 //                          0.0, xy_to_uv_scale.y);
 //  Optional:   To set basic AA settings, #define ANTIALIAS_OVERRIDE_BASICS and:
@@ -70,8 +70,8 @@
 //                          0.5/aa_pixel_diameter;
 //              3.) Set subpixel offsets.  This requires an accessor function
 //                  for compatibility with scalar runtime shader params.  Return
-//                  a vec2 pixel offset in [-0.5, 0.5] for the red subpixel:
-//                      vec2 get_aa_subpixel_r_offset()
+//                  a float2 pixel offset in [-0.5, 0.5] for the red subpixel:
+//                      float2 get_aa_subpixel_r_offset()
 //              The user may also #define ANTIALIAS_OVERRIDE_STATIC_CONSTANTS to
 //              override (all of) the following default static values.  However,
 //              the file's structure requires them to be declared static const:
@@ -84,7 +84,7 @@
 //                  values; much larger gauss_sigmas ironically prefer slightly
 //                  smaller support given sparse sampling, and vice versa.)
 //              3.) static const float aa_tent_support = 1.0 / aa_pixel_diameter;
-//              4.) static const vec2 aa_xy_axis_importance:
+//              4.) static const float2 aa_xy_axis_importance:
 //                  The sparse N-queens sampling grid interacts poorly with
 //                  negative-lobed 2D filters.  However, if aliasing is much
 //                  stronger in one direction (e.g. horizontally with a phosphor
@@ -93,11 +93,11 @@
 //                  aa_xy_axis_importance down to a minimum of 0.5 (box support),
 //                  after which point only the offsets used for calculating
 //                  weights continue to scale downward.  This works as follows:
-//                  If aa_xy_axis_importance = vec2(1.0, 1.0/support_radius),
+//                  If aa_xy_axis_importance = float2(1.0, 1.0/support_radius),
 //                  the vertical support radius will drop to 1.0, and we'll just
 //                  filter vertical offsets with the first filter lobe, while
 //                  horizontal offsets go through the full multi-lobe filter.
-//                  If aa_xy_axis_importance = vec2(1.0, 0.0), the vertical
+//                  If aa_xy_axis_importance = float2(1.0, 0.0), the vertical
 //                  support radius will drop to box support, and the vertical
 //                  offsets will be ignored entirely (essentially giving us a
 //                  box filter vertically).  The former is potentially smoother
@@ -141,7 +141,7 @@
 //  2.) For decent results, negative-lobed filters must be computed based on
 //      separable weights, not radial distances, because the sparse sampling
 //      makes no guarantees about radial distributions.  Even then, it's much
-//      better to set aa_xy_axis_importance to e.g. vec2(1.0, 0.0) to use e.g.
+//      better to set aa_xy_axis_importance to e.g. float2(1.0, 0.0) to use e.g.
 //      Lanczos2 horizontally and a box filter vertically.  This is mainly due
 //      to the sparse N-queens sampling and a statistically enormous positive or
 //      negative covariance between horizontal and vertical weights.
@@ -154,32 +154,33 @@
 //  exploit temporal AA better, but it would require a dynamic branch or a lot
 //  of conditional moves, so it's prohibitively slow for the minor benefit.
 
+
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 
 #ifndef ANTIALIAS_OVERRIDE_BASICS
     //  The following settings must be static constants:
-    const float aa_level = 12.0;
-    const float aa_filter = 0.0;
-    const bool aa_temporal = false;
+    static const float aa_level = 12.0;
+    static const float aa_filter = 0.0;
+    static const bool aa_temporal = false;
 #endif
 
 #ifndef ANTIALIAS_OVERRIDE_STATIC_CONSTANTS
     //  Users may override these parameters, but the file structure requires
     //  them to be static constants; see the descriptions above.
-    const float aa_pixel_diameter = 1.0;
-    const float aa_lanczos_lobes = 3.0;
-    const float aa_gauss_support = 1.0 / aa_pixel_diameter;
-    const float aa_tent_support = 1.0 / aa_pixel_diameter;
+    static const float aa_pixel_diameter = 1.0;
+    static const float aa_lanczos_lobes = 3.0;
+    static const float aa_gauss_support = 1.0 / aa_pixel_diameter;
+    static const float aa_tent_support = 1.0 / aa_pixel_diameter;
     
     //  If we're using a negative-lobed filter, default to using it horizontally
     //  only, and use only the first lobe vertically or a box filter, over a
     //  correspondingly smaller range.  This compensates for the sparse sampling
     //  grid's typically large positive/negative x/y covariance.
-    vec2 aa_xy_axis_importance =
-        aa_filter < 5.5 ? vec2(1.0) :         //  Box, tent, Gaussian
-        aa_filter < 8.5 ? vec2(1.0, 0.0) :    //  Cubic and Lanczos sinc
-        aa_filter < 9.5 ? vec2(1.0, 1.0/aa_lanczos_lobes) :   //  Lanczos jinc
-        vec2(1.0);                            //  Default to box
+    static const float2 aa_xy_axis_importance =
+        aa_filter < 5.5 ? float2(1.0) :         //  Box, tent, Gaussian
+        aa_filter < 8.5 ? float2(1.0, 0.0) :    //  Cubic and Lanczos sinc
+        aa_filter < 9.5 ? float2(1.0, 1.0/aa_lanczos_lobes) :   //  Lanczos jinc
+        float2(1.0);                            //  Default to box
 #endif
 
 #ifndef ANTIALIAS_OVERRIDE_PARAMETERS
@@ -189,39 +190,40 @@
     //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
     //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
     //  4.) C = 0.0 is a soft spline filter.
-//    const float aa_cubic_c = 0.5;
-//    const float aa_gauss_sigma = 0.5 / aa_pixel_diameter;
+    static const float aa_cubic_c = 0.5;
+    static const float aa_gauss_sigma = 0.5 / aa_pixel_diameter;
     //  Users may override the subpixel offset accessor function with their own.
     //  A function is used for compatibility with scalar runtime shader params.
-    vec2 get_aa_subpixel_r_offset()
+    inline float2 get_aa_subpixel_r_offset()
     {
-        return vec2(0.0, 0.0);
+        return float2(0.0, 0.0);
     }
 #endif
 
+
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
-//#include "../../../../include/gamma-management.h"
-#include "gamma-management.h"
+#include "../../../../include/gamma-management.h"
+
 
 //////////////////////////////////  CONSTANTS  /////////////////////////////////
 
-const float aa_box_support = 0.5;
-const float aa_cubic_support = 2.0;
+static const float aa_box_support = 0.5;
+static const float aa_cubic_support = 2.0;
 
 
 ////////////////////////////  GLOBAL NON-CONSTANTS  ////////////////////////////
 
 //  We'll want to define these only once per fragment at most.
 #ifdef RUNTIME_ANTIALIAS_WEIGHTS
-     float aa_cubic_b;
-     float cubic_branch1_x3_coeff;
-     float cubic_branch1_x2_coeff;
-     float cubic_branch1_x0_coeff;
-     float cubic_branch2_x3_coeff;
-     float cubic_branch2_x2_coeff;
-     float cubic_branch2_x1_coeff;
-     float cubic_branch2_x0_coeff;
+    float aa_cubic_b;
+    float cubic_branch1_x3_coeff;
+    float cubic_branch1_x2_coeff;
+    float cubic_branch1_x0_coeff;
+    float cubic_branch2_x3_coeff;
+    float cubic_branch2_x2_coeff;
+    float cubic_branch2_x1_coeff;
+    float cubic_branch2_x0_coeff;
 #endif
 
 
@@ -235,38 +237,38 @@ void assign_aa_cubic_constants()
     #ifdef RUNTIME_ANTIALIAS_WEIGHTS
         if(aa_filter > 5.5 && aa_filter < 7.5)
         {
-            aa_cubic_b = 1.0 - 2.0*params.aa_cubic_c;
-            cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*params.aa_cubic_c;
-            cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*params.aa_cubic_c;
+            aa_cubic_b = 1.0 - 2.0*aa_cubic_c;
+            cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c;
+            cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c;
             cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b;
-            cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * params.aa_cubic_c;
-            cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*params.aa_cubic_c;
-            cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*params.aa_cubic_c;
-            cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*params.aa_cubic_c;
+            cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c;
+            cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c;
+            cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c;
+            cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c;
         }
     #endif
 }
 
-vec4 get_subpixel_support_diam_and_final_axis_importance()
+inline float4 get_subpixel_support_diam_and_final_axis_importance()
 {
     //  Statically select the base support radius:
-    float base_support_radius;	
-        if(aa_filter < 1.5) base_support_radius = aa_box_support;
-        else if(aa_filter < 3.5) base_support_radius = aa_tent_support;
-        else if(aa_filter < 5.5) base_support_radius = aa_gauss_support;
-        else if(aa_filter < 7.5) base_support_radius = aa_cubic_support;
-        else if(aa_filter < 9.5) base_support_radius = aa_lanczos_lobes;
-        else base_support_radius = aa_box_support; //  Default to box
+    static const float base_support_radius =
+        aa_filter < 1.5 ? aa_box_support :
+        aa_filter < 3.5 ? aa_tent_support :
+        aa_filter < 5.5 ? aa_gauss_support :
+        aa_filter < 7.5 ? aa_cubic_support :
+        aa_filter < 9.5 ? aa_lanczos_lobes :
+        aa_box_support; //  Default to box
     //  Expand the filter support for subpixel filtering.
-    const vec2 subpixel_support_radius_raw =
-        vec2(base_support_radius) + abs(get_aa_subpixel_r_offset());
+    const float2 subpixel_support_radius_raw =
+        float2(base_support_radius) + abs(get_aa_subpixel_r_offset());
     if(aa_filter < 1.5)
     {
         //  Ignore aa_xy_axis_importance for box filtering.
-        const vec2 subpixel_support_diam =
+        const float2 subpixel_support_diam =
             2.0 * subpixel_support_radius_raw;
-        const vec2 final_axis_importance = vec2(1.0);
-        return vec4(subpixel_support_diam, final_axis_importance);
+        const float2 final_axis_importance = float2(1.0);
+        return float4(subpixel_support_diam, final_axis_importance);
     }
     else
     {
@@ -274,55 +276,54 @@ vec4 get_subpixel_support_diam_and_final_axis_importance()
         //  it further than box support.  This allows decent vertical AA without
         //  messing up horizontal weights or using something silly like Lanczos4
         //  horizontally with a huge vertical average over an 8-pixel radius.
-        const vec2 subpixel_support_radius = max(vec2(aa_box_support),
+        const float2 subpixel_support_radius = max(float2(aa_box_support, aa_box_support),
             subpixel_support_radius_raw * aa_xy_axis_importance);
         //  Adjust aa_xy_axis_importance to compensate for what's already done:
-        const vec2 final_axis_importance = aa_xy_axis_importance *
+        const float2 final_axis_importance = aa_xy_axis_importance *
             subpixel_support_radius_raw/subpixel_support_radius;
-        const vec2 subpixel_support_diam = 2.0 * subpixel_support_radius;
-        return vec4(subpixel_support_diam, final_axis_importance);
+        const float2 subpixel_support_diam = 2.0 * subpixel_support_radius;
+        return float4(subpixel_support_diam, final_axis_importance);
     }
 }
 
+
 ///////////////////////////  FILTER WEIGHT FUNCTIONS  //////////////////////////
 
-float eval_box_filter(const float dist)
+inline float eval_box_filter(const float dist)
 {
-if(abs(dist) <= aa_box_support) return 1.0;//abs(dist);
-else return 0.0;
+    return float(abs(dist) <= aa_box_support);
 }
 
-float eval_separable_box_filter(const vec2 offset)
+inline float eval_separable_box_filter(const float2 offset)
 {
-	if(all(lessThanEqual(abs(offset) , vec2(aa_box_support)))) return 1.0;//float(abs(offset));
-	else return 0.0;
+    return float(all(bool2((abs(offset.x) <= aa_box_support), (abs(offset.y) <= aa_box_support))));
 }
 
-float eval_tent_filter(const float dist)
+inline float eval_tent_filter(const float dist)
 {
     return clamp((aa_tent_support - dist)/
         aa_tent_support, 0.0, 1.0);
 }
 
-float eval_gaussian_filter(const float dist)
+inline float eval_gaussian_filter(const float dist)
 {
-    return exp(-(dist*dist) / (2.0*params.aa_gauss_sigma*params.aa_gauss_sigma));
+    return exp(-(dist*dist) / (2.0*aa_gauss_sigma*aa_gauss_sigma));
 }
 
-float eval_cubic_filter(const float dist)
+inline float eval_cubic_filter(const float dist)
 {
     //  Compute coefficients like assign_aa_cubic_constants(), but statically.
     #ifndef RUNTIME_ANTIALIAS_WEIGHTS
         //  When runtime weights are used, these values are instead written to
         //  global uniforms at the beginning of each tex2Daa* call.
-        const float aa_cubic_b = 1.0 - 2.0*params.aa_cubic_c;
-        const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*params.aa_cubic_c;
-        const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*params.aa_cubic_c;
+        const float aa_cubic_b = 1.0 - 2.0*aa_cubic_c;
+        const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c;
+        const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c;
         const float cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b;
-        const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * params.aa_cubic_c;
-        const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*params.aa_cubic_c;
-        const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*params.aa_cubic_c;
-        const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*params.aa_cubic_c;
+        const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c;
+        const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c;
+        const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c;
+        const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c;
     #endif
     const float abs_dist = abs(dist);
     //  Compute the cubic based on the Horner's method formula in:
@@ -338,32 +339,32 @@ float eval_cubic_filter(const float dist)
             0.0)/6.0;
 }
 
-float eval_separable_cubic_filter(const vec2 offset)
+inline float eval_separable_cubic_filter(const float2 offset)
 {
-    //  This is faster than using a specific vec2 version:
+    //  This is faster than using a specific float2 version:
     return eval_cubic_filter(offset.x) *
         eval_cubic_filter(offset.y);
 }
 
-vec2 eval_sinc_filter(const vec2 offset)
+inline float2 eval_sinc_filter(const float2 offset)
 {
     //  It's faster to let the caller handle the zero case, or at least it
     //  was when I used macros and the shader preset took a full minute to load.
-    const vec2 pi_offset = pi * offset;
+    const float2 pi_offset = pi * offset;
     return sin(pi_offset)/pi_offset;
 }
 
-float eval_separable_lanczos_sinc_filter(const vec2 offset_unsafe)
+inline float eval_separable_lanczos_sinc_filter(const float2 offset_unsafe)
 {
     //  Note: For sparse sampling, you really need to pick an axis to use
-    //  Lanczos along (e.g. set aa_xy_axis_importance = vec2(1.0, 0.0)).
-    const vec2 offset = FIX_ZERO(offset_unsafe);
-    const vec2 xy_weights = eval_sinc_filter(offset) *
+    //  Lanczos along (e.g. set aa_xy_axis_importance = float2(1.0, 0.0)).
+    const float2 offset = FIX_ZERO(offset_unsafe);
+    const float2 xy_weights = eval_sinc_filter(offset) *
         eval_sinc_filter(offset/aa_lanczos_lobes);
     return xy_weights.x * xy_weights.y;
 }
 
-float eval_jinc_filter_unorm(const float x)
+inline float eval_jinc_filter_unorm(const float x)
 {
     //  This is a Jinc approximation for x in [0, 45).  We'll use x in range
     //  [0, 4*pi) or so.  There are faster/closer approximations based on
@@ -383,19 +384,19 @@ float eval_jinc_filter_unorm(const float x)
         0.180837503591406);
 }
 
-float eval_jinc_filter(const float dist)
+inline float eval_jinc_filter(const float dist)
 {
     return eval_jinc_filter_unorm(pi * dist);
 }
 
-float eval_lanczos_jinc_filter(const float dist)
+inline float eval_lanczos_jinc_filter(const float dist)
 {
     return eval_jinc_filter(dist) * eval_jinc_filter(dist/aa_lanczos_lobes);
 }
 
 
-vec3 eval_unorm_rgb_weights(const vec2 offset,
-    const vec2 final_axis_importance)
+inline float3 eval_unorm_rgb_weights(const float2 offset,
+    const float2 final_axis_importance)
 {
     //  Requires:   1.) final_axis_impportance must be computed according to
     //                  get_subpixel_support_diam_and_final_axis_importance().
@@ -407,135 +408,135 @@ vec3 eval_unorm_rgb_weights(const vec2 offset,
     //                      subpixel_support_diameter.y/2])
     //  Returns:    Sample weights at R/G/B destination subpixels for the
     //              given xy pixel offset.
-    const vec2 offset_g = offset * final_axis_importance;
-    const vec2 aa_r_offset = get_aa_subpixel_r_offset();
-    const vec2 offset_r = offset_g - aa_r_offset * final_axis_importance;
-    const vec2 offset_b = offset_g + aa_r_offset * final_axis_importance;
+    const float2 offset_g = offset * final_axis_importance;
+    const float2 aa_r_offset = get_aa_subpixel_r_offset();
+    const float2 offset_r = offset_g - aa_r_offset * final_axis_importance;
+    const float2 offset_b = offset_g + aa_r_offset * final_axis_importance;
     //  Statically select a filter:
     if(aa_filter < 0.5)
     {
-        return vec3(eval_separable_box_filter(offset_r),
+        return float3(eval_separable_box_filter(offset_r),
             eval_separable_box_filter(offset_g),
             eval_separable_box_filter(offset_b));
     }
     else if(aa_filter < 1.5)
     {
-        return vec3(eval_box_filter(length(offset_r)),
+        return float3(eval_box_filter(length(offset_r)),
             eval_box_filter(length(offset_g)),
             eval_box_filter(length(offset_b)));
     }
     else if(aa_filter < 2.5)
     {
-        return vec3(
+        return float3(
             eval_tent_filter(offset_r.x) * eval_tent_filter(offset_r.y),
             eval_tent_filter(offset_g.x) * eval_tent_filter(offset_g.y),
             eval_tent_filter(offset_b.x) * eval_tent_filter(offset_b.y));
     }
     else if(aa_filter < 3.5)
     {
-        return vec3(eval_tent_filter(length(offset_r)),
+        return float3(eval_tent_filter(length(offset_r)),
             eval_tent_filter(length(offset_g)),
             eval_tent_filter(length(offset_b)));
     }
     else if(aa_filter < 4.5)
     {
-        return vec3(
+        return float3(
             eval_gaussian_filter(offset_r.x) * eval_gaussian_filter(offset_r.y),
             eval_gaussian_filter(offset_g.x) * eval_gaussian_filter(offset_g.y),
             eval_gaussian_filter(offset_b.x) * eval_gaussian_filter(offset_b.y));
     }
     else if(aa_filter < 5.5)
     {
-        return vec3(eval_gaussian_filter(length(offset_r)),
+        return float3(eval_gaussian_filter(length(offset_r)),
             eval_gaussian_filter(length(offset_g)),
             eval_gaussian_filter(length(offset_b)));
     }
     else if(aa_filter < 6.5)
     {
-        return vec3(
+        return float3(
             eval_cubic_filter(offset_r.x) * eval_cubic_filter(offset_r.y),
             eval_cubic_filter(offset_g.x) * eval_cubic_filter(offset_g.y),
             eval_cubic_filter(offset_b.x) * eval_cubic_filter(offset_b.y));
     }
     else if(aa_filter < 7.5)
     {
-        return vec3(eval_cubic_filter(length(offset_r)),
+        return float3(eval_cubic_filter(length(offset_r)),
             eval_cubic_filter(length(offset_g)),
             eval_cubic_filter(length(offset_b)));
     }
     else if(aa_filter < 8.5)
     {
-        return vec3(eval_separable_lanczos_sinc_filter(offset_r),
+        return float3(eval_separable_lanczos_sinc_filter(offset_r),
             eval_separable_lanczos_sinc_filter(offset_g),
             eval_separable_lanczos_sinc_filter(offset_b));
     }
     else if(aa_filter < 9.5)
     {
-        return vec3(eval_lanczos_jinc_filter(length(offset_r)),
+        return float3(eval_lanczos_jinc_filter(length(offset_r)),
             eval_lanczos_jinc_filter(length(offset_g)),
             eval_lanczos_jinc_filter(length(offset_b)));
     }
     else
     {
         //  Default to a box, because Lanczos Jinc is so bad. ;)
-        return vec3(eval_separable_box_filter(offset_r),
+        return float3(eval_separable_box_filter(offset_r),
             eval_separable_box_filter(offset_g),
             eval_separable_box_filter(offset_b));
     }
 }
 
+
 //////////////////////////////  HELPER FUNCTIONS  //////////////////////////////
 
-vec4 tex2Daa_tiled_linearize(const sampler2D samp, const vec2 s)
+inline float4 tex2Daa_tiled_linearize(const sampler2D samp, const float2 s)
 {
     //  If we're manually tiling a texture, anisotropic filtering can get
     //  confused.  This is one workaround:
     #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
         //  TODO: Use tex2Dlod_linearize with a calculated mip level.
-        return tex2Dlod_linearize(samp, vec4(s, 0.0, 0.0));
+        return tex2Dlod_linearize(samp, float4(s, 0.0, 0.0));
     #else
         return tex2D_linearize(samp, s);
     #endif
 }
 
-vec2 get_frame_sign(const float frame)
+inline float2 get_frame_sign(const float frame)
 {
-    if(aa_temporal == true)
+    if(aa_temporal)
     {
         //  Mirror the sampling pattern for odd frames in a direction that
         //  lets us keep the same subpixel sample weights:
-        float frame_odd = float(mod(frame, 2.0) > 0.5);
-        const vec2 aa_r_offset = get_aa_subpixel_r_offset();
-        vec2 mirror = vec2(FIX_ZERO(0.0));
-		if ( abs(aa_r_offset.x) < FIX_ZERO(0.0)) mirror.x = abs(aa_r_offset.x);
-		if ( abs(aa_r_offset.y) < FIX_ZERO(0.0)) mirror.y = abs(aa_r_offset.y);
-        return vec2(-1.0) * mirror;
+        const float frame_odd = float(fmod(frame, 2.0) > 0.5);
+        const float2 aa_r_offset = get_aa_subpixel_r_offset();
+        const float2 mirror = -float2(abs(aa_r_offset.x) < (FIX_ZERO(0.0)), abs(aa_r_offset.y) < (FIX_ZERO(0.0)));
+        return mirror;
     }
     else
     {
-        return vec2(1.0);
+        return float2(1.0, 1.0);
     }
 }
 
+
 /////////////////////////  ANTIALIASED TEXTURE LOOKUPS  ////////////////////////
 
-vec3 tex2Daa_subpixel_weights_only(const sampler2D tex,
-    const vec2 tex_uv, const mat2x2 pixel_to_tex_uv)
+float3 tex2Daa_subpixel_weights_only(const sampler2D tex,
+    const float2 tex_uv, const float2x2 pixel_to_tex_uv)
 {
     //  This function is unlike the others: Just perform a single independent
     //  lookup for each subpixel.  It may be very aliased.
-    const vec2 aa_r_offset = get_aa_subpixel_r_offset();
-    const vec2 aa_r_offset_uv_offset = (aa_r_offset * pixel_to_tex_uv);
+    const float2 aa_r_offset = get_aa_subpixel_r_offset();
+    const float2 aa_r_offset_uv_offset = mul(pixel_to_tex_uv, aa_r_offset);
     const float color_g = tex2D_linearize(tex, tex_uv).g;
     const float color_r = tex2D_linearize(tex, tex_uv + aa_r_offset_uv_offset).r;
     const float color_b = tex2D_linearize(tex, tex_uv - aa_r_offset_uv_offset).b;
-    return vec3(color_r, color_g, color_b);
+    return float3(color_r, color_g, color_b);
 }
 
 //  The tex2Daa* functions compile very slowly due to all the macros and
 //  compile-time math, so only include the ones we'll actually use!
-vec3 tex2Daa4x(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa4x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Use an RGMS4 pattern (4-queens):
     //  . . Q .  : off =(-1.5, -1.5)/4 + (2.0, 0.0)/4
@@ -543,45 +544,45 @@ vec3 tex2Daa4x(const sampler2D tex, const vec2 tex_uv,
     //  . . . Q  : off =(-1.5, -1.5)/4 + (3.0, 2.0)/4
     //  . Q . .  : off =(-1.5, -1.5)/4 + (1.0, 3.0)/4
     //  Static screenspace sample offsets (compute some implicitly):
-    const float grid_size = 4.0;
+    static const float grid_size = 4.0;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
-    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
-    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0,1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5,0.5 - grid_size*0.5) * xy_step;
     //  Get the xy offset of each sample.  Exploit diagonal symmetry:
-    const vec2 xy_offset0 = xy_start_offset + vec2(2.0, 0.0) * xy_step;
-    const vec2 xy_offset1 = xy_start_offset + vec2(0.0, 1.0) * xy_step;
+    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(0.0, 1.0) * xy_step;
     //  Compute subpixel weights, and exploit diagonal symmetry for speed.
-    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
-    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
-    const vec3 w2 = w1.bgr;
-    const vec3 w3 = w0.bgr;
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = w1.bgr;
+    const float3 w3 = w0.bgr;
     //  Get the weight sum to normalize the total to 1.0 later:
-    const vec3 half_sum = w0 + w1;
-    const vec3 w_sum = half_sum + half_sum.bgr;
-    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    const float3 half_sum = w0 + w1;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0,1.0,1.0)/(w_sum);
     //  Scale the pixel-space to texture offset matrix by the pixel diameter.
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
     //  Get uv sample offsets, mirror on odd frames if directed, and exploit
     //  diagonal symmetry:
-    const vec2 frame_sign = get_frame_sign(frame);
-    const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv);
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
     //  Load samples, linearizing if necessary, etc.:
-    const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
-    const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
-    const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
-    const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
     //  Sum weighted samples (weight sum must equal 1.0 for each channel):
     return w_sum_inv * (w0 * sample0 + w1 * sample1 +
         w2 * sample2 + w3 * sample3);
 }
 
-vec3 tex2Daa5x(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa5x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Use a diagonally symmetric 5-queens pattern:
     //  . Q . . .  : off =(-2.0, -2.0)/5 + (1.0, 0.0)/5
@@ -590,46 +591,46 @@ vec3 tex2Daa5x(const sampler2D tex, const vec2 tex_uv,
     //  Q . . . .  : off =(-2.0, -2.0)/5 + (0.0, 3.0)/5
     //  . . . Q .  : off =(-2.0, -2.0)/5 + (3.0, 4.0)/5
     //  Static screenspace sample offsets (compute some implicitly):
-    const float grid_size = 5.0;
+    static const float grid_size = 5.0;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
-    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
-    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
     //  Get the xy offset of each sample.  Exploit diagonal symmetry:
-    const vec2 xy_offset0 = xy_start_offset + vec2(1.0, 0.0) * xy_step;
-    const vec2 xy_offset1 = xy_start_offset + vec2(4.0, 1.0) * xy_step;
-    const vec2 xy_offset2 = xy_start_offset + vec2(2.0, 2.0) * xy_step;
+    const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(2.0, 2.0) * xy_step;
     //  Compute subpixel weights, and exploit diagonal symmetry for speed.
-    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
-    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
-    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
-    const vec3 w3 = w1.bgr;
-    const vec3 w4 = w0.bgr;
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = w1.bgr;
+    const float3 w4 = w0.bgr;
     //  Get the weight sum to normalize the total to 1.0 later:
-    const vec3 w_sum_inv = vec3(1.0)/(w0 + w1 + w2 + w3 + w4);
+    const float3 w_sum_inv = float3(1.0)/(w0 + w1 + w2 + w3 + w4);
     //  Scale the pixel-space to texture offset matrix by the pixel diameter.
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
     //  Get uv sample offsets, mirror on odd frames if directed, and exploit
     //  diagonal symmetry:
-    const vec2 frame_sign = get_frame_sign(frame);
-    const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv);
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
     //  Load samples, linearizing if necessary, etc.:
-    const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
-    const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
-    const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv).rgb;
-    const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
-    const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
     //  Sum weighted samples (weight sum must equal 1.0 for each channel):
     return w_sum_inv * (w0 * sample0 + w1 * sample1 +
         w2 * sample2 + w3 * sample3 + w4 * sample4);
 }
 
-vec3 tex2Daa6x(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa6x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Use a diagonally symmetric 6-queens pattern with a stronger horizontal
     //  than vertical slant:
@@ -640,51 +641,51 @@ vec3 tex2Daa6x(const sampler2D tex, const vec2 tex_uv,
     //  . . . Q . .  : off =(-2.5, -2.5)/6 + (3.0, 4.0)/6
     //  . Q . . . .  : off =(-2.5, -2.5)/6 + (1.0, 5.0)/6
     //  Static screenspace sample offsets (compute some implicitly):
-    const float grid_size = 6.0;
+    static const float grid_size = 6.0;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
-    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
-    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
     //  Get the xy offset of each sample.  Exploit diagonal symmetry:
-    const vec2 xy_offset0 = xy_start_offset + vec2(4.0, 0.0) * xy_step;
-    const vec2 xy_offset1 = xy_start_offset + vec2(2.0, 1.0) * xy_step;
-    const vec2 xy_offset2 = xy_start_offset + vec2(0.0, 2.0) * xy_step;
+    const float2 xy_offset0 = xy_start_offset + float2(4.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(2.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step;
     //  Compute subpixel weights, and exploit diagonal symmetry for speed.
-    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
-    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
-    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
-    const vec3 w3 = w2.bgr;
-    const vec3 w4 = w1.bgr;
-    const vec3 w5 = w0.bgr;
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = w2.bgr;
+    const float3 w4 = w1.bgr;
+    const float3 w5 = w0.bgr;
     //  Get the weight sum to normalize the total to 1.0 later:
-    const vec3 half_sum = w0 + w1 + w2;
-    const vec3 w_sum = half_sum + half_sum.bgr;
-    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    const float3 half_sum = w0 + w1 + w2;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
     //  Scale the pixel-space to texture offset matrix by the pixel diameter.
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
     //  Get uv sample offsets, mirror on odd frames if directed, and exploit
     //  diagonal symmetry:
-    const vec2 frame_sign = get_frame_sign(frame);
-    const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv);
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
     //  Load samples, linearizing if necessary, etc.:
-    const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
-    const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
-    const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
-    const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
-    const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
-    const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
     //  Sum weighted samples (weight sum must equal 1.0 for each channel):
     return w_sum_inv * (w0 * sample0 + w1 * sample1 + w2 * sample2 +
         w3 * sample3 + w4 * sample4 + w5 * sample5);
 }
 
-vec3 tex2Daa7x(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa7x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Use a diagonally symmetric 7-queens pattern with a queen in the center:
     //  . Q . . . . .  : off =(-3.0, -3.0)/7 + (1.0, 0.0)/7
@@ -694,55 +695,55 @@ vec3 tex2Daa7x(const sampler2D tex, const vec2 tex_uv,
     //  . . . . . . Q  : off =(-3.0, -3.0)/7 + (6.0, 4.0)/7
     //  . . Q . . . .  : off =(-3.0, -3.0)/7 + (2.0, 5.0)/7
     //  . . . . . Q .  : off =(-3.0, -3.0)/7 + (5.0, 6.0)/7
-    const float grid_size = 7.0;
+    static const float grid_size = 7.0;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
-    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
-    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
     //  Get the xy offset of each sample.  Exploit diagonal symmetry:
-    const vec2 xy_offset0 = xy_start_offset + vec2(1.0, 0.0) * xy_step;
-    const vec2 xy_offset1 = xy_start_offset + vec2(4.0, 1.0) * xy_step;
-    const vec2 xy_offset2 = xy_start_offset + vec2(0.0, 2.0) * xy_step;
-    const vec2 xy_offset3 = xy_start_offset + vec2(3.0, 3.0) * xy_step;
+    const float2 xy_offset0 = xy_start_offset + float2(1.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(0.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(3.0, 3.0) * xy_step;
     //  Compute subpixel weights, and exploit diagonal symmetry for speed.
-    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
-    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
-    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
-    const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
-    const vec3 w4 = w2.bgr;
-    const vec3 w5 = w1.bgr;
-    const vec3 w6 = w0.bgr;
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = w2.bgr;
+    const float3 w5 = w1.bgr;
+    const float3 w6 = w0.bgr;
     //  Get the weight sum to normalize the total to 1.0 later:
-    const vec3 half_sum = w0 + w1 + w2;
-    const vec3 w_sum = half_sum + half_sum.bgr + w3;
-    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    const float3 half_sum = w0 + w1 + w2;
+    const float3 w_sum = half_sum + half_sum.bgr + w3;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
     //  Scale the pixel-space to texture offset matrix by the pixel diameter.
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
     //  Get uv sample offsets, mirror on odd frames if directed, and exploit
     //  diagonal symmetry:
-    const vec2 frame_sign = get_frame_sign(frame);
-    const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv);
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
     //  Load samples, linearizing if necessary, etc.:
-    const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
-    const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
-    const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
-    const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv).rgb;
-    const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
-    const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
-    const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
     //  Sum weighted samples (weight sum must equal 1.0 for each channel):
     return w_sum_inv * (
         w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
         w4 * sample4 + w5 * sample5 + w6 * sample6);
 }
 
-vec3 tex2Daa8x(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa8x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Use a diagonally symmetric 8-queens pattern.
     //  . . Q . . . . .  : off =(-3.5, -3.5)/8 + (2.0, 0.0)/8
@@ -753,57 +754,57 @@ vec3 tex2Daa8x(const sampler2D tex, const vec2 tex_uv,
     //  . . . . . . Q .  : off =(-3.5, -3.5)/8 + (6.0, 5.0)/8
     //  . . . Q . . . .  : off =(-3.5, -3.5)/8 + (3.0, 6.0)/8
     //  . . . . . Q . .  : off =(-3.5, -3.5)/8 + (5.0, 7.0)/8
-    const float grid_size = 8.0;
+    static const float grid_size = 8.0;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
-    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
-    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
     //  Get the xy offset of each sample.  Exploit diagonal symmetry:
-    const vec2 xy_offset0 = xy_start_offset + vec2(2.0, 0.0) * xy_step;
-    const vec2 xy_offset1 = xy_start_offset + vec2(4.0, 1.0) * xy_step;
-    const vec2 xy_offset2 = xy_start_offset + vec2(1.0, 2.0) * xy_step;
-    const vec2 xy_offset3 = xy_start_offset + vec2(7.0, 3.0) * xy_step;
+    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(4.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(1.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(7.0, 3.0) * xy_step;
     //  Compute subpixel weights, and exploit diagonal symmetry for speed.
-    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
-    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
-    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
-    const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
-    const vec3 w4 = w3.bgr;
-    const vec3 w5 = w2.bgr;
-    const vec3 w6 = w1.bgr;
-    const vec3 w7 = w0.bgr;
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = w3.bgr;
+    const float3 w5 = w2.bgr;
+    const float3 w6 = w1.bgr;
+    const float3 w7 = w0.bgr;
     //  Get the weight sum to normalize the total to 1.0 later:
-    const vec3 half_sum = w0 + w1 + w2 + w3;
-    const vec3 w_sum = half_sum + half_sum.bgr;
-    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    const float3 half_sum = w0 + w1 + w2 + w3;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
     //  Scale the pixel-space to texture offset matrix by the pixel diameter.
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
     //  Get uv sample offsets, and mirror on odd frames if directed:
-    const vec2 frame_sign = get_frame_sign(frame);
-    const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset3 = (xy_offset3 * frame_sign * true_pixel_to_tex_uv);
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
     //  Load samples, linearizing if necessary, etc.:
-    const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
-    const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
-    const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
-    const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
-    const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
-    const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
-    const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
-    const vec3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
     //  Sum weighted samples (weight sum must equal 1.0 for each channel):
     return w_sum_inv * (
         w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
         w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7);
 }
 
-vec3 tex2Daa12x(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa12x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Use a diagonally symmetric 12-superqueens pattern where no 3 points are
     //  exactly collinear.
@@ -819,62 +820,62 @@ vec3 tex2Daa12x(const sampler2D tex, const vec2 tex_uv,
     //  . . . . . Q . . . . . .  : off =(-5.5, -5.5)/12 + (5.0, 9.0)/12
     //  . . Q . . . . . . . . .  : off =(-5.5, -5.5)/12 + (2.0, 10.0)/12
     //  . . . . . . . . Q . . .  : off =(-5.5, -5.5)/12 + (8.0, 11.0)/12
-    const float grid_size = 12.0;
+    static const float grid_size = 12.0;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
-    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
-    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
     //  Get the xy offset of each sample.  Exploit diagonal symmetry:
-    const vec2 xy_offset0 = xy_start_offset + vec2(3.0, 0.0) * xy_step;
-    const vec2 xy_offset1 = xy_start_offset + vec2(9.0, 1.0) * xy_step;
-    const vec2 xy_offset2 = xy_start_offset + vec2(6.0, 2.0) * xy_step;
-    const vec2 xy_offset3 = xy_start_offset + vec2(1.0, 3.0) * xy_step;
-    const vec2 xy_offset4 = xy_start_offset + vec2(11.0, 4.0) * xy_step;
-    const vec2 xy_offset5 = xy_start_offset + vec2(4.0, 5.0) * xy_step;
+    const float2 xy_offset0 = xy_start_offset + float2(3.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(6.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(11.0, 4.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(4.0, 5.0) * xy_step;
     //  Compute subpixel weights, and exploit diagonal symmetry for speed.
-    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
-    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
-    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
-    const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
-    const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
-    const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
-    const vec3 w6 = w5.bgr;
-    const vec3 w7 = w4.bgr;
-    const vec3 w8 = w3.bgr;
-    const vec3 w9 = w2.bgr;
-    const vec3 w10 = w1.bgr;
-    const vec3 w11 = w0.bgr;
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = w5.bgr;
+    const float3 w7 = w4.bgr;
+    const float3 w8 = w3.bgr;
+    const float3 w9 = w2.bgr;
+    const float3 w10 = w1.bgr;
+    const float3 w11 = w0.bgr;
     //  Get the weight sum to normalize the total to 1.0 later:
-    const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5;
-    const vec3 w_sum = half_sum + half_sum.bgr;
-    const vec3 w_sum_inv = vec3(1.0)/w_sum;
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/w_sum;
     //  Scale the pixel-space to texture offset matrix by the pixel diameter.
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
     //  Get uv sample offsets, mirror on odd frames if directed, and exploit
     //  diagonal symmetry:
-    const vec2 frame_sign = get_frame_sign(frame);
-    const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset3 = (xy_offset3 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset4 = (xy_offset4 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset5 = (xy_offset5 * frame_sign * true_pixel_to_tex_uv);
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
+    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
     //  Load samples, linearizing if necessary, etc.:
-    const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
-    const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
-    const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
-    const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
-    const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
-    const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
-    const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
-    const vec3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
-    const vec3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
-    const vec3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
-    const vec3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
-    const vec3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
     //  Sum weighted samples (weight sum must equal 1.0 for each channel):
     return w_sum_inv * (
         w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
@@ -882,8 +883,8 @@ vec3 tex2Daa12x(const sampler2D tex, const vec2 tex_uv,
         w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11);
 }
 
-vec3 tex2Daa16x(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa16x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Use a diagonally symmetric 16-superqueens pattern where no 3 points are
     //  exactly collinear.
@@ -903,74 +904,74 @@ vec3 tex2Daa16x(const sampler2D tex, const vec2 tex_uv,
     //  . . . Q . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (3.0, 13.0)/16
     //  . . . . . . Q . . . . . . . . .  : off =(-7.5, -7.5)/16 + (6.0, 14.0)/16
     //  . . . . . . . . . . . . . Q . .  : off =(-7.5, -7.5)/16 + (13.0, 15.0)/16
-    const float grid_size = 16.0;
+    static const float grid_size = 16.0;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
-    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
-    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
     //  Get the xy offset of each sample.  Exploit diagonal symmetry:
-    const vec2 xy_offset0 = xy_start_offset + vec2(2.0, 0.0) * xy_step;
-    const vec2 xy_offset1 = xy_start_offset + vec2(9.0, 1.0) * xy_step;
-    const vec2 xy_offset2 = xy_start_offset + vec2(12.0, 2.0) * xy_step;
-    const vec2 xy_offset3 = xy_start_offset + vec2(4.0, 3.0) * xy_step;
-    const vec2 xy_offset4 = xy_start_offset + vec2(8.0, 4.0) * xy_step;
-    const vec2 xy_offset5 = xy_start_offset + vec2(14.0, 5.0) * xy_step;
-    const vec2 xy_offset6 = xy_start_offset + vec2(0.0, 6.0) * xy_step;
-    const vec2 xy_offset7 = xy_start_offset + vec2(10.0, 7.0) * xy_step;
+    const float2 xy_offset0 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(9.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(12.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(4.0, 3.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(8.0, 4.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(14.0, 5.0) * xy_step;
+    const float2 xy_offset6 = xy_start_offset + float2(0.0, 6.0) * xy_step;
+    const float2 xy_offset7 = xy_start_offset + float2(10.0, 7.0) * xy_step;
     //  Compute subpixel weights, and exploit diagonal symmetry for speed.
-    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
-    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
-    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
-    const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
-    const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
-    const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
-    const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
-    const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
-    const vec3 w8 = w7.bgr;
-    const vec3 w9 = w6.bgr;
-    const vec3 w10 = w5.bgr;
-    const vec3 w11 = w4.bgr;
-    const vec3 w12 = w3.bgr;
-    const vec3 w13 = w2.bgr;
-    const vec3 w14 = w1.bgr;
-    const vec3 w15 = w0.bgr;
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const float3 w8 = w7.bgr;
+    const float3 w9 = w6.bgr;
+    const float3 w10 = w5.bgr;
+    const float3 w11 = w4.bgr;
+    const float3 w12 = w3.bgr;
+    const float3 w13 = w2.bgr;
+    const float3 w14 = w1.bgr;
+    const float3 w15 = w0.bgr;
     //  Get the weight sum to normalize the total to 1.0 later:
-    const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7;
-    const vec3 w_sum = half_sum + half_sum.bgr;
-    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
     //  Scale the pixel-space to texture offset matrix by the pixel diameter.
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
     //  Get uv sample offsets, mirror on odd frames if directed, and exploit
     //  diagonal symmetry:
-    const vec2 frame_sign = get_frame_sign(frame);
-    const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset3 = (xy_offset3 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset4 = (xy_offset4 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset5 = (xy_offset5 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset6 = (xy_offset6 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset7 = (xy_offset7 * frame_sign * true_pixel_to_tex_uv);
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
+    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
+    const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
+    const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
     //  Load samples, linearizing if necessary, etc.:
-    const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
-    const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
-    const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
-    const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
-    const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
-    const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
-    const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
-    const vec3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
-    const vec3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
-    const vec3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
-    const vec3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
-    const vec3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
-    const vec3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
-    const vec3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
-    const vec3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
-    const vec3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
+    const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
     //  Sum weighted samples (weight sum must equal 1.0 for each channel):
     return w_sum_inv * (
         w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
@@ -979,8 +980,8 @@ vec3 tex2Daa16x(const sampler2D tex, const vec2 tex_uv,
         w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15);
 }
 
-vec3 tex2Daa20x(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa20x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Use a diagonally symmetric 20-superqueens pattern where no 3 points are
     //  exactly collinear and superqueens have a squared attack radius of 13.
@@ -1004,86 +1005,86 @@ vec3 tex2Daa20x(const sampler2D tex, const vec2 tex_uv,
     //  . . . . . . . . Q . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (8.0, 17.0)/20
     //  . . . Q . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (3.0, 18.0)/20
     //  . . . . . . . . . . . . Q . . . . . . .  : off =(-9.5, -9.5)/20 + (12.0, 19.0)/20
-    const float grid_size = 20.0;
+    static const float grid_size = 20.0;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
-    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
-    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
     //  Get the xy offset of each sample.  Exploit diagonal symmetry:
-    const vec2 xy_offset0 = xy_start_offset + vec2(7.0, 0.0) * xy_step;
-    const vec2 xy_offset1 = xy_start_offset + vec2(16.0, 1.0) * xy_step;
-    const vec2 xy_offset2 = xy_start_offset + vec2(11.0, 2.0) * xy_step;
-    const vec2 xy_offset3 = xy_start_offset + vec2(1.0, 3.0) * xy_step;
-    const vec2 xy_offset4 = xy_start_offset + vec2(5.0, 4.0) * xy_step;
-    const vec2 xy_offset5 = xy_start_offset + vec2(15.0, 5.0) * xy_step;
-    const vec2 xy_offset6 = xy_start_offset + vec2(10.0, 6.0) * xy_step;
-    const vec2 xy_offset7 = xy_start_offset + vec2(19.0, 7.0) * xy_step;
-    const vec2 xy_offset8 = xy_start_offset + vec2(2.0, 8.0) * xy_step;
-    const vec2 xy_offset9 = xy_start_offset + vec2(6.0, 9.0) * xy_step;
+    const float2 xy_offset0 = xy_start_offset + float2(7.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(11.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(1.0, 3.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step;
+    const float2 xy_offset6 = xy_start_offset + float2(10.0, 6.0) * xy_step;
+    const float2 xy_offset7 = xy_start_offset + float2(19.0, 7.0) * xy_step;
+    const float2 xy_offset8 = xy_start_offset + float2(2.0, 8.0) * xy_step;
+    const float2 xy_offset9 = xy_start_offset + float2(6.0, 9.0) * xy_step;
     //  Compute subpixel weights, and exploit diagonal symmetry for speed.
-    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
-    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
-    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
-    const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
-    const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
-    const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
-    const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
-    const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
-    const vec3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance);
-    const vec3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance);
-    const vec3 w10 = w9.bgr;
-    const vec3 w11 = w8.bgr;
-    const vec3 w12 = w7.bgr;
-    const vec3 w13 = w6.bgr;
-    const vec3 w14 = w5.bgr;
-    const vec3 w15 = w4.bgr;
-    const vec3 w16 = w3.bgr;
-    const vec3 w17 = w2.bgr;
-    const vec3 w18 = w1.bgr;
-    const vec3 w19 = w0.bgr;
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance);
+    const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance);
+    const float3 w10 = w9.bgr;
+    const float3 w11 = w8.bgr;
+    const float3 w12 = w7.bgr;
+    const float3 w13 = w6.bgr;
+    const float3 w14 = w5.bgr;
+    const float3 w15 = w4.bgr;
+    const float3 w16 = w3.bgr;
+    const float3 w17 = w2.bgr;
+    const float3 w18 = w1.bgr;
+    const float3 w19 = w0.bgr;
     //  Get the weight sum to normalize the total to 1.0 later:
-    const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9;
-    const vec3 w_sum = half_sum + half_sum.bgr;
-    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
     //  Scale the pixel-space to texture offset matrix by the pixel diameter.
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
     //  Get uv sample offsets, mirror on odd frames if directed, and exploit
     //  diagonal symmetry:
-    const vec2 frame_sign = get_frame_sign(frame);
-    const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset3 = (xy_offset3 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset4 = (xy_offset4 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset5 = (xy_offset5 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset6 = (xy_offset6 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset7 = (xy_offset7 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset8 = (xy_offset8 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset9 = (xy_offset9 * frame_sign * true_pixel_to_tex_uv);
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
+    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
+    const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
+    const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
+    const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign);
+    const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign);
     //  Load samples, linearizing if necessary, etc.:
-    const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
-    const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
-    const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
-    const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
-    const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
-    const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
-    const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
-    const vec3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
-    const vec3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb;
-    const vec3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb;
-    const vec3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb;
-    const vec3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb;
-    const vec3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
-    const vec3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
-    const vec3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
-    const vec3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
-    const vec3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
-    const vec3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
-    const vec3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
-    const vec3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb;
+    const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
+    const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
+    const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
+    const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
+    const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
     //  Sum weighted samples (weight sum must equal 1.0 for each channel):
     return w_sum_inv * (
         w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
@@ -1093,8 +1094,8 @@ vec3 tex2Daa20x(const sampler2D tex, const vec2 tex_uv,
         w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19);
 }
 
-vec3 tex2Daa24x(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa24x(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Use a diagonally symmetric 24-superqueens pattern where no 3 points are
     //  exactly collinear and superqueens have a squared attack radius of 13.
@@ -1122,99 +1123,99 @@ vec3 tex2Daa24x(const sampler2D tex, const vec2 tex_uv,
     //  . . . . . . . . . . . . . Q . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (13.0, 21.0)/24
     //  . . . . . . . Q . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (7.0, 22.0)/24
     //  . . . . . . . . . . . . . . . . . Q . . . . . .  : off =(-11.5, -11.5)/24 + (17.0, 23.0)/24
-    const float grid_size = 24.0;
+    static const float grid_size = 24.0;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
-    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
-    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
     //  Get the xy offset of each sample.  Exploit diagonal symmetry:
-    const vec2 xy_offset0 = xy_start_offset + vec2(6.0, 0.0) * xy_step;
-    const vec2 xy_offset1 = xy_start_offset + vec2(16.0, 1.0) * xy_step;
-    const vec2 xy_offset2 = xy_start_offset + vec2(10.0, 2.0) * xy_step;
-    const vec2 xy_offset3 = xy_start_offset + vec2(21.0, 3.0) * xy_step;
-    const vec2 xy_offset4 = xy_start_offset + vec2(5.0, 4.0) * xy_step;
-    const vec2 xy_offset5 = xy_start_offset + vec2(15.0, 5.0) * xy_step;
-    const vec2 xy_offset6 = xy_start_offset + vec2(1.0, 6.0) * xy_step;
-    const vec2 xy_offset7 = xy_start_offset + vec2(11.0, 7.0) * xy_step;
-    const vec2 xy_offset8 = xy_start_offset + vec2(19.0, 8.0) * xy_step;
-    const vec2 xy_offset9 = xy_start_offset + vec2(23.0, 9.0) * xy_step;
-    const vec2 xy_offset10 = xy_start_offset + vec2(3.0, 10.0) * xy_step;
-    const vec2 xy_offset11 = xy_start_offset + vec2(14.0, 11.0) * xy_step;
+    const float2 xy_offset0 = xy_start_offset + float2(6.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(16.0, 1.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(10.0, 2.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(21.0, 3.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(5.0, 4.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(15.0, 5.0) * xy_step;
+    const float2 xy_offset6 = xy_start_offset + float2(1.0, 6.0) * xy_step;
+    const float2 xy_offset7 = xy_start_offset + float2(11.0, 7.0) * xy_step;
+    const float2 xy_offset8 = xy_start_offset + float2(19.0, 8.0) * xy_step;
+    const float2 xy_offset9 = xy_start_offset + float2(23.0, 9.0) * xy_step;
+    const float2 xy_offset10 = xy_start_offset + float2(3.0, 10.0) * xy_step;
+    const float2 xy_offset11 = xy_start_offset + float2(14.0, 11.0) * xy_step;
     //  Compute subpixel weights, and exploit diagonal symmetry for speed.
-    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
-    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
-    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
-    const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
-    const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
-    const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
-    const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
-    const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
-    const vec3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance);
-    const vec3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance);
-    const vec3 w10 = eval_unorm_rgb_weights(xy_offset10, final_axis_importance);
-    const vec3 w11 = eval_unorm_rgb_weights(xy_offset11, final_axis_importance);
-    const vec3 w12 = w11.bgr;
-    const vec3 w13 = w10.bgr;
-    const vec3 w14 = w9.bgr;
-    const vec3 w15 = w8.bgr;
-    const vec3 w16 = w7.bgr;
-    const vec3 w17 = w6.bgr;
-    const vec3 w18 = w5.bgr;
-    const vec3 w19 = w4.bgr;
-    const vec3 w20 = w3.bgr;
-    const vec3 w21 = w2.bgr;
-    const vec3 w22 = w1.bgr;
-    const vec3 w23 = w0.bgr;
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const float3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance);
+    const float3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance);
+    const float3 w10 = eval_unorm_rgb_weights(xy_offset10, final_axis_importance);
+    const float3 w11 = eval_unorm_rgb_weights(xy_offset11, final_axis_importance);
+    const float3 w12 = w11.bgr;
+    const float3 w13 = w10.bgr;
+    const float3 w14 = w9.bgr;
+    const float3 w15 = w8.bgr;
+    const float3 w16 = w7.bgr;
+    const float3 w17 = w6.bgr;
+    const float3 w18 = w5.bgr;
+    const float3 w19 = w4.bgr;
+    const float3 w20 = w3.bgr;
+    const float3 w21 = w2.bgr;
+    const float3 w22 = w1.bgr;
+    const float3 w23 = w0.bgr;
     //  Get the weight sum to normalize the total to 1.0 later:
-    const vec3 half_sum = w0 + w1 + w2 + w3 + w4 +
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 +
         w5 + w6 + w7 + w8 + w9 + w10 + w11;
-    const vec3 w_sum = half_sum + half_sum.bgr;
-    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
     //  Scale the pixel-space to texture offset matrix by the pixel diameter.
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
     //  Get uv sample offsets, mirror on odd frames if directed, and exploit
     //  diagonal symmetry:
-    const vec2 frame_sign = get_frame_sign(frame);
-    const vec2 uv_offset0 = (xy_offset0 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset1 = (xy_offset1 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset2 = (xy_offset2 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset3 = (xy_offset3 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset4 = (xy_offset4 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset5 = (xy_offset5 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset6 = (xy_offset6 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset7 = (xy_offset7 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset8 = (xy_offset8 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset9 = (xy_offset9 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset10 = (xy_offset10 * frame_sign * true_pixel_to_tex_uv);
-    const vec2 uv_offset11 = (xy_offset11 * frame_sign * true_pixel_to_tex_uv);
+    const float2 frame_sign = get_frame_sign(frame);
+    const float2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const float2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const float2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const float2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    const float2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
+    const float2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
+    const float2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
+    const float2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
+    const float2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign);
+    const float2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign);
+    const float2 uv_offset10 = mul(true_pixel_to_tex_uv, xy_offset10 * frame_sign);
+    const float2 uv_offset11 = mul(true_pixel_to_tex_uv, xy_offset11 * frame_sign);
     //  Load samples, linearizing if necessary, etc.:
-    const vec3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
-    const vec3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
-    const vec3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
-    const vec3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
-    const vec3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
-    const vec3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
-    const vec3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
-    const vec3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
-    const vec3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb;
-    const vec3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb;
-    const vec3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset10).rgb;
-    const vec3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset11).rgb;
-    const vec3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset11).rgb;
-    const vec3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset10).rgb;
-    const vec3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb;
-    const vec3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb;
-    const vec3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
-    const vec3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
-    const vec3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
-    const vec3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
-    const vec3 sample20 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
-    const vec3 sample21 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
-    const vec3 sample22 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
-    const vec3 sample23 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset0).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset1).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset2).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset3).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset4).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset5).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset6).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset7).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset8).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset9).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset10).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, tex_uv + uv_offset11).rgb;
+    const float3 sample12 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset11).rgb;
+    const float3 sample13 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset10).rgb;
+    const float3 sample14 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset9).rgb;
+    const float3 sample15 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset8).rgb;
+    const float3 sample16 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset7).rgb;
+    const float3 sample17 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset6).rgb;
+    const float3 sample18 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset5).rgb;
+    const float3 sample19 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset4).rgb;
+    const float3 sample20 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset3).rgb;
+    const float3 sample21 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset2).rgb;
+    const float3 sample22 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset1).rgb;
+    const float3 sample23 = tex2Daa_tiled_linearize(tex, tex_uv - uv_offset0).rgb;
     //  Sum weighted samples (weight sum must equal 1.0 for each channel):
     return w_sum_inv * (
         w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
@@ -1225,78 +1226,78 @@ vec3 tex2Daa24x(const sampler2D tex, const vec2 tex_uv,
         w20 * sample20 + w21 * sample21 + w22 * sample22 + w23 * sample23);
 }
 
-vec3 tex2Daa_debug_16x_regular(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa_debug_16x_regular(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Sample on a regular 4x4 grid.  This is mainly for testing.
-    const float grid_size = 4.0;
+    static const float grid_size = 4.0;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
-    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
-    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
+    const float2 xy_step = float2(1.0)/grid_size * subpixel_support_diameter;
+    const float2 xy_start_offset = float2(0.5 - grid_size*0.5) * xy_step;
     //  Get the xy offset of each sample:
-    const vec2 xy_offset0 = xy_start_offset + vec2(0.0, 0.0) * xy_step;
-    const vec2 xy_offset1 = xy_start_offset + vec2(1.0, 0.0) * xy_step;
-    const vec2 xy_offset2 = xy_start_offset + vec2(2.0, 0.0) * xy_step;
-    const vec2 xy_offset3 = xy_start_offset + vec2(3.0, 0.0) * xy_step;
-    const vec2 xy_offset4 = xy_start_offset + vec2(0.0, 1.0) * xy_step;
-    const vec2 xy_offset5 = xy_start_offset + vec2(1.0, 1.0) * xy_step;
-    const vec2 xy_offset6 = xy_start_offset + vec2(2.0, 1.0) * xy_step;
-    const vec2 xy_offset7 = xy_start_offset + vec2(3.0, 1.0) * xy_step;
+    const float2 xy_offset0 = xy_start_offset + float2(0.0, 0.0) * xy_step;
+    const float2 xy_offset1 = xy_start_offset + float2(1.0, 0.0) * xy_step;
+    const float2 xy_offset2 = xy_start_offset + float2(2.0, 0.0) * xy_step;
+    const float2 xy_offset3 = xy_start_offset + float2(3.0, 0.0) * xy_step;
+    const float2 xy_offset4 = xy_start_offset + float2(0.0, 1.0) * xy_step;
+    const float2 xy_offset5 = xy_start_offset + float2(1.0, 1.0) * xy_step;
+    const float2 xy_offset6 = xy_start_offset + float2(2.0, 1.0) * xy_step;
+    const float2 xy_offset7 = xy_start_offset + float2(3.0, 1.0) * xy_step;
     //  Compute subpixel weights, and exploit diagonal symmetry for speed.
     //  (We can't exploit vertical or horizontal symmetry due to uncertain
     //  subpixel offsets.  We could fix that by rotating xy offsets with the
     //  subpixel structure, but...no.)
-    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
-    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
-    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
-    const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
-    const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
-    const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
-    const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
-    const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
-    const vec3 w8 = w7.bgr;
-    const vec3 w9 = w6.bgr;
-    const vec3 w10 = w5.bgr;
-    const vec3 w11 = w4.bgr;
-    const vec3 w12 = w3.bgr;
-    const vec3 w13 = w2.bgr;
-    const vec3 w14 = w1.bgr;
-    const vec3 w15 = w0.bgr;
+    const float3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const float3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const float3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const float3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const float3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const float3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const float3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const float3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const float3 w8 = w7.bgr;
+    const float3 w9 = w6.bgr;
+    const float3 w10 = w5.bgr;
+    const float3 w11 = w4.bgr;
+    const float3 w12 = w3.bgr;
+    const float3 w13 = w2.bgr;
+    const float3 w14 = w1.bgr;
+    const float3 w15 = w0.bgr;
     //  Get the weight sum to normalize the total to 1.0 later:
-    const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7;
-    const vec3 w_sum = half_sum + half_sum.bgr;
-    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    const float3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7;
+    const float3 w_sum = half_sum + half_sum.bgr;
+    const float3 w_sum_inv = float3(1.0)/(w_sum);
     //  Scale the pixel-space to texture offset matrix by the pixel diameter.
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
     //  Get uv sample offsets, taking advantage of row alignment:
-    const vec2 uv_step_x = (vec2(xy_step.x, 0.0) * true_pixel_to_tex_uv);
-    const vec2 uv_step_y = (vec2(0.0, xy_step.y) * true_pixel_to_tex_uv);
-    const vec2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y);
-    const vec2 sample0_uv = tex_uv + uv_offset0;
-    const vec2 sample4_uv = sample0_uv + uv_step_y;
-    const vec2 sample8_uv = sample0_uv + uv_step_y * 2.0;
-    const vec2 sample12_uv = sample0_uv + uv_step_y * 3.0;
+    const float2 uv_step_x = mul(true_pixel_to_tex_uv, float2(xy_step.x, 0.0));
+    const float2 uv_step_y = mul(true_pixel_to_tex_uv, float2(0.0, xy_step.y));
+    const float2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y);
+    const float2 sample0_uv = tex_uv + uv_offset0;
+    const float2 sample4_uv = sample0_uv + uv_step_y;
+    const float2 sample8_uv = sample0_uv + uv_step_y * 2.0;
+    const float2 sample12_uv = sample0_uv + uv_step_y * 3.0;
     //  Load samples, linearizing if necessary, etc.:
-    const vec3 sample0 = tex2Daa_tiled_linearize(tex, sample0_uv).rgb;
-    const vec3 sample1 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x).rgb;
-    const vec3 sample2 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 2.0).rgb;
-    const vec3 sample3 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 3.0).rgb;
-    const vec3 sample4 = tex2Daa_tiled_linearize(tex, sample4_uv).rgb;
-    const vec3 sample5 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x).rgb;
-    const vec3 sample6 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 2.0).rgb;
-    const vec3 sample7 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 3.0).rgb;
-    const vec3 sample8 = tex2Daa_tiled_linearize(tex, sample8_uv).rgb;
-    const vec3 sample9 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x).rgb;
-    const vec3 sample10 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 2.0).rgb;
-    const vec3 sample11 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 3.0).rgb;
-    const vec3 sample12 = tex2Daa_tiled_linearize(tex, sample12_uv).rgb;
-    const vec3 sample13 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x).rgb;
-    const vec3 sample14 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 2.0).rgb;
-    const vec3 sample15 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 3.0).rgb;
+    const float3 sample0 = tex2Daa_tiled_linearize(tex, sample0_uv).rgb;
+    const float3 sample1 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x).rgb;
+    const float3 sample2 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 2.0).rgb;
+    const float3 sample3 = tex2Daa_tiled_linearize(tex, sample0_uv + uv_step_x * 3.0).rgb;
+    const float3 sample4 = tex2Daa_tiled_linearize(tex, sample4_uv).rgb;
+    const float3 sample5 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x).rgb;
+    const float3 sample6 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 2.0).rgb;
+    const float3 sample7 = tex2Daa_tiled_linearize(tex, sample4_uv + uv_step_x * 3.0).rgb;
+    const float3 sample8 = tex2Daa_tiled_linearize(tex, sample8_uv).rgb;
+    const float3 sample9 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x).rgb;
+    const float3 sample10 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 2.0).rgb;
+    const float3 sample11 = tex2Daa_tiled_linearize(tex, sample8_uv + uv_step_x * 3.0).rgb;
+    const float3 sample12 = tex2Daa_tiled_linearize(tex, sample12_uv).rgb;
+    const float3 sample13 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x).rgb;
+    const float3 sample14 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 2.0).rgb;
+    const float3 sample15 = tex2Daa_tiled_linearize(tex, sample12_uv + uv_step_x * 3.0).rgb;
     //  Sum weighted samples (weight sum must equal 1.0 for each channel):
     return w_sum_inv * (
         w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
@@ -1305,54 +1306,56 @@ vec3 tex2Daa_debug_16x_regular(const sampler2D tex, const vec2 tex_uv,
         w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15);
 }
 
-vec3 tex2Daa_debug_dynamic(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+float3 tex2Daa_debug_dynamic(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  This function is for testing only: Use an NxN grid with dynamic weights.
-    const int grid_size = 8;
+    static const int grid_size = 8;
     assign_aa_cubic_constants();
-    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
-    const vec2 subpixel_support_diameter = ssd_fai.xy;
-    const vec2 final_axis_importance = ssd_fai.zw;
+    const float4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const float2 subpixel_support_diameter = ssd_fai.xy;
+    const float2 final_axis_importance = ssd_fai.zw;
     const float grid_radius_in_samples = (float(grid_size) - 1.0)/2.0;
-    const vec2 filter_space_offset_step =
-        subpixel_support_diameter/vec2(grid_size);
-    const vec2 sample0_filter_space_offset =
+    const float2 filter_space_offset_step =
+        subpixel_support_diameter/float2(grid_size, grid_size);
+    const float2 sample0_filter_space_offset =
         -grid_radius_in_samples * filter_space_offset_step;
     //  Compute xy sample offsets and subpixel weights:
-    vec3 weights[grid_size * grid_size];
-    vec3 weight_sum = vec3(0.0);
+    float3 weights[grid_size * grid_size];  //  Row-major: [i*grid_size + j]
+    float3 weight_sum = float3(0.0, 0.0, 0.0);
     for(int i = 0; i < grid_size; ++i)
     {
         for(int j = 0; j < grid_size; ++j)
         {
             //  Weights based on xy distances:
-            const vec2 offset = sample0_filter_space_offset +
-                vec2(j, i) * filter_space_offset_step;
-            const vec3 weight = eval_unorm_rgb_weights(offset, final_axis_importance);
+            const float2 offset = sample0_filter_space_offset +
+                float2(j, i) * filter_space_offset_step;
+            const float3 weight = eval_unorm_rgb_weights(offset, final_axis_importance);
             weights[i*grid_size + j] = weight;
             weight_sum += weight;
         }
     }
     //  Get uv offset vectors along x and y directions:
-    const mat2x2 true_pixel_to_tex_uv =
-        mat2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
-    const vec2 uv_offset_step_x = (vec2(filter_space_offset_step.x, 0.0) * true_pixel_to_tex_uv);
-    const vec2 uv_offset_step_y = (vec2(0.0, filter_space_offset_step.y) * true_pixel_to_tex_uv);
+    const float2x2 true_pixel_to_tex_uv =
+        float2x2(float4(pixel_to_tex_uv * aa_pixel_diameter));
+    const float2 uv_offset_step_x = mul(true_pixel_to_tex_uv,
+        float2(filter_space_offset_step.x, 0.0));
+    const float2 uv_offset_step_y = mul(true_pixel_to_tex_uv,
+        float2(0.0, filter_space_offset_step.y));
     //  Get a starting sample location:
-    const vec2 sample0_uv_offset = -grid_radius_in_samples *
+    const float2 sample0_uv_offset = -grid_radius_in_samples *
         (uv_offset_step_x + uv_offset_step_y);
-    const vec2 sample0_uv = tex_uv + sample0_uv_offset;
+    const float2 sample0_uv = tex_uv + sample0_uv_offset;
     //  Load, weight, and sum [linearized] samples:
-    vec3 sum = vec3(0.0);
-    const vec3 weight_sum_inv = vec3(1.0)/vec3(weight_sum);
+    float3 sum = float3(0.0, 0.0, 0.0);
+    const float3 weight_sum_inv = float3(1.0, 1.0, 1.0)/weight_sum;
     for(int i = 0; i < grid_size; ++i)
     {
-        const vec2 row_i_first_sample_uv =
+        const float2 row_i_first_sample_uv =
             sample0_uv + i * uv_offset_step_y;
         for(int j = 0; j < grid_size; ++j)
         {
-            const vec2 sample_uv =
+            const float2 sample_uv =
                 row_i_first_sample_uv + j * uv_offset_step_x;
             sum += weights[i*grid_size + j] *
                 tex2Daa_tiled_linearize(tex, sample_uv).rgb;
@@ -1361,26 +1364,30 @@ vec3 tex2Daa_debug_dynamic(const sampler2D tex, const vec2 tex_uv,
     return sum * weight_sum_inv;
 }
 
+
 ///////////////////////  ANTIALIASING CODEPATH SELECTION  //////////////////////
 
-vec3 tex2Daa(const sampler2D tex, const vec2 tex_uv,
-    const mat2x2 pixel_to_tex_uv, const float frame)
+inline float3 tex2Daa(const sampler2D tex, const float2 tex_uv,
+    const float2x2 pixel_to_tex_uv, const float frame)
 {
     //  Statically switch between antialiasing modes/levels:
-	if (aa_level < 0.5) return tex2D_linearize(tex, tex_uv).rgb;
-	else if (aa_level < 3.5) return tex2Daa_subpixel_weights_only(
-            tex, tex_uv, pixel_to_tex_uv);
-	else if (aa_level < 4.5)   return tex2Daa4x(tex, tex_uv, pixel_to_tex_uv, frame);
-	else if (aa_level < 5.5)   return tex2Daa5x(tex, tex_uv, pixel_to_tex_uv, frame);
-	else if (aa_level < 6.5)   return tex2Daa6x(tex, tex_uv, pixel_to_tex_uv, frame);
-	else if (aa_level < 7.5)   return tex2Daa7x(tex, tex_uv, pixel_to_tex_uv, frame);
-	else if (aa_level < 11.5)  return tex2Daa8x(tex, tex_uv, pixel_to_tex_uv, frame);
-	else if (aa_level < 15.5)  return tex2Daa12x(tex, tex_uv, pixel_to_tex_uv, frame);
-	else if (aa_level < 19.5)  return tex2Daa16x(tex, tex_uv, pixel_to_tex_uv, frame);
-	else if (aa_level < 23.5)  return tex2Daa20x(tex, tex_uv, pixel_to_tex_uv, frame);
-	else if (aa_level < 253.5) return tex2Daa24x(tex, tex_uv, pixel_to_tex_uv, frame);
-	else if (aa_level < 254.5) return tex2Daa_debug_16x_regular(tex, tex_uv, pixel_to_tex_uv, frame);
-		else return tex2Daa_debug_dynamic(tex, tex_uv, pixel_to_tex_uv, frame);
+    return (aa_level < 0.5) ? tex2D_linearize(tex, tex_uv).rgb :
+        (aa_level < 3.5) ? tex2Daa_subpixel_weights_only(
+            tex, tex_uv, pixel_to_tex_uv) :
+        (aa_level < 4.5) ? tex2Daa4x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 5.5) ? tex2Daa5x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 6.5) ? tex2Daa6x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 7.5) ? tex2Daa7x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 11.5) ? tex2Daa8x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 15.5) ? tex2Daa12x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 19.5) ? tex2Daa16x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 23.5) ? tex2Daa20x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 253.5) ? tex2Daa24x(tex, tex_uv, pixel_to_tex_uv, frame) :
+        (aa_level < 254.5) ? tex2Daa_debug_16x_regular(
+            tex, tex_uv, pixel_to_tex_uv, frame) :
+        tex2Daa_debug_dynamic(tex, tex_uv, pixel_to_tex_uv, frame);
 }
 
-#endif  //  TEX2DANTIALIAS_H
\ No newline at end of file
+
+#endif  //  TEX2DANTIALIAS_H
+
diff --git a/crt/shaders/crt-royale/src/user-preset-constants.h b/crt/shaders/crt-royale/src/user-cgp-constants.h
similarity index 74%
rename from crt/shaders/crt-royale/src/user-preset-constants.h
rename to crt/shaders/crt-royale/src/user-cgp-constants.h
index 93a77d5..25578cb 100644
--- a/crt/shaders/crt-royale/src/user-preset-constants.h
+++ b/crt/shaders/crt-royale/src/user-cgp-constants.h
@@ -11,14 +11,13 @@
 //  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
 //  this shader: One does a viewport-scale bloom, and the other skips it.  The
 //  latter benefits from a higher bloom_approx_scale_x, so save both separately:
-const float bloom_approx_size_x = 320.0;
-const float bloom_approx_scale_x = 320.0; //dunno why this is necessary
-const float bloom_approx_size_x_for_fake = 400.0;
+static const float bloom_approx_size_x = 320.0;
+static const float bloom_approx_size_x_for_fake = 400.0;
 //  Copy the viewport-relative scales of the phosphor mask resize passes
 //  (MASK_RESIZE and the pass immediately preceding it):
-const vec2 mask_resize_viewport_scale = vec2(0.0625, 0.0625);
+static const float2 mask_resize_viewport_scale = float2(0.0625, 0.0625);
 //  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
-const float geom_max_aspect_ratio = 4.0/3.0;
+static const float geom_max_aspect_ratio = 4.0/3.0;
 
 //  PHOSPHOR MASK TEXTURE CONSTANTS:
 //  Set the following constants to reflect the properties of the phosphor mask
@@ -26,32 +25,32 @@ const float geom_max_aspect_ratio = 4.0/3.0;
 //  based on user settings, then repeats a single tile until filling the screen.
 //  The shader must know the input texture size (default 64x64), and to manually
 //  resize, it must also know the horizontal triads per tile (default 8).
-const vec2 mask_texture_small_size = vec2(64.0);
-const vec2 mask_texture_large_size = vec2(512.0);
-const float mask_triads_per_tile = 8.0;
+static const float2 mask_texture_small_size = float2(64.0, 64.0);
+static const float2 mask_texture_large_size = float2(512.0, 512.0);
+static const float mask_triads_per_tile = 8.0;
 //  We need the average brightness of the phosphor mask to compensate for the
 //  dimming it causes.  The following four values are roughly correct for the
 //  masks included with the shader.  Update the value for any LUT texture you
 //  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
 //  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
 //#define PHOSPHOR_MASK_GRILLE14
-const float mask_grille14_avg_color = 50.6666666/255.0;
+static const float mask_grille14_avg_color = 50.6666666/255.0;
     //  TileableLinearApertureGrille14Wide7d33Spacing*.png
     //  TileableLinearApertureGrille14Wide10And6Spacing*.png
-const float mask_grille15_avg_color = 53.0/255.0;
+static const float mask_grille15_avg_color = 53.0/255.0;
     //  TileableLinearApertureGrille15Wide6d33Spacing*.png
     //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
-const float mask_slot_avg_color = 46.0/255.0;
+static const float mask_slot_avg_color = 46.0/255.0;
     //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
     //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
-const float mask_shadow_avg_color = 41.0/255.0;
+static const float mask_shadow_avg_color = 41.0/255.0;
     //  TileableLinearShadowMask*.png
     //  TileableLinearShadowMaskEDP*.png
 
 #ifdef PHOSPHOR_MASK_GRILLE14
-    const float mask_grille_avg_color = mask_grille14_avg_color;
+    static const float mask_grille_avg_color = mask_grille14_avg_color;
 #else
-    const float mask_grille_avg_color = mask_grille15_avg_color;
+    static const float mask_grille_avg_color = mask_grille15_avg_color;
 #endif
 
 
diff --git a/crt/shaders/crt-royale/user-settings.h b/crt/shaders/crt-royale/user-settings.h
index cc375df..211d624 100644
--- a/crt/shaders/crt-royale/user-settings.h
+++ b/crt/shaders/crt-royale/user-settings.h
@@ -15,8 +15,8 @@
 //  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
 //  Among other things, derivatives help us fix anisotropic filtering artifacts
 //  with curved manually tiled phosphor mask coords.  Related errors:
-//  error C3004: function "vec2 ddx(vec2);" not supported in this profile
-//  error C3004: function "vec2 ddy(vec2);" not supported in this profile
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
     //#define DRIVERS_ALLOW_DERIVATIVES
 
 //  Fine derivatives: Unsupported on older ATI cards.
@@ -43,13 +43,13 @@
 
 //  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
 //  anisotropic filtering, thereby fixing related artifacts.  Related errors:
-//  error C3004: function "vec4 tex2Dlod(sampler2D, vec4);" not supported in
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
 //  this profile
     //#define DRIVERS_ALLOW_TEX2DLOD
 
 //  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
 //  artifacts from anisotropic filtering and mipmapping.  Related errors:
-//  error C3004: function "vec4 tex2Dbias(sampler2D, vec4);" not supported
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
 //  in this profile
     //#define DRIVERS_ALLOW_TEX2DBIAS
 
@@ -124,30 +124,30 @@
-//  options that were cleaner or more convert to code as static constants.
+//  options that were cleaner or more convenient to code as static constants.
 
 //  GAMMA:
-    const float crt_gamma_static = 2.5;                  //  range [1, 5]
-    const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
 
 //  LEVELS MANAGEMENT:
     //  Control the final multiplicative image contrast:
-    const float levels_contrast_static = 1.0;            //  range [0, 4)
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
     //  We auto-dim to avoid clipping between passes and restore brightness
     //  later.  Control the dim factor here: Lower values clip less but crush
     //  blacks more (static only for now).
-    const float levels_autodim_temp = 0.5;               //  range (0, 1]
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
 
 //  HALATION/DIFFUSION/BLOOM:
-    //  Halation weight: How much energy should be lost to electrons bounding
+    //  Halation weight: How much energy should be lost to electrons bouncing
     //  around under the CRT glass and exciting random phosphors?
-    const float halation_weight_static = 0.0;            //  range [0, 1]
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
     //  Refractive diffusion weight: How much light should spread/diffuse from
     //  refracting through the CRT glass?
-    const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
     //  Underestimate brightness: Bright areas bloom more, but we can base the
     //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
     //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
-    const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
     //  Blur all colors more than necessary for a softer phosphor bloom?
-    const float bloom_excess_static = 0.0;               //  range [0, 1]
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
     //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
     //  blurred resize of the input (convergence offsets are applied as well).
     //  There are three filter options (static option only for now):
@@ -159,7 +159,11 @@
     //      mask_num_triads_desired.
     //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
     //  These options are more pronounced for the fast, unbloomed shader version.
-    const float bloom_approx_filter_static = 2.0;
+#ifndef RADEON_FIX
+    static const float bloom_approx_filter_static = 2.0;
+#else
+    static const float bloom_approx_filter_static = 1.0;
+#endif
 
 //  ELECTRON BEAM SCANLINE DISTRIBUTION:
     //  How many scanlines should contribute light to each pixel?  Using more
@@ -172,68 +176,68 @@
     //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
     //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
     //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
-    const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
     //  A generalized Gaussian beam varies shape with color too, now just width.
     //  It's slower but more flexible (static option only for now).
-    bool beam_generalized_gaussian = true;
+    static const bool beam_generalized_gaussian = true;
     //  What kind of scanline antialiasing do you want?
     //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
     //  Integrals are slow (especially for generalized Gaussians) and rarely any
     //  better than 3x antialiasing (static option only for now).
-    const float beam_antialias_level = 1.0;              //  range [0, 2]
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
     //  Min/max standard deviations for scanline beams: Higher values widen and
     //  soften scanlines.  Depending on other options, low min sigmas can alias.
-    const float beam_min_sigma_static = 0.02;            //  range (0, 1]
-    const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
     //  Beam width varies as a function of color: A power function (0) is more
     //  configurable, but a spherical function (1) gives the widest beam
     //  variability without aliasing (static option only for now).
-    const float beam_spot_shape_function = 0.0;
+    static const float beam_spot_shape_function = 0.0;
     //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
-    //  sharpness.  Powers >= 1.0 are awful unless mix/max sigmas are close.
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
-    const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
     //  Generalized Gaussian max shape parameters: Higher values give flatter
     //  scanline plateaus and steeper dropoffs, simultaneously widening and
     //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
     //  values > ~40.0 cause artifacts with integrals.
-    const float beam_min_shape_static = 2.0;         //  range [2, 32]
-    const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
     //  Generalized Gaussian shape power: Affects how quickly the distribution
     //  changes shape from Gaussian to steep/plateaued as color increases from 0
     //  to 1.0.  Higher powers appear softer for most colors, and lower powers
     //  appear sharper for most colors.
-    const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
     //  What filter should be used to sample scanlines horizontally?
     //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
-    const float beam_horiz_filter_static = 0.0;
+    static const float beam_horiz_filter_static = 0.0;
     //  Standard deviation for horizontal Gaussian resampling:
-    const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
     //  Do horizontal scanline sampling in linear RGB (correct light mixing),
     //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
     //  limiting circuitry in some CRT's), or a weighted avg.?
-    const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
     //  Simulate scanline misconvergence?  This needs 3x horizontal texture
     //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
     //  later passes (static option only for now).
-    bool beam_misconvergence = true;
+    static const bool beam_misconvergence = true;
     //  Convergence offsets in x/y directions for R/G/B scanline beams in units
     //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
-    const vec2 convergence_offsets_r_static = vec2(0.0, 0.0);
-    const vec2 convergence_offsets_g_static = vec2(0.0, 0.0);
-    const vec2 convergence_offsets_b_static = vec2(0.0, 0.0);
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
     //  Detect interlacing (static option only for now)?
-    bool interlace_detect = true;
+    static const bool interlace_detect = true;
     //  Assume 1080-line sources are interlaced?
-    const bool interlace_1080i_static = false;
+    static const bool interlace_1080i_static = false;
     //  For interlaced sources, assume TFF (top-field first) or BFF order?
     //  (Whether this matters depends on the nature of the interlaced input.)
-    const bool interlace_bff_static = false;
+    static const bool interlace_bff_static = false;
 
 //  ANTIALIASING:
     //  What AA level do you want for curvature/overscan/subpixels?  Options:
     //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
     //  (Static option only for now)
-    const float aa_level = 12.0;                     //  range [0, 24]
+    static const float aa_level = 12.0;                     //  range [0, 24]
     //  What antialiasing filter do you want (static option only)?  Options:
     //  0: Box (separable), 1: Box (cylindrical),
     //  2: Tent (separable), 3: Tent (cylindrical),
@@ -241,24 +245,24 @@
     //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
     //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
     //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
-    const float aa_filter = 6.0;                     //  range [0, 9]
+    static const float aa_filter = 6.0;                     //  range [0, 9]
     //  Flip the sample grid on odd/even frames (static option only for now)?
-    const bool aa_temporal = false;
+    static const bool aa_temporal = false;
     //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
     //  the blue offset is the negative r offset; range [0, 0.5]
-    const vec2 aa_subpixel_r_offset_static = vec2(-1.0/3.0, 0.0);//vec2(0.0);
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
     //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
     //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
     //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
     //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
     //  4.) C = 0.0 is a soft spline filter.
-    const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
     //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
-    const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
 
 //  PHOSPHOR MASK:
     //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
-    const float mask_type_static = 1.0;                  //  range [0, 2]
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
     //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
     //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
     //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
@@ -268,11 +272,11 @@
     //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
     //      This mode reuses the same masks, so triads will be enormous unless
     //      you change the mask LUT filenames in your .cgp file.
-    const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
     //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
     //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
     //  will always be used to calculate the full bloom sigma statically.
-    const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
     //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
     //  triads) will be rounded to the nearest integer tile size and clamped to
     //  obey minimum size constraints (imposed to reduce downsize taps) and
@@ -280,14 +284,14 @@
     //  To increase the size limit, double the viewport-relative scales for the
-    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
     //      range [1, mask_texture_small_size/mask_triads_per_tile]
-//    const float mask_triad_size_desired_static = 24.0 / 8.0;
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
     //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
     //  final size will be rounded and constrained as above); default 480.0
-    const float mask_num_triads_desired_static = 480.0;
+    static const float mask_num_triads_desired_static = 480.0;
     //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
     //  more samples and avoid moire a bit better, but some is unavoidable
     //  depending on the destination size (static option for now).
-    const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
     //  The mask is resized using a variable number of taps in each dimension,
     //  but some Cg profiles always fetch a constant number of taps no matter
     //  what (no dynamic branching).  We can limit the maximum number of taps if
@@ -295,27 +299,27 @@
     //  faster, but the limit IS enforced (static option only, forever);
     //      range [1, mask_texture_small_size/mask_triads_per_tile]
     //  TODO: Make this 1.0 and compensate with smarter sampling!
-    const float mask_min_allowed_triad_size = 2.0;
+    static const float mask_min_allowed_triad_size = 2.0;
 
 //  GEOMETRY:
     //  Geometry mode:
     //  0: Off (default), 1: Spherical mapping (like cgwg's),
     //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
-    const float geom_mode_static = 0.0;      //  range [0, 3]
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
     //  Radius of curvature: Measured in units of your viewport's diagonal size.
-    const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
     //  View dist is the distance from the player to their physical screen, in
     //  units of the viewport's diagonal size.  It controls the field of view.
-    const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
     //  Tilt angle in radians (clockwise around up and right vectors):
-    const vec2 geom_tilt_angle_static = vec2(0.0, 0.0);  //  range [-pi, pi]
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
     //  Aspect ratio: When the true viewport size is unknown, this value is used
     //  to help convert between the phosphor triad size and count, along with
     //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
     //  this equal to Retroarch's display aspect ratio (DAR) for best results;
     //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
     //  default (256/224)*(54/47) = 1.313069909 (see below)
-    const float geom_aspect_ratio_static = 1.313069909;
+    static const float geom_aspect_ratio_static = 1.313069909;
     //  Before getting into overscan, here's some general aspect ratio info:
     //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
     //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
@@ -338,21 +342,21 @@
     //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
     //  or adjust x/y independently to e.g. readd horizontal padding, as noted
     //  above: Values < 1.0 zoom out; range (0, inf)
-    const vec2 geom_overscan_static = vec2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
     //  Compute a proper pixel-space to texture-space matrix even without ddx()/
     //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
     //  with strong curvature (static option only for now).
-    const bool geom_force_correct_tangent_matrix = true;
+    static const bool geom_force_correct_tangent_matrix = true;
 
 //  BORDERS:
     //  Rounded border size in texture uv coords:
-    const float border_size_static = 0.015;           //  range [0, 0.5]
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
     //  Border darkness: Moderate values darken the border smoothly, and high
     //  values make the image very dark just inside the border:
-    const float border_darkness_static = 2.0;        //  range [0, inf)
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
     //  Border compression: High numbers compress border transitions, narrowing
     //  the dark border area.
-    const float border_compress_static = 2.5;        //  range [1, inf)
+    static const float border_compress_static = 2.5;        //  range [1, inf)
 
 
 #endif  //  USER_SETTINGS_H
diff --git a/include/blur-functions.h b/include/blur-functions.h
index bfef056..517a8cc 100644
--- a/include/blur-functions.h
+++ b/include/blur-functions.h
@@ -41,7 +41,7 @@
 //                      dxdy = (IN.video_size/IN.output_size)/IN.texture_size
 //              6.) For separable blurs (tex2DblurNresize and tex2DblurNfast),
 //                  zero out the dxdy component in the unblurred dimension:
-//                      dxdy = vec2(dxdy.x, 0.0) or vec2(0.0, dxdy.y)
+//                      dxdy = float2(dxdy.x, 0.0) or float2(0.0, dxdy.y)
 //              Many blurs share these requirements:
 //              1.) One-pass blurs require scale_xN == scale_yN or scales > 1.0,
 //                  or they will blur more in the lower-scaled dimension.
@@ -145,6 +145,7 @@
 //                                                  tex2Dblur43fast
 //                                                  tex2Dblur3x3resize
 
+
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 
 //  Set static standard deviations, but allow users to override them with their
@@ -157,70 +158,69 @@
         //  This distribution works such that blurring multiple times should
         //  have the same result as a single larger blur.  These values are
         //  larger than default for blurs up to 6x and smaller thereafter.
-        const float blur3_std_dev = 0.84931640625;
-        const float blur4_std_dev = 0.84931640625;
-        const float blur5_std_dev = 1.0595703125;
-        const float blur6_std_dev = 1.06591796875;
-        const float blur7_std_dev = 1.17041015625;
-        const float blur8_std_dev = 1.1720703125;
-        const float blur9_std_dev = 1.2259765625;
-        const float blur10_std_dev = 1.21982421875;
-        const float blur11_std_dev = 1.25361328125;
-        const float blur12_std_dev = 1.2423828125;
-        const float blur17_std_dev = 1.27783203125;
-        const float blur25_std_dev = 1.2810546875;
-        const float blur31_std_dev = 1.28125;
-        const float blur43_std_dev = 1.28125;
+        static const float blur3_std_dev = 0.84931640625;
+        static const float blur4_std_dev = 0.84931640625;
+        static const float blur5_std_dev = 1.0595703125;
+        static const float blur6_std_dev = 1.06591796875;
+        static const float blur7_std_dev = 1.17041015625;
+        static const float blur8_std_dev = 1.1720703125;
+        static const float blur9_std_dev = 1.2259765625;
+        static const float blur10_std_dev = 1.21982421875;
+        static const float blur11_std_dev = 1.25361328125;
+        static const float blur12_std_dev = 1.2423828125;
+        static const float blur17_std_dev = 1.27783203125;
+        static const float blur25_std_dev = 1.2810546875;
+        static const float blur31_std_dev = 1.28125;
+        static const float blur43_std_dev = 1.28125;
     #else
         //  The defaults are the largest values that keep the largest unused
         //  blur term on each side <= 1.0/256.0.  (We could get away with more
         //  or be more conservative, but this compromise is pretty reasonable.)
-        const float blur3_std_dev = 0.62666015625;
-        const float blur4_std_dev = 0.66171875;
-        const float blur5_std_dev = 0.9845703125;
-        const float blur6_std_dev = 1.02626953125;
-        const float blur7_std_dev = 1.36103515625;
-        const float blur8_std_dev = 1.4080078125;
-        const float blur9_std_dev = 1.7533203125;
-        const float blur10_std_dev = 1.80478515625;
-        const float blur11_std_dev = 2.15986328125;
-        const float blur12_std_dev = 2.215234375;
-        const float blur17_std_dev = 3.45535583496;
-        const float blur25_std_dev = 5.3409576416;
-        const float blur31_std_dev = 6.86488037109;
-        const float blur43_std_dev = 10.1852050781;
+        static const float blur3_std_dev = 0.62666015625;
+        static const float blur4_std_dev = 0.66171875;
+        static const float blur5_std_dev = 0.9845703125;
+        static const float blur6_std_dev = 1.02626953125;
+        static const float blur7_std_dev = 1.36103515625;
+        static const float blur8_std_dev = 1.4080078125;
+        static const float blur9_std_dev = 1.7533203125;
+        static const float blur10_std_dev = 1.80478515625;
+        static const float blur11_std_dev = 2.15986328125;
+        static const float blur12_std_dev = 2.215234375;
+        static const float blur17_std_dev = 3.45535583496;
+        static const float blur25_std_dev = 5.3409576416;
+        static const float blur31_std_dev = 6.86488037109;
+        static const float blur43_std_dev = 10.1852050781;
     #endif  //  USE_BINOMIAL_BLUR_STD_DEVS
 #endif  //  OVERRIDE_BLUR_STD_DEVS
 
 #ifndef OVERRIDE_ERROR_BLURRING
     //  error_blurring should be in [0.0, 1.0].  Higher values reduce ringing
     //  in shared-sample blurs but increase blurring and feature shifting.
-    const float error_blurring = 0.5;
+    static const float error_blurring = 0.5;
 #endif
 
-//  Make a length squared helper macro (for usage with static constants):
-#define LENGTH_SQ(vec) (dot(vec, vec))
 
 //////////////////////////////////  INCLUDES  //////////////////////////////////
 
 //  gamma-management.h relies on pass-specific settings to guide its behavior:
 //  FIRST_PASS, LAST_PASS, GAMMA_ENCODE_EVERY_FBO, etc.  See it for details.
 #include "gamma-management.h"
-//#include "quad-pixel-communication.h"
+#include "quad-pixel-communication.h"
 #include "special-functions.h"
 
+
 ///////////////////////////////////  HELPERS  //////////////////////////////////
 
-vec4 uv2_to_uv4(vec2 tex_uv)
+inline float4 uv2_to_uv4(float2 tex_uv)
 {
-    //  Make a vec2 uv offset safe for adding to vec4 tex2Dlod coords:
-    return vec4(tex_uv, 0.0, 0.0);
+    //  Make a float2 uv offset safe for adding to float4 tex2Dlod coords:
+    return float4(tex_uv, 0.0, 0.0);
 }
 
 //  Make a length squared helper macro (for usage with static constants):
 #define LENGTH_SQ(vec) (dot(vec, vec))
 
-float get_fast_gaussian_weight_sum_inv(const float sigma)
+inline float get_fast_gaussian_weight_sum_inv(const float sigma)
 {
     //  We can use the Gaussian integral to calculate the asymptotic weight for
     //  the center pixel.  Since the unnormalized center pixel weight is 1.0,
@@ -241,10 +241,11 @@ float get_fast_gaussian_weight_sum_inv(const float sigma)
         (sigma - 0.0860587260734721))), 0.399334576340352/sigma);
 }
 
+
 ////////////////////  ARBITRARILY RESIZABLE SEPARABLE BLURS  ///////////////////
 
-vec3 tex2Dblur11resize(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   Global requirements must be met (see file description).
     //  Returns:    A 1D 11x Gaussian blurred texture lookup using a 11-tap blur.
@@ -262,7 +263,7 @@ vec3 tex2Dblur11resize(const sampler2D tex, const vec2 tex_uv,
         (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5));
     //  Statically normalize weights, sum weighted samples, and return.  Blurs are
     //  currently optimized for dynamic weights.
-    vec3 sum = vec3(0.0);
+    float3 sum = float3(0.0,0.0,0.0);
     sum += w5 * tex2D_linearize(tex, tex_uv - 5.0 * dxdy).rgb;
     sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
     sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
@@ -277,8 +278,8 @@ vec3 tex2Dblur11resize(const sampler2D tex, const vec2 tex_uv,
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur9resize(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   Global requirements must be met (see file description).
     //  Returns:    A 1D 9x Gaussian blurred texture lookup using a 9-tap blur.
@@ -292,7 +293,7 @@ vec3 tex2Dblur9resize(const sampler2D tex, const vec2 tex_uv,
     const float w4 = exp(-16.0 * denom_inv);
     const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
     //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
+    float3 sum = float3(0.0,0.0,0.0);
     sum += w4 * tex2D_linearize(tex, tex_uv - 4.0 * dxdy).rgb;
     sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
     sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
@@ -305,8 +306,8 @@ vec3 tex2Dblur9resize(const sampler2D tex, const vec2 tex_uv,
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur7resize(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   Global requirements must be met (see file description).
     //  Returns:    A 1D 7x Gaussian blurred texture lookup using a 7-tap blur.
@@ -319,7 +320,7 @@ vec3 tex2Dblur7resize(const sampler2D tex, const vec2 tex_uv,
     const float w3 = exp(-9.0 * denom_inv);
     const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3));
     //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
+    float3 sum = float3(0.0,0.0,0.0);
     sum += w3 * tex2D_linearize(tex, tex_uv - 3.0 * dxdy).rgb;
     sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
     sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
@@ -330,8 +331,8 @@ vec3 tex2Dblur7resize(const sampler2D tex, const vec2 tex_uv,
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur5resize(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   Global requirements must be met (see file description).
     //  Returns:    A 1D 5x Gaussian blurred texture lookup using a 5-tap blur.
@@ -343,7 +344,7 @@ vec3 tex2Dblur5resize(const sampler2D tex, const vec2 tex_uv,
     const float w2 = exp(-4.0 * denom_inv);
     const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2));
     //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
+    float3 sum = float3(0.0,0.0,0.0);
     sum += w2 * tex2D_linearize(tex, tex_uv - 2.0 * dxdy).rgb;
     sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
     sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
@@ -352,8 +353,8 @@ vec3 tex2Dblur5resize(const sampler2D tex, const vec2 tex_uv,
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur3resize(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   Global requirements must be met (see file description).
     //  Returns:    A 1D 3x Gaussian blurred texture lookup using a 3-tap blur.
@@ -364,17 +365,18 @@ vec3 tex2Dblur3resize(const sampler2D tex, const vec2 tex_uv,
     const float w1 = exp(-1.0 * denom_inv);
     const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1);
     //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
+    float3 sum = float3(0.0,0.0,0.0);
     sum += w1 * tex2D_linearize(tex, tex_uv - 1.0 * dxdy).rgb;
     sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
     sum += w1 * tex2D_linearize(tex, tex_uv + 1.0 * dxdy).rgb;
     return sum * weight_sum_inv;
 }
 
+
 ///////////////////////////  FAST SEPARABLE BLURS  ///////////////////////////
 
-vec3 tex2Dblur11fast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   1.) Global requirements must be met (see file description).
     //              2.) filter_linearN must = "true" in your .cgp file.
@@ -401,7 +403,7 @@ vec3 tex2Dblur11fast(const sampler2D tex, const vec2 tex_uv,
     const float w23_ratio = w3/w23;
     const float w45_ratio = w5/w45;
     //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
+    float3 sum = float3(0.0,0.0,0.0);
     sum += w45 * tex2D_linearize(tex, tex_uv - (4.0 + w45_ratio) * dxdy).rgb;
     sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb;
     sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb;
@@ -411,12 +413,12 @@ vec3 tex2Dblur11fast(const sampler2D tex, const vec2 tex_uv,
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur17fast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   Same as tex2Dblur11()
-    //  Returns:    A 1D 17x Gaussian blurred texture lookup using 1 nearest
-    //              neighbor and 8 linear taps.  It may be mipmapped depending
+    //  Returns:    A 1D 9x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 4 linear taps.  It may be mipmapped depending
     //              on settings and dxdy.
     //  First get the texel weights and normalization factor as above.
     const float denom_inv = 0.5/(sigma*sigma);
@@ -425,97 +427,27 @@ vec3 tex2Dblur17fast(const sampler2D tex, const vec2 tex_uv,
     const float w2 = exp(-4.0 * denom_inv);
     const float w3 = exp(-9.0 * denom_inv);
     const float w4 = exp(-16.0 * denom_inv);
-    const float w5 = exp(-25.0 * denom_inv);
-    const float w6 = exp(-36.0 * denom_inv);
-    const float w7 = exp(-49.0 * denom_inv);
-    const float w8 = exp(-64.0 * denom_inv);
-    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
-    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
-    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
     //  Calculate combined weights and linear sample ratios between texel pairs.
-    const float w1_2 = w1 + w2;
-    const float w3_4 = w3 + w4;
-    const float w5_6 = w5 + w6;
-    const float w7_8 = w7 + w8;
-    const float w1_2_ratio = w2/w1_2;
-    const float w3_4_ratio = w4/w3_4;
-    const float w5_6_ratio = w6/w5_6;
-    const float w7_8_ratio = w8/w7_8;
+    const float w12 = w1 + w2;
+    const float w34 = w3 + w4;
+    const float w12_ratio = w2/w12;
+    const float w34_ratio = w4/w34;
     //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
-    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
-    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
-    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
-    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb;
+    sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb;
     sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
-    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
-    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
-    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
-    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb;
+    sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb;
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur25fast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   Same as tex2Dblur11()
-    //  Returns:    A 1D 25x Gaussian blurred texture lookup using 1 nearest
-    //              neighbor and 12 linear taps.  It may be mipmapped depending
-    //              on settings and dxdy.
-    //  First get the texel weights and normalization factor as above.
-    const float denom_inv = 0.5/(sigma*sigma);
-    const float w0 = 1.0;
-    const float w1 = exp(-1.0 * denom_inv);
-    const float w2 = exp(-4.0 * denom_inv);
-    const float w3 = exp(-9.0 * denom_inv);
-    const float w4 = exp(-16.0 * denom_inv);
-    const float w5 = exp(-25.0 * denom_inv);
-    const float w6 = exp(-36.0 * denom_inv);
-    const float w7 = exp(-49.0 * denom_inv);
-    const float w8 = exp(-64.0 * denom_inv);
-    const float w9 = exp(-81.0 * denom_inv);
-    const float w10 = exp(-100.0 * denom_inv);
-    const float w11 = exp(-121.0 * denom_inv);
-    const float w12 = exp(-144.0 * denom_inv);
-    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
-    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
-    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
-    //  Calculate combined weights and linear sample ratios between texel pairs.
-    const float w1_2 = w1 + w2;
-    const float w3_4 = w3 + w4;
-    const float w5_6 = w5 + w6;
-    const float w7_8 = w7 + w8;
-    const float w9_10 = w9 + w10;
-    const float w11_12 = w11 + w12;
-    const float w1_2_ratio = w2/w1_2;
-    const float w3_4_ratio = w4/w3_4;
-    const float w5_6_ratio = w6/w5_6;
-    const float w7_8_ratio = w8/w7_8;
-    const float w9_10_ratio = w10/w9_10;
-    const float w11_12_ratio = w12/w11_12;
-    //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
-    sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb;
-    sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb;
-    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
-    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
-    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
-    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
-    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
-    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
-    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
-    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
-    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
-    sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb;
-    sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb;
-    return sum * weight_sum_inv;
-}
-
-vec3 tex2Dblur31fast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
-{
-    //  Requires:   Same as tex2Dblur11()
-    //  Returns:    A 1D 31x Gaussian blurred texture lookup using 16 linear
+    //  Returns:    A 1D 7x Gaussian blurred texture lookup using 4 linear
     //              taps.  It may be mipmapped depending on settings and dxdy.
     //  First get the texel weights and normalization factor as above.
     const float denom_inv = 0.5/(sigma*sigma);
@@ -523,63 +455,73 @@ vec3 tex2Dblur31fast(const sampler2D tex, const vec2 tex_uv,
     const float w1 = exp(-1.0 * denom_inv);
     const float w2 = exp(-4.0 * denom_inv);
     const float w3 = exp(-9.0 * denom_inv);
-    const float w4 = exp(-16.0 * denom_inv);
-    const float w5 = exp(-25.0 * denom_inv);
-    const float w6 = exp(-36.0 * denom_inv);
-    const float w7 = exp(-49.0 * denom_inv);
-    const float w8 = exp(-64.0 * denom_inv);
-    const float w9 = exp(-81.0 * denom_inv);
-    const float w10 = exp(-100.0 * denom_inv);
-    const float w11 = exp(-121.0 * denom_inv);
-    const float w12 = exp(-144.0 * denom_inv);
-    const float w13 = exp(-169.0 * denom_inv);
-    const float w14 = exp(-196.0 * denom_inv);
-    const float w15 = exp(-225.0 * denom_inv);
-    //const float weight_sum_inv = 1.0 /
-    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 +
-    //        w9 + w10 + w11 + w12 + w13 + w14 + w15));
-    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
+    const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3));
     //  Calculate combined weights and linear sample ratios between texel pairs.
     //  The center texel (with weight w0) is used twice, so halve its weight.
-    const float w0_1 = w0 * 0.5 + w1;
-    const float w2_3 = w2 + w3;
-    const float w4_5 = w4 + w5;
-    const float w6_7 = w6 + w7;
-    const float w8_9 = w8 + w9;
-    const float w10_11 = w10 + w11;
-    const float w12_13 = w12 + w13;
-    const float w14_15 = w14 + w15;
-    const float w0_1_ratio = w1/w0_1;
-    const float w2_3_ratio = w3/w2_3;
-    const float w4_5_ratio = w5/w4_5;
-    const float w6_7_ratio = w7/w6_7;
-    const float w8_9_ratio = w9/w8_9;
-    const float w10_11_ratio = w11/w10_11;
-    const float w12_13_ratio = w13/w12_13;
-    const float w14_15_ratio = w15/w14_15;
+    const float w01 = w0 * 0.5 + w1;
+    const float w23 = w2 + w3;
+    const float w01_ratio = w1/w01;
+    const float w23_ratio = w3/w23;
     //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
-    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
-    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
-    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
-    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
-    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
-    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
-    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
-    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
-    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
-    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
-    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
-    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
-    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
-    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
-    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
-    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb;
+    sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb;
+    sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb;
+    sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb;
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur43fast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 5x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 2 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  First get the texel weights and normalization factor as above.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2));
+    //  Calculate combined weights and linear sample ratios between texel pairs.
+    const float w12 = w1 + w2;
+    const float w12_ratio = w2/w12;
+    //  Statically normalize weights, sum weighted samples, and return:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
+    sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
+    //              taps.  It may be mipmapped depending on settings and dxdy.
+    //  First get the texel weights and normalization factor as above.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0 = 1.0;
+    const float w1 = exp(-1.0 * denom_inv);
+    const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1);
+    //  Calculate combined weights and linear sample ratios between texel pairs.
+    //  The center texel (with weight w0) is used twice, so halve its weight.
+    const float w01 = w0 * 0.5 + w1;
+    const float w01_ratio = w1/w01;
+    //  Weights for all samples are the same, so just average them:
+    return 0.5 * (
+        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
+        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
+}
+
+
+////////////////////////////  HUGE SEPARABLE BLURS  ////////////////////////////
+
+//  Huge separable blurs come only in "fast" versions.
+float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   Same as tex2Dblur11()
     //  Returns:    A 1D 43x Gaussian blurred texture lookup using 22 linear
@@ -637,7 +579,7 @@ vec3 tex2Dblur43fast(const sampler2D tex, const vec2 tex_uv,
     const float w18_19_ratio = w19/w18_19;
     const float w20_21_ratio = w21/w20_21;
     //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
+    float3 sum = float3(0.0,0.0,0.0);
     sum += w20_21 * tex2D_linearize(tex, tex_uv - (20.0 + w20_21_ratio) * dxdy).rgb;
     sum += w18_19 * tex2D_linearize(tex, tex_uv - (18.0 + w18_19_ratio) * dxdy).rgb;
     sum += w16_17 * tex2D_linearize(tex, tex_uv - (16.0 + w16_17_ratio) * dxdy).rgb;
@@ -663,56 +605,11 @@ vec3 tex2Dblur43fast(const sampler2D tex, const vec2 tex_uv,
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur3fast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   Same as tex2Dblur11()
-    //  Returns:    A 1D 3x Gaussian blurred texture lookup using 2 linear
-    //              taps.  It may be mipmapped depending on settings and dxdy.
-    //  First get the texel weights and normalization factor as above.
-    const float denom_inv = 0.5/(sigma*sigma);
-    const float w0 = 1.0;
-    const float w1 = exp(-1.0 * denom_inv);
-    const float weight_sum_inv = 1.0 / (w0 + 2.0 * w1);
-    //  Calculate combined weights and linear sample ratios between texel pairs.
-    //  The center texel (with weight w0) is used twice, so halve its weight.
-    const float w01 = w0 * 0.5 + w1;
-    const float w01_ratio = w1/w01;
-    //  Weights for all samples are the same, so just average them:
-    return 0.5 * (
-        tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb +
-        tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb);
-}
-
-vec3 tex2Dblur5fast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
-{
-    //  Requires:   Same as tex2Dblur11()
-    //  Returns:    A 1D 5x Gaussian blurred texture lookup using 1 nearest
-    //              neighbor and 2 linear taps.  It may be mipmapped depending
-    //              on settings and dxdy.
-    //  First get the texel weights and normalization factor as above.
-    const float denom_inv = 0.5/(sigma*sigma);
-    const float w0 = 1.0;
-    const float w1 = exp(-1.0 * denom_inv);
-    const float w2 = exp(-4.0 * denom_inv);
-    const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2));
-    //  Calculate combined weights and linear sample ratios between texel pairs.
-    const float w12 = w1 + w2;
-    const float w12_ratio = w2/w12;
-    //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
-    sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb;
-    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
-    sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb;
-    return sum * weight_sum_inv;
-}
-
-vec3 tex2Dblur7fast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
-{
-    //  Requires:   Same as tex2Dblur11()
-    //  Returns:    A 1D 7x Gaussian blurred texture lookup using 4 linear
+    //  Returns:    A 1D 31x Gaussian blurred texture lookup using 16 linear
     //              taps.  It may be mipmapped depending on settings and dxdy.
     //  First get the texel weights and normalization factor as above.
     const float denom_inv = 0.5/(sigma*sigma);
@@ -720,26 +617,166 @@ vec3 tex2Dblur7fast(const sampler2D tex, const vec2 tex_uv,
     const float w1 = exp(-1.0 * denom_inv);
     const float w2 = exp(-4.0 * denom_inv);
     const float w3 = exp(-9.0 * denom_inv);
-    const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3));
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    const float w13 = exp(-169.0 * denom_inv);
+    const float w14 = exp(-196.0 * denom_inv);
+    const float w15 = exp(-225.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 /
+    //    (w0 + 2.0 * (w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 +
+    //        w9 + w10 + w11 + w12 + w13 + w14 + w15));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);
     //  Calculate combined weights and linear sample ratios between texel pairs.
     //  The center texel (with weight w0) is used twice, so halve its weight.
-    const float w01 = w0 * 0.5 + w1;
-    const float w23 = w2 + w3;
-    const float w01_ratio = w1/w01;
-    const float w23_ratio = w3/w23;
+    const float w0_1 = w0 * 0.5 + w1;
+    const float w2_3 = w2 + w3;
+    const float w4_5 = w4 + w5;
+    const float w6_7 = w6 + w7;
+    const float w8_9 = w8 + w9;
+    const float w10_11 = w10 + w11;
+    const float w12_13 = w12 + w13;
+    const float w14_15 = w14 + w15;
+    const float w0_1_ratio = w1/w0_1;
+    const float w2_3_ratio = w3/w2_3;
+    const float w4_5_ratio = w5/w4_5;
+    const float w6_7_ratio = w7/w6_7;
+    const float w8_9_ratio = w9/w8_9;
+    const float w10_11_ratio = w11/w10_11;
+    const float w12_13_ratio = w13/w12_13;
+    const float w14_15_ratio = w15/w14_15;
     //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
-    sum += w23 * tex2D_linearize(tex, tex_uv - (2.0 + w23_ratio) * dxdy).rgb;
-    sum += w01 * tex2D_linearize(tex, tex_uv - w01_ratio * dxdy).rgb;
-    sum += w01 * tex2D_linearize(tex, tex_uv + w01_ratio * dxdy).rgb;
-    sum += w23 * tex2D_linearize(tex, tex_uv + (2.0 + w23_ratio) * dxdy).rgb;
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w14_15 * tex2D_linearize(tex, tex_uv - (14.0 + w14_15_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv - (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv - (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv - (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv - (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv - (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv - (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv - w0_1_ratio * dxdy).rgb;
+    sum += w0_1 * tex2D_linearize(tex, tex_uv + w0_1_ratio * dxdy).rgb;
+    sum += w2_3 * tex2D_linearize(tex, tex_uv + (2.0 + w2_3_ratio) * dxdy).rgb;
+    sum += w4_5 * tex2D_linearize(tex, tex_uv + (4.0 + w4_5_ratio) * dxdy).rgb;
+    sum += w6_7 * tex2D_linearize(tex, tex_uv + (6.0 + w6_7_ratio) * dxdy).rgb;
+    sum += w8_9 * tex2D_linearize(tex, tex_uv + (8.0 + w8_9_ratio) * dxdy).rgb;
+    sum += w10_11 * tex2D_linearize(tex, tex_uv + (10.0 + w10_11_ratio) * dxdy).rgb;
+    sum += w12_13 * tex2D_linearize(tex, tex_uv + (12.0 + w12_13_ratio) * dxdy).rgb;
+    sum += w14_15 * tex2D_linearize(tex, tex_uv + (14.0 + w14_15_ratio) * dxdy).rgb;
     return sum * weight_sum_inv;
 }
 
+float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 25x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 12 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Unnormalized Gaussian texel weights: wk = exp(-k*k/(2.0*sigma*sigma)).
+    const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2)
+    const float w0 = 1.0;  //  exp(0): the center texel
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    const float w9 = exp(-81.0 * denom_inv);
+    const float w10 = exp(-100.0 * denom_inv);
+    const float w11 = exp(-121.0 * denom_inv);
+    const float w12 = exp(-144.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  NOTE(review): presumably stands in for the exact sum above; confirm
+    //  Merge each pair of neighboring texels (1-2, 3-4, ...) into a single tap.
+    const float w1_2 = w1 + w2;  //  Total weight carried by the pair's tap
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w9_10 = w9 + w10;
+    const float w11_12 = w11 + w12;
+    const float w1_2_ratio = w2/w1_2;  //  Farther texel's share of the pair weight
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    const float w9_10_ratio = w10/w9_10;
+    const float w11_12_ratio = w12/w11_12;
+    //  Tap each pair at (near offset + ratio) texels, sum, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w11_12 * tex2D_linearize(tex, tex_uv - (11.0 + w11_12_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv - (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;  //  Unpaired center texel
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w9_10 * tex2D_linearize(tex, tex_uv + (9.0 + w9_10_ratio) * dxdy).rgb;
+    sum += w11_12 * tex2D_linearize(tex, tex_uv + (11.0 + w11_12_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
+{
+    //  Requires:   Same as tex2Dblur11()
+    //  Returns:    A 1D 17x Gaussian blurred texture lookup using 1 nearest
+    //              neighbor and 8 linear taps.  It may be mipmapped depending
+    //              on settings and dxdy.
+    //  Unnormalized Gaussian texel weights: wk = exp(-k*k/(2.0*sigma*sigma)).
+    const float denom_inv = 0.5/(sigma*sigma);  //  1/(2*sigma^2)
+    const float w0 = 1.0;  //  exp(0): the center texel
+    const float w1 = exp(-1.0 * denom_inv);
+    const float w2 = exp(-4.0 * denom_inv);
+    const float w3 = exp(-9.0 * denom_inv);
+    const float w4 = exp(-16.0 * denom_inv);
+    const float w5 = exp(-25.0 * denom_inv);
+    const float w6 = exp(-36.0 * denom_inv);
+    const float w7 = exp(-49.0 * denom_inv);
+    const float w8 = exp(-64.0 * denom_inv);
+    //const float weight_sum_inv = 1.0 / (w0 + 2.0 * (
+    //    w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8));
+    const float weight_sum_inv = get_fast_gaussian_weight_sum_inv(sigma);  //  NOTE(review): presumably stands in for the exact sum above; confirm
+    //  Merge each pair of neighboring texels (1-2, 3-4, ...) into a single tap.
+    const float w1_2 = w1 + w2;  //  Total weight carried by the pair's tap
+    const float w3_4 = w3 + w4;
+    const float w5_6 = w5 + w6;
+    const float w7_8 = w7 + w8;
+    const float w1_2_ratio = w2/w1_2;  //  Farther texel's share of the pair weight
+    const float w3_4_ratio = w4/w3_4;
+    const float w5_6_ratio = w6/w5_6;
+    const float w7_8_ratio = w8/w7_8;
+    //  Tap each pair at (near offset + ratio) texels, sum, then normalize:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += w7_8 * tex2D_linearize(tex, tex_uv - (7.0 + w7_8_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv - (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv - (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w1_2 * tex2D_linearize(tex, tex_uv - (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;  //  Unpaired center texel
+    sum += w1_2 * tex2D_linearize(tex, tex_uv + (1.0 + w1_2_ratio) * dxdy).rgb;
+    sum += w3_4 * tex2D_linearize(tex, tex_uv + (3.0 + w3_4_ratio) * dxdy).rgb;
+    sum += w5_6 * tex2D_linearize(tex, tex_uv + (5.0 + w5_6_ratio) * dxdy).rgb;
+    sum += w7_8 * tex2D_linearize(tex, tex_uv + (7.0 + w7_8_ratio) * dxdy).rgb;
+    return sum * weight_sum_inv;
+}
+
+
 ////////////////////  ARBITRARILY RESIZABLE ONE-PASS BLURS  ////////////////////
 
-vec3 tex2Dblur3x3resize(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Requires:   Global requirements must be met (see file description).
     //  Returns:    A 3x3 Gaussian blurred mipmapped texture lookup of the
@@ -751,71 +788,37 @@ vec3 tex2Dblur3x3resize(const sampler2D tex, const vec2 tex_uv,
     //  Load each sample.  We need all 3x3 samples.  Quad-pixel communication
     //  won't help either: This should perform like tex2Dblur5x5, but sharing a
     //  4x4 sample field would perform more like tex2Dblur8x8shared (worse).
-    const vec2 sample4_uv = tex_uv;
-    const vec2 dx = vec2(dxdy.x, 0.0);
-    const vec2 dy = vec2(0.0, dxdy.y);
-    const vec2 sample1_uv = sample4_uv - dy;
-    const vec2 sample7_uv = sample4_uv + dy;
-    const vec3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb;
-    const vec3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
-    const vec3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb;
-    const vec3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb;
-    const vec3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
-    const vec3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb;
-    const vec3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb;
-    const vec3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
-    const vec3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb;
+    const float2 sample4_uv = tex_uv;
+    const float2 dx = float2(dxdy.x, 0.0);
+    const float2 dy = float2(0.0, dxdy.y);
+    const float2 sample1_uv = sample4_uv - dy;
+    const float2 sample7_uv = sample4_uv + dy;
+    const float3 sample0 = tex2D_linearize(tex, sample1_uv - dx).rgb;
+    const float3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
+    const float3 sample2 = tex2D_linearize(tex, sample1_uv + dx).rgb;
+    const float3 sample3 = tex2D_linearize(tex, sample4_uv - dx).rgb;
+    const float3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
+    const float3 sample5 = tex2D_linearize(tex, sample4_uv + dx).rgb;
+    const float3 sample6 = tex2D_linearize(tex, sample7_uv - dx).rgb;
+    const float3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
+    const float3 sample8 = tex2D_linearize(tex, sample7_uv + dx).rgb;
     //  Statically compute Gaussian sample weights:
     const float w4 = 1.0;
-    const float w1_3_5_7 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv);
-    const float w0_2_6_8 = exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
+    const float w1_3_5_7 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w0_2_6_8 = exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
     const float weight_sum_inv = 1.0/(w4 + 4.0 * (w1_3_5_7 + w0_2_6_8));
     //  Weight and sum the samples:
-    const vec3 sum = w4 * sample4 +
+    const float3 sum = w4 * sample4 +
         w1_3_5_7 * (sample1 + sample3 + sample5 + sample7) +
         w0_2_6_8 * (sample0 + sample2 + sample6 + sample8);
     return sum * weight_sum_inv;
 }
 
-//  Resizable one-pass blurs:
-vec3 tex2Dblur3x3resize(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur3x3resize(texture, tex_uv, dxdy, blur3_std_dev);
-}
 
-vec3 tex2Dblur9fast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
-{
-    //  Requires:   Same as tex2Dblur11()
-    //  Returns:    A 1D 9x Gaussian blurred texture lookup using 1 nearest
-    //              neighbor and 4 linear taps.  It may be mipmapped depending
-    //              on settings and dxdy.
-    //  First get the texel weights and normalization factor as above.
-    const float denom_inv = 0.5/(sigma*sigma);
-    const float w0 = 1.0;
-    const float w1 = exp(-1.0 * denom_inv);
-    const float w2 = exp(-4.0 * denom_inv);
-    const float w3 = exp(-9.0 * denom_inv);
-    const float w4 = exp(-16.0 * denom_inv);
-    const float weight_sum_inv = 1.0 / (w0 + 2.0 * (w1 + w2 + w3 + w4));
-    //  Calculate combined weights and linear sample ratios between texel pairs.
-    const float w12 = w1 + w2;
-    const float w34 = w3 + w4;
-    const float w12_ratio = w2/w12;
-    const float w34_ratio = w4/w34;
-    //  Statically normalize weights, sum weighted samples, and return:
-    vec3 sum = vec3(0.0);
-    sum += w34 * tex2D_linearize(tex, tex_uv - (3.0 + w34_ratio) * dxdy).rgb;
-    sum += w12 * tex2D_linearize(tex, tex_uv - (1.0 + w12_ratio) * dxdy).rgb;
-    sum += w0 * tex2D_linearize(tex, tex_uv).rgb;
-    sum += w12 * tex2D_linearize(tex, tex_uv + (1.0 + w12_ratio) * dxdy).rgb;
-    sum += w34 * tex2D_linearize(tex, tex_uv + (3.0 + w34_ratio) * dxdy).rgb;
-    return sum * weight_sum_inv;
-}
+////////////////////////////  FASTER ONE-PASS BLURS  ///////////////////////////
 
-vec3 tex2Dblur9x9(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Perform a 1-pass 9x9 blur with 5x5 bilinear samples.
     //  Requires:   Same as tex2Dblur9()
@@ -867,12 +870,12 @@ vec3 tex2Dblur9x9(const sampler2D tex, const vec2 tex_uv,
     const float texel3to4ratio = w4off/(w3off + w4off);
     //  Statically compute texel offsets from the fragment center to each
     //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
-    const vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0);
-    const vec2 sample2R_texel_offset = vec2(3.0, 0.0) + vec2(texel3to4ratio, 0.0);
-    const vec2 sample3d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio);
-    const vec2 sample4d_texel_offset = vec2(3.0, 1.0) + vec2(texel3to4ratio, texel1to2ratio);
-    const vec2 sample5d_texel_offset = vec2(1.0, 3.0) + vec2(texel1to2ratio, texel3to4ratio);
-    const vec2 sample6d_texel_offset = vec2(3.0, 3.0) + vec2(texel3to4ratio, texel3to4ratio);
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2R_texel_offset = float2(3.0, 0.0) + float2(texel3to4ratio, 0.0);
+    const float2 sample3d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
+    const float2 sample4d_texel_offset = float2(3.0, 1.0) + float2(texel3to4ratio, texel1to2ratio);
+    const float2 sample5d_texel_offset = float2(1.0, 3.0) + float2(texel1to2ratio, texel3to4ratio);
+    const float2 sample6d_texel_offset = float2(3.0, 3.0) + float2(texel3to4ratio, texel3to4ratio);
 
     //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
     //  Statically compute Gaussian texel weights for the bottom-right quadrant.
@@ -881,16 +884,16 @@ vec3 tex2Dblur9x9(const sampler2D tex, const vec2 tex_uv,
     const float w1R2 = w2off;
     const float w2R1 = w3off;
     const float w2R2 = w4off;
-    const float w3d1 =     exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
-    const float w3d2_3d3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
-    const float w3d4 =     exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
-    const float w4d1_5d1 = exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv);
-    const float w4d2_5d3 = exp(-LENGTH_SQ(vec2(4.0, 1.0)) * denom_inv);
-    const float w4d3_5d2 = exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv);
-    const float w4d4_5d4 = exp(-LENGTH_SQ(vec2(4.0, 2.0)) * denom_inv);
-    const float w6d1 =     exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv);
-    const float w6d2_6d3 = exp(-LENGTH_SQ(vec2(4.0, 3.0)) * denom_inv);
-    const float w6d4 =     exp(-LENGTH_SQ(vec2(4.0, 4.0)) * denom_inv);
+    const float w3d1 =     exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w3d2_3d3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w3d4 =     exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d1_5d1 = exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d2_5d3 = exp(-LENGTH_SQ(float2(4.0, 1.0)) * denom_inv);
+    const float w4d3_5d2 = exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4_5d4 = exp(-LENGTH_SQ(float2(4.0, 2.0)) * denom_inv);
+    const float w6d1 =     exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
+    const float w6d2_6d3 = exp(-LENGTH_SQ(float2(4.0, 3.0)) * denom_inv);
+    const float w6d4 =     exp(-LENGTH_SQ(float2(4.0, 4.0)) * denom_inv);
     //  Statically add texel weights in each sample to get sample weights:
     const float w0 = 1.0;
     const float w1 = w1R1 + w1R2;
@@ -905,42 +908,42 @@ vec3 tex2Dblur9x9(const sampler2D tex, const vec2 tex_uv,
 
     //  LOAD TEXTURE SAMPLES:
     //  Load all 25 samples (1 nearest, 8 linear, 16 bilinear) using symmetry:
-    const vec2 mirror_x = vec2(-1.0, 1.0);
-    const vec2 mirror_y = vec2(1.0, -1.0);
-    const vec2 mirror_xy = vec2(-1.0, -1.0);
-    const vec2 dxdy_mirror_x = dxdy * mirror_x;
-    const vec2 dxdy_mirror_y = dxdy * mirror_y;
-    const vec2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
     //  Sampling order doesn't seem to affect performance, so just be clear:
-    const vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
-    const vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
-    const vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
-    const vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
-    const vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
-    const vec3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
-    const vec3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
-    const vec3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
-    const vec3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
-    const vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
-    const vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
-    const vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
-    const vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
-    const vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
-    const vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
-    const vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
-    const vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
-    const vec3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
-    const vec3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
-    const vec3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
-    const vec3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
-    const vec3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
-    const vec3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
-    const vec3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
-    const vec3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2R = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2D = tex2D_linearize(tex, tex_uv + dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample2L = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset).rgb;
+    const float3 sample2U = tex2D_linearize(tex, tex_uv - dxdy * sample2R_texel_offset.yx).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample5d = tex2D_linearize(tex, tex_uv + dxdy * sample5d_texel_offset).rgb;
+    const float3 sample5c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample5d_texel_offset).rgb;
+    const float3 sample5b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample5d_texel_offset).rgb;
+    const float3 sample5a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample5d_texel_offset).rgb;
+    const float3 sample6d = tex2D_linearize(tex, tex_uv + dxdy * sample6d_texel_offset).rgb;
+    const float3 sample6c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample6d_texel_offset).rgb;
+    const float3 sample6b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample6d_texel_offset).rgb;
+    const float3 sample6a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample6d_texel_offset).rgb;
 
     //  SUM WEIGHTED SAMPLES:
     //  Statically normalize weights (so total = 1.0), and sum weighted samples.
-    vec3 sum = w0 * sample0C;
+    float3 sum = w0 * sample0C;
     sum += w1 * (sample1R + sample1D + sample1L + sample1U);
     sum += w2 * (sample2R + sample2D + sample2L + sample2U);
     sum += w3 * (sample3d + sample3c + sample3b + sample3a);
@@ -950,8 +953,8 @@ vec3 tex2Dblur9x9(const sampler2D tex, const vec2 tex_uv,
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur7x7(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Perform a 1-pass 7x7 blur with 5x5 bilinear samples.
     //  Requires:   Same as tex2Dblur9()
@@ -987,24 +990,24 @@ vec3 tex2Dblur7x7(const sampler2D tex, const vec2 tex_uv,
     const float texel2to3ratio = w3off/(w2off + w3off);
     //  Statically compute texel offsets from the fragment center to each
     //  bilinear sample in the bottom-right quadrant, including axis-aligned:
-    const vec2 sample1d_texel_offset = vec2(texel0to1ratio, texel0to1ratio);
-    const vec2 sample2d_texel_offset = vec2(2.0, 0.0) + vec2(texel2to3ratio, texel0to1ratio);
-    const vec2 sample3d_texel_offset = vec2(0.0, 2.0) + vec2(texel0to1ratio, texel2to3ratio);
-    const vec2 sample4d_texel_offset = vec2(2.0, 2.0) + vec2(texel2to3ratio, texel2to3ratio);
+    const float2 sample1d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
+    const float2 sample2d_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample3d_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4d_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
 
     //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
     //  Statically compute Gaussian texel weights for the bottom-right quadrant.
     //  Read underscores as "and."
     const float w1abcd = 1.0;
-    const float w1bd2_1cd3 = exp(-LENGTH_SQ(vec2(1.0, 0.0)) * denom_inv);
-    const float w2bd1_3cd1 = exp(-LENGTH_SQ(vec2(2.0, 0.0)) * denom_inv);
-    const float w2bd2_3cd2 = exp(-LENGTH_SQ(vec2(3.0, 0.0)) * denom_inv);
-    const float w1d4 =       exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
-    const float w2d3_3d2 =   exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
-    const float w2d4_3d4 =   exp(-LENGTH_SQ(vec2(3.0, 1.0)) * denom_inv);
-    const float w4d1 =       exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
-    const float w4d2_4d3 =   exp(-LENGTH_SQ(vec2(3.0, 2.0)) * denom_inv);
-    const float w4d4 =       exp(-LENGTH_SQ(vec2(3.0, 3.0)) * denom_inv);
+    const float w1bd2_1cd3 = exp(-LENGTH_SQ(float2(1.0, 0.0)) * denom_inv);
+    const float w2bd1_3cd1 = exp(-LENGTH_SQ(float2(2.0, 0.0)) * denom_inv);
+    const float w2bd2_3cd2 = exp(-LENGTH_SQ(float2(3.0, 0.0)) * denom_inv);
+    const float w1d4 =       exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d3_3d2 =   exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4_3d4 =   exp(-LENGTH_SQ(float2(3.0, 1.0)) * denom_inv);
+    const float w4d1 =       exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
+    const float w4d2_4d3 =   exp(-LENGTH_SQ(float2(3.0, 2.0)) * denom_inv);
+    const float w4d4 =       exp(-LENGTH_SQ(float2(3.0, 3.0)) * denom_inv);
     //  Statically add texel weights in each sample to get sample weights.
     //  Split weights for shared texels between samples sharing them:
     const float w1 = w1abcd * 0.25 + w1bd2_1cd3 + w1d4;
@@ -1016,32 +1019,32 @@ vec3 tex2Dblur7x7(const sampler2D tex, const vec2 tex_uv,
 
     //  LOAD TEXTURE SAMPLES:
     //  Load all 16 samples using symmetry:
-    const vec2 mirror_x = vec2(-1.0, 1.0);
-    const vec2 mirror_y = vec2(1.0, -1.0);
-    const vec2 mirror_xy = vec2(-1.0, -1.0);
-    const vec2 dxdy_mirror_x = dxdy * mirror_x;
-    const vec2 dxdy_mirror_y = dxdy * mirror_y;
-    const vec2 dxdy_mirror_xy = dxdy * mirror_xy;
-    const vec3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
-    const vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
-    const vec3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
-    const vec3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
-    const vec3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
-    const vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
-    const vec3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
-    const vec3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
-    const vec3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
-    const vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
-    const vec3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
-    const vec3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
-    const vec3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
-    const vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
-    const vec3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
-    const vec3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample1a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample1d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    const float3 sample3a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample3d_texel_offset).rgb;
+    const float3 sample4a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample4d_texel_offset).rgb;
+    const float3 sample1b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample1d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample3b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample3d_texel_offset).rgb;
+    const float3 sample4b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample4d_texel_offset).rgb;
+    const float3 sample1c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample1d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample3c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample3d_texel_offset).rgb;
+    const float3 sample4c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample4d_texel_offset).rgb;
+    const float3 sample1d = tex2D_linearize(tex, tex_uv + dxdy * sample1d_texel_offset).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample3d = tex2D_linearize(tex, tex_uv + dxdy * sample3d_texel_offset).rgb;
+    const float3 sample4d = tex2D_linearize(tex, tex_uv + dxdy * sample4d_texel_offset).rgb;
 
     //  SUM WEIGHTED SAMPLES:
     //  Statically normalize weights (so total = 1.0), and sum weighted samples.
-    vec3 sum = vec3(0.0);
+    float3 sum = float3(0.0,0.0,0.0);
     sum += w1 * (sample1a + sample1b + sample1c + sample1d);
     sum += w2_3 * (sample2a + sample2b + sample2c + sample2d);
     sum += w2_3 * (sample3a + sample3b + sample3c + sample3d);
@@ -1049,8 +1052,8 @@ vec3 tex2Dblur7x7(const sampler2D tex, const vec2 tex_uv,
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur5x5(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Perform a 1-pass 5x5 blur with 3x3 bilinear samples.
     //  Requires:   Same as tex2Dblur9()
@@ -1077,17 +1080,17 @@ vec3 tex2Dblur5x5(const sampler2D tex, const vec2 tex_uv,
     const float texel1to2ratio = w2off/(w1off + w2off);
     //  Statically compute texel offsets from the fragment center to each
     //  bilinear sample in the bottom-right quadrant, including x-axis-aligned:
-    const vec2 sample1R_texel_offset = vec2(1.0, 0.0) + vec2(texel1to2ratio, 0.0);
-    const vec2 sample2d_texel_offset = vec2(1.0, 1.0) + vec2(texel1to2ratio, texel1to2ratio);
+    const float2 sample1R_texel_offset = float2(1.0, 0.0) + float2(texel1to2ratio, 0.0);
+    const float2 sample2d_texel_offset = float2(1.0, 1.0) + float2(texel1to2ratio, texel1to2ratio);
 
     //  CALCULATE KERNEL WEIGHTS FOR ALL SAMPLES:
     //  Statically compute Gaussian texel weights for the bottom-right quadrant.
     //  Read underscores as "and."
     const float w1R1 = w1off;
     const float w1R2 = w2off;
-    const float w2d1 =   exp(-LENGTH_SQ(vec2(1.0, 1.0)) * denom_inv);
-    const float w2d2_3 = exp(-LENGTH_SQ(vec2(2.0, 1.0)) * denom_inv);
-    const float w2d4 =   exp(-LENGTH_SQ(vec2(2.0, 2.0)) * denom_inv);
+    const float w2d1 =   exp(-LENGTH_SQ(float2(1.0, 1.0)) * denom_inv);
+    const float w2d2_3 = exp(-LENGTH_SQ(float2(2.0, 1.0)) * denom_inv);
+    const float w2d4 =   exp(-LENGTH_SQ(float2(2.0, 2.0)) * denom_inv);
     //  Statically add texel weights in each sample to get sample weights:
     const float w0 = 1.0;
     const float w1 = w1R1 + w1R2;
@@ -1097,32 +1100,32 @@ vec3 tex2Dblur5x5(const sampler2D tex, const vec2 tex_uv,
 
     //  LOAD TEXTURE SAMPLES:
     //  Load all 9 samples (1 nearest, 4 linear, 4 bilinear) using symmetry:
-    const vec2 mirror_x = vec2(-1.0, 1.0);
-    const vec2 mirror_y = vec2(1.0, -1.0);
-    const vec2 mirror_xy = vec2(-1.0, -1.0);
-    const vec2 dxdy_mirror_x = dxdy * mirror_x;
-    const vec2 dxdy_mirror_y = dxdy * mirror_y;
-    const vec2 dxdy_mirror_xy = dxdy * mirror_xy;
-    const vec3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
-    const vec3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
-    const vec3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
-    const vec3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
-    const vec3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
-    const vec3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
-    const vec3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
-    const vec3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
-    const vec3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample0C = tex2D_linearize(tex, tex_uv).rgb;
+    const float3 sample1R = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1D = tex2D_linearize(tex, tex_uv + dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample1L = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset).rgb;
+    const float3 sample1U = tex2D_linearize(tex, tex_uv - dxdy * sample1R_texel_offset.yx).rgb;
+    const float3 sample2d = tex2D_linearize(tex, tex_uv + dxdy * sample2d_texel_offset).rgb;
+    const float3 sample2c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample2d_texel_offset).rgb;
+    const float3 sample2b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample2d_texel_offset).rgb;
+    const float3 sample2a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample2d_texel_offset).rgb;
 
     //  SUM WEIGHTED SAMPLES:
     //  Statically normalize weights (so total = 1.0), and sum weighted samples.
-    vec3 sum = w0 * sample0C;
+    float3 sum = w0 * sample0C;
     sum += w1 * (sample1R + sample1D + sample1L + sample1U);
     sum += w2 * (sample2a + sample2b + sample2c + sample2d);
     return sum * weight_sum_inv;
 }
 
-vec3 tex2Dblur3x3(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy, const float sigma)
+float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy, const float sigma)
 {
     //  Perform a 1-pass 3x3 blur with 5x5 bilinear samples.
     //  Requires:   Same as tex2Dblur9()
@@ -1148,130 +1151,766 @@ vec3 tex2Dblur3x3(const sampler2D tex, const vec2 tex_uv,
     const float texel0to1ratio = w1off/(w0off * 0.5 + w1off);
     //  Statically compute texel offsets from the fragment center to each
     //  bilinear sample in the bottom-right quadrant, including axis-aligned:
-    const vec2 sample0d_texel_offset = vec2(texel0to1ratio, texel0to1ratio);
+    const float2 sample0d_texel_offset = float2(texel0to1ratio, texel0to1ratio);
 
     //  LOAD TEXTURE SAMPLES:
     //  Load all 4 samples using symmetry:
-    const vec2 mirror_x = vec2(-1.0, 1.0);
-    const vec2 mirror_y = vec2(1.0, -1.0);
-    const vec2 mirror_xy = vec2(-1.0, -1.0);
-    const vec2 dxdy_mirror_x = dxdy * mirror_x;
-    const vec2 dxdy_mirror_y = dxdy * mirror_y;
-    const vec2 dxdy_mirror_xy = dxdy * mirror_xy;
-    const vec3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb;
-    const vec3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb;
-    const vec3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb;
-    const vec3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb;
+    const float2 mirror_x = float2(-1.0, 1.0);
+    const float2 mirror_y = float2(1.0, -1.0);
+    const float2 mirror_xy = float2(-1.0, -1.0);
+    const float2 dxdy_mirror_x = dxdy * mirror_x;
+    const float2 dxdy_mirror_y = dxdy * mirror_y;
+    const float2 dxdy_mirror_xy = dxdy * mirror_xy;
+    const float3 sample0a = tex2D_linearize(tex, tex_uv + dxdy_mirror_xy * sample0d_texel_offset).rgb;
+    const float3 sample0b = tex2D_linearize(tex, tex_uv + dxdy_mirror_y * sample0d_texel_offset).rgb;
+    const float3 sample0c = tex2D_linearize(tex, tex_uv + dxdy_mirror_x * sample0d_texel_offset).rgb;
+    const float3 sample0d = tex2D_linearize(tex, tex_uv + dxdy * sample0d_texel_offset).rgb;
 
     //  SUM WEIGHTED SAMPLES:
     //  Weights for all samples are the same, so just average them:
     return 0.25 * (sample0a + sample0b + sample0c + sample0d);
 }
 
-vec3 tex2Dblur9fast(const sampler2D tex, const vec2 tex_uv,
-    const vec2 dxdy)
+
+//////////////////  LINEAR ONE-PASS BLURS WITH SHARED SAMPLES  /////////////////
+
+float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   1.) Same as tex2Dblur9()
+    //              2.) ddx() and ddy() are present in the current Cg profile.
+    //              3.) The GPU driver is using fine/high-quality derivatives.
+    //              4.) quad_vector *correctly* describes the current fragment's
+    //                  location in its pixel quad, by the conventions noted in
+    //                  get_quad_vector[_naive].
+    //              5.) tex_uv.w = log2(IN.video_size/IN.output_size).y
+    //              6.) tex2Dlod() is present in the current Cg profile.
+    //  Optional:   Tune artifacts vs. excessive blurriness with the global
+    //              float error_blurring.
+    //  Returns:    A blurred texture lookup using a "virtual" 12x12 Gaussian
+    //              blur (a 6x6 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  Perform a 1-pass blur with shared texture lookups across a pixel quad.
+    //  We'll get neighboring samples with high-quality ddx/ddy derivatives, as
+    //  in GPU Pro 2, Chapter VI.2, "Shader Amortization using Pixel Quad
+    //  Message Passing" by Eric Penner.
+    //
+    //  For our "virtual" 12x12 blur, each fragment loads (6^2)/4 + 3 = 12
+    //  bilinear samples, where bilinear sampling positions are computed from
+    //  the relative Gaussian weights of the 4 surrounding texels.  The catch is
+    //  that the appropriate texel weights and sample coords differ for each
+    //  fragment, but we're reusing most of the same samples across a quad of
+    //  destination fragments.  (We do use unique coords for the four nearest
+    //  samples at each fragment.)  Mixing bilinear filtering and sample-sharing
+    //  therefore introduces some error into the weights, and this can get nasty
+    //  when the source image is small or high-frequency.  Computing bilinear
+    //  ratios based on weights at the sample field center results in sharpening
+    //  and ringing artifacts, but we can move samples closer to halfway between
+    //  texels to try blurring away the error (which can move features around by
+    //  a texel or so).  Tune this with the global float "error_blurring".
+    //
+    //  The pixel quad's sample field covers 12x12 texels, accessed through 6x6
+    //  bilinear (2x2 texel) taps.  Each fragment depends on a window of 10x10
+    //  texels (5x5 bilinear taps), and each fragment is responsible for loading
+    //  a 6x6 texel quadrant as a 3x3 block of bilinear taps, plus 3 more taps
+    //  to use unique bilinear coords for sample0* for each fragment.  This
+    //  diagram illustrates the relative locations of bilinear samples 1-9 for
+    //  each quadrant a, b, c, d (note samples will not be equally spaced):
+    //      8a 7a 6a 6b 7b 8b
+    //      5a 4a 3a 3b 4b 5b
+    //      2a 1a 0a 0b 1b 2b
+    //      2c 1c 0c 0d 1d 2d
+    //      5c 4c 3c 3d 4d 5d
+    //      8c 7c 6c 6d 7d 8d
+    //  The following diagram illustrates the underlying equally spaced texels,
+    //  named after the sample that accesses them and subnamed by their location
+    //  within their 2x2 texel block:
+    //      8a3 8a2 7a3 7a2 6a3 6a2 6b2 6b3 7b2 7b3 8b2 8b3
+    //      8a1 8a0 7a1 7a0 6a1 6a0 6b0 6b1 7b0 7b1 8b0 8b1
+    //      5a3 5a2 4a3 4a2 3a3 3a2 3b2 3b3 4b2 4b3 5b2 5b3
+    //      5a1 5a0 4a1 4a0 3a1 3a0 3b0 3b1 4b0 4b1 5b0 5b1
+    //      2a3 2a2 1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3 2b2 2b3
+    //      2a1 2a0 1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1 2b0 2b1
+    //      2c1 2c0 1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1 2d0 2d1
+    //      2c3 2c2 1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3 2d2 2d3
+    //      5c1 5c0 4c1 4c0 3c1 3c0 3d0 3d1 4d0 4d1 5d0 5d1
+    //      5c3 5c2 4c3 4c2 3c3 3c2 3d2 3d3 4d2 4d3 5d2 5d3
+    //      8c1 8c0 7c1 7c0 6c1 6c0 6d0 6d1 7d0 7d1 8d0 8d1
+    //      8c3 8c2 7c3 7c2 6c3 6c2 6d2 6d3 7d2 7d3 8d2 8d3
+    //  With this symmetric arrangement, we don't have to know which absolute
+    //  quadrant a sample lies in to assign kernel weights; it's enough to know
+    //  the sample number and the relative quadrant of the sample (relative to
+    //  the current quadrant):
+    //      {current, adjacent x, adjacent y, diagonal}
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute sampling offsets within each 2x2 texel block, based
+    //  on appropriate 1D Gaussian sampling ratio between texels [0, 1], [2, 3],
+    //  and [4, 5] away from the fragment, and reuse them independently for both
+    //  dimensions.  Use the sample field center as the estimated destination,
+    //  but nudge the result closer to halfway between texels to blur error.
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  based on the sum of their 4 underlying texel weights.  Assume a same-
+    //  resolution blur, so each symmetrically named sample weight will compute
+    //  the same at every fragment in the pixel quad: We can therefore compute
+    //  texel weights based only on the bottom-right quadrant (fragment at 0d0).
+    //  To avoid too much boilerplate code, use a macro to get all 4 texel
+    //  weights for a bilinear sample based on the offset of its top-left texel:
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w8diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -6.0);
+    const float w7diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -6.0);
+    const float w6diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -6.0);
+    const float w6adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -6.0);
+    const float w7adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -6.0);
+    const float w8adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -6.0);
+    const float w5diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -4.0);
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-6.0, -2.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 0.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w5adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 2.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w8adjx = GET_TEXEL_QUAD_WEIGHTS(-6.0, 4.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w5 = float4(w5curr, w5adjx, w5adjy, w5diag);
+    const float4 w6 = float4(w6curr, w6adjx, w6adjy, w6diag);
+    const float4 w7 = float4(w7curr, w7adjx, w7adjy, w7diag);
+    const float4 w8 = float4(w8curr, w8adjx, w8adjy, w8diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    float3 sample8adjx, sample8adjy, sample8diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    quad_gather(quad_vector, sample8curr, sample8adjx, sample8adjy, sample8diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    sum += mul(w5, float4x3(sample5curr, sample5adjx, sample5adjy, sample5diag));
+    sum += mul(w6, float4x3(sample6curr, sample6adjx, sample6adjy, sample6diag));
+    sum += mul(w7, float4x3(sample7curr, sample7adjx, sample7adjy, sample7diag));
+    sum += mul(w8, float4x3(sample8curr, sample8adjx, sample8adjy, sample8diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 10x10 Gaussian
+    //              blur (a 5x5 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 25 of the 36 samples taken across the pixel quad (to cover a
+    //  5x5 sample area, or 10x10 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 11 omitted samples
+    //  are always the "same:"
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float w4_5off = exp(-(4.5*4.5) * denom_inv);
+    const float w5_5off = exp(-(5.5*5.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    const float texel4to5ratio = lerp(w5_5off/(w4_5off + w5_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(4.0, 0.0) + float2(texel4to5ratio, texel0to1ratio);
+    const float2 sample3_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample4_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+    const float2 sample5_texel_offset = float2(4.0, 2.0) + float2(texel4to5ratio, texel2to3ratio);
+    const float2 sample6_texel_offset = float2(0.0, 4.0) + float2(texel0to1ratio, texel4to5ratio);
+    const float2 sample7_texel_offset = float2(2.0, 4.0) + float2(texel2to3ratio, texel4to5ratio);
+    const float2 sample8_texel_offset = float2(4.0, 4.0) + float2(texel4to5ratio, texel4to5ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 25 of the 36 sample weights.  Skip the following weights:
+    //      8adjx, 2adjx, 5adjx,
+    //      6adjy, 7adjy, 8adjy,
+    //      2diag, 5diag, 6diag, 7diag, 8diag
+    const float w4diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w4adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w5adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(4.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 0.0);
+    const float w4adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w4curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    const float w5curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 2.0);
+    const float w7adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 4.0);
+    const float w6adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 4.0);
+    const float w6curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 4.0);
+    const float w7curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 4.0);
+    const float w8curr = GET_TEXEL_QUAD_WEIGHTS(4.0, 4.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w4curr + w5curr + w6curr + w7curr + w8curr +
+        w0adjx + w1adjx + w3adjx + w4adjx + w6adjx + w7adjx +
+        w0adjy + w1adjy + w2adjy + w3adjy + w4adjy + w5adjy +
+        w0diag + w1diag + w3diag + w4diag);
+    //  Statically pack most weights for runtime.  Note the mixed packing:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    const float4 w4 = float4(w4curr, w4adjx, w4adjy, w4diag);
+    const float4 w2and5 = float4(w2curr, w2adjy, w5curr, w5adjy);
+    const float4 w6and7 = float4(w6curr, w6adjx, w7curr, w7adjx);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+    const float3 sample4curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample4_texel_offset)).rgb;
+    const float3 sample5curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample5_texel_offset)).rgb;
+    const float3 sample6curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample6_texel_offset)).rgb;
+    const float3 sample7curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample7_texel_offset)).rgb;
+    const float3 sample8curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample8_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad in order of need:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    float3 sample4adjx, sample4adjy, sample4diag;
+    float3 sample5adjx, sample5adjy, sample5diag;
+    float3 sample6adjx, sample6adjy, sample6diag;
+    float3 sample7adjx, sample7adjy, sample7diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    quad_gather(quad_vector, sample4curr, sample4adjx, sample4adjy, sample4diag);
+    quad_gather(quad_vector, sample5curr, sample5adjx, sample5adjy, sample5diag);
+    quad_gather(quad_vector, sample6curr, sample6adjx, sample6adjy, sample6diag);
+    quad_gather(quad_vector, sample7curr, sample7adjx, sample7adjy, sample7diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result.  First do the simple ones:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    sum += mul(w4, float4x3(sample4curr, sample4adjx, sample4adjy, sample4diag));
+    //  Now do the mixed-sample ones:
+    sum += mul(w2and5, float4x3(sample2curr, sample2adjy, sample5curr, sample5adjy));
+    sum += mul(w6and7, float4x3(sample6curr, sample6adjx, sample7curr, sample7adjx));
+    sum += w8curr * sample8curr;
+    //  Normalize the sum (so the weights add to 1.0) and return:
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 8x8 Gaussian
+    //              blur (a 4x4 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur12x12shared().  This function
+    //  shares the same concept and a similar sample placement, except each
+    //  quadrant contains 4x4 texels and 2x2 samples instead of 6x6 and 3x3
+    //  respectively.  There could be a total of 16 samples, 4 of which each
+    //  fragment is responsible for, but each fragment loads 0a/0b/0c/0d with
+    //  its own offset to reduce shared sample artifacts, bringing the sample
+    //  count for each fragment to 7.  Sample placement:
+    //      3a 2a 2b 3b
+    //      1a 0a 0b 1b
+    //      1c 0c 0d 1d
+    //      3c 2c 2d 3d
+    //  Texel placement:
+    //      3a3 3a2 2a3 2a2 2b2 2b3 3b2 3b3
+    //      3a1 3a0 2a1 2a0 2b0 2b1 3b0 3b1
+    //      1a3 1a2 0a3 0a2 0b2 0b3 1b2 1b3
+    //      1a1 1a0 0a1 0a0 0b0 0b1 1b0 1b1
+    //      1c1 1c0 0c1 0c0 0d0 0d1 1d0 1d1
+    //      1c3 1c2 0c3 0c2 0d2 0d3 1d2 1d3
+    //      3c1 3c0 2c1 2c0 2d0 2d1 3d0 3d1
+    //      3c3 3c2 2c3 2c2 2d2 2d3 3d2 3d3
+    
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    const float w3diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -4.0);
+    const float w2diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -4.0);
+    const float w2adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -4.0);
+    const float w3adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -4.0);
+    const float w1diag = GET_TEXEL_QUAD_WEIGHTS(-4.0, -2.0);
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w1adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 0.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w3adjx = GET_TEXEL_QUAD_WEIGHTS(-4.0, 2.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Statically pack weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+    const float4 w1 = float4(w1curr, w1adjx, w1adjy, w1diag);
+    const float4 w2 = float4(w2curr, w2adjx, w2adjy, w2diag);
+    const float4 w3 = float4(w3curr, w3adjx, w3adjy, w3diag);
+    //  Get the weight sum inverse (normalization factor):
+    const float4 weight_sum4 = w0 + w1 + w2 + w3;
+    const float2 weight_sum2 = weight_sum4.xy + weight_sum4.zw;
+    const float weight_sum = weight_sum2.x + weight_sum2.y;
+    const float weight_sum_inv = 1.0/(weight_sum);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    float3 sample3adjx, sample3adjy, sample3diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    quad_gather(quad_vector, sample3curr, sample3adjx, sample3adjy, sample3diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += mul(w1, float4x3(sample1curr, sample1adjx, sample1adjy, sample1diag));
+    sum += mul(w2, float4x3(sample2curr, sample2adjx, sample2adjy, sample2diag));
+    sum += mul(w3, float4x3(sample3curr, sample3adjx, sample3adjy, sample3diag));
+    return sum * weight_sum_inv;
+}
+
+float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector,
+    const float sigma)
+{
+    //  Perform a 1-pass mipmapped blur with shared samples across a pixel quad.
+    //  Requires:   Same as tex2Dblur12x12shared()
+    //  Returns:    A blurred texture lookup using a "virtual" 6x6 Gaussian
+    //              blur (a 3x3 blur of carefully selected bilinear samples)
+    //              of the given mip level.  There will be subtle inaccuracies,
+    //              especially for small or high-frequency detailed sources.
+    //  Description:
+    //  First see the description for tex2Dblur8x8shared().  This
+    //  function shares the same concept and sample placement, but each fragment
+    //  only uses 9 of the 16 samples taken across the pixel quad (to cover a
+    //  3x3 sample area, or 6x6 texel area), and it uses a lower standard
+    //  deviation to compensate.  Thanks to symmetry, the 7 omitted samples
+    //  are always the "same:"
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+
+    //  COMPUTE COORDS FOR TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Statically compute bilinear sampling offsets (details in tex2Dblur12x12shared).
+    const float denom_inv = 0.5/(sigma*sigma);
+    const float w0off   = 1.0;
+    const float w0_5off = exp(-(0.5*0.5) * denom_inv);
+    const float w1off   = exp(-(1.0*1.0) * denom_inv);
+    const float w1_5off = exp(-(1.5*1.5) * denom_inv);
+    const float w2off   = exp(-(2.0*2.0) * denom_inv);
+    const float w2_5off = exp(-(2.5*2.5) * denom_inv);
+    const float w3_5off = exp(-(3.5*3.5) * denom_inv);
+    const float texel0to1ratio = lerp(w1_5off/(w0_5off + w1_5off), 0.5, error_blurring);
+    const float texel2to3ratio = lerp(w3_5off/(w2_5off + w3_5off), 0.5, error_blurring);
+    //  We don't share sample0*, so use the nearest destination fragment:
+    const float texel0to1ratio_nearest = w1off/(w0off + w1off);
+    const float texel1to2ratio_nearest = w2off/(w1off + w2off);
+    //  Statically compute texel offsets from the bottom-right fragment to each
+    //  bilinear sample in the bottom-right quadrant:
+    const float2 sample0curr_texel_offset = float2(0.0, 0.0) + float2(texel0to1ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjx_texel_offset = float2(-1.0, 0.0) + float2(-texel1to2ratio_nearest, texel0to1ratio_nearest);
+    const float2 sample0adjy_texel_offset = float2(0.0, -1.0) + float2(texel0to1ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample0diag_texel_offset = float2(-1.0, -1.0) + float2(-texel1to2ratio_nearest, -texel1to2ratio_nearest);
+    const float2 sample1_texel_offset = float2(2.0, 0.0) + float2(texel2to3ratio, texel0to1ratio);
+    const float2 sample2_texel_offset = float2(0.0, 2.0) + float2(texel0to1ratio, texel2to3ratio);
+    const float2 sample3_texel_offset = float2(2.0, 2.0) + float2(texel2to3ratio, texel2to3ratio);
+
+    //  CALCULATE KERNEL WEIGHTS:
+    //  Statically compute bilinear sample weights at each destination fragment
+    //  from the sum of their 4 texel weights (details in tex2Dblur12x12shared).
+    #define GET_TEXEL_QUAD_WEIGHTS(xoff, yoff) \
+        (exp(-LENGTH_SQ(float2(xoff, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff, yoff + 1.0)) * denom_inv) + \
+        exp(-LENGTH_SQ(float2(xoff + 1.0, yoff + 1.0)) * denom_inv))
+    //  We only need 9 of the 16 sample weights.  Skip the following weights:
+    //      1adjx, 3adjx
+    //      2adjy, 3adjy
+    //      1diag, 2diag, 3diag
+    const float w0diag = GET_TEXEL_QUAD_WEIGHTS(-2.0, -2.0);
+    const float w0adjy = GET_TEXEL_QUAD_WEIGHTS(0.0, -2.0);
+    const float w1adjy = GET_TEXEL_QUAD_WEIGHTS(2.0, -2.0);
+    const float w0adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 0.0);
+    const float w0curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 0.0);
+    const float w1curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 0.0);
+    const float w2adjx = GET_TEXEL_QUAD_WEIGHTS(-2.0, 2.0);
+    const float w2curr = GET_TEXEL_QUAD_WEIGHTS(0.0, 2.0);
+    const float w3curr = GET_TEXEL_QUAD_WEIGHTS(2.0, 2.0);
+    #undef GET_TEXEL_QUAD_WEIGHTS
+    //  Get the weight sum inverse (normalization factor):
+    const float weight_sum_inv = 1.0/(w0curr + w1curr + w2curr + w3curr +
+        w0adjx + w2adjx + w0adjy + w1adjy + w0diag);
+    //  Statically pack some weights for runtime:
+    const float4 w0 = float4(w0curr, w0adjx, w0adjy, w0diag);
+
+    //  LOAD TEXTURE SAMPLES THIS FRAGMENT IS RESPONSIBLE FOR:
+    //  Get a uv vector from texel 0q0 of this quadrant to texel 0q3:
+    const float2 dxdy_curr = dxdy * quad_vector.xy;
+    //  Load bilinear samples for the current quadrant (for this fragment):
+    const float3 sample0curr = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0curr_texel_offset).rgb;
+    const float3 sample0adjx = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjx_texel_offset).rgb;
+    const float3 sample0adjy = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0adjy_texel_offset).rgb;
+    const float3 sample0diag = tex2D_linearize(tex, tex_uv.xy + dxdy_curr * sample0diag_texel_offset).rgb;
+    const float3 sample1curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample1_texel_offset)).rgb;
+    const float3 sample2curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample2_texel_offset)).rgb;
+    const float3 sample3curr = tex2Dlod_linearize(tex, tex_uv + uv2_to_uv4(dxdy_curr * sample3_texel_offset)).rgb;
+
+    //  GATHER NEIGHBORING SAMPLES AND SUM WEIGHTED SAMPLES:
+    //  Fetch the samples from other fragments in the 2x2 quad:
+    float3 sample1adjx, sample1adjy, sample1diag;
+    float3 sample2adjx, sample2adjy, sample2diag;
+    quad_gather(quad_vector, sample1curr, sample1adjx, sample1adjy, sample1diag);
+    quad_gather(quad_vector, sample2curr, sample2adjx, sample2adjy, sample2diag);
+    //  Statically normalize weights (so total = 1.0), and sum weighted samples.
+    //  Fill each row of a matrix with an rgb sample and pre-multiply by the
+    //  weights to obtain a weighted result for sample0*, and handle the rest
+    //  of the weights more directly/verbosely:
+    float3 sum = float3(0.0,0.0,0.0);
+    sum += mul(w0, float4x3(sample0curr, sample0adjx, sample0adjy, sample0diag));
+    sum += w1curr * sample1curr + w1adjy * sample1adjy + w2curr * sample2curr +
+            w2adjx * sample2adjx + w3curr * sample3curr;
+    return sum * weight_sum_inv;
+}
+
+
+///////////////////////  MAX OPTIMAL SIGMA BLUR WRAPPERS  //////////////////////
+
+//  The following wrappers call the same-named sigma-taking blurs above with the
+//  matching blur*_std_dev value.  HOPEFULLY, the compiler will constant-fold.
+
+//  Resizable separable blurs:
+inline float3 tex2Dblur11resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur11resize(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9resize(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7resize(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5resize(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Fast separable blurs:
+inline float3 tex2Dblur11fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur11fast(tex, tex_uv, dxdy, blur11_std_dev);
+}
+inline float3 tex2Dblur9fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
 {
     return tex2Dblur9fast(tex, tex_uv, dxdy, blur9_std_dev);
 }
-
-vec3 tex2Dblur17fast(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
+inline float3 tex2Dblur7fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
 {
-    return tex2Dblur17fast(texture, tex_uv, dxdy, blur17_std_dev);
+    return tex2Dblur7fast(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5fast(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3fast(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  Huge, "fast" separable blurs:
+inline float3 tex2Dblur43fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur43fast(tex, tex_uv, dxdy, blur43_std_dev);
+}
+inline float3 tex2Dblur31fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur31fast(tex, tex_uv, dxdy, blur31_std_dev);
+}
+inline float3 tex2Dblur25fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur25fast(tex, tex_uv, dxdy, blur25_std_dev);
+}
+inline float3 tex2Dblur17fast(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur17fast(tex, tex_uv, dxdy, blur17_std_dev);
+}
+//  Resizable one-pass blurs:
+inline float3 tex2Dblur3x3resize(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3x3resize(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" one-pass blurs:
+inline float3 tex2Dblur9x9(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur9x9(tex, tex_uv, dxdy, blur9_std_dev);
+}
+inline float3 tex2Dblur7x7(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur7x7(tex, tex_uv, dxdy, blur7_std_dev);
+}
+inline float3 tex2Dblur5x5(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur5x5(tex, tex_uv, dxdy, blur5_std_dev);
+}
+inline float3 tex2Dblur3x3(const sampler2D tex, const float2 tex_uv,
+    const float2 dxdy)
+{
+    return tex2Dblur3x3(tex, tex_uv, dxdy, blur3_std_dev);
+}
+//  "Fast" shared-sample one-pass blurs:
+inline float3 tex2Dblur12x12shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur12x12shared(tex, tex_uv, dxdy, quad_vector, blur12_std_dev);
+}
+inline float3 tex2Dblur10x10shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur10x10shared(tex, tex_uv, dxdy, quad_vector, blur10_std_dev);
+}
+inline float3 tex2Dblur8x8shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur8x8shared(tex, tex_uv, dxdy, quad_vector, blur8_std_dev);
+}
+inline float3 tex2Dblur6x6shared(const sampler2D tex,
+    const float4 tex_uv, const float2 dxdy, const float4 quad_vector)
+{
+    return tex2Dblur6x6shared(tex, tex_uv, dxdy, quad_vector, blur6_std_dev);
 }
 
-vec3 tex2Dblur25fast(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur25fast(texture, tex_uv, dxdy, blur25_std_dev);
-}
 
-vec3 tex2Dblur43fast(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur43fast(texture, tex_uv, dxdy, blur43_std_dev);
-}
-vec3 tex2Dblur31fast(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur31fast(texture, tex_uv, dxdy, blur31_std_dev);
-}
+#endif  //  BLUR_FUNCTIONS_H
 
-vec3 tex2Dblur3fast(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur3fast(texture, tex_uv, dxdy, blur3_std_dev);
-}
-
-vec3 tex2Dblur3x3(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur3x3(texture, tex_uv, dxdy, blur3_std_dev);
-}
-
-vec3 tex2Dblur5fast(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur5fast(texture, tex_uv, dxdy, blur5_std_dev);
-}
-
-vec3 tex2Dblur5resize(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur5resize(texture, tex_uv, dxdy, blur5_std_dev);
-}
-vec3 tex2Dblur3resize(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur3resize(texture, tex_uv, dxdy, blur3_std_dev);
-}
-
-vec3 tex2Dblur5x5(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur5x5(texture, tex_uv, dxdy, blur5_std_dev);
-}
-
-vec3 tex2Dblur7resize(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur7resize(texture, tex_uv, dxdy, blur7_std_dev);
-}
-
-vec3 tex2Dblur7fast(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur7fast(texture, tex_uv, dxdy, blur7_std_dev);
-}
-
-vec3 tex2Dblur7x7(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur7x7(texture, tex_uv, dxdy, blur7_std_dev);
-}
-
-vec3 tex2Dblur9resize(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur9resize(texture, tex_uv, dxdy, blur9_std_dev);
-}
-
-vec3 tex2Dblur9x9(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur9x9(texture, tex_uv, dxdy, blur9_std_dev);
-}
-
-vec3 tex2Dblur11resize(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur11resize(texture, tex_uv, dxdy, blur11_std_dev);
-}
-
-vec3 tex2Dblur11fast(const sampler2D texture, const vec2 tex_uv,
-    const vec2 dxdy)
-{
-    return tex2Dblur11fast(texture, tex_uv, dxdy, blur11_std_dev);
-}
-
-#endif  //  BLUR_FUNCTIONS_H
\ No newline at end of file
diff --git a/include/compat_macros.inc b/include/compat_macros.inc
new file mode 100644
index 0000000..fd9dba7
--- /dev/null
+++ b/include/compat_macros.inc
@@ -0,0 +1,28 @@
+// Compatibility macros that map HLSL/Cg spellings onto their GLSL equivalents.
+#define mul(a,b) ((b)*(a))  // operands swapped for GLSL; args parenthesized so compound operands bind correctly
+#define lerp(a,b,c) mix(a,b,c)
+#define saturate(c) clamp(c, 0.0, 1.0)
+#define frac(x) (fract(x))
+#define float2 vec2
+#define float3 vec3
+#define float4 vec4
+#define bool2 bvec2
+#define bool3 bvec3
+#define bool4 bvec4
+#define float2x2 mat2x2
+#define float3x3 mat3x3
+#define float4x4 mat4x4
+#define float4x3 mat4x3
+#define float2x4 mat2x4
+#define IN params
+#define texture_size SourceSize.xy
+#define video_size SourceSize.xy
+#define output_size OutputSize.xy
+#define frame_count FrameCount
+#define static  
+#define inline  
+#define fmod(x,y) mod(x,y)  // NOTE(review): GLSL mod() floors while HLSL fmod() truncates; results differ for negative x
+#define ddx(c) dFdx(c)
+#define ddy(c) dFdy(c)
+#define atan2(y,x) atan(y,x)  // HLSL atan2(y,x) and GLSL atan(y,x) share argument order; do not swap
+#define rsqrt(c) inversesqrt(c)
\ No newline at end of file
diff --git a/include/gamma-management.h b/include/gamma-management.h
index a89bc2a..424290a 100644
--- a/include/gamma-management.h
+++ b/include/gamma-management.h
@@ -1,13 +1,138 @@
 #ifndef GAMMA_MANAGEMENT_H
 #define GAMMA_MANAGEMENT_H
 
+/////////////////////////////////  MIT LICENSE  ////////////////////////////////
+
+//  Copyright (C) 2014 TroggleMonkey
+//
+//  Permission is hereby granted, free of charge, to any person obtaining a copy
+//  of this software and associated documentation files (the "Software"), to
+//  deal in the Software without restriction, including without limitation the
+//  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+//  sell copies of the Software, and to permit persons to whom the Software is
+//  furnished to do so, subject to the following conditions:
+//  
+//  The above copyright notice and this permission notice shall be included in
+//  all copies or substantial portions of the Software.
+//
+//  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+//  IN THE SOFTWARE.
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides gamma-aware tex*D*() and encode_output() functions.
+//  Requires:   Before #include-ing this file, the including file must #define
+//              the following macros when applicable and follow their rules:
+//              1.) #define FIRST_PASS if this is the first pass.
+//              2.) #define LAST_PASS if this is the last pass.
+//              3.) If sRGB is available, set srgb_framebufferN = "true" for
+//                  every pass except the last in your .cgp preset.
+//              4.) If sRGB isn't available but you want gamma-correctness with
+//                  no banding, #define GAMMA_ENCODE_EVERY_FBO each pass.
+//              5.) #define SIMULATE_CRT_ON_LCD if desired (precedence over 6-8)
+//              6.) #define SIMULATE_GBA_ON_LCD if desired (precedence over 7-8)
+//              7.) #define SIMULATE_LCD_ON_CRT if desired (precedence over 8)
+//              8.) #define SIMULATE_GBA_ON_CRT if desired (precedence over -)
+//              If an option in [5, 8] is #defined in the first or last pass, it
+//              should be #defined for both.  It shouldn't make a difference
+//              whether it's #defined for intermediate passes or not.
+//  Optional:   The including file (or an earlier included file) may optionally
+//              #define a number of macros indicating it will override certain
+//              static constants with either static or uniform constants.  The
+//              macros and associated constants are as follows:
+//              1.) OVERRIDE_STANDARD_GAMMA: The user must first define:
+//                  static const float ntsc_gamma
+//                  static const float pal_gamma
+//                  static const float crt_reference_gamma_high
+//                  static const float crt_reference_gamma_low
+//                  static const float lcd_reference_gamma
+//                  static const float crt_office_gamma
+//                  static const float lcd_office_gamma
+//              2.) OVERRIDE_DEVICE_GAMMA: The user must first define:
+//                  static const float crt_gamma
+//                  static const float gba_gamma
+//                  static const float lcd_gamma
+//              3.) OVERRIDE_FINAL_GAMMA: The user must first define:
+//                  static const float input_gamma
+//                  static const float intermediate_gamma
+//                  static const float output_gamma
+//                  (intermediate_gamma is for GAMMA_ENCODE_EVERY_FBO.)
+//              4.) OVERRIDE_ALPHA_ASSUMPTIONS: The user must first define:
+//                  static const bool assume_opaque_alpha
+//              The gamma constant overrides must be used in every pass or none,
+//              and OVERRIDE_FINAL_GAMMA bypasses all of the SIMULATE* macros.
+//              OVERRIDE_ALPHA_ASSUMPTIONS may be set on a per-pass basis.
+//  Usage:      After setting macros appropriately, ignore gamma correction and
+//              replace all tex*D*() calls with equivalent gamma-aware
+//              tex*D*_linearize calls, except:
+//              1.) When you read an LUT, use regular tex*D or a gamma-specified
+//                  function, depending on its gamma encoding:
+//                      tex*D*_linearize_gamma (takes a runtime gamma parameter)
+//              2.) If you must read pass0's original input in a later pass, use
+//                  tex2D_linearize_ntsc_gamma.  If you want to read pass0's
+//                  input with gamma-corrected bilinear filtering, consider
+//                  creating a first linearizing pass and reading from the input
+//                  of pass1 later.
+//              Then, return encode_output(color) from every fragment shader.
+//              Finally, use the global gamma_aware_bilinear boolean if you want
+//              to statically branch based on whether bilinear filtering is
+//              gamma-correct or not (e.g. for placing Gaussian blur samples).
+//
+//  Detailed Policy:
+//  tex*D*_linearize() functions enforce a consistent gamma-management policy
+//  based on the FIRST_PASS and GAMMA_ENCODE_EVERY_FBO settings.  They assume
+//  their input texture has the same encoding characteristics as the input for
+//  the current pass (which doesn't apply to the exceptions listed above).
+//  Similarly, encode_output() enforces a policy based on the LAST_PASS and
+//  GAMMA_ENCODE_EVERY_FBO settings.  Together, they result in one of the
+//  following two pipelines.
+//  Typical pipeline with intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = linear_color;     //  Automatic sRGB encoding
+//      linear_color = intermediate_output;     //  Automatic sRGB decoding
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Typical pipeline without intermediate sRGB framebuffers:
+//      linear_color = pow(pass0_encoded_color, input_gamma);
+//      intermediate_output = pow(linear_color, 1.0/intermediate_gamma);
+//      linear_color = pow(intermediate_output, intermediate_gamma);
+//      final_output = pow(linear_color, 1.0/output_gamma);
+//  Using GAMMA_ENCODE_EVERY_FBO is much slower, but it's provided as a way to
+//  easily get gamma-correctness without banding on devices where sRGB isn't
+//  supported.
+//
+//  Use This Header to Maximize Code Reuse:
+//  The purpose of this header is to provide a consistent interface for texture
+//  reads and output gamma-encoding that localizes and abstracts away all the
+//  annoying details.  This greatly reduces the amount of code in each shader
+//  pass that depends on the pass number in the .cgp preset or whether sRGB
+//  FBO's are being used: You can trivially change the gamma behavior of your
+//  whole pass by commenting or uncommenting 1-3 #defines.  To reuse the same
+//  code in your first, Nth, and last passes, you can even put it all in another
+//  header file and #include it from skeleton .cg files that #define the
+//  appropriate pass-specific settings.
+//
+//  Rationale for Using Three Macros:
+//  This file uses GAMMA_ENCODE_EVERY_FBO instead of an opposite macro like
+//  SRGB_PIPELINE to ensure sRGB is assumed by default, which hopefully imposes
+//  a lower maintenance burden on each pass.  At first glance it seems we could
+//  accomplish everything with two macros: GAMMA_CORRECT_IN / GAMMA_CORRECT_OUT.
+//  This works for simple use cases where input_gamma == output_gamma, but it
+//  breaks down for more complex scenarios like CRT simulation, where the pass
+//  number determines the gamma encoding of the input and output.
+
+
 ///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
 
 //  Set standard gamma constants, but allow users to override them:
 #ifndef OVERRIDE_STANDARD_GAMMA
     //  Standard encoding gammas:
-    const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
-    const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    static const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    static const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
     //  Typical device decoding gammas (only use for emulating devices):
     //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
     //  gammas: The standards purposely undercorrected for an analog CRT's
@@ -19,17 +144,17 @@
     //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
     //  displays designed to view sRGB in bright environments.  (Standards are
     //  also in flux again with BT.1886, but it's underspecified for displays.)
-    const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
-    const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
-    const float lcd_reference_gamma = 2.5;       //  To match CRT
-    const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
-    const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+    static const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    static const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    static const float lcd_reference_gamma = 2.5;       //  To match CRT
+    static const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    static const float lcd_office_gamma = 2.2;  //  Approximates sRGB
 #endif  //  OVERRIDE_STANDARD_GAMMA
 
 //  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
 //  but only if they're aware of it.
 #ifndef OVERRIDE_ALPHA_ASSUMPTIONS
-    bool assume_opaque_alpha = false;
+    static const bool assume_opaque_alpha = false;
 #endif
 
 
@@ -43,90 +168,99 @@
 //  Set device gamma constants, but allow users to override them:
 #ifdef OVERRIDE_DEVICE_GAMMA
     //  The user promises to globally define the appropriate constants:
-    float get_crt_gamma()    {   return crt_gamma;   }
-    float get_gba_gamma()    {   return gba_gamma;   }
-    float get_lcd_gamma()    {   return lcd_gamma;   }
+    inline float get_crt_gamma()    {   return crt_gamma;   }
+    inline float get_gba_gamma()    {   return gba_gamma;   }
+    inline float get_lcd_gamma()    {   return lcd_gamma;   }
 #else
-    float get_crt_gamma()    {   return crt_reference_gamma_high;    }
-    float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
-    float get_lcd_gamma()    {   return lcd_office_gamma;            }
+    inline float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    inline float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    inline float get_lcd_gamma()    {   return lcd_office_gamma;            }
 #endif  //  OVERRIDE_DEVICE_GAMMA
 
 //  Set decoding/encoding gammas for the first/lass passes, but allow overrides:
 #ifdef OVERRIDE_FINAL_GAMMA
     //  The user promises to globally define the appropriate constants:
-    float get_intermediate_gamma()   {   return intermediate_gamma;  }
-    float get_input_gamma()          {   return input_gamma;         }
-    float get_output_gamma()         {   return output_gamma;        }
+    inline float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    inline float get_input_gamma()          {   return input_gamma;         }
+    inline float get_output_gamma()         {   return output_gamma;        }
 #else
     //  If we gamma-correct every pass, always use ntsc_gamma between passes to
     //  ensure middle passes don't need to care if anything is being simulated:
-    float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    inline float get_intermediate_gamma()   {   return ntsc_gamma;          }
     #ifdef SIMULATE_CRT_ON_LCD
-        float get_input_gamma()      {   return get_crt_gamma();     }
-        float get_output_gamma()     {   return get_lcd_gamma();     }
+        inline float get_input_gamma()      {   return get_crt_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
     #else
     #ifdef SIMULATE_GBA_ON_LCD
-        float get_input_gamma()      {   return get_gba_gamma();     }
-        float get_output_gamma()     {   return get_lcd_gamma();     }
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_lcd_gamma();     }
     #else
     #ifdef SIMULATE_LCD_ON_CRT
-        float get_input_gamma()      {   return get_lcd_gamma();     }
-        float get_output_gamma()     {   return get_crt_gamma();     }
+        inline float get_input_gamma()      {   return get_lcd_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
     #else
     #ifdef SIMULATE_GBA_ON_CRT
-        float get_input_gamma()      {   return get_gba_gamma();     }
-        float get_output_gamma()     {   return get_crt_gamma();     }
+        inline float get_input_gamma()      {   return get_gba_gamma();     }
+        inline float get_output_gamma()     {   return get_crt_gamma();     }
     #else   //  Don't simulate anything:
-        float get_input_gamma()      {   return ntsc_gamma;          }
-        float get_output_gamma()     {   return ntsc_gamma;          }
+        inline float get_input_gamma()      {   return ntsc_gamma;          }
+        inline float get_output_gamma()     {   return ntsc_gamma;          }
     #endif  //  SIMULATE_GBA_ON_CRT
     #endif  //  SIMULATE_LCD_ON_CRT
     #endif  //  SIMULATE_GBA_ON_LCD
     #endif  //  SIMULATE_CRT_ON_LCD
 #endif  //  OVERRIDE_FINAL_GAMMA
 
+//  Set decoding/encoding gammas for the current pass.  Use static constants for
+//  linearize_input and gamma_encode_output, because they aren't derived, and
+//  they let the compiler do dead-code elimination.
 #ifndef GAMMA_ENCODE_EVERY_FBO
     #ifdef FIRST_PASS
-        bool linearize_input = true;
-        float get_pass_input_gamma()     {   return get_input_gamma();   }
+        static const bool linearize_input = true;
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
     #else
-        bool linearize_input = false;
-        float get_pass_input_gamma()     {   return 1.0;                 }
+        static const bool linearize_input = false;
+        inline float get_pass_input_gamma()     {   return 1.0;                 }
     #endif
     #ifdef LAST_PASS
-        bool gamma_encode_output = true;
-        float get_pass_output_gamma()    {   return get_output_gamma();  }
+        static const bool gamma_encode_output = true;
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
     #else
-        bool gamma_encode_output = false;
-        float get_pass_output_gamma()    {   return 1.0;                 }
+        static const bool gamma_encode_output = false;
+        inline float get_pass_output_gamma()    {   return 1.0;                 }
     #endif
 #else
-    bool linearize_input = true;
-    bool gamma_encode_output = true;
+    static const bool linearize_input = true;
+    static const bool gamma_encode_output = true;
     #ifdef FIRST_PASS
-        float get_pass_input_gamma()     {   return get_input_gamma();   }
+        inline float get_pass_input_gamma()     {   return get_input_gamma();   }
     #else
-        float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+        inline float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
     #endif
     #ifdef LAST_PASS
-        float get_pass_output_gamma()    {   return get_output_gamma();  }
+        inline float get_pass_output_gamma()    {   return get_output_gamma();  }
     #else
-        float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+        inline float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
     #endif
 #endif
 
-vec4 decode_input(const vec4 color)
+//  Users might want to know if bilinear filtering will be gamma-correct:
+static const bool gamma_aware_bilinear = !linearize_input;
+
+
+//////////////////////  COLOR ENCODING/DECODING FUNCTIONS  /////////////////////
+
+inline float4 encode_output(const float4 color)
 {
-    if(linearize_input = true)
+    if(gamma_encode_output)
     {
-        if(assume_opaque_alpha = true)
+        if(assume_opaque_alpha)
         {
-            return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), 1.0);
+            return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), 1.0);
         }
         else
         {
-            return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), color.a);
+            return float4(pow(color.rgb, float3(1.0/get_pass_output_gamma())), color.a);
         }
     }
     else
@@ -135,17 +269,17 @@ vec4 decode_input(const vec4 color)
     }
 }
 
-vec4 encode_output(const vec4 color)
+inline float4 decode_input(const float4 color)
 {
-    if(gamma_encode_output = true)
+    if(linearize_input)
     {
-        if(assume_opaque_alpha = true)
+        if(assume_opaque_alpha)
         {
-            return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), 1.0);
+            return float4(pow(color.rgb, float3(get_pass_input_gamma())), 1.0);
         }
         else
         {
-            return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), color.a);
+            return float4(pow(color.rgb, float3(get_pass_input_gamma())), color.a);
         }
     }
     else
@@ -154,12 +288,259 @@ vec4 encode_output(const vec4 color)
     }
 }
 
+inline float4 decode_gamma_input(const float4 color, const float3 gamma)
+{
+    //  Raise color.rgb to the caller-supplied per-channel gamma.  Alpha is
+    //  replaced with 1.0 only when assume_opaque_alpha is set.
+    const float3 decoded_rgb = pow(color.rgb, gamma);
+    if(assume_opaque_alpha)
+    {
+        return float4(decoded_rgb, 1.0);
+    }
+    return float4(decoded_rgb, color.a);
+}
+
+//  TODO/FIXME: Replacing the tex2D_linearize() lookup wrapper functions with this macro fixes the blurs being offset, but the root cause is still unknown.
 #define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
-//vec4 tex2D_linearize(const sampler2D tex, const vec2 tex_coords)
-//{   return decode_input(vec4(texture(tex, tex_coords)));   }
 
-//#define tex2D_linearize(C, D, E) decode_input(vec4(texture(C, D, E)))
-//vec4 tex2D_linearize(const sampler2D tex, const vec2 tex_coords, const int texel_off)
-//{   return decode_input(vec4(texture(tex, tex_coords, texel_off)));    }
+///////////////////////////  TEXTURE LOOKUP WRAPPERS  //////////////////////////
+
+//  "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a wide array of linearizing texture lookup wrapper functions.  The
+//  Cg shader spec Retroarch uses only allows for 2D textures, but 1D and 3D
+//  lookups are provided for completeness in case that changes someday.  Nobody
+//  is likely to use the *fetch and *proj functions, but they're included just
+//  in case.  The only tex*D texture sampling functions omitted are:
+//      - tex*Dcmpbias
+//      - tex*Dcmplod
+//      - tex*DARRAY*
+//      - tex*DMS*
+//      - Variants returning integers
+//  Standard line length restrictions are ignored below for vertical brevity.
+/*
+//  tex1D:
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1D(tex, tex_coords));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+inline float4 tex1D_linearize(const sampler1D tex, const float2 tex_coords, const float dx, const float dy, const int texel_off)
+{   return decode_input(tex1D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex1Dbias:
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dbias(tex, tex_coords));   }
+
+inline float4 tex1Dbias_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dbias(tex, tex_coords, texel_off));    }
+
+//  tex1Dfetch:
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords)
+{   return decode_input(tex1Dfetch(tex, tex_coords));  }
+
+inline float4 tex1Dfetch_linearize(const sampler1D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex1Dlod:
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords)
+{   return decode_input(tex1Dlod(tex, tex_coords));    }
+
+inline float4 tex1Dlod_linearize(const sampler1D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex1Dlod(tex, tex_coords, texel_off));     }
+
+//  tex1Dproj:
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords)
+{   return decode_input(tex1Dproj(tex, tex_coords));   }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex1Dproj_linearize(const sampler1D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex1Dproj(tex, tex_coords, texel_off));    }
+/////////*
+//  tex2D:
+inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords)
+{   return decode_input(texture(tex, tex_coords));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(texture(tex, tex_coords.xy));   }
+
+inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+{   return decode_input(textureLod(tex, tex_coords, texel_off));    }
+
+inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(textureLod(tex, tex_coords.xy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(texture(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy)
+//{   return decode_input(texture(tex, tex_coords, dx, dy));   }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(texture(tex, tex_coords, dx, dy, texel_off));    }
+
+//inline float4 tex2D_linearize(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off)
+//{   return decode_input(texture(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex2Dbias:
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords)
+//{   return decode_input(tex2Dbias(tex, tex_coords));   }
+
+//inline float4 tex2Dbias_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dbias(tex, tex_coords, texel_off));    }
+
+//  tex2Dfetch:
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords)
+//{   return decode_input(tex2Dfetch(tex, tex_coords));  }
+
+//inline float4 tex2Dfetch_linearize(const sampler2D tex, const int4 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dfetch(tex, tex_coords, texel_off));   }
+*/
+//  tex2Dlod: Sample at an explicit mip level taken from tex_coords.w (as in Cg's tex2Dlod), then decode with decode_input().
+inline float4 tex2Dlod_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(textureLod(tex, tex_coords.xy, tex_coords.w));    }
+
+inline float4 tex2Dlod_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)   //  LOD comes from tex_coords.w; texel_off is ignored (GLSL texel offsets must be constant expressions)
+{   return decode_input(textureLod(tex, tex_coords.xy, tex_coords.w));     }
+/*
+//  tex2Dproj:
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords)
+{   return decode_input(tex2Dproj(tex, tex_coords));   }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+
+inline float4 tex2Dproj_linearize(const sampler2D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex2Dproj(tex, tex_coords, texel_off));    }
+*/
+/*
+//  tex3D:
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords)
+{   return decode_input(tex3D(tex, tex_coords));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, texel_off));    }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy));   }
+
+inline float4 tex3D_linearize(const sampler3D tex, const float3 tex_coords, const float3 dx, const float3 dy, const int texel_off)
+{   return decode_input(tex3D(tex, tex_coords, dx, dy, texel_off));    }
+
+//  tex3Dbias:
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dbias(tex, tex_coords));   }
+
+inline float4 tex3Dbias_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dbias(tex, tex_coords, texel_off));    }
+
+//  tex3Dfetch:
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords)
+{   return decode_input(tex3Dfetch(tex, tex_coords));  }
+
+inline float4 tex3Dfetch_linearize(const sampler3D tex, const int4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dfetch(tex, tex_coords, texel_off));   }
+
+//  tex3Dlod:
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dlod(tex, tex_coords));    }
+
+inline float4 tex3Dlod_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dlod(tex, tex_coords, texel_off));     }
+
+//  tex3Dproj:
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords)
+{   return decode_input(tex3Dproj(tex, tex_coords));   }
+
+inline float4 tex3Dproj_linearize(const sampler3D tex, const float4 tex_coords, const int texel_off)
+{   return decode_input(tex3Dproj(tex, tex_coords, texel_off));    }
+/////////*
+
+//  NONSTANDARD "SMART" LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  This narrow selection of nonstandard tex2D* functions can be useful:
+
+//  tex2Dlod0: Automatically fill in the tex2D LOD parameter for mip level 0.
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0)));   }
+
+//inline float4 tex2Dlod0_linearize(const sampler2D tex, const float2 tex_coords, const int texel_off)
+//{   return decode_input(tex2Dlod(tex, float4(tex_coords, 0.0, 0.0), texel_off));    }
+
+
+//  MANUALLY LINEARIZING TEXTURE LOOKUP FUNCTIONS:
+//  Provide a narrower selection of tex2D* wrapper functions that decode an
+//  input sample with a specified gamma value.  These are useful for reading
+//  LUT's and for reading the input of pass0 in a later pass.
+
+//  tex2D:
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float3 gamma)
+{   return decode_gamma_input(texture(tex, tex_coords), gamma);   }
+
+inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float3 gamma)
+{   return decode_gamma_input(texture(tex, tex_coords.xy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(texture(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(texture(tex, tex_coords, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(texture(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const float3 gamma)
+//{   return decode_gamma_input(texture(tex, tex_coords, dx, dy), gamma);   }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float2 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(texture(tex, tex_coords, dx, dy, texel_off), gamma);    }
+
+//inline float4 tex2D_linearize_gamma(const sampler2D tex, const float3 tex_coords, const float2 dx, const float2 dy, const int texel_off, const float3 gamma)
+//{   return decode_gamma_input(texture(tex, tex_coords, dx, dy, texel_off), gamma);    }
+/*
+//  tex2Dbias:
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords), gamma);   }
+
+inline float4 tex2Dbias_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dbias(tex, tex_coords, texel_off), gamma);    }
+
+//  tex2Dfetch:
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords), gamma);  }
+
+inline float4 tex2Dfetch_linearize_gamma(const sampler2D tex, const int4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(tex2Dfetch(tex, tex_coords, texel_off), gamma);   }
+/////////*
+//  tex2Dlod:
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, const float4 tex_coords, const float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, 0.0), gamma);    }
+
+inline float4 tex2Dlod_linearize_gamma(const sampler2D tex, const float4 tex_coords, const int texel_off, const float3 gamma)
+{   return decode_gamma_input(textureLod(tex, tex_coords.xy, texel_off), gamma);     }
+*/
+
+#endif  //  GAMMA_MANAGEMENT_H
 
-#endif  //  GAMMA_MANAGEMENT_H
\ No newline at end of file
diff --git a/include/quad-pixel-communication.h b/include/quad-pixel-communication.h
index 4c3f1cb..c8ffca4 100644
--- a/include/quad-pixel-communication.h
+++ b/include/quad-pixel-communication.h
@@ -47,7 +47,7 @@
 
 /////////////////////  QUAD-PIXEL COMMUNICATION PRIMITIVES  ////////////////////
 
-vec4 get_quad_vector_naive(const vec4 output_pixel_num_wrt_uvxy)
+float4 get_quad_vector_naive(const float4 output_pixel_num_wrt_uvxy)
 {
     //  Requires:   Two measures of the current fragment's output pixel number
     //              in the range ([0, IN.output_size.x), [0, IN.output_size.y)):
@@ -62,33 +62,33 @@ vec4 get_quad_vector_naive(const vec4 output_pixel_num_wrt_uvxy)
     //              2.) The .zw components are its 2x2 placement with respect to
     //                  screen xy direction (IN.position); the origin varies.
     //                  quad_gather needs this measure to work correctly.
-    //              Note: quad_vector.zw = quad_vector.xy * vec2(
+    //              Note: quad_vector.zw = quad_vector.xy * float2(
     //                      ddx(output_pixel_num_wrt_uvxy.x),
     //                      ddy(output_pixel_num_wrt_uvxy.y));
     //  Caveats:    This function assumes the GPU driver always starts 2x2 pixel
     //              quads at even pixel numbers.  This assumption can be wrong
     //              for odd output resolutions (nondeterministically so).
-    const vec4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0;
-    const vec4 quad_vector = pixel_odd * 2.0 - vec4(1.0);
+    const float4 pixel_odd = frac(output_pixel_num_wrt_uvxy * 0.5) * 2.0;
+    const float4 quad_vector = pixel_odd * 2.0 - float4(1.0);
     return quad_vector;
 }
 
-vec4 get_quad_vector(const vec4 output_pixel_num_wrt_uvxy)
+float4 get_quad_vector(const float4 output_pixel_num_wrt_uvxy)
 {
     //  Requires:   Same as get_quad_vector_naive() (see that first).
     //  Returns:    Same as get_quad_vector_naive() (see that first), but it's
     //              correct even if the 2x2 pixel quad starts at an odd pixel,
     //              which can occur at odd resolutions.
-    const vec4 quad_vector_guess =
+    const float4 quad_vector_guess =
         get_quad_vector_naive(output_pixel_num_wrt_uvxy);
     //  If quad_vector_guess.zw doesn't increase with screen xy, we know
     //  the 2x2 pixel quad starts at an odd pixel:
-    const vec2 odd_start_mirror = 0.5 * vec2(ddx(quad_vector_guess.z),
+    const float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_guess.z),
                                                 ddy(quad_vector_guess.w));
     return quad_vector_guess * odd_start_mirror.xyxy;
 }
 
-vec4 get_quad_vector(const vec2 output_pixel_num_wrt_uv)
+float4 get_quad_vector(const float2 output_pixel_num_wrt_uv)
 {
     //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
     //              2.) output_pixel_num_wrt_uv must increase with uv coords and
@@ -98,25 +98,25 @@ vec4 get_quad_vector(const vec2 output_pixel_num_wrt_uv)
     //              correct even if the 2x2 pixel quad starts at an odd pixel,
     //              which can occur at odd resolutions.
     //  Caveats:    This function requires less information than the version
-    //              taking a vec4, but it's potentially slower.
+    //              taking a float4, but it's potentially slower.
     //  Do screen coords increase with or against uv?  Get the direction
     //  with respect to (uv.x, uv.y) for (screen.x, screen.y) in {-1, 1}.
-    const vec2 screen_uv_mirror = vec2(ddx(output_pixel_num_wrt_uv.x),
+    const float2 screen_uv_mirror = float2(ddx(output_pixel_num_wrt_uv.x),
                                         ddy(output_pixel_num_wrt_uv.y));
-    const vec2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
-    const vec2 quad_vector_uv_guess = (pixel_odd_wrt_uv - vec2(0.5)) * 2.0;
-    const vec2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror;
+    const float2 pixel_odd_wrt_uv = frac(output_pixel_num_wrt_uv * 0.5) * 2.0;
+    const float2 quad_vector_uv_guess = (pixel_odd_wrt_uv - float2(0.5)) * 2.0;
+    const float2 quad_vector_screen_guess = quad_vector_uv_guess * screen_uv_mirror;
     //  If quad_vector_screen_guess doesn't increase with screen xy, we know
     //  the 2x2 pixel quad starts at an odd pixel:
-    const vec2 odd_start_mirror = 0.5 * vec2(ddx(quad_vector_screen_guess.x),
+    const float2 odd_start_mirror = 0.5 * float2(ddx(quad_vector_screen_guess.x),
                                                 ddy(quad_vector_screen_guess.y));
-    const vec4 quad_vector_guess = vec4(
+    const float4 quad_vector_guess = float4(
         quad_vector_uv_guess, quad_vector_screen_guess);
     return quad_vector_guess * odd_start_mirror.xyxy;
 }
 
-void quad_gather(const vec4 quad_vector, const vec4 curr,
-    out vec4 adjx, out vec4 adjy, out vec4 diag)
+void quad_gather(const float4 quad_vector, const float4 curr,
+    out float4 adjx, out float4 adjy, out float4 diag)
 {
     //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
     //              2.) The GPU driver is using fine/high-quality derivatives.
@@ -130,70 +130,70 @@ void quad_gather(const vec4 quad_vector, const vec4 curr,
     diag = adjx - ddy(adjx) * quad_vector.w;
 }
 
-void quad_gather(const vec4 quad_vector, const vec3 curr,
-    out vec3 adjx, out vec3 adjy, out vec3 diag)
+void quad_gather(const float4 quad_vector, const float3 curr,
+    out float3 adjx, out float3 adjy, out float3 diag)
 {
-    //  vec3 version
+    //  Float3 version
     adjx = curr - ddx(curr) * quad_vector.z;
     adjy = curr - ddy(curr) * quad_vector.w;
     diag = adjx - ddy(adjx) * quad_vector.w;
 }
 
-void quad_gather(const vec4 quad_vector, const vec2 curr,
-    out vec2 adjx, out vec2 adjy, out vec2 diag)
+void quad_gather(const float4 quad_vector, const float2 curr,
+    out float2 adjx, out float2 adjy, out float2 diag)
 {
-    //  vec2 version
+    //  Float2 version
     adjx = curr - ddx(curr) * quad_vector.z;
     adjy = curr - ddy(curr) * quad_vector.w;
     diag = adjx - ddy(adjx) * quad_vector.w;
 }
 
-vec4 quad_gather(const vec4 quad_vector, const float curr)
+float4 quad_gather(const float4 quad_vector, const float curr)
 {
     //  Float version:
     //  Returns:    return.x == current
     //              return.y == adjacent x
     //              return.z == adjacent y
     //              return.w == diagonal
-    vec4 all = vec4(curr);
+    float4 all = float4(curr);
     all.y = all.x - ddx(all.x) * quad_vector.z;
     all.zw = all.xy - ddy(all.xy) * quad_vector.w;
     return all;
 }
 
-vec4 quad_gather_sum(const vec4 quad_vector, const vec4 curr)
+float4 quad_gather_sum(const float4 quad_vector, const float4 curr)
 {
     //  Requires:   Same as quad_gather()
     //  Returns:    Sum of an input vector (curr) at all fragments in a quad.
-    vec4 adjx, adjy, diag;
+    float4 adjx, adjy, diag;
     quad_gather(quad_vector, curr, adjx, adjy, diag);
     return (curr + adjx + adjy + diag);
 }
 
-vec3 quad_gather_sum(const vec4 quad_vector, const vec3 curr)
+float3 quad_gather_sum(const float4 quad_vector, const float3 curr)
 {
-    //  vec3 version:
-    vec3 adjx, adjy, diag;
+    //  Float3 version:
+    float3 adjx, adjy, diag;
     quad_gather(quad_vector, curr, adjx, adjy, diag);
     return (curr + adjx + adjy + diag);
 }
 
-vec2 quad_gather_sum(const vec4 quad_vector, const vec2 curr)
+float2 quad_gather_sum(const float4 quad_vector, const float2 curr)
 {
-    //  vec2 version:
-    vec2 adjx, adjy, diag;
+    //  Float2 version:
+    float2 adjx, adjy, diag;
     quad_gather(quad_vector, curr, adjx, adjy, diag);
     return (curr + adjx + adjy + diag);
 }
 
-float quad_gather_sum(const vec4 quad_vector, const float curr)
+float quad_gather_sum(const float4 quad_vector, const float curr)
 {
     //  Float version:
-    const vec4 all_values = quad_gather(quad_vector, curr);
+    const float4 all_values = quad_gather(quad_vector, curr);
     return (all_values.x + all_values.y + all_values.z + all_values.w);
 }
 
-bool fine_derivatives_working(const vec4 quad_vector, vec4 curr)
+bool fine_derivatives_working(const float4 quad_vector, float4 curr)
 {
     //  Requires:   1.) ddx() and ddy() are present in the current Cg profile.
     //              2.) quad_vector describes the current fragment's location in
@@ -206,19 +206,19 @@ bool fine_derivatives_working(const vec4 quad_vector, vec4 curr)
     //  Method:     We can confirm fine derivatives are used if the following
     //              holds (ever, for any value at any fragment):
     //                  (ddy(curr) != ddy(adjx)) or (ddx(curr) != ddx(adjy))
-    //              The more values we test (e.g. test a vec4 two ways), the
+    //              The more values we test (e.g. test a float4 two ways), the
     //              easier it is to demonstrate fine derivatives are working.
     //  TODO: Check for floating point exact comparison issues!
-    vec4 ddx_curr = ddx(curr);
-    vec4 ddy_curr = ddy(curr);
-    vec4 adjx = curr - ddx_curr * quad_vector.z;
-    vec4 adjy = curr - ddy_curr * quad_vector.w;
-    bool ddy_different = any(ddy_curr != ddy(adjx));
-    bool ddx_different = any(ddx_curr != ddx(adjy));
+    float4 ddx_curr = ddx(curr);
+    float4 ddy_curr = ddy(curr);
+    float4 adjx = curr - ddx_curr * quad_vector.z;
+    float4 adjy = curr - ddy_curr * quad_vector.w;
+    bool ddy_different = any(bool4(ddy_curr.x != ddy(adjx).x, ddy_curr.y != ddy(adjx).y, ddy_curr.z != ddy(adjx).z, ddy_curr.w != ddy(adjx).w));
+    bool ddx_different = any(bool4(ddx_curr.x != ddx(adjy).x, ddx_curr.y != ddx(adjy).y, ddx_curr.z != ddx(adjy).z, ddx_curr.w != ddx(adjy).w));
     return any(bool2(ddy_different, ddx_different));
 }
 
-bool fine_derivatives_working_fast(const vec4 quad_vector, float curr)
+bool fine_derivatives_working_fast(const float4 quad_vector, float curr)
 {
     //  Requires:   Same as fine_derivatives_working()
     //  Returns:    Same as fine_derivatives_working()
diff --git a/include/special-functions.h b/include/special-functions.h
index 2a06390..6fb3809 100644
--- a/include/special-functions.h
+++ b/include/special-functions.h
@@ -1,7 +1,6 @@
 #ifndef SPECIAL_FUNCTIONS_H
 #define SPECIAL_FUNCTIONS_H
 
-
 /////////////////////////////////  MIT LICENSE  ////////////////////////////////
 
 //  Copyright (C) 2014 TroggleMonkey
@@ -38,7 +37,7 @@
 //
 //  Design Rationale:
 //  Pretty much every line of code in this file is duplicated four times for
-//  different input types (vec4/vec3/vec2/float).  This is unfortunate,
+//  different input types (float4/float3/float2/float).  This is unfortunate,
 //  but Cg doesn't allow function templates.  Macros would be far less verbose,
 //  but they would make the code harder to document and read.  I don't expect
 //  these functions will require a whole lot of maintenance changes unless
@@ -48,7 +47,7 @@
 
 ///////////////////////////  GAUSSIAN ERROR FUNCTION  //////////////////////////
 
-vec4 erf6(vec4 x)
+float4 erf6(float4 x)
 {
     //  Requires:   x is the standard parameter to erf().
     //  Returns:    Return an Abramowitz/Stegun approximation of erf(), where:
@@ -56,32 +55,32 @@ vec4 erf6(vec4 x)
     //              This approximation has a max absolute error of 2.5*10**-5
     //              with solid numerical robustness and efficiency.  See:
 	//                  https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions
-	const vec4 one = vec4(1.0);
-	const vec4 sign_x = sign(x);
-	const vec4 t = one/(one + 0.47047*abs(x));
-	const vec4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+	static const float4 one = float4(1.0);
+	const float4 sign_x = sign(x);
+	const float4 t = one/(one + 0.47047*abs(x));
+	const float4 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
 		exp(-(x*x));
 	return result * sign_x;
 }
 
-vec3 erf6(const vec3 x)
+float3 erf6(const float3 x)
 {
-    //  vec3 version:
-	const vec3 one = vec3(1.0);
-	const vec3 sign_x = sign(x);
-	const vec3 t = one/(one + 0.47047*abs(x));
-	const vec3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+    //  Float3 version:
+	static const float3 one = float3(1.0);
+	const float3 sign_x = sign(x);
+	const float3 t = one/(one + 0.47047*abs(x));
+	const float3 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
 		exp(-(x*x));
 	return result * sign_x;
 }
 
-vec2 erf6(const vec2 x)
+float2 erf6(const float2 x)
 {
-    //  vec2 version:
-	const vec2 one = vec2(1.0);
-	const vec2 sign_x = sign(x);
-	const vec2 t = one/(one + 0.47047*abs(x));
-	const vec2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
+    //  Float2 version:
+	static const float2 one = float2(1.0);
+	const float2 sign_x = sign(x);
+	const float2 t = one/(one + 0.47047*abs(x));
+	const float2 result = one - t*(0.3480242 + t*(-0.0958798 + t*0.7478556))*
 		exp(-(x*x));
 	return result * sign_x;
 }
@@ -96,7 +95,7 @@ float erf6(const float x)
 	return result * sign_x;
 }
 
-vec4 erft(const vec4 x)
+float4 erft(const float4 x)
 {
     //  Requires:   x is the standard parameter to erf().
     //  Returns:    Approximate erf() with the hyperbolic tangent.  The error is
@@ -108,15 +107,15 @@ vec4 erft(const vec4 x)
 	return tanh(1.202760580 * x);
 }
 
-vec3 erft(const vec3 x)
+float3 erft(const float3 x)
 {
-    //  vec3 version:
+    //  Float3 version:
 	return tanh(1.202760580 * x);
 }
 
-vec2 erft(const vec2 x)
+float2 erft(const float2 x)
 {
-    //  vec2 version:
+    //  Float2 version:
 	return tanh(1.202760580 * x);
 }
 
@@ -126,7 +125,7 @@ float erft(const float x)
 	return tanh(1.202760580 * x);
 }
 
-vec4 erf(const vec4 x)
+inline float4 erf(const float4 x)
 {
     //  Requires:   x is the standard parameter to erf().
     //  Returns:    Some approximation of erf(x), depending on user settings.
@@ -137,9 +136,9 @@ vec4 erf(const vec4 x)
 	#endif
 }
 
-vec3 erf(const vec3 x)
+inline float3 erf(const float3 x)
 {
-    //  vec3 version:
+    //  Float3 version:
 	#ifdef ERF_FAST_APPROXIMATION
 		return erft(x);
 	#else
@@ -147,9 +146,9 @@ vec3 erf(const vec3 x)
 	#endif
 }
 
-vec2 erf(const vec2 x)
+inline float2 erf(const float2 x)
 {
-    //  vec2 version:
+    //  Float2 version:
 	#ifdef ERF_FAST_APPROXIMATION
 		return erft(x);
 	#else
@@ -157,7 +156,7 @@ vec2 erf(const vec2 x)
 	#endif
 }
 
-float erf(const float x)
+inline float erf(const float x)
 {
     //  Float version:
 	#ifdef ERF_FAST_APPROXIMATION
@@ -167,9 +166,10 @@ float erf(const float x)
 	#endif
 }
 
+
 ///////////////////////////  COMPLETE GAMMA FUNCTION  //////////////////////////
 
-vec4 gamma_impl(const vec4 s, const vec4 s_inv)
+float4 gamma_impl(const float4 s, const float4 s_inv)
 {
     //  Requires:   1.) s is the standard parameter to the gamma function, and
     //                  it should lie in the [0, 36] range.
@@ -185,76 +185,76 @@ vec4 gamma_impl(const vec4 s, const vec4 s_inv)
     //              evals.  We could use three coeffs (0.0000346 error) without
     //              hurting latency, but this allows more parallelism with
     //              outside instructions.
-	const vec4 g = vec4(1.12906830989);
-	const vec4 c0 = vec4(0.8109119309638332633713423362694399653724431);
-	const vec4 c1 = vec4(0.4808354605142681877121661197951496120000040);
-	const vec4 e = vec4(2.71828182845904523536028747135266249775724709);
-	const vec4 sph = s + vec4(0.5);
-	const vec4 lanczos_sum = c0 + c1/(s + vec4(1.0));
-	const vec4 base = (sph + g)/e;  //  or (s + g + vec4(0.5))/e
+	static const float4 g = float4(1.12906830989);
+	static const float4 c0 = float4(0.8109119309638332633713423362694399653724431);
+	static const float4 c1 = float4(0.4808354605142681877121661197951496120000040);
+	static const float4 e = float4(2.71828182845904523536028747135266249775724709);
+	const float4 sph = s + float4(0.5);
+	const float4 lanczos_sum = c0 + c1/(s + float4(1.0));
+	const float4 base = (sph + g)/e;  //  or (s + g + float4(0.5))/e
 	//  gamma(s + 1) = base**sph * lanczos_sum; divide by s for gamma(s).
 	//  This has less error for small s's than (s -= 1.0) at the beginning.
 	return (pow(base, sph) * lanczos_sum) * s_inv;
 }
 
-vec3 gamma_impl(const vec3 s, const vec3 s_inv)
+float3 gamma_impl(const float3 s, const float3 s_inv)
 {
-    //  vec3 version:
-	const vec3 g = vec3(1.12906830989);
-	const vec3 c0 = vec3(0.8109119309638332633713423362694399653724431);
-	const vec3 c1 = vec3(0.4808354605142681877121661197951496120000040);
-	const vec3 e = vec3(2.71828182845904523536028747135266249775724709);
-	const vec3 sph = s + vec3(0.5);
-	const vec3 lanczos_sum = c0 + c1/(s + vec3(1.0));
-	const vec3 base = (sph + g)/e;
+    //  Float3 version:
+	static const float3 g = float3(1.12906830989);
+	static const float3 c0 = float3(0.8109119309638332633713423362694399653724431);
+	static const float3 c1 = float3(0.4808354605142681877121661197951496120000040);
+	static const float3 e = float3(2.71828182845904523536028747135266249775724709);
+	const float3 sph = s + float3(0.5);
+	const float3 lanczos_sum = c0 + c1/(s + float3(1.0));
+	const float3 base = (sph + g)/e;
 	return (pow(base, sph) * lanczos_sum) * s_inv;
 }
 
-vec2 gamma_impl(const vec2 s, const vec2 s_inv)
+float2 gamma_impl(const float2 s, const float2 s_inv)
 {
-    //  vec2 version:
-	const vec2 g = vec2(1.12906830989);
-	const vec2 c0 = vec2(0.8109119309638332633713423362694399653724431);
-	const vec2 c1 = vec2(0.4808354605142681877121661197951496120000040);
-	const vec2 e = vec2(2.71828182845904523536028747135266249775724709);
-	const vec2 sph = s + vec2(0.5);
-	const vec2 lanczos_sum = c0 + c1/(s + vec2(1.0));
-	const vec2 base = (sph + g)/e;
+    //  Float2 version:
+	static const float2 g = float2(1.12906830989);
+	static const float2 c0 = float2(0.8109119309638332633713423362694399653724431);
+	static const float2 c1 = float2(0.4808354605142681877121661197951496120000040);
+	static const float2 e = float2(2.71828182845904523536028747135266249775724709);
+	const float2 sph = s + float2(0.5);
+	const float2 lanczos_sum = c0 + c1/(s + float2(1.0));
+	const float2 base = (sph + g)/e;
 	return (pow(base, sph) * lanczos_sum) * s_inv;
 }
 
 float gamma_impl(const float s, const float s_inv)
 {
     //  Float version:
-	const float g = 1.12906830989;
-	const float c0 = 0.8109119309638332633713423362694399653724431;
-	const float c1 = 0.4808354605142681877121661197951496120000040;
-	const float e = 2.71828182845904523536028747135266249775724709;
+	static const float g = 1.12906830989;
+	static const float c0 = 0.8109119309638332633713423362694399653724431;
+	static const float c1 = 0.4808354605142681877121661197951496120000040;
+	static const float e = 2.71828182845904523536028747135266249775724709;
 	const float sph = s + 0.5;
 	const float lanczos_sum = c0 + c1/(s + 1.0);
 	const float base = (sph + g)/e;
 	return (pow(base, sph) * lanczos_sum) * s_inv;
 }
 
-vec4 gamma(const vec4 s)
+float4 gamma(const float4 s)
 {
     //  Requires:   s is the standard parameter to the gamma function, and it
     //              should lie in the [0, 36] range.
     //  Returns:    Return approximate gamma function output with a maximum
     //              relative error of 0.000463.  See gamma_impl for details.
-	return gamma_impl(s, vec4(1.0)/s);
+	return gamma_impl(s, float4(1.0)/s);
 }
 
-vec3 gamma(const vec3 s)
+float3 gamma(const float3 s)
 {
-    //  vec3 version:
-	return gamma_impl(s, vec3(1.0)/s);
+    //  Float3 version:
+	return gamma_impl(s, float3(1.0)/s);
 }
 
-vec2 gamma(const vec2 s)
+float2 gamma(const float2 s)
 {
-    //  vec2 version:
-	return gamma_impl(s, vec2(1.0)/s);
+    //  Float2 version:
+	return gamma_impl(s, float2(1.0)/s);
 }
 
 float gamma(const float s)
@@ -263,10 +263,11 @@ float gamma(const float s)
 	return gamma_impl(s, 1.0/s);
 }
 
+
 ////////////////  INCOMPLETE GAMMA FUNCTIONS (RESTRICTED INPUT)  ///////////////
 
 //  Lower incomplete gamma function for small s and z (implementation):
-vec4 ligamma_small_z_impl(const vec4 s, const vec4 z, const vec4 s_inv)
+float4 ligamma_small_z_impl(const float4 s, const float4 z, const float4 s_inv)
 {
     //  Requires:   1.) s < ~0.5
     //              2.) z <= ~0.775075
@@ -282,14 +283,14 @@ vec4 ligamma_small_z_impl(const vec4 s, const vec4 z, const vec4 s_inv)
 	//          sum += last_sign * last_pow / ((s + k) * last_factorial);
 	//      }
 	//  Unrolled, constant-unfolded and arranged for madds and parallelism:
-	const vec4 scale = pow(z, s);
-	vec4 sum = s_inv;  //  Summation iteration 0 result
+	const float4 scale = pow(z, s);
+	float4 sum = s_inv;  //  Summation iteration 0 result
 	//  Summation iterations 1, 2, and 3:
-	const vec4 z_sq = z*z;
-	const vec4 denom1 = s + vec4(1.0);
-	const vec4 denom2 = 2.0*s + vec4(4.0);
-	const vec4 denom3 = 6.0*s + vec4(18.0);
-	//vec4 denom4 = 24.0*s + vec4(96.0);
+	const float4 z_sq = z*z;
+	const float4 denom1 = s + float4(1.0);
+	const float4 denom2 = 2.0*s + float4(4.0);
+	const float4 denom3 = 6.0*s + float4(18.0);
+	//float4 denom4 = 24.0*s + float4(96.0);
 	sum -= z/denom1;
 	sum += z_sq/denom2;
 	sum -= z * z_sq/denom3;
@@ -298,30 +299,30 @@ vec4 ligamma_small_z_impl(const vec4 s, const vec4 z, const vec4 s_inv)
 	return scale * sum;
 }
 
-vec3 ligamma_small_z_impl(const vec3 s, const vec3 z, const vec3 s_inv)
+float3 ligamma_small_z_impl(const float3 s, const float3 z, const float3 s_inv)
 {
-    //  vec3 version:
-	const vec3 scale = pow(z, s);
-	vec3 sum = s_inv;
-	const vec3 z_sq = z*z;
-	const vec3 denom1 = s + vec3(1.0);
-	const vec3 denom2 = 2.0*s + vec3(4.0);
-	const vec3 denom3 = 6.0*s + vec3(18.0);
+    //  Float3 version:
+	const float3 scale = pow(z, s);
+	float3 sum = s_inv;
+	const float3 z_sq = z*z;
+	const float3 denom1 = s + float3(1.0);
+	const float3 denom2 = 2.0*s + float3(4.0);
+	const float3 denom3 = 6.0*s + float3(18.0);
 	sum -= z/denom1;
 	sum += z_sq/denom2;
 	sum -= z * z_sq/denom3;
 	return scale * sum;
 }
 
-vec2 ligamma_small_z_impl(const vec2 s, const vec2 z, const vec2 s_inv)
+float2 ligamma_small_z_impl(const float2 s, const float2 z, const float2 s_inv)
 {
-    //  vec2 version:
-	const vec2 scale = pow(z, s);
-	vec2 sum = s_inv;
-	const vec2 z_sq = z*z;
-	const vec2 denom1 = s + vec2(1.0);
-	const vec2 denom2 = 2.0*s + vec2(4.0);
-	const vec2 denom3 = 6.0*s + vec2(18.0);
+    //  Float2 version:
+	const float2 scale = pow(z, s);
+	float2 sum = s_inv;
+	const float2 z_sq = z*z;
+	const float2 denom1 = s + float2(1.0);
+	const float2 denom2 = 2.0*s + float2(4.0);
+	const float2 denom3 = 6.0*s + float2(18.0);
 	sum -= z/denom1;
 	sum += z_sq/denom2;
 	sum -= z * z_sq/denom3;
@@ -344,7 +345,7 @@ float ligamma_small_z_impl(const float s, const float z, const float s_inv)
 }
 
 //  Upper incomplete gamma function for small s and large z (implementation):
-vec4 uigamma_large_z_impl(const vec4 s, const vec4 z)
+float4 uigamma_large_z_impl(const float4 s, const float4 z)
 {
     //  Requires:   1.) s < ~0.5
     //              2.) z > ~0.775075
@@ -352,40 +353,40 @@ vec4 uigamma_large_z_impl(const vec4 s, const vec4 z)
     //              incomplete gamma function (4 terms).
 	//  The "rolled up" continued fraction looks like this.  The denominator
     //  is truncated, and it's calculated "from the bottom up:"
-	//      denom = vec4('inf');
-	//      vec4 one = vec4(1.0);
+	//      denom = float4('inf');
+	//      float4 one = float4(1.0);
 	//      for(int i = 4; i > 0; --i)
 	//      {
 	//          denom = ((i * 2.0) - one) + z - s + (i * (s - i))/denom;
 	//      }
 	//  Unrolled and constant-unfolded for madds and parallelism:
-	const vec4 numerator = pow(z, s) * exp(-z);
-	vec4 denom = vec4(7.0) + z - s;
-	denom = vec4(5.0) + z - s + (3.0*s - vec4(9.0))/denom;
-	denom = vec4(3.0) + z - s + (2.0*s - vec4(4.0))/denom;
-	denom = vec4(1.0) + z - s + (s - vec4(1.0))/denom;
+	const float4 numerator = pow(z, s) * exp(-z);
+	float4 denom = float4(7.0) + z - s;
+	denom = float4(5.0) + z - s + (3.0*s - float4(9.0))/denom;
+	denom = float4(3.0) + z - s + (2.0*s - float4(4.0))/denom;
+	denom = float4(1.0) + z - s + (s - float4(1.0))/denom;
 	return numerator / denom;
 }
 
-vec3 uigamma_large_z_impl(const vec3 s, const vec3 z)
+float3 uigamma_large_z_impl(const float3 s, const float3 z)
 {
-    //  vec3 version:
-	const vec3 numerator = pow(z, s) * exp(-z);
-	vec3 denom = vec3(7.0) + z - s;
-	denom = vec3(5.0) + z - s + (3.0*s - vec3(9.0))/denom;
-	denom = vec3(3.0) + z - s + (2.0*s - vec3(4.0))/denom;
-	denom = vec3(1.0) + z - s + (s - vec3(1.0))/denom;
+    //  Float3 version:
+	const float3 numerator = pow(z, s) * exp(-z);
+	float3 denom = float3(7.0) + z - s;
+	denom = float3(5.0) + z - s + (3.0*s - float3(9.0))/denom;
+	denom = float3(3.0) + z - s + (2.0*s - float3(4.0))/denom;
+	denom = float3(1.0) + z - s + (s - float3(1.0))/denom;
 	return numerator / denom;
 }
 
-vec2 uigamma_large_z_impl(const vec2 s, const vec2 z)
+float2 uigamma_large_z_impl(const float2 s, const float2 z)
 {
-    //  vec2 version:
-	const vec2 numerator = pow(z, s) * exp(-z);
-	vec2 denom = vec2(7.0) + z - s;
-	denom = vec2(5.0) + z - s + (3.0*s - vec2(9.0))/denom;
-	denom = vec2(3.0) + z - s + (2.0*s - vec2(4.0))/denom;
-	denom = vec2(1.0) + z - s + (s - vec2(1.0))/denom;
+    //  Float2 version:
+	const float2 numerator = pow(z, s) * exp(-z);
+	float2 denom = float2(7.0) + z - s;
+	denom = float2(5.0) + z - s + (3.0*s - float2(9.0))/denom;
+	denom = float2(3.0) + z - s + (2.0*s - float2(4.0))/denom;
+	denom = float2(1.0) + z - s + (s - float2(1.0))/denom;
 	return numerator / denom;
 }
 
@@ -401,8 +402,8 @@ float uigamma_large_z_impl(const float s, const float z)
 }
 
 //  Normalized lower incomplete gamma function for small s (implementation):
-vec4 normalized_ligamma_impl(const vec4 s, const vec4 z,
-    const vec4 s_inv, const vec4 gamma_s_inv)
+float4 normalized_ligamma_impl(const float4 s, const float4 z,
+    const float4 s_inv, const float4 gamma_s_inv)
 {
     //  Requires:   1.) s < ~0.5
     //              2.) s_inv = 1/s (precomputed for outside reuse)
@@ -415,75 +416,83 @@ vec4 normalized_ligamma_impl(const vec4 s, const vec4 z,
     //              from Gil/Segura/Temme's paper here:
     //                  http://oai.cwi.nl/oai/asset/20433/20433B.pdf
 	//  Evaluate both branches: Real branches test slower even when available.
-	const vec4 thresh = vec4(0.775075);
-	bvec4 z_is_large = greaterThan(z , thresh);
-	vec4 z_size_check = vec4(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0, z_is_large.w ? 1.0 : 0.0);
-	const vec4 large_z = vec4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
-	const vec4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	static const float4 thresh = float4(0.775075);
+	bool4 z_is_large;
+	z_is_large.x = z.x > thresh.x;
+	z_is_large.y = z.y > thresh.y;
+	z_is_large.z = z.z > thresh.z;
+	z_is_large.w = z.w > thresh.w;
+	const float4 large_z = float4(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float4 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
 	//  Combine the results from both branches:
-	return large_z * vec4(z_size_check) + small_z * vec4(z_size_check);
+	bool4 inverse_z_is_large = not(z_is_large);
+	return large_z * float4(z_is_large) + small_z * float4(inverse_z_is_large);
 }
 
-vec3 normalized_ligamma_impl(const vec3 s, const vec3 z,
-    const vec3 s_inv, const vec3 gamma_s_inv)
+float3 normalized_ligamma_impl(const float3 s, const float3 z,
+    const float3 s_inv, const float3 gamma_s_inv)
 {
-    //  vec3 version:
-	const vec3 thresh = vec3(0.775075);
-	bvec3 z_is_large = greaterThan(z , thresh);
-	vec3 z_size_check = vec3(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0, z_is_large.z ? 1.0 : 0.0);
-	const vec3 large_z = vec3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
-	const vec3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
-	return large_z * vec3(z_size_check) + small_z * vec3(z_size_check);
+    //  Float3 version:
+	static const float3 thresh = float3(0.775075);
+	bool3 z_is_large;
+	z_is_large.x = z.x > thresh.x;
+	z_is_large.y = z.y > thresh.y;
+	z_is_large.z = z.z > thresh.z;
+	const float3 large_z = float3(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float3 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool3 inverse_z_is_large = not(z_is_large);
+	return large_z * float3(z_is_large) + small_z * float3(inverse_z_is_large);
 }
 
-vec2 normalized_ligamma_impl(const vec2 s, const vec2 z,
-    const vec2 s_inv, const vec2 gamma_s_inv)
+float2 normalized_ligamma_impl(const float2 s, const float2 z,
+    const float2 s_inv, const float2 gamma_s_inv)
 {
-    //  vec2 version:
-	const vec2 thresh = vec2(0.775075);
-	bvec2 z_is_large = greaterThan(z , thresh);
-	vec2 z_size_check = vec2(z_is_large.x ? 1.0 : 0.0, z_is_large.y ? 1.0 : 0.0);
-	const vec2 large_z = vec2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
-	const vec2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
-	return large_z * vec2(z_size_check) + small_z * vec2(z_size_check);
+    //  Float2 version:
+	static const float2 thresh = float2(0.775075);
+	bool2 z_is_large;
+	z_is_large.x = z.x > thresh.x;
+	z_is_large.y = z.y > thresh.y;
+	const float2 large_z = float2(1.0) - uigamma_large_z_impl(s, z) * gamma_s_inv;
+	const float2 small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
+	bool2 inverse_z_is_large = not(z_is_large);
+	return large_z * float2(z_is_large) + small_z * float2(inverse_z_is_large);
 }
 
 float normalized_ligamma_impl(const float s, const float z,
     const float s_inv, const float gamma_s_inv)
 {
     //  Float version:
-	const float thresh = 0.775075;
-	float z_size_check = 0.0;
-	if (z > thresh) z_size_check = 1.0;
+	static const float thresh = 0.775075;
+	const bool z_is_large = z > thresh;
 	const float large_z = 1.0 - uigamma_large_z_impl(s, z) * gamma_s_inv;
 	const float small_z = ligamma_small_z_impl(s, z, s_inv) * gamma_s_inv;
-	return large_z * float(z_size_check) + small_z * float(z_size_check);
+	return large_z * float(z_is_large) + small_z * float(!z_is_large);
 }
 
 //  Normalized lower incomplete gamma function for small s:
-vec4 normalized_ligamma(const vec4 s, const vec4 z)
+float4 normalized_ligamma(const float4 s, const float4 z)
 {
     //  Requires:   s < ~0.5
     //  Returns:    Approximate the normalized lower incomplete gamma function
     //              for s < 0.5.  See normalized_ligamma_impl() for details.
-	const vec4 s_inv = vec4(1.0)/s;
-	const vec4 gamma_s_inv = vec4(1.0)/gamma_impl(s, s_inv);
+	const float4 s_inv = float4(1.0)/s;
+	const float4 gamma_s_inv = float4(1.0)/gamma_impl(s, s_inv);
 	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
 }
 
-vec3 normalized_ligamma(const vec3 s, const vec3 z)
+float3 normalized_ligamma(const float3 s, const float3 z)
 {
-    //  vec3 version:
-	const vec3 s_inv = vec3(1.0)/s;
-	const vec3 gamma_s_inv = vec3(1.0)/gamma_impl(s, s_inv);
+    //  Float3 version:
+	const float3 s_inv = float3(1.0)/s;
+	const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, s_inv);
 	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
 }
 
-vec2 normalized_ligamma(const vec2 s, const vec2 z)
+float2 normalized_ligamma(const float2 s, const float2 z)
 {
-    //  vec2 version:
-	const vec2 s_inv = vec2(1.0)/s;
-	const vec2 gamma_s_inv = vec2(1.0)/gamma_impl(s, s_inv);
+    //  Float2 version:
+	const float2 s_inv = float2(1.0)/s;
+	const float2 gamma_s_inv = float2(1.0)/gamma_impl(s, s_inv);
 	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
 }
 
@@ -495,4 +504,7 @@ float normalized_ligamma(const float s, const float z)
 	return normalized_ligamma_impl(s, z, s_inv, gamma_s_inv);
 }
 
-#endif  //  SPECIAL_FUNCTIONS_H
\ No newline at end of file
+
+#endif  //  SPECIAL_FUNCTIONS_H
+
+