diff --git a/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5Spacing.png b/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5Spacing.png
new file mode 100644
index 0000000..2995ae5
Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5Spacing.png differ
diff --git a/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png b/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png
new file mode 100644
index 0000000..2c3f21e
Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png differ
diff --git a/crt/shaders/crt-royale/TileableLinearShadowMask.png b/crt/shaders/crt-royale/TileableLinearShadowMask.png
new file mode 100644
index 0000000..ca40956
Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearShadowMask.png differ
diff --git a/crt/shaders/crt-royale/TileableLinearShadowMaskEDP.png b/crt/shaders/crt-royale/TileableLinearShadowMaskEDP.png
new file mode 100644
index 0000000..a3844dc
Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearShadowMaskEDP.png differ
diff --git a/crt/shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png b/crt/shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png
new file mode 100644
index 0000000..b61d92a
Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png differ
diff --git a/crt/shaders/crt-royale/TileableLinearShadowMaskResizeTo64.png b/crt/shaders/crt-royale/TileableLinearShadowMaskResizeTo64.png
new file mode 100644
index 0000000..9b66ffb
Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearShadowMaskResizeTo64.png differ
diff --git a/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png b/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png
new file mode 100644
index 0000000..eb20b23
Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png differ
diff --git a/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png b/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png
new file mode 100644
index 0000000..df518db
Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png differ
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/README.TXT b/crt/shaders/crt-royale/crt-royale-settings-files/README.TXT
new file mode 100644
index 0000000..38270a7
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/README.TXT
@@ -0,0 +1,12 @@
+These files aren't nearly as good as canned .cgp presets with all of your
+favorite settings, and there aren't nearly enough, but they're a start.
+
+The nVidia settings files will only work on nVidia cards.
+The ATI settings files will work on both AMD/ATI and nVidia cards.
+The Intel settings files should additionally work on Intel HD 4000 Graphics, but
+they disable manual phosphor mask resizing, so the phosphor mask will be softer.
+
+For compatibility with Intel integrated graphics, you can either use the Intel-
+specific .cgp files or use the Intel settings files.  The latter are the same as
+the ATI settings files, except the following line is also uncommented:
+    #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati-clean.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati-clean.h
new file mode 100644
index 0000000..058899d
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati-clean.h
@@ -0,0 +1,92 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+    //#define DRIVERS_ALLOW_DERIVATIVES
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+    //#define DRIVERS_ALLOW_TEX2DLOD
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+#define RUNTIME_SHADER_PARAMS_ENABLE
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+#define RUNTIME_GEOMETRY_TILT
+#define RUNTIME_GEOMETRY_MODE
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    static const float bloom_approx_filter_static = 2.0;        //  0: bilinear, 1: 3x3 blur, 2: 4x4 Gaussian
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    static const bool beam_generalized_gaussian = true;
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    static const float beam_spot_shape_function = 0.0;          //  0: power, 1: spherical
+    static const float beam_spot_power_static = 1.0/3.0;        //  range (0, 16]
+    static const float beam_min_shape_static = 2.0;             //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;             //  range [2, 32]
+    static const float beam_shape_power_static = 1.0/4.0;       //  range (0, 16]
+    static const float beam_horiz_filter_static = 0.0;          //  0: Quilez, 1: Gaussian, 2: Lanczos2
+    static const float beam_horiz_sigma_static = 0.35;          //  range (0, 2/3]
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    static const bool beam_misconvergence = true;
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);    //  scanlines; range [-2, 2]
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);    //  scanlines; range [-2, 2]
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);    //  scanlines; range [-2, 2]
+    static const bool interlace_detect = true;
+    static const bool interlace_1080i_static = false;
+    static const bool interlace_bff_static = false;
+    static const float aa_level = 12.0;                         //  range [0, 24]
+    static const float aa_filter = 6.0;                         //  range [0, 9]
+    static const bool aa_temporal = false;
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    static const float aa_cubic_c_static = 0.5;                 //  range [0, 4]
+    static const float aa_gauss_sigma_static = 0.5;             //  range [0.0625, 1.0]
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;     //  triad size in pixels
+    static const float mask_num_triads_desired_static = 480.0;  //  used if mask_specify_num_triads is 1.0
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    static const float mask_min_allowed_triad_size = 2.0;
+    static const float geom_mode_static = 0.0;                  //  range [0, 3]
+    static const float geom_radius_static = 2.0;                //  range [1/(2*pi), 1024]
+    static const float geom_view_dist_static = 2.0;             //  range [0.5, 1024]
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    static const float geom_aspect_ratio_static = 1.313069909;  //  (256/224)*(54/47)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    static const bool geom_force_correct_tangent_matrix = true;
+    static const float border_size_static = 0.015;               //  range [0, 0.5]
+    static const float border_darkness_static = 2.0;            //  range [0, inf)
+    static const float border_compress_static = 2.5;            //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati.h
new file mode 100644
index 0000000..fc35aee
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 2.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel-clean.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel-clean.h
new file mode 100644
index 0000000..6929369
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel-clean.h
@@ -0,0 +1,92 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+//  Comment-stripped Intel defaults; user-settings-default-intel.h explains each option.
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+    //#define DRIVERS_ALLOW_DERIVATIVES
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+    //#define DRIVERS_ALLOW_TEX2DLOD
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+    #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+#define RUNTIME_SHADER_PARAMS_ENABLE
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+#define RUNTIME_GEOMETRY_TILT
+#define RUNTIME_GEOMETRY_MODE
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    static const float bloom_approx_filter_static = 0.0;        //  0: bilinear, 1: 3x3 blur, 2: 4x4 Gaussian
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    static const bool beam_generalized_gaussian = true;
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    static const float beam_spot_shape_function = 0.0;          //  0: power, 1: spherical
+    static const float beam_spot_power_static = 1.0/3.0;        //  range (0, 16]
+    static const float beam_min_shape_static = 2.0;             //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;             //  range [2, 32]
+    static const float beam_shape_power_static = 1.0/4.0;       //  range (0, 16]
+    static const float beam_horiz_filter_static = 0.0;          //  0: Quilez, 1: Gaussian, 2: Lanczos2
+    static const float beam_horiz_sigma_static = 0.35;          //  range (0, 2/3]
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    static const bool beam_misconvergence = true;
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);    //  x/y in scanlines; range [-2, 2]
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);    //  x/y in scanlines; range [-2, 2]
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);    //  x/y in scanlines; range [-2, 2]
+    static const bool interlace_detect = true;
+    static const bool interlace_1080i_static = false;
+    static const bool interlace_bff_static = false;
+    static const float aa_level = 12.0;                         //  range [0, 24]
+    static const float aa_filter = 6.0;                         //  range [0, 9]
+    static const bool aa_temporal = false;
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    static const float aa_cubic_c_static = 0.5;                 //  range [0, 4]
+    static const float aa_gauss_sigma_static = 0.5;             //  range [0.0625, 1.0]
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0; //  triad size in pixels
+    static const float mask_num_triads_desired_static = 480.0;  //  used if mask_specify_num_triads is 1.0
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    static const float mask_min_allowed_triad_size = 2.0;       //  enforced minimum triad size
+    static const float geom_mode_static = 0.0;                  //  range [0, 3]
+    static const float geom_radius_static = 2.0;                //  range [1/(2*pi), 1024]
+    static const float geom_view_dist_static = 2.0;             //  range [0.5, 1024]
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    static const float geom_aspect_ratio_static = 1.313069909;  //  (256/224)*(54/47)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    static const bool geom_force_correct_tangent_matrix = true;
+    static const float border_size_static = 0.015;               //  range [0, 0.5]
+    static const float border_darkness_static = 2.0;            //  range [0, inf)
+    static const float border_compress_static = 2.5;            //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel.h
new file mode 100644
index 0000000..9ce0b3f
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 0.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia-clean.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia-clean.h
new file mode 100644
index 0000000..ce837fd
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia-clean.h
@@ -0,0 +1,92 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+//  Comment-stripped nVidia defaults; presumably mirrors user-settings-default-nvidia.h.
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+    #define DRIVERS_ALLOW_DERIVATIVES
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+    #define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+    #define DRIVERS_ALLOW_TEX2DLOD
+    #define DRIVERS_ALLOW_TEX2DBIAS
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+#define RUNTIME_SHADER_PARAMS_ENABLE
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+#define RUNTIME_GEOMETRY_TILT
+#define RUNTIME_GEOMETRY_MODE
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    static const float bloom_approx_filter_static = 2.0;        //  0: bilinear, 1: 3x3 blur, 2: 4x4 Gaussian
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    static const bool beam_generalized_gaussian = true;
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    static const float beam_spot_shape_function = 0.0;          //  0: power, 1: spherical
+    static const float beam_spot_power_static = 1.0/3.0;        //  range (0, 16]
+    static const float beam_min_shape_static = 2.0;             //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;             //  range [2, 32]
+    static const float beam_shape_power_static = 1.0/4.0;       //  range (0, 16]
+    static const float beam_horiz_filter_static = 0.0;          //  0: Quilez, 1: Gaussian, 2: Lanczos2
+    static const float beam_horiz_sigma_static = 0.35;          //  range (0, 2/3]
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    static const bool beam_misconvergence = true;
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);    //  x/y in scanlines; range [-2, 2]
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);    //  x/y in scanlines; range [-2, 2]
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);    //  x/y in scanlines; range [-2, 2]
+    static const bool interlace_detect = true;
+    static const bool interlace_1080i_static = false;
+    static const bool interlace_bff_static = false;
+    static const float aa_level = 12.0;                         //  range [0, 24]
+    static const float aa_filter = 6.0;                         //  range [0, 9]
+    static const bool aa_temporal = false;
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    static const float aa_cubic_c_static = 0.5;                 //  range [0, 4]
+    static const float aa_gauss_sigma_static = 0.5;             //  range [0.0625, 1.0]
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0; //  triad size in pixels
+    static const float mask_num_triads_desired_static = 480.0;  //  used if mask_specify_num_triads is 1.0
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    static const float mask_min_allowed_triad_size = 2.0;       //  enforced minimum triad size
+    static const float geom_mode_static = 0.0;                  //  range [0, 3]
+    static const float geom_radius_static = 2.0;                //  range [1/(2*pi), 1024]
+    static const float geom_view_dist_static = 2.0;             //  range [0.5, 1024]
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    static const float geom_aspect_ratio_static = 1.313069909;  //  (256/224)*(54/47)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    static const bool geom_force_correct_tangent_matrix = true;
+    static const float border_size_static = 0.015;               //  range [0, 0.5]
+    static const float border_darkness_static = 2.0;            //  range [0, inf)
+    static const float border_compress_static = 2.5;            //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia.h
new file mode 100644
index 0000000..60c2c9a
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    #define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    #define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    #define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    #define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 2.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRTs), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Re-add horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. re-add horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-ati.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-ati.h
new file mode 100644
index 0000000..f01d68f
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-ati.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+//#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+//#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 2.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = false;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 8.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = false;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-intel.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-intel.h
new file mode 100644
index 0000000..fd76d99
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-intel.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+//#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+//#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 0.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = false;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 8.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = false;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-nvidia.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-nvidia.h
new file mode 100644
index 0000000..1737335
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-nvidia.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    #define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    #define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    #define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    #define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+//#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+//#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 2.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = false;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 8.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = false;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-ati.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-ati.h
new file mode 100644
index 0000000..0439ec0
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-ati.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+//#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+//#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+//#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+//#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+//#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+//#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 0.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 2.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = false;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = false;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 8.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = mask_triad_size_desired_static;  //  must stay >= 1.0
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = false;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-intel.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-intel.h
new file mode 100644
index 0000000..8e2f683
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-intel.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+//#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+//#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+//#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+//#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+//#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+//#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 0.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 2.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = false;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = false;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 8.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = mask_triad_size_desired_static;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = false;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-nvidia.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-nvidia.h
new file mode 100644
index 0000000..fef7c4f
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-nvidia.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    #define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    #define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    #define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    #define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+//#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+//#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+//#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+//#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+//#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+//#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 0.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 2.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = false;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = false;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 8.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;  //  must be >= 1.0 (see range above)
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = false;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-ati.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-ati.h
new file mode 100644
index 0000000..964ca12
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-ati.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 2.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 4.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-nvidia.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-nvidia.h
new file mode 100644
index 0000000..c6ad27e
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-nvidia.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    #define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    #define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    #define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    #define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 2.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 4.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings.h
new file mode 100644
index 0000000..fc35aee
--- /dev/null
+++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "float2 ddx(float2);" not supported in this profile
+//  error C3004: function "float2 ddy(float2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    static const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    static const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    static const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    static const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    static const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    static const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    static const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    static const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    static const float bloom_approx_filter_static = 2.0;
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    static const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    static const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    static const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    static const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    static const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    static const float beam_spot_shape_function = 0.0;
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    static const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    static const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    static const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    static const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    static const float beam_horiz_filter_static = 0.0;
+    //  Standard deviation for horizontal Gaussian resampling:
+    static const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    static const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    static const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
+    static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
+    static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    static const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    static const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    static const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    static const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    static const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    static const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    static const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    static const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    static const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    static const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    static const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    static const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    static const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    static const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    static const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    static const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    static const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    static const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    static const float2 geom_tilt_angle_static = float2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    static const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    static const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    static const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    static const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    static const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+
diff --git a/crt/shaders/crt-royale/src/bind-shader-params.h b/crt/shaders/crt-royale/src/bind-shader-params.h
new file mode 100644
index 0000000..80adc7d
--- /dev/null
+++ b/crt/shaders/crt-royale/src/bind-shader-params.h
@@ -0,0 +1,247 @@
+#ifndef BIND_SHADER_PARAMS_H
+#define BIND_SHADER_PARAMS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#include "../user-settings.h"
+#include "derived-settings-and-constants.h"
+
+//  Override some parameters for gamma-management.h and tex2Dantialias.h:
+#define OVERRIDE_DEVICE_GAMMA
+const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
+#define ANTIALIAS_OVERRIDE_BASICS
+#define ANTIALIAS_OVERRIDE_PARAMETERS
+
+//  Disable runtime shader params if the user doesn't explicitly want them.
+//  Static constants will be defined in place of uniforms of the same name.
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #undef PARAMETER_UNIFORM
+#endif
+
+//  Bind option names to shader parameter uniforms or static constants.
+#ifdef PARAMETER_UNIFORM
+    uniform float crt_gamma;
+    uniform float lcd_gamma;
+    uniform float levels_contrast;
+    uniform float halation_weight;
+    uniform float diffusion_weight;
+    uniform float bloom_underestimate_levels;
+    uniform float bloom_excess;
+    uniform float beam_min_sigma;
+    uniform float beam_max_sigma;
+    uniform float beam_spot_power;
+    uniform float beam_min_shape;
+    uniform float beam_max_shape;
+    uniform float beam_shape_power;
+    uniform float beam_horiz_sigma;
+    #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        uniform float beam_horiz_filter;
+        uniform float beam_horiz_linear_rgb_weight;
+    #else
+        const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
+        const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
+    #endif
+    uniform float convergence_offset_x_r;
+    uniform float convergence_offset_x_g;
+    uniform float convergence_offset_x_b;
+    uniform float convergence_offset_y_r;
+    uniform float convergence_offset_y_g;
+    uniform float convergence_offset_y_b;
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        uniform float mask_type;
+    #else
+        const float mask_type = clamp(mask_type_static, 0.0, 2.0);
+    #endif
+    uniform float mask_sample_mode_desired;
+    uniform float mask_specify_num_triads;
+    uniform float mask_triad_size_desired;
+    uniform float mask_num_triads_desired;
+    uniform float aa_subpixel_r_offset_x_runtime;
+    uniform float aa_subpixel_r_offset_y_runtime;
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        uniform float aa_cubic_c;
+        uniform float aa_gauss_sigma;
+    #else
+        const float aa_cubic_c = aa_cubic_c_static;                              //  Clamp to [0, 4]?
+        const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static);  //  Clamp to [FIXZERO(0), 1]?
+    #endif
+    uniform float geom_mode_runtime;
+    uniform float geom_radius;
+    uniform float geom_view_dist;
+    uniform float geom_tilt_angle_x;
+    uniform float geom_tilt_angle_y;
+    uniform float geom_aspect_ratio_x;
+    uniform float geom_aspect_ratio_y;
+    uniform float geom_overscan_x;
+    uniform float geom_overscan_y;
+    uniform float border_size;
+    uniform float border_darkness;
+    uniform float border_compress;
+    uniform float interlace_bff;
+    uniform float interlace_1080i;
+#else
+    //  Use constants from user-settings.h, and limit ranges appropriately:
+    const float crt_gamma = max(0.0, crt_gamma_static);
+    const float lcd_gamma = max(0.0, lcd_gamma_static);
+    const float levels_contrast = clamp(levels_contrast_static, 0.0, 4.0);
+    const float halation_weight = clamp(halation_weight_static, 0.0, 1.0);
+    const float diffusion_weight = clamp(diffusion_weight_static, 0.0, 1.0);
+    const float bloom_underestimate_levels = max(FIX_ZERO(0.0), bloom_underestimate_levels_static);
+    const float bloom_excess = clamp(bloom_excess_static, 0.0, 1.0);
+    const float beam_min_sigma = max(FIX_ZERO(0.0), beam_min_sigma_static);
+    const float beam_max_sigma = max(beam_min_sigma, beam_max_sigma_static);
+    const float beam_spot_power = max(beam_spot_power_static, 0.0);
+    const float beam_min_shape = max(2.0, beam_min_shape_static);
+    const float beam_max_shape = max(beam_min_shape, beam_max_shape_static);
+    const float beam_shape_power = max(0.0, beam_shape_power_static);
+    const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
+    const float beam_horiz_sigma = max(FIX_ZERO(0.0), beam_horiz_sigma_static);
+    const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
+    //  Unpack vector elements to match scalar uniforms:
+    const float convergence_offset_x_r = clamp(convergence_offsets_r_static.x, -4.0, 4.0);
+    const float convergence_offset_x_g = clamp(convergence_offsets_g_static.x, -4.0, 4.0);
+    const float convergence_offset_x_b = clamp(convergence_offsets_b_static.x, -4.0, 4.0);
+    const float convergence_offset_y_r = clamp(convergence_offsets_r_static.y, -4.0, 4.0);
+    const float convergence_offset_y_g = clamp(convergence_offsets_g_static.y, -4.0, 4.0);
+    const float convergence_offset_y_b = clamp(convergence_offsets_b_static.y, -4.0, 4.0);
+    const float mask_type = clamp(mask_type_static, 0.0, 2.0);
+    const float mask_sample_mode_desired = clamp(mask_sample_mode_static, 0.0, 2.0);
+    const float mask_specify_num_triads = clamp(mask_specify_num_triads_static, 0.0, 1.0);
+    const float mask_triad_size_desired = clamp(mask_triad_size_desired_static, 1.0, 18.0);
+    const float mask_num_triads_desired = clamp(mask_num_triads_desired_static, 342.0, 1920.0);
+    const float aa_subpixel_r_offset_x_runtime = clamp(aa_subpixel_r_offset_static.x, -0.5, 0.5);
+    const float aa_subpixel_r_offset_y_runtime = clamp(aa_subpixel_r_offset_static.y, -0.5, 0.5);
+    const float aa_cubic_c = aa_cubic_c_static;                              //  Clamp to [0, 4]?
+    const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static);  //  Clamp to [FIXZERO(0), 1]?
+    const float geom_mode_runtime = clamp(geom_mode_static, 0.0, 3.0);
+    const float geom_radius = max(1.0/(2.0*pi), geom_radius_static);         //  Clamp to [1/(2*pi), 1024]?
+    const float geom_view_dist = max(0.5, geom_view_dist_static);            //  Clamp to [0.5, 1024]?
+    const float geom_tilt_angle_x = clamp(geom_tilt_angle_static.x, -pi, pi);
+    const float geom_tilt_angle_y = clamp(geom_tilt_angle_static.y, -pi, pi);
+    const float geom_aspect_ratio_x = geom_aspect_ratio_static;              //  Force >= 1?
+    const float geom_aspect_ratio_y = 1.0;
+    const float geom_overscan_x = max(FIX_ZERO(0.0), geom_overscan_static.x);
+    const float geom_overscan_y = max(FIX_ZERO(0.0), geom_overscan_static.y);
+    const float border_size = clamp(border_size_static, 0.0, 0.5);           //  0.5 reaches to image center
+    const float border_darkness = max(0.0, border_darkness_static);
+    const float border_compress = max(1.0, border_compress_static);          //  < 1.0 darkens whole image
+    const float interlace_bff = float(interlace_bff_static);
+    const float interlace_1080i = float(interlace_1080i_static);
+#endif
+
+
+//  Provide accessors for vector constants that pack scalar uniforms:
+vec2 get_aspect_vector(const float geom_aspect_ratio)
+{
+    //  Return a unit-length (aspect, 1.0) vector for the given aspect ratio.
+    //  The ratio is first capped at geom_max_aspect_ratio.  Normalizing then
+    //  discards the absolute scale, so only the x:y proportion can affect
+    //  the uv-mapping for curvature:
+    const float capped_ratio = min(geom_aspect_ratio, geom_max_aspect_ratio);
+    const vec2 unscaled_aspect = vec2(capped_ratio, 1.0);
+    return normalize(unscaled_aspect);
+}
+
+vec2 get_geom_overscan_vector()
+{   //  Pack the scalar x/y overscan settings into a single vector.
+    return vec2(geom_overscan_x, geom_overscan_y);
+}
+
+vec2 get_geom_tilt_angle_vector()
+{   //  Pack the scalar x/y tilt angle settings into a single vector.
+    return vec2(geom_tilt_angle_x, geom_tilt_angle_y);
+}
+
+vec3 get_convergence_offsets_x_vector()
+{
+    //  Gather the per-channel x convergence offsets in (r, g, b) order:
+    return vec3(convergence_offset_x_r, convergence_offset_x_g, convergence_offset_x_b);
+}
+
+vec3 get_convergence_offsets_y_vector()
+{
+    //  Gather the per-channel y convergence offsets in (r, g, b) order:
+    return vec3(convergence_offset_y_r, convergence_offset_y_g, convergence_offset_y_b);
+}
+
+vec2 get_convergence_offsets_r_vector()
+{   //  Pack the red beam's scalar convergence offsets as (x, y).
+    return vec2(convergence_offset_x_r, convergence_offset_y_r);
+}
+
+vec2 get_convergence_offsets_g_vector()
+{   //  Pack the green beam's scalar convergence offsets as (x, y).
+    return vec2(convergence_offset_x_g, convergence_offset_y_g);
+}
+
+vec2 get_convergence_offsets_b_vector()
+{   //  Pack the blue beam's scalar convergence offsets as (x, y).
+    return vec2(convergence_offset_x_b, convergence_offset_y_b);
+}
+
+vec2 get_aa_subpixel_r_offset()
+{
+    //  Return the red subpixel offset used for antialiasing.  The runtime
+    //  x/y parameters are only consulted when both macros below are
+    //  #defined; every other configuration uses the static setting from
+    //  user-settings.h:
+    #if defined(RUNTIME_ANTIALIAS_WEIGHTS) && defined(RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS)
+        //  WARNING: THIS IS EXTREMELY EXPENSIVE.
+        return vec2(aa_subpixel_r_offset_x_runtime,
+            aa_subpixel_r_offset_y_runtime);
+    #else
+        return aa_subpixel_r_offset_static;
+    #endif
+}
+
+//  Provide accessors for settings which still need "cooking":
+float get_mask_amplify()
+{
+    //  Return the reciprocal of the average color of the selected mask.
+    //  mask_type: 0 = aperture grille, 1 = slot mask, 2 = shadow mask; the
+    //  0.5/1.5 cutoffs pick the nearest type for a float-valued setting.
+    if(mask_type < 0.5) { return 1.0/mask_grille_avg_color; }
+    if(mask_type < 1.5) { return 1.0/mask_slot_avg_color; }
+    return 1.0/mask_shadow_avg_color;
+}
+
+float get_mask_sample_mode()
+{
+    //  Pick the requested mode from the runtime parameter or the static
+    //  setting, then clamp it to [1, 2] (ruling out mode 0, the manual
+    //  resize) unless PHOSPHOR_MASK_MANUALLY_RESIZE is #defined:
+    #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        const float requested_mode = mask_sample_mode_desired;
+    #else
+        const float requested_mode = mask_sample_mode_static;
+    #endif
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        return requested_mode;
+    #else
+        return clamp(requested_mode, 1.0, 2.0);
+    #endif
+}
+
+
+#endif  //  BIND_SHADER_PARAMS_H
+
+
diff --git a/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang b/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang
new file mode 100644
index 0000000..43e8215
--- /dev/null
+++ b/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang
@@ -0,0 +1,403 @@
+#version 450
+
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	uint FrameCount;
+} registers;
+
+layout(std140, set = 0, binding = 0) uniform UBO
+{
+	mat4 MVP;
+	float crt_gamma;
+    float lcd_gamma;
+    float levels_contrast;
+    float halation_weight;
+    float diffusion_weight;
+    float bloom_underestimate_levels;
+    float bloom_excess;
+    float beam_min_sigma;
+    float beam_max_sigma;
+    float beam_spot_power;
+    float beam_min_shape;
+    float beam_max_shape;
+    float beam_shape_power;
+    float beam_horiz_filter;
+    float beam_horiz_sigma;
+    float beam_horiz_linear_rgb_weight;
+    float convergence_offset_x_r;
+    float convergence_offset_x_g;
+    float convergence_offset_x_b;
+    float convergence_offset_y_r;
+    float convergence_offset_y_g;
+    float convergence_offset_y_b;
+    float mask_type;
+    float mask_sample_mode_desired;
+    float mask_specify_num_triads;
+    float mask_triad_size_desired;
+    float mask_num_triads_desired;
+    float aa_subpixel_r_offset_x_runtime;
+    float aa_subpixel_r_offset_y_runtime;
+    float aa_cubic_c;
+    float aa_gauss_sigma;
+    float geom_mode_runtime;
+    float geom_radius;
+    float geom_view_dist;
+    float geom_tilt_angle_x;
+    float geom_tilt_angle_y;
+    float geom_aspect_ratio_x;
+    float geom_aspect_ratio_y;
+    float geom_overscan_x;
+    float geom_overscan_y;
+    float border_size;
+    float border_darkness;
+    float border_compress;
+    float interlace_bff;
+    float interlace_1080i;
+} params;
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+//  PASS SETTINGS:
+//  gamma-management.h needs to know what kind of pipeline we're using and
+//  what pass this is in that pipeline.  This will become obsolete if/when we
+//  can #define things like this in the preset file.
+#define FIRST_PASS
+#define SIMULATE_CRT_ON_LCD
+
+//  Set shader params for all passes here:
+#pragma parameter crt_gamma "crt_gamma" 2.5 1.0 5.0 0.025
+#pragma parameter lcd_gamma "lcd_gamma" 2.2 1.0 5.0 0.025
+#pragma parameter levels_contrast "levels_contrast" 1.0 0.0 4.0 0.015625
+#pragma parameter halation_weight "halation_weight" 0.0 0.0 1.0 0.005
+#pragma parameter diffusion_weight "diffusion_weight" 0.075 0.0 1.0 0.005
+#pragma parameter bloom_underestimate_levels "bloom_underestimate_levels" 0.8 0.0 5.0 0.01
+#pragma parameter bloom_excess "bloom_excess" 0.0 0.0 1.0 0.005
+#pragma parameter beam_min_sigma "beam_min_sigma" 0.02 0.005 1.0 0.005
+#pragma parameter beam_max_sigma "beam_max_sigma" 0.3 0.005 1.0 0.005
+#pragma parameter beam_spot_power "beam_spot_power" 0.33 0.01 16.0 0.01
+#pragma parameter beam_min_shape "beam_min_shape" 2.0 2.0 32.0 0.1
+#pragma parameter beam_max_shape "beam_max_shape" 4.0 2.0 32.0 0.1
+#pragma parameter beam_shape_power "beam_shape_power" 0.25 0.01 16.0 0.01
+#pragma parameter beam_horiz_filter "beam_horiz_filter" 0.0 0.0 2.0 1.0
+#pragma parameter beam_horiz_sigma "beam_horiz_sigma" 0.35 0.0 0.67 0.005
+#pragma parameter beam_horiz_linear_rgb_weight "beam_horiz_linear_rgb_weight" 1.0 0.0 1.0 0.01
+#pragma parameter convergence_offset_x_r "convergence_offset_x_r" 0.0 -4.0 4.0 0.05
+#pragma parameter convergence_offset_x_g "convergence_offset_x_g" 0.0 -4.0 4.0 0.05
+#pragma parameter convergence_offset_x_b "convergence_offset_x_b" 0.0 -4.0 4.0 0.05
+#pragma parameter convergence_offset_y_r "convergence_offset_y_r" 0.0 -2.0 2.0 0.05
+#pragma parameter convergence_offset_y_g "convergence_offset_y_g" 0.0 -2.0 2.0 0.05
+#pragma parameter convergence_offset_y_b "convergence_offset_y_b" 0.0 -2.0 2.0 0.05
+#pragma parameter mask_type "mask_type" 1.0 0.0 2.0 1.0
+#pragma parameter mask_sample_mode_desired "mask_sample_mode" 0.0 0.0 2.0 1.0   //  Consider blocking mode 2.
+#pragma parameter mask_specify_num_triads "mask_specify_num_triads" 0.0 0.0 1.0 1.0
+#pragma parameter mask_triad_size_desired "mask_triad_size_desired" 3.0 1.0 18.0 0.125
+#pragma parameter mask_num_triads_desired "mask_num_triads_desired" 480.0 342.0 1920.0 1.0
+#pragma parameter aa_subpixel_r_offset_x_runtime "aa_subpixel_r_offset_x" -0.333333333 -0.333333333 0.333333333 0.333333333
+#pragma parameter aa_subpixel_r_offset_y_runtime "aa_subpixel_r_offset_y" 0.0 -0.333333333 0.333333333 0.333333333
+#pragma parameter aa_cubic_c "antialias_cubic_sharpness" 0.5 0.0 4.0 0.015625
+#pragma parameter aa_gauss_sigma "antialias_gauss_sigma" 0.5 0.0625 1.0 0.015625
+#pragma parameter geom_mode_runtime "geom_mode" 0.0 0.0 3.0 1.0
+#pragma parameter geom_radius "geom_radius" 2.0 0.16 1024.0 0.1
+#pragma parameter geom_view_dist "geom_view_dist" 2.0 0.5 1024.0 0.25
+#pragma parameter geom_tilt_angle_x "geom_tilt_angle_x" 0.0 -3.14159265 3.14159265 0.017453292519943295
+#pragma parameter geom_tilt_angle_y "geom_tilt_angle_y" 0.0 -3.14159265 3.14159265 0.017453292519943295
+#pragma parameter geom_aspect_ratio_x "geom_aspect_ratio_x" 432.0 1.0 512.0 1.0
+#pragma parameter geom_aspect_ratio_y "geom_aspect_ratio_y" 329.0 1.0 512.0 1.0
+#pragma parameter geom_overscan_x "geom_overscan_x" 1.0 0.00390625 4.0 0.00390625
+#pragma parameter geom_overscan_y "geom_overscan_y" 1.0 0.00390625 4.0 0.00390625
+#pragma parameter border_size "border_size" 0.015 0.0000001 0.5 0.005
+#pragma parameter border_darkness "border_darkness" 2.0 0.0 16.0 0.0625
+#pragma parameter border_compress "border_compress" 2.5 1.0 64.0 0.0625
+#pragma parameter interlace_bff "interlace_bff" 0.0 0.0 1.0 1.0
+#pragma parameter interlace_1080i "interlace_1080i" 0.0 0.0 1.0 1.0
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "../user-settings.h"
+#include "bind-shader-params.h"
+//#include "../../../../include/gamma-management.h"
+//#include "scanline-functions.h"
+
+// from scanline-functions.h //
+bool is_interlaced(float num_lines)
+{
+    //  Guess from the source's line count whether the video is interlaced.
+    //  Always returns false when interlace_detect is off.
+    if(interlace_detect)
+    {
+        //  Only active lines matter.  Reference line counts:
+        //      NTSC: 525 total, 486 active (2 half-lines), 243 per field
+        //      NTSC emulators: typically 224 or 240
+        //      PAL: 625 total, 576 active (typical), 288 per field
+        //      ATSC: 720p, 1080i, 1080p
+        //  Heuristic: more than 288 and at most 576 lines is probably
+        //  interlaced SD; anything above 576 is probably progressive, except
+        //  exactly 1080 lines, which this function counts as interlaced.
+        //  Every cutoff is nudged by half a line in case the main program
+        //  reports calculated (non-integer) video sizes.
+        //  NOTE(review): the interlace_1080i parameter is not consulted here,
+        //  so 1080-line sources are always reported as interlaced -- confirm
+        //  whether that is intended.
+        const bool sd_lines = (num_lines > 288.5) && (num_lines < 576.5);
+        const bool hd_lines = (num_lines > 1079.5) && (num_lines < 1080.5);
+        return sd_lines || hd_lines;
+    }
+    return false;
+}
+// end scanline-functions.h //
+
+// from gamma-management.h //
+///////////////////////////////  BASE CONSTANTS  ///////////////////////////////
+
+//  Set standard gamma constants, but allow users to override them:
+//  (Defining OVERRIDE_STANDARD_GAMMA skips this whole group, so whoever
+//  defines it must supply every constant that is still referenced.  In the
+//  code below, ntsc_gamma, crt_reference_gamma_high and lcd_office_gamma are
+//  the ones read by the default gamma getters.)
+#ifndef OVERRIDE_STANDARD_GAMMA
+    //  Standard encoding gammas:
+    const float ntsc_gamma = 2.2;    //  Best to use NTSC for PAL too?
+    const float pal_gamma = 2.8;     //  Never actually 2.8 in practice
+    //  Typical device decoding gammas (only use for emulating devices):
+    //  CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
+    //  gammas: The standards purposely undercorrected for an analog CRT's
+    //  assumed 2.5 reference display gamma to maintain contrast in assumed
+    //  [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
+    //  These unstated assumptions about display gamma and perceptual rendering
+    //  intent caused a lot of confusion, and more modern CRT's seemed to target
+    //  NTSC 2.2 gamma with circuitry.  LCD displays seem to have followed suit
+    //  (they struggle near black with 2.5 gamma anyway), especially PC/laptop
+    //  displays designed to view sRGB in bright environments.  (Standards are
+    //  also in flux again with BT.1886, but it's underspecified for displays.)
+    const float crt_reference_gamma_high = 2.5;  //  In (2.35, 2.55)
+    const float crt_reference_gamma_low = 2.35;  //  In (2.35, 2.55)
+    const float lcd_reference_gamma = 2.5;       //  To match CRT
+    const float crt_office_gamma = 2.2;  //  Circuitry-adjusted for NTSC
+    const float lcd_office_gamma = 2.2;  //  Approximates sRGB
+#endif  //  OVERRIDE_STANDARD_GAMMA
+
+//  Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
+//  but only if they're aware of it.
+//  When true, decode_input() and encode_output() write 1.0 to the alpha
+//  channel instead of passing the incoming alpha through.
+#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
+    const bool assume_opaque_alpha = false;
+#endif
+
+
+///////////////////////  DERIVED CONSTANTS AS FUNCTIONS  ///////////////////////
+
+//  gamma-management.h should be compatible with overriding gamma values with
+//  runtime user parameters, but we can only define other global constants in
+//  terms of static constants, not uniform user parameters.  To get around this
+//  limitation, we need to define derived constants using functions.
+
+//  Set device gamma constants, but allow users to override them:
+//  Each getter returns the decoding gamma used for one emulated device (CRT,
+//  Game Boy Advance, LCD).  They are functions rather than constants so the
+//  overriding values may be runtime parameters.
+#ifdef OVERRIDE_DEVICE_GAMMA
+    //  The user promises to globally define the appropriate constants:
+    float get_crt_gamma()    {   return crt_gamma;   }
+    float get_gba_gamma()    {   return gba_gamma;   }
+    float get_lcd_gamma()    {   return lcd_gamma;   }
+#else
+    //  Defaults: high CRT reference gamma, a fixed GBA gamma, office LCD gamma.
+    float get_crt_gamma()    {   return crt_reference_gamma_high;    }
+    float get_gba_gamma()    {   return 3.5; }   //  Game Boy Advance; in (3.0, 4.0)
+    float get_lcd_gamma()    {   return lcd_office_gamma;            }
+#endif  //  OVERRIDE_DEVICE_GAMMA
+
+//  Set decoding/encoding gammas for the first/last passes, but allow overrides.
+//  get_input_gamma() is the gamma the first pass decodes with, and
+//  get_output_gamma() is the gamma the last pass encodes with.  The first
+//  SIMULATE_* macro that is defined (checked in the order below) picks the
+//  source/target device pair; with none defined both fall back to ntsc_gamma.
+#if defined(OVERRIDE_FINAL_GAMMA)
+    //  The user promises to globally define the appropriate constants:
+    float get_intermediate_gamma()   {   return intermediate_gamma;  }
+    float get_input_gamma()          {   return input_gamma;         }
+    float get_output_gamma()         {   return output_gamma;        }
+#else
+    //  If we gamma-correct every pass, always use ntsc_gamma between passes to
+    //  ensure middle passes don't need to care if anything is being simulated:
+    float get_intermediate_gamma()   {   return ntsc_gamma;          }
+    #if defined(SIMULATE_CRT_ON_LCD)
+        float get_input_gamma()      {   return get_crt_gamma();     }
+        float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_LCD)
+        float get_input_gamma()      {   return get_gba_gamma();     }
+        float get_output_gamma()     {   return get_lcd_gamma();     }
+    #elif defined(SIMULATE_LCD_ON_CRT)
+        float get_input_gamma()      {   return get_lcd_gamma();     }
+        float get_output_gamma()     {   return get_crt_gamma();     }
+    #elif defined(SIMULATE_GBA_ON_CRT)
+        float get_input_gamma()      {   return get_gba_gamma();     }
+        float get_output_gamma()     {   return get_crt_gamma();     }
+    #else   //  Don't simulate anything:
+        float get_input_gamma()      {   return ntsc_gamma;          }
+        float get_output_gamma()     {   return ntsc_gamma;          }
+    #endif  //  SIMULATE_*
+#endif  //  OVERRIDE_FINAL_GAMMA
+
+//  Per-pass gamma behavior.  linearize_input / gamma_encode_output tell
+//  decode_input() / encode_output() whether to do any work in this pass, and
+//  get_pass_input_gamma() / get_pass_output_gamma() supply the exponents.
+#ifndef GAMMA_ENCODE_EVERY_FBO
+    //  Intermediate passes stay linear: only a pass compiled with FIRST_PASS
+    //  decodes its input, and only one compiled with LAST_PASS encodes its
+    //  output.  Every other pass gets a gamma of 1.0 and disabled flags.
+    #ifdef FIRST_PASS
+        const bool linearize_input = true;
+        float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        const bool linearize_input = false;
+        float get_pass_input_gamma()     {   return 1.0;                 }
+    #endif
+    #ifdef LAST_PASS
+        const bool gamma_encode_output = true;
+        float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        const bool gamma_encode_output = false;
+        float get_pass_output_gamma()    {   return 1.0;                 }
+    #endif
+#else
+    //  Every pass decodes and re-encodes.  Passes that are neither first nor
+    //  last use the intermediate gamma on that side.
+    const bool linearize_input = true;
+    const bool gamma_encode_output = true;
+    #ifdef FIRST_PASS
+        float get_pass_input_gamma()     {   return get_input_gamma();   }
+    #else
+        float get_pass_input_gamma()     {   return get_intermediate_gamma();    }
+    #endif
+    #ifdef LAST_PASS
+        float get_pass_output_gamma()    {   return get_output_gamma();  }
+    #else
+        float get_pass_output_gamma()    {   return get_intermediate_gamma();    }
+    #endif
+#endif
+
+vec4 decode_input(const vec4 color)
+{
+    //  Convert a sampled color to linear light for this pass.
+    //  Returns `color` unchanged when this pass doesn't linearize its input.
+    //  Otherwise rgb is raised to get_pass_input_gamma(), and alpha is either
+    //  forced to 1.0 (assume_opaque_alpha) or passed through.
+    if(!linearize_input)
+    {
+        return color;
+    }
+    const vec3 linear_rgb = pow(color.rgb, vec3(get_pass_input_gamma()));
+    const float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return vec4(linear_rgb, alpha);
+}
+
+vec4 encode_output(const vec4 color)
+{
+    //  Gamma-encode a linear color for this pass's output.
+    //  Returns `color` unchanged when this pass doesn't encode its output.
+    //  Otherwise rgb is raised to 1.0/get_pass_output_gamma(), and alpha is
+    //  either forced to 1.0 (assume_opaque_alpha) or passed through.
+    if(!gamma_encode_output)
+    {
+        return color;
+    }
+    const vec3 encoded_rgb = pow(color.rgb, vec3(1.0/get_pass_output_gamma()));
+    const float alpha = assume_opaque_alpha ? 1.0 : color.a;
+    return vec4(encoded_rgb, alpha);
+}
+
+//  Sample `tex` at `tex_coords` and pass the texel through decode_input().
+//  A macro is used in place of the commented-out function below.
+#define tex2D_linearize(tex, tex_coords) decode_input(vec4(texture(tex, tex_coords)))
+//vec4 tex2D_linearize(const sampler2D tex, const vec2 tex_coords)
+//{   return decode_input(vec4(texture(tex, tex_coords)));   }
+
+//#define tex2D_linearize(C, D, E) decode_input(vec4(texture(C, D, E)))
+//vec4 tex2D_linearize(const sampler2D tex, const vec2 tex_coords, const int texel_off)
+//{   return decode_input(vec4(texture(tex, tex_coords, texel_off)));    }
+
+// end gamma-management.h //
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 tex_uv;
+layout(location = 1) out vec2 uv_step;
+
+void main()
+{
+    //  Vertex entry point: project the vertex and forward the texture
+    //  coordinate unchanged.
+    tex_uv = TexCoord;
+    gl_Position = params.MVP * Position;
+
+    //  Save the uv distance between texels, taken from SourceSize.zw:
+    uv_step = registers.SourceSize.zw;
+}
+
+#pragma stage fragment
+layout(location = 0) in vec2 tex_uv;
+layout(location = 1) in vec2 uv_step;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+
+void main()
+{
+    //  Fragment entry point: linearize the source texel based on CRT gamma
+    //  and, for interlaced sources, bob the fields.  Bobbing ensures we can
+    //  immediately blur without getting artifacts.
+
+    //  Detect interlacing from the number of lines in the source.
+    const vec2 video_size = registers.SourceSize.xy;
+    const bool interlaced = is_interlaced(video_size.y);
+
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    if(interlace_detect)
+    {
+        //  Sample the current line and an average of the previous/next line;
+        //  tex2D_linearize will decode CRT gamma.  Don't bother branching:
+        const vec2 v_step = vec2(0.0, uv_step.y);
+        const vec3 curr_line = tex2D_linearize(Source, tex_uv).rgb;
+        const vec3 last_line = tex2D_linearize(Source, tex_uv - v_step).rgb;
+        const vec3 next_line = tex2D_linearize(Source, tex_uv + v_step).rgb;
+        const vec3 interpolated_line = 0.5 * (last_line + next_line);
+        //  If we're interlacing, determine which field curr_line is in.
+        //  The modulus is 2.0 for interlaced sources and 1.0 otherwise.  With
+        //  a modulus of 1.0 both mod() calls below yield 0.0 for whole-number
+        //  inputs, so progressive sources keep curr_line untouched.  (This
+        //  must test `interlaced`, not assign to it: an assignment here would
+        //  force the modulus to 2.0 and bob every source.)
+        const float modulus = interlaced ? 2.0 : 1.0;
+        const float field_offset =
+            mod(registers.FrameCount + float(params.interlace_bff), modulus);
+        const float curr_line_texel = tex_uv.y * registers.SourceSize.y;
+        //  Use under_half to fix a rounding bug around exact texel locations.
+        const float line_num_last = floor(curr_line_texel - under_half);
+        const float wrong_field = mod(line_num_last + field_offset, modulus);
+        //  Select the correct color, and output the result:
+        const vec3 color = mix(curr_line, interpolated_line, wrong_field);
+        FragColor = encode_output(vec4(color, 1.0));
+    }
+    else
+    {
+        //  No interlace handling requested: just linearize and re-encode.
+        FragColor = encode_output(tex2D_linearize(Source, tex_uv));
+    }
+}
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/derived-settings-and-constants.h b/crt/shaders/crt-royale/src/derived-settings-and-constants.h
new file mode 100644
index 0000000..356eea3
--- /dev/null
+++ b/crt/shaders/crt-royale/src/derived-settings-and-constants.h
@@ -0,0 +1,315 @@
+#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
+#define DERIVED_SETTINGS_AND_CONSTANTS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  These macros and constants can be used across the whole codebase.
+//  Unlike the values in user-settings.h, end users shouldn't modify these.
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "../user-settings.h"
+#include "user-preset-constants.h"
+
+
+///////////////////////////////  FIXED SETTINGS  ///////////////////////////////
+
+//  Avoid dividing by zero; using a macro overloads for float, vec2, etc.:
+//  Note the result is max(|c|, 2^-16): the sign of c is discarded, so this is
+//  only a drop-in divisor where a non-negative magnitude is wanted.
+#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625))   //  2^-16
+
+//  Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
+//  (The gamma getters choose their device pair from the SIMULATE_* macros.)
+#ifndef SIMULATE_CRT_ON_LCD
+    #define SIMULATE_CRT_ON_LCD
+#endif
+
+//  Manually tiling a manually resized texture creates texture coord derivative
+//  discontinuities and confuses anisotropic filtering, causing discolored tile
+//  seams in the phosphor mask.  Workarounds:
+//  a.) Using tex2Dlod disables anisotropic filtering for tiled masks.  It's
+//      downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
+//      disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
+//  b.) "Tile flat twice" requires drawing two full tiles without border padding
+//      to the resized mask FBO, and it's incompatible with same-pass curvature.
+//      (Same-pass curvature isn't used but could be in the future...maybe.)
+//  c.) "Fix discontinuities" requires derivatives and drawing one tile with
+//      border padding to the resized mask FBO, but it works with same-pass
+//      curvature.  It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
+//  Precedence: a, then b, then c (if multiple strategies are #defined).
+//  All three are requested here; the DERIVED SETTINGS section below removes
+//  the ones the driver capability macros rule out and keeps only the
+//  highest-priority survivor.
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD              //  129.7 FPS, 4x, flat; 101.8 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE       //  128.1 FPS, 4x, flat; 101.5 at fullscreen
+    #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES   //  124.4 FPS, 4x, flat; 97.4 at fullscreen
+//  Also, manually resampling the phosphor mask is slightly blurrier with
+//  anisotropic filtering.  (Resampling with mipmapping is even worse: It
+//  creates artifacts, but only with the fully bloomed shader.)  The difference
+//  is subtle with small triads, but you can fix it for a small cost.
+    //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+
+
+//////////////////////////////  DERIVED SETTINGS  //////////////////////////////
+
+//  Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
+//  geometry mode at runtime, or a 4x4 true Gaussian resize.  Disable
+//  incompatible settings ASAP.  (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
+//  #defined by either user-settings.h or a wrapper .cg that #includes the
+//  current .cg pass.)
+//  bloom_approx_filter is declared in both branches, so later code can use it
+//  unconditionally in place of bloom_approx_filter_static.
+#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+    #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
+        #undef PHOSPHOR_MASK_MANUALLY_RESIZE
+    #endif
+    #ifdef RUNTIME_GEOMETRY_MODE
+        #undef RUNTIME_GEOMETRY_MODE
+    #endif
+    //  Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
+    //  inferior in most cases, so replace 2.0 with 0.0:
+    //  (Any static value above 1.5 is mapped to 0.0; others pass through.)
+     const float bloom_approx_filter =
+        bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
+#else
+     const float bloom_approx_filter = bloom_approx_filter_static;
+#endif
+
+//  Disable slow runtime paths if static parameters are used.  Most of these
+//  won't be a problem anyway once the params are disabled, but some will.
+//  (#undef of a name that isn't currently defined is a no-op, so the macros
+//  are removed without first testing for them.)
+#ifndef RUNTIME_SHADER_PARAMS_ENABLE
+    #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
+    #undef RUNTIME_ANTIALIAS_WEIGHTS
+    #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+    #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+    #undef RUNTIME_GEOMETRY_TILT
+    #undef RUNTIME_GEOMETRY_MODE
+    #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#endif
+
+//  Make tex2Dbias a backup for tex2Dlod for wider compatibility.
+//  (Requesting a TEX2DLOD strategy implicitly requests its TEX2DBIAS
+//  counterpart; the checks below then drop whichever the driver can't run.)
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+#endif
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+#endif
+//  Rule out unavailable anisotropic compatibility strategies:
+//  Without derivatives, "fix discontinuities" is unavailable.
+#ifndef DRIVERS_ALLOW_DERIVATIVES
+    #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+        #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+    #endif
+#endif
+//  Without tex2Dlod, both TEX2DLOD strategies and
+//  ANTIALIAS_DISABLE_ANISOTROPIC are removed.
+#ifndef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #endif
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        #undef ANTIALIAS_DISABLE_ANISOTROPIC
+    #endif
+#endif
+//  Without tex2Dbias, the TEX2DBIAS backups are removed as well.
+#ifndef DRIVERS_ALLOW_TEX2DBIAS
+    #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #endif
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+//  Prioritize anisotropic tiling compatibility strategies by performance and
+//  disable unused strategies.  The first strategy still defined, in the order
+//  TEX2DLOD, TEX2DBIAS, TILE_FLAT_TWICE, wins and every lower-priority one is
+//  removed, so at most one tiling strategy stays defined afterwards.
+//  (#undef of a name that isn't defined is a no-op, so no guards are needed.)
+#if defined(ANISOTROPIC_TILING_COMPAT_TEX2DLOD)
+    #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+#elif defined(ANISOTROPIC_TILING_COMPAT_TEX2DBIAS)
+    #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+    #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+#elif defined(ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE)
+    //  ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
+    //  flat texture coords in the same pass, but that's all we use.
+    #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+#endif
+//  The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
+//  reduce some #ifdef nesting in the next section by essentially OR'ing them:
+//  ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY is defined when either survives.
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
+    #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+#endif
+//  Prioritize anisotropic resampling compatibility strategies the same way:
+//  tex2Dlod wins over its tex2Dbias backup when both are still defined.
+#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+        #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+    #endif
+#endif
+
+
+///////////////////////  DERIVED PHOSPHOR MASK CONSTANTS  //////////////////////
+
+//  If we can use the large mipmapped LUT without mipmapping artifacts, we
+//  should: It gives us more options for using fewer samples.
+//  mask_resize_src_lut_size is declared in every branch: the large LUT size
+//  only when tex2Dlod is both allowed and selected for resampling, and the
+//  small LUT size otherwise.
+#ifdef DRIVERS_ALLOW_TEX2DLOD
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        //  TODO: Take advantage of this!
+        #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
+         const vec2 mask_resize_src_lut_size = mask_texture_large_size;
+    #else
+         const vec2 mask_resize_src_lut_size = mask_texture_small_size;
+    #endif
+#else
+     const vec2 mask_resize_src_lut_size = mask_texture_small_size;
+#endif
+
+
+//  tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
+//  main_fragment, or a static alias of one of the above.  This makes it hard
+//  to select the phosphor mask at runtime: We can't even assign to a uniform
+//  global in the vertex shader or select a sampler2D in the vertex shader and
+//  pass it to the fragment shader (even with explicit TEXUNIT# bindings),
+//  because it just gives us the input texture or a black screen.  However, we
+//  can get around these limitations by calling tex2D three times with different
+//  uniform samplers (or resizing the phosphor mask three times altogether).
+//  With dynamic branches, we can process only one of these branches on top of
+//  quickly discarding fragments we don't need (cgc seems able to overcome
+//  limitations around dependent texture fetches inside of branches).  Without
+//  dynamic branches, we have to process every branch for every fragment...which
+//  is slower.  Runtime sampling mode selection is slower without dynamic
+//  branches as well.  Let the user's static #defines decide if it's worth it.
+//  Result: runtime selection is enabled whenever dynamic branches are allowed,
+//  and otherwise only if the user explicitly forces it.
+#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+#else
+    #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+        #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+    #endif
+#endif
+
+//  We need to render some minimum number of tiles in the resize passes.
+//  We need at least 1.0 just to repeat a single tile, and we need extra
+//  padding beyond that for anisotropic filtering, discontinuity fixing,
+//  antialiasing, same-pass curvature (not currently used), etc.  First
+//  determine how many border texels and tiles we need, based on how the result
+//  will be sampled:
+//  The border is accumulated in stages, each constant building on the last:
+//  antialiasing base -> anisotropic filtering -> discontinuity fixing ->
+//  whole texels -> fraction of a tile.
+#ifdef GEOMETRY_EARLY
+         const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
+        //  Most antialiasing filters have a base radius of 4.0 pixels:
+         const float max_aa_base_pixel_border = 4.0 +
+            max_subpixel_offset;
+#else
+     const float max_aa_base_pixel_border = 0.0;
+#endif
+//  Anisotropic filtering adds about 0.5 to the pixel border:
+//  (The tex2Dlod/tex2Dbias family needs no extra border here.)
+#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
+     const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
+#else
+     const float max_aniso_pixel_border = max_aa_base_pixel_border;
+#endif
+//  Fixing discontinuities adds 1.0 more to the pixel border:
+#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+     const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
+#else
+     const float max_tiled_pixel_border = max_aniso_pixel_border;
+#endif
+//  Convert the pixel border to an integer texel border.  Assume same-pass
+//  curvature about triples the texel frequency:
+#ifdef GEOMETRY_EARLY
+     const float max_mask_texel_border =
+        ceil(max_tiled_pixel_border * 3.0);
+#else
+     const float max_mask_texel_border = ceil(max_tiled_pixel_border);
+#endif
+//  Convert the texel border to a tile border using worst-case assumptions:
+//  (Dividing by the smallest allowed tile size in texels, i.e. minimum triad
+//  size times triads per tile, gives the largest border in tile units.)
+ const float max_mask_tile_border = max_mask_texel_border/
+    (mask_min_allowed_triad_size * mask_triads_per_tile);
+
+//  Finally, set the number of resized tiles to render to MASK_RESIZE, and set
+//  the starting texel (inside borders) for sampling it.
+//  Both constants are declared in every branch.  Only the "tile flat twice"
+//  strategy without GEOMETRY_EARLY uses the borderless two-tile layout; every
+//  other configuration renders one tile plus a border on each side.
+#ifndef GEOMETRY_EARLY
+    #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+        //  Special case: Render two tiles without borders.  Anisotropic
+        //  filtering doesn't seem to be a problem here.
+         const float mask_resize_num_tiles = 1.0 + 1.0;
+         const float mask_start_texels = 0.0;
+    #else
+         const float mask_resize_num_tiles = 1.0 +
+            2.0 * max_mask_tile_border;
+         const float mask_start_texels = max_mask_texel_border;
+    #endif
+#else
+     const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
+     const float mask_start_texels = max_mask_texel_border;
+#endif
+
+//  We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
+//  mask_resize_viewport_scale.  This limits the maximum final triad size.
+//  Estimate the minimum number of triads we can split the screen into in each
+//  dimension (we'll be as correct as mask_resize_viewport_scale is):
+//  mask_resize_num_triads is the triad count across the resized tiles, and
+//  dividing it by the FBO's viewport scale gives the per-axis minimum.
+ const float mask_resize_num_triads =
+    mask_resize_num_tiles * mask_triads_per_tile;
+ const vec2 min_allowed_viewport_triads =
+    vec2(mask_resize_num_triads) / mask_resize_viewport_scale;
+
+
+////////////////////////  COMMON MATHEMATICAL CONSTANTS  ///////////////////////
+
+//  Pi, written to 12 decimal places.
+ const float pi = 3.141592653589;
+//  We often want to find the location of the previous texel, e.g.:
+//      const vec2 curr_texel = uv * texture_size;
+//      const vec2 prev_texel = floor(curr_texel - vec2(0.5)) + vec2(0.5);
+//      const vec2 prev_texel_uv = prev_texel / texture_size;
+//  However, many GPU drivers round incorrectly around exact texel locations.
+//  We need to subtract a little less than 0.5 before flooring, and some GPU's
+//  require this value to be farther from 0.5 than others; define it here.
+//      const vec2 prev_texel =
+//          floor(curr_texel - vec2(under_half)) + vec2(0.5);
+ const float under_half = 0.4995;
+
+
+#endif  //  DERIVED_SETTINGS_AND_CONSTANTS_H
+
diff --git a/crt/shaders/crt-royale/src/geometry-functions.h b/crt/shaders/crt-royale/src/geometry-functions.h
new file mode 100644
index 0000000..fff281c
--- /dev/null
+++ b/crt/shaders/crt-royale/src/geometry-functions.h
@@ -0,0 +1,693 @@
+#ifndef GEOMETRY_FUNCTIONS_H
+#define GEOMETRY_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "../user-settings.h"
+#include "derived-settings-and-constants.h"
+#include "bind-shader-params.h"
+
+
+////////////////////////////  MACROS AND CONSTANTS  ////////////////////////////
+
+//  Curvature-related constants (point cloud: center, 4 edge centers, 4 corners):
+#define MAX_POINT_CLOUD_SIZE 9
+
+
+/////////////////////////////  CURVATURE FUNCTIONS /////////////////////////////
+
+vec2 quadratic_solve(const float a, const float b_over_2, const float c)
+{
+    //  Requires:   1.) a, b, and c are the coefficients of a*x^2 + b*x + c
+    //              2.) b_over_2 = b/2.0 (lets the factors of 2.0 cancel out)
+    //              3.) the caller guarantees b_over_2 < 0.0 (avoids a branch)
+    //  Returns:    vec2(first_solution, discriminant); the caller chooses how
+    //              to treat a negative discriminant ("no intersection").  The
+    //              root uses the Kahan/Citardauq form for numerical robustness.
+    const float disc = b_over_2*b_over_2 - a*c;
+    const float denom = sqrt(disc) - b_over_2;
+    return vec2(c/denom, disc);
+}
+
+vec2 intersect_sphere(const vec3 view_vec, const vec3 eye_pos_vec)
+{
+    //  Requires:   1.) view_vec (ray direction) and eye_pos_vec (position of
+    //                  the eye/camera relative to the origin) are 3D vectors
+    //                  in the sphere's local coordinate frame
+    //              2.) geom_radius is a global containing the sphere's radius
+    //  Returns:    quadratic_solve()'s vec2(distance, discriminant) for the ray
+    //              eye_pos_vec + t*view_vec hitting a sphere of radius
+    //              geom_radius; distance is in units of length(view_vec).
+    //              http://wiki.cgsociety.org/index.php/Ray_Sphere_Intersection
+    //  Quadratic coefficients; b has 2.0 factored out (assumed negative):
+    const float quad_a = dot(view_vec, view_vec);
+    const float quad_half_b = dot(view_vec, eye_pos_vec);
+    const float quad_c = dot(eye_pos_vec, eye_pos_vec) - geom_radius*geom_radius;
+    return quadratic_solve(quad_a, quad_half_b, quad_c);
+}
+
+vec2 intersect_cylinder(const vec3 view_vec, const vec3 eye_pos_vec)
+{
+    //  Requires:   1.) view_vec (ray direction) and eye_pos_vec (position of
+    //                  the eye/camera relative to the origin) are 3D vectors
+    //                  in the cylinder's local coordinate frame
+    //              2.) geom_radius is a global containing the cylinder's radius
+    //  Returns:    quadratic_solve()'s vec2(distance, discriminant) for the ray
+    //              eye_pos_vec + t*view_vec hitting an infinite cylinder of
+    //              radius geom_radius; distance is in units of length(view_vec).
+    //              Coefficients: Christer Ericson's Real-Time Collision
+    //              Detection, p. 195-196, reduced with Lagrange's identity.
+    //  Unit axis direction and an arbitrary reference point ("cylinder top"):
+    const vec3 axis_dir = vec3(0.0, 1.0, 0.0);
+    const vec3 axis_ref_point = vec3(0.0, geom_radius, 0.0);
+    const vec3 ref_to_eye = eye_pos_vec - axis_ref_point;
+    const vec3 axis_x_view = cross(axis_dir, view_vec);
+    const vec3 axis_x_ref_to_eye = cross(axis_dir, ref_to_eye);
+    //  Quadratic coefficients; b has 2.0 factored out (assumed negative):
+    const float quad_a = dot(axis_x_view, axis_x_view);
+    const float quad_half_b = dot(axis_x_ref_to_eye, axis_x_view);
+    //  axis_dir is unit length, so no dot(axis_dir, axis_dir) scale is needed:
+    const float quad_c =
+        dot(axis_x_ref_to_eye, axis_x_ref_to_eye) - geom_radius*geom_radius;
+    return quadratic_solve(quad_a, quad_half_b, quad_c);
+}
+
+vec2 cylinder_xyz_to_uv(const vec3 intersection_pos_local,
+    const vec2 geom_aspect)
+{
+    //  Requires:   An xyz intersection position on a cylinder.
+    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
+    //  Mapping:    square_uv.x is the signed arc length around the cylinder
+    //              in xz-space (zero at x = 0.0, z > 0.0), and
+    //              square_uv.y = -intersection_pos_local.y (+v = -y).
+    //  atan2 keeps the arc length calculation numerically robust:
+    const float theta =
+        atan2(intersection_pos_local.x, intersection_pos_local.z);
+    const float signed_arc = theta * geom_radius;
+    //  [-0.5, 0.5] maps to a "square" area in (arc length, -y) space; dividing
+    //  by the aspect ratio stretches that mapping to the video's proportions:
+    const vec2 uv_square = vec2(signed_arc, -intersection_pos_local.y);
+    return uv_square / geom_aspect;
+}
+
+vec3 cylinder_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect)
+{
+    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
+    //  Returns:    An xyz intersection position on a cylinder.  This is the
+    //              inverse of cylinder_xyz_to_uv().
+    //  Undo the aspect-ratio stretch to get proportionate x/y lengths; the x
+    //  length is an arc length, which geom_radius turns into an angle:
+    const vec2 uv_square = video_uv * geom_aspect;
+    const float theta = uv_square.x / geom_radius;
+    //  Walk that angle around the circle of radius geom_radius in xz-space.
+    //  Or: z = sqrt(geom_radius**2 - x**2)
+    //  Or: z = geom_radius/sqrt(1.0 + tan(angle)**2), x = z * tan(angle)
+    const float x_on_circle = sin(theta) * geom_radius;
+    const float z_on_circle = cos(theta) * geom_radius;
+    //  +v is down but +y is up, so negate the vertical coordinate:
+    return vec3(x_on_circle, -uv_square.y, z_on_circle);
+}
+
+vec2 sphere_xyz_to_uv(const vec3 intersection_pos_local,
+    const vec2 geom_aspect)
+{
+    //  Requires:   An xyz intersection position on a sphere.
+    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
+    //  Mapping:    square_uv points in the same direction as
+    //              (intersection_pos_local.x, -intersection_pos_local.y), and
+    //              length(square_uv) is the great-circle arc length from the
+    //              image center at (0.0, 0.0, geom_radius) to the intersection.
+    //              Credit for this mapping goes to cgwg: I never managed to
+    //              understand his code, but he told me his mapping was based on
+    //              great circle distances when I asked him about it, which
+    //              informed this very similar (almost identical) mapping.
+    //  The angle between the intersection point and the image center comes
+    //  from atan2(|a x b|, a . b), a numerically robust method posted by
+    //  Roger Stafford on comp.soft-sys.matlab:
+    //  https://groups.google.com/d/msg/comp.soft-sys.matlab/zNbUui3bjcA/c0HV_bHSx9cJ
+    const vec3 center_pos = vec3(0.0, 0.0, geom_radius);
+    const float cross_len =
+        length(cross(intersection_pos_local, center_pos));
+    const float dot_prod = dot(intersection_pos_local, center_pos);
+    const float theta = atan2(cross_len, dot_prod);
+    const float arc = theta * geom_radius;
+    //  Point the arc length along the (x, -y) direction (+v is down, +y is
+    //  up), then divide by the aspect ratio to stretch the "square" mapping.
+    //  NOTE(review): normalize() sees a zero vector where x == y == 0.0.
+    const vec2 uv_dir =
+        normalize(vec2(intersection_pos_local.x, -intersection_pos_local.y));
+    const vec2 uv_square = arc * uv_dir;
+    return uv_square / geom_aspect;
+}
+
+vec3 sphere_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect)
+{
+    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
+    //  Returns:    An xyz intersection position on a sphere.  This is the
+    //              inverse of sphere_xyz_to_uv().
+    //  Expand video_uv by the aspect ratio to get proportionate x/y lengths,
+    //  then calculate an xyz position for the spherical mapping above.
+    const vec2 square_uv = video_uv * geom_aspect;
+    //  Using length or sqrt for arc_len butchers the framerate on my 8800GTS
+    //  if this is called too often, and so does taking the max component of
+    //  square_uv/square_uv_unit (program length threshold?), so divide instead.
+    //  NOTE(review): square_uv.y == 0.0 makes this 0.0/0.0 -- confirm handling.
+    const vec2 square_uv_unit = normalize(square_uv);
+    const float arc_len = square_uv.y/square_uv_unit.y;
+    const float angle_from_image_center = arc_len / geom_radius;
+    const float xy_dist_from_sphere_center =
+        sin(angle_from_image_center) * geom_radius;
+    //vec2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len));
+    const vec2 xy_pos = xy_dist_from_sphere_center * square_uv_unit;
+    const float z_pos = cos(angle_from_image_center) * geom_radius;
+    const vec3 intersection_pos_local = vec3(xy_pos.x, -xy_pos.y, z_pos);
+    return intersection_pos_local;
+}
+
+vec2 sphere_alt_xyz_to_uv(const vec3 intersection_pos_local,
+    const vec2 geom_aspect)
+{
+    //  Requires:   An xyz intersection position on a sphere.
+    //  Returns:    video_uv coords mapped to range [-0.5, 0.5]
+    //  Mapping:    square_uv.x is the signed arc length in xz-space, and
+    //              square_uv.y is the signed arc length in yz-space (with y
+    //              negated, since +v = -y).  Both angles are found at once;
+    //              see cylinder_xyz_to_uv() for the very similar 1D version.
+    const vec2 xy_flipped =
+        vec2(intersection_pos_local.x, -intersection_pos_local.y);
+    const vec2 theta = atan2(xy_flipped, intersection_pos_local.zz);
+    const vec2 uv_square = theta * geom_radius;
+    return uv_square / geom_aspect;
+}
+
+vec3 sphere_alt_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect)
+{
+    //  Requires:   video_uv coords mapped to range [-0.5, 0.5]
+    //  Returns:    An xyz intersection position on a sphere.  This is the
+    //              counterpart of sphere_alt_xyz_to_uv().
+    //  See cylinder_uv_to_xyz() for implementation details (very similar).
+    //  The aspect-corrected uv coords are arc lengths along x and y:
+    const vec2 theta = (video_uv * geom_aspect) / geom_radius;
+    const vec2 xy_on_sphere = sin(theta) * geom_radius;
+    //  z follows from x^2 + y^2 + z^2 == geom_radius^2 (nonnegative root):
+    const float z_sq = geom_radius*geom_radius - dot(xy_on_sphere, xy_on_sphere);
+    return vec3(xy_on_sphere.x, -xy_on_sphere.y, sqrt(z_sq));
+}
+
+inline vec2 intersect(const vec3 view_vec_local, const vec3 eye_pos_local,
+    const float geom_mode)
+{   //  Both spherical mappings (geom_mode < 2.5) share one ray-sphere test.
+    if(geom_mode < 2.5) return intersect_sphere(view_vec_local, eye_pos_local);
+    return intersect_cylinder(view_vec_local, eye_pos_local);
+}
+
+inline vec2 xyz_to_uv(const vec3 intersection_pos_local,
+    const vec2 geom_aspect, const float geom_mode)
+{
+    if(geom_mode < 1.5)         //  Sphere
+        return sphere_xyz_to_uv(intersection_pos_local, geom_aspect);
+    if(geom_mode < 2.5)         //  Sphere, alternate mapping
+        return sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect);
+    return cylinder_xyz_to_uv(intersection_pos_local, geom_aspect);
+}
+
+inline vec3 uv_to_xyz(const vec2 uv, const vec2 geom_aspect,
+    const float geom_mode)
+{
+    if(geom_mode < 1.5) { return sphere_uv_to_xyz(uv, geom_aspect); }
+    if(geom_mode < 2.5) { return sphere_alt_uv_to_xyz(uv, geom_aspect); }
+    return cylinder_uv_to_xyz(uv, geom_aspect);     //  Cylinder otherwise
+}
+
+vec2 view_vec_to_uv(const vec3 view_vec_local, const vec3 eye_pos_local,
+    const vec2 geom_aspect, const float geom_mode, out vec3 intersection_pos)
+{
+    //  Cast the view ray (already in the primitive's local coordinate frame)
+    //  at the sphere/cylinder: .x = ray distance, .y = discriminant.
+    const vec2 dist_and_disc =
+        intersect(view_vec_local, eye_pos_local, geom_mode);
+    //  Walk along the ray to the hit point, and hand it back to the caller
+    //  through the out parameter as well:
+    const vec3 hit_pos = eye_pos_local + view_vec_local * dist_and_disc.x;
+    intersection_pos = hit_pos;
+    //  A discriminant at or below 0.005 is treated as a miss, which yields
+    //  the out-of-range coords vec2(1.0) rather than a uv mapping:
+    return dist_and_disc.y > 0.005 ?
+        xyz_to_uv(hit_pos, geom_aspect, geom_mode) : vec2(1.0);
+}
+
+vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos,
+    const vec2 geom_aspect, const vec3 global_coords[MAX_POINT_CLOUD_SIZE],
+    const int num_points)
+{
+    //  Requires:   Parameters:
+    //              1.) Starting eye_pos is a global 3D position at which the
+    //                  camera contains all points in global_coords[] in its FOV
+    //              2.) geom_aspect = get_aspect_vector(
+    //                      IN.output_size.x / IN.output_size.y);
+    //              3.) global_coords holds global xyz coords of extreme points
+    //                  on the simulated CRT screen; num_points of them are used.
+    //              Globals:
+    //              1.) geom_view_dist must be > 0.0.  It controls the "near
+    //                  plane" used to interpret flat_video_uv as a view
+    //                  vector, which controls the field of view (FOV).
+    //              Eyespace coordinate frame: +x = right, +y = up, +z = back
+    //  Returns:    Return an eye position at which the point cloud spans as
+    //              much of the screen as possible (given the FOV controlled by
+    //              geom_view_dist) without being cropped or sheared.
+    //  Algorithm:
+    //  1.) Move the eye laterally to a point which attempts to maximize the
+    //      amount we can move forward without clipping the CRT screen.
+    //  2.) Move forward by as much as possible without clipping the CRT.
+    //  Get the allowed movement range by solving for the eye_pos offsets
+    //  that result in each point being projected to a screen edge/corner in
+    //  pseudo-normalized device coords (where xy ranges from [-0.5, 0.5]
+    //  and z = eyespace z):
+    //      pndc_coord = vec3(vec2(eyespace_xyz.x, -eyespace_xyz.y)*
+    //      geom_view_dist / (geom_aspect * -eyespace_xyz.z), eyespace_xyz.z);
+    //  Notes:
+    //  The field of view is controlled by geom_view_dist's magnitude relative to
+    //  the view vector's x and y components:
+    //      view_vec.xy ranges from [-0.5, 0.5] * geom_aspect
+    //      view_vec.z = -geom_view_dist
+    //  But for the purposes of perspective divide, it should be considered:
+    //      view_vec.xy ranges from [-0.5, 0.5] * geom_aspect / geom_view_dist
+    //      view_vec.z = -1.0
+    const int max_centering_iters = 1;  //  Keep for easy testing.
+    for(int iter = 0; iter < max_centering_iters; iter++)
+    {
+        //  0.) Get the eyespace coordinates of our point cloud:
+        vec3 eyespace_coords[MAX_POINT_CLOUD_SIZE];
+        for(int i = 0; i < num_points; i++)
+        {
+            eyespace_coords[i] = global_coords[i] - eye_pos;
+        }
+        //  1a.)For each point, find out how far we can move eye_pos in each
+        //      lateral direction without the point clipping the frustum.
+        //      Eyespace +y = up, screenspace +y = down, so flip y after
+        //      applying the eyespace offset (on the way to "clip space").
+        //  Solve for two offsets per point based on:
+        //      (eyespace_xyz.xy - offset_dr) * vec2(1.0, -1.0) *
+        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = vec2(-0.5)
+        //      (eyespace_xyz.xy - offset_ul) * vec2(1.0, -1.0) *
+        //      geom_view_dist / (geom_aspect * -eyespace_xyz.z) = vec2(0.5)
+        //  offset_ul and offset_dr represent the farthest we can move the
+        //  eye_pos up-left and down-right.  Save the min of all offset_dr's
+        //  and the max of all offset_ul's (since it's negative).
+        float abs_radius = abs(geom_radius);  //  In case anyone gets ideas. ;)
+        vec2 offset_dr_min = vec2(10.0 * abs_radius, 10.0 * abs_radius);
+        vec2 offset_ul_max = vec2(-10.0 * abs_radius, -10.0 * abs_radius);
+        for(int i = 0; i < num_points; i++)
+        {
+            const vec2 flipy = vec2(1.0, -1.0);
+            vec3 eyespace_xyz = eyespace_coords[i];
+            vec2 offset_dr = eyespace_xyz.xy - vec2(-0.5) *
+                (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
+            vec2 offset_ul = eyespace_xyz.xy - vec2(0.5) *
+                (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
+            offset_dr_min = min(offset_dr_min, offset_dr);
+            offset_ul_max = max(offset_ul_max, offset_ul);
+        }
+        //  1b.)Update eye_pos: Adding the average of offset_ul_max and
+        //      offset_dr_min gives it equal leeway on the top vs. bottom
+        //      and left vs. right.  Recalculate eyespace_coords accordingly.
+        vec2 center_offset = 0.5 * (offset_ul_max + offset_dr_min);
+        eye_pos.xy += center_offset;
+        for(int i = 0; i < num_points; i++)
+        {
+            eyespace_coords[i] = global_coords[i] - eye_pos;
+        }
+        //  2a.)For each point, find out how far we can move eye_pos forward
+        //      without the point clipping the frustum.  Flip the y
+        //      direction in advance (matters for a later step, not here).
+        //      Solve for four offsets per point based on:
+        //      eyespace_xyz_flipy.x * geom_view_dist /
+        //          (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) =-0.5
+        //      eyespace_xyz_flipy.y * geom_view_dist /
+        //          (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) =-0.5
+        //      eyespace_xyz_flipy.x * geom_view_dist /
+        //          (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) = 0.5
+        //      eyespace_xyz_flipy.y * geom_view_dist /
+        //          (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) = 0.5
+        //      The computation is vectorized.  Take the max of these four per
+        //      point and across points (max because offset.z is negative).
+        //      NOTE(review): the seed uses geom_radius, not abs_radius -- confirm.
+        float offset_z_max = -10.0 * geom_radius * geom_view_dist;
+        for(int i = 0; i < num_points; i++)
+        {
+            vec3 eyespace_xyz_flipy = eyespace_coords[i] *
+                vec3(1.0, -1.0, 1.0);
+            vec4 offset_zzzz = eyespace_xyz_flipy.zzzz +
+                (eyespace_xyz_flipy.xyxy * geom_view_dist) /
+                (vec4(-0.5, -0.5, 0.5, 0.5) * vec4(geom_aspect, geom_aspect));
+            //  Ignore offsets that push positive x/y values to opposite
+            //  boundaries, and vice versa, and don't let the camera move
+            //  past a point in the dead center of the screen:
+            offset_z_max = (eyespace_xyz_flipy.x < 0.0) ?
+                max(offset_z_max, offset_zzzz.x) : offset_z_max;
+            offset_z_max = (eyespace_xyz_flipy.y < 0.0) ?
+                max(offset_z_max, offset_zzzz.y) : offset_z_max;
+            offset_z_max = (eyespace_xyz_flipy.x > 0.0) ?
+                max(offset_z_max, offset_zzzz.z) : offset_z_max;
+            offset_z_max = (eyespace_xyz_flipy.y > 0.0) ?
+                max(offset_z_max, offset_zzzz.w) : offset_z_max;
+            offset_z_max = max(offset_z_max, eyespace_xyz_flipy.z);
+        }
+        //  2b.)Update eye_pos: Add the maximum (smallest negative) z offset.
+        eye_pos.z += offset_z_max;
+    }
+    return eye_pos;
+}
+
+vec3 get_ideal_global_eye_pos(const vec3x3 local_to_global,
+    const vec2 geom_aspect, const float geom_mode)
+{
+    //  Returns:    A global eye position: the conservative initial one if any
+    //              sample point has global z < 0.0, else the optimized one.
+    //  Start with an initial eye_pos that includes the entire primitive
+    //  (sphere or cylinder) in its field-of-view:
+    const vec3 high_view = vec3(0.0, geom_aspect.y, -geom_view_dist);
+    const vec3 low_view = high_view * vec3(1.0, -1.0, 1.0);
+    const float len_sq = dot(high_view, high_view);
+    const float fov = abs(acos(dot(high_view, low_view)/len_sq));
+    //  Trigonometry/similar triangles say distance = geom_radius/sin(fov/2):
+    const float eye_z_spherical = geom_radius/sin(fov*0.5);
+    const vec3 eye_pos = geom_mode < 2.5 ?
+        vec3(0.0, 0.0, eye_z_spherical) :
+        vec3(0.0, 0.0, max(geom_view_dist, eye_z_spherical));
+    //  Get global xyz coords of extreme sample points on the simulated CRT
+    //  screen.  Start with the center, edge centers, and corners of the
+    //  video image.  We can't ignore backfacing points: They're occluded
+    //  by closer points on the primitive, but they may NOT be occluded by
+    //  the convex hull of the remaining samples (i.e. the remaining convex
+    //  hull might not envelope points that do occlude a back-facing point.)
+    const int num_points = MAX_POINT_CLOUD_SIZE;
+    vec3 global_coords[MAX_POINT_CLOUD_SIZE];
+    global_coords[0] = mul(local_to_global, uv_to_xyz(vec2(0.0, 0.0), geom_aspect, geom_mode));
+    global_coords[1] = mul(local_to_global, uv_to_xyz(vec2(0.0, -0.5), geom_aspect, geom_mode));
+    global_coords[2] = mul(local_to_global, uv_to_xyz(vec2(0.0, 0.5), geom_aspect, geom_mode));
+    global_coords[3] = mul(local_to_global, uv_to_xyz(vec2(-0.5, 0.0), geom_aspect, geom_mode));
+    global_coords[4] = mul(local_to_global, uv_to_xyz(vec2(0.5, 0.0), geom_aspect, geom_mode));
+    global_coords[5] = mul(local_to_global, uv_to_xyz(vec2(-0.5, -0.5), geom_aspect, geom_mode));
+    global_coords[6] = mul(local_to_global, uv_to_xyz(vec2(0.5, -0.5), geom_aspect, geom_mode));
+    global_coords[7] = mul(local_to_global, uv_to_xyz(vec2(-0.5, 0.5), geom_aspect, geom_mode));
+    global_coords[8] = mul(local_to_global, uv_to_xyz(vec2(0.5, 0.5), geom_aspect, geom_mode));
+    //  Adding more inner image points could help in extreme cases, but too many
+    //  points will kill the framerate.  For safety, default to the initial
+    //  eye_pos if any z coords are negative; else outsource the optimization:
+    float num_negative_z_coords = 0.0;
+    for(int i = 0; i < num_points; i++)
+    {
+        num_negative_z_coords += float(global_coords[i].z < 0.0);
+    }
+    return num_negative_z_coords > 0.5 ? eye_pos :
+        get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect,
+            global_coords, num_points);
+}
+
+vec3x3 get_pixel_to_object_matrix(const vec3x3 global_to_local,
+    const vec3 eye_pos_local, const vec3 view_vec_global,
+    const vec3 intersection_pos_local, const vec3 normal,
+    const vec2 output_size_inv)
+{
+    //  Requires:   See get_curved_video_uv_coords_and_tangent_matrix for
+    //              descriptions of each parameter.
+    //  Returns:    Return a transformation matrix from 2D pixel-space vectors
+    //              (where (+1.0, +1.0) is a vector to one pixel down-right,
+    //              i.e. same directionality as uv texels) to 3D object-space
+    //              vectors in the CRT's local coordinate frame (right-handed)
+    //              ***which are tangent to the CRT surface at the intersection
+    //              position.***  (Basically, we want to convert pixel-space
+    //              vectors to 3D vectors along the CRT's surface, for later
+    //              conversion to uv vectors.)
+    //  Shorthand inputs:
+    const vec3 pos = intersection_pos_local;
+    const vec3 eye_pos = eye_pos_local;
+    //  Get a piecewise-linear matrix transforming from "pixelspace" offset
+    //  vectors (1.0 = one pixel) to object space vectors in the tangent
+    //  plane (faster than finding 3 view-object intersections).
+    //  1.) Get the local view vecs for the pixels to the right and down:
+    const vec3 view_vec_right_global = view_vec_global +
+        vec3(output_size_inv.x, 0.0, 0.0);
+    const vec3 view_vec_down_global = view_vec_global +
+        vec3(0.0, -output_size_inv.y, 0.0);
+    const vec3 view_vec_right_local =
+        mul(global_to_local, view_vec_right_global);
+    const vec3 view_vec_down_local =
+        mul(global_to_local, view_vec_down_global);
+    //  2.) Using the true intersection point, intersect the neighboring
+    //      view vectors with the tangent plane (the dot product is a scalar):
+    const float intersection_vec_dot_normal = dot(pos - eye_pos, normal);
+    const vec3 right_pos = eye_pos + (intersection_vec_dot_normal /
+        dot(view_vec_right_local, normal))*view_vec_right_local;
+    const vec3 down_pos = eye_pos + (intersection_vec_dot_normal /
+        dot(view_vec_down_local, normal))*view_vec_down_local;
+    //  3.) Subtract the original intersection pos from its neighbors; the
+    //      resulting vectors are object-space vectors tangent to the plane.
+    //      These vectors are the object-space transformations of (1.0, 0.0)
+    //      and (0.0, 1.0) pixel offsets, so they form the first two basis
+    //      vectors of a pixelspace to object space transformation.  This
+    //      transformation is 2D to 3D, so use (0, 0, 0) for the third vector.
+    const vec3 object_right_vec = right_pos - pos;
+    const vec3 object_down_vec = down_pos - pos;
+    const vec3x3 pixel_to_object = vec3x3(
+        object_right_vec.x, object_down_vec.x, 0.0,
+        object_right_vec.y, object_down_vec.y, 0.0,
+        object_right_vec.z, object_down_vec.z, 0.0);
+    return pixel_to_object;
+}
+
+vec3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local,
+    const vec3 normal, const vec2 geom_aspect, const float geom_mode)
+{
+    //  Requires:   See get_curved_video_uv_coords_and_tangent_matrix for
+    //              descriptions of each parameter.
+    //  Returns:    Return a transformation matrix from 3D object-space vectors
+    //              in the CRT's local coordinate frame (right-handed, +y = up)
+    //              to 2D video_uv vectors (+v = down).
+    //  Description:
+    //  The TBN matrix formed by the [tangent, bitangent, normal] basis
+    //  vectors transforms ordinary vectors from tangent->object space.
+    //  The cotangent matrix formed by the [cotangent, cobitangent, normal]
+    //  basis vectors transforms normal vectors (covectors) from
+    //  tangent->object space.  It's the inverse-transpose of the TBN matrix.
+    //  We want the inverse of the TBN matrix (transpose of the cotangent
+    //  matrix), which transforms ordinary vectors from object->tangent space.
+    //  Start by calculating the relevant basis vectors in accordance with
+    //  Christian Schüler's blog post "Followup: Normal Mapping Without
+    //  Precomputed Tangents":  http://www.thetenthplanet.de/archives/1180
+    //  With our particular uv mapping, the scale of the u and v directions
+    //  is determined entirely by the aspect ratio for cylindrical and ordinary
+    //  spherical mappings, and so tangent and bitangent lengths are also
+    //  determined by it (the alternate mapping is more complex).  Therefore, we
+    //  must ensure appropriate cotangent and cobitangent lengths as well.
+    //  Base these off the uv<=>xyz mappings for each primitive.
+    const vec3 pos = intersection_pos_local;
+    const vec3 x_vec = vec3(1.0, 0.0, 0.0);
+    const vec3 y_vec = vec3(0.0, 1.0, 0.0);
+    //  The tangent and bitangent vectors correspond with increasing u and v,
+    //  respectively.  Mathematically we'd base the cotangent/cobitangent on
+    //  those, but we'll compute the cotangent/cobitangent directly when we can.
+    vec3 cotangent_unscaled, cobitangent_unscaled;
+    //  geom_mode should be constant-folded without RUNTIME_GEOMETRY_MODE.
+    if(geom_mode < 1.5)
+    {
+        //  Sphere:
+        //  tangent = normalize(cross(normal, cross(x_vec, pos))) * geom_aspect.x
+        //  bitangent = normalize(cross(cross(y_vec, pos), normal)) * geom_aspect.y
+        //  inv_determinant = 1.0/length(cross(bitangent, tangent))
+        //  cotangent = cross(normal, bitangent) * inv_determinant
+        //            == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant
+        //  cobitangent = cross(tangent, normal) * inv_determinant
+        //            == normalize(cross(x_vec, pos)) * geom_aspect.x * inv_determinant
+        //  Simplified (scale by inv_determinant below):
+        cotangent_unscaled = normalize(cross(y_vec, pos)) * geom_aspect.y;
+        cobitangent_unscaled = normalize(cross(x_vec, pos)) * geom_aspect.x;
+    }
+    else if(geom_mode < 2.5)
+    {
+        //  Sphere, alternate mapping:
+        //  This mapping works a bit like the cylindrical mapping in two
+        //  directions, which makes the lengths and directions more complex.
+        //  Unfortunately, I can't find much of a shortcut:
+        const vec3 tangent = normalize(
+            cross(y_vec, vec3(pos.x, 0.0, pos.z))) * geom_aspect.x;
+        const vec3 bitangent = normalize(
+            cross(x_vec, vec3(0.0, pos.yz))) * geom_aspect.y;
+        cotangent_unscaled = cross(normal, bitangent);
+        cobitangent_unscaled = cross(tangent, normal);
+    }
+    else
+    {
+        //  Cylinder:
+        //  tangent = normalize(cross(y_vec, normal)) * geom_aspect.x;
+        //  bitangent = vec3(0.0, -geom_aspect.y, 0.0);
+        //  inv_determinant = 1.0/length(cross(bitangent, tangent))
+        //  cotangent = cross(normal, bitangent) * inv_determinant
+        //            == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant
+        //  cobitangent = cross(tangent, normal) * inv_determinant
+        //            == vec3(0.0, -geom_aspect.x, 0.0) * inv_determinant
+        cotangent_unscaled = cross(y_vec, normal) * geom_aspect.y;
+        cobitangent_unscaled = vec3(0.0, -geom_aspect.x, 0.0);
+    }
+    const vec3 computed_normal =
+        cross(cobitangent_unscaled, cotangent_unscaled);
+    const float inv_determinant = rsqrt(dot(computed_normal, computed_normal));
+    const vec3 cotangent = cotangent_unscaled * inv_determinant;
+    const vec3 cobitangent = cobitangent_unscaled * inv_determinant;
+    //  [cotangent, cobitangent, normal] as columns is the inverse-transpose TBN;
+    //  we want its transpose (rows).  NOTE(review): confirm vec3x3() takes rows.
+    const vec3x3 object_to_tangent = vec3x3(cotangent, cobitangent, normal);
+    return object_to_tangent;
+}
+
+vec2 get_curved_video_uv_coords_and_tangent_matrix(
+    const vec2 flat_video_uv, const vec3 eye_pos_local,
+    const vec2 output_size_inv, const vec2 geom_aspect,
+    const float geom_mode, const vec3x3 global_to_local,
+    out vec2x2 pixel_to_tangent_video_uv)
+{
+    //  Requires:   Parameters:
+    //              1.) flat_video_uv coords are in range [0.0, 1.0], where
+    //                  (0.0, 0.0) is the top-left corner of the screen and
+    //                  (1.0, 1.0) is the bottom-right corner.
+    //              2.) eye_pos_local is the 3D camera position in the simulated
+    //                  CRT's local coordinate frame.  For best results, it must
+    //                  be computed based on the same geom_view_dist used here.
+    //              3.) output_size_inv = vec2(1.0)/IN.output_size
+    //              4.) geom_aspect = get_aspect_vector(
+    //                      IN.output_size.x / IN.output_size.y);
+    //              5.) geom_mode is a static or runtime mode setting:
+    //                  0 = off, 1 = sphere, 2 = sphere alt., 3 = cylinder
+    //              6.) global_to_local is a 3x3 matrix transforming (ordinary)
+    //                  worldspace vectors to the CRT's local coordinate frame
+    //              Globals:
+    //              1.) geom_view_dist must be > 0.0.  It controls the "near
+    //                  plane" used to interpret flat_video_uv as a view
+    //                  vector, which controls the field of view (FOV).
+    //  Returns:    Return final uv coords in [0.0, 1.0], and return a pixel-
+    //              space to video_uv tangent-space matrix in the out parameter.
+    //              (This matrix assumes pixel-space +y = down, like +v = down.)
+    //              We'll transform flat_video_uv into a view vector, project
+    //              the view vector from the camera/eye, intersect with a sphere
+    //              or cylinder representing the simulated CRT, and convert the
+    //              intersection position into final uv coords and a local
+    //              transformation matrix.
+    //  First get the 3D view vector (geom_view_dist is the only global used):
+    //  1.) Center uv around (0.0, 0.0) and make (-0.5, -0.5) and (0.5, 0.5)
+    //      correspond to the top-left/bottom-right output screen corners.
+    //  2.) Multiply by geom_aspect to preemptively "undo" Retroarch's screen-
+    //      space 2D aspect correction.  We'll reapply it in uv-space.
+    //  3.) (x, y) = (u, -v), because +v is down in 2D screenspace, but +y
+    //      is up in 3D worldspace (enforce a right-handed system).
+    //  4.) The view vector z controls the "near plane" distance and FOV.
+    //      For the effect of "looking through a window" at a CRT, it should be
+    //      set equal to the user's distance from their physical screen, in
+    //      units of the viewport's physical diagonal size.
+    const vec2 view_uv = (flat_video_uv - vec2(0.5)) * geom_aspect;
+    const vec3 view_vec_global =
+        vec3(view_uv.x, -view_uv.y, -geom_view_dist);
+    //  Transform the view vector into the CRT's local coordinate frame, convert
+    //  to video_uv coords, and get the local 3D intersection position:
+    const vec3 view_vec_local = mul(global_to_local, view_vec_global);
+    vec3 pos;
+    const vec2 centered_uv = view_vec_to_uv(
+        view_vec_local, eye_pos_local, geom_aspect, geom_mode, pos);
+    const vec2 video_uv = centered_uv + vec2(0.5);
+    //  Get a pixel-to-tangent-video-uv matrix.  The caller could deal with
+    //  all but one of these cases, but that would be more complicated.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        //  Derivatives obtain a matrix very fast, but the direction of pixel-
+        //  space +y seems to depend on the pass.  Enforce the correct direction
+        //  on a best-effort basis (but it shouldn't matter for antialiasing).
+        const vec2 duv_dx = ddx(video_uv);
+        const vec2 duv_dy = ddy(video_uv);
+        #ifdef LAST_PASS
+            pixel_to_tangent_video_uv = vec2x2(
+                duv_dx.x, duv_dy.x,
+                -duv_dx.y, -duv_dy.y);
+        #else
+            pixel_to_tangent_video_uv = vec2x2(
+                duv_dx.x, duv_dy.x,
+                duv_dx.y, duv_dy.y);
+        #endif
+    #else
+        //  Manually define a transformation matrix.  We'll assume pixel-space
+        //  +y = down, just like +v = down.
+        if(geom_force_correct_tangent_matrix)
+        {
+            //  Get the surface normal based on the local intersection position:
+            const vec3 normal_base = geom_mode < 2.5 ? pos :
+                vec3(pos.x, 0.0, pos.z);
+            const vec3 normal = normalize(normal_base);
+            //  Get pixel-to-object and object-to-tangent matrices and combine
+            //  them into a 2x2 pixel-to-tangent matrix for video_uv offsets:
+            const vec3x3 pixel_to_object = get_pixel_to_object_matrix(
+                global_to_local, eye_pos_local, view_vec_global, pos, normal,
+                output_size_inv);
+            const vec3x3 object_to_tangent = get_object_to_tangent_matrix(
+                pos, normal, geom_aspect, geom_mode);
+            const vec3x3 pixel_to_tangent3x3 =
+                mul(object_to_tangent, pixel_to_object);
+            pixel_to_tangent_video_uv = vec2x2(
+                pixel_to_tangent3x3._m00_m01_m10_m11);
+        }
+        else
+        {
+            //  Ignore curvature, and just consider flat scaling.  The
+            //  difference is only apparent with strong curvature:
+            pixel_to_tangent_video_uv = vec2x2(
+                output_size_inv.x, 0.0, 0.0, output_size_inv.y);
+        }
+    #endif
+    return video_uv;
+}
+
+float get_border_dim_factor(const vec2 video_uv, const vec2 geom_aspect)
+{
+    //  COPYRIGHT NOTE FOR THIS FUNCTION:
+    //  Copyright (C) 2010-2012 cgwg, 2014 TroggleMonkey
+    //  This function uses an algorithm first coded in several of cgwg's GPL-
+    //  licensed lines in crt-geom-curved.cg and its ancestors.  The line
+    //  between algorithm and code is nearly indistinguishable here, so it's
+    //  unclear whether I could even release this project under a non-GPL
+    //  license with this function included.
+
+    //  Calculate border_dim_factor from the proximity to uv-space image
+    //  borders; border_size/border_darkness/border_compress are globals:
+    const vec2 edge_dists = min(video_uv, vec2(1.0) - video_uv) *
+        geom_aspect;
+    const vec2 border_penetration =
+        max(vec2(border_size) - edge_dists, vec2(0.0));
+    const float penetration_ratio = length(border_penetration)/border_size;
+    const float border_escape_ratio = max(1.0 - penetration_ratio, 0.0);
+    const float border_dim_factor =
+        pow(border_escape_ratio, border_darkness) * max(1.0, border_compress);
+    return min(border_dim_factor, 1.0);
+}
+
+
+
+#endif  //  GEOMETRY_FUNCTIONS_H
+
+
+
diff --git a/crt/shaders/crt-royale/src/phosphor-mask-resizing.h b/crt/shaders/crt-royale/src/phosphor-mask-resizing.h
new file mode 100644
index 0000000..be26624
--- /dev/null
+++ b/crt/shaders/crt-royale/src/phosphor-mask-resizing.h
@@ -0,0 +1,677 @@
+#ifndef PHOSPHOR_MASK_RESIZING_H
+#define PHOSPHOR_MASK_RESIZING_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "../user-settings.h"
+#include "derived-settings-and-constants.h"
+
+/////////////////////////////  CODEPATH SELECTION  /////////////////////////////
+
+//  Choose a looping strategy based on what's allowed:
+//  Dynamic loops not allowed: Use a flat static loop.
+//  Dynamic loops accommodated: Coarsely branch around static loops.
+//  Dynamic loops assumed allowed: Use a flat dynamic loop.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif  //  No else needed: Dynamic loops assumed.
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+//  The larger the resized tile, the fewer samples we'll need for downsizing.
+//  The expected min tile size is simply mask_min_allowed_tile_size here:
+const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+const float mask_min_expected_tile_size = 
+        mask_min_allowed_tile_size;
+//  Limit the number of sinc resize taps by the maximum minification factor:
+const float pi_over_lobes = pi/mask_sinc_lobes;
+const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+//  Vectorized loops sample in multiples of 4.  Round up to be safe:
+const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+/////////////////////////  RESAMPLING FUNCTION HELPERS  ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    //  Requires:   The following global constants must be defined:
+    //              1.) mask_sinc_lobes
+    //              2.) max_sinc_resize_samples_m4
+    //  Returns:    The min number of texture samples for a correct downsize at
+    //              magnification_scale, rounded up to a multiple of 4 and capped.
+    //  We're downsizing, so the filter is sized across 2*lobes output pixels
+    //  (not 2*lobes input texels).  This impacts distance measurements and the
+    //  minimum number of input samples needed.
+    const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale;
+    const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0;
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float max_samples_m4 = max_sinc_resize_samples_m4;
+    #else   // ifdef BREAK_LOOPS_INTO_PIECES
+        //  Simulating loops with branches imposes a 128-sample limit.
+        const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    return min(min_samples_m4, max_samples_m4);
+}
+
+vec2 get_first_texel_tile_uv_and_dist(const vec2 tex_uv, 
+    const vec2 texture_size, const float dr, 
+    const float input_tiles_per_texture_r, const float samples,
+    const bool vertical)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) input_tiles_per_texture_r is the number of input tiles
+    //                  that can fit in the input texture in the direction we're
+    //                  resampling this pass.
+    //              3.) vertical indicates whether we're resampling vertically
+    //                  this pass (or horizontally).
+    //  Returns:    Pack and return the first sample's tile_uv coord in [0, 1]
+    //              and its texel distance from the destination pixel, in the
+    //              resized dimension only.
+    //  We'll start with the topmost or leftmost sample and work down or right,
+    //  so get the first sample location and distance.  Modify both dimensions
+    //  as if we're doing a one-pass 2D resize; we'll throw away the unneeded
+    //  (and incorrect) dimension at the end.
+    const vec2 curr_texel = tex_uv * texture_size;
+    const vec2 prev_texel =
+        floor(curr_texel - vec2(under_half)) + vec2(0.5);
+    const vec2 first_texel = prev_texel - vec2(samples/2.0 - 1.0);
+    const vec2 first_texel_uv_wrap_2D = first_texel * dr;
+    const vec2 first_texel_dist_2D = curr_texel - first_texel;
+    //  Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const vec2 first_texel_tile_uv_wrap_2D =
+        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
+    //  Project wrapped coordinates to the [0, 1] range.  We'll do this with all
+    //  samples, but the first texel is special, since it might be negative.
+    const vec2 coord_negative =
+        vec2(first_texel_tile_uv_wrap_2D < vec2(0.0));
+    const vec2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
+    //  Pack the first texel's tile_uv coord and texel distance in 1D:
+    const vec2 tile_u_and_dist =
+        vec2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const vec2 tile_v_and_dist =
+        vec2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    return vertical ? tile_v_and_dist : tile_u_and_dist;
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
+}
+
+inline vec4 tex2Dlod0try(const sampler2D tex, const vec2 tex_uv)
+{
+    //  Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    //  [Slow] workarounds: request LOD 0.0, else a -16.0 bias, else sample plainly:
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return tex2Dlod(tex, vec4(tex_uv, 0.0, 0.0));
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, vec4(tex_uv, 0.0, -16.0));
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+//////////////////////////////  LOOP BODY MACROS  //////////////////////////////
+
+//  Using inline functions can exceed the temporary register limit, so we're
+//  stuck with #define macros (I'm TRULY sorry).  They're declared here instead
+//  of above to be closer to the actual invocation sites.  Steps:
+//  1.) Get the exact texel location.
+//  2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+//  3.) Get the distance from the current pixel and sinc weight:
+//          sinc(dist) = sin(pi * dist)/(pi * dist)
+//      We can also use the slower/smoother Lanczos instead:
+//          L(x) = sinc(dist) * sinc(dist / lobes)
+//  4.) Accumulate the weight sum in weights, and accumulate the weighted texels
+//      in pixel_color (we'll normalize outside the loop at the end).
+//  We vectorize the loop to help reduce the Lanczos window's cost.
+
+    //  The r coord is the coord in the dimension we're resizing along (u or v),
+    //  and first_texel_tile_uv_rrrr is a vec4 of the first texel's u or v
+    //  tile_uv coord in [0, 1].  tile_uv_r holds frac()-wrapped tile_uv coords
+    //  for four new samples, and tex_uv_r scales them by tile_size_uv_r.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES                                    \
+        const vec4 true_i = vec4(i_base + i) + vec4(0.0, 1.0, 2.0, 3.0); \
+        const vec4 tile_uv_r = frac(                                         \
+            first_texel_tile_uv_rrrr + true_i * tile_dr);                      \
+        const vec4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const vec4 pi_dist_over_lobes = pi_over_lobes * dist;            \
+            const vec4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), vec4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS                                \
+            const vec4 weights = min(sin(pi_dist)/pi_dist, vec4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS                                       \
+        const vec4 dist = magnification_scale *                              \
+            abs(first_dist_unscaled - true_i);                                 \
+        const vec4 pi_dist = pi * dist;                                      \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS;                                       \
+        pixel_color += new_sample0 * weights.xxx;                              \
+        pixel_color += new_sample1 * weights.yyy;                              \
+        pixel_color += new_sample2 * weights.zzz;                              \
+        pixel_color += new_sample3 * weights.www;                              \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY                                   \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const vec3 new_sample0 = tex2Dlod0try(texture,                       \
+            vec2(tex_uv.x, tex_uv_r.x)).rgb;                                 \
+        const vec3 new_sample1 = tex2Dlod0try(texture,                       \
+            vec2(tex_uv.x, tex_uv_r.y)).rgb;                                 \
+        const vec3 new_sample2 = tex2Dlod0try(texture,                       \
+            vec2(tex_uv.x, tex_uv_r.z)).rgb;                                 \
+        const vec3 new_sample3 = tex2Dlod0try(texture,                       \
+            vec2(tex_uv.x, tex_uv_r.w)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY                                 \
+        CALCULATE_R_COORD_FOR_4_SAMPLES;                                       \
+        const vec3 new_sample0 = tex2Dlod0try(texture,                       \
+            vec2(tex_uv_r.x, tex_uv.y)).rgb;                                 \
+        const vec3 new_sample1 = tex2Dlod0try(texture,                       \
+            vec2(tex_uv_r.y, tex_uv.y)).rgb;                                 \
+        const vec3 new_sample2 = tex2Dlod0try(texture,                       \
+            vec2(tex_uv_r.z, tex_uv.y)).rgb;                                 \
+        const vec3 new_sample3 = tex2Dlod0try(texture,                       \
+            vec2(tex_uv_r.w, tex_uv.y)).rgb;                                 \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+////////////////////////////  RESAMPLING FUNCTIONS  ////////////////////////////
+
+vec3 downsample_vertical_sinc_tiled(const sampler2D texture,
+    const vec2 tex_uv, const vec2 texture_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Requires:   1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the tex_uv-space size of an input tile
+    //                  within the input texture, in the direction we're
+    //                  resampling this pass.
+    //              3.) magnification_scale must be <= 1.0.
+    //  Returns:    Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture.  (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile.  For more
+    //              generic use, eliminate the "static" in the parameters.)
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  A static loop can be faster, but it might blur too much from using
+        //  more samples than it should.
+        const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along the resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  true = vertical resize:
+    const vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, true);
+    const vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;
+    vec4 weight_sum = vec4(0.0);
+    vec3 pixel_color = vec3(0.0);
+    const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const vec3 scalar_weight_sum = vec3(weight_sum_reduce.x + 
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+vec3 downsample_horizontal_sinc_tiled(const sampler2D texture,
+    const vec2 tex_uv, const vec2 texture_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    //  Differences from downsample_vertical_sinc_tiled:
+    //  1.) The dr and tile_size_uv_r parameters are not static consts.
+    //  2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //      set to false instead of true.
+    //  3.) The horizontal version of the loop body is used.
+    //  TODO: If we can get guaranteed compile-time dead code elimination,
+    //  we can combine the vertical/horizontal downsampling functions by:
+    //  1.) Add an extra static const bool parameter called "vertical."
+    //  2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    //  3.) Use a conditional assignment in the loop body macro.  This is the
+    //      tricky part: We DO NOT want to incur the extra conditional
+    //      assignment in the inner loop at runtime!
+    //  The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    //  we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        //  If we have to load all samples, we might as well use them.
+        const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    //  Get the first sample location (scalar tile uv coord along resized
+    //  dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    //  false = horizontal resize:
+    const vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, false);
+    const vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    //  Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    //  Sum up each weight and weighted sample color, varying the looping
+    //  strategy based on our expected dynamic loop capabilities.  See the
+    //  loop body macros above.
+    int i_base = 0;
+    vec4 weight_sum = vec4(0.0);
+    vec3 pixel_color = vec3(0.0);
+    const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        //  Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    //  Normalize so the weight_sum == 1.0, and return:
+    const vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const vec3 scalar_weight_sum = vec3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+////////////////////////////  TILE SIZE CALCULATION  ///////////////////////////
+
+vec2 get_resized_mask_tile_size(const vec2 estimated_viewport_size,
+    const vec2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    //  Requires:   The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    //  Returns:    Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts.  Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    //  Stated tile properties must be correct:
+    const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    const vec2 tile_aspect = vec2(1.0, tile_aspect_ratio_inv);
+    //  If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    //  wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        //  We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    //  Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    //  Enforce min_tile_size and max_tile_size in both dimensions:
+    const vec2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    const vec2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const vec2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const vec2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    //  Try to maintain tile_aspect_ratio.  This is the tricky part:
+    //  If we're currently resizing in the y dimension, the x components
+    //  could be MEANINGLESS.  (If estimated_mask_resize_output_size.x is
+    //  bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    //  We can't adjust the y size based on clamped_tile_size.x.  If it
+    //  clamps when it shouldn't, it won't clamp again when later passes
+    //  call this function with the correct sizes, and the discrepancy will
+    //  break the sampling coords in MASKED_SCANLINES.  Instead, we'll limit
+    //  the x size based on the y size, but not vice versa, unless the
+    //  caller swears the parameters were the same (correct) in every pass.
+    //  As a result, triads could appear vertically stretched if:
+    //  a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //      LUT's might clamp x more than y (all provided LUT's are square)
+    //  b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //      with a vertically oriented screen (not accounted for anyway)
+    //  c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //      Viewport scales are equal by default.
+    //  If any of these are the case, you can fix the stretching by setting:
+    //      mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //          (1.0 / min_expected_aspect_ratio) *
+    //          (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const vec2 reclamped_tile_size = vec2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    //  We need integer tile sizes in both directions for tiled sampling to
+    //  work correctly.  Use floor (to make sure we don't round up), but be
+    //  careful to avoid a rounding bug where floor decreases whole numbers:
+    const vec2 final_resized_tile_size =
+        floor(reclamped_tile_size + vec2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+/////////////////////////  FINAL MASK SAMPLING HELPERS  ////////////////////////
+
+vec4 get_mask_sampling_parameters(const vec2 mask_resize_texture_size,
+    const vec2 mask_resize_video_size, const vec2 true_viewport_size,
+    out vec2 mask_tiles_per_screen)
+{
+    //  Purpose:    Locate the phosphor mask tile inside the texture being
+    //              sampled and count how many tiles span the viewport, for
+    //              whichever mask sample mode is active.
+    //  Requires:   1.) Everything get_resized_mask_tile_size() requires,
+    //                  particularly regarding global constants.
+    //              2.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  when get_mask_sample_mode() is 0 (otherwise anything;
+    //                  it is only read on the MASK_RESIZE path below).
+    //              3.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  when get_mask_sample_mode() is 0 (otherwise anything).
+    //              4.) true_viewport_size == IN.output_size of a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct).
+    //  Returns:    A vec4 holding:
+    //                  xy: tex_uv coords for the start of the mask tile
+    //                  zw: tex_uv size of the mask tile from start to end
+    //              The out parameter mask_tiles_per_screen receives the
+    //              number of mask tiles that will fit on the screen.
+    //  Details:
+    //  The resized tile size depends on a correct viewport size and mask
+    //  resize viewport scale.  We pass false instead of solemnly swearing the
+    //  inputs matched in both mask resize passes: A correct tile aspect ratio
+    //  is easier to ensure when that's guaranteed, but a false promise gives
+    //  inconsistent sizes across passes and broken texture coordinates.
+    const float sample_mode = get_mask_sample_mode();
+    const vec2 resized_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    //  Start/size used whenever the "tile" is the entire sampled texture
+    //  (tex_uv coords therefore begin at 0.0 and span the full 1.0 range):
+    const vec2 whole_texture_start_uv = vec2(0.0);
+    const vec2 whole_texture_uv_size = vec2(1.0);
+    if(sample_mode > 1.5)
+    {
+        //  Repeat the full LUT at a 1:1 pixel:texel ratio without resizing.
+        //  A "tile" is redefined here as the full texture containing many
+        //  triads, so tiles are counted against the large mask texture size:
+        mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        return vec4(whole_texture_start_uv, whole_texture_uv_size);
+    }
+    //  Every other mode counts tiles with the resized tile size, and the
+    //  count must be based on the *true* viewport size:
+    mask_tiles_per_screen = true_viewport_size / resized_tile_size;
+    if(sample_mode < 0.5)
+    {
+        //  Sample MASK_RESIZE: The resized tile is a fraction of the texture
+        //  size, and it starts at a nonzero offset (mask_start_texels) to
+        //  allow for border texels:
+        const vec2 tile_uv_size =
+            resized_tile_size / mask_resize_texture_size;
+        const vec2 tiles_skipped = mask_start_texels/resized_tile_size;
+        const vec2 tile_start_uv = tiles_skipped * tile_uv_size;
+        return vec4(tile_start_uv, tile_uv_size);
+    }
+    //  Otherwise we're hardware-resampling the original LUT, and the texture
+    //  truly contains a single unresized phosphor mask tile anyway:
+    return vec4(whole_texture_start_uv, whole_texture_uv_size);
+}
+
+vec2 fix_tiling_discontinuities_normalized(const vec2 tile_uv,
+    vec2 duv_dx, vec2 duv_dy)
+{
+    //  Requires:   1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //  Returns:    Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    //  Description:
+    //  When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    //  derivatives, which we assume happened if the absolute difference between
+    //  any fragment in a 2x2 block is > ~half a tile.  If so, and the current
+    //  fragment is in the first half of the tile along that axis (i.e. it
+    //  wrapped from 1.0 to 0.0), add a tile to that coord to make the 2x2 block
+    //  continuous.  (It will now have a coord > 1.0 in the padding area beyond
+    //  the tile.)  Derivatives are parameters so the caller can reuse them.
+    //  In case we're using high-quality (nVidia-style) derivatives, ensure
+    //  diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    //  Compare per component (GLSL's < and > take scalars only); true -> 1.0:
+    const vec2 pixel_in_first_half_tile = vec2(lessThan(tile_uv, vec2(0.5)));
+    const vec2 jump_exists = vec2(greaterThan(duv_dx + duv_dy, vec2(0.5)));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+
+vec2 convert_phosphor_tile_uv_wrap_to_tex_uv(const vec2 tile_uv_wrap,
+    const vec4 mask_tile_start_uv_and_size)
+{
+    //  Purpose:    Map repeating tile-relative coords to the tex_uv coords
+    //              used to sample the phosphor mask texture, for whichever
+    //              mask sample mode is active.
+    //  Requires:   1.) tile_uv_wrap holds tile-relative uv coords: One tile
+    //                  spans [0, 1], so (0.5, 0.5) is the tile center.  The
+    //                  coords may range over [0, inf], and their fractional
+    //                  parts map to a repeated tile.  ("Tile" can mean
+    //                  texture, the video embedded in the texture, or some
+    //                  other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy holds the tex_uv coords
+    //                  where the embedded tile starts in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw holds the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    //  Returns:    The tex_uv coords (used for texture sampling) that
+    //              correspond to tile_uv_wrap.
+    if(get_mask_sample_mode() < 0.5)
+    {
+        //  Manually repeat the resized mask tile to fill the screen.  Start
+        //  by reducing the coords to their fractional part.  frac/fmod on
+        //  coords confuses anisotropic filtering, so apply whichever fix the
+        //  user options select (derived-settings-and-constants.h disables
+        //  incompatible options):
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            vec2 repeated_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+        #else
+            vec2 repeated_uv = frac(tile_uv_wrap);
+        #endif
+        //  Optionally patch wrap-around jumps within each 2x2 pixel quad:
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            repeated_uv = fix_tiling_discontinuities_normalized(repeated_uv,
+                ddx(repeated_uv), ddy(repeated_uv));
+        #endif
+        //  The tile is embedded in a padded FBO, and it may start at a
+        //  nonzero offset if border texels are used to avoid artifacts:
+        const vec2 tile_start_uv = mask_tile_start_uv_and_size.xy;
+        const vec2 tile_uv_size = mask_tile_start_uv_and_size.zw;
+        return tile_start_uv + repeated_uv * tile_uv_size;
+    }
+    //  Otherwise, sample from the input phosphor mask texture with hardware
+    //  tiling.  If we're tiling at the original size (mode 2), the "tile" is
+    //  the whole texture, holding a large number of triads mapped with a 1:1
+    //  pixel:texel ratio; if not, the texture contains a single unresized
+    //  tile.  tile_uv_wrap already has correct coords for both cases:
+    return tile_uv_wrap;
+}
+
+
+#endif  //  PHOSPHOR_MASK_RESIZING_H
+
diff --git a/crt/shaders/crt-royale/src/scanline-functions.h b/crt/shaders/crt-royale/src/scanline-functions.h
new file mode 100644
index 0000000..d71a500
--- /dev/null
+++ b/crt/shaders/crt-royale/src/scanline-functions.h
@@ -0,0 +1,572 @@
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "../user-settings.h"
+#include "derived-settings-and-constants.h"
+#include "../../../../include/special-functions.h"
+#include "../../../../include/gamma-management.h"
+
+
+/////////////////////////////  SCANLINE FUNCTIONS  /////////////////////////////
+/*
+inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_sigma and beam_max_sigma are global floats
+    //                  containing the desired minimum and maximum beam standard
+    //                  deviations, for dim and bright colors respectively.
+    //              2.) beam_max_sigma must be > 0.0
+    //              3.) beam_min_sigma must be in (0.0, beam_max_sigma]
+    //              4.) beam_spot_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) sigma_range = beam_max_sigma - beam_min_sigma; we take
+    //                  sigma_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_sigma are runtime shader parameters
+    //  Optional:   Users may set beam_spot_shape_function to 1 to define the
+    //              inner f(color) subfunction (see below) as:
+    //                  f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
+    //              Otherwise (technically, if beam_spot_shape_function < 0.5):
+    //                  f(color) = pow(color, beam_spot_power)
+    //  Returns:    The standard deviation of the Gaussian beam for "color:"
+    //                  sigma = beam_min_sigma + sigma_range * f(color)
+    //  Details/Discussion:
+    //  The beam's spot shape vaguely resembles an aspect-corrected f() in the
+    //  range [0, 1] (not quite, but it's related).  f(color) = color makes
+    //  spots look like diamonds, and a spherical function or cube balances
+    //  between variable width and a soft/realistic shape.   A beam_spot_power
+    //  > 1.0 can produce an ugly spot shape and more initial clipping, but the
+    //  final shape also differs based on the horizontal resampling filter and
+    //  the phosphor bloom.  For instance, resampling horizontally in nonlinear
+    //  light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
+    //  shape, but a sixth root is still quite soft.  A power function (default
+    //  1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
+    //  has the highest variability without an awful spot shape.
+    //
+    //  beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
+    //  difference from beam_max_sigma affects beam width variability.  It only
+    //  affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
+    //  a conservative estimate for a more complex constraint).
+    //
+    //  beam_max_sigma affects clipping and increasing scanline width/softness
+    //  as color increases.  The wider this is, the more scanlines need to be
+    //  evaluated to avoid distortion.  For a pure Gaussian, the max_beam_sigma
+    //  at which the first unused scanline always has a weight < 1.0/255.0 is:
+    //      num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34
+    //      num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52
+    //      num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70
+    //      num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89
+    //      num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08
+    //  Generalized Gaussians permit more leeway here as steepness increases.
+    if(beam_spot_shape_function < 0.5)
+    {
+        //  Use a power function:
+        return float3(beam_min_sigma) + sigma_range *
+            pow(color, beam_spot_power);
+    }
+    else
+    {
+        //  Use a spherical function:
+        const float3 color_minus_1 = color - float3(1.0);
+        return float3(beam_min_sigma) + sigma_range *
+            sqrt(float3(1.0) - color_minus_1*color_minus_1);
+    }
+}
+
+inline float3 get_generalized_gaussian_beta(const float3 color,
+    const float shape_range)
+{
+    //  Requires:   Globals:
+    //              1.) beam_min_shape and beam_max_shape are global floats
+    //                  containing the desired min/max generalized Gaussian
+    //                  beta parameters, for dim and bright colors respectively.
+    //              2.) beam_max_shape must be >= 2.0
+    //              3.) beam_min_shape must be in [2.0, beam_max_shape]
+    //              4.) beam_shape_power must be defined as a global float.
+    //              Parameters:
+    //              1.) color is the underlying source color along a scanline
+    //              2.) shape_range = beam_max_shape - beam_min_shape; we take
+    //                  shape_range as a parameter to avoid repeated computation
+    //                  when beam_{min, max}_shape are runtime shader parameters
+    //  Returns:    The type-I generalized Gaussian "shape" parameter beta for
+    //              the given color.
+    //  Details/Discussion:
+    //  Beta affects the scanline distribution as follows:
+    //  a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
+    //  b.) beta == 2.0 just degenerates to a Gaussian
+    //  c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
+    //      than a Gaussian.  Whereas high sigmas widen and soften peaks, high
+    //      beta widen and sharpen peaks at the risk of aliasing.
+    //  Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
+    //  transitions, whereas lower ones sharpen them (at the risk of aliasing).
+    return beam_min_shape + shape_range * pow(color, beam_shape_power);
+}
+
+float3 scanline_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range)
+{
+    //  Requires:   1.) dist is the distance of the [potentially separate R/G/B]
+    //                  point(s) from a scanline in units of scanlines, where
+    //                  1.0 means the sample point straddles the next scanline.
+    //              2.) color is the underlying source color along a scanline.
+    //              3.) pixel_height is the output pixel height in scanlines.
+    //              4.) Requirements of get_gaussian_sigma() must be met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  Details:
+    //  The CRT beam profile follows a roughly Gaussian distribution which is
+    //  wider for bright colors than dark ones.  The integral over the full
+    //  range of a Gaussian function is always 1.0, so we can vary the beam
+    //  with a standard deviation without affecting brightness.  'x' = distance:
+    //      gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    //      gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
+    //  Use a numerical approximation of the "error function" (the Gaussian
+    //  indefinite integral) to find the definite integral of the scanline's
+    //  average brightness over a given pixel area.  Even if curved coords were
+    //  used in this pass, a flat scalar pixel height works almost as well as a
+    //  pixel height computed from a full pixel-space to scanline-space matrix.
+    const float3 sigma = get_gaussian_sigma(color, sigma_range);
+    const float3 ph_offset = float3(pixel_height * 0.5);
+    const float3 denom_inv = 1.0/(sigma*sqrt(2.0));
+    const float3 integral_high = erf((dist + ph_offset)*denom_inv);
+    const float3 integral_low = erf((dist - ph_offset)*denom_inv);
+    return color * 0.5*(integral_high - integral_low)/pixel_height;
+}
+
+float3 scanline_generalized_gaussian_integral_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range,
+    const float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel.
+    //  A generalized Gaussian distribution allows the shape (beta) to vary
+    //  as well as the width (alpha).  "gamma" refers to the gamma function:
+    //      generalized sample =
+    //          beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    //  ligamma(s, z) is the lower incomplete gamma function, for which we only
+    //  implement two of four branches (because we keep 1/beta <= 0.5):
+    //      generalized integral = 0.5 + 0.5* sign(x) *
+    //          ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
+    //  See get_generalized_gaussian_beta() for a discussion of beta.
+    //  We base alpha on the intended Gaussian sigma, but it only strictly
+    //  models standard deviation at beta == 2, because the standard
+    //  deviation depends on both alpha and beta (keeping alpha independent is
+    //  faster and preserves intuitive behavior and a full spectrum of results).
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 s = float3(1.0)/beta;
+    const float3 ph_offset = float3(pixel_height * 0.5);
+    //  Pass beta to gamma_impl to avoid repeated divides.  Similarly pass
+    //  beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl.
+    const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta);
+    const float3 dist1 = dist + ph_offset;
+    const float3 dist0 = dist - ph_offset;
+    const float3 integral_high = sign(dist1) * normalized_ligamma_impl(
+        s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv);
+    const float3 integral_low = sign(dist0) * normalized_ligamma_impl(
+        s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv);
+    return color * 0.5*(integral_high - integral_low)/pixel_height;
+}
+
+float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
+    const float pixel_height, const float sigma_range)
+{
+    //  See scanline_gaussian_integral_contrib() for detailed comments!
+    //  gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
+    const float3 sigma = get_gaussian_sigma(color, sigma_range);
+    //  Avoid repeated divides:
+    const float3 sigma_inv = float3(1.0)/sigma;
+    const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv;
+    const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel away in each direction as well:
+        const float3 sample_offset = float3(pixel_height/3.0);
+        const float3 dist2 = dist + sample_offset;
+        const float3 dist3 = abs(dist - sample_offset);
+        //  Average three pure Gaussian samples:
+        const float3 scale = color/3.0 * outer_denom_inv;
+        const float3 weight1 = exp(-(dist*dist)*inner_denom_inv);
+        const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv);
+        const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv);
+        return scale * (weight1 + weight2 + weight3);
+    }
+    else
+    {
+        return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv;
+    }
+}
+
+float3 scanline_generalized_gaussian_sampled_contrib(const float3 dist,
+    const float3 color, const float pixel_height, const float sigma_range,
+    const float shape_range)
+{
+    //  See scanline_generalized_gaussian_integral_contrib() for details!
+    //  generalized sample =
+    //      beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
+    const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
+    const float3 beta = get_generalized_gaussian_beta(color, shape_range);
+    //  Avoid repeated divides:
+    const float3 alpha_inv = float3(1.0)/alpha;
+    const float3 beta_inv = float3(1.0)/beta;
+    const float3 scale = color * beta * 0.5 * alpha_inv /
+        gamma_impl(beta_inv, beta);
+    if(beam_antialias_level > 0.5)
+    {
+        //  Sample 1/3 pixel closer to and farther from the scanline too.
+        const float3 sample_offset = float3(pixel_height/3.0);
+        const float3 dist2 = dist + sample_offset;
+        const float3 dist3 = abs(dist - sample_offset);
+        //  Average three generalized Gaussian samples:
+        const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta));
+        const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta));
+        const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta));
+        return scale/3.0 * (weight1 + weight2 + weight3);
+    }
+    else
+    {
+        return scale * exp(-pow(abs(dist*alpha_inv), beta));
+    }
+}
+
+inline float3 scanline_contrib(float3 dist, float3 color,
+    float pixel_height, const float sigma_range, const float shape_range)
+{
+    //  Requires:   1.) Requirements of scanline_gaussian_integral_contrib()
+    //                  must be met.
+    //              2.) Requirements of get_gaussian_sigma() must be met.
+    //              3.) Requirements of get_generalized_gaussian_beta() must be
+    //                  met.
+    //  Returns:    Return a scanline's light output over a given pixel, using
+    //              a generalized or pure Gaussian distribution and sampling or
+    //              integrals as desired by user codepath choices.
+    if(beam_generalized_gaussian)
+    {
+        if(beam_antialias_level > 1.5)
+        {
+            return scanline_generalized_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+        else
+        {
+            return scanline_generalized_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range, shape_range);
+        }
+    }
+    else
+    {
+        if(beam_antialias_level > 1.5)
+        {
+            return scanline_gaussian_integral_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+        else
+        {
+            return scanline_gaussian_sampled_contrib(
+                dist, color, pixel_height, sigma_range);
+        }
+    }
+}
+
+inline float3 get_raw_interpolated_color(const float3 color0,
+    const float3 color1, const float3 color2, const float3 color3,
+    const float4 weights)
+{
+    //  Use max to avoid bizarre artifacts from negative colors:
+    return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0);
+}
+
+float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
+    const float3 color2, const float3 color3, const float4 weights)
+{
+    //  Requires:   1.) Requirements of include/gamma-management.h must be met:
+    //                  intermediate_gamma must be globally defined, and input
+    //                  colors are interpreted as linear RGB unless you #define
+    //                  GAMMA_ENCODE_EVERY_FBO (in which case they are
+    //                  interpreted as gamma-encoded with intermediate_gamma).
+    //              2.) color0-3 are colors sampled from a texture with tex2D().
+    //                  They are interpreted as defined in requirement 1.
+    //              3.) weights contains weights for each color, summing to 1.0.
+    //              4.) beam_horiz_linear_rgb_weight must be defined as a global
+    //                  float in [0.0, 1.0] describing how much blending should
+    //                  be done in linear RGB (rest is gamma-corrected RGB).
+    //              5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
+    //                  if beam_horiz_linear_rgb_weight is anything other than a
+    //                  static constant, or we may try branching at runtime
+    //                  without dynamic branches allowed (slow).
+    //  Returns:    Return an interpolated color lookup between the four input
+    //              colors based on the weights in weights.  The final color will
+    //              be a linear RGB value, but the blending will be done as
+    //              indicated above.
+    const float intermediate_gamma = get_intermediate_gamma();
+    //  Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
+    //  profile allows dynamic branches (faster than computing extra pows):
+    #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+        #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+    #else
+        #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+            #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        #endif
+    #endif
+    #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+        //  beam_horiz_linear_rgb_weight is static, so we can branch:
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), intermediate_gamma);
+            if(beam_horiz_linear_rgb_weight > 0.0)
+            {
+                const float3 linear_mixed_color = get_raw_interpolated_color(
+                    pow(color0, intermediate_gamma),
+                    pow(color1, intermediate_gamma),
+                    pow(color2, intermediate_gamma),
+                    pow(color3, intermediate_gamma),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return gamma_mixed_color;
+            }
+        #else
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            if(beam_horiz_linear_rgb_weight < 1.0)
+            {
+                const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, 1.0/intermediate_gamma),
+                    pow(color1, 1.0/intermediate_gamma),
+                    pow(color2, 1.0/intermediate_gamma),
+                    pow(color3, 1.0/intermediate_gamma),
+                    weights);
+                return lerp(gamma_mixed_color, linear_mixed_color,
+                    beam_horiz_linear_rgb_weight);
+            }
+            else
+            {
+                return linear_mixed_color;
+            }
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #else
+        #ifdef GAMMA_ENCODE_EVERY_FBO
+            //  Inputs: color0-3 are colors in gamma-encoded RGB.
+            const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
+                color0, color1, color2, color3, weights), intermediate_gamma);
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                pow(color0, intermediate_gamma),
+                pow(color1, intermediate_gamma),
+                pow(color2, intermediate_gamma),
+                pow(color3, intermediate_gamma),
+                weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #else
+            //  Inputs: color0-3 are colors in linear RGB.
+            const float3 linear_mixed_color = get_raw_interpolated_color(
+                color0, color1, color2, color3, weights);
+            const float3 gamma_mixed_color = get_raw_interpolated_color(
+                    pow(color0, 1.0/intermediate_gamma),
+                    pow(color1, 1.0/intermediate_gamma),
+                    pow(color2, 1.0/intermediate_gamma),
+                    pow(color3, 1.0/intermediate_gamma),
+                    weights);
+            return lerp(gamma_mixed_color, linear_mixed_color,
+                beam_horiz_linear_rgb_weight);
+        #endif  //  GAMMA_ENCODE_EVERY_FBO
+    #endif  //  SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
+}
+
+float3 get_scanline_color(const sampler2D texture, const float2 scanline_uv,
+    const float2 uv_step_x, const float4 weights)
+{
+    //  Requires:   1.) scanline_uv must be vertically snapped to the caller's
+    //                  desired line or scanline and horizontally snapped to the
+    //                  texel just left of the output pixel (color1)
+    //              2.) uv_step_x must contain the horizontal uv distance
+    //                  between texels.
+    //              3.) weights must contain interpolation filter weights for
+    //                  color0, color1, color2, and color3, where color1 is just
+    //                  left of the output pixel.
+    //  Returns:    Return a horizontally interpolated texture lookup using 2-4
+    //              nearby texels, according to weights and the conventions of
+    //              get_interpolated_linear_color().
+    //  We can ignore the outside texture lookups for Quilez resampling.
+    const float3 color1 = tex2D(texture, scanline_uv).rgb;
+    const float3 color2 = tex2D(texture, scanline_uv + uv_step_x).rgb;
+    float3 color0 = float3(0.0);
+    float3 color3 = float3(0.0);
+    if(beam_horiz_filter > 0.5)
+    {
+        color0 = tex2D(texture, scanline_uv - uv_step_x).rgb;
+        color3 = tex2D(texture, scanline_uv + 2.0 * uv_step_x).rgb;
+    }
+    //  Sample the texture as-is, whether it's linear or gamma-encoded:
+    //  get_interpolated_linear_color() will handle the difference.
+    return get_interpolated_linear_color(color0, color1, color2, color3, weights);
+}
+
+float3 sample_single_scanline_horizontal(const sampler2D texture,
+    const float2 tex_uv, const float2 texture_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements.
+    //  Snap to the previous texel and get sample dists from 2/4 nearby texels:
+    const float2 curr_texel = tex_uv * texture_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel =
+        floor(curr_texel - float2(under_half)) + float2(0.5);
+    const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y);
+    const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv;
+    const float prev_dist = curr_texel.x - prev_texel_hor.x;
+    const float4 sample_dists = float4(1.0 + prev_dist, prev_dist,
+        1.0 - prev_dist, 2.0 - prev_dist);
+    //  Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
+    float4 weights;
+    if(beam_horiz_filter < 0.5)
+    {
+        //  Quilez:
+        const float x = sample_dists.y;
+        const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0);
+        weights = float4(0.0, 1.0 - w2, w2, 0.0);
+    }
+    else if(beam_horiz_filter < 1.5)
+    {
+        //  Gaussian:
+        float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
+        weights = exp(-(sample_dists*sample_dists)*inner_denom_inv);
+    }
+    else
+    {
+        //  Lanczos2:
+        const float4 pi_dists = FIX_ZERO(sample_dists * pi);
+        weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) /
+            (pi_dists * pi_dists);
+    }
+    //  Ensure the weight sum == 1.0:
+    const float4 final_weights = weights/dot(weights, float4(1.0));
+    //  Get the interpolated horizontal scanline color:
+    const float2 uv_step_x = float2(texture_size_inv.x, 0.0);
+    return get_scanline_color(
+        texture, prev_texel_hor_uv, uv_step_x, final_weights);
+}
+
+float3 sample_rgb_scanline_horizontal(const sampler2D texture,
+    const float2 tex_uv, const float2 texture_size,
+    const float2 texture_size_inv)
+{
+    //  TODO: Add function requirements.
+    //  Rely on a helper to make convergence easier.
+    if(beam_misconvergence)
+    {
+        const float3 convergence_offsets_rgb =
+            get_convergence_offsets_x_vector();
+        const float3 offset_u_rgb =
+            convergence_offsets_rgb * texture_size_inv.xxx;
+        const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0);
+        const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0);
+        const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0);
+        const float3 sample_r = sample_single_scanline_horizontal(
+            texture, scanline_uv_r, texture_size, texture_size_inv);
+        const float3 sample_g = sample_single_scanline_horizontal(
+            texture, scanline_uv_g, texture_size, texture_size_inv);
+        const float3 sample_b = sample_single_scanline_horizontal(
+            texture, scanline_uv_b, texture_size, texture_size_inv);
+        return float3(sample_r.r, sample_g.g, sample_b.b);
+    }
+    else
+    {
+        return sample_single_scanline_horizontal(texture, tex_uv, texture_size,
+            texture_size_inv);
+    }
+}
+
+float2 get_last_scanline_uv(const float2 tex_uv, const float2 texture_size,
+    const float2 texture_size_inv, const float2 il_step_multiple,
+    const float frame_count, out float dist)
+{
+    //  Compute texture coords for the last/upper scanline, accounting for
+    //  interlacing: With interlacing, only consider even/odd scanlines every
+    //  other frame.  Top-field first (TFF) order puts even scanlines on even
+    //  frames, and BFF order puts them on odd frames.  Texels are centered at:
+    //      frac(tex_uv * texture_size) == x.5
+    //  Caution: If these coordinates ever seem incorrect, first make sure it's
+    //  not because anisotropic filtering is blurring across field boundaries.
+    //  Note: TFF/BFF won't matter for sources that double-weave or similar.
+    const float field_offset = floor(il_step_multiple.y * 0.75) *
+        fmod(frame_count + float(interlace_bff), 2.0);
+    const float2 curr_texel = tex_uv * texture_size;
+    //  Use under_half to fix a rounding bug right around exact texel locations.
+    const float2 prev_texel_num = floor(curr_texel - float2(under_half));
+    const float wrong_field = fmod(
+        prev_texel_num.y + field_offset, il_step_multiple.y);
+    const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field);
+    //  Snap to the center of the previous scanline in the current field:
+    const float2 scanline_texel = scanline_texel_num + float2(0.5);
+    const float2 scanline_uv = scanline_texel * texture_size_inv;
+    //  Save the sample's distance from the scanline, in units of scanlines:
+    dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
+    return scanline_uv;
+}
+*/
+bool is_interlaced(float num_lines)
+{
+    //  Detect interlacing based on the number of lines in the source.
+    //  Returns false whenever interlace_detect is disabled.
+    if(interlace_detect)
+    {
+        //  NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+        //  NTSC Emulators: Typically 224 or 240 lines
+        //  PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+        //  PAL Emulators: ?
+        //  ATSC: 720p, 1080i, 1080p
+        //  Where do we place our cutoffs?  Assumptions:
+        //  1.) We only need to care about active lines.
+        //  2.) Anything > 288 and <= 576 lines is probably interlaced.
+        //  3.) Anything > 576 lines is probably not interlaced...
+        //  4.) ...except 1080 lines, which is a crapshoot (user decision).
+        //  5.) Just in case the main program uses calculated video sizes,
+        //      we should nudge the float thresholds a bit.
+        const bool sd_interlace =
+            (num_lines > 288.5) && (num_lines < 576.5);
+        //  No user switch for 1080-line sources is visible from this
+        //  function, so they are treated as progressive.
+        //  NOTE(review): if a 1080i user setting exists elsewhere, AND it
+        //  with the (1079.5, 1080.5) line-count test here.
+        const bool hd_interlace = false;
+        //  Every path must return a value; sd_interlace alone decides today.
+        return sd_interlace || hd_interlace;
+    }
+    else
+    {
+        return false;
+    }
+}
+
+
+#endif  //  SCANLINE_FUNCTIONS_H
+
diff --git a/crt/shaders/crt-royale/src/tex2Dantialias.h b/crt/shaders/crt-royale/src/tex2Dantialias.h
new file mode 100644
index 0000000..0a5f834
--- /dev/null
+++ b/crt/shaders/crt-royale/src/tex2Dantialias.h
@@ -0,0 +1,1393 @@
+#ifndef TEX2DANTIALIAS_H
+#define TEX2DANTIALIAS_H
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+/////////////////////////////////  DESCRIPTION  ////////////////////////////////
+
+//  This file provides antialiased and subpixel-aware tex2D lookups.
+//  Requires:   All functions share these requirements:
+//              1.) All requirements of gamma-management.h must be satisfied!
+//              2.) pixel_to_tex_uv must be a 2x2 matrix that transforms
+//                  pixel-space offsets to texture uv offsets.  Get this with:
+//                      const vec2 duv_dx = ddx(tex_uv);
+//                      const vec2 duv_dy = ddy(tex_uv);
+//                      const vec2x2 pixel_to_tex_uv = vec2x2(
+//                          duv_dx.x, duv_dy.x,
+//                          duv_dx.y, duv_dy.y);
+//                  This is left to the user in case the current Cg profile
+//                  doesn't support ddx()/ddy().  Ideally, the user could
+//                  calculate a distorted tangent-space mapping analytically.
+//                  If not, a simple flat mapping can be obtained with:
+//                      const vec2 xy_to_uv_scale = IN.output_size *
+//                          IN.video_size/IN.texture_size;
+//                      const vec2x2 pixel_to_tex_uv = vec2x2(
+//                          xy_to_uv_scale.x, 0.0,
+//                          0.0, xy_to_uv_scale.y);
+//  Optional:   To set basic AA settings, #define ANTIALIAS_OVERRIDE_BASICS and:
+//              1.) Set an antialiasing level:
+//                      static const float aa_level = {0 (none),
+//                          1 (sample subpixels), 4, 5, 6, 7, 8, 12, 16, 20, 24}
+//              2.) Set a filter type:
+//                      static const float aa_filter = {
+//                          0 (Box, Separable), 1 (Box, Cylindrical),
+//                          2 (Tent, Separable), 3 (Tent, Cylindrical)
+//                          4 (Gaussian, Separable), 5 (Gaussian, Cylindrical)
+//                          6 (Cubic, Separable), 7 (Cubic, Cylindrical)
+//                          8 (Lanczos Sinc, Separable),
+//                          9 (Lanczos Jinc, Cylindrical)}
+//                  If the input is unknown, a separable box filter is used.
+//                  Note: Lanczos Jinc is terrible for sparse sampling, and
+//                  using aa_axis_importance (see below) defeats the purpose.
+//              3.) Mirror the sample pattern on odd frames?
+//                      static const bool aa_temporal = {true, false}
+//                  This helps rotational invariance but can look "fluttery."
+//              The user may #define ANTIALIAS_OVERRIDE_PARAMETERS to override
+//              (all of) the following default parameters with static or uniform
+//              constants (or an accessor function for subpixel offsets):
+//              1.) Cubic parameters:
+//                      static const float aa_cubic_c = 0.5;
+//                  See http://www.imagemagick.org/Usage/filter/#mitchell
+//              2.) Gaussian parameters:
+//                      static const float aa_gauss_sigma =
+//                          0.5/aa_pixel_diameter;
+//              3.) Set subpixel offsets.  This requires an accessor function
+//                  for compatibility with scalar runtime shader params.  Return
+//                  a vec2 pixel offset in [-0.5, 0.5] for the red subpixel:
+//                      vec2 get_aa_subpixel_r_offset()
+//              The user may also #define ANTIALIAS_OVERRIDE_STATIC_CONSTANTS to
+//              override (all of) the following default static values.  However,
+//              the file's structure requires them to be declared static const:
+//              1.) static const float aa_lanczos_lobes = 3.0;
+//              2.) static const float aa_gauss_support = 1.0/aa_pixel_diameter;
+//                  Note the default tent/Gaussian support radii may appear
+//                  arbitrary, but extensive testing found them nearly optimal
+//                  for tough cases like strong distortion at low AA levels.
+//                  (The Gaussian default is only best for practical gauss_sigma
+//                  values; much larger gauss_sigmas ironically prefer slightly
+//                  smaller support given sparse sampling, and vice versa.)
+//              3.) static const float aa_tent_support = 1.0 / aa_pixel_diameter;
+//              4.) static const vec2 aa_xy_axis_importance:
+//                  The sparse N-queens sampling grid interacts poorly with
+//                  negative-lobed 2D filters.  However, if aliasing is much
+//                  stronger in one direction (e.g. horizontally with a phosphor
+//                  mask), it can be useful to downplay sample offsets along the
+//                  other axis.  The support radius in each direction scales with
+//                  aa_xy_axis_importance down to a minimum of 0.5 (box support),
+//                  after which point only the offsets used for calculating
+//                  weights continue to scale downward.  This works as follows:
+//                  If aa_xy_axis_importance = vec2(1.0, 1.0/support_radius),
+//                  the vertical support radius will drop to 1.0, and we'll just
+//                  filter vertical offsets with the first filter lobe, while
+//                  horizontal offsets go through the full multi-lobe filter.
+//                  If aa_xy_axis_importance = vec2(1.0, 0.0), the vertical
+//                  support radius will drop to box support, and the vertical
+//                  offsets will be ignored entirely (essentially giving us a
+//                  box filter vertically).  The former is potentially smoother
+//                  (but less predictable) and the default behavior of Lanczos
+//                  jinc, whereas the latter is sharper and the default behavior
+//                  of cubics and Lanczos sinc.
+//              5.) static const float aa_pixel_diameter: You can expand the
+//                  pixel diameter to e.g. sqrt(2.0), which may be a better
+//                  support range for cylindrical filters (they don't
+//                  currently discard out-of-circle samples though).
+//              Finally, there are two miscellaneous options:
+//              1.) If you want to antialias a manually tiled texture, you can
+//                  #define ANTIALIAS_DISABLE_ANISOTROPIC to use tex2Dlod() to
+//                  fix incompatibilities with anisotropic filtering.  This is
+//                  slower, and the Cg profile must support tex2Dlod().
+//              2.) If aa_cubic_c is a runtime uniform, you can #define
+//                  RUNTIME_ANTIALIAS_WEIGHTS to evaluate cubic weights once per
+//                  fragment instead of at the usage site (which is used by
+//                  default, because it enables static evaluation).
+//  Description:
+//  Each antialiased lookup follows these steps:
+//  1.) Define a sample pattern of pixel offsets in the range of [-0.5, 0.5]
+//      pixels, spanning the diameter of a rectangular box filter.
+//  2.) Scale these offsets by the support diameter of the user's chosen filter.
+//  3.) Using these pixel offsets from the pixel center, compute the offsets to
+//      predefined subpixel locations.
+//  4.) Compute filter weights based on subpixel offsets.
+//  Much of that can often be done at compile-time.  At runtime:
+//  1.) Project pixel-space offsets into uv-space with a matrix multiplication
+//      to get the uv offsets for each sample.  Rectangular pixels have a
+//      diameter of 1.0.  Circular pixels are not currently supported, but they
+//      might be better with a diameter of sqrt(2.0) to ensure there are no gaps
+//      between them.
+//  2.) Load, weight, and sum samples.
+//  We use a sparse bilinear sampling grid, so there are two major implications:
+//  1.) We can directly project the pixel-space support box into uv-space even
+//      if we're upsizing.  This wouldn't be the case for nearest neighbor,
+//      where we'd have to expand the uv-space diameter to at least the support
+//      size to ensure sufficient filter support.  In our case, this allows us
+//      to treat upsizing the same as downsizing and use static weighting. :)
+//  2.) For decent results, negative-lobed filters must be computed based on
+//      separable weights, not radial distances, because the sparse sampling
+//      makes no guarantees about radial distributions.  Even then, it's much
+//      better to set aa_xy_axis_importance to e.g. vec2(1.0, 0.0) to use e.g.
+//      Lanczos2 horizontally and a box filter vertically.  This is mainly due
+//      to the sparse N-queens sampling and a statistically enormous positive or
+//      negative covariance between horizontal and vertical weights.
+//
+//  Design Decision Comments:
+//  "aa_temporal" mirrors the sample pattern on odd frames along the axis that
+//  keeps subpixel weights constant.  This helps with rotational invariance, but
+//  it can cause distracting fluctuations, and horizontal and vertical edges
+//  will look the same.  Using a different pattern on a shifted grid would
+//  exploit temporal AA better, but it would require a dynamic branch or a lot
+//  of conditional moves, so it's prohibitively slow for the minor benefit.
+
+
+/////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
+
+#ifndef ANTIALIAS_OVERRIDE_BASICS
+    //  Defaults for the basic AA settings; these must be static constants:
+    const float aa_level = 12.0;        //  Antialiasing level
+    const float aa_filter = 0.0;        //  0 = Box, Separable
+    const bool aa_temporal = false;     //  Don't mirror the pattern on odd frames
+#endif
+
+#ifndef ANTIALIAS_OVERRIDE_STATIC_CONSTANTS
+    //  Users may override these parameters, but the file structure requires
+    //  them to be static constants; see the descriptions above.
+    const float aa_pixel_diameter = 1.0;    //  Rectangular pixel diameter
+    const float aa_lanczos_lobes = 3.0;     //  Also the Lanczos support radius
+    const float aa_gauss_support = 1.0 / aa_pixel_diameter;
+    const float aa_tent_support = 1.0 / aa_pixel_diameter;
+    
+    //  If we're using a negative-lobed filter, default to using it horizontally
+    //  only, and use only the first lobe vertically or a box filter, over a
+    //  correspondingly smaller range.  This compensates for the sparse sampling
+    //  grid's typically large positive/negative x/y covariance.
+    const vec2 aa_xy_axis_importance =
+        aa_filter < 5.5 ? vec2(1.0) :         //  Box, tent, Gaussian
+        aa_filter < 8.5 ? vec2(1.0, 0.0) :    //  Cubic and Lanczos sinc
+        aa_filter < 9.5 ? vec2(1.0, 1.0/aa_lanczos_lobes) :   //  Lanczos jinc
+        vec2(1.0);                            //  Default to box
+#endif
+
+#ifndef ANTIALIAS_OVERRIDE_PARAMETERS
+    //  Users may override these values with their own uniform or static consts.
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    const float aa_cubic_c = 0.5;                           //  Catmull-Rom
+    const float aa_gauss_sigma = 0.5 / aa_pixel_diameter;
+    //  Users may override the subpixel offset accessor function with their own.
+    //  A function is used for compatibility with scalar runtime shader params.
+    vec2 get_aa_subpixel_r_offset()
+    {
+        return vec2(0.0, 0.0);  //  Default: zero offset, R/G/B share one position
+    }
+#endif
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "../../../../include/gamma-management.h"
+
+
+//////////////////////////////////  CONSTANTS  /////////////////////////////////
+
+const float aa_box_support = 0.5;       //  Box radius; also the minimum support
+const float aa_cubic_support = 2.0;     //  Cubic weights are zero at dist >= 2
+
+
+////////////////////////////  GLOBAL NON-CONSTANTS  ////////////////////////////
+
+//  Plain (writable) globals, set by assign_aa_cubic_constants(); not uniforms.
+#ifdef RUNTIME_ANTIALIAS_WEIGHTS
+    float aa_cubic_b;
+    float cubic_branch1_x3_coeff;
+    float cubic_branch1_x2_coeff;
+    float cubic_branch1_x0_coeff;
+    float cubic_branch2_x3_coeff;
+    float cubic_branch2_x2_coeff;
+    float cubic_branch2_x1_coeff;
+    float cubic_branch2_x0_coeff;
+#endif
+
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+void assign_aa_cubic_constants()
+{
+    //  Runtime-weights builds only: derive the cubic coefficients from
+    //  aa_cubic_c (with B = 1 - 2C, the "Keys cubic" family) into the globals.
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        const bool cubic_selected = (aa_filter > 5.5) && (aa_filter < 7.5);
+        if(cubic_selected)
+        {
+            aa_cubic_b = 1.0 - 2.0*aa_cubic_c;
+            cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c;
+            cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c;
+            cubic_branch1_x0_coeff = 6.0 - 2.0*aa_cubic_b;
+            cubic_branch2_x3_coeff = -aa_cubic_b - 6.0*aa_cubic_c;
+            cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c;
+            cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c;
+            cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c;
+        }
+    #endif
+}
+
+vec4 get_subpixel_support_diam_and_final_axis_importance()
+{
+    //  Returns:    xy = per-axis sampling-window diameter, in pixels.
+    //              zw = the share of aa_xy_axis_importance that callers must
+    //                   still apply to sample offsets when computing weights.
+    //                   (Always 1.0 for box filters.)
+    //  Start from the support radius of the statically selected filter:
+    const float filter_radius =
+        aa_filter < 1.5 ? aa_box_support :
+        aa_filter < 3.5 ? aa_tent_support :
+        aa_filter < 5.5 ? aa_gauss_support :
+        aa_filter < 7.5 ? aa_cubic_support :
+        aa_filter < 9.5 ? aa_lanczos_lobes :
+        aa_box_support; //  Unknown filter ids fall back to box
+    //  Expand the window per axis by the red subpixel offset, for subpixel
+    //  filtering:
+    const vec2 raw_radius =
+        vec2(filter_radius) + abs(get_aa_subpixel_r_offset());
+    //  Box filters (ids below 1.5) ignore aa_xy_axis_importance entirely:
+    if(aa_filter < 1.5)
+    {
+        return vec4(2.0 * raw_radius, vec2(1.0));
+    }
+    //  Everything else: scale the window by aa_xy_axis_importance, but never
+    //  narrower than box support.  This allows decent vertical AA without
+    //  messing up horizontal weights or pairing e.g. Lanczos4 horizontally
+    //  with a huge vertical average over an 8-pixel radius.
+    const vec2 scaled_radius = max(vec2(aa_box_support),
+        raw_radius * aa_xy_axis_importance);
+    //  Adjust aa_xy_axis_importance by raw_radius/scaled_radius so the
+    //  weight functions compensate for the scaling the window already
+    //  received:
+    const vec2 leftover_importance = aa_xy_axis_importance *
+        raw_radius/scaled_radius;
+    return vec4(2.0 * scaled_radius, leftover_importance);
+}
+
+
+///////////////////////////  FILTER WEIGHT FUNCTIONS  //////////////////////////
+
+float eval_box_filter(const float dist)
+{   //  1.0 when |dist| lies within the box support radius (inclusive), else 0.0.
+    return (abs(dist) <= aa_box_support) ? 1.0 : 0.0;
+}
+
+float eval_separable_box_filter(const vec2 offset)
+{   //  1.0 only when both axes lie within the box support radius, else 0.0.
+    return float(abs(offset.x) <= aa_box_support && abs(offset.y) <= aa_box_support);
+}
+
+float eval_tent_filter(const float dist)
+{
+    //  Symmetric tent: use |dist| so signed separable offsets don't clamp to 1.
+    return clamp((aa_tent_support - abs(dist))/aa_tent_support, 0.0, 1.0);
+}
+
+float eval_gaussian_filter(const float dist)
+{   //  Unnormalized Gaussian exp(-d^2 / (2*sigma^2)), sigma = aa_gauss_sigma.
+    return exp(-(dist*dist) / (2.0 * (aa_gauss_sigma*aa_gauss_sigma)));
+}
+
+float eval_cubic_filter(const float dist)
+{
+    //  Piecewise cubic weight at |dist|; zero once |dist| >= 2.
+    #ifndef RUNTIME_ANTIALIAS_WEIGHTS
+        const float aa_cubic_b = 1.0 - 2.0*aa_cubic_c;
+        const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c;
+        const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c;
+        const float cubic_branch1_x0_coeff = 6.0 - 2.0*aa_cubic_b;
+        const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0*aa_cubic_c;
+        const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c;
+        const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c;
+        const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c;
+    #endif  //  Otherwise the file-level globals of the same names are used.
+    const float d = abs(dist);
+    //  Horner form from http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/Mitchell.pdf
+    float six_w = 0.0;
+    if(d < 1.0)
+    {
+        six_w = (cubic_branch1_x3_coeff*d + cubic_branch1_x2_coeff)*d*d +
+            cubic_branch1_x0_coeff;
+    }
+    else if(d < 2.0)
+    {
+        six_w = ((cubic_branch2_x3_coeff*d + cubic_branch2_x2_coeff)*d +
+            cubic_branch2_x1_coeff)*d + cubic_branch2_x0_coeff;
+    }
+    return six_w/6.0;
+}
+
+float eval_separable_cubic_filter(const vec2 offset)
+{
+    //  Separable 2D weight = (1D weight along x) * (1D weight along y).
+    const vec2 w = vec2(eval_cubic_filter(offset.x), eval_cubic_filter(offset.y));
+    return w.x * w.y;
+}
+
+vec2 eval_sinc_filter(const vec2 offset)
+{
+    //  Per-component sin(pi*x)/(pi*x).  There is no guard for x == 0 here:
+    //  the caller is expected to handle the zero case before calling.
+    const vec2 scaled = pi * offset;
+    return sin(scaled)/scaled;
+}
+
+float eval_separable_lanczos_sinc_filter(const vec2 offset_unsafe)
+{
+    //  Lanczos per axis: sinc(x) * sinc(x/lobes), then the x and y results are
+    //  multiplied.  With sparse sampling, pick one axis to apply Lanczos along
+    //  (e.g. set aa_xy_axis_importance = vec2(1.0, 0.0)).
+    const vec2 safe_offset = FIX_ZERO(offset_unsafe);
+    const vec2 sinc_main = eval_sinc_filter(safe_offset);
+    const vec2 sinc_window = eval_sinc_filter(safe_offset/aa_lanczos_lobes);
+    return (sinc_main.x * sinc_window.x) * (sinc_main.y * sinc_window.y);
+}
+
+float eval_jinc_filter_unorm(const float x)
+{
+    //  Approximates an unnormalized jinc for x in [0, 45); callers here use x
+    //  in roughly [0, 4*pi).  Faster/closer approximations exist (piecewise
+    //  cubics on [0, 45) plus asymptotic forms beyond that), but this one has
+    //  a maximum absolute error < 1/512 and is simpler/faster for shaders...
+    //  not that it's all that useful for sparse sampling anyway.
+    const float point3845_x = 0.38448566093564*x;
+    const float exp_term = exp(-(point3845_x*point3845_x));
+    const float point8154_plus_x = 0.815362332840791 + x;
+    const float cos_term = cos(point8154_plus_x);
+    return (
+        0.0264727330997042*min(x, 6.83134964622778) +
+        0.680823557250528*exp_term +
+        -0.0597255978950933*min(7.41043194481873, x)*cos_term /
+            (point8154_plus_x + 0.0646074538634482*(x*x) +
+            cos(x)*max(exp_term, cos(x) + cos_term)) -
+        0.180837503591406);
+}
+
+float eval_jinc_filter(const float dist)
+{   //  Scales dist by pi before calling the unnormalized jinc approximation.
+    return eval_jinc_filter_unorm(pi * dist);
+}
+
+float eval_lanczos_jinc_filter(const float dist)
+{   //  Jinc at dist times a jinc at dist/aa_lanczos_lobes (the window term).
+    return eval_jinc_filter(dist) * eval_jinc_filter(dist/aa_lanczos_lobes);
+}
+
+
+vec3 eval_unorm_rgb_weights(const vec2 offset,
+    const vec2 final_axis_importance)
+{
+    //  Requires:   1.) final_axis_importance must be computed according to
+    //                  get_subpixel_support_diam_and_final_axis_importance().
+    //              2.) aa_filter must be a global constant.
+    //              3.) offset must be an xy pixel offset in the range:
+    //                      ([-subpixel_support_diameter.x/2,
+    //                      subpixel_support_diameter.x/2],
+    //                      [-subpixel_support_diameter.y/2,
+    //                      subpixel_support_diameter.y/2])
+    //  Returns:    Unnormalized sample weights at the R/G/B destination
+    //              subpixels for the given xy pixel offset.
+    //  Express the sample offset relative to each subpixel: green uses the
+    //  scaled offset as-is, red subtracts the scaled red subpixel offset,
+    //  and blue adds it.
+    const vec2 scaled_r_shift =
+        get_aa_subpixel_r_offset() * final_axis_importance;
+    const vec2 offset_g = offset * final_axis_importance;
+    const vec2 offset_r = offset_g - scaled_r_shift;
+    const vec2 offset_b = offset_g + scaled_r_shift;
+    //  Radial distances, used by the cylindrical variants:
+    const vec3 radius = vec3(
+        length(offset_r), length(offset_g), length(offset_b));
+    //  Statically select a filter by id.  Every branch returns, so plain
+    //  guards replace an else-chain.  Ids that are not below 9.5 share the
+    //  separable box with id 0: default to a box, because Lanczos Jinc is
+    //  so bad. ;)
+    if(aa_filter < 0.5 || !(aa_filter < 9.5))   //  0: Box, separable
+    {
+        return vec3(eval_separable_box_filter(offset_r),
+            eval_separable_box_filter(offset_g),
+            eval_separable_box_filter(offset_b));
+    }
+    if(aa_filter < 1.5)     //  1: Box, cylindrical
+    {
+        return vec3(eval_box_filter(radius.r),
+            eval_box_filter(radius.g),
+            eval_box_filter(radius.b));
+    }
+    if(aa_filter < 2.5)     //  2: Tent, separable
+    {
+        return vec3(
+            eval_tent_filter(offset_r.x) * eval_tent_filter(offset_r.y),
+            eval_tent_filter(offset_g.x) * eval_tent_filter(offset_g.y),
+            eval_tent_filter(offset_b.x) * eval_tent_filter(offset_b.y));
+    }
+    if(aa_filter < 3.5)     //  3: Tent, cylindrical
+    {
+        return vec3(eval_tent_filter(radius.r),
+            eval_tent_filter(radius.g),
+            eval_tent_filter(radius.b));
+    }
+    if(aa_filter < 4.5)     //  4: Gaussian, separable
+    {
+        return vec3(
+            eval_gaussian_filter(offset_r.x) * eval_gaussian_filter(offset_r.y),
+            eval_gaussian_filter(offset_g.x) * eval_gaussian_filter(offset_g.y),
+            eval_gaussian_filter(offset_b.x) * eval_gaussian_filter(offset_b.y));
+    }
+    if(aa_filter < 5.5)     //  5: Gaussian, cylindrical
+    {
+        return vec3(eval_gaussian_filter(radius.r),
+            eval_gaussian_filter(radius.g),
+            eval_gaussian_filter(radius.b));
+    }
+    if(aa_filter < 6.5)     //  6: Cubic, separable
+    {
+        return vec3(eval_separable_cubic_filter(offset_r),
+            eval_separable_cubic_filter(offset_g),
+            eval_separable_cubic_filter(offset_b));
+    }
+    if(aa_filter < 7.5)     //  7: Cubic, cylindrical
+    {
+        return vec3(eval_cubic_filter(radius.r),
+            eval_cubic_filter(radius.g),
+            eval_cubic_filter(radius.b));
+    }
+    if(aa_filter < 8.5)     //  8: Lanczos sinc, separable
+    {
+        return vec3(eval_separable_lanczos_sinc_filter(offset_r),
+            eval_separable_lanczos_sinc_filter(offset_g),
+            eval_separable_lanczos_sinc_filter(offset_b));
+    }
+    //  9: Lanczos jinc, cylindrical
+    return vec3(eval_lanczos_jinc_filter(radius.r),
+        eval_lanczos_jinc_filter(radius.g),
+        eval_lanczos_jinc_filter(radius.b));
+}
+
+
+//////////////////////////////  HELPER FUNCTIONS  //////////////////////////////
+
+vec4 tex2Daa_tiled_linearize(const sampler2D samp, const vec2 s)
+{
+    //  Linearizing lookup for manually tiled textures, where anisotropic
+    //  filtering can get confused.  One workaround is the tex2Dlod variant:
+    #ifdef ANTIALIAS_DISABLE_ANISOTROPIC
+        //  TODO: Use tex2Dlod_linearize with a calculated mip level.
+        return tex2Dlod_linearize(samp, vec4(s, 0.0, 0.0));
+    #else
+        return tex2D_linearize(samp, s);
+    #endif
+}
+
+vec2 get_frame_sign(const float frame)
+{
+    //  Returns a per-axis sign (+1.0 or -1.0) to multiply sample offsets by.
+    //  Always (1, 1) on even frames or when aa_temporal is off.
+    if(aa_temporal)
+    {
+        //  On odd frames, mirror only along axes with a (near-)zero red
+        //  subpixel offset, so the same subpixel sample weights stay valid.
+        //  Axes with a real subpixel offset keep sign +1.0 (never 0.0):
+        const float frame_odd = float(fmod(frame, 2.0) > 0.5);
+        const vec2 aa_r_offset = get_aa_subpixel_r_offset();
+        const vec2 mirror_axis = vec2(abs(aa_r_offset) < vec2(FIX_ZERO(0.0)));
+        return vec2(1.0) - 2.0 * frame_odd * mirror_axis;
+    }
+    return vec2(1.0);
+}
+
+
+/////////////////////////  ANTIALIASED TEXTURE LOOKUPS  ////////////////////////
+
+vec3 tex2Daa_subpixel_weights_only(const sampler2D texture,
+    const vec2 tex_uv, const vec2x2 pixel_to_tex_uv)
+{
+    //  No antialiasing: one independent linearized lookup per subpixel, with
+    //  red at +offset, green at the center, blue at -offset.  May alias badly.
+    const vec2 r_shift_uv = mul(pixel_to_tex_uv, get_aa_subpixel_r_offset());
+    const vec2 uv_r = tex_uv + r_shift_uv;
+    const vec2 uv_b = tex_uv - r_shift_uv;
+    return vec3(tex2D_linearize(texture, uv_r).r,
+        tex2D_linearize(texture, tex_uv).g,
+        tex2D_linearize(texture, uv_b).b);
+}
+
+//  The tex2Daa* functions compile very slowly due to all the macros and
+//  compile-time math, so only include the ones we'll actually use!
+vec3 tex2Daa4x(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  4-sample antialiased lookup on an RGMS4 (4-queens) grid:
+    //  . . Q .  : off =(-1.5, -1.5)/4 + (2.0, 0.0)/4
+    //  Q . . .  : off =(-1.5, -1.5)/4 + (0.0, 1.0)/4
+    //  . . . Q  : off =(-1.5, -1.5)/4 + (3.0, 2.0)/4
+    //  . Q . .  : off =(-1.5, -1.5)/4 + (1.0, 3.0)/4
+    //  The last two rows are the negated offsets of the first two (in
+    //  reverse order), so only two offsets and two weight evaluations are
+    //  computed; mirrored taps reuse the weights with r and b swapped.
+    const float taps_per_axis = 4.0;
+    assign_aa_cubic_constants();
+    const vec4 support_and_importance =
+        get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 axis_importance = support_and_importance.zw;
+    const vec2 cell_step =
+        vec2(1.0)/taps_per_axis * support_and_importance.xy;
+    const vec2 first_cell = vec2(0.5 - taps_per_axis*0.5) * cell_step;
+    //  Screenspace positions of the explicitly computed taps:
+    const vec2 pos0 = first_cell + vec2(2.0, 0.0) * cell_step;
+    const vec2 pos1 = first_cell + vec2(0.0, 1.0) * cell_step;
+    //  Unnormalized per-channel weights for those taps:
+    const vec3 wt0 = eval_unorm_rgb_weights(pos0, axis_importance);
+    const vec3 wt1 = eval_unorm_rgb_weights(pos1, axis_importance);
+    //  Reciprocal of the total weight (explicit half plus mirrored half):
+    const vec3 wt_half_total = wt0 + wt1;
+    const vec3 wt_total_inv =
+        vec3(1.0)/(wt_half_total + wt_half_total.bgr);
+    //  Pixel-to-uv matrix scaled by the antialiasing pixel diameter:
+    const vec2x2 scaled_pixel_to_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    //  Apply the per-frame sign from get_frame_sign() and map to uv space:
+    const vec2 mirror_sign = get_frame_sign(frame);
+    const vec2 uv_delta0 = mul(scaled_pixel_to_uv, pos0 * mirror_sign);
+    const vec2 uv_delta1 = mul(scaled_pixel_to_uv, pos1 * mirror_sign);
+    //  Fetch each tap (fwd) and its mirror image about tex_uv (rev):
+    const vec3 fwd0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta0).rgb;
+    const vec3 rev0 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta0).rgb;
+    const vec3 fwd1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta1).rgb;
+    const vec3 rev1 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta1).rgb;
+    //  Weighted sum, normalized so each channel's weights total 1.0:
+    const vec3 blended = wt0 * fwd0 + wt1 * fwd1 +
+        wt1.bgr * rev1 + wt0.bgr * rev0;
+    return wt_total_inv * blended;
+}
+
+vec3 tex2Daa5x(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  5-sample antialiased lookup on a diagonally symmetric 5-queens grid:
+    //  . Q . . .  : off =(-2.0, -2.0)/5 + (1.0, 0.0)/5
+    //  . . . . Q  : off =(-2.0, -2.0)/5 + (4.0, 1.0)/5
+    //  . . Q . .  : off =(-2.0, -2.0)/5 + (2.0, 2.0)/5
+    //  Q . . . .  : off =(-2.0, -2.0)/5 + (0.0, 3.0)/5
+    //  . . . Q .  : off =(-2.0, -2.0)/5 + (3.0, 4.0)/5
+    //  The middle row has zero offset and is sampled at tex_uv itself; the
+    //  last two rows are the negated offsets of the first two, so their
+    //  taps reuse the first two weights with r and b swapped.
+    const float taps_per_axis = 5.0;
+    assign_aa_cubic_constants();
+    const vec4 support_and_importance =
+        get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 axis_importance = support_and_importance.zw;
+    const vec2 cell_step =
+        vec2(1.0)/taps_per_axis * support_and_importance.xy;
+    const vec2 first_cell = vec2(0.5 - taps_per_axis*0.5) * cell_step;
+    //  Screenspace positions of the explicitly computed taps:
+    const vec2 pos0 = first_cell + vec2(1.0, 0.0) * cell_step;
+    const vec2 pos1 = first_cell + vec2(4.0, 1.0) * cell_step;
+    const vec2 pos2 = first_cell + vec2(2.0, 2.0) * cell_step;
+    //  Unnormalized per-channel weights for those taps:
+    const vec3 wt0 = eval_unorm_rgb_weights(pos0, axis_importance);
+    const vec3 wt1 = eval_unorm_rgb_weights(pos1, axis_importance);
+    const vec3 wt2 = eval_unorm_rgb_weights(pos2, axis_importance);
+    //  Reciprocal of the total weight over all five taps:
+    const vec3 wt_total_inv =
+        vec3(1.0)/(wt0 + wt1 + wt2 + wt1.bgr + wt0.bgr);
+    //  Pixel-to-uv matrix scaled by the antialiasing pixel diameter:
+    const vec2x2 scaled_pixel_to_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    //  Apply the per-frame sign from get_frame_sign() and map to uv space:
+    const vec2 mirror_sign = get_frame_sign(frame);
+    const vec2 uv_delta0 = mul(scaled_pixel_to_uv, pos0 * mirror_sign);
+    const vec2 uv_delta1 = mul(scaled_pixel_to_uv, pos1 * mirror_sign);
+    //  Fetch the center tap, then each tap (fwd) and its mirror (rev):
+    const vec3 center = tex2Daa_tiled_linearize(texture, tex_uv).rgb;
+    const vec3 fwd0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta0).rgb;
+    const vec3 rev0 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta0).rgb;
+    const vec3 fwd1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta1).rgb;
+    const vec3 rev1 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta1).rgb;
+    //  Weighted sum, normalized so each channel's weights total 1.0:
+    const vec3 blended = wt0 * fwd0 + wt1 * fwd1 +
+        wt2 * center + wt1.bgr * rev1 + wt0.bgr * rev0;
+    return wt_total_inv * blended;
+}
+
+vec3 tex2Daa6x(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  6-sample antialiased lookup on a diagonally symmetric 6-queens grid
+    //  with a stronger horizontal than vertical slant:
+    //  . . . . Q .  : off =(-2.5, -2.5)/6 + (4.0, 0.0)/6
+    //  . . Q . . .  : off =(-2.5, -2.5)/6 + (2.0, 1.0)/6
+    //  Q . . . . .  : off =(-2.5, -2.5)/6 + (0.0, 2.0)/6
+    //  . . . . . Q  : off =(-2.5, -2.5)/6 + (5.0, 3.0)/6
+    //  . . . Q . .  : off =(-2.5, -2.5)/6 + (3.0, 4.0)/6
+    //  . Q . . . .  : off =(-2.5, -2.5)/6 + (1.0, 5.0)/6
+    //  The last three rows are the negated offsets of the first three (in
+    //  reverse order), so only three offsets and weights are computed;
+    //  mirrored taps reuse the weights with r and b swapped.
+    const float taps_per_axis = 6.0;
+    assign_aa_cubic_constants();
+    const vec4 support_and_importance =
+        get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 axis_importance = support_and_importance.zw;
+    const vec2 cell_step =
+        vec2(1.0)/taps_per_axis * support_and_importance.xy;
+    const vec2 first_cell = vec2(0.5 - taps_per_axis*0.5) * cell_step;
+    //  Screenspace positions of the explicitly computed taps:
+    const vec2 pos0 = first_cell + vec2(4.0, 0.0) * cell_step;
+    const vec2 pos1 = first_cell + vec2(2.0, 1.0) * cell_step;
+    const vec2 pos2 = first_cell + vec2(0.0, 2.0) * cell_step;
+    //  Unnormalized per-channel weights for those taps:
+    const vec3 wt0 = eval_unorm_rgb_weights(pos0, axis_importance);
+    const vec3 wt1 = eval_unorm_rgb_weights(pos1, axis_importance);
+    const vec3 wt2 = eval_unorm_rgb_weights(pos2, axis_importance);
+    //  Reciprocal of the total weight (explicit half plus mirrored half):
+    const vec3 wt_half_total = wt0 + wt1 + wt2;
+    const vec3 wt_total_inv =
+        vec3(1.0)/(wt_half_total + wt_half_total.bgr);
+    //  Pixel-to-uv matrix scaled by the antialiasing pixel diameter:
+    const vec2x2 scaled_pixel_to_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    //  Apply the per-frame sign from get_frame_sign() and map to uv space:
+    const vec2 mirror_sign = get_frame_sign(frame);
+    const vec2 uv_delta0 = mul(scaled_pixel_to_uv, pos0 * mirror_sign);
+    const vec2 uv_delta1 = mul(scaled_pixel_to_uv, pos1 * mirror_sign);
+    const vec2 uv_delta2 = mul(scaled_pixel_to_uv, pos2 * mirror_sign);
+    //  Fetch each tap (fwd) and its mirror image about tex_uv (rev):
+    const vec3 fwd0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta0).rgb;
+    const vec3 rev0 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta0).rgb;
+    const vec3 fwd1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta1).rgb;
+    const vec3 rev1 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta1).rgb;
+    const vec3 fwd2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta2).rgb;
+    const vec3 rev2 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta2).rgb;
+    //  Weighted sum, normalized so each channel's weights total 1.0:
+    const vec3 blended = wt0 * fwd0 + wt1 * fwd1 + wt2 * fwd2 +
+        wt2.bgr * rev2 + wt1.bgr * rev1 + wt0.bgr * rev0;
+    return wt_total_inv * blended;
+}
+
+vec3 tex2Daa7x(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  7-sample antialiased lookup on a diagonally symmetric 7-queens grid
+    //  with a queen in the center:
+    //  . Q . . . . .  : off =(-3.0, -3.0)/7 + (1.0, 0.0)/7
+    //  . . . . Q . .  : off =(-3.0, -3.0)/7 + (4.0, 1.0)/7
+    //  Q . . . . . .  : off =(-3.0, -3.0)/7 + (0.0, 2.0)/7
+    //  . . . Q . . .  : off =(-3.0, -3.0)/7 + (3.0, 3.0)/7
+    //  . . . . . . Q  : off =(-3.0, -3.0)/7 + (6.0, 4.0)/7
+    //  . . Q . . . .  : off =(-3.0, -3.0)/7 + (2.0, 5.0)/7
+    //  . . . . . Q .  : off =(-3.0, -3.0)/7 + (5.0, 6.0)/7
+    //  The middle row has zero offset and is sampled at tex_uv itself; the
+    //  last three rows are the negated offsets of the first three, so their
+    //  taps reuse the first three weights with r and b swapped.
+    const float taps_per_axis = 7.0;
+    assign_aa_cubic_constants();
+    const vec4 support_and_importance =
+        get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 axis_importance = support_and_importance.zw;
+    const vec2 cell_step =
+        vec2(1.0)/taps_per_axis * support_and_importance.xy;
+    const vec2 first_cell = vec2(0.5 - taps_per_axis*0.5) * cell_step;
+    //  Screenspace positions of the explicitly computed taps:
+    const vec2 pos0 = first_cell + vec2(1.0, 0.0) * cell_step;
+    const vec2 pos1 = first_cell + vec2(4.0, 1.0) * cell_step;
+    const vec2 pos2 = first_cell + vec2(0.0, 2.0) * cell_step;
+    const vec2 pos3 = first_cell + vec2(3.0, 3.0) * cell_step;
+    //  Unnormalized per-channel weights for those taps:
+    const vec3 wt0 = eval_unorm_rgb_weights(pos0, axis_importance);
+    const vec3 wt1 = eval_unorm_rgb_weights(pos1, axis_importance);
+    const vec3 wt2 = eval_unorm_rgb_weights(pos2, axis_importance);
+    const vec3 wt3 = eval_unorm_rgb_weights(pos3, axis_importance);
+    //  Reciprocal of the total weight (both halves plus the center tap):
+    const vec3 wt_half_total = wt0 + wt1 + wt2;
+    const vec3 wt_total_inv =
+        vec3(1.0)/(wt_half_total + wt_half_total.bgr + wt3);
+    //  Pixel-to-uv matrix scaled by the antialiasing pixel diameter:
+    const vec2x2 scaled_pixel_to_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    //  Apply the per-frame sign from get_frame_sign() and map to uv space:
+    const vec2 mirror_sign = get_frame_sign(frame);
+    const vec2 uv_delta0 = mul(scaled_pixel_to_uv, pos0 * mirror_sign);
+    const vec2 uv_delta1 = mul(scaled_pixel_to_uv, pos1 * mirror_sign);
+    const vec2 uv_delta2 = mul(scaled_pixel_to_uv, pos2 * mirror_sign);
+    //  Fetch the center tap, then each tap (fwd) and its mirror (rev):
+    const vec3 center = tex2Daa_tiled_linearize(texture, tex_uv).rgb;
+    const vec3 fwd0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta0).rgb;
+    const vec3 rev0 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta0).rgb;
+    const vec3 fwd1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta1).rgb;
+    const vec3 rev1 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta1).rgb;
+    const vec3 fwd2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta2).rgb;
+    const vec3 rev2 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta2).rgb;
+    //  Weighted sum, normalized so each channel's weights total 1.0:
+    const vec3 blended =
+        wt0 * fwd0 + wt1 * fwd1 + wt2 * fwd2 + wt3 * center +
+        wt2.bgr * rev2 + wt1.bgr * rev1 + wt0.bgr * rev0;
+    return wt_total_inv * blended;
+}
+
+vec3 tex2Daa8x(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  8-sample antialiased lookup on a diagonally symmetric 8-queens grid:
+    //  . . Q . . . . .  : off =(-3.5, -3.5)/8 + (2.0, 0.0)/8
+    //  . . . . Q . . .  : off =(-3.5, -3.5)/8 + (4.0, 1.0)/8
+    //  . Q . . . . . .  : off =(-3.5, -3.5)/8 + (1.0, 2.0)/8
+    //  . . . . . . . Q  : off =(-3.5, -3.5)/8 + (7.0, 3.0)/8
+    //  Q . . . . . . .  : off =(-3.5, -3.5)/8 + (0.0, 4.0)/8
+    //  . . . . . . Q .  : off =(-3.5, -3.5)/8 + (6.0, 5.0)/8
+    //  . . . Q . . . .  : off =(-3.5, -3.5)/8 + (3.0, 6.0)/8
+    //  . . . . . Q . .  : off =(-3.5, -3.5)/8 + (5.0, 7.0)/8
+    //  The last four rows are the negated offsets of the first four (in
+    //  reverse order), so only four offsets and weights are computed;
+    //  mirrored taps reuse the weights with r and b swapped.
+    const float taps_per_axis = 8.0;
+    assign_aa_cubic_constants();
+    const vec4 support_and_importance =
+        get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 axis_importance = support_and_importance.zw;
+    const vec2 cell_step =
+        vec2(1.0)/taps_per_axis * support_and_importance.xy;
+    const vec2 first_cell = vec2(0.5 - taps_per_axis*0.5) * cell_step;
+    //  Screenspace positions of the explicitly computed taps:
+    const vec2 pos0 = first_cell + vec2(2.0, 0.0) * cell_step;
+    const vec2 pos1 = first_cell + vec2(4.0, 1.0) * cell_step;
+    const vec2 pos2 = first_cell + vec2(1.0, 2.0) * cell_step;
+    const vec2 pos3 = first_cell + vec2(7.0, 3.0) * cell_step;
+    //  Unnormalized per-channel weights for those taps:
+    const vec3 wt0 = eval_unorm_rgb_weights(pos0, axis_importance);
+    const vec3 wt1 = eval_unorm_rgb_weights(pos1, axis_importance);
+    const vec3 wt2 = eval_unorm_rgb_weights(pos2, axis_importance);
+    const vec3 wt3 = eval_unorm_rgb_weights(pos3, axis_importance);
+    //  Reciprocal of the total weight (explicit half plus mirrored half):
+    const vec3 wt_half_total = wt0 + wt1 + wt2 + wt3;
+    const vec3 wt_total_inv =
+        vec3(1.0)/(wt_half_total + wt_half_total.bgr);
+    //  Pixel-to-uv matrix scaled by the antialiasing pixel diameter:
+    const vec2x2 scaled_pixel_to_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    //  Apply the per-frame sign from get_frame_sign() and map to uv space:
+    const vec2 mirror_sign = get_frame_sign(frame);
+    const vec2 uv_delta0 = mul(scaled_pixel_to_uv, pos0 * mirror_sign);
+    const vec2 uv_delta1 = mul(scaled_pixel_to_uv, pos1 * mirror_sign);
+    const vec2 uv_delta2 = mul(scaled_pixel_to_uv, pos2 * mirror_sign);
+    const vec2 uv_delta3 = mul(scaled_pixel_to_uv, pos3 * mirror_sign);
+    //  Fetch each tap (fwd) and its mirror image about tex_uv (rev):
+    const vec3 fwd0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta0).rgb;
+    const vec3 rev0 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta0).rgb;
+    const vec3 fwd1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta1).rgb;
+    const vec3 rev1 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta1).rgb;
+    const vec3 fwd2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta2).rgb;
+    const vec3 rev2 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta2).rgb;
+    const vec3 fwd3 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta3).rgb;
+    const vec3 rev3 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta3).rgb;
+    //  Weighted sum, normalized so each channel's weights total 1.0:
+    const vec3 blended =
+        wt0 * fwd0 + wt1 * fwd1 + wt2 * fwd2 + wt3 * fwd3 +
+        wt3.bgr * rev3 + wt2.bgr * rev2 + wt1.bgr * rev1 + wt0.bgr * rev0;
+    return wt_total_inv * blended;
+}
+
+vec3 tex2Daa12x(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  12-sample antialiased lookup on a diagonally symmetric
+    //  12-superqueens grid where no 3 points are exactly collinear:
+    //  . . . Q . . . . . . . .  : off =(-5.5, -5.5)/12 + (3.0, 0.0)/12
+    //  . . . . . . . . . Q . .  : off =(-5.5, -5.5)/12 + (9.0, 1.0)/12
+    //  . . . . . . Q . . . . .  : off =(-5.5, -5.5)/12 + (6.0, 2.0)/12
+    //  . Q . . . . . . . . . .  : off =(-5.5, -5.5)/12 + (1.0, 3.0)/12
+    //  . . . . . . . . . . . Q  : off =(-5.5, -5.5)/12 + (11.0, 4.0)/12
+    //  . . . . Q . . . . . . .  : off =(-5.5, -5.5)/12 + (4.0, 5.0)/12
+    //  . . . . . . . Q . . . .  : off =(-5.5, -5.5)/12 + (7.0, 6.0)/12
+    //  Q . . . . . . . . . . .  : off =(-5.5, -5.5)/12 + (0.0, 7.0)/12
+    //  . . . . . . . . . . Q .  : off =(-5.5, -5.5)/12 + (10.0, 8.0)/12
+    //  . . . . . Q . . . . . .  : off =(-5.5, -5.5)/12 + (5.0, 9.0)/12
+    //  . . Q . . . . . . . . .  : off =(-5.5, -5.5)/12 + (2.0, 10.0)/12
+    //  . . . . . . . . Q . . .  : off =(-5.5, -5.5)/12 + (8.0, 11.0)/12
+    //  The last six rows are the negated offsets of the first six (in
+    //  reverse order), so only six offsets and weights are computed;
+    //  mirrored taps reuse the weights with r and b swapped.
+    const float taps_per_axis = 12.0;
+    assign_aa_cubic_constants();
+    const vec4 support_and_importance =
+        get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 axis_importance = support_and_importance.zw;
+    const vec2 cell_step =
+        vec2(1.0)/taps_per_axis * support_and_importance.xy;
+    const vec2 first_cell = vec2(0.5 - taps_per_axis*0.5) * cell_step;
+    //  Screenspace positions of the explicitly computed taps:
+    const vec2 pos0 = first_cell + vec2(3.0, 0.0) * cell_step;
+    const vec2 pos1 = first_cell + vec2(9.0, 1.0) * cell_step;
+    const vec2 pos2 = first_cell + vec2(6.0, 2.0) * cell_step;
+    const vec2 pos3 = first_cell + vec2(1.0, 3.0) * cell_step;
+    const vec2 pos4 = first_cell + vec2(11.0, 4.0) * cell_step;
+    const vec2 pos5 = first_cell + vec2(4.0, 5.0) * cell_step;
+    //  Unnormalized per-channel weights for those taps:
+    const vec3 wt0 = eval_unorm_rgb_weights(pos0, axis_importance);
+    const vec3 wt1 = eval_unorm_rgb_weights(pos1, axis_importance);
+    const vec3 wt2 = eval_unorm_rgb_weights(pos2, axis_importance);
+    const vec3 wt3 = eval_unorm_rgb_weights(pos3, axis_importance);
+    const vec3 wt4 = eval_unorm_rgb_weights(pos4, axis_importance);
+    const vec3 wt5 = eval_unorm_rgb_weights(pos5, axis_importance);
+    //  Reciprocal of the total weight (explicit half plus mirrored half):
+    const vec3 wt_half_total = wt0 + wt1 + wt2 + wt3 + wt4 + wt5;
+    const vec3 wt_total_inv =
+        vec3(1.0)/(wt_half_total + wt_half_total.bgr);
+    //  Pixel-to-uv matrix scaled by the antialiasing pixel diameter:
+    const vec2x2 scaled_pixel_to_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    //  Apply the per-frame sign from get_frame_sign() and map to uv space:
+    const vec2 mirror_sign = get_frame_sign(frame);
+    const vec2 uv_delta0 = mul(scaled_pixel_to_uv, pos0 * mirror_sign);
+    const vec2 uv_delta1 = mul(scaled_pixel_to_uv, pos1 * mirror_sign);
+    const vec2 uv_delta2 = mul(scaled_pixel_to_uv, pos2 * mirror_sign);
+    const vec2 uv_delta3 = mul(scaled_pixel_to_uv, pos3 * mirror_sign);
+    const vec2 uv_delta4 = mul(scaled_pixel_to_uv, pos4 * mirror_sign);
+    const vec2 uv_delta5 = mul(scaled_pixel_to_uv, pos5 * mirror_sign);
+    //  Fetch each tap (fwd) and its mirror image about tex_uv (rev):
+    const vec3 fwd0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta0).rgb;
+    const vec3 rev0 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta0).rgb;
+    const vec3 fwd1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta1).rgb;
+    const vec3 rev1 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta1).rgb;
+    const vec3 fwd2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta2).rgb;
+    const vec3 rev2 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta2).rgb;
+    const vec3 fwd3 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta3).rgb;
+    const vec3 rev3 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta3).rgb;
+    const vec3 fwd4 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta4).rgb;
+    const vec3 rev4 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta4).rgb;
+    const vec3 fwd5 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta5).rgb;
+    const vec3 rev5 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta5).rgb;
+    //  Weighted sum, normalized so each channel's weights total 1.0:
+    const vec3 blended =
+        wt0 * fwd0 + wt1 * fwd1 + wt2 * fwd2 + wt3 * fwd3 +
+        wt4 * fwd4 + wt5 * fwd5 + wt5.bgr * rev5 + wt4.bgr * rev4 +
+        wt3.bgr * rev3 + wt2.bgr * rev2 + wt1.bgr * rev1 + wt0.bgr * rev0;
+    return wt_total_inv * blended;
+}
+
+vec3 tex2Daa16x(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  16-sample antialiased lookup on a diagonally symmetric
+    //  16-superqueens grid where no 3 points are exactly collinear:
+    //  . . Q . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (2.0, 0.0)/16
+    //  . . . . . . . . . Q . . . . . .  : off =(-7.5, -7.5)/16 + (9.0, 1.0)/16
+    //  . . . . . . . . . . . . Q . . .  : off =(-7.5, -7.5)/16 + (12.0, 2.0)/16
+    //  . . . . Q . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (4.0, 3.0)/16
+    //  . . . . . . . . Q . . . . . . .  : off =(-7.5, -7.5)/16 + (8.0, 4.0)/16
+    //  . . . . . . . . . . . . . . Q .  : off =(-7.5, -7.5)/16 + (14.0, 5.0)/16
+    //  Q . . . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (0.0, 6.0)/16
+    //  . . . . . . . . . . Q . . . . .  : off =(-7.5, -7.5)/16 + (10.0, 7.0)/16
+    //  . . . . . Q . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (5.0, 8.0)/16
+    //  . . . . . . . . . . . . . . . Q  : off =(-7.5, -7.5)/16 + (15.0, 9.0)/16
+    //  . Q . . . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (1.0, 10.0)/16
+    //  . . . . . . . Q . . . . . . . .  : off =(-7.5, -7.5)/16 + (7.0, 11.0)/16
+    //  . . . . . . . . . . . Q . . . .  : off =(-7.5, -7.5)/16 + (11.0, 12.0)/16
+    //  . . . Q . . . . . . . . . . . .  : off =(-7.5, -7.5)/16 + (3.0, 13.0)/16
+    //  . . . . . . Q . . . . . . . . .  : off =(-7.5, -7.5)/16 + (6.0, 14.0)/16
+    //  . . . . . . . . . . . . . Q . .  : off =(-7.5, -7.5)/16 + (13.0, 15.0)/16
+    //  The last eight rows are the negated offsets of the first eight (in
+    //  reverse order), so only eight offsets and weights are computed;
+    //  mirrored taps reuse the weights with r and b swapped.
+    const float taps_per_axis = 16.0;
+    assign_aa_cubic_constants();
+    const vec4 support_and_importance =
+        get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 axis_importance = support_and_importance.zw;
+    const vec2 cell_step =
+        vec2(1.0)/taps_per_axis * support_and_importance.xy;
+    const vec2 first_cell = vec2(0.5 - taps_per_axis*0.5) * cell_step;
+    //  Screenspace positions of the explicitly computed taps:
+    const vec2 pos0 = first_cell + vec2(2.0, 0.0) * cell_step;
+    const vec2 pos1 = first_cell + vec2(9.0, 1.0) * cell_step;
+    const vec2 pos2 = first_cell + vec2(12.0, 2.0) * cell_step;
+    const vec2 pos3 = first_cell + vec2(4.0, 3.0) * cell_step;
+    const vec2 pos4 = first_cell + vec2(8.0, 4.0) * cell_step;
+    const vec2 pos5 = first_cell + vec2(14.0, 5.0) * cell_step;
+    const vec2 pos6 = first_cell + vec2(0.0, 6.0) * cell_step;
+    const vec2 pos7 = first_cell + vec2(10.0, 7.0) * cell_step;
+    //  Unnormalized per-channel weights for those taps:
+    const vec3 wt0 = eval_unorm_rgb_weights(pos0, axis_importance);
+    const vec3 wt1 = eval_unorm_rgb_weights(pos1, axis_importance);
+    const vec3 wt2 = eval_unorm_rgb_weights(pos2, axis_importance);
+    const vec3 wt3 = eval_unorm_rgb_weights(pos3, axis_importance);
+    const vec3 wt4 = eval_unorm_rgb_weights(pos4, axis_importance);
+    const vec3 wt5 = eval_unorm_rgb_weights(pos5, axis_importance);
+    const vec3 wt6 = eval_unorm_rgb_weights(pos6, axis_importance);
+    const vec3 wt7 = eval_unorm_rgb_weights(pos7, axis_importance);
+    //  Reciprocal of the total weight (explicit half plus mirrored half):
+    const vec3 wt_half_total =
+        wt0 + wt1 + wt2 + wt3 + wt4 + wt5 + wt6 + wt7;
+    const vec3 wt_total_inv =
+        vec3(1.0)/(wt_half_total + wt_half_total.bgr);
+    //  Pixel-to-uv matrix scaled by the antialiasing pixel diameter:
+    const vec2x2 scaled_pixel_to_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    //  Apply the per-frame sign from get_frame_sign() and map to uv space:
+    const vec2 mirror_sign = get_frame_sign(frame);
+    const vec2 uv_delta0 = mul(scaled_pixel_to_uv, pos0 * mirror_sign);
+    const vec2 uv_delta1 = mul(scaled_pixel_to_uv, pos1 * mirror_sign);
+    const vec2 uv_delta2 = mul(scaled_pixel_to_uv, pos2 * mirror_sign);
+    const vec2 uv_delta3 = mul(scaled_pixel_to_uv, pos3 * mirror_sign);
+    const vec2 uv_delta4 = mul(scaled_pixel_to_uv, pos4 * mirror_sign);
+    const vec2 uv_delta5 = mul(scaled_pixel_to_uv, pos5 * mirror_sign);
+    const vec2 uv_delta6 = mul(scaled_pixel_to_uv, pos6 * mirror_sign);
+    const vec2 uv_delta7 = mul(scaled_pixel_to_uv, pos7 * mirror_sign);
+    //  Fetch each tap (fwd) and its mirror image about tex_uv (rev):
+    const vec3 fwd0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta0).rgb;
+    const vec3 rev0 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta0).rgb;
+    const vec3 fwd1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta1).rgb;
+    const vec3 rev1 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta1).rgb;
+    const vec3 fwd2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta2).rgb;
+    const vec3 rev2 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta2).rgb;
+    const vec3 fwd3 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta3).rgb;
+    const vec3 rev3 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta3).rgb;
+    const vec3 fwd4 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta4).rgb;
+    const vec3 rev4 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta4).rgb;
+    const vec3 fwd5 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta5).rgb;
+    const vec3 rev5 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta5).rgb;
+    const vec3 fwd6 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta6).rgb;
+    const vec3 rev6 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta6).rgb;
+    const vec3 fwd7 = tex2Daa_tiled_linearize(texture, tex_uv + uv_delta7).rgb;
+    const vec3 rev7 = tex2Daa_tiled_linearize(texture, tex_uv - uv_delta7).rgb;
+    //  Weighted sum, normalized so each channel's weights total 1.0:
+    const vec3 blended =
+        wt0 * fwd0 + wt1 * fwd1 + wt2 * fwd2 + wt3 * fwd3 +
+        wt4 * fwd4 + wt5 * fwd5 + wt6 * fwd6 + wt7 * fwd7 +
+        wt7.bgr * rev7 + wt6.bgr * rev6 + wt5.bgr * rev5 + wt4.bgr * rev4 +
+        wt3.bgr * rev3 + wt2.bgr * rev2 + wt1.bgr * rev1 + wt0.bgr * rev0;
+    return wt_total_inv * blended;
+}
+
+vec3 tex2Daa20x(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Use a diagonally symmetric 20-superqueens pattern where no 3 points are
+    //  exactly collinear and superqueens have a squared attack radius of 13.
+    //  . . . . . . . Q . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (7.0, 0.0)/20
+    //  . . . . . . . . . . . . . . . . Q . . .  : off =(-9.5, -9.5)/20 + (16.0, 1.0)/20
+    //  . . . . . . . . . . . Q . . . . . . . .  : off =(-9.5, -9.5)/20 + (11.0, 2.0)/20
+    //  . Q . . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (1.0, 3.0)/20
+    //  . . . . . Q . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (5.0, 4.0)/20
+    //  . . . . . . . . . . . . . . . Q . . . .  : off =(-9.5, -9.5)/20 + (15.0, 5.0)/20
+    //  . . . . . . . . . . Q . . . . . . . . .  : off =(-9.5, -9.5)/20 + (10.0, 6.0)/20
+    //  . . . . . . . . . . . . . . . . . . . Q  : off =(-9.5, -9.5)/20 + (19.0, 7.0)/20
+    //  . . Q . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (2.0, 8.0)/20
+    //  . . . . . . Q . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (6.0, 9.0)/20
+    //  . . . . . . . . . . . . . Q . . . . . .  : off =(-9.5, -9.5)/20 + (13.0, 10.0)/20
+    //  . . . . . . . . . . . . . . . . . Q . .  : off =(-9.5, -9.5)/20 + (17.0, 11.0)/20
+    //  Q . . . . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (0.0, 12.0)/20
+    //  . . . . . . . . . Q . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (9.0, 13.0)/20
+    //  . . . . Q . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (4.0, 14.0)/20
+    //  . . . . . . . . . . . . . . Q . . . . .  : off =(-9.5, -9.5)/20 + (14.0, 15.0)/20
+    //  . . . . . . . . . . . . . . . . . . Q .  : off =(-9.5, -9.5)/20 + (18.0, 16.0)/20
+    //  . . . . . . . . Q . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (8.0, 17.0)/20
+    //  . . . Q . . . . . . . . . . . . . . . .  : off =(-9.5, -9.5)/20 + (3.0, 18.0)/20
+    //  . . . . . . . . . . . . Q . . . . . . .  : off =(-9.5, -9.5)/20 + (12.0, 19.0)/20
+    const float grid_size = 20.0;
+    assign_aa_cubic_constants();
+    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 subpixel_support_diameter = ssd_fai.xy;
+    const vec2 final_axis_importance = ssd_fai.zw;
+    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
+    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offset of each sample.  Exploit diagonal symmetry:
+    const vec2 xy_offset0 = xy_start_offset + vec2(7.0, 0.0) * xy_step;
+    const vec2 xy_offset1 = xy_start_offset + vec2(16.0, 1.0) * xy_step;
+    const vec2 xy_offset2 = xy_start_offset + vec2(11.0, 2.0) * xy_step;
+    const vec2 xy_offset3 = xy_start_offset + vec2(1.0, 3.0) * xy_step;
+    const vec2 xy_offset4 = xy_start_offset + vec2(5.0, 4.0) * xy_step;
+    const vec2 xy_offset5 = xy_start_offset + vec2(15.0, 5.0) * xy_step;
+    const vec2 xy_offset6 = xy_start_offset + vec2(10.0, 6.0) * xy_step;
+    const vec2 xy_offset7 = xy_start_offset + vec2(19.0, 7.0) * xy_step;
+    const vec2 xy_offset8 = xy_start_offset + vec2(2.0, 8.0) * xy_step;
+    const vec2 xy_offset9 = xy_start_offset + vec2(6.0, 9.0) * xy_step;
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
+    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const vec3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance);
+    const vec3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance);
+    const vec3 w10 = w9.bgr;
+    const vec3 w11 = w8.bgr;
+    const vec3 w12 = w7.bgr;
+    const vec3 w13 = w6.bgr;
+    const vec3 w14 = w5.bgr;
+    const vec3 w15 = w4.bgr;
+    const vec3 w16 = w3.bgr;
+    const vec3 w17 = w2.bgr;
+    const vec3 w18 = w1.bgr;
+    const vec3 w19 = w0.bgr;
+    //  Get the weight sum to normalize the total to 1.0 later:
+    const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9;
+    const vec3 w_sum = half_sum + half_sum.bgr;
+    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const vec2x2 true_pixel_to_tex_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
+    //  diagonal symmetry:
+    const vec2 frame_sign = get_frame_sign(frame);
+    const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const vec2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const vec2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    const vec2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
+    const vec2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
+    const vec2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
+    const vec2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
+    const vec2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign);
+    const vec2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign);
+    //  Load samples, linearizing if necessary, etc.:
+    const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb;
+    const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb;
+    const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset2).rgb;
+    const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset3).rgb;
+    const vec3 sample4 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset4).rgb;
+    const vec3 sample5 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset5).rgb;
+    const vec3 sample6 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset6).rgb;
+    const vec3 sample7 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset7).rgb;
+    const vec3 sample8 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset8).rgb;
+    const vec3 sample9 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset9).rgb;
+    const vec3 sample10 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset9).rgb;
+    const vec3 sample11 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset8).rgb;
+    const vec3 sample12 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset7).rgb;
+    const vec3 sample13 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset6).rgb;
+    const vec3 sample14 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset5).rgb;
+    const vec3 sample15 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset4).rgb;
+    const vec3 sample16 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset3).rgb;
+    const vec3 sample17 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset2).rgb;
+    const vec3 sample18 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb;
+    const vec3 sample19 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
+        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 +
+        w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19);
+}
+
+vec3 tex2Daa24x(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Antialiased lookup with 24 samples on a diagonally symmetric 24-superqueens
+    //  pattern (no 3 points exactly collinear; squared attack radius of 13).
+    //  . . . . . . Q . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (6.0, 0.0)/24
+    //  . . . . . . . . . . . . . . . . Q . . . . . . .  : off =(-11.5, -11.5)/24 + (16.0, 1.0)/24
+    //  . . . . . . . . . . Q . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (10.0, 2.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . . Q . .  : off =(-11.5, -11.5)/24 + (21.0, 3.0)/24
+    //  . . . . . Q . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (5.0, 4.0)/24
+    //  . . . . . . . . . . . . . . . Q . . . . . . . .  : off =(-11.5, -11.5)/24 + (15.0, 5.0)/24
+    //  . Q . . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (1.0, 6.0)/24
+    //  . . . . . . . . . . . Q . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (11.0, 7.0)/24
+    //  . . . . . . . . . . . . . . . . . . . Q . . . .  : off =(-11.5, -11.5)/24 + (19.0, 8.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . . . . Q  : off =(-11.5, -11.5)/24 + (23.0, 9.0)/24
+    //  . . . Q . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (3.0, 10.0)/24
+    //  . . . . . . . . . . . . . . Q . . . . . . . . .  : off =(-11.5, -11.5)/24 + (14.0, 11.0)/24
+    //  . . . . . . . . . Q . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (9.0, 12.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . Q . . .  : off =(-11.5, -11.5)/24 + (20.0, 13.0)/24
+    //  Q . . . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (0.0, 14.0)/24
+    //  . . . . Q . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (4.0, 15.0)/24
+    //  . . . . . . . . . . . . Q . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (12.0, 16.0)/24
+    //  . . . . . . . . . . . . . . . . . . . . . . Q .  : off =(-11.5, -11.5)/24 + (22.0, 17.0)/24
+    //  . . . . . . . . Q . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (8.0, 18.0)/24
+    //  . . . . . . . . . . . . . . . . . . Q . . . . .  : off =(-11.5, -11.5)/24 + (18.0, 19.0)/24
+    //  . . Q . . . . . . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (2.0, 20.0)/24
+    //  . . . . . . . . . . . . . Q . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (13.0, 21.0)/24
+    //  . . . . . . . Q . . . . . . . . . . . . . . . .  : off =(-11.5, -11.5)/24 + (7.0, 22.0)/24
+    //  . . . . . . . . . . . . . . . . . Q . . . . . .  : off =(-11.5, -11.5)/24 + (17.0, 23.0)/24
+    const float grid_size = 24.0;
+    assign_aa_cubic_constants();
+    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 subpixel_support_diameter = ssd_fai.xy;
+    const vec2 final_axis_importance = ssd_fai.zw;
+    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
+    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offsets of rows 0-11 only; rows 12-23 are their exact negations:
+    const vec2 xy_offset0 = xy_start_offset + vec2(6.0, 0.0) * xy_step;
+    const vec2 xy_offset1 = xy_start_offset + vec2(16.0, 1.0) * xy_step;
+    const vec2 xy_offset2 = xy_start_offset + vec2(10.0, 2.0) * xy_step;
+    const vec2 xy_offset3 = xy_start_offset + vec2(21.0, 3.0) * xy_step;
+    const vec2 xy_offset4 = xy_start_offset + vec2(5.0, 4.0) * xy_step;
+    const vec2 xy_offset5 = xy_start_offset + vec2(15.0, 5.0) * xy_step;
+    const vec2 xy_offset6 = xy_start_offset + vec2(1.0, 6.0) * xy_step;
+    const vec2 xy_offset7 = xy_start_offset + vec2(11.0, 7.0) * xy_step;
+    const vec2 xy_offset8 = xy_start_offset + vec2(19.0, 8.0) * xy_step;
+    const vec2 xy_offset9 = xy_start_offset + vec2(23.0, 9.0) * xy_step;
+    const vec2 xy_offset10 = xy_start_offset + vec2(3.0, 10.0) * xy_step;
+    const vec2 xy_offset11 = xy_start_offset + vec2(14.0, 11.0) * xy_step;
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
+    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const vec3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance);
+    const vec3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance);
+    const vec3 w10 = eval_unorm_rgb_weights(xy_offset10, final_axis_importance);
+    const vec3 w11 = eval_unorm_rgb_weights(xy_offset11, final_axis_importance);
+    const vec3 w12 = w11.bgr;  //  Mirrored samples reuse weights with R/B swapped.
+    const vec3 w13 = w10.bgr;
+    const vec3 w14 = w9.bgr;
+    const vec3 w15 = w8.bgr;
+    const vec3 w16 = w7.bgr;
+    const vec3 w17 = w6.bgr;
+    const vec3 w18 = w5.bgr;
+    const vec3 w19 = w4.bgr;
+    const vec3 w20 = w3.bgr;
+    const vec3 w21 = w2.bgr;
+    const vec3 w22 = w1.bgr;
+    const vec3 w23 = w0.bgr;
+    //  Get the weight sum to normalize the total to 1.0 later:
+    const vec3 half_sum = w0 + w1 + w2 + w3 + w4 +
+        w5 + w6 + w7 + w8 + w9 + w10 + w11;
+    const vec3 w_sum = half_sum + half_sum.bgr;
+    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const vec2x2 true_pixel_to_tex_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    //  Get uv sample offsets, mirror on odd frames if directed, and exploit
+    //  diagonal symmetry:
+    const vec2 frame_sign = get_frame_sign(frame);
+    const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign);
+    const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign);
+    const vec2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign);
+    const vec2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign);
+    const vec2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign);
+    const vec2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign);
+    const vec2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign);
+    const vec2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign);
+    const vec2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign);
+    const vec2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign);
+    const vec2 uv_offset10 = mul(true_pixel_to_tex_uv, xy_offset10 * frame_sign);
+    const vec2 uv_offset11 = mul(true_pixel_to_tex_uv, xy_offset11 * frame_sign);
+    //  Load samples 0-11 at +offset and 12-23 at -offset, linearizing if necessary, etc.:
+    const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb;
+    const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb;
+    const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset2).rgb;
+    const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset3).rgb;
+    const vec3 sample4 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset4).rgb;
+    const vec3 sample5 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset5).rgb;
+    const vec3 sample6 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset6).rgb;
+    const vec3 sample7 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset7).rgb;
+    const vec3 sample8 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset8).rgb;
+    const vec3 sample9 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset9).rgb;
+    const vec3 sample10 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset10).rgb;
+    const vec3 sample11 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset11).rgb;
+    const vec3 sample12 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset11).rgb;
+    const vec3 sample13 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset10).rgb;
+    const vec3 sample14 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset9).rgb;
+    const vec3 sample15 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset8).rgb;
+    const vec3 sample16 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset7).rgb;
+    const vec3 sample17 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset6).rgb;
+    const vec3 sample18 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset5).rgb;
+    const vec3 sample19 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset4).rgb;
+    const vec3 sample20 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset3).rgb;
+    const vec3 sample21 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset2).rgb;
+    const vec3 sample22 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb;
+    const vec3 sample23 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
+        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 +
+        w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19 +
+        w20 * sample20 + w21 * sample21 + w22 * sample22 + w23 * sample23);
+}
+
+vec3 tex2Daa_debug_16x_regular(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Sample on a regular 4x4 grid (frame is unused).  This is mainly for testing.
+    const float grid_size = 4.0;
+    assign_aa_cubic_constants();
+    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 subpixel_support_diameter = ssd_fai.xy;
+    const vec2 final_axis_importance = ssd_fai.zw;
+    const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter;
+    const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step;
+    //  Get the xy offsets of samples 0-7 (rows 0-1); they feed the weights only:
+    const vec2 xy_offset0 = xy_start_offset + vec2(0.0, 0.0) * xy_step;
+    const vec2 xy_offset1 = xy_start_offset + vec2(1.0, 0.0) * xy_step;
+    const vec2 xy_offset2 = xy_start_offset + vec2(2.0, 0.0) * xy_step;
+    const vec2 xy_offset3 = xy_start_offset + vec2(3.0, 0.0) * xy_step;
+    const vec2 xy_offset4 = xy_start_offset + vec2(0.0, 1.0) * xy_step;
+    const vec2 xy_offset5 = xy_start_offset + vec2(1.0, 1.0) * xy_step;
+    const vec2 xy_offset6 = xy_start_offset + vec2(2.0, 1.0) * xy_step;
+    const vec2 xy_offset7 = xy_start_offset + vec2(3.0, 1.0) * xy_step;
+    //  Compute subpixel weights, and exploit diagonal symmetry for speed.
+    //  (We can't exploit vertical or horizontal symmetry due to uncertain
+    //  subpixel offsets.  We could fix that by rotating xy offsets with the
+    //  subpixel structure, but...no.)
+    const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance);
+    const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance);
+    const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance);
+    const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance);
+    const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance);
+    const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance);
+    const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance);
+    const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance);
+    const vec3 w8 = w7.bgr;  //  Samples 8-15 reuse weights 7-0 with R/B swapped.
+    const vec3 w9 = w6.bgr;
+    const vec3 w10 = w5.bgr;
+    const vec3 w11 = w4.bgr;
+    const vec3 w12 = w3.bgr;
+    const vec3 w13 = w2.bgr;
+    const vec3 w14 = w1.bgr;
+    const vec3 w15 = w0.bgr;
+    //  Get the weight sum to normalize the total to 1.0 later:
+    const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7;
+    const vec3 w_sum = half_sum + half_sum.bgr;
+    const vec3 w_sum_inv = vec3(1.0)/(w_sum);
+    //  Scale the pixel-space to texture offset matrix by the pixel diameter.
+    const vec2x2 true_pixel_to_tex_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    //  Get uv sample offsets, taking advantage of row alignment:
+    const vec2 uv_step_x = mul(true_pixel_to_tex_uv, vec2(xy_step.x, 0.0));
+    const vec2 uv_step_y = mul(true_pixel_to_tex_uv, vec2(0.0, xy_step.y));
+    const vec2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y);  //  0.5 - grid_size*0.5
+    const vec2 sample0_uv = tex_uv + uv_offset0;
+    const vec2 sample4_uv = sample0_uv + uv_step_y;
+    const vec2 sample8_uv = sample0_uv + uv_step_y * 2.0;
+    const vec2 sample12_uv = sample0_uv + uv_step_y * 3.0;
+    //  Load samples, linearizing if necessary, etc.:
+    const vec3 sample0 = tex2Daa_tiled_linearize(texture, sample0_uv).rgb;
+    const vec3 sample1 = tex2Daa_tiled_linearize(texture, sample0_uv + uv_step_x).rgb;
+    const vec3 sample2 = tex2Daa_tiled_linearize(texture, sample0_uv + uv_step_x * 2.0).rgb;
+    const vec3 sample3 = tex2Daa_tiled_linearize(texture, sample0_uv + uv_step_x * 3.0).rgb;
+    const vec3 sample4 = tex2Daa_tiled_linearize(texture, sample4_uv).rgb;
+    const vec3 sample5 = tex2Daa_tiled_linearize(texture, sample4_uv + uv_step_x).rgb;
+    const vec3 sample6 = tex2Daa_tiled_linearize(texture, sample4_uv + uv_step_x * 2.0).rgb;
+    const vec3 sample7 = tex2Daa_tiled_linearize(texture, sample4_uv + uv_step_x * 3.0).rgb;
+    const vec3 sample8 = tex2Daa_tiled_linearize(texture, sample8_uv).rgb;
+    const vec3 sample9 = tex2Daa_tiled_linearize(texture, sample8_uv + uv_step_x).rgb;
+    const vec3 sample10 = tex2Daa_tiled_linearize(texture, sample8_uv + uv_step_x * 2.0).rgb;
+    const vec3 sample11 = tex2Daa_tiled_linearize(texture, sample8_uv + uv_step_x * 3.0).rgb;
+    const vec3 sample12 = tex2Daa_tiled_linearize(texture, sample12_uv).rgb;
+    const vec3 sample13 = tex2Daa_tiled_linearize(texture, sample12_uv + uv_step_x).rgb;
+    const vec3 sample14 = tex2Daa_tiled_linearize(texture, sample12_uv + uv_step_x * 2.0).rgb;
+    const vec3 sample15 = tex2Daa_tiled_linearize(texture, sample12_uv + uv_step_x * 3.0).rgb;
+    //  Sum weighted samples (weight sum must equal 1.0 for each channel):
+    return w_sum_inv * (
+        w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
+        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15);
+}
+
+vec3 tex2Daa_debug_dynamic(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Testing only: sample an NxN grid with loop-computed weights (frame is unused).
+    const int grid_size = 8;
+    assign_aa_cubic_constants();
+    const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance();
+    const vec2 subpixel_support_diameter = ssd_fai.xy;
+    const vec2 final_axis_importance = ssd_fai.zw;
+    const float grid_radius_in_samples = (float(grid_size) - 1.0)/2.0;
+    const vec2 filter_space_offset_step =
+        subpixel_support_diameter/vec2(grid_size);
+    const vec2 sample0_filter_space_offset =
+        -grid_radius_in_samples * filter_space_offset_step;
+    //  Compute xy sample offsets and subpixel weights (row-major: i = row, j = col):
+    vec3 weights[grid_size * grid_size];
+    vec3 weight_sum = vec3(0.0);    //  GLSL has no implicit scalar->vector init.
+    for(int i = 0; i < grid_size; ++i)
+    {
+        for(int j = 0; j < grid_size; ++j)
+        {
+            //  Weights based on xy distances:
+            const vec2 offset = sample0_filter_space_offset +
+                vec2(j, i) * filter_space_offset_step;
+            const vec3 weight = eval_unorm_rgb_weights(offset, final_axis_importance);
+            weights[i*grid_size + j] = weight;
+            weight_sum += weight;
+        }
+    }
+    //  Get uv offset vectors along x and y directions:
+    const vec2x2 true_pixel_to_tex_uv =
+        vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter));
+    const vec2 uv_offset_step_x = mul(true_pixel_to_tex_uv,
+        vec2(filter_space_offset_step.x, 0.0));
+    const vec2 uv_offset_step_y = mul(true_pixel_to_tex_uv,
+        vec2(0.0, filter_space_offset_step.y));
+    //  Get a starting sample location:
+    const vec2 sample0_uv_offset = -grid_radius_in_samples *
+        (uv_offset_step_x + uv_offset_step_y);
+    const vec2 sample0_uv = tex_uv + sample0_uv_offset;
+    //  Load, weight, and sum [linearized] samples, then normalize by weight_sum:
+    vec3 sum = vec3(0.0);
+    const vec3 weight_sum_inv = vec3(1.0)/weight_sum;
+    for(int i = 0; i < grid_size; ++i)
+    {
+        const vec2 row_i_first_sample_uv =
+            sample0_uv + float(i) * uv_offset_step_y;   //  Explicit int->float.
+        for(int j = 0; j < grid_size; ++j)
+        {
+            const vec2 sample_uv =
+                row_i_first_sample_uv + float(j) * uv_offset_step_x;
+            sum += weights[i*grid_size + j] *
+                tex2Daa_tiled_linearize(texture, sample_uv).rgb;
+        }
+    }
+    return sum * weight_sum_inv;
+}
+
+
+///////////////////////  ANTIALIASING CODEPATH SELECTION  //////////////////////
+
+vec3 tex2Daa(const sampler2D texture, const vec2 tex_uv,
+    const vec2x2 pixel_to_tex_uv, const float frame)
+{
+    //  Statically pick the antialiasing codepath; the first matching bound wins:
+    if(aa_level < 0.5) return tex2D_linearize(texture, tex_uv).rgb;
+    if(aa_level < 3.5) return tex2Daa_subpixel_weights_only(
+        texture, tex_uv, pixel_to_tex_uv);
+    if(aa_level < 4.5) return tex2Daa4x(texture, tex_uv, pixel_to_tex_uv, frame);
+    if(aa_level < 5.5) return tex2Daa5x(texture, tex_uv, pixel_to_tex_uv, frame);
+    if(aa_level < 6.5) return tex2Daa6x(texture, tex_uv, pixel_to_tex_uv, frame);
+    if(aa_level < 7.5) return tex2Daa7x(texture, tex_uv, pixel_to_tex_uv, frame);
+    if(aa_level < 11.5) return tex2Daa8x(texture, tex_uv, pixel_to_tex_uv, frame);
+    if(aa_level < 15.5) return tex2Daa12x(texture, tex_uv, pixel_to_tex_uv, frame);
+    if(aa_level < 19.5) return tex2Daa16x(texture, tex_uv, pixel_to_tex_uv, frame);
+    if(aa_level < 23.5) return tex2Daa20x(texture, tex_uv, pixel_to_tex_uv, frame);
+    if(aa_level < 253.5) return tex2Daa24x(texture, tex_uv, pixel_to_tex_uv, frame);
+    if(aa_level < 254.5) return tex2Daa_debug_16x_regular(
+        texture, tex_uv, pixel_to_tex_uv, frame);
+    return tex2Daa_debug_dynamic(texture, tex_uv, pixel_to_tex_uv, frame);
+}
+
+
+#endif  //  TEX2DANTIALIAS_H
+
diff --git a/crt/shaders/crt-royale/src/user-preset-constants.h b/crt/shaders/crt-royale/src/user-preset-constants.h
new file mode 100644
index 0000000..ad70a9a
--- /dev/null
+++ b/crt/shaders/crt-royale/src/user-preset-constants.h
@@ -0,0 +1,58 @@
+#ifndef USER_CGP_CONSTANTS_H
+#define USER_CGP_CONSTANTS_H
+
+//  IMPORTANT:
+//  These constants MUST be set appropriately for the settings in crt-royale.cgp
+//  (or whatever related .cgp file you're using).  If they aren't, you're likely
+//  to get artifacts, the wrong phosphor mask size, etc.  I wish these could be
+//  set directly in the .cgp file to make things easier, but...they can't.
+
+//  PASS SCALES AND RELATED CONSTANTS:
+//  Copy the absolute scale_x for BLOOM_APPROX.  There are two major versions of
+//  this shader: One does a viewport-scale bloom, and the other skips it.  The
+//  latter benefits from a higher bloom_approx_scale_x, so save both separately:
+const float bloom_approx_size_x = 320.0;    //  For the viewport-scale bloom version
+const float bloom_approx_size_x_for_fake = 400.0;   //  For the version that skips it
+//  Copy the viewport-relative scales of the phosphor mask resize passes
+//  (MASK_RESIZE and the pass immediately preceding it):
+const vec2 mask_resize_viewport_scale = vec2(0.0625, 0.0625);
+//  Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
+const float geom_max_aspect_ratio = 4.0/3.0;
+
+//  PHOSPHOR MASK TEXTURE CONSTANTS:
+//  Set the following constants to reflect the properties of the phosphor mask
+//  texture named in crt-royale.cgp.  The shader optionally resizes a mask tile
+//  based on user settings, then repeats a single tile until filling the screen.
+//  The shader must know the input texture size (default 64x64), and to manually
+//  resize, it must also know the horizontal triads per tile (default 8).
+const vec2 mask_texture_small_size = vec2(64.0);
+const vec2 mask_texture_large_size = vec2(512.0);
+const float mask_triads_per_tile = 8.0;
+//  We need the average brightness of the phosphor mask to compensate for the
+//  dimming it causes.  The following four values are roughly correct for the
+//  masks included with the shader.  Update the value for any LUT texture you
+//  change.  [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
+//  the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
+//#define PHOSPHOR_MASK_GRILLE14
+const float mask_grille14_avg_color = 50.6666666/255.0;
+    //  TileableLinearApertureGrille14Wide7d33Spacing*.png
+    //  TileableLinearApertureGrille14Wide10And6Spacing*.png
+const float mask_grille15_avg_color = 53.0/255.0;
+    //  TileableLinearApertureGrille15Wide6d33Spacing*.png
+    //  TileableLinearApertureGrille15Wide8And5d5Spacing*.png
+const float mask_slot_avg_color = 46.0/255.0;
+    //  TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
+    //  TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
+const float mask_shadow_avg_color = 41.0/255.0;
+    //  TileableLinearShadowMask*.png
+    //  TileableLinearShadowMaskEDP*.png
+
+#ifdef PHOSPHOR_MASK_GRILLE14
+    const float mask_grille_avg_color = mask_grille14_avg_color;
+#else   //  15-pixel stripes (the default, since the #define above is commented out)
+    const float mask_grille_avg_color = mask_grille15_avg_color;
+#endif  //  PHOSPHOR_MASK_GRILLE14
+
+
+#endif  //  USER_CGP_CONSTANTS_H
+
diff --git a/crt/shaders/crt-royale/user-settings.h b/crt/shaders/crt-royale/user-settings.h
new file mode 100644
index 0000000..b631311
--- /dev/null
+++ b/crt/shaders/crt-royale/user-settings.h
@@ -0,0 +1,359 @@
+#ifndef USER_SETTINGS_H
+#define USER_SETTINGS_H
+
+/////////////////////////////  DRIVER CAPABILITIES  ////////////////////////////
+
+//  The Cg compiler uses different "profiles" with different capabilities.
+//  This shader requires a Cg compilation profile >= arbfp1, but a few options
+//  require higher profiles like fp30 or fp40.  The shader can't detect profile
+//  or driver capabilities, so instead you must comment or uncomment the lines
+//  below with "//" before "#define."  Disable an option if you get compilation
+//  errors resembling those listed.  Generally speaking, all of these options
+//  will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
+//  likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
+
+//  Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
+//  Among other things, derivatives help us fix anisotropic filtering artifacts
+//  with curved manually tiled phosphor mask coords.  Related errors:
+//  error C3004: function "vec2 ddx(vec2);" not supported in this profile
+//  error C3004: function "vec2 ddy(vec2);" not supported in this profile
+    //#define DRIVERS_ALLOW_DERIVATIVES
+
+//  Fine derivatives: Unsupported on older ATI cards.
+//  Fine derivatives enable 2x2 fragment block communication, letting us perform
+//  fast single-pass blur operations.  If your card uses coarse derivatives and
+//  these are enabled, blurs could look broken.  Derivatives are a prerequisite.
+    #ifdef DRIVERS_ALLOW_DERIVATIVES
+        #define DRIVERS_ALLOW_FINE_DERIVATIVES
+    #endif
+
+//  Dynamic looping: Requires an fp30 or newer profile.
+//  This makes phosphor mask resampling faster in some cases.  Related errors:
+//  error C5013: profile does not support "for" statements and "for" could not
+//  be unrolled
+    //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
+
+//  Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
+//  Using one static loop avoids overhead if the user is right, but if the user
+//  is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
+//  binary search can potentially save some iterations.  However, it may fail:
+//  error C6001: Temporary register limit of 32 exceeded; 35 registers
+//  needed to compile program
+    //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+
+//  tex2Dlod: Requires an fp40 or newer profile.  This can be used to disable
+//  anisotropic filtering, thereby fixing related artifacts.  Related errors:
+//  error C3004: function "vec4 tex2Dlod(sampler2D, vec4);" not supported in
+//  this profile
+    //#define DRIVERS_ALLOW_TEX2DLOD
+
+//  tex2Dbias: Requires an fp30 or newer profile.  This can be used to alleviate
+//  artifacts from anisotropic filtering and mipmapping.  Related errors:
+//  error C3004: function "vec4 tex2Dbias(sampler2D, vec4);" not supported
+//  in this profile
+    //#define DRIVERS_ALLOW_TEX2DBIAS
+
+//  Integrated graphics compatibility: Integrated graphics like Intel HD 4000
+//  impose stricter limitations on register counts and instructions.  Enable
+//  INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
+//  error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
+//  to compile program.
+//  Enabling integrated graphics compatibility mode will automatically disable:
+//  1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
+//      (This may be reenabled in a later release.)
+//  2.) RUNTIME_GEOMETRY_MODE
+//  3.) The high-quality 4x4 Gaussian resize for the bloom approximation
+    //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
+
+
+////////////////////////////  USER CODEPATH OPTIONS  ///////////////////////////
+
+//  To disable a #define option, turn its line into a comment with "//."
+
+//  RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
+//  Enable runtime shader parameters in the Retroarch (etc.) GUI?  They override
+//  many of the options in this file and allow real-time tuning, but many of
+//  them are slower.  Disabling them and using this text file will boost FPS.
+#define RUNTIME_SHADER_PARAMS_ENABLE
+//  Specify the phosphor bloom sigma at runtime?  This option is 10% slower, but
+//  it's the only way to do a wide-enough full bloom with a runtime dot pitch.
+#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
+//  Specify antialiasing weight parameters at runtime?  (Costs ~20% with cubics)
+#define RUNTIME_ANTIALIAS_WEIGHTS
+//  Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
+//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
+//  Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
+//  parameters?  This will require more math or dynamic branching.
+#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
+//  Specify the tilt at runtime?  This makes things about 3% slower.
+#define RUNTIME_GEOMETRY_TILT
+//  Specify the geometry mode at runtime?
+#define RUNTIME_GEOMETRY_MODE
+//  Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
+//  mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
+//  dynamic branches?  This is cheap if mask_resize_viewport_scale is small.
+#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
+
+//  PHOSPHOR MASK:
+//  Manually resize the phosphor mask for best results (slower)?  Disabling this
+//  removes the option to do so, but it may be faster without dynamic branches.
+    #define PHOSPHOR_MASK_MANUALLY_RESIZE
+//  If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
+    #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+//  Larger blurs are expensive, but we need them to blur larger triads.  We can
+//  detect the right blur if the triad size is static or our profile allows
+//  dynamic branches, but otherwise we use the largest blur the user indicates
+//  they might need:
+    #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
+    //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
+    //  Here's a helpful chart:
+    //  MaxTriadSize    BlurSize    MinTriadCountsByResolution
+    //  3.0             9.0         480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  6.0             17.0        240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  9.0             25.0        160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  12.0            31.0        120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+    //  18.0            43.0        80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
+
+
+///////////////////////////////  USER PARAMETERS  //////////////////////////////
+
+//  Note: Many of these static parameters are overridden by runtime shader
+//  parameters when those are enabled.  However, many others are static codepath
+//  options that were cleaner or more convenient to code as static constants.
+
+//  GAMMA:
+    const float crt_gamma_static = 2.5;                  //  range [1, 5]
+    const float lcd_gamma_static = 2.2;                  //  range [1, 5]
+
+//  LEVELS MANAGEMENT:
+    //  Control the final multiplicative image contrast:
+    const float levels_contrast_static = 1.0;            //  range [0, 4)
+    //  We auto-dim to avoid clipping between passes and restore brightness
+    //  later.  Control the dim factor here: Lower values clip less but crush
+    //  blacks more (static only for now).
+    const float levels_autodim_temp = 0.5;               //  range (0, 1]
+
+//  HALATION/DIFFUSION/BLOOM:
+    //  Halation weight: How much energy should be lost to electrons bouncing
+    //  around under the CRT glass and exciting random phosphors?
+    const float halation_weight_static = 0.0;            //  range [0, 1]
+    //  Refractive diffusion weight: How much light should spread/diffuse from
+    //  refracting through the CRT glass?
+    const float diffusion_weight_static = 0.075;         //  range [0, 1]
+    //  Underestimate brightness: Bright areas bloom more, but we can base the
+    //  bloom brightpass on a lower brightness to sharpen phosphors, or a higher
+    //  brightness to soften them.  Low values clip, but >= 0.8 looks okay.
+    const float bloom_underestimate_levels_static = 0.8; //  range [0, 5]
+    //  Blur all colors more than necessary for a softer phosphor bloom?
+    const float bloom_excess_static = 0.0;               //  range [0, 1]
+    //  The BLOOM_APPROX pass approximates a phosphor blur early on with a small
+    //  blurred resize of the input (convergence offsets are applied as well).
+    //  There are three filter options (static option only for now):
+    //  0.) Bilinear resize: A fast, close approximation to a 4x4 resize
+    //      if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
+    //      and beam_max_sigma is low.
+    //  1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
+    //      always uses a static sigma regardless of beam_max_sigma or
+    //      mask_num_triads_desired.
+    //  2.) True 4x4 Gaussian resize: Slowest, technically correct.
+    //  These options are more pronounced for the fast, unbloomed shader version.
+    const float bloom_approx_filter_static = 2.0;        //  range [0, 2]
+
+//  ELECTRON BEAM SCANLINE DISTRIBUTION:
+    //  How many scanlines should contribute light to each pixel?  Using more
+    //  scanlines is slower (especially for a generalized Gaussian) but less
+    //  distorted with larger beam sigmas (especially for a pure Gaussian).  The
+    //  max_beam_sigma at which the closest unused weight is guaranteed <
+    //  1.0/255.0 (for a 3x antialiased pure Gaussian) is:
+    //      2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized
+    //      3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
+    //      4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
+    //      5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
+    //      6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
+    const float beam_num_scanlines = 3.0;                //  range [2, 6]
+    //  A generalized Gaussian beam varies shape with color too, not just width.
+    //  It's slower but more flexible (static option only for now).
+    const bool beam_generalized_gaussian = true;
+    //  What kind of scanline antialiasing do you want?
+    //  0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
+    //  Integrals are slow (especially for generalized Gaussians) and rarely any
+    //  better than 3x antialiasing (static option only for now).
+    const float beam_antialias_level = 1.0;              //  range [0, 2]
+    //  Min/max standard deviations for scanline beams: Higher values widen and
+    //  soften scanlines.  Depending on other options, low min sigmas can alias.
+    const float beam_min_sigma_static = 0.02;            //  range (0, 1]
+    const float beam_max_sigma_static = 0.3;             //  range (0, 1]
+    //  Beam width varies as a function of color: A power function (0) is more
+    //  configurable, but a spherical function (1) gives the widest beam
+    //  variability without aliasing (static option only for now).
+    const float beam_spot_shape_function = 0.0;          //  range [0, 1]
+    //  Spot shape power: Powers <= 1 give smoother spot shapes but lower
+    //  sharpness.  Powers >= 1.0 are awful unless min/max sigmas are close.
+    const float beam_spot_power_static = 1.0/3.0;    //  range (0, 16]
+    //  Generalized Gaussian max shape parameters: Higher values give flatter
+    //  scanline plateaus and steeper dropoffs, simultaneously widening and
+    //  sharpening scanlines at the cost of aliasing.  2.0 is pure Gaussian, and
+    //  values > ~40.0 cause artifacts with integrals.
+    const float beam_min_shape_static = 2.0;         //  range [2, 32]
+    const float beam_max_shape_static = 4.0;         //  range [2, 32]
+    //  Generalized Gaussian shape power: Affects how quickly the distribution
+    //  changes shape from Gaussian to steep/plateaued as color increases from 0
+    //  to 1.0.  Higher powers appear softer for most colors, and lower powers
+    //  appear sharper for most colors.
+    const float beam_shape_power_static = 1.0/4.0;   //  range (0, 16]
+    //  What filter should be used to sample scanlines horizontally?
+    //  0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
+    const float beam_horiz_filter_static = 0.0;      //  range [0, 2]
+    //  Standard deviation for horizontal Gaussian resampling:
+    const float beam_horiz_sigma_static = 0.35;      //  range (0, 2/3]
+    //  Do horizontal scanline sampling in linear RGB (correct light mixing),
+    //  gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
+    //  limiting circuitry in some CRT's), or a weighted avg.?
+    const float beam_horiz_linear_rgb_weight_static = 1.0;   //  range [0, 1]
+    //  Simulate scanline misconvergence?  This needs 3x horizontal texture
+    //  samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
+    //  later passes (static option only for now).
+    const bool beam_misconvergence = true;
+    //  Convergence offsets in x/y directions for R/G/B scanline beams in units
+    //  of scanlines.  Positive offsets go right/down; ranges [-2, 2]
+    const vec2 convergence_offsets_r_static = vec2(0.1, 0.2);
+    const vec2 convergence_offsets_g_static = vec2(0.3, 0.4);
+    const vec2 convergence_offsets_b_static = vec2(0.5, 0.6);
+    //  Detect interlacing (static option only for now)?
+    const bool interlace_detect = true;
+    //  Assume 1080-line sources are interlaced?
+    const bool interlace_1080i_static = false;
+    //  For interlaced sources, assume TFF (top-field first) or BFF order?
+    //  (Whether this matters depends on the nature of the interlaced input.)
+    const bool interlace_bff_static = false;
+
+//  ANTIALIASING:
+    //  What AA level do you want for curvature/overscan/subpixels?  Options:
+    //  0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
+    //  (Static option only for now)
+    const float aa_level = 12.0;                     //  range [0, 24]
+    //  What antialiasing filter do you want (static option only)?  Options:
+    //  0: Box (separable), 1: Box (cylindrical),
+    //  2: Tent (separable), 3: Tent (cylindrical),
+    //  4: Gaussian (separable), 5: Gaussian (cylindrical),
+    //  6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
+    //  8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
+    //      * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
+    const float aa_filter = 6.0;                     //  range [0, 9]
+    //  Flip the sample grid on odd/even frames (static option only for now)?
+    const bool aa_temporal = false;
+    //  Use RGB subpixel offsets for antialiasing?  The pixel is at green, and
+    //  the blue offset is the negative r offset; range [0, 0.5]
+    //  NOTE(review): the default x offset below is negative (-1/3), so the
+    //  stated range presumably describes offset magnitudes -- confirm.
+    const vec2 aa_subpixel_r_offset_static = vec2(-1.0/3.0, 0.0);//vec2(0.0);
+    //  Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
+    //  1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    //  2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
+    //  3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
+    //  4.) C = 0.0 is a soft spline filter.
+    const float aa_cubic_c_static = 0.5;             //  range [0, 4]
+    //  Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
+    const float aa_gauss_sigma_static = 0.5;     //  range [0.0625, 1.0]
+
+//  PHOSPHOR MASK:
+    //  Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
+    const float mask_type_static = 1.0;                  //  range [0, 2]
+    //  We can sample the mask three ways.  Pick 2/3 from: Pretty/Fast/Flexible.
+    //  0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
+    //      This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
+    //  1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible).  This
+    //      is halfway decent with LUT mipmapping but atrocious without it.
+    //  2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
+    //      (pretty/fast/inflexible).  Each input LUT has a fixed dot pitch.
+    //      This mode reuses the same masks, so triads will be enormous unless
+    //      you change the mask LUT filenames in your .cgp file.
+    const float mask_sample_mode_static = 0.0;           //  range [0, 2]
+    //  Prefer setting the triad size (0.0) or number on the screen (1.0)?
+    //  If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
+    //  will always be used to calculate the full bloom sigma statically.
+    const float mask_specify_num_triads_static = 0.0;    //  range [0, 1]
+    //  Specify the phosphor triad size, in pixels.  Each tile (usually with 8
+    //  triads) will be rounded to the nearest integer tile size and clamped to
+    //  obey minimum size constraints (imposed to reduce downsize taps) and
+    //  maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
+    //  To increase the size limit, double the viewport-relative scales for the
+    //  two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    const float mask_triad_size_desired_static = 24.0 / 8.0;
+    //  If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
+    //  final size will be rounded and constrained as above); default 480.0
+    const float mask_num_triads_desired_static = 480.0;
+    //  How many lobes should the sinc/Lanczos resizer use?  More lobes require
+    //  more samples and avoid moire a bit better, but some is unavoidable
+    //  depending on the destination size (static option for now).
+    const float mask_sinc_lobes = 3.0;                   //  range [2, 4]
+    //  The mask is resized using a variable number of taps in each dimension,
+    //  but some Cg profiles always fetch a constant number of taps no matter
+    //  what (no dynamic branching).  We can limit the maximum number of taps if
+    //  we statically limit the minimum phosphor triad size.  Larger values are
+    //  faster, but the limit IS enforced (static option only, forever);
+    //      range [1, mask_texture_small_size/mask_triads_per_tile]
+    //  TODO: Make this 1.0 and compensate with smarter sampling!
+    const float mask_min_allowed_triad_size = 2.0;
+
+//  GEOMETRY:
+    //  Geometry mode:
+    //  0: Off (default), 1: Spherical mapping (like cgwg's),
+    //  2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
+    const float geom_mode_static = 0.0;      //  range [0, 3]
+    //  Radius of curvature: Measured in units of your viewport's diagonal size.
+    const float geom_radius_static = 2.0;    //  range [1/(2*pi), 1024]
+    //  View dist is the distance from the player to their physical screen, in
+    //  units of the viewport's diagonal size.  It controls the field of view.
+    const float geom_view_dist_static = 2.0; //  range [0.5, 1024]
+    //  Tilt angle in radians (clockwise around up and right vectors):
+    const vec2 geom_tilt_angle_static = vec2(0.0, 0.0);  //  range [-pi, pi]
+    //  Aspect ratio: When the true viewport size is unknown, this value is used
+    //  to help convert between the phosphor triad size and count, along with
+    //  the mask_resize_viewport_scale constant from user-cgp-constants.h.  Set
+    //  this equal to Retroarch's display aspect ratio (DAR) for best results;
+    //  range [1, geom_max_aspect_ratio from user-cgp-constants.h];
+    //  default (256/224)*(54/47) = 1.313069909 (see below)
+    const float geom_aspect_ratio_static = 1.313069909;
+    //  Before getting into overscan, here's some general aspect ratio info:
+    //  - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
+    //  - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
+    //  - PAR = pixel aspect ratio   = DAR / SAR; holds regardless of cropping
+    //  Geometry processing has to "undo" the screen-space 2D DAR to calculate
+    //  3D view vectors, then reapplies the aspect ratio to the simulated CRT in
+    //  uv-space.  To ensure the source SAR is intended for a ~4:3 DAR, either:
+    //  a.) Enable Retroarch's "Crop Overscan"
+    //  b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
+    //  Real consoles use horizontal black padding in the signal, but emulators
+    //  often crop this without cropping the vertical padding; a 256x224 [S]NES
+    //  frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
+    //  The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
+    //      http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
+    //      http://forums.nesdev.com/viewtopic.php?p=24815#p24815
+    //  For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
+    //  without doing a. or b., but horizontal image borders will be tighter
+    //  than vertical ones, messing up curvature and overscan.  Fixing the
+    //  padding first corrects this.
+    //  Overscan: Amount to "zoom in" before cropping.  You can zoom uniformly
+    //  or adjust x/y independently to e.g. readd horizontal padding, as noted
+    //  above: Values < 1.0 zoom out; range (0, inf)
+    const vec2 geom_overscan_static = vec2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
+    //  Compute a proper pixel-space to texture-space matrix even without ddx()/
+    //  ddy()?  This is ~8.5% slower but improves antialiasing/subpixel filtering
+    //  with strong curvature (static option only for now).
+    const bool geom_force_correct_tangent_matrix = true;
+
+//  BORDERS:
+    //  Rounded border size in texture uv coords:
+    const float border_size_static = 0.015;           //  range [0, 0.5]
+    //  Border darkness: Moderate values darken the border smoothly, and high
+    //  values make the image very dark just inside the border:
+    const float border_darkness_static = 2.0;        //  range [0, inf)
+    //  Border compression: High numbers compress border transitions, narrowing
+    //  the dark border area.
+    const float border_compress_static = 2.5;        //  range [1, inf)
+
+
+#endif  //  USER_SETTINGS_H
+