add initial, broken first pass of crt-royale

This commit is contained in:
hunterk 2016-08-12 11:04:59 -05:00
parent f0dcc2198d
commit 8f4886fcc3
33 changed files with 9313 additions and 0 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 194 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 214 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 202 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 200 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.8 KiB

View file

@ -0,0 +1,12 @@
These files aren't nearly as good as canned .cgp presets with all of your
favorite settings, and there aren't nearly enough, but they're a start.
The nVidia settings files will only work on nVidia cards.
The ATI settings files will work on both AMD/ATI and nVidia cards.
The Intel settings files should additionally work on Intel HD 4000 Graphics, but
they disable manual phosphor mask resizing, so the phosphor mask will be softer.
For compatibility with Intel integrated graphics, you can either use the Intel-
specific .cgp files or use the Intel settings files. These are the same as the
ATI settings, except the following line is also uncommented:
#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE

View file

@ -0,0 +1,92 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
//#define DRIVERS_ALLOW_DERIVATIVES
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
//#define DRIVERS_ALLOW_TEX2DLOD
//#define DRIVERS_ALLOW_TEX2DBIAS
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
#define RUNTIME_SHADER_PARAMS_ENABLE
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
#define RUNTIME_ANTIALIAS_WEIGHTS
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
#define RUNTIME_GEOMETRY_TILT
#define RUNTIME_GEOMETRY_MODE
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#define PHOSPHOR_MASK_MANUALLY_RESIZE
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
/////////////////////////////// USER PARAMETERS //////////////////////////////
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
static const float levels_contrast_static = 1.0; // range [0, 4)
static const float levels_autodim_temp = 0.5; // range (0, 1]
static const float halation_weight_static = 0.0; // range [0, 1]
static const float diffusion_weight_static = 0.075; // range [0, 1]
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
static const float bloom_excess_static = 0.0; // range [0, 1]
static const float bloom_approx_filter_static = 2.0;
static const float beam_num_scanlines = 3.0; // range [2, 6]
static const bool beam_generalized_gaussian = true;
static const float beam_antialias_level = 1.0; // range [0, 2]
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
static const float beam_spot_shape_function = 0.0;
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
static const float beam_horiz_filter_static = 0.0;
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
static const bool beam_misconvergence = true;
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
static const bool interlace_detect = true;
static const bool interlace_1080i_static = false;
static const bool interlace_bff_static = false;
static const float aa_level = 12.0; // range [0, 24]
static const float aa_filter = 6.0; // range [0, 9]
static const bool aa_temporal = false;
static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
static const float aa_cubic_c_static = 0.5; // range [0, 4]
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
static const float mask_type_static = 1.0; // range [0, 2]
static const float mask_sample_mode_static = 0.0; // range [0, 2]
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
static const float mask_num_triads_desired_static = 480.0;
static const float mask_sinc_lobes = 3.0; // range [2, 4]
static const float mask_min_allowed_triad_size = 2.0;
static const float geom_mode_static = 0.0; // range [0, 3]
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
static const float geom_aspect_ratio_static = 1.313069909;
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
static const bool geom_force_correct_tangent_matrix = true;
static const float border_size_static = 0.015; // range [0, 0.5]
static const float border_darkness_static = 2.0; // range [0, inf)
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
//#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
//#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
//#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 2.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 3.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = true;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 6.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = true;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,92 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
//#define DRIVERS_ALLOW_DERIVATIVES
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
//#define DRIVERS_ALLOW_TEX2DLOD
//#define DRIVERS_ALLOW_TEX2DBIAS
#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
#define RUNTIME_SHADER_PARAMS_ENABLE
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
#define RUNTIME_ANTIALIAS_WEIGHTS
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
#define RUNTIME_GEOMETRY_TILT
#define RUNTIME_GEOMETRY_MODE
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#define PHOSPHOR_MASK_MANUALLY_RESIZE
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
/////////////////////////////// USER PARAMETERS //////////////////////////////
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
static const float levels_contrast_static = 1.0; // range [0, 4)
static const float levels_autodim_temp = 0.5; // range (0, 1]
static const float halation_weight_static = 0.0; // range [0, 1]
static const float diffusion_weight_static = 0.075; // range [0, 1]
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
static const float bloom_excess_static = 0.0; // range [0, 1]
static const float bloom_approx_filter_static = 0.0;
static const float beam_num_scanlines = 3.0; // range [2, 6]
static const bool beam_generalized_gaussian = true;
static const float beam_antialias_level = 1.0; // range [0, 2]
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
static const float beam_spot_shape_function = 0.0;
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
static const float beam_horiz_filter_static = 0.0;
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
static const bool beam_misconvergence = true;
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
static const bool interlace_detect = true;
static const bool interlace_1080i_static = false;
static const bool interlace_bff_static = false;
static const float aa_level = 12.0; // range [0, 24]
static const float aa_filter = 6.0; // range [0, 9]
static const bool aa_temporal = false;
static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
static const float aa_cubic_c_static = 0.5; // range [0, 4]
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
static const float mask_type_static = 1.0; // range [0, 2]
static const float mask_sample_mode_static = 0.0; // range [0, 2]
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
static const float mask_num_triads_desired_static = 480.0;
static const float mask_sinc_lobes = 3.0; // range [2, 4]
static const float mask_min_allowed_triad_size = 2.0;
static const float geom_mode_static = 0.0; // range [0, 3]
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
static const float geom_aspect_ratio_static = 1.313069909;
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
static const bool geom_force_correct_tangent_matrix = true;
static const float border_size_static = 0.015; // range [0, 0.5]
static const float border_darkness_static = 2.0; // range [0, inf)
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
//#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
//#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
//#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 0.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 3.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = true;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 6.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = true;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,92 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
#define DRIVERS_ALLOW_DERIVATIVES
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
#define DRIVERS_ALLOW_TEX2DLOD
#define DRIVERS_ALLOW_TEX2DBIAS
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
#define RUNTIME_SHADER_PARAMS_ENABLE
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
#define RUNTIME_ANTIALIAS_WEIGHTS
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
#define RUNTIME_GEOMETRY_TILT
#define RUNTIME_GEOMETRY_MODE
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#define PHOSPHOR_MASK_MANUALLY_RESIZE
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
/////////////////////////////// USER PARAMETERS //////////////////////////////
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
static const float levels_contrast_static = 1.0; // range [0, 4)
static const float levels_autodim_temp = 0.5; // range (0, 1]
static const float halation_weight_static = 0.0; // range [0, 1]
static const float diffusion_weight_static = 0.075; // range [0, 1]
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
static const float bloom_excess_static = 0.0; // range [0, 1]
static const float bloom_approx_filter_static = 2.0;
static const float beam_num_scanlines = 3.0; // range [2, 6]
static const bool beam_generalized_gaussian = true;
static const float beam_antialias_level = 1.0; // range [0, 2]
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
static const float beam_spot_shape_function = 0.0;
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
static const float beam_horiz_filter_static = 0.0;
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
static const bool beam_misconvergence = true;
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
static const bool interlace_detect = true;
static const bool interlace_1080i_static = false;
static const bool interlace_bff_static = false;
static const float aa_level = 12.0; // range [0, 24]
static const float aa_filter = 6.0; // range [0, 9]
static const bool aa_temporal = false;
static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
static const float aa_cubic_c_static = 0.5; // range [0, 4]
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
static const float mask_type_static = 1.0; // range [0, 2]
static const float mask_sample_mode_static = 0.0; // range [0, 2]
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
static const float mask_num_triads_desired_static = 480.0;
static const float mask_sinc_lobes = 3.0; // range [2, 4]
static const float mask_min_allowed_triad_size = 2.0;
static const float geom_mode_static = 0.0; // range [0, 3]
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
static const float geom_aspect_ratio_static = 1.313069909;
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
static const bool geom_force_correct_tangent_matrix = true;
static const float border_size_static = 0.015; // range [0, 0.5]
static const float border_darkness_static = 2.0; // range [0, inf)
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 2.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 3.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = true;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 6.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = true;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
//#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
//#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
//#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
//#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
//#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 2.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 3.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = false;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 8.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = false;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
//#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
//#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
//#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
//#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
//#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 0.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 3.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = false;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 8.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = false;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
//#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
//#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 2.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 3.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = false;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 8.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = false;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
//#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
//#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
//#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
//#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
//#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
//#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
//#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
//#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
//#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 0.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 2.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = false;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = false;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 8.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = mask_specify_num_triads_static;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = false;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
//#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
//#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
//#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
//#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
//#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
//#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
//#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
//#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
//#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 0.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 2.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = false;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = false;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 8.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = mask_specify_num_triads_static;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = false;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
//#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
//#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
//#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
//#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
//#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
//#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 0.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 2.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = false;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = false;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 8.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = mask_specify_num_triads_static;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = false;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
//#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
//#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
//#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 2.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 4.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = true;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 6.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = true;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 2.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 4.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = true;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 6.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = true;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "float2 ddx(float2);" not supported in this profile
// error C3004: function "float2 ddy(float2);" not supported in this profile
//#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in
// this profile
//#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported
// in this profile
//#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
static const float crt_gamma_static = 2.5; // range [1, 5]
static const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
static const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
static const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
static const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
static const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
static const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
static const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
static const float bloom_approx_filter_static = 2.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
static const float beam_num_scanlines = 3.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
static const bool beam_generalized_gaussian = true;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
static const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
static const float beam_min_sigma_static = 0.02; // range (0, 1]
static const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
static const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
static const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
static const float beam_min_shape_static = 2.0; // range [2, 32]
static const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
static const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
static const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
static const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
static const float2 convergence_offsets_r_static = float2(0.1, 0.2);
static const float2 convergence_offsets_g_static = float2(0.3, 0.4);
static const float2 convergence_offsets_b_static = float2(0.5, 0.6);
// Detect interlacing (static option only for now)?
static const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
static const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
static const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
static const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
static const float aa_filter = 6.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
static const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
static const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
static const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
static const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
static const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
static const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
static const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
static const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
static const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
static const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
static const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
static const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
static const bool geom_force_correct_tangent_matrix = true;
// BORDERS:
// Rounded border size in texture uv coords:
static const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
static const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
static const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H

View file

@ -0,0 +1,247 @@
#ifndef BIND_SHADER_PARAMS_H
#define BIND_SHADER_PARAMS_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
#include "../user-settings.h"
#include "derived-settings-and-constants.h"
// Override some parameters for gamma-management.h and tex2Dantialias.h:
#define OVERRIDE_DEVICE_GAMMA
const float gba_gamma = 3.5; // Irrelevant but necessary to define.
#define ANTIALIAS_OVERRIDE_BASICS
#define ANTIALIAS_OVERRIDE_PARAMETERS
// Disable runtime shader params if the user doesn't explicitly want them.
// Static constants will be defined in place of uniforms of the same name.
#ifndef RUNTIME_SHADER_PARAMS_ENABLE
#undef PARAMETER_UNIFORM
#endif
// Bind option names to shader parameter uniforms or static constants.
#ifdef PARAMETER_UNIFORM
uniform float crt_gamma;
uniform float lcd_gamma;
uniform float levels_contrast;
uniform float halation_weight;
uniform float diffusion_weight;
uniform float bloom_underestimate_levels;
uniform float bloom_excess;
uniform float beam_min_sigma;
uniform float beam_max_sigma;
uniform float beam_spot_power;
uniform float beam_min_shape;
uniform float beam_max_shape;
uniform float beam_shape_power;
uniform float beam_horiz_sigma;
#ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
uniform float beam_horiz_filter;
uniform float beam_horiz_linear_rgb_weight;
#else
const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
#endif
uniform float convergence_offset_x_r;
uniform float convergence_offset_x_g;
uniform float convergence_offset_x_b;
uniform float convergence_offset_y_r;
uniform float convergence_offset_y_g;
uniform float convergence_offset_y_b;
#ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
uniform float mask_type;
#else
const float mask_type = clamp(mask_type_static, 0.0, 2.0);
#endif
uniform float mask_sample_mode_desired;
uniform float mask_specify_num_triads;
uniform float mask_triad_size_desired;
uniform float mask_num_triads_desired;
uniform float aa_subpixel_r_offset_x_runtime;
uniform float aa_subpixel_r_offset_y_runtime;
#ifdef RUNTIME_ANTIALIAS_WEIGHTS
uniform float aa_cubic_c;
uniform float aa_gauss_sigma;
#else
const float aa_cubic_c = aa_cubic_c_static; // Clamp to [0, 4]?
const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static); // Clamp to [FIXZERO(0), 1]?
#endif
uniform float geom_mode_runtime;
uniform float geom_radius;
uniform float geom_view_dist;
uniform float geom_tilt_angle_x;
uniform float geom_tilt_angle_y;
uniform float geom_aspect_ratio_x;
uniform float geom_aspect_ratio_y;
uniform float geom_overscan_x;
uniform float geom_overscan_y;
uniform float border_size;
uniform float border_darkness;
uniform float border_compress;
uniform float interlace_bff;
uniform float interlace_1080i;
#else
// Use constants from user-settings.h, and limit ranges appropriately:
const float crt_gamma = max(0.0, crt_gamma_static);
const float lcd_gamma = max(0.0, lcd_gamma_static);
const float levels_contrast = clamp(levels_contrast_static, 0.0, 4.0);
const float halation_weight = clamp(halation_weight_static, 0.0, 1.0);
const float diffusion_weight = clamp(diffusion_weight_static, 0.0, 1.0);
const float bloom_underestimate_levels = max(FIX_ZERO(0.0), bloom_underestimate_levels_static);
const float bloom_excess = clamp(bloom_excess_static, 0.0, 1.0);
const float beam_min_sigma = max(FIX_ZERO(0.0), beam_min_sigma_static);
const float beam_max_sigma = max(beam_min_sigma, beam_max_sigma_static);
const float beam_spot_power = max(beam_spot_power_static, 0.0);
const float beam_min_shape = max(2.0, beam_min_shape_static);
const float beam_max_shape = max(beam_min_shape, beam_max_shape_static);
const float beam_shape_power = max(0.0, beam_shape_power_static);
const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
const float beam_horiz_sigma = max(FIX_ZERO(0.0), beam_horiz_sigma_static);
const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
// Unpack vector elements to match scalar uniforms:
const float convergence_offset_x_r = clamp(convergence_offsets_r_static.x, -4.0, 4.0);
const float convergence_offset_x_g = clamp(convergence_offsets_g_static.x, -4.0, 4.0);
const float convergence_offset_x_b = clamp(convergence_offsets_b_static.x, -4.0, 4.0);
const float convergence_offset_y_r = clamp(convergence_offsets_r_static.y, -4.0, 4.0);
const float convergence_offset_y_g = clamp(convergence_offsets_g_static.y, -4.0, 4.0);
const float convergence_offset_y_b = clamp(convergence_offsets_b_static.y, -4.0, 4.0);
const float mask_type = clamp(mask_type_static, 0.0, 2.0);
const float mask_sample_mode_desired = clamp(mask_sample_mode_static, 0.0, 2.0);
const float mask_specify_num_triads = clamp(mask_specify_num_triads_static, 0.0, 1.0);
const float mask_triad_size_desired = clamp(mask_triad_size_desired_static, 1.0, 18.0);
const float mask_num_triads_desired = clamp(mask_num_triads_desired_static, 342.0, 1920.0);
const float aa_subpixel_r_offset_x_runtime = clamp(aa_subpixel_r_offset_static.x, -0.5, 0.5);
const float aa_subpixel_r_offset_y_runtime = clamp(aa_subpixel_r_offset_static.y, -0.5, 0.5);
const float aa_cubic_c = aa_cubic_c_static; // Clamp to [0, 4]?
const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static); // Clamp to [FIXZERO(0), 1]?
const float geom_mode_runtime = clamp(geom_mode_static, 0.0, 3.0);
const float geom_radius = max(1.0/(2.0*pi), geom_radius_static); // Clamp to [1/(2*pi), 1024]?
const float geom_view_dist = max(0.5, geom_view_dist_static); // Clamp to [0.5, 1024]?
const float geom_tilt_angle_x = clamp(geom_tilt_angle_static.x, -pi, pi);
const float geom_tilt_angle_y = clamp(geom_tilt_angle_static.y, -pi, pi);
const float geom_aspect_ratio_x = geom_aspect_ratio_static; // Force >= 1?
const float geom_aspect_ratio_y = 1.0;
const float geom_overscan_x = max(FIX_ZERO(0.0), geom_overscan_static.x);
const float geom_overscan_y = max(FIX_ZERO(0.0), geom_overscan_static.y);
const float border_size = clamp(border_size_static, 0.0, 0.5); // 0.5 reaches to image center
const float border_darkness = max(0.0, border_darkness_static);
const float border_compress = max(1.0, border_compress_static); // < 1.0 darkens whole image
const float interlace_bff = float(interlace_bff_static);
const float interlace_1080i = float(interlace_1080i_static);
#endif
// Provide accessors for vector constants that pack scalar uniforms:
vec2 get_aspect_vector(const float geom_aspect_ratio)
{
// Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent
// the absolute scale from affecting the uv-mapping for curvature:
const float geom_clamped_aspect_ratio =
min(geom_aspect_ratio, geom_max_aspect_ratio);
const vec2 geom_aspect =
normalize(vec2(geom_clamped_aspect_ratio, 1.0));
return geom_aspect;
}
vec2 get_geom_overscan_vector()
{
return vec2(geom_overscan_x, geom_overscan_y);
}
vec2 get_geom_tilt_angle_vector()
{
return vec2(geom_tilt_angle_x, geom_tilt_angle_y);
}
vec3 get_convergence_offsets_x_vector()
{
return vec3(convergence_offset_x_r, convergence_offset_x_g,
convergence_offset_x_b);
}
vec3 get_convergence_offsets_y_vector()
{
return vec3(convergence_offset_y_r, convergence_offset_y_g,
convergence_offset_y_b);
}
vec2 get_convergence_offsets_r_vector()
{
return vec2(convergence_offset_x_r, convergence_offset_y_r);
}
vec2 get_convergence_offsets_g_vector()
{
return vec2(convergence_offset_x_g, convergence_offset_y_g);
}
vec2 get_convergence_offsets_b_vector()
{
return vec2(convergence_offset_x_b, convergence_offset_y_b);
}
vec2 get_aa_subpixel_r_offset()
{
#ifdef RUNTIME_ANTIALIAS_WEIGHTS
#ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// WARNING: THIS IS EXTREMELY EXPENSIVE.
return vec2(aa_subpixel_r_offset_x_runtime,
aa_subpixel_r_offset_y_runtime);
#else
return aa_subpixel_r_offset_static;
#endif
#else
return aa_subpixel_r_offset_static;
#endif
}
// Provide accessors settings which still need "cooking:"
float get_mask_amplify()
{
const float mask_grille_amplify = 1.0/mask_grille_avg_color;
const float mask_slot_amplify = 1.0/mask_slot_avg_color;
const float mask_shadow_amplify = 1.0/mask_shadow_avg_color;
return mask_type < 0.5 ? mask_grille_amplify :
mask_type < 1.5 ? mask_slot_amplify :
mask_shadow_amplify;
}
float get_mask_sample_mode()
{
#ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
return mask_sample_mode_desired;
#else
return clamp(mask_sample_mode_desired, 1.0, 2.0);
#endif
#else
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
return mask_sample_mode_static;
#else
return clamp(mask_sample_mode_static, 1.0, 2.0);
#endif
#endif
}
#endif // BIND_SHADER_PARAMS_H

View file

@ -0,0 +1,403 @@
#version 450
layout(push_constant) uniform Push
{
vec4 SourceSize;
uint FrameCount;
} registers;
layout(std140, set = 0, binding = 0) uniform UBO
{
mat4 MVP;
float crt_gamma;
float lcd_gamma;
float levels_contrast;
float halation_weight;
float diffusion_weight;
float bloom_underestimate_levels;
float bloom_excess;
float beam_min_sigma;
float beam_max_sigma;
float beam_spot_power;
float beam_min_shape;
float beam_max_shape;
float beam_shape_power;
float beam_horiz_filter;
float beam_horiz_sigma;
float beam_horiz_linear_rgb_weight;
float convergence_offset_x_r;
float convergence_offset_x_g;
float convergence_offset_x_b;
float convergence_offset_y_r;
float convergence_offset_y_g;
float convergence_offset_y_b;
float mask_type;
float mask_sample_mode_desired;
float mask_specify_num_triads;
float mask_triad_size_desired;
float mask_num_triads_desired;
float aa_subpixel_r_offset_x_runtime;
float aa_subpixel_r_offset_y_runtime;
float aa_cubic_c;
float aa_gauss_sigma;
float geom_mode_runtime;
float geom_radius;
float geom_view_dist;
float geom_tilt_angle_x;
float geom_tilt_angle_y;
float geom_aspect_ratio_x;
float geom_aspect_ratio_y;
float geom_overscan_x;
float geom_overscan_y;
float border_size;
float border_darkness;
float border_compress;
float interlace_bff;
float interlace_1080i;
} params;
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////// SETTINGS MANAGEMENT ////////////////////////////
// PASS SETTINGS:
// gamma-management.h needs to know what kind of pipeline we're using and
// what pass this is in that pipeline. This will become obsolete if/when we
// can #define things like this in the preset file.
#define FIRST_PASS
#define SIMULATE_CRT_ON_LCD
// Set shader params for all passes here:
#pragma parameter crt_gamma "crt_gamma" 2.5 1.0 5.0 0.025
#pragma parameter lcd_gamma "lcd_gamma" 2.2 1.0 5.0 0.025
#pragma parameter levels_contrast "levels_contrast" 1.0 0.0 4.0 0.015625
#pragma parameter halation_weight "halation_weight" 0.0 0.0 1.0 0.005
#pragma parameter diffusion_weight "diffusion_weight" 0.075 0.0 1.0 0.005
#pragma parameter bloom_underestimate_levels "bloom_underestimate_levels" 0.8 0.0 5.0 0.01
#pragma parameter bloom_excess "bloom_excess" 0.0 0.0 1.0 0.005
#pragma parameter beam_min_sigma "beam_min_sigma" 0.02 0.005 1.0 0.005
#pragma parameter beam_max_sigma "beam_max_sigma" 0.3 0.005 1.0 0.005
#pragma parameter beam_spot_power "beam_spot_power" 0.33 0.01 16.0 0.01
#pragma parameter beam_min_shape "beam_min_shape" 2.0 2.0 32.0 0.1
#pragma parameter beam_max_shape "beam_max_shape" 4.0 2.0 32.0 0.1
#pragma parameter beam_shape_power "beam_shape_power" 0.25 0.01 16.0 0.01
#pragma parameter beam_horiz_filter "beam_horiz_filter" 0.0 0.0 2.0 1.0
#pragma parameter beam_horiz_sigma "beam_horiz_sigma" 0.35 0.0 0.67 0.005
#pragma parameter beam_horiz_linear_rgb_weight "beam_horiz_linear_rgb_weight" 1.0 0.0 1.0 0.01
#pragma parameter convergence_offset_x_r "convergence_offset_x_r" 0.0 -4.0 4.0 0.05
#pragma parameter convergence_offset_x_g "convergence_offset_x_g" 0.0 -4.0 4.0 0.05
#pragma parameter convergence_offset_x_b "convergence_offset_x_b" 0.0 -4.0 4.0 0.05
#pragma parameter convergence_offset_y_r "convergence_offset_y_r" 0.0 -2.0 2.0 0.05
#pragma parameter convergence_offset_y_g "convergence_offset_y_g" 0.0 -2.0 2.0 0.05
#pragma parameter convergence_offset_y_b "convergence_offset_y_b" 0.0 -2.0 2.0 0.05
#pragma parameter mask_type "mask_type" 1.0 0.0 2.0 1.0
#pragma parameter mask_sample_mode_desired "mask_sample_mode" 0.0 0.0 2.0 1.0 // Consider blocking mode 2.
#pragma parameter mask_specify_num_triads "mask_specify_num_triads" 0.0 0.0 1.0 1.0
#pragma parameter mask_triad_size_desired "mask_triad_size_desired" 3.0 1.0 18.0 0.125
#pragma parameter mask_num_triads_desired "mask_num_triads_desired" 480.0 342.0 1920.0 1.0
#pragma parameter aa_subpixel_r_offset_x_runtime "aa_subpixel_r_offset_x" -0.333333333 -0.333333333 0.333333333 0.333333333
#pragma parameter aa_subpixel_r_offset_y_runtime "aa_subpixel_r_offset_y" 0.0 -0.333333333 0.333333333 0.333333333
#pragma parameter aa_cubic_c "antialias_cubic_sharpness" 0.5 0.0 4.0 0.015625
#pragma parameter aa_gauss_sigma "antialias_gauss_sigma" 0.5 0.0625 1.0 0.015625
#pragma parameter geom_mode_runtime "geom_mode" 0.0 0.0 3.0 1.0
#pragma parameter geom_radius "geom_radius" 2.0 0.16 1024.0 0.1
#pragma parameter geom_view_dist "geom_view_dist" 2.0 0.5 1024.0 0.25
#pragma parameter geom_tilt_angle_x "geom_tilt_angle_x" 0.0 -3.14159265 3.14159265 0.017453292519943295
#pragma parameter geom_tilt_angle_y "geom_tilt_angle_y" 0.0 -3.14159265 3.14159265 0.017453292519943295
#pragma parameter geom_aspect_ratio_x "geom_aspect_ratio_x" 432.0 1.0 512.0 1.0
#pragma parameter geom_aspect_ratio_y "geom_aspect_ratio_y" 329.0 1.0 512.0 1.0
#pragma parameter geom_overscan_x "geom_overscan_x" 1.0 0.00390625 4.0 0.00390625
#pragma parameter geom_overscan_y "geom_overscan_y" 1.0 0.00390625 4.0 0.00390625
#pragma parameter border_size "border_size" 0.015 0.0000001 0.5 0.005
#pragma parameter border_darkness "border_darkness" 2.0 0.0 16.0 0.0625
#pragma parameter border_compress "border_compress" 2.5 1.0 64.0 0.0625
#pragma parameter interlace_bff "interlace_bff" 0.0 0.0 1.0 1.0
#pragma parameter interlace_1080i "interlace_1080i" 0.0 0.0 1.0 1.0
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../user-settings.h"
#include "bind-shader-params.h"
//#include "../../../../include/gamma-management.h"
//#include "scanline-functions.h"
// from scanline-functions.h //
bool is_interlaced(float num_lines)
{
// Detect interlacing based on the number of lines in the source.
if(interlace_detect)
{
// NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
// NTSC Emulators: Typically 224 or 240 lines
// PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
// PAL Emulators: ?
// ATSC: 720p, 1080i, 1080p
// Where do we place our cutoffs? Assumptions:
// 1.) We only need to care about active lines.
// 2.) Anything > 288 and <= 576 lines is probably interlaced.
// 3.) Anything > 576 lines is probably not interlaced...
// 4.) ...except 1080 lines, which is a crapshoot (user decision).
// 5.) Just in case the main program uses calculated video sizes,
// we should nudge the float thresholds a bit.
bool sd_interlace;
if (num_lines > 288.5 && num_lines < 576.5)
{sd_interlace = true;}
else
{sd_interlace = false;}
bool hd_interlace;
if (num_lines > 1079.5 && num_lines < 1080.5)
{hd_interlace = true;}
else
{hd_interlace = false;}
return (sd_interlace || hd_interlace);
}
else
{
return false;
}
}
// end scanline-functions.h //
// from gamma-management.h //
/////////////////////////////// BASE CONSTANTS ///////////////////////////////
// Set standard gamma constants, but allow users to override them:
#ifndef OVERRIDE_STANDARD_GAMMA
// Standard encoding gammas:
const float ntsc_gamma = 2.2; // Best to use NTSC for PAL too?
const float pal_gamma = 2.8; // Never actually 2.8 in practice
// Typical device decoding gammas (only use for emulating devices):
// CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard
// gammas: The standards purposely undercorrected for an analog CRT's
// assumed 2.5 reference display gamma to maintain contrast in assumed
// [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf
// These unstated assumptions about display gamma and perceptual rendering
// intent caused a lot of confusion, and more modern CRT's seemed to target
// NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit
// (they struggle near black with 2.5 gamma anyway), especially PC/laptop
// displays designed to view sRGB in bright environments. (Standards are
// also in flux again with BT.1886, but it's underspecified for displays.)
const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55)
const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55)
const float lcd_reference_gamma = 2.5; // To match CRT
const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC
const float lcd_office_gamma = 2.2; // Approximates sRGB
#endif // OVERRIDE_STANDARD_GAMMA
// Assuming alpha == 1.0 might make it easier for users to avoid some bugs,
// but only if they're aware of it.
#ifndef OVERRIDE_ALPHA_ASSUMPTIONS
const bool assume_opaque_alpha = false;
#endif
/////////////////////// DERIVED CONSTANTS AS FUNCTIONS ///////////////////////
// gamma-management.h should be compatible with overriding gamma values with
// runtime user parameters, but we can only define other global constants in
// terms of static constants, not uniform user parameters. To get around this
// limitation, we need to define derived constants using functions.
// Set device gamma constants, but allow users to override them:
#ifdef OVERRIDE_DEVICE_GAMMA
// The user promises to globally define the appropriate constants:
float get_crt_gamma() { return crt_gamma; }
float get_gba_gamma() { return gba_gamma; }
float get_lcd_gamma() { return lcd_gamma; }
#else
float get_crt_gamma() { return crt_reference_gamma_high; }
float get_gba_gamma() { return 3.5; } // Game Boy Advance; in (3.0, 4.0)
float get_lcd_gamma() { return lcd_office_gamma; }
#endif // OVERRIDE_DEVICE_GAMMA
// Set decoding/encoding gammas for the first/lass passes, but allow overrides:
#ifdef OVERRIDE_FINAL_GAMMA
// The user promises to globally define the appropriate constants:
float get_intermediate_gamma() { return intermediate_gamma; }
float get_input_gamma() { return input_gamma; }
float get_output_gamma() { return output_gamma; }
#else
// If we gamma-correct every pass, always use ntsc_gamma between passes to
// ensure middle passes don't need to care if anything is being simulated:
float get_intermediate_gamma() { return ntsc_gamma; }
#ifdef SIMULATE_CRT_ON_LCD
float get_input_gamma() { return get_crt_gamma(); }
float get_output_gamma() { return get_lcd_gamma(); }
#else
#ifdef SIMULATE_GBA_ON_LCD
float get_input_gamma() { return get_gba_gamma(); }
float get_output_gamma() { return get_lcd_gamma(); }
#else
#ifdef SIMULATE_LCD_ON_CRT
float get_input_gamma() { return get_lcd_gamma(); }
float get_output_gamma() { return get_crt_gamma(); }
#else
#ifdef SIMULATE_GBA_ON_CRT
float get_input_gamma() { return get_gba_gamma(); }
float get_output_gamma() { return get_crt_gamma(); }
#else // Don't simulate anything:
float get_input_gamma() { return ntsc_gamma; }
float get_output_gamma() { return ntsc_gamma; }
#endif // SIMULATE_GBA_ON_CRT
#endif // SIMULATE_LCD_ON_CRT
#endif // SIMULATE_GBA_ON_LCD
#endif // SIMULATE_CRT_ON_LCD
#endif // OVERRIDE_FINAL_GAMMA
#ifndef GAMMA_ENCODE_EVERY_FBO
#ifdef FIRST_PASS
const bool linearize_input = true;
float get_pass_input_gamma() { return get_input_gamma(); }
#else
const bool linearize_input = false;
float get_pass_input_gamma() { return 1.0; }
#endif
#ifdef LAST_PASS
const bool gamma_encode_output = true;
float get_pass_output_gamma() { return get_output_gamma(); }
#else
const bool gamma_encode_output = false;
float get_pass_output_gamma() { return 1.0; }
#endif
#else
const bool linearize_input = true;
const bool gamma_encode_output = true;
#ifdef FIRST_PASS
float get_pass_input_gamma() { return get_input_gamma(); }
#else
float get_pass_input_gamma() { return get_intermediate_gamma(); }
#endif
#ifdef LAST_PASS
float get_pass_output_gamma() { return get_output_gamma(); }
#else
float get_pass_output_gamma() { return get_intermediate_gamma(); }
#endif
#endif
vec4 decode_input(const vec4 color)
{
if(linearize_input)
{
if(assume_opaque_alpha)
{
return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), 1.0);
}
else
{
return vec4(pow(color.rgb, vec3(get_pass_input_gamma())), color.a);
}
}
else
{
return color;
}
}
vec4 encode_output(const vec4 color)
{
if(gamma_encode_output)
{
if(assume_opaque_alpha)
{
return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), 1.0);
}
else
{
return vec4(pow(color.rgb, vec3(1.0/get_pass_output_gamma())), color.a);
}
}
else
{
return color;
}
}
#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
//vec4 tex2D_linearize(const sampler2D tex, const vec2 tex_coords)
//{ return decode_input(vec4(texture(tex, tex_coords))); }
//#define tex2D_linearize(C, D, E) decode_input(vec4(texture(C, D, E)))
//vec4 tex2D_linearize(const sampler2D tex, const vec2 tex_coords, const int texel_off)
//{ return decode_input(vec4(texture(tex, tex_coords, texel_off))); }
// end gamma-management.h //
#pragma stage vertex
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
layout(location = 0) out vec2 tex_uv;
layout(location = 1) out vec2 uv_step;
void main()
{
gl_Position = params.MVP * Position;
tex_uv = TexCoord;
// Save the uv distance between texels:
uv_step = vec2(1.0) * registers.SourceSize.zw;
}
#pragma stage fragment
layout(location = 0) in vec2 tex_uv;
layout(location = 1) in vec2 uv_step;
layout(location = 0) out vec4 FragColor;
layout(set = 0, binding = 2) uniform sampler2D Source;
void main()
{
// Detect interlacing: 1.0 = true, 0.0 = false.
const vec2 video_size = registers.SourceSize.xy;
bool interlaced = is_interlaced(video_size.y);
// Linearize the input based on CRT gamma and bob interlaced fields.
// Bobbing ensures we can immediately blur without getting artifacts.
// Note: TFF/BFF won't matter for sources that double-weave or similar.
if(interlace_detect)
{
// Sample the current line and an average of the previous/next line;
// tex2D_linearize will decode CRT gamma. Don't bother branching:
// const vec2 tex_uv = tex_uv;
const vec2 v_step = vec2(0.0, uv_step.y);
const vec3 curr_line = tex2D_linearize(
Source, tex_uv).rgb;
const vec3 last_line = tex2D_linearize(
Source, tex_uv - v_step).rgb;
const vec3 next_line = tex2D_linearize(
Source, tex_uv + v_step).rgb;
const vec3 interpolated_line = 0.5 * (last_line + next_line);
// If we're interlacing, determine which field curr_line is in:
float interlace_check = 0.0;
if (interlaced = true) interlace_check = 1.0;
const float modulus = interlace_check + 1.0;
const float field_offset =
mod(registers.FrameCount + float(params.interlace_bff), modulus);
const float curr_line_texel = tex_uv.y * registers.SourceSize.y;
// Use under_half to fix a rounding bug around exact texel locations.
const float line_num_last = floor(curr_line_texel - under_half);
const float wrong_field = mod(line_num_last + field_offset, modulus);
// Select the correct color, and output the result:
const vec3 color = mix(curr_line, interpolated_line, wrong_field);
FragColor = encode_output(vec4(color, 1.0));
}
else
{
FragColor = encode_output(tex2D_linearize(Source, tex_uv));
}
}

View file

@ -0,0 +1,315 @@
#ifndef DERIVED_SETTINGS_AND_CONSTANTS_H
#define DERIVED_SETTINGS_AND_CONSTANTS_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
///////////////////////////////// DESCRIPTION ////////////////////////////////
// These macros and constants can be used across the whole codebase.
// Unlike the values in user-settings.cgh, end users shouldn't modify these.
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../user-settings.h"
#include "user-preset-constants.h"
/////////////////////////////// FIXED SETTINGS ///////////////////////////////
// Avoid dividing by zero; using a macro overloads for float, vec2, etc.:
#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16
// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma.
#ifndef SIMULATE_CRT_ON_LCD
#define SIMULATE_CRT_ON_LCD
#endif
// Manually tiling a manually resized texture creates texture coord derivative
// discontinuities and confuses anisotropic filtering, causing discolored tile
// seams in the phosphor mask. Workarounds:
// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's
// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and
// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either.
// b.) "Tile flat twice" requires drawing two full tiles without border padding
// to the resized mask FBO, and it's incompatible with same-pass curvature.
// (Same-pass curvature isn't used but could be in the future...maybe.)
// c.) "Fix discontinuities" requires derivatives and drawing one tile with
// border padding to the resized mask FBO, but it works with same-pass
// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined.
// Precedence: a, then, b, then c (if multiple strategies are #defined).
#define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen
#define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen
#define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen
// Also, manually resampling the phosphor mask is slightly blurrier with
// anisotropic filtering. (Resampling with mipmapping is even worse: It
// creates artifacts, but only with the fully bloomed shader.) The difference
// is subtle with small triads, but you can fix it for a small cost.
//#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
////////////////////////////// DERIVED SETTINGS //////////////////////////////
// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the
// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable
// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be
// #defined by either user-settings.h or a wrapper .cg that #includes the
// current .cg pass.)
#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
#ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
#undef PHOSPHOR_MASK_MANUALLY_RESIZE
#endif
#ifdef RUNTIME_GEOMETRY_MODE
#undef RUNTIME_GEOMETRY_MODE
#endif
// Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is
// inferior in most cases, so replace 2.0 with 0.0:
const float bloom_approx_filter =
bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static;
#else
const float bloom_approx_filter = bloom_approx_filter_static;
#endif
// Disable slow runtime paths if static parameters are used. Most of these
// won't be a problem anyway once the params are disabled, but some will.
#ifndef RUNTIME_SHADER_PARAMS_ENABLE
#ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA
#undef RUNTIME_PHOSPHOR_BLOOM_SIGMA
#endif
#ifdef RUNTIME_ANTIALIAS_WEIGHTS
#undef RUNTIME_ANTIALIAS_WEIGHTS
#endif
#ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
#undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
#endif
#ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
#undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
#endif
#ifdef RUNTIME_GEOMETRY_TILT
#undef RUNTIME_GEOMETRY_TILT
#endif
#ifdef RUNTIME_GEOMETRY_MODE
#undef RUNTIME_GEOMETRY_MODE
#endif
#ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#endif
#endif
// Make tex2Dbias a backup for tex2Dlod for wider compatibility.
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
#define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#endif
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
#endif
// Rule out unavailable anisotropic compatibility strategies:
#ifndef DRIVERS_ALLOW_DERIVATIVES
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#endif
#endif
#ifndef DRIVERS_ALLOW_TEX2DLOD
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
#undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
#endif
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
#undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
#endif
#ifdef ANTIALIAS_DISABLE_ANISOTROPIC
#undef ANTIALIAS_DISABLE_ANISOTROPIC
#endif
#endif
#ifndef DRIVERS_ALLOW_TEX2DBIAS
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#endif
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
#undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
#endif
#endif
// Prioritize anisotropic tiling compatibility strategies by performance and
// disable unused strategies. This concentrates all the nesting in one place.
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#endif
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
#undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
#endif
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#endif
#else
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
#undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
#endif
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#endif
#else
// ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with
// flat texture coords in the same pass, but that's all we use.
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
#endif
#endif
#endif
#endif
// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can
// reduce some #ifdef nesting in the next section by essentially OR'ing them:
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD
#define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
#endif
#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS
#define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
#endif
// Prioritize anisotropic resampling compatibility strategies the same way:
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
#undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
#endif
#endif
/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS //////////////////////
// If we can use the large mipmapped LUT without mipmapping artifacts, we
// should: It gives us more options for using fewer samples.
#ifdef DRIVERS_ALLOW_TEX2DLOD
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
// TODO: Take advantage of this!
#define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT
const vec2 mask_resize_src_lut_size = mask_texture_large_size;
#else
const vec2 mask_resize_src_lut_size = mask_texture_small_size;
#endif
#else
const vec2 mask_resize_src_lut_size = mask_texture_small_size;
#endif
// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to
// main_fragment, or a static alias of one of the above. This makes it hard
// to select the phosphor mask at runtime: We can't even assign to a uniform
// global in the vertex shader or select a sampler2D in the vertex shader and
// pass it to the fragment shader (even with explicit TEXUNIT# bindings),
// because it just gives us the input texture or a black screen. However, we
// can get around these limitations by calling tex2D three times with different
// uniform samplers (or resizing the phosphor mask three times altogether).
// With dynamic branches, we can process only one of these branches on top of
// quickly discarding fragments we don't need (cgc seems able to overcome
// limigations around dependent texture fetches inside of branches). Without
// dynamic branches, we have to process every branch for every fragment...which
// is slower. Runtime sampling mode selection is slower without dynamic
// branches as well. Let the user's static #defines decide if it's worth it.
#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
#define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#else
#ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
#endif
#endif
// We need to render some minimum number of tiles in the resize passes.
// We need at least 1.0 just to repeat a single tile, and we need extra
// padding beyond that for anisotropic filtering, discontinuitity fixing,
// antialiasing, same-pass curvature (not currently used), etc. First
// determine how many border texels and tiles we need, based on how the result
// will be sampled:
#ifdef GEOMETRY_EARLY
const float max_subpixel_offset = aa_subpixel_r_offset_static.x;
// Most antialiasing filters have a base radius of 4.0 pixels:
const float max_aa_base_pixel_border = 4.0 +
max_subpixel_offset;
#else
const float max_aa_base_pixel_border = 0.0;
#endif
// Anisotropic filtering adds about 0.5 to the pixel border:
#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY
const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5;
#else
const float max_aniso_pixel_border = max_aa_base_pixel_border;
#endif
// Fixing discontinuities adds 1.0 more to the pixel border:
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0;
#else
const float max_tiled_pixel_border = max_aniso_pixel_border;
#endif
// Convert the pixel border to an integer texel border. Assume same-pass
// curvature about triples the texel frequency:
#ifdef GEOMETRY_EARLY
const float max_mask_texel_border =
ceil(max_tiled_pixel_border * 3.0);
#else
const float max_mask_texel_border = ceil(max_tiled_pixel_border);
#endif
// Convert the texel border to a tile border using worst-case assumptions:
const float max_mask_tile_border = max_mask_texel_border/
(mask_min_allowed_triad_size * mask_triads_per_tile);
// Finally, set the number of resized tiles to render to MASK_RESIZE, and set
// the starting texel (inside borders) for sampling it.
#ifndef GEOMETRY_EARLY
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
// Special case: Render two tiles without borders. Anisotropic
// filtering doesn't seem to be a problem here.
const float mask_resize_num_tiles = 1.0 + 1.0;
const float mask_start_texels = 0.0;
#else
const float mask_resize_num_tiles = 1.0 +
2.0 * max_mask_tile_border;
const float mask_start_texels = max_mask_texel_border;
#endif
#else
const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border;
const float mask_start_texels = max_mask_texel_border;
#endif
// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of
// mask_resize_viewport_scale. This limits the maximum final triad size.
// Estimate the minimum number of triads we can split the screen into in each
// dimension (we'll be as correct as mask_resize_viewport_scale is):
const float mask_resize_num_triads =
mask_resize_num_tiles * mask_triads_per_tile;
const vec2 min_allowed_viewport_triads =
vec2(mask_resize_num_triads) / mask_resize_viewport_scale;
//////////////////////// COMMON MATHEMATICAL CONSTANTS ///////////////////////
const float pi = 3.141592653589;
// We often want to find the location of the previous texel, e.g.:
// const vec2 curr_texel = uv * texture_size;
// const vec2 prev_texel = floor(curr_texel - vec2(0.5)) + vec2(0.5);
// const vec2 prev_texel_uv = prev_texel / texture_size;
// However, many GPU drivers round incorrectly around exact texel locations.
// We need to subtract a little less than 0.5 before flooring, and some GPU's
// require this value to be farther from 0.5 than others; define it here.
// const vec2 prev_texel =
// floor(curr_texel - vec2(under_half)) + vec2(0.5);
const float under_half = 0.4995;
#endif // DERIVED_SETTINGS_AND_CONSTANTS_H

View file

@ -0,0 +1,693 @@
#ifndef GEOMETRY_FUNCTIONS_H
#define GEOMETRY_FUNCTIONS_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../user-settings.h"
#include "derived-settings-and-constants.h"
#include "bind-shader-params.h"
//////////////////////////// MACROS AND CONSTANTS ////////////////////////////
// Curvature-related constants:
#define MAX_POINT_CLOUD_SIZE 9
///////////////////////////// CURVATURE FUNCTIONS /////////////////////////////
vec2 quadratic_solve(const float a, const float b_over_2, const float c)
{
// Requires: 1.) a, b, and c are quadratic formula coefficients
// 2.) b_over_2 = b/2.0 (simplifies terms to factor 2 out)
// 3.) b_over_2 must be guaranteed < 0.0 (avoids a branch)
// Returns: Returns vec2(first_solution, discriminant), so the caller
// can choose how to handle the "no intersection" case. The
// Kahan or Citardauq formula is used for numerical robustness.
const float discriminant = b_over_2*b_over_2 - a*c;
const float solution0 = c/(-b_over_2 + sqrt(discriminant));
return vec2(solution0, discriminant);
}
vec2 intersect_sphere(const vec3 view_vec, const vec3 eye_pos_vec)
{
// Requires: 1.) view_vec and eye_pos_vec are 3D vectors in the sphere's
// local coordinate frame (eye_pos_vec is a position, i.e.
// a vector from the origin to the eye/camera)
// 2.) geom_radius is a global containing the sphere's radius
// Returns: Cast a ray of direction view_vec from eye_pos_vec at a
// sphere of radius geom_radius, and return the distance to
// the first intersection in units of length(view_vec).
// http://wiki.cgsociety.org/index.php/Ray_Sphere_Intersection
// Quadratic formula coefficients (b_over_2 is guaranteed negative):
const float a = dot(view_vec, view_vec);
const float b_over_2 = dot(view_vec, eye_pos_vec); // * 2.0 factored out
const float c = dot(eye_pos_vec, eye_pos_vec) - geom_radius*geom_radius;
return quadratic_solve(a, b_over_2, c);
}
vec2 intersect_cylinder(const vec3 view_vec, const vec3 eye_pos_vec)
{
// Requires: 1.) view_vec and eye_pos_vec are 3D vectors in the sphere's
// local coordinate frame (eye_pos_vec is a position, i.e.
// a vector from the origin to the eye/camera)
// 2.) geom_radius is a global containing the cylinder's radius
// Returns: Cast a ray of direction view_vec from eye_pos_vec at a
// cylinder of radius geom_radius, and return the distance to
// the first intersection in units of length(view_vec). The
// derivation of the coefficients is in Christer Ericson's
// Real-Time Collision Detection, p. 195-196, and this version
// uses LaGrange's identity to reduce operations.
// Arbitrary "cylinder top" reference point for an infinite cylinder:
const vec3 cylinder_top_vec = vec3(0.0, geom_radius, 0.0);
const vec3 cylinder_axis_vec = vec3(0.0, 1.0, 0.0);//vec3(0.0, 2.0*geom_radius, 0.0);
const vec3 top_to_eye_vec = eye_pos_vec - cylinder_top_vec;
const vec3 axis_x_view = cross(cylinder_axis_vec, view_vec);
const vec3 axis_x_top_to_eye = cross(cylinder_axis_vec, top_to_eye_vec);
// Quadratic formula coefficients (b_over_2 is guaranteed negative):
const float a = dot(axis_x_view, axis_x_view);
const float b_over_2 = dot(axis_x_top_to_eye, axis_x_view);
const float c = dot(axis_x_top_to_eye, axis_x_top_to_eye) -
geom_radius*geom_radius;//*dot(cylinder_axis_vec, cylinder_axis_vec);
return quadratic_solve(a, b_over_2, c);
}
vec2 cylinder_xyz_to_uv(const vec3 intersection_pos_local,
const vec2 geom_aspect)
{
// Requires: An xyz intersection position on a cylinder.
// Returns: video_uv coords mapped to range [-0.5, 0.5]
// Mapping: Define square_uv.x to be the signed arc length in xz-space,
// and define square_uv.y = -intersection_pos_local.y (+v = -y).
// Start with a numerically robust arc length calculation.
const float angle_from_image_center = atan2(intersection_pos_local.x,
intersection_pos_local.z);
const float signed_arc_len = angle_from_image_center * geom_radius;
// Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide
// by the aspect ratio to stretch the mapping appropriately:
const vec2 square_uv = vec2(signed_arc_len, -intersection_pos_local.y);
const vec2 video_uv = square_uv / geom_aspect;
return video_uv;
}
vec3 cylinder_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect)
{
// Requires: video_uv coords mapped to range [-0.5, 0.5]
// Returns: An xyz intersection position on a cylinder. This is the
// inverse of cylinder_xyz_to_uv().
// Expand video_uv by the aspect ratio to get proportionate x/y lengths,
// then calculate an xyz position for the cylindrical mapping above.
const vec2 square_uv = video_uv * geom_aspect;
const float arc_len = square_uv.x;
const float angle_from_image_center = arc_len / geom_radius;
const float x_pos = sin(angle_from_image_center) * geom_radius;
const float z_pos = cos(angle_from_image_center) * geom_radius;
// Or: z = sqrt(geom_radius**2 - x**2)
// Or: z = geom_radius/sqrt(1.0 + tan(angle)**2), x = z * tan(angle)
const vec3 intersection_pos_local = vec3(x_pos, -square_uv.y, z_pos);
return intersection_pos_local;
}
vec2 sphere_xyz_to_uv(const vec3 intersection_pos_local,
const vec2 geom_aspect)
{
// Requires: An xyz intersection position on a sphere.
// Returns: video_uv coords mapped to range [-0.5, 0.5]
// Mapping: First define square_uv.x/square_uv.y ==
// intersection_pos_local.x/intersection_pos_local.y. Then,
// length(square_uv) is the arc length from the image center
// at (0.0, 0.0, geom_radius) along the tangent great circle.
// Credit for this mapping goes to cgwg: I never managed to
// understand his code, but he told me his mapping was based on
// great circle distances when I asked him about it, which
// informed this very similar (almost identical) mapping.
// Start with a numerically robust arc length calculation between the ray-
// sphere intersection point and the image center using a method posted by
// Roger Stafford on comp.soft-sys.matlab:
// https://groups.google.com/d/msg/comp.soft-sys.matlab/zNbUui3bjcA/c0HV_bHSx9cJ
const vec3 image_center_pos_local = vec3(0.0, 0.0, geom_radius);
const float cp_len =
length(cross(intersection_pos_local, image_center_pos_local));
const float dp = dot(intersection_pos_local, image_center_pos_local);
const float angle_from_image_center = atan2(cp_len, dp);
const float arc_len = angle_from_image_center * geom_radius;
// Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide
// by the aspect ratio to stretch the mapping appropriately:
const vec2 square_uv_unit = normalize(vec2(intersection_pos_local.x,
-intersection_pos_local.y));
const vec2 square_uv = arc_len * square_uv_unit;
const vec2 video_uv = square_uv / geom_aspect;
return video_uv;
}
vec3 sphere_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect)
{
// Requires: video_uv coords mapped to range [-0.5, 0.5]
// Returns: An xyz intersection position on a sphere. This is the
// inverse of sphere_xyz_to_uv().
// Expand video_uv by the aspect ratio to get proportionate x/y lengths,
// then calculate an xyz position for the spherical mapping above.
const vec2 square_uv = video_uv * geom_aspect;
// Using length or sqrt here butchers the framerate on my 8800GTS if
// this function is called too many times, and so does taking the max
// component of square_uv/square_uv_unit (program length threshold?).
//float arc_len = length(square_uv);
const vec2 square_uv_unit = normalize(square_uv);
const float arc_len = square_uv.y/square_uv_unit.y;
const float angle_from_image_center = arc_len / geom_radius;
const float xy_dist_from_sphere_center =
sin(angle_from_image_center) * geom_radius;
//vec2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len));
const vec2 xy_pos = xy_dist_from_sphere_center * square_uv_unit;
const float z_pos = cos(angle_from_image_center) * geom_radius;
const vec3 intersection_pos_local = vec3(xy_pos.x, -xy_pos.y, z_pos);
return intersection_pos_local;
}
vec2 sphere_alt_xyz_to_uv(const vec3 intersection_pos_local,
const vec2 geom_aspect)
{
// Requires: An xyz intersection position on a cylinder.
// Returns: video_uv coords mapped to range [-0.5, 0.5]
// Mapping: Define square_uv.x to be the signed arc length in xz-space,
// and define square_uv.y == signed arc length in yz-space.
// See cylinder_xyz_to_uv() for implementation details (very similar).
const vec2 angle_from_image_center = atan2(
vec2(intersection_pos_local.x, -intersection_pos_local.y),
intersection_pos_local.zz);
const vec2 signed_arc_len = angle_from_image_center * geom_radius;
const vec2 video_uv = signed_arc_len / geom_aspect;
return video_uv;
}
vec3 sphere_alt_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect)
{
// Requires: video_uv coords mapped to range [-0.5, 0.5]
// Returns: An xyz intersection position on a sphere. This is the
// inverse of sphere_alt_xyz_to_uv().
// See cylinder_uv_to_xyz() for implementation details (very similar).
const vec2 square_uv = video_uv * geom_aspect;
const vec2 arc_len = square_uv;
const vec2 angle_from_image_center = arc_len / geom_radius;
const vec2 xy_pos = sin(angle_from_image_center) * geom_radius;
const float z_pos = sqrt(geom_radius*geom_radius - dot(xy_pos, xy_pos));
return vec3(xy_pos.x, -xy_pos.y, z_pos);
}
inline vec2 intersect(const vec3 view_vec_local, const vec3 eye_pos_local,
const float geom_mode)
{
return geom_mode < 2.5 ? intersect_sphere(view_vec_local, eye_pos_local) :
intersect_cylinder(view_vec_local, eye_pos_local);
}
inline vec2 xyz_to_uv(const vec3 intersection_pos_local,
const vec2 geom_aspect, const float geom_mode)
{
return geom_mode < 1.5 ?
sphere_xyz_to_uv(intersection_pos_local, geom_aspect) :
geom_mode < 2.5 ?
sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect) :
cylinder_xyz_to_uv(intersection_pos_local, geom_aspect);
}
inline vec3 uv_to_xyz(const vec2 uv, const vec2 geom_aspect,
const float geom_mode)
{
return geom_mode < 1.5 ? sphere_uv_to_xyz(uv, geom_aspect) :
geom_mode < 2.5 ? sphere_alt_uv_to_xyz(uv, geom_aspect) :
cylinder_uv_to_xyz(uv, geom_aspect);
}
vec2 view_vec_to_uv(const vec3 view_vec_local, const vec3 eye_pos_local,
const vec2 geom_aspect, const float geom_mode, out vec3 intersection_pos)
{
// Get the intersection point on the primitive, given an eye position
// and view vector already in its local coordinate frame:
const vec2 intersect_dist_and_discriminant = intersect(view_vec_local,
eye_pos_local, geom_mode);
const vec3 intersection_pos_local = eye_pos_local +
view_vec_local * intersect_dist_and_discriminant.x;
// Save the intersection position to an output parameter:
intersection_pos = intersection_pos_local;
// Transform into uv coords, but give out-of-range coords if the
// view ray doesn't intersect the primitive in the first place:
return intersect_dist_and_discriminant.y > 0.005 ?
xyz_to_uv(intersection_pos_local, geom_aspect, geom_mode) : vec2(1.0);
}
vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos,
const vec2 geom_aspect, const vec3 global_coords[MAX_POINT_CLOUD_SIZE],
const int num_points)
{
// Requires: Parameters:
// 1.) Starting eye_pos is a global 3D position at which the
// camera contains all points in global_coords[] in its FOV
// 2.) geom_aspect = get_aspect_vector(
// IN.output_size.x / IN.output_size.y);
// 3.) global_coords is a point cloud containing global xyz
// coords of extreme points on the simulated CRT screen.
// Globals:
// 1.) geom_view_dist must be > 0.0. It controls the "near
// plane" used to interpret flat_video_uv as a view
// vector, which controls the field of view (FOV).
// Eyespace coordinate frame: +x = right, +y = up, +z = back
// Returns: Return an eye position at which the point cloud spans as
// much of the screen as possible (given the FOV controlled by
// geom_view_dist) without being cropped or sheared.
// Algorithm:
// 1.) Move the eye laterally to a point which attempts to maximize the
// the amount we can move forward without clipping the CRT screen.
// 2.) Move forward by as much as possible without clipping the CRT.
// Get the allowed movement range by solving for the eye_pos offsets
// that result in each point being projected to a screen edge/corner in
// pseudo-normalized device coords (where xy ranges from [-0.5, 0.5]
// and z = eyespace z):
// pndc_coord = vec3(vec2(eyespace_xyz.x, -eyespace_xyz.y)*
// geom_view_dist / (geom_aspect * -eyespace_xyz.z), eyespace_xyz.z);
// Notes:
// The field of view is controlled by geom_view_dist's magnitude relative to
// the view vector's x and y components:
// view_vec.xy ranges from [-0.5, 0.5] * geom_aspect
// view_vec.z = -geom_view_dist
// But for the purposes of perspective divide, it should be considered:
// view_vec.xy ranges from [-0.5, 0.5] * geom_aspect / geom_view_dist
// view_vec.z = -1.0
const int max_centering_iters = 1; // Keep for easy testing.
for(int iter = 0; iter < max_centering_iters; iter++)
{
// 0.) Get the eyespace coordinates of our point cloud:
vec3 eyespace_coords[MAX_POINT_CLOUD_SIZE];
for(int i = 0; i < num_points; i++)
{
eyespace_coords[i] = global_coords[i] - eye_pos;
}
// 1a.)For each point, find out how far we can move eye_pos in each
// lateral direction without the point clipping the frustum.
// Eyespace +y = up, screenspace +y = down, so flip y after
// applying the eyespace offset (on the way to "clip space").
// Solve for two offsets per point based on:
// (eyespace_xyz.xy - offset_dr) * vec2(1.0, -1.0) *
// geom_view_dist / (geom_aspect * -eyespace_xyz.z) = vec2(-0.5)
// (eyespace_xyz.xy - offset_dr) * vec2(1.0, -1.0) *
// geom_view_dist / (geom_aspect * -eyespace_xyz.z) = vec2(0.5)
// offset_ul and offset_dr represent the farthest we can move the
// eye_pos up-left and down-right. Save the min of all offset_dr's
// and the max of all offset_ul's (since it's negative).
float abs_radius = abs(geom_radius); // In case anyone gets ideas. ;)
vec2 offset_dr_min = vec2(10.0 * abs_radius, 10.0 * abs_radius);
vec2 offset_ul_max = vec2(-10.0 * abs_radius, -10.0 * abs_radius);
for(int i = 0; i < num_points; i++)
{
const vec2 flipy = vec2(1.0, -1.0);
vec3 eyespace_xyz = eyespace_coords[i];
vec2 offset_dr = eyespace_xyz.xy - vec2(-0.5) *
(geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
vec2 offset_ul = eyespace_xyz.xy - vec2(0.5) *
(geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy);
offset_dr_min = min(offset_dr_min, offset_dr);
offset_ul_max = max(offset_ul_max, offset_ul);
}
// 1b.)Update eye_pos: Adding the average of offset_ul_max and
// offset_dr_min gives it equal leeway on the top vs. bottom
// and left vs. right. Recalculate eyespace_coords accordingly.
vec2 center_offset = 0.5 * (offset_ul_max + offset_dr_min);
eye_pos.xy += center_offset;
for(int i = 0; i < num_points; i++)
{
eyespace_coords[i] = global_coords[i] - eye_pos;
}
// 2a.)For each point, find out how far we can move eye_pos forward
// without the point clipping the frustum. Flip the y
// direction in advance (matters for a later step, not here).
// Solve for four offsets per point based on:
// eyespace_xyz_flipy.x * geom_view_dist /
// (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) =-0.5
// eyespace_xyz_flipy.y * geom_view_dist /
// (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) =-0.5
// eyespace_xyz_flipy.x * geom_view_dist /
// (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) = 0.5
// eyespace_xyz_flipy.y * geom_view_dist /
// (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) = 0.5
// We'll vectorize the actual computation. Take the maximum of
// these four for a single offset, and continue taking the max
// for every point (use max because offset.z is negative).
float offset_z_max = -10.0 * geom_radius * geom_view_dist;
for(int i = 0; i < num_points; i++)
{
vec3 eyespace_xyz_flipy = eyespace_coords[i] *
vec3(1.0, -1.0, 1.0);
vec4 offset_zzzz = eyespace_xyz_flipy.zzzz +
(eyespace_xyz_flipy.xyxy * geom_view_dist) /
(vec4(-0.5, -0.5, 0.5, 0.5) * vec4(geom_aspect, geom_aspect));
// Ignore offsets that push positive x/y values to opposite
// boundaries, and vice versa, and don't let the camera move
// past a point in the dead center of the screen:
offset_z_max = (eyespace_xyz_flipy.x < 0.0) ?
max(offset_z_max, offset_zzzz.x) : offset_z_max;
offset_z_max = (eyespace_xyz_flipy.y < 0.0) ?
max(offset_z_max, offset_zzzz.y) : offset_z_max;
offset_z_max = (eyespace_xyz_flipy.x > 0.0) ?
max(offset_z_max, offset_zzzz.z) : offset_z_max;
offset_z_max = (eyespace_xyz_flipy.y > 0.0) ?
max(offset_z_max, offset_zzzz.w) : offset_z_max;
offset_z_max = max(offset_z_max, eyespace_xyz_flipy.z);
}
// 2b.)Update eye_pos: Add the maximum (smallest negative) z offset.
eye_pos.z += offset_z_max;
}
return eye_pos;
}
vec3 get_ideal_global_eye_pos(const vec3x3 local_to_global,
const vec2 geom_aspect, const float geom_mode)
{
// Start with an initial eye_pos that includes the entire primitive
// (sphere or cylinder) in its field-of-view:
const vec3 high_view = vec3(0.0, geom_aspect.y, -geom_view_dist);
const vec3 low_view = high_view * vec3(1.0, -1.0, 1.0);
const float len_sq = dot(high_view, high_view);
const float fov = abs(acos(dot(high_view, low_view)/len_sq));
// Trigonometry/similar triangles say distance = geom_radius/sin(fov/2):
const float eye_z_spherical = geom_radius/sin(fov*0.5);
const vec3 eye_pos = geom_mode < 2.5 ?
vec3(0.0, 0.0, eye_z_spherical) :
vec3(0.0, 0.0, max(geom_view_dist, eye_z_spherical));
// Get global xyz coords of extreme sample points on the simulated CRT
// screen. Start with the center, edge centers, and corners of the
// video image. We can't ignore backfacing points: They're occluded
// by closer points on the primitive, but they may NOT be occluded by
// the convex hull of the remaining samples (i.e. the remaining convex
// hull might not envelope points that do occlude a back-facing point.)
const int num_points = MAX_POINT_CLOUD_SIZE;
vec3 global_coords[MAX_POINT_CLOUD_SIZE];
global_coords[0] = mul(local_to_global, uv_to_xyz(vec2(0.0, 0.0), geom_aspect, geom_mode));
global_coords[1] = mul(local_to_global, uv_to_xyz(vec2(0.0, -0.5), geom_aspect, geom_mode));
global_coords[2] = mul(local_to_global, uv_to_xyz(vec2(0.0, 0.5), geom_aspect, geom_mode));
global_coords[3] = mul(local_to_global, uv_to_xyz(vec2(-0.5, 0.0), geom_aspect, geom_mode));
global_coords[4] = mul(local_to_global, uv_to_xyz(vec2(0.5, 0.0), geom_aspect, geom_mode));
global_coords[5] = mul(local_to_global, uv_to_xyz(vec2(-0.5, -0.5), geom_aspect, geom_mode));
global_coords[6] = mul(local_to_global, uv_to_xyz(vec2(0.5, -0.5), geom_aspect, geom_mode));
global_coords[7] = mul(local_to_global, uv_to_xyz(vec2(-0.5, 0.5), geom_aspect, geom_mode));
global_coords[8] = mul(local_to_global, uv_to_xyz(vec2(0.5, 0.5), geom_aspect, geom_mode));
// Adding more inner image points could help in extreme cases, but too many
// points will kille the framerate. For safety, default to the initial
// eye_pos if any z coords are negative:
float num_negative_z_coords = 0.0;
for(int i = 0; i < num_points; i++)
{
num_negative_z_coords += float(global_coords[0].z < 0.0);
}
// Outsource the optimized eye_pos calculation:
return num_negative_z_coords > 0.5 ? eye_pos :
get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect,
global_coords, num_points);
}
vec3x3 get_pixel_to_object_matrix(const vec3x3 global_to_local,
const vec3 eye_pos_local, const vec3 view_vec_global,
const vec3 intersection_pos_local, const vec3 normal,
const vec2 output_size_inv)
{
// Requires: See get_curved_video_uv_coords_and_tangent_matrix for
// descriptions of each parameter.
// Returns: Return a transformation matrix from 2D pixel-space vectors
// (where (+1.0, +1.0) is a vector to one pixel down-right,
// i.e. same directionality as uv texels) to 3D object-space
// vectors in the CRT's local coordinate frame (right-handed)
// ***which are tangent to the CRT surface at the intersection
// position.*** (Basically, we want to convert pixel-space
// vectors to 3D vectors along the CRT's surface, for later
// conversion to uv vectors.)
// Shorthand inputs:
const vec3 pos = intersection_pos_local;
const vec3 eye_pos = eye_pos_local;
// Get a piecewise-linear matrix transforming from "pixelspace" offset
// vectors (1.0 = one pixel) to object space vectors in the tangent
// plane (faster than finding 3 view-object intersections).
// 1.) Get the local view vecs for the pixels to the right and down:
const vec3 view_vec_right_global = view_vec_global +
vec3(output_size_inv.x, 0.0, 0.0);
const vec3 view_vec_down_global = view_vec_global +
vec3(0.0, -output_size_inv.y, 0.0);
const vec3 view_vec_right_local =
mul(global_to_local, view_vec_right_global);
const vec3 view_vec_down_local =
mul(global_to_local, view_vec_down_global);
// 2.) Using the true intersection point, intersect the neighboring
// view vectors with the tangent plane:
const vec3 intersection_vec_dot_normal = dot(pos - eye_pos, normal);
const vec3 right_pos = eye_pos + (intersection_vec_dot_normal /
dot(view_vec_right_local, normal))*view_vec_right_local;
const vec3 down_pos = eye_pos + (intersection_vec_dot_normal /
dot(view_vec_down_local, normal))*view_vec_down_local;
// 3.) Subtract the original intersection pos from its neighbors; the
// resulting vectors are object-space vectors tangent to the plane.
// These vectors are the object-space transformations of (1.0, 0.0)
// and (0.0, 1.0) pixel offsets, so they form the first two basis
// vectors of a pixelspace to object space transformation. This
// transformation is 2D to 3D, so use (0, 0, 0) for the third vector.
const vec3 object_right_vec = right_pos - pos;
const vec3 object_down_vec = down_pos - pos;
const vec3x3 pixel_to_object = vec3x3(
object_right_vec.x, object_down_vec.x, 0.0,
object_right_vec.y, object_down_vec.y, 0.0,
object_right_vec.z, object_down_vec.z, 0.0);
return pixel_to_object;
}
vec3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local,
const vec3 normal, const vec2 geom_aspect, const float geom_mode)
{
// Requires: See get_curved_video_uv_coords_and_tangent_matrix for
// descriptions of each parameter.
// Returns: Return a transformation matrix from 3D object-space vectors
// in the CRT's local coordinate frame (right-handed, +y = up)
// to 2D video_uv vectors (+v = down).
// Description:
// The TBN matrix formed by the [tangent, bitangent, normal] basis
// vectors transforms ordinary vectors from tangent->object space.
// The cotangent matrix formed by the [cotangent, cobitangent, normal]
// basis vectors transforms normal vectors (covectors) from
// tangent->object space. It's the inverse-transpose of the TBN matrix.
// We want the inverse of the TBN matrix (transpose of the cotangent
// matrix), which transforms ordinary vectors from object->tangent space.
// Start by calculating the relevant basis vectors in accordance with
// Christian Schüler's blog post "Followup: Normal Mapping Without
// Precomputed Tangents": http://www.thetenthplanet.de/archives/1180
// With our particular uv mapping, the scale of the u and v directions
// is determined entirely by the aspect ratio for cylindrical and ordinary
// spherical mappings, and so tangent and bitangent lengths are also
// determined by it (the alternate mapping is more complex). Therefore, we
// must ensure appropriate cotangent and cobitangent lengths as well.
// Base these off the uv<=>xyz mappings for each primitive.
const vec3 pos = intersection_pos_local;
const vec3 x_vec = vec3(1.0, 0.0, 0.0);
const vec3 y_vec = vec3(0.0, 1.0, 0.0);
// The tangent and bitangent vectors correspond with increasing u and v,
// respectively. Mathematically we'd base the cotangent/cobitangent on
// those, but we'll compute the cotangent/cobitangent directly when we can.
vec3 cotangent_unscaled, cobitangent_unscaled;
// geom_mode should be constant-folded without RUNTIME_GEOMETRY_MODE.
if(geom_mode < 1.5)
{
// Sphere:
// tangent = normalize(cross(normal, cross(x_vec, pos))) * geom_aspect.x
// bitangent = normalize(cross(cross(y_vec, pos), normal)) * geom_aspect.y
// inv_determinant = 1.0/length(cross(bitangent, tangent))
// cotangent = cross(normal, bitangent) * inv_determinant
// == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant
// cobitangent = cross(tangent, normal) * inv_determinant
// == normalize(cross(x_vec, pos)) * geom_aspect.x * inv_determinant
// Simplified (scale by inv_determinant below):
cotangent_unscaled = normalize(cross(y_vec, pos)) * geom_aspect.y;
cobitangent_unscaled = normalize(cross(x_vec, pos)) * geom_aspect.x;
}
else if(geom_mode < 2.5)
{
// Sphere, alternate mapping:
// This mapping works a bit like the cylindrical mapping in two
// directions, which makes the lengths and directions more complex.
// Unfortunately, I can't find much of a shortcut:
const vec3 tangent = normalize(
cross(y_vec, vec3(pos.x, 0.0, pos.z))) * geom_aspect.x;
const vec3 bitangent = normalize(
cross(x_vec, vec3(0.0, pos.yz))) * geom_aspect.y;
cotangent_unscaled = cross(normal, bitangent);
cobitangent_unscaled = cross(tangent, normal);
}
else
{
// Cylinder:
// tangent = normalize(cross(y_vec, normal)) * geom_aspect.x;
// bitangent = vec3(0.0, -geom_aspect.y, 0.0);
// inv_determinant = 1.0/length(cross(bitangent, tangent))
// cotangent = cross(normal, bitangent) * inv_determinant
// == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant
// cobitangent = cross(tangent, normal) * inv_determinant
// == vec3(0.0, -geom_aspect.x, 0.0) * inv_determinant
cotangent_unscaled = cross(y_vec, normal) * geom_aspect.y;
cobitangent_unscaled = vec3(0.0, -geom_aspect.x, 0.0);
}
const vec3 computed_normal =
cross(cobitangent_unscaled, cotangent_unscaled);
const float inv_determinant = rsqrt(dot(computed_normal, computed_normal));
const vec3 cotangent = cotangent_unscaled * inv_determinant;
const vec3 cobitangent = cobitangent_unscaled * inv_determinant;
// The [cotangent, cobitangent, normal] column vecs form the cotangent
// frame, i.e. the inverse-transpose TBN matrix. Get its transpose:
const vec3x3 object_to_tangent = vec3x3(cotangent, cobitangent, normal);
return object_to_tangent;
}
vec2 get_curved_video_uv_coords_and_tangent_matrix(
const vec2 flat_video_uv, const vec3 eye_pos_local,
const vec2 output_size_inv, const vec2 geom_aspect,
const float geom_mode, const vec3x3 global_to_local,
out vec2x2 pixel_to_tangent_video_uv)
{
// Requires: Parameters:
// 1.) flat_video_uv coords are in range [0.0, 1.0], where
// (0.0, 0.0) is the top-left corner of the screen and
// (1.0, 1.0) is the bottom-right corner.
// 2.) eye_pos_local is the 3D camera position in the simulated
// CRT's local coordinate frame. For best results, it must
// be computed based on the same geom_view_dist used here.
// 3.) output_size_inv = vec2(1.0)/IN.output_size
// 4.) geom_aspect = get_aspect_vector(
// IN.output_size.x / IN.output_size.y);
// 5.) geom_mode is a static or runtime mode setting:
// 0 = off, 1 = sphere, 2 = sphere alt., 3 = cylinder
// 6.) global_to_local is a 3x3 matrix transforming (ordinary)
// worldspace vectors to the CRT's local coordinate frame
// Globals:
// 1.) geom_view_dist must be > 0.0. It controls the "near
// plane" used to interpret flat_video_uv as a view
// vector, which controls the field of view (FOV).
// Returns: Return final uv coords in [0.0, 1.0], and return a pixel-
// space to video_uv tangent-space matrix in the out parameter.
// (This matrix assumes pixel-space +y = down, like +v = down.)
// We'll transform flat_video_uv into a view vector, project
// the view vector from the camera/eye, intersect with a sphere
// or cylinder representing the simulated CRT, and convert the
// intersection position into final uv coords and a local
// transformation matrix.
// First get the 3D view vector (geom_aspect and geom_view_dist are globals):
// 1.) Center uv around (0.0, 0.0) and make (-0.5, -0.5) and (0.5, 0.5)
// correspond to the top-left/bottom-right output screen corners.
// 2.) Multiply by geom_aspect to preemptively "undo" Retroarch's screen-
// space 2D aspect correction. We'll reapply it in uv-space.
// 3.) (x, y) = (u, -v), because +v is down in 2D screenspace, but +y
// is up in 3D worldspace (enforce a right-handed system).
// 4.) The view vector z controls the "near plane" distance and FOV.
// For the effect of "looking through a window" at a CRT, it should be
// set equal to the user's distance from their physical screen, in
// units of the viewport's physical diagonal size.
const vec2 view_uv = (flat_video_uv - vec2(0.5)) * geom_aspect;
const vec3 view_vec_global =
vec3(view_uv.x, -view_uv.y, -geom_view_dist);
// Transform the view vector into the CRT's local coordinate frame, convert
// to video_uv coords, and get the local 3D intersection position:
const vec3 view_vec_local = mul(global_to_local, view_vec_global);
vec3 pos;
const vec2 centered_uv = view_vec_to_uv(
view_vec_local, eye_pos_local, geom_aspect, geom_mode, pos);
const vec2 video_uv = centered_uv + vec2(0.5);
// Get a pixel-to-tangent-video-uv matrix. The caller could deal with
// all but one of these cases, but that would be more complicated.
#ifdef DRIVERS_ALLOW_DERIVATIVES
// Derivatives obtain a matrix very fast, but the direction of pixel-
// space +y seems to depend on the pass. Enforce the correct direction
// on a best-effort basis (but it shouldn't matter for antialiasing).
const vec2 duv_dx = ddx(video_uv);
const vec2 duv_dy = ddy(video_uv);
#ifdef LAST_PASS
pixel_to_tangent_video_uv = vec2x2(
duv_dx.x, duv_dy.x,
-duv_dx.y, -duv_dy.y);
#else
pixel_to_tangent_video_uv = vec2x2(
duv_dx.x, duv_dy.x,
duv_dx.y, duv_dy.y);
#endif
#else
// Manually define a transformation matrix. We'll assume pixel-space
// +y = down, just like +v = down.
if(geom_force_correct_tangent_matrix)
{
// Get the surface normal based on the local intersection position:
const vec3 normal_base = geom_mode < 2.5 ? pos :
vec3(pos.x, 0.0, pos.z);
const vec3 normal = normalize(normal_base);
// Get pixel-to-object and object-to-tangent matrices and combine
// them into a 2x2 pixel-to-tangent matrix for video_uv offsets:
const vec3x3 pixel_to_object = get_pixel_to_object_matrix(
global_to_local, eye_pos_local, view_vec_global, pos, normal,
output_size_inv);
const vec3x3 object_to_tangent = get_object_to_tangent_matrix(
pos, normal, geom_aspect, geom_mode);
const vec3x3 pixel_to_tangent3x3 =
mul(object_to_tangent, pixel_to_object);
pixel_to_tangent_video_uv = vec2x2(
pixel_to_tangent3x3._m00_m01_m10_m11);
}
else
{
// Ignore curvature, and just consider flat scaling. The
// difference is only apparent with strong curvature:
pixel_to_tangent_video_uv = vec2x2(
output_size_inv.x, 0.0, 0.0, output_size_inv.y);
}
#endif
return video_uv;
}
float get_border_dim_factor(const vec2 video_uv, const vec2 geom_aspect)
{
// COPYRIGHT NOTE FOR THIS FUNCTION:
// Copyright (C) 2010-2012 cgwg, 2014 TroggleMonkey
// This function uses an algorithm first coded in several of cgwg's GPL-
// licensed lines in crt-geom-curved.cg and its ancestors. The line
// between algorithm and code is nearly indistinguishable here, so it's
// unclear whether I could even release this project under a non-GPL
// license with this function included.
// Calculate border_dim_factor from the proximity to uv-space image
// borders; geom_aspect/border_size/border/darkness/border_compress are globals:
const vec2 edge_dists = min(video_uv, vec2(1.0) - video_uv) *
geom_aspect;
const vec2 border_penetration =
max(vec2(border_size) - edge_dists, vec2(0.0));
const float penetration_ratio = length(border_penetration)/border_size;
const float border_escape_ratio = max(1.0 - penetration_ratio, 0.0);
const float border_dim_factor =
pow(border_escape_ratio, border_darkness) * max(1.0, border_compress);
return min(border_dim_factor, 1.0);
}
#endif // GEOMETRY_FUNCTIONS_H

View file

@ -0,0 +1,677 @@
#ifndef PHOSPHOR_MASK_RESIZING_H
#define PHOSPHOR_MASK_RESIZING_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../user-settings.h"
#include "derived-settings-and-constants.h"
///////////////////////////// CODEPATH SELECTION /////////////////////////////
// Choose a looping strategy based on what's allowed:
// Dynamic loops not allowed: Use a flat static loop.
// Dynamic loops accomodated: Coarsely branch around static loops.
// Dynamic loops assumed allowed: Use a flat dynamic loop.
#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
#ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
#define BREAK_LOOPS_INTO_PIECES
#else
#define USE_SINGLE_STATIC_LOOP
#endif
#endif // No else needed: Dynamic loops assumed.
////////////////////////////////// CONSTANTS /////////////////////////////////
// The larger the resized tile, the fewer samples we'll need for downsizing.
// See if we can get a static min tile size > mask_min_allowed_tile_size:
const float mask_min_allowed_tile_size = ceil(
mask_min_allowed_triad_size * mask_triads_per_tile);
const float mask_min_expected_tile_size =
mask_min_allowed_tile_size;
// Limit the number of sinc resize taps by the maximum minification factor:
const float pi_over_lobes = pi/mask_sinc_lobes;
const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
mask_resize_src_lut_size.x/mask_min_expected_tile_size;
// Vectorized loops sample in multiples of 4. Round up to be safe:
const float max_sinc_resize_samples_m4 = ceil(
max_sinc_resize_samples_float * 0.25) * 4.0;
///////////////////////// RESAMPLING FUNCTION HELPERS ////////////////////////
inline float get_dynamic_loop_size(const float magnification_scale)
{
// Requires: The following global constants must be defined:
// 1.) mask_sinc_lobes
// 2.) max_sinc_resize_samples_m4
// Returns: The minimum number of texture samples for a correct downsize
// at magnification_scale.
// We're downsizing, so the filter is sized across 2*lobes output pixels
// (not 2*lobes input texels). This impacts distance measurements and the
// minimum number of input samples needed.
const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale;
const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0;
#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
const float max_samples_m4 = max_sinc_resize_samples_m4;
#else // ifdef BREAK_LOOPS_INTO_PIECES
// Simulating loops with branches imposes a 128-sample limit.
const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4);
#endif
return min(min_samples_m4, max_samples_m4);
}
vec2 get_first_texel_tile_uv_and_dist(const vec2 tex_uv,
const vec2 texture_size, const float dr,
const float input_tiles_per_texture_r, const float samples,
const bool vertical)
{
// Requires: 1.) dr == du == 1.0/texture_size.x or
// dr == dv == 1.0/texture_size.y
// (whichever direction we're resampling in).
// It's a scalar to save register space.
// 2.) input_tiles_per_texture_r is the number of input tiles
// that can fit in the input texture in the direction we're
// resampling this pass.
// 3.) vertical indicates whether we're resampling vertically
// this pass (or horizontally).
// Returns: Pack and return the first sample's tile_uv coord in [0, 1]
// and its texel distance from the destination pixel, in the
// resized dimension only.
// We'll start with the topmost or leftmost sample and work down or right,
// so get the first sample location and distance. Modify both dimensions
// as if we're doing a one-pass 2D resize; we'll throw away the unneeded
// (and incorrect) dimension at the end.
const vec2 curr_texel = tex_uv * texture_size;
const vec2 prev_texel =
floor(curr_texel - vec2(under_half)) + vec2(0.5);
const vec2 first_texel = prev_texel - vec2(samples/2.0 - 1.0);
const vec2 first_texel_uv_wrap_2D = first_texel * dr;
const vec2 first_texel_dist_2D = curr_texel - first_texel;
// Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
const vec2 first_texel_tile_uv_wrap_2D =
first_texel_uv_wrap_2D * input_tiles_per_texture_r;
// Project wrapped coordinates to the [0, 1] range. We'll do this with all
// samples,but the first texel is special, since it might be negative.
const vec2 coord_negative =
vec2(first_texel_tile_uv_wrap_2D < vec2(0.0));
const vec2 first_texel_tile_uv_2D =
frac(first_texel_tile_uv_wrap_2D) + coord_negative;
// Pack the first texel's tile_uv coord and texel distance in 1D:
const vec2 tile_u_and_dist =
vec2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
const vec2 tile_v_and_dist =
vec2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
return vertical ? tile_v_and_dist : tile_u_and_dist;
//return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
}
inline vec4 tex2Dlod0try(const sampler2D tex, const vec2 tex_uv)
{
// Mipmapping and anisotropic filtering get confused by sinc-resampling.
// One [slow] workaround is to select the lowest mip level:
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
return tex2Dlod(tex, vec4(tex_uv, 0.0, 0.0));
#else
#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
return tex2Dbias(tex, vec4(tex_uv, 0.0, -16.0));
#else
return texture(tex, tex_uv);
#endif
#endif
}
////////////////////////////// LOOP BODY MACROS //////////////////////////////
// Using inline functions can exceed the temporary register limit, so we're
// stuck with #define macros (I'm TRULY sorry). They're declared here instead
// of above to be closer to the actual invocation sites. Steps:
// 1.) Get the exact texel location.
// 2.) Sample the phosphor mask (already assumed encoded in linear RGB).
// 3.) Get the distance from the current pixel and sinc weight:
// sinc(dist) = sin(pi * dist)/(pi * dist)
// We can also use the slower/smoother Lanczos instead:
// L(x) = sinc(dist) * sinc(dist / lobes)
// 4.) Accumulate the weight sum in weights, and accumulate the weighted texels
// in pixel_color (we'll normalize outside the loop at the end).
// We vectorize the loop to help reduce the Lanczos window's cost.
// The r coord is the coord in the dimension we're resizing along (u or v),
// and first_texel_tile_uv_rrrr is a vec4 of the first texel's u or v
// tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord
// for four new texel samples.
#define CALCULATE_R_COORD_FOR_4_SAMPLES \
const vec4 true_i = vec4(i_base + i) + vec4(0.0, 1.0, 2.0, 3.0); \
const vec4 tile_uv_r = frac( \
first_texel_tile_uv_rrrr + true_i * tile_dr); \
const vec4 tex_uv_r = tile_uv_r * tile_size_uv_r;
#ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
#define CALCULATE_SINC_RESAMPLE_WEIGHTS \
const vec4 pi_dist_over_lobes = pi_over_lobes * dist; \
const vec4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
(pi_dist*pi_dist_over_lobes), vec4(1.0));
#else
#define CALCULATE_SINC_RESAMPLE_WEIGHTS \
const vec4 weights = min(sin(pi_dist)/pi_dist, vec4(1.0));
#endif
#define UPDATE_COLOR_AND_WEIGHT_SUMS \
const vec4 dist = magnification_scale * \
abs(first_dist_unscaled - true_i); \
const vec4 pi_dist = pi * dist; \
CALCULATE_SINC_RESAMPLE_WEIGHTS; \
pixel_color += new_sample0 * weights.xxx; \
pixel_color += new_sample1 * weights.yyy; \
pixel_color += new_sample2 * weights.zzz; \
pixel_color += new_sample3 * weights.www; \
weight_sum += weights;
#define VERTICAL_SINC_RESAMPLE_LOOP_BODY \
CALCULATE_R_COORD_FOR_4_SAMPLES; \
const vec3 new_sample0 = tex2Dlod0try(texture, \
vec2(tex_uv.x, tex_uv_r.x)).rgb; \
const vec3 new_sample1 = tex2Dlod0try(texture, \
vec2(tex_uv.x, tex_uv_r.y)).rgb; \
const vec3 new_sample2 = tex2Dlod0try(texture, \
vec2(tex_uv.x, tex_uv_r.z)).rgb; \
const vec3 new_sample3 = tex2Dlod0try(texture, \
vec2(tex_uv.x, tex_uv_r.w)).rgb; \
UPDATE_COLOR_AND_WEIGHT_SUMS;
#define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \
CALCULATE_R_COORD_FOR_4_SAMPLES; \
const vec3 new_sample0 = tex2Dlod0try(texture, \
vec2(tex_uv_r.x, tex_uv.y)).rgb; \
const vec3 new_sample1 = tex2Dlod0try(texture, \
vec2(tex_uv_r.y, tex_uv.y)).rgb; \
const vec3 new_sample2 = tex2Dlod0try(texture, \
vec2(tex_uv_r.z, tex_uv.y)).rgb; \
const vec3 new_sample3 = tex2Dlod0try(texture, \
vec2(tex_uv_r.w, tex_uv.y)).rgb; \
UPDATE_COLOR_AND_WEIGHT_SUMS;
//////////////////////////// RESAMPLING FUNCTIONS ////////////////////////////
vec3 downsample_vertical_sinc_tiled(const sampler2D texture,
const vec2 tex_uv, const vec2 texture_size, const float dr,
const float magnification_scale, const float tile_size_uv_r)
{
// Requires: 1.) dr == du == 1.0/texture_size.x or
// dr == dv == 1.0/texture_size.y
// (whichever direction we're resampling in).
// It's a scalar to save register space.
// 2.) tile_size_uv_r is the number of texels an input tile
// takes up in the input texture, in the direction we're
// resampling this pass.
// 3.) magnification_scale must be <= 1.0.
// Returns: Return a [Lanczos] sinc-resampled pixel of a vertically
// downsized input tile embedded in an input texture. (The
// vertical version is special-cased though: It assumes the
// tile size equals the [static] texture size, since it's used
// on an LUT texture input containing one tile. For more
// generic use, eliminate the "static" in the parameters.)
// The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
// we're resizing along, e.g. "dy" in this case.
#ifdef USE_SINGLE_STATIC_LOOP
// A static loop can be faster, but it might blur too much from using
// more samples than it should.
const int samples = int(max_sinc_resize_samples_m4);
#else
const int samples = int(get_dynamic_loop_size(magnification_scale));
#endif
// Get the first sample location (scalar tile uv coord along the resized
// dimension) and distance from the output location (in texels):
const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
// true = vertical resize:
const vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, true);
const vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
const vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
// Get the tile sample offset:
const float tile_dr = dr * input_tiles_per_texture_r;
// Sum up each weight and weighted sample color, varying the looping
// strategy based on our expected dynamic loop capabilities. See the
// loop body macros above.
int i_base = 0;
vec4 weight_sum = vec4(0.0);
vec3 pixel_color = vec3(0.0);
const int i_step = 4;
#ifdef BREAK_LOOPS_INTO_PIECES
if(samples - i_base >= 64)
{
for(int i = 0; i < 64; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 64;
}
if(samples - i_base >= 32)
{
for(int i = 0; i < 32; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 32;
}
if(samples - i_base >= 16)
{
for(int i = 0; i < 16; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 16;
}
if(samples - i_base >= 8)
{
for(int i = 0; i < 8; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 8;
}
if(samples - i_base >= 4)
{
for(int i = 0; i < 4; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 4;
}
// Do another 4-sample block for a total of 128 max samples.
if(samples - i_base > 0)
{
for(int i = 0; i < 4; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
}
#else
for(int i = 0; i < samples; i += i_step)
{
VERTICAL_SINC_RESAMPLE_LOOP_BODY;
}
#endif
// Normalize so the weight_sum == 1.0, and return:
const vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
const vec3 scalar_weight_sum = vec3(weight_sum_reduce.x +
weight_sum_reduce.y);
return (pixel_color/scalar_weight_sum);
}
vec3 downsample_horizontal_sinc_tiled(const sampler2D texture,
const vec2 tex_uv, const vec2 texture_size, const float dr,
const float magnification_scale, const float tile_size_uv_r)
{
// Differences from downsample_horizontal_sinc_tiled:
// 1.) The dr and tile_size_uv_r parameters are not static consts.
// 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
// set to false instead of true.
// 3.) The horizontal version of the loop body is used.
// TODO: If we can get guaranteed compile-time dead code elimination,
// we can combine the vertical/horizontal downsampling functions by:
// 1.) Add an extra static const bool parameter called "vertical."
// 2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
// 3.) Use a conditional assignment in the loop body macro. This is the
// tricky part: We DO NOT want to incur the extra conditional
// assignment in the inner loop at runtime!
// The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
// we're resizing along, e.g. "dx" in this case.
#ifdef USE_SINGLE_STATIC_LOOP
// If we have to load all samples, we might as well use them.
const int samples = int(max_sinc_resize_samples_m4);
#else
const int samples = int(get_dynamic_loop_size(magnification_scale));
#endif
// Get the first sample location (scalar tile uv coord along resized
// dimension) and distance from the output location (in texels):
const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
// false = horizontal resize:
const vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, false);
const vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
const vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
// Get the tile sample offset:
const float tile_dr = dr * input_tiles_per_texture_r;
// Sum up each weight and weighted sample color, varying the looping
// strategy based on our expected dynamic loop capabilities. See the
// loop body macros above.
int i_base = 0;
vec4 weight_sum = vec4(0.0);
vec3 pixel_color = vec3(0.0);
const int i_step = 4;
#ifdef BREAK_LOOPS_INTO_PIECES
if(samples - i_base >= 64)
{
for(int i = 0; i < 64; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 64;
}
if(samples - i_base >= 32)
{
for(int i = 0; i < 32; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 32;
}
if(samples - i_base >= 16)
{
for(int i = 0; i < 16; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 16;
}
if(samples - i_base >= 8)
{
for(int i = 0; i < 8; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 8;
}
if(samples - i_base >= 4)
{
for(int i = 0; i < 4; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
i_base += 4;
}
// Do another 4-sample block for a total of 128 max samples.
if(samples - i_base > 0)
{
for(int i = 0; i < 4; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
}
#else
for(int i = 0; i < samples; i += i_step)
{
HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
}
#endif
// Normalize so the weight_sum == 1.0, and return:
const vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
const vec3 scalar_weight_sum = vec3(weight_sum_reduce.x +
weight_sum_reduce.y);
return (pixel_color/scalar_weight_sum);
}
//////////////////////////// TILE SIZE CALCULATION ///////////////////////////
vec2 get_resized_mask_tile_size(const vec2 estimated_viewport_size,
const vec2 estimated_mask_resize_output_size,
const bool solemnly_swear_same_inputs_for_every_pass)
{
// Requires: The following global constants must be defined according to
// certain constraints:
// 1.) mask_resize_num_triads: Must be high enough that our
// mask sampling method won't have artifacts later
// (long story; see derived-settings-and-constants.h)
// 2.) mask_resize_src_lut_size: Texel size of our mask LUT
// 3.) mask_triads_per_tile: Num horizontal triads in our LUT
// 4.) mask_min_allowed_triad_size: User setting (the more
// restrictive it is, the faster the resize will go)
// 5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
// 6.) mask_triad_size_desired_{runtime, static}
// 7.) mask_num_triads_desired_{runtime, static}
// 8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
// The function parameters must be defined as follows:
// 1.) estimated_viewport_size == (final viewport size);
// If mask_specify_num_triads is 1.0/true and the viewport
// estimate is wrong, the number of triads will differ from
// the user's preference by about the same factor.
// 2.) estimated_mask_resize_output_size: Must equal the
// output size of the MASK_RESIZE pass.
// Exception: The x component may be estimated garbage if
// and only if the caller throws away the x result.
// 3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
// unless you can guarantee that every call across every
// pass will use the same sizes for the other parameters.
// When calling this across multiple passes, always use the
// same y viewport size/scale, and always use the same x
// viewport size/scale when using the x result.
// Returns: Return the final size of a manually resized mask tile, after
// constraining the desired size to avoid artifacts. Under
// unusual circumstances, tiles may become stretched vertically
// (see wall of text below).
// Stated tile properties must be correct:
const float tile_aspect_ratio_inv =
mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
const vec2 tile_aspect = vec2(1.0, tile_aspect_ratio_inv);
// If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
// wrong, the user preference will be misinterpreted:
const float desired_tile_size_x = mask_triads_per_tile * lerp(
mask_triad_size_desired,
estimated_viewport_size.x / mask_num_triads_desired,
mask_specify_num_triads);
if(get_mask_sample_mode() > 0.5)
{
// We don't need constraints unless we're sampling MASK_RESIZE.
return desired_tile_size_x * tile_aspect;
}
// Make sure we're not upsizing:
const float temp_tile_size_x =
min(desired_tile_size_x, mask_resize_src_lut_size.x);
// Enforce min_tile_size and max_tile_size in both dimensions:
const vec2 temp_tile_size = temp_tile_size_x * tile_aspect;
const vec2 min_tile_size =
mask_min_allowed_tile_size * tile_aspect;
const vec2 max_tile_size =
estimated_mask_resize_output_size / mask_resize_num_tiles;
const vec2 clamped_tile_size =
clamp(temp_tile_size, min_tile_size, max_tile_size);
// Try to maintain tile_aspect_ratio. This is the tricky part:
// If we're currently resizing in the y dimension, the x components
// could be MEANINGLESS. (If estimated_mask_resize_output_size.x is
// bogus, then so is max_tile_size.x and clamped_tile_size.x.)
// We can't adjust the y size based on clamped_tile_size.x. If it
// clamps when it shouldn't, it won't clamp again when later passes
// call this function with the correct sizes, and the discrepancy will
// break the sampling coords in MASKED_SCANLINES. Instead, we'll limit
// the x size based on the y size, but not vice versa, unless the
// caller swears the parameters were the same (correct) in every pass.
// As a result, triads could appear vertically stretched if:
// a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
// LUT's might clamp x more than y (all provided LUT's are square)
// b.) true_viewport_size.x < true_viewport_size.y: The user is playing
// with a vertically oriented screen (not accounted for anyway)
// c.) mask_resize_viewport_scale.x < masked_resize_viewport_scale.y:
// Viewport scales are equal by default.
// If any of these are the case, you can fix the stretching by setting:
// mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
// (1.0 / min_expected_aspect_ratio) *
// (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
const float x_tile_size_from_y =
clamped_tile_size.y * tile_aspect_ratio;
const float y_tile_size_from_x = lerp(clamped_tile_size.y,
clamped_tile_size.x * tile_aspect_ratio_inv,
float(solemnly_swear_same_inputs_for_every_pass));
const vec2 reclamped_tile_size = vec2(
min(clamped_tile_size.x, x_tile_size_from_y),
min(clamped_tile_size.y, y_tile_size_from_x));
// We need integer tile sizes in both directions for tiled sampling to
// work correctly. Use floor (to make sure we don't round up), but be
// careful to avoid a rounding bug where floor decreases whole numbers:
const vec2 final_resized_tile_size =
floor(reclamped_tile_size + vec2(FIX_ZERO(0.0)));
return final_resized_tile_size;
}
///////////////////////// FINAL MASK SAMPLING HELPERS ////////////////////////
vec4 get_mask_sampling_parameters(const vec2 mask_resize_texture_size,
const vec2 mask_resize_video_size, const vec2 true_viewport_size,
out vec2 mask_tiles_per_screen)
{
// Requires: 1.) Requirements of get_resized_mask_tile_size() must be
// met, particularly regarding global constants.
// The function parameters must be defined as follows:
// 1.) mask_resize_texture_size == MASK_RESIZE.texture_size
// if get_mask_sample_mode() is 0 (otherwise anything)
// 2.) mask_resize_video_size == MASK_RESIZE.video_size
// if get_mask_sample_mode() is 0 (otherwise anything)
// 3.) true_viewport_size == IN.output_size for a pass set to
// 1.0 viewport scale (i.e. it must be correct)
// Returns: Return a vec4 containing:
// xy: tex_uv coords for the start of the mask tile
// zw: tex_uv size of the mask tile from start to end
// mask_tiles_per_screen is an out parameter containing the
// number of mask tiles that will fit on the screen.
// First get the final resized tile size. The viewport size and mask
// resize viewport scale must be correct, but don't solemnly swear they
// were correct in both mask resize passes unless you know it's true.
// (We can better ensure a correct tile aspect ratio if the parameters are
// guaranteed correct in all passes...but if we lie, we'll get inconsistent
// sizes across passes, resulting in broken texture coordinates.)
const float mask_sample_mode = get_mask_sample_mode();
const vec2 mask_resize_tile_size = get_resized_mask_tile_size(
true_viewport_size, mask_resize_video_size, false);
if(mask_sample_mode < 0.5)
{
// Sample MASK_RESIZE: The resized tile is a fraction of the texture
// size and starts at a nonzero offset to allow for border texels:
const vec2 mask_tile_uv_size = mask_resize_tile_size /
mask_resize_texture_size;
const vec2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
const vec2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
// mask_tiles_per_screen must be based on the *true* viewport size:
mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
return vec4(mask_tile_start_uv, mask_tile_uv_size);
}
else
{
// If we're tiling at the original size (1:1 pixel:texel), redefine a
// "tile" to be the full texture containing many triads. Otherwise,
// we're hardware-resampling an LUT, and the texture truly contains a
// single unresized phosphor mask tile anyway.
const vec2 mask_tile_uv_size = vec2(1.0);
const vec2 mask_tile_start_uv = vec2(0.0);
if(mask_sample_mode > 1.5)
{
// Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
}
else
{
// Hardware-resize the original LUT:
mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
}
return vec4(mask_tile_start_uv, mask_tile_uv_size);
}
}
vec2 fix_tiling_discontinuities_normalized(const vec2 tile_uv,
vec2 duv_dx, vec2 duv_dy)
{
// Requires: 1.) duv_dx == ddx(tile_uv)
// 2.) duv_dy == ddy(tile_uv)
// 3.) tile_uv contains tile-relative uv coords in [0, 1],
// such that (0.5, 0.5) is the center of a tile, etc.
// ("Tile" can mean texture, the video embedded in the
// texture, or some other "tile" embedded in a texture.)
// Returns: Return new tile_uv coords that contain no discontinuities
// across a 2x2 pixel quad.
// Description:
// When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
// derivatives, which we assume happened if the absolute difference between
// any fragment in a 2x2 block is > ~half a tile. If the current block has
// a u or v discontinuity and the current fragment is in the first half of
// the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
// to that coord to make the 2x2 block continuous. (It will now have a
// coord > 1.0 in the padding area beyond the tile.) This function takes
// derivatives as parameters so the caller can reuse them.
// In case we're using high-quality (nVidia-style) derivatives, ensure
// diagonically opposite fragments see each other for correctness:
duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
const vec2 pixel_in_first_half_tile = vec2(tile_uv < vec2(0.5));
const vec2 jump_exists = vec2(duv_dx + duv_dy > vec2(0.5));
return tile_uv + jump_exists * pixel_in_first_half_tile;
}
vec2 convert_phosphor_tile_uv_wrap_to_tex_uv(const vec2 tile_uv_wrap,
const vec4 mask_tile_start_uv_and_size)
{
// Requires: 1.) tile_uv_wrap contains tile-relative uv coords, where the
// tile spans from [0, 1], such that (0.5, 0.5) is at the
// tile center. The input coords can range from [0, inf],
// and their fractional parts map to a repeated tile.
// ("Tile" can mean texture, the video embedded in the
// texture, or some other "tile" embedded in a texture.)
// 2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
// for the start of the embedded tile in the full texture.
// 3.) mask_tile_start_uv_and_size.zw contains the [fractional]
// tex_uv size of the embedded tile in the full texture.
// Returns: Return tex_uv coords (used for texture sampling)
// corresponding to tile_uv_wrap.
if(get_mask_sample_mode() < 0.5)
{
// Manually repeat the resized mask tile to fill the screen:
// First get fractional tile_uv coords. Using frac/fmod on coords
// confuses anisotropic filtering; fix it as user options dictate.
// derived-settings-and-constants.h disables incompatible options.
#ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
vec2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;
#else
vec2 tile_uv = frac(tile_uv_wrap);
#endif
#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
const vec2 tile_uv_dx = ddx(tile_uv);
const vec2 tile_uv_dy = ddy(tile_uv);
tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
tile_uv_dx, tile_uv_dy);
#endif
// The tile is embedded in a padded FBO, and it may start at a
// nonzero offset if border texels are used to avoid artifacts:
const vec2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
tile_uv * mask_tile_start_uv_and_size.zw;
return mask_tex_uv;
}
else
{
// Sample from the input phosphor mask texture with hardware tiling.
// If we're tiling at the original size (mode 2), the "tile" is the
// whole texture, and it contains a large number of triads mapped with
// a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single
// unresized tile. tile_uv_wrap already has correct coords for both!
return tile_uv_wrap;
}
}
#endif // PHOSPHOR_MASK_RESIZING_H

View file

@ -0,0 +1,572 @@
#ifndef SCANLINE_FUNCTIONS_H
#define SCANLINE_FUNCTIONS_H
///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
// crt-royale: A full-featured CRT shader, with cheese.
// Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation; either version 2 of the License, or any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
// more details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
////////////////////////////////// INCLUDES //////////////////////////////////
#include "../user-settings.h"
#include "derived-settings-and-constants.h"
#include "../../../../include/special-functions.h"
#include "../../../../include/gamma-management.h"
///////////////////////////// SCANLINE FUNCTIONS /////////////////////////////
/*
inline float3 get_gaussian_sigma(const float3 color, const float sigma_range)
{
// Requires: Globals:
// 1.) beam_min_sigma and beam_max_sigma are global floats
// containing the desired minimum and maximum beam standard
// deviations, for dim and bright colors respectively.
// 2.) beam_max_sigma must be > 0.0
// 3.) beam_min_sigma must be in (0.0, beam_max_sigma]
// 4.) beam_spot_power must be defined as a global float.
// Parameters:
// 1.) color is the underlying source color along a scanline
// 2.) sigma_range = beam_max_sigma - beam_min_sigma; we take
// sigma_range as a parameter to avoid repeated computation
// when beam_{min, max}_sigma are runtime shader parameters
// Optional: Users may set beam_spot_shape_function to 1 to define the
// inner f(color) subfunction (see below) as:
// f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0))
// Otherwise (technically, if beam_spot_shape_function < 0.5):
// f(color) = pow(color, beam_spot_power)
// Returns: The standard deviation of the Gaussian beam for "color:"
// sigma = beam_min_sigma + sigma_range * f(color)
// Details/Discussion:
// The beam's spot shape vaguely resembles an aspect-corrected f() in the
// range [0, 1] (not quite, but it's related). f(color) = color makes
// spots look like diamonds, and a spherical function or cube balances
// between variable width and a soft/realistic shape. A beam_spot_power
// > 1.0 can produce an ugly spot shape and more initial clipping, but the
// final shape also differs based on the horizontal resampling filter and
// the phosphor bloom. For instance, resampling horizontally in nonlinear
// light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot
// shape, but a sixth root is still quite soft. A power function (default
// 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve
// has the highest variability without an awful spot shape.
//
// beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its
// difference from beam_max_sigma affects beam width variability. It only
// affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is
// a conservative estimate for a more complex constraint).
//
// beam_max_sigma affects clipping and increasing scanline width/softness
// as color increases. The wider this is, the more scanlines need to be
// evaluated to avoid distortion. For a pure Gaussian, the max_beam_sigma
// at which the first unused scanline always has a weight < 1.0/255.0 is:
// num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34
// num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52
// num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70
// num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89
// num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08
// Generalized Gaussians permit more leeway here as steepness increases.
if(beam_spot_shape_function < 0.5)
{
// Use a power function:
return float3(beam_min_sigma) + sigma_range *
pow(color, beam_spot_power);
}
else
{
// Use a spherical function:
const float3 color_minus_1 = color - float3(1.0);
return float3(beam_min_sigma) + sigma_range *
sqrt(float3(1.0) - color_minus_1*color_minus_1);
}
}
inline float3 get_generalized_gaussian_beta(const float3 color,
const float shape_range)
{
// Requires: Globals:
// 1.) beam_min_shape and beam_max_shape are global floats
// containing the desired min/max generalized Gaussian
// beta parameters, for dim and bright colors respectively.
// 2.) beam_max_shape must be >= 2.0
// 3.) beam_min_shape must be in [2.0, beam_max_shape]
// 4.) beam_shape_power must be defined as a global float.
// Parameters:
// 1.) color is the underlying source color along a scanline
// 2.) shape_range = beam_max_shape - beam_min_shape; we take
// shape_range as a parameter to avoid repeated computation
// when beam_{min, max}_shape are runtime shader parameters
// Returns: The type-I generalized Gaussian "shape" parameter beta for
// the given color.
// Details/Discussion:
// Beta affects the scanline distribution as follows:
// a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope
// b.) beta == 2.0 just degenerates to a Gaussian
// c.) beta > 2.0 flattens and widens the peak, then drops off more steeply
// than a Gaussian. Whereas high sigmas widen and soften peaks, high
// beta widen and sharpen peaks at the risk of aliasing.
// Unlike high beam_spot_powers, high beam_shape_powers actually soften shape
// transitions, whereas lower ones sharpen them (at the risk of aliasing).
return beam_min_shape + shape_range * pow(color, beam_shape_power);
}
float3 scanline_gaussian_integral_contrib(const float3 dist,
const float3 color, const float pixel_height, const float sigma_range)
{
// Requires: 1.) dist is the distance of the [potentially separate R/G/B]
// point(s) from a scanline in units of scanlines, where
// 1.0 means the sample point straddles the next scanline.
// 2.) color is the underlying source color along a scanline.
// 3.) pixel_height is the output pixel height in scanlines.
// 4.) Requirements of get_gaussian_sigma() must be met.
// Returns: Return a scanline's light output over a given pixel.
// Details:
// The CRT beam profile follows a roughly Gaussian distribution which is
// wider for bright colors than dark ones. The integral over the full
// range of a Gaussian function is always 1.0, so we can vary the beam
// with a standard deviation without affecting brightness. 'x' = distance:
// gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
// gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2))))
// Use a numerical approximation of the "error function" (the Gaussian
// indefinite integral) to find the definite integral of the scanline's
// average brightness over a given pixel area. Even if curved coords were
// used in this pass, a flat scalar pixel height works almost as well as a
// pixel height computed from a full pixel-space to scanline-space matrix.
const float3 sigma = get_gaussian_sigma(color, sigma_range);
const float3 ph_offset = float3(pixel_height * 0.5);
const float3 denom_inv = 1.0/(sigma*sqrt(2.0));
const float3 integral_high = erf((dist + ph_offset)*denom_inv);
const float3 integral_low = erf((dist - ph_offset)*denom_inv);
return color * 0.5*(integral_high - integral_low)/pixel_height;
}
float3 scanline_generalized_gaussian_integral_contrib(const float3 dist,
const float3 color, const float pixel_height, const float sigma_range,
const float shape_range)
{
// Requires: 1.) Requirements of scanline_gaussian_integral_contrib()
// must be met.
// 2.) Requirements of get_gaussian_sigma() must be met.
// 3.) Requirements of get_generalized_gaussian_beta() must be
// met.
// Returns: Return a scanline's light output over a given pixel.
// A generalized Gaussian distribution allows the shape (beta) to vary
// as well as the width (alpha). "gamma" refers to the gamma function:
// generalized sample =
// beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
// ligamma(s, z) is the lower incomplete gamma function, for which we only
// implement two of four branches (because we keep 1/beta <= 0.5):
// generalized integral = 0.5 + 0.5* sign(x) *
// ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta)
// See get_generalized_gaussian_beta() for a discussion of beta.
// We base alpha on the intended Gaussian sigma, but it only strictly
// models models standard deviation at beta == 2, because the standard
// deviation depends on both alpha and beta (keeping alpha independent is
// faster and preserves intuitive behavior and a full spectrum of results).
const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
const float3 beta = get_generalized_gaussian_beta(color, shape_range);
const float3 alpha_inv = float3(1.0)/alpha;
const float3 s = float3(1.0)/beta;
const float3 ph_offset = float3(pixel_height * 0.5);
// Pass beta to gamma_impl to avoid repeated divides. Similarly pass
// beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl.
const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta);
const float3 dist1 = dist + ph_offset;
const float3 dist0 = dist - ph_offset;
const float3 integral_high = sign(dist1) * normalized_ligamma_impl(
s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv);
const float3 integral_low = sign(dist0) * normalized_ligamma_impl(
s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv);
return color * 0.5*(integral_high - integral_low)/pixel_height;
}
float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color,
const float pixel_height, const float sigma_range)
{
// See scanline_gaussian integral_contrib() for detailed comments!
// gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2))
const float3 sigma = get_gaussian_sigma(color, sigma_range);
// Avoid repeated divides:
const float3 sigma_inv = float3(1.0)/sigma;
const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv;
const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi);
if(beam_antialias_level > 0.5)
{
// Sample 1/3 pixel away in each direction as well:
const float3 sample_offset = float3(pixel_height/3.0);
const float3 dist2 = dist + sample_offset;
const float3 dist3 = abs(dist - sample_offset);
// Average three pure Gaussian samples:
const float3 scale = color/3.0 * outer_denom_inv;
const float3 weight1 = exp(-(dist*dist)*inner_denom_inv);
const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv);
const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv);
return scale * (weight1 + weight2 + weight3);
}
else
{
return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv;
}
}
float3 scanline_generalized_gaussian_sampled_contrib(const float3 dist,
const float3 color, const float pixel_height, const float sigma_range,
const float shape_range)
{
// See scanline_generalized_gaussian_integral_contrib() for details!
// generalized sample =
// beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta)
const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range);
const float3 beta = get_generalized_gaussian_beta(color, shape_range);
// Avoid repeated divides:
const float3 alpha_inv = float3(1.0)/alpha;
const float3 beta_inv = float3(1.0)/beta;
const float3 scale = color * beta * 0.5 * alpha_inv /
gamma_impl(beta_inv, beta);
if(beam_antialias_level > 0.5)
{
// Sample 1/3 pixel closer to and farther from the scanline too.
const float3 sample_offset = float3(pixel_height/3.0);
const float3 dist2 = dist + sample_offset;
const float3 dist3 = abs(dist - sample_offset);
// Average three generalized Gaussian samples:
const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta));
const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta));
const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta));
return scale/3.0 * (weight1 + weight2 + weight3);
}
else
{
return scale * exp(-pow(abs(dist*alpha_inv), beta));
}
}
inline float3 scanline_contrib(float3 dist, float3 color,
float pixel_height, const float sigma_range, const float shape_range)
{
// Requires: 1.) Requirements of scanline_gaussian_integral_contrib()
// must be met.
// 2.) Requirements of get_gaussian_sigma() must be met.
// 3.) Requirements of get_generalized_gaussian_beta() must be
// met.
// Returns: Return a scanline's light output over a given pixel, using
// a generalized or pure Gaussian distribution and sampling or
// integrals as desired by user codepath choices.
if(beam_generalized_gaussian)
{
if(beam_antialias_level > 1.5)
{
return scanline_generalized_gaussian_integral_contrib(
dist, color, pixel_height, sigma_range, shape_range);
}
else
{
return scanline_generalized_gaussian_sampled_contrib(
dist, color, pixel_height, sigma_range, shape_range);
}
}
else
{
if(beam_antialias_level > 1.5)
{
return scanline_gaussian_integral_contrib(
dist, color, pixel_height, sigma_range);
}
else
{
return scanline_gaussian_sampled_contrib(
dist, color, pixel_height, sigma_range);
}
}
}
inline float3 get_raw_interpolated_color(const float3 color0,
const float3 color1, const float3 color2, const float3 color3,
const float4 weights)
{
// Use max to avoid bizarre artifacts from negative colors:
return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0);
}
float3 get_interpolated_linear_color(const float3 color0, const float3 color1,
const float3 color2, const float3 color3, const float4 weights)
{
// Requires: 1.) Requirements of include/gamma-management.h must be met:
// intermediate_gamma must be globally defined, and input
// colors are interpreted as linear RGB unless you #define
// GAMMA_ENCODE_EVERY_FBO (in which case they are
// interpreted as gamma-encoded with intermediate_gamma).
// 2.) color0-3 are colors sampled from a texture with tex2D().
// They are interpreted as defined in requirement 1.
// 3.) weights contains weights for each color, summing to 1.0.
// 4.) beam_horiz_linear_rgb_weight must be defined as a global
// float in [0.0, 1.0] describing how much blending should
// be done in linear RGB (rest is gamma-corrected RGB).
// 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined
// if beam_horiz_linear_rgb_weight is anything other than a
// static constant, or we may try branching at runtime
// without dynamic branches allowed (slow).
// Returns: Return an interpolated color lookup between the four input
// colors based on the weights in weights. The final color will
// be a linear RGB value, but the blending will be done as
// indicated above.
const float intermediate_gamma = get_intermediate_gamma();
// Branch if beam_horiz_linear_rgb_weight is static (for free) or if the
// profile allows dynamic branches (faster than computing extra pows):
#ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
#define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
#else
#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
#define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
#endif
#endif
#ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
// beam_horiz_linear_rgb_weight is static, so we can branch:
#ifdef GAMMA_ENCODE_EVERY_FBO
const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
color0, color1, color2, color3, weights), intermediate_gamma);
if(beam_horiz_linear_rgb_weight > 0.0)
{
const float3 linear_mixed_color = get_raw_interpolated_color(
pow(color0, intermediate_gamma),
pow(color1, intermediate_gamma),
pow(color2, intermediate_gamma),
pow(color3, intermediate_gamma),
weights);
return lerp(gamma_mixed_color, linear_mixed_color,
beam_horiz_linear_rgb_weight);
}
else
{
return gamma_mixed_color;
}
#else
const float3 linear_mixed_color = get_raw_interpolated_color(
color0, color1, color2, color3, weights);
if(beam_horiz_linear_rgb_weight < 1.0)
{
const float3 gamma_mixed_color = get_raw_interpolated_color(
pow(color0, 1.0/intermediate_gamma),
pow(color1, 1.0/intermediate_gamma),
pow(color2, 1.0/intermediate_gamma),
pow(color3, 1.0/intermediate_gamma),
weights);
return lerp(gamma_mixed_color, linear_mixed_color,
beam_horiz_linear_rgb_weight);
}
else
{
return linear_mixed_color;
}
#endif // GAMMA_ENCODE_EVERY_FBO
#else
#ifdef GAMMA_ENCODE_EVERY_FBO
// Inputs: color0-3 are colors in gamma-encoded RGB.
const float3 gamma_mixed_color = pow(get_raw_interpolated_color(
color0, color1, color2, color3, weights), intermediate_gamma);
const float3 linear_mixed_color = get_raw_interpolated_color(
pow(color0, intermediate_gamma),
pow(color1, intermediate_gamma),
pow(color2, intermediate_gamma),
pow(color3, intermediate_gamma),
weights);
return lerp(gamma_mixed_color, linear_mixed_color,
beam_horiz_linear_rgb_weight);
#else
// Inputs: color0-3 are colors in linear RGB.
const float3 linear_mixed_color = get_raw_interpolated_color(
color0, color1, color2, color3, weights);
const float3 gamma_mixed_color = get_raw_interpolated_color(
pow(color0, 1.0/intermediate_gamma),
pow(color1, 1.0/intermediate_gamma),
pow(color2, 1.0/intermediate_gamma),
pow(color3, 1.0/intermediate_gamma),
weights);
return lerp(gamma_mixed_color, linear_mixed_color,
beam_horiz_linear_rgb_weight);
#endif // GAMMA_ENCODE_EVERY_FBO
#endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT
}
float3 get_scanline_color(const sampler2D texture, const float2 scanline_uv,
const float2 uv_step_x, const float4 weights)
{
// Requires: 1.) scanline_uv must be vertically snapped to the caller's
// desired line or scanline and horizontally snapped to the
// texel just left of the output pixel (color1)
// 2.) uv_step_x must contain the horizontal uv distance
// between texels.
// 3.) weights must contain interpolation filter weights for
// color0, color1, color2, and color3, where color1 is just
// left of the output pixel.
// Returns: Return a horizontally interpolated texture lookup using 2-4
// nearby texels, according to weights and the conventions of
// get_interpolated_linear_color().
// We can ignore the outside texture lookups for Quilez resampling.
const float3 color1 = tex2D(texture, scanline_uv).rgb;
const float3 color2 = tex2D(texture, scanline_uv + uv_step_x).rgb;
float3 color0 = float3(0.0);
float3 color3 = float3(0.0);
if(beam_horiz_filter > 0.5)
{
color0 = tex2D(texture, scanline_uv - uv_step_x).rgb;
color3 = tex2D(texture, scanline_uv + 2.0 * uv_step_x).rgb;
}
// Sample the texture as-is, whether it's linear or gamma-encoded:
// get_interpolated_linear_color() will handle the difference.
return get_interpolated_linear_color(color0, color1, color2, color3, weights);
}
float3 sample_single_scanline_horizontal(const sampler2D texture,
const float2 tex_uv, const float2 texture_size,
const float2 texture_size_inv)
{
// TODO: Add function requirements.
// Snap to the previous texel and get sample dists from 2/4 nearby texels:
const float2 curr_texel = tex_uv * texture_size;
// Use under_half to fix a rounding bug right around exact texel locations.
const float2 prev_texel =
floor(curr_texel - float2(under_half)) + float2(0.5);
const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y);
const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv;
const float prev_dist = curr_texel.x - prev_texel_hor.x;
const float4 sample_dists = float4(1.0 + prev_dist, prev_dist,
1.0 - prev_dist, 2.0 - prev_dist);
// Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
float4 weights;
if(beam_horiz_filter < 0.5)
{
// Quilez:
const float x = sample_dists.y;
const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0);
weights = float4(0.0, 1.0 - w2, w2, 0.0);
}
else if(beam_horiz_filter < 1.5)
{
// Gaussian:
float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
weights = exp(-(sample_dists*sample_dists)*inner_denom_inv);
}
else
{
// Lanczos2:
const float4 pi_dists = FIX_ZERO(sample_dists * pi);
weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) /
(pi_dists * pi_dists);
}
// Ensure the weight sum == 1.0:
const float4 final_weights = weights/dot(weights, float4(1.0));
// Get the interpolated horizontal scanline color:
const float2 uv_step_x = float2(texture_size_inv.x, 0.0);
return get_scanline_color(
texture, prev_texel_hor_uv, uv_step_x, final_weights);
}
float3 sample_rgb_scanline_horizontal(const sampler2D texture,
const float2 tex_uv, const float2 texture_size,
const float2 texture_size_inv)
{
// TODO: Add function requirements.
// Rely on a helper to make convergence easier.
if(beam_misconvergence)
{
const float3 convergence_offsets_rgb =
get_convergence_offsets_x_vector();
const float3 offset_u_rgb =
convergence_offsets_rgb * texture_size_inv.xxx;
const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0);
const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0);
const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0);
const float3 sample_r = sample_single_scanline_horizontal(
texture, scanline_uv_r, texture_size, texture_size_inv);
const float3 sample_g = sample_single_scanline_horizontal(
texture, scanline_uv_g, texture_size, texture_size_inv);
const float3 sample_b = sample_single_scanline_horizontal(
texture, scanline_uv_b, texture_size, texture_size_inv);
return float3(sample_r.r, sample_g.g, sample_b.b);
}
else
{
return sample_single_scanline_horizontal(texture, tex_uv, texture_size,
texture_size_inv);
}
}
float2 get_last_scanline_uv(const float2 tex_uv, const float2 texture_size,
const float2 texture_size_inv, const float2 il_step_multiple,
const float frame_count, out float dist)
{
// Compute texture coords for the last/upper scanline, accounting for
// interlacing: With interlacing, only consider even/odd scanlines every
// other frame. Top-field first (TFF) order puts even scanlines on even
// frames, and BFF order puts them on odd frames. Texels are centered at:
// frac(tex_uv * texture_size) == x.5
// Caution: If these coordinates ever seem incorrect, first make sure it's
// not because anisotropic filtering is blurring across field boundaries.
// Note: TFF/BFF won't matter for sources that double-weave or similar.
const float field_offset = floor(il_step_multiple.y * 0.75) *
fmod(frame_count + float(interlace_bff), 2.0);
const float2 curr_texel = tex_uv * texture_size;
// Use under_half to fix a rounding bug right around exact texel locations.
const float2 prev_texel_num = floor(curr_texel - float2(under_half));
const float wrong_field = fmod(
prev_texel_num.y + field_offset, il_step_multiple.y);
const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field);
// Snap to the center of the previous scanline in the current field:
const float2 scanline_texel = scanline_texel_num + float2(0.5);
const float2 scanline_uv = scanline_texel * texture_size_inv;
// Save the sample's distance from the scanline, in units of scanlines:
dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y;
return scanline_uv;
}
*/
bool is_interlaced(float num_lines)
{
// Detect interlacing based on the number of lines in the source.
if(interlace_detect)
{
// NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
// NTSC Emulators: Typically 224 or 240 lines
// PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
// PAL Emulators: ?
// ATSC: 720p, 1080i, 1080p
// Where do we place our cutoffs? Assumptions:
// 1.) We only need to care about active lines.
// 2.) Anything > 288 and <= 576 lines is probably interlaced.
// 3.) Anything > 576 lines is probably not interlaced...
// 4.) ...except 1080 lines, which is a crapshoot (user decision).
// 5.) Just in case the main program uses calculated video sizes,
// we should nudge the float thresholds a bit.
bool sd_interlace;
if (num_lines > 288.5 && num_lines < 576.5)
{sd_interlace = true;}
else
{sd_interlace = false;}
bool hd_interlace;
if (num_lines > 1079.5 && num_lines < 1080.5)
{hd_interlace = false;}
else
{hd_interlace = sd_interlace || hd_interlace;}
}
else
{
return false;
}
}
#endif // SCANLINE_FUNCTIONS_H

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,58 @@
#ifndef USER_CGP_CONSTANTS_H
#define USER_CGP_CONSTANTS_H
// IMPORTANT:
// These constants MUST be set appropriately for the settings in crt-royale.cgp
// (or whatever related .cgp file you're using). If they aren't, you're likely
// to get artifacts, the wrong phosphor mask size, etc. I wish these could be
// set directly in the .cgp file to make things easier, but...they can't.
// PASS SCALES AND RELATED CONSTANTS:
// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of
// this shader: One does a viewport-scale bloom, and the other skips it. The
// latter benefits from a higher bloom_approx_scale_x, so save both separately:
const float bloom_approx_size_x = 320.0;
const float bloom_approx_size_x_for_fake = 400.0;
// Copy the viewport-relative scales of the phosphor mask resize passes
// (MASK_RESIZE and the pass immediately preceding it):
const vec2 mask_resize_viewport_scale = vec2(0.0625, 0.0625);
// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.:
const float geom_max_aspect_ratio = 4.0/3.0;
// PHOSPHOR MASK TEXTURE CONSTANTS:
// Set the following constants to reflect the properties of the phosphor mask
// texture named in crt-royale.cgp. The shader optionally resizes a mask tile
// based on user settings, then repeats a single tile until filling the screen.
// The shader must know the input texture size (default 64x64), and to manually
// resize, it must also know the horizontal triads per tile (default 8).
const vec2 mask_texture_small_size = vec2(64.0);
const vec2 mask_texture_large_size = vec2(512.0);
const float mask_triads_per_tile = 8.0;
// We need the average brightness of the phosphor mask to compensate for the
// dimming it causes. The following four values are roughly correct for the
// masks included with the shader. Update the value for any LUT texture you
// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether
// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15).
//#define PHOSPHOR_MASK_GRILLE14
const float mask_grille14_avg_color = 50.6666666/255.0;
// TileableLinearApertureGrille14Wide7d33Spacing*.png
// TileableLinearApertureGrille14Wide10And6Spacing*.png
const float mask_grille15_avg_color = 53.0/255.0;
// TileableLinearApertureGrille15Wide6d33Spacing*.png
// TileableLinearApertureGrille15Wide8And5d5Spacing*.png
const float mask_slot_avg_color = 46.0/255.0;
// TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png
// TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png
const float mask_shadow_avg_color = 41.0/255.0;
// TileableLinearShadowMask*.png
// TileableLinearShadowMaskEDP*.png
#ifdef PHOSPHOR_MASK_GRILLE14
const float mask_grille_avg_color = mask_grille14_avg_color;
#else
const float mask_grille_avg_color = mask_grille15_avg_color;
#endif
#endif // USER_CGP_CONSTANTS_H

View file

@ -0,0 +1,359 @@
#ifndef USER_SETTINGS_H
#define USER_SETTINGS_H
///////////////////////////// DRIVER CAPABILITIES ////////////////////////////
// The Cg compiler uses different "profiles" with different capabilities.
// This shader requires a Cg compilation profile >= arbfp1, but a few options
// require higher profiles like fp30 or fp40. The shader can't detect profile
// or driver capabilities, so instead you must comment or uncomment the lines
// below with "//" before "#define." Disable an option if you get compilation
// errors resembling those listed. Generally speaking, all of these options
// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is
// likely to run on ATI/AMD, due to the Cg compiler's profile limitations.
// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1.
// Among other things, derivatives help us fix anisotropic filtering artifacts
// with curved manually tiled phosphor mask coords. Related errors:
// error C3004: function "vec2 ddx(vec2);" not supported in this profile
// error C3004: function "vec2 ddy(vec2);" not supported in this profile
//#define DRIVERS_ALLOW_DERIVATIVES
// Fine derivatives: Unsupported on older ATI cards.
// Fine derivatives enable 2x2 fragment block communication, letting us perform
// fast single-pass blur operations. If your card uses coarse derivatives and
// these are enabled, blurs could look broken. Derivatives are a prerequisite.
#ifdef DRIVERS_ALLOW_DERIVATIVES
#define DRIVERS_ALLOW_FINE_DERIVATIVES
#endif
// Dynamic looping: Requires an fp30 or newer profile.
// This makes phosphor mask resampling faster in some cases. Related errors:
// error C5013: profile does not support "for" statements and "for" could not
// be unrolled
//#define DRIVERS_ALLOW_DYNAMIC_BRANCHES
// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops.
// Using one static loop avoids overhead if the user is right, but if the user
// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a
// binary search can potentially save some iterations. However, it may fail:
// error C6001: Temporary register limit of 32 exceeded; 35 registers
// needed to compile program
//#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable
// anisotropic filtering, thereby fixing related artifacts. Related errors:
// error C3004: function "vec4 tex2Dlod(sampler2D, vec4);" not supported in
// this profile
//#define DRIVERS_ALLOW_TEX2DLOD
// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate
// artifacts from anisotropic filtering and mipmapping. Related errors:
// error C3004: function "vec4 tex2Dbias(sampler2D, vec4);" not supported
// in this profile
//#define DRIVERS_ALLOW_TEX2DBIAS
// Integrated graphics compatibility: Integrated graphics like Intel HD 4000
// impose stricter limitations on register counts and instructions. Enable
// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or:
// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed
// to compile program.
// Enabling integrated graphics compatibility mode will automatically disable:
// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer.
// (This may be reenabled in a later release.)
// 2.) RUNTIME_GEOMETRY_MODE
// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation
//#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE
//////////////////////////// USER CODEPATH OPTIONS ///////////////////////////
// To disable a #define option, turn its line into a comment with "//."
// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications):
// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override
// many of the options in this file and allow real-time tuning, but many of
// them are slower. Disabling them and using this text file will boost FPS.
#define RUNTIME_SHADER_PARAMS_ENABLE
// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but
// it's the only way to do a wide-enough full bloom with a runtime dot pitch.
#define RUNTIME_PHOSPHOR_BLOOM_SIGMA
// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics)
#define RUNTIME_ANTIALIAS_WEIGHTS
// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!)
//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS
// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader
// parameters? This will require more math or dynamic branching.
#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE
// Specify the tilt at runtime? This makes things about 3% slower.
#define RUNTIME_GEOMETRY_TILT
// Specify the geometry mode at runtime?
#define RUNTIME_GEOMETRY_MODE
// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and
// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without
// dynamic branches? This is cheap if mask_resize_viewport_scale is small.
#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT
// PHOSPHOR MASK:
// Manually resize the phosphor mask for best results (slower)? Disabling this
// removes the option to do so, but it may be faster without dynamic branches.
#define PHOSPHOR_MASK_MANUALLY_RESIZE
// If we sinc-resize the mask, should we Lanczos-window it (slower but better)?
#define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
// Larger blurs are expensive, but we need them to blur larger triads. We can
// detect the right blur if the triad size is static or our profile allows
// dynamic branches, but otherwise we use the largest blur the user indicates
// they might need:
#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS
//#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS
// Here's a helpful chart:
// MaxTriadSize BlurSize MinTriadCountsByResolution
// 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
// 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect
/////////////////////////////// USER PARAMETERS //////////////////////////////
// Note: Many of these static parameters are overridden by runtime shader
// parameters when those are enabled. However, many others are static codepath
// options that were cleaner or more convert to code as static constants.
// GAMMA:
const float crt_gamma_static = 2.5; // range [1, 5]
const float lcd_gamma_static = 2.2; // range [1, 5]
// LEVELS MANAGEMENT:
// Control the final multiplicative image contrast:
const float levels_contrast_static = 1.0; // range [0, 4)
// We auto-dim to avoid clipping between passes and restore brightness
// later. Control the dim factor here: Lower values clip less but crush
// blacks more (static only for now).
const float levels_autodim_temp = 0.5; // range (0, 1]
// HALATION/DIFFUSION/BLOOM:
// Halation weight: How much energy should be lost to electrons bounding
// around under the CRT glass and exciting random phosphors?
const float halation_weight_static = 0.0; // range [0, 1]
// Refractive diffusion weight: How much light should spread/diffuse from
// refracting through the CRT glass?
const float diffusion_weight_static = 0.075; // range [0, 1]
// Underestimate brightness: Bright areas bloom more, but we can base the
// bloom brightpass on a lower brightness to sharpen phosphors, or a higher
// brightness to soften them. Low values clip, but >= 0.8 looks okay.
const float bloom_underestimate_levels_static = 0.8; // range [0, 5]
// Blur all colors more than necessary for a softer phosphor bloom?
const float bloom_excess_static = 0.0; // range [0, 1]
// The BLOOM_APPROX pass approximates a phosphor blur early on with a small
// blurred resize of the input (convergence offsets are applied as well).
// There are three filter options (static option only for now):
// 0.) Bilinear resize: A fast, close approximation to a 4x4 resize
// if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane
// and beam_max_sigma is low.
// 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring,
// always uses a static sigma regardless of beam_max_sigma or
// mask_num_triads_desired.
// 2.) True 4x4 Gaussian resize: Slowest, technically correct.
// These options are more pronounced for the fast, unbloomed shader version.
const float bloom_approx_filter_static = 2.0;
// ELECTRON BEAM SCANLINE DISTRIBUTION:
// How many scanlines should contribute light to each pixel? Using more
// scanlines is slower (especially for a generalized Gaussian) but less
// distorted with larger beam sigmas (especially for a pure Gaussian). The
// max_beam_sigma at which the closest unused weight is guaranteed <
// 1.0/255.0 (for a 3x antialiased pure Gaussian) is:
// 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized
// 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized
// 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized
// 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized
// 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized
const float beam_num_scanlines = 3.0; // range [2, 6]
// A generalized Gaussian beam varies shape with color too, now just width.
// It's slower but more flexible (static option only for now).
const bool beam_generalized_gaussian = true;
// What kind of scanline antialiasing do you want?
// 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral
// Integrals are slow (especially for generalized Gaussians) and rarely any
// better than 3x antialiasing (static option only for now).
const float beam_antialias_level = 1.0; // range [0, 2]
// Min/max standard deviations for scanline beams: Higher values widen and
// soften scanlines. Depending on other options, low min sigmas can alias.
const float beam_min_sigma_static = 0.02; // range (0, 1]
const float beam_max_sigma_static = 0.3; // range (0, 1]
// Beam width varies as a function of color: A power function (0) is more
// configurable, but a spherical function (1) gives the widest beam
// variability without aliasing (static option only for now).
const float beam_spot_shape_function = 0.0;
// Spot shape power: Powers <= 1 give smoother spot shapes but lower
// sharpness. Powers >= 1.0 are awful unless mix/max sigmas are close.
const float beam_spot_power_static = 1.0/3.0; // range (0, 16]
// Generalized Gaussian max shape parameters: Higher values give flatter
// scanline plateaus and steeper dropoffs, simultaneously widening and
// sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and
// values > ~40.0 cause artifacts with integrals.
const float beam_min_shape_static = 2.0; // range [2, 32]
const float beam_max_shape_static = 4.0; // range [2, 32]
// Generalized Gaussian shape power: Affects how quickly the distribution
// changes shape from Gaussian to steep/plateaued as color increases from 0
// to 1.0. Higher powers appear softer for most colors, and lower powers
// appear sharper for most colors.
const float beam_shape_power_static = 1.0/4.0; // range (0, 16]
// What filter should be used to sample scanlines horizontally?
// 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp)
const float beam_horiz_filter_static = 0.0;
// Standard deviation for horizontal Gaussian resampling:
const float beam_horiz_sigma_static = 0.35; // range (0, 2/3]
// Do horizontal scanline sampling in linear RGB (correct light mixing),
// gamma-encoded RGB (darker, hard spot shape, may better match bandwidth-
// limiting circuitry in some CRT's), or a weighted avg.?
const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1]
// Simulate scanline misconvergence? This needs 3x horizontal texture
// samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in
// later passes (static option only for now).
const bool beam_misconvergence = true;
// Convergence offsets in x/y directions for R/G/B scanline beams in units
// of scanlines. Positive offsets go right/down; ranges [-2, 2]
const vec2 convergence_offsets_r_static = vec2(0.1, 0.2);
const vec2 convergence_offsets_g_static = vec2(0.3, 0.4);
const vec2 convergence_offsets_b_static = vec2(0.5, 0.6);
// Detect interlacing (static option only for now)?
const bool interlace_detect = true;
// Assume 1080-line sources are interlaced?
const bool interlace_1080i_static = false;
// For interlaced sources, assume TFF (top-field first) or BFF order?
// (Whether this matters depends on the nature of the interlaced input.)
const bool interlace_bff_static = false;
// ANTIALIASING:
// What AA level do you want for curvature/overscan/subpixels? Options:
// 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x
// (Static option only for now)
const float aa_level = 12.0; // range [0, 24]
// What antialiasing filter do you want (static option only)? Options:
// 0: Box (separable), 1: Box (cylindrical),
// 2: Tent (separable), 3: Tent (cylindrical),
// 4: Gaussian (separable), 5: Gaussian (cylindrical),
// 6: Cubic* (separable), 7: Cubic* (cylindrical, poor)
// 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor)
// * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS
const float aa_filter = 6.0; // range [0, 9]
// Flip the sample grid on odd/even frames (static option only for now)?
const bool aa_temporal = false;
// Use RGB subpixel offsets for antialiasing? The pixel is at green, and
// the blue offset is the negative r offset; range [0, 0.5]
const vec2 aa_subpixel_r_offset_static = vec2(-1.0/3.0, 0.0);//vec2(0.0);
// Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell
// 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality.
// 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening.
// 3.) C = 1.0/3.0 is the Mitchell-Netravali filter.
// 4.) C = 0.0 is a soft spline filter.
const float aa_cubic_c_static = 0.5; // range [0, 4]
// Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter.
const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0]
// PHOSPHOR MASK:
// Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask
const float mask_type_static = 1.0; // range [0, 2]
// We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible.
// 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible).
// This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined.
// 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This
// is halfway decent with LUT mipmapping but atrocious without it.
// 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords
// (pretty/fast/inflexible). Each input LUT has a fixed dot pitch.
// This mode reuses the same masks, so triads will be enormous unless
// you change the mask LUT filenames in your .cgp file.
const float mask_sample_mode_static = 0.0; // range [0, 2]
// Prefer setting the triad size (0.0) or number on the screen (1.0)?
// If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size
// will always be used to calculate the full bloom sigma statically.
const float mask_specify_num_triads_static = 0.0; // range [0, 1]
// Specify the phosphor triad size, in pixels. Each tile (usually with 8
// triads) will be rounded to the nearest integer tile size and clamped to
// obey minimum size constraints (imposed to reduce downsize taps) and
// maximum size constraints (imposed to have a sane MASK_RESIZE FBO size).
// To increase the size limit, double the viewport-relative scales for the
// two MASK_RESIZE passes in crt-royale.cgp and user-cgp-contants.h.
// range [1, mask_texture_small_size/mask_triads_per_tile]
const float mask_triad_size_desired_static = 24.0 / 8.0;
// If mask_specify_num_triads is 1.0/true, we'll go by this instead (the
// final size will be rounded and constrained as above); default 480.0
const float mask_num_triads_desired_static = 480.0;
// How many lobes should the sinc/Lanczos resizer use? More lobes require
// more samples and avoid moire a bit better, but some is unavoidable
// depending on the destination size (static option for now).
const float mask_sinc_lobes = 3.0; // range [2, 4]
// The mask is resized using a variable number of taps in each dimension,
// but some Cg profiles always fetch a constant number of taps no matter
// what (no dynamic branching). We can limit the maximum number of taps if
// we statically limit the minimum phosphor triad size. Larger values are
// faster, but the limit IS enforced (static option only, forever);
// range [1, mask_texture_small_size/mask_triads_per_tile]
// TODO: Make this 1.0 and compensate with smarter sampling!
const float mask_min_allowed_triad_size = 2.0;
// GEOMETRY:
// Geometry mode:
// 0: Off (default), 1: Spherical mapping (like cgwg's),
// 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron
const float geom_mode_static = 0.0; // range [0, 3]
// Radius of curvature: Measured in units of your viewport's diagonal size.
const float geom_radius_static = 2.0; // range [1/(2*pi), 1024]
// View dist is the distance from the player to their physical screen, in
// units of the viewport's diagonal size. It controls the field of view.
const float geom_view_dist_static = 2.0; // range [0.5, 1024]
// Tilt angle in radians (clockwise around up and right vectors):
const vec2 geom_tilt_angle_static = vec2(0.0, 0.0); // range [-pi, pi]
// Aspect ratio: When the true viewport size is unknown, this value is used
// to help convert between the phosphor triad size and count, along with
// the mask_resize_viewport_scale constant from user-cgp-constants.h. Set
// this equal to Retroarch's display aspect ratio (DAR) for best results;
// range [1, geom_max_aspect_ratio from user-cgp-constants.h];
// default (256/224)*(54/47) = 1.313069909 (see below)
const float geom_aspect_ratio_static = 1.313069909;
// Before getting into overscan, here's some general aspect ratio info:
// - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting
// - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR
// - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping
// Geometry processing has to "undo" the screen-space 2D DAR to calculate
// 3D view vectors, then reapplies the aspect ratio to the simulated CRT in
// uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either:
// a.) Enable Retroarch's "Crop Overscan"
// b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0)
// Real consoles use horizontal black padding in the signal, but emulators
// often crop this without cropping the vertical padding; a 256x224 [S]NES
// frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not.
// The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun:
// http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50
// http://forums.nesdev.com/viewtopic.php?p=24815#p24815
// For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR
// without doing a. or b., but horizontal image borders will be tighter
// than vertical ones, messing up curvature and overscan. Fixing the
// padding first corrects this.
// Overscan: Amount to "zoom in" before cropping. You can zoom uniformly
// or adjust x/y independently to e.g. readd horizontal padding, as noted
// above: Values < 1.0 zoom out; range (0, inf)
const vec2 geom_overscan_static = vec2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0)
// Compute a proper pixel-space to texture-space matrix even without ddx()/
// ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering
// with strong curvature (static option only for now).
const bool geom_force_correct_tangent_matrix = true;
// BORDERS:
// Rounded border size in texture uv coords:
const float border_size_static = 0.015; // range [0, 0.5]
// Border darkness: Moderate values darken the border smoothly, and high
// values make the image very dark just inside the border:
const float border_darkness_static = 2.0; // range [0, inf)
// Border compression: High numbers compress border transitions, narrowing
// the dark border area.
const float border_compress_static = 2.5; // range [1, inf)
#endif // USER_SETTINGS_H