diff --git a/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5Spacing.png b/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5Spacing.png new file mode 100644 index 0000000..2995ae5 Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5Spacing.png differ diff --git a/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png b/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png new file mode 100644 index 0000000..2c3f21e Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png differ diff --git a/crt/shaders/crt-royale/TileableLinearShadowMask.png b/crt/shaders/crt-royale/TileableLinearShadowMask.png new file mode 100644 index 0000000..ca40956 Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearShadowMask.png differ diff --git a/crt/shaders/crt-royale/TileableLinearShadowMaskEDP.png b/crt/shaders/crt-royale/TileableLinearShadowMaskEDP.png new file mode 100644 index 0000000..a3844dc Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearShadowMaskEDP.png differ diff --git a/crt/shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png b/crt/shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png new file mode 100644 index 0000000..b61d92a Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png differ diff --git a/crt/shaders/crt-royale/TileableLinearShadowMaskResizeTo64.png b/crt/shaders/crt-royale/TileableLinearShadowMaskResizeTo64.png new file mode 100644 index 0000000..9b66ffb Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearShadowMaskResizeTo64.png differ diff --git a/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png b/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png new file mode 100644 index 
0000000..eb20b23 Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png differ diff --git a/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png b/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png new file mode 100644 index 0000000..df518db Binary files /dev/null and b/crt/shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png differ diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/README.TXT b/crt/shaders/crt-royale/crt-royale-settings-files/README.TXT new file mode 100644 index 0000000..38270a7 --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/README.TXT @@ -0,0 +1,12 @@ +These files aren't nearly as good as canned .cgp presets with all of your +favorite settings, and there aren't nearly enough, but they're a start. + +The nVidia settings files will only work on nVidia cards. +The ATI settings files will work on both AMD/ATI and nVidia cards. +The Intel settings files should additionally work on Intel HD 4000 Graphics, but +they disable manual phosphor mask resizing, so the phosphor mask will be softer. + +For compatibility with Intel integrated graphics, you can either use the Intel- +specific .cgp files or use the Intel settings files. 
These are the same as the +ATI settings, except the following line is also uncommented: + #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati-clean.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati-clean.h new file mode 100644 index 0000000..058899d --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati-clean.h @@ -0,0 +1,92 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + + //#define DRIVERS_ALLOW_DERIVATIVES + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + //#define DRIVERS_ALLOW_TEX2DLOD + //#define DRIVERS_ALLOW_TEX2DBIAS + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +#define RUNTIME_SHADER_PARAMS_ENABLE +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +#define RUNTIME_ANTIALIAS_WEIGHTS +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +#define RUNTIME_GEOMETRY_TILT +#define RUNTIME_GEOMETRY_MODE +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define PHOSPHOR_MASK_MANUALLY_RESIZE + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + static const float levels_contrast_static = 1.0; // range [0, 4) + static const float levels_autodim_temp = 0.5; // range 
(0, 1] + static const float halation_weight_static = 0.0; // range [0, 1] + static const float diffusion_weight_static = 0.075; // range [0, 1] + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + static const float bloom_excess_static = 0.0; // range [0, 1] + static const float bloom_approx_filter_static = 2.0; + static const float beam_num_scanlines = 3.0; // range [2, 6] + static const bool beam_generalized_gaussian = true; + static const float beam_antialias_level = 1.0; // range [0, 2] + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + static const float beam_spot_shape_function = 0.0; + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + static const float beam_horiz_filter_static = 0.0; + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + static const bool beam_misconvergence = true; + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + static const bool interlace_detect = true; + static const bool interlace_1080i_static = false; + static const bool interlace_bff_static = false; + static const float aa_level = 12.0; // range [0, 24] + static const float aa_filter = 6.0; // range [0, 9] + static const bool aa_temporal = false; + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + static const float aa_cubic_c_static = 0.5; // range [0, 4] + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + static const float 
mask_type_static = 1.0; // range [0, 2] + static const float mask_sample_mode_static = 0.0; // range [0, 2] + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + static const float mask_num_triads_desired_static = 480.0; + static const float mask_sinc_lobes = 3.0; // range [2, 4] + static const float mask_min_allowed_triad_size = 2.0; + static const float geom_mode_static = 0.0; // range [0, 3] + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + static const float geom_aspect_ratio_static = 1.313069909; + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + static const bool geom_force_correct_tangent_matrix = true; + static const float border_size_static = 0.015; // range [0, 0.5] + static const float border_darkness_static = 2.0; // range [0, inf) + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati.h new file mode 100644 index 0000000..fc35aee --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-ati.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." 
Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. 
Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? 
(WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 2.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel-clean.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel-clean.h new file mode 100644 index 0000000..6929369 --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel-clean.h @@ -0,0 +1,92 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + + //#define DRIVERS_ALLOW_DERIVATIVES + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + //#define DRIVERS_ALLOW_TEX2DLOD + //#define DRIVERS_ALLOW_TEX2DBIAS + #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +#define RUNTIME_SHADER_PARAMS_ENABLE +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +#define RUNTIME_ANTIALIAS_WEIGHTS +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +#define RUNTIME_GEOMETRY_TILT +#define RUNTIME_GEOMETRY_MODE +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define PHOSPHOR_MASK_MANUALLY_RESIZE + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + static const float levels_contrast_static = 1.0; // range [0, 4) + static const float levels_autodim_temp = 0.5; // range (0, 1] + static const float 
halation_weight_static = 0.0; // range [0, 1] + static const float diffusion_weight_static = 0.075; // range [0, 1] + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + static const float bloom_excess_static = 0.0; // range [0, 1] + static const float bloom_approx_filter_static = 0.0; + static const float beam_num_scanlines = 3.0; // range [2, 6] + static const bool beam_generalized_gaussian = true; + static const float beam_antialias_level = 1.0; // range [0, 2] + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + static const float beam_spot_shape_function = 0.0; + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + static const float beam_horiz_filter_static = 0.0; + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + static const bool beam_misconvergence = true; + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + static const bool interlace_detect = true; + static const bool interlace_1080i_static = false; + static const bool interlace_bff_static = false; + static const float aa_level = 12.0; // range [0, 24] + static const float aa_filter = 6.0; // range [0, 9] + static const bool aa_temporal = false; + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + static const float aa_cubic_c_static = 0.5; // range [0, 4] + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + static const float mask_type_static = 1.0; // range 
[0, 2] + static const float mask_sample_mode_static = 0.0; // range [0, 2] + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + static const float mask_num_triads_desired_static = 480.0; + static const float mask_sinc_lobes = 3.0; // range [2, 4] + static const float mask_min_allowed_triad_size = 2.0; + static const float geom_mode_static = 0.0; // range [0, 3] + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + static const float geom_aspect_ratio_static = 1.313069909; + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + static const bool geom_force_correct_tangent_matrix = true; + static const float border_size_static = 0.015; // range [0, 0.5] + static const float border_darkness_static = 2.0; // range [0, inf) + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel.h new file mode 100644 index 0000000..9ce0b3f --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-intel.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." 
Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. 
Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? 
(WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 0.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. 
+ static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia-clean.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia-clean.h new file mode 100644 index 0000000..ce837fd --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia-clean.h @@ -0,0 +1,92 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + + #define DRIVERS_ALLOW_DERIVATIVES + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + #define DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + #define DRIVERS_ALLOW_TEX2DLOD + #define DRIVERS_ALLOW_TEX2DBIAS + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +#define RUNTIME_SHADER_PARAMS_ENABLE +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +#define RUNTIME_ANTIALIAS_WEIGHTS +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +#define RUNTIME_GEOMETRY_TILT +#define RUNTIME_GEOMETRY_MODE +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define PHOSPHOR_MASK_MANUALLY_RESIZE + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + static const float levels_contrast_static = 1.0; // range [0, 4) + static const float levels_autodim_temp = 0.5; // range (0, 1] + static const float 
halation_weight_static = 0.0; // range [0, 1] + static const float diffusion_weight_static = 0.075; // range [0, 1] + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + static const float bloom_excess_static = 0.0; // range [0, 1] + static const float bloom_approx_filter_static = 2.0; + static const float beam_num_scanlines = 3.0; // range [2, 6] + static const bool beam_generalized_gaussian = true; + static const float beam_antialias_level = 1.0; // range [0, 2] + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + static const float beam_spot_shape_function = 0.0; + static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + static const float beam_horiz_filter_static = 0.0; + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + static const bool beam_misconvergence = true; + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + static const bool interlace_detect = true; + static const bool interlace_1080i_static = false; + static const bool interlace_bff_static = false; + static const float aa_level = 12.0; // range [0, 24] + static const float aa_filter = 6.0; // range [0, 9] + static const bool aa_temporal = false; + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + static const float aa_cubic_c_static = 0.5; // range [0, 4] + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + static const float mask_type_static = 1.0; // range 
[0, 2] + static const float mask_sample_mode_static = 0.0; // range [0, 2] + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + static const float mask_num_triads_desired_static = 480.0; + static const float mask_sinc_lobes = 3.0; // range [2, 4] + static const float mask_min_allowed_triad_size = 2.0; + static const float geom_mode_static = 0.0; // range [0, 3] + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + static const float geom_aspect_ratio_static = 1.313069909; + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + static const bool geom_force_correct_tangent_matrix = true; + static const float border_size_static = 0.015; // range [0, 0.5] + static const float border_darkness_static = 2.0; // range [0, inf) + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia.h new file mode 100644 index 0000000..60c2c9a --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-default-nvidia.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." 
Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + #define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + #define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. 
Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + #define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + #define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? 
(WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 2.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-ati.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-ati.h new file mode 100644 index 0000000..f01d68f --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-ati.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +//#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +//#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 2.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = false; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 8.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = false; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-intel.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-intel.h new file mode 100644 index 0000000..fd76d99 --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-intel.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +//#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +//#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convert to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 0.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = false; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 8.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = false; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-nvidia.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-nvidia.h new file mode 100644 index 0000000..1737335 --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-nvidia.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + #define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + #define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + #define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + #define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +//#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +//#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 2.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = false; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 8.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h.
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = false; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-ati.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-ati.h new file mode 100644 index 0000000..0439ec0 --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-ati.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +//#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +//#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +//#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +//#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +//#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +//#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors?
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 0.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 2.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = false; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close.
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = false; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 8.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = mask_specify_num_triads_static; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = false; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-intel.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-intel.h new file mode 100644 index 0000000..8e2f683 --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-intel.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + #define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +//#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +//#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +//#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +//#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +//#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +//#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convert to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bounding + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 0.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 2.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = false; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = false; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 8.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = mask_specify_num_triads_static; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = false; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-nvidia.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-nvidia.h new file mode 100644 index 0000000..fef7c4f --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-fast-static-nvidia.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + #define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + #define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + #define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + #define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +//#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +//#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +//#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +//#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +//#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +//#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +//#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 0.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 2.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = false; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = false; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 8.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(0.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = mask_triad_size_desired_static; // a triad size in pixels (>= 1), not the 0/1 mask_specify_num_triads flag + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = false; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-ati.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-ati.h new file mode 100644 index 0000000..964ca12 --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-ati.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 2.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 4.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-nvidia.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-nvidia.h new file mode 100644 index 0000000..c6ad27e --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings-royale-with-cheese-nvidia.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + #define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + #define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + #define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + #define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + #define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 2.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines: max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure, 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 4.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/crt-royale-settings-files/user-settings.h b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings.h new file mode 100644 index 0000000..fc35aee --- /dev/null +++ b/crt/shaders/crt-royale/crt-royale-settings-files/user-settings.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "float2 ddx(float2);" not supported in this profile +// error C3004: function "float2 ddy(float2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. 
However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "float4 tex2Dlod(sampler2D, float4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "float4 tex2Dbias(sampler2D, float4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? 
This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? (Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + static const float crt_gamma_static = 2.5; // range [1, 5] + static const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + static const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + static const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ static const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + static const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + static const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + static const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + static const float bloom_approx_filter_static = 2.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + static const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + static const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + static const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + static const float beam_min_sigma_static = 0.02; // range (0, 1] + static const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + static const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ static const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + static const float beam_min_shape_static = 2.0; // range [2, 32] + static const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + static const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + static const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + static const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + static const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + static const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. 
Positive offsets go right/down; ranges [-2, 2] + static const float2 convergence_offsets_r_static = float2(0.1, 0.2); + static const float2 convergence_offsets_g_static = float2(0.3, 0.4); + static const float2 convergence_offsets_b_static = float2(0.5, 0.6); + // Detect interlacing (static option only for now)? + static const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + static const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + static const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + static const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + static const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + static const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + static const float2 aa_subpixel_r_offset_static = float2(-1.0/3.0, 0.0);//float2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) 
C = 0.0 is a soft spline filter. + static const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. + static const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + static const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + static const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + static const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + static const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + static const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + static const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + static const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + static const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + static const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ static const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + static const float2 geom_tilt_angle_static = float2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + static const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. 
or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + static const float2 geom_overscan_static = float2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + static const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + static const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + static const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + static const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H + diff --git a/crt/shaders/crt-royale/src/bind-shader-params.h b/crt/shaders/crt-royale/src/bind-shader-params.h new file mode 100644 index 0000000..80adc7d --- /dev/null +++ b/crt/shaders/crt-royale/src/bind-shader-params.h @@ -0,0 +1,247 @@ +#ifndef BIND_SHADER_PARAMS_H +#define BIND_SHADER_PARAMS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. 
+// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +#include "../user-settings.h" +#include "derived-settings-and-constants.h" + +// Override some parameters for gamma-management.h and tex2Dantialias.h: +#define OVERRIDE_DEVICE_GAMMA +const float gba_gamma = 3.5; // Irrelevant but necessary to define. +#define ANTIALIAS_OVERRIDE_BASICS +#define ANTIALIAS_OVERRIDE_PARAMETERS + +// Disable runtime shader params if the user doesn't explicitly want them. +// Static constants will be defined in place of uniforms of the same name. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #undef PARAMETER_UNIFORM +#endif + +// Bind option names to shader parameter uniforms or static constants. 
+#ifdef PARAMETER_UNIFORM + uniform float crt_gamma; + uniform float lcd_gamma; + uniform float levels_contrast; + uniform float halation_weight; + uniform float diffusion_weight; + uniform float bloom_underestimate_levels; + uniform float bloom_excess; + uniform float beam_min_sigma; + uniform float beam_max_sigma; + uniform float beam_spot_power; + uniform float beam_min_shape; + uniform float beam_max_shape; + uniform float beam_shape_power; + uniform float beam_horiz_sigma; + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + uniform float beam_horiz_filter; + uniform float beam_horiz_linear_rgb_weight; + #else + const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0); + const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0); + #endif + uniform float convergence_offset_x_r; + uniform float convergence_offset_x_g; + uniform float convergence_offset_x_b; + uniform float convergence_offset_y_r; + uniform float convergence_offset_y_g; + uniform float convergence_offset_y_b; + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + uniform float mask_type; + #else + const float mask_type = clamp(mask_type_static, 0.0, 2.0); + #endif + uniform float mask_sample_mode_desired; + uniform float mask_specify_num_triads; + uniform float mask_triad_size_desired; + uniform float mask_num_triads_desired; + uniform float aa_subpixel_r_offset_x_runtime; + uniform float aa_subpixel_r_offset_y_runtime; + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + uniform float aa_cubic_c; + uniform float aa_gauss_sigma; + #else + const float aa_cubic_c = aa_cubic_c_static; // Clamp to [0, 4]? + const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static); // Clamp to [FIXZERO(0), 1]? 
+ #endif + uniform float geom_mode_runtime; + uniform float geom_radius; + uniform float geom_view_dist; + uniform float geom_tilt_angle_x; + uniform float geom_tilt_angle_y; + uniform float geom_aspect_ratio_x; + uniform float geom_aspect_ratio_y; + uniform float geom_overscan_x; + uniform float geom_overscan_y; + uniform float border_size; + uniform float border_darkness; + uniform float border_compress; + uniform float interlace_bff; + uniform float interlace_1080i; +#else + // Use constants from user-settings.h, and limit ranges appropriately: + const float crt_gamma = max(0.0, crt_gamma_static); + const float lcd_gamma = max(0.0, lcd_gamma_static); + const float levels_contrast = clamp(levels_contrast_static, 0.0, 4.0); + const float halation_weight = clamp(halation_weight_static, 0.0, 1.0); + const float diffusion_weight = clamp(diffusion_weight_static, 0.0, 1.0); + const float bloom_underestimate_levels = max(FIX_ZERO(0.0), bloom_underestimate_levels_static); + const float bloom_excess = clamp(bloom_excess_static, 0.0, 1.0); + const float beam_min_sigma = max(FIX_ZERO(0.0), beam_min_sigma_static); + const float beam_max_sigma = max(beam_min_sigma, beam_max_sigma_static); + const float beam_spot_power = max(beam_spot_power_static, 0.0); + const float beam_min_shape = max(2.0, beam_min_shape_static); + const float beam_max_shape = max(beam_min_shape, beam_max_shape_static); + const float beam_shape_power = max(0.0, beam_shape_power_static); + const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0); + const float beam_horiz_sigma = max(FIX_ZERO(0.0), beam_horiz_sigma_static); + const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0); + // Unpack vector elements to match scalar uniforms: + const float convergence_offset_x_r = clamp(convergence_offsets_r_static.x, -4.0, 4.0); + const float convergence_offset_x_g = clamp(convergence_offsets_g_static.x, -4.0, 4.0); + const float convergence_offset_x_b = 
clamp(convergence_offsets_b_static.x, -4.0, 4.0); + const float convergence_offset_y_r = clamp(convergence_offsets_r_static.y, -4.0, 4.0); + const float convergence_offset_y_g = clamp(convergence_offsets_g_static.y, -4.0, 4.0); + const float convergence_offset_y_b = clamp(convergence_offsets_b_static.y, -4.0, 4.0); + const float mask_type = clamp(mask_type_static, 0.0, 2.0); + const float mask_sample_mode_desired = clamp(mask_sample_mode_static, 0.0, 2.0); + const float mask_specify_num_triads = clamp(mask_specify_num_triads_static, 0.0, 1.0); + const float mask_triad_size_desired = clamp(mask_triad_size_desired_static, 1.0, 18.0); + const float mask_num_triads_desired = clamp(mask_num_triads_desired_static, 342.0, 1920.0); + const float aa_subpixel_r_offset_x_runtime = clamp(aa_subpixel_r_offset_static.x, -0.5, 0.5); + const float aa_subpixel_r_offset_y_runtime = clamp(aa_subpixel_r_offset_static.y, -0.5, 0.5); + const float aa_cubic_c = aa_cubic_c_static; // Clamp to [0, 4]? + const float aa_gauss_sigma = max(FIX_ZERO(0.0), aa_gauss_sigma_static); // Clamp to [FIXZERO(0), 1]? + const float geom_mode_runtime = clamp(geom_mode_static, 0.0, 3.0); + const float geom_radius = max(1.0/(2.0*pi), geom_radius_static); // Clamp to [1/(2*pi), 1024]? + const float geom_view_dist = max(0.5, geom_view_dist_static); // Clamp to [0.5, 1024]? + const float geom_tilt_angle_x = clamp(geom_tilt_angle_static.x, -pi, pi); + const float geom_tilt_angle_y = clamp(geom_tilt_angle_static.y, -pi, pi); + const float geom_aspect_ratio_x = geom_aspect_ratio_static; // Force >= 1? 
+ const float geom_aspect_ratio_y = 1.0; + const float geom_overscan_x = max(FIX_ZERO(0.0), geom_overscan_static.x); + const float geom_overscan_y = max(FIX_ZERO(0.0), geom_overscan_static.y); + const float border_size = clamp(border_size_static, 0.0, 0.5); // 0.5 reaches to image center + const float border_darkness = max(0.0, border_darkness_static); + const float border_compress = max(1.0, border_compress_static); // < 1.0 darkens whole image + const float interlace_bff = float(interlace_bff_static); + const float interlace_1080i = float(interlace_1080i_static); +#endif + + +// Provide accessors for vector constants that pack scalar uniforms: +vec2 get_aspect_vector(const float geom_aspect_ratio) +{ + // Get an aspect ratio vector. Enforce geom_max_aspect_ratio, and prevent + // the absolute scale from affecting the uv-mapping for curvature: + const float geom_clamped_aspect_ratio = + min(geom_aspect_ratio, geom_max_aspect_ratio); + const vec2 geom_aspect = + normalize(vec2(geom_clamped_aspect_ratio, 1.0)); + return geom_aspect; +} + +vec2 get_geom_overscan_vector() +{ + return vec2(geom_overscan_x, geom_overscan_y); +} + +vec2 get_geom_tilt_angle_vector() +{ + return vec2(geom_tilt_angle_x, geom_tilt_angle_y); +} + +vec3 get_convergence_offsets_x_vector() +{ + return vec3(convergence_offset_x_r, convergence_offset_x_g, + convergence_offset_x_b); +} + +vec3 get_convergence_offsets_y_vector() +{ + return vec3(convergence_offset_y_r, convergence_offset_y_g, + convergence_offset_y_b); +} + +vec2 get_convergence_offsets_r_vector() +{ + return vec2(convergence_offset_x_r, convergence_offset_y_r); +} + +vec2 get_convergence_offsets_g_vector() +{ + return vec2(convergence_offset_x_g, convergence_offset_y_g); +} + +vec2 get_convergence_offsets_b_vector() +{ + return vec2(convergence_offset_x_b, convergence_offset_y_b); +} + +vec2 get_aa_subpixel_r_offset() +{ + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + // WARNING: THIS IS EXTREMELY 
EXPENSIVE. + return vec2(aa_subpixel_r_offset_x_runtime, + aa_subpixel_r_offset_y_runtime); + #else + return aa_subpixel_r_offset_static; + #endif + #else + return aa_subpixel_r_offset_static; + #endif +} + +// Provide accessors for settings which still need "cooking:" +float get_mask_amplify() +{ + const float mask_grille_amplify = 1.0/mask_grille_avg_color; + const float mask_slot_amplify = 1.0/mask_slot_avg_color; + const float mask_shadow_amplify = 1.0/mask_shadow_avg_color; + return mask_type < 0.5 ? mask_grille_amplify : + mask_type < 1.5 ? mask_slot_amplify : + mask_shadow_amplify; +} + +float get_mask_sample_mode() +{ + #ifdef RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_desired; + #else + return clamp(mask_sample_mode_desired, 1.0, 2.0); + #endif + #else + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + return mask_sample_mode_static; + #else + return clamp(mask_sample_mode_static, 1.0, 2.0); + #endif + #endif +} + + +#endif // BIND_SHADER_PARAMS_H + + diff --git a/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang b/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang new file mode 100644 index 0000000..43e8215 --- /dev/null +++ b/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang @@ -0,0 +1,403 @@ +#version 450 + +layout(push_constant) uniform Push +{ + vec4 SourceSize; + uint FrameCount; +} registers; + +layout(std140, set = 0, binding = 0) uniform UBO +{ + mat4 MVP; + float crt_gamma; + float lcd_gamma; + float levels_contrast; + float halation_weight; + float diffusion_weight; + float bloom_underestimate_levels; + float bloom_excess; + float beam_min_sigma; + float beam_max_sigma; + float beam_spot_power; + float beam_min_shape; + float beam_max_shape; + float beam_shape_power; + float beam_horiz_filter; + float beam_horiz_sigma; + float beam_horiz_linear_rgb_weight; + float convergence_offset_x_r; + 
float convergence_offset_x_g; + float convergence_offset_x_b; + float convergence_offset_y_r; + float convergence_offset_y_g; + float convergence_offset_y_b; + float mask_type; + float mask_sample_mode_desired; + float mask_specify_num_triads; + float mask_triad_size_desired; + float mask_num_triads_desired; + float aa_subpixel_r_offset_x_runtime; + float aa_subpixel_r_offset_y_runtime; + float aa_cubic_c; + float aa_gauss_sigma; + float geom_mode_runtime; + float geom_radius; + float geom_view_dist; + float geom_tilt_angle_x; + float geom_tilt_angle_y; + float geom_aspect_ratio_x; + float geom_aspect_ratio_y; + float geom_overscan_x; + float geom_overscan_y; + float border_size; + float border_darkness; + float border_compress; + float interlace_bff; + float interlace_1080i; +} params; + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +// PASS SETTINGS: +// gamma-management.h needs to know what kind of pipeline we're using and +// what pass this is in that pipeline. This will become obsolete if/when we +// can #define things like this in the preset file. 
+#define FIRST_PASS +#define SIMULATE_CRT_ON_LCD + +// Set shader params for all passes here: +#pragma parameter crt_gamma "crt_gamma" 2.5 1.0 5.0 0.025 +#pragma parameter lcd_gamma "lcd_gamma" 2.2 1.0 5.0 0.025 +#pragma parameter levels_contrast "levels_contrast" 1.0 0.0 4.0 0.015625 +#pragma parameter halation_weight "halation_weight" 0.0 0.0 1.0 0.005 +#pragma parameter diffusion_weight "diffusion_weight" 0.075 0.0 1.0 0.005 +#pragma parameter bloom_underestimate_levels "bloom_underestimate_levels" 0.8 0.0 5.0 0.01 +#pragma parameter bloom_excess "bloom_excess" 0.0 0.0 1.0 0.005 +#pragma parameter beam_min_sigma "beam_min_sigma" 0.02 0.005 1.0 0.005 +#pragma parameter beam_max_sigma "beam_max_sigma" 0.3 0.005 1.0 0.005 +#pragma parameter beam_spot_power "beam_spot_power" 0.33 0.01 16.0 0.01 +#pragma parameter beam_min_shape "beam_min_shape" 2.0 2.0 32.0 0.1 +#pragma parameter beam_max_shape "beam_max_shape" 4.0 2.0 32.0 0.1 +#pragma parameter beam_shape_power "beam_shape_power" 0.25 0.01 16.0 0.01 +#pragma parameter beam_horiz_filter "beam_horiz_filter" 0.0 0.0 2.0 1.0 +#pragma parameter beam_horiz_sigma "beam_horiz_sigma" 0.35 0.0 0.67 0.005 +#pragma parameter beam_horiz_linear_rgb_weight "beam_horiz_linear_rgb_weight" 1.0 0.0 1.0 0.01 +#pragma parameter convergence_offset_x_r "convergence_offset_x_r" 0.0 -4.0 4.0 0.05 +#pragma parameter convergence_offset_x_g "convergence_offset_x_g" 0.0 -4.0 4.0 0.05 +#pragma parameter convergence_offset_x_b "convergence_offset_x_b" 0.0 -4.0 4.0 0.05 +#pragma parameter convergence_offset_y_r "convergence_offset_y_r" 0.0 -2.0 2.0 0.05 +#pragma parameter convergence_offset_y_g "convergence_offset_y_g" 0.0 -2.0 2.0 0.05 +#pragma parameter convergence_offset_y_b "convergence_offset_y_b" 0.0 -2.0 2.0 0.05 +#pragma parameter mask_type "mask_type" 1.0 0.0 2.0 1.0 +#pragma parameter mask_sample_mode_desired "mask_sample_mode" 0.0 0.0 2.0 1.0 // Consider blocking mode 2. 
+#pragma parameter mask_specify_num_triads "mask_specify_num_triads" 0.0 0.0 1.0 1.0 +#pragma parameter mask_triad_size_desired "mask_triad_size_desired" 3.0 1.0 18.0 0.125 +#pragma parameter mask_num_triads_desired "mask_num_triads_desired" 480.0 342.0 1920.0 1.0 +#pragma parameter aa_subpixel_r_offset_x_runtime "aa_subpixel_r_offset_x" -0.333333333 -0.333333333 0.333333333 0.333333333 +#pragma parameter aa_subpixel_r_offset_y_runtime "aa_subpixel_r_offset_y" 0.0 -0.333333333 0.333333333 0.333333333 +#pragma parameter aa_cubic_c "antialias_cubic_sharpness" 0.5 0.0 4.0 0.015625 +#pragma parameter aa_gauss_sigma "antialias_gauss_sigma" 0.5 0.0625 1.0 0.015625 +#pragma parameter geom_mode_runtime "geom_mode" 0.0 0.0 3.0 1.0 +#pragma parameter geom_radius "geom_radius" 2.0 0.16 1024.0 0.1 +#pragma parameter geom_view_dist "geom_view_dist" 2.0 0.5 1024.0 0.25 +#pragma parameter geom_tilt_angle_x "geom_tilt_angle_x" 0.0 -3.14159265 3.14159265 0.017453292519943295 +#pragma parameter geom_tilt_angle_y "geom_tilt_angle_y" 0.0 -3.14159265 3.14159265 0.017453292519943295 +#pragma parameter geom_aspect_ratio_x "geom_aspect_ratio_x" 432.0 1.0 512.0 1.0 +#pragma parameter geom_aspect_ratio_y "geom_aspect_ratio_y" 329.0 1.0 512.0 1.0 +#pragma parameter geom_overscan_x "geom_overscan_x" 1.0 0.00390625 4.0 0.00390625 +#pragma parameter geom_overscan_y "geom_overscan_y" 1.0 0.00390625 4.0 0.00390625 +#pragma parameter border_size "border_size" 0.015 0.0000001 0.5 0.005 +#pragma parameter border_darkness "border_darkness" 2.0 0.0 16.0 0.0625 +#pragma parameter border_compress "border_compress" 2.5 1.0 64.0 0.0625 +#pragma parameter interlace_bff "interlace_bff" 0.0 0.0 1.0 1.0 +#pragma parameter interlace_1080i "interlace_1080i" 0.0 0.0 1.0 1.0 + +////////////////////////////////// INCLUDES ////////////////////////////////// + +#include "../user-settings.h" +#include "bind-shader-params.h" +//#include "../../../../include/gamma-management.h" +//#include "scanline-functions.h" + +// 
// from scanline-functions.h //

// Guess whether the source is interlaced from its number of lines.
//
// Returns false whenever interlace_detect is off. Otherwise:
//   NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
//   NTSC Emulators: Typically 224 or 240 lines
//   PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
//   PAL Emulators: ?
//   ATSC: 720p, 1080i, 1080p
// Where do we place our cutoffs? Assumptions:
//   1.) We only need to care about active lines.
//   2.) Anything > 288 and <= 576 lines is probably interlaced.
//   3.) Anything > 576 lines is probably not interlaced...
//   4.) ...except 1080 lines, which is a crapshoot (user decision).
//   5.) Just in case the main program uses calculated video sizes,
//       we should nudge the float thresholds a bit.
bool is_interlaced(float num_lines)
{
    if(!interlace_detect)
    {
        return false;
    }
    bool sd_interlace = (num_lines > 288.5) && (num_lines < 576.5);
    // Rule 4 is a user decision: only treat 1080-line sources as interlaced
    // when the interlace_1080i parameter is switched on. (Previously the
    // parameter was ignored and every 1080-line source counted.)
    bool hd_interlace = (params.interlace_1080i > 0.5) &&
        (num_lines > 1079.5) && (num_lines < 1080.5);
    return sd_interlace || hd_interlace;
}
// end scanline-functions.h //
+ const float pal_gamma = 2.8; // Never actually 2.8 in practice + // Typical device decoding gammas (only use for emulating devices): + // CRT/LCD reference gammas are higher than NTSC and Rec.709 video standard + // gammas: The standards purposely undercorrected for an analog CRT's + // assumed 2.5 reference display gamma to maintain contrast in assumed + // [dark] viewing conditions: http://www.poynton.com/PDFs/GammaFAQ.pdf + // These unstated assumptions about display gamma and perceptual rendering + // intent caused a lot of confusion, and more modern CRT's seemed to target + // NTSC 2.2 gamma with circuitry. LCD displays seem to have followed suit + // (they struggle near black with 2.5 gamma anyway), especially PC/laptop + // displays designed to view sRGB in bright environments. (Standards are + // also in flux again with BT.1886, but it's underspecified for displays.) + const float crt_reference_gamma_high = 2.5; // In (2.35, 2.55) + const float crt_reference_gamma_low = 2.35; // In (2.35, 2.55) + const float lcd_reference_gamma = 2.5; // To match CRT + const float crt_office_gamma = 2.2; // Circuitry-adjusted for NTSC + const float lcd_office_gamma = 2.2; // Approximates sRGB +#endif // OVERRIDE_STANDARD_GAMMA + +// Assuming alpha == 1.0 might make it easier for users to avoid some bugs, +// but only if they're aware of it. +#ifndef OVERRIDE_ALPHA_ASSUMPTIONS + const bool assume_opaque_alpha = false; +#endif + + +/////////////////////// DERIVED CONSTANTS AS FUNCTIONS /////////////////////// + +// gamma-management.h should be compatible with overriding gamma values with +// runtime user parameters, but we can only define other global constants in +// terms of static constants, not uniform user parameters. To get around this +// limitation, we need to define derived constants using functions. 
// Device gamma getters. Defining OVERRIDE_DEVICE_GAMMA means the including
// shader promises to define crt_gamma, gba_gamma and lcd_gamma globally;
// otherwise the reference constants are used.
#ifdef OVERRIDE_DEVICE_GAMMA
    float get_crt_gamma() { return crt_gamma; }
    float get_gba_gamma() { return gba_gamma; }
    float get_lcd_gamma() { return lcd_gamma; }
#else
    float get_crt_gamma() { return crt_reference_gamma_high; }
    float get_gba_gamma() { return 3.5; }   // Game Boy Advance; in (3.0, 4.0)
    float get_lcd_gamma() { return lcd_office_gamma; }
#endif // OVERRIDE_DEVICE_GAMMA

// Decoding/encoding gammas for the first/last passes. Defining
// OVERRIDE_FINAL_GAMMA means the including shader promises to define
// intermediate_gamma, input_gamma and output_gamma globally.
#ifdef OVERRIDE_FINAL_GAMMA
    float get_intermediate_gamma() { return intermediate_gamma; }
    float get_input_gamma() { return input_gamma; }
    float get_output_gamma() { return output_gamma; }
#else
    // If we gamma-correct every pass, always use ntsc_gamma between passes so
    // middle passes don't need to care whether anything is being simulated:
    float get_intermediate_gamma() { return ntsc_gamma; }
    // The first SIMULATE_* macro that is defined wins, in this order:
    #if defined(SIMULATE_CRT_ON_LCD)
        float get_input_gamma() { return get_crt_gamma(); }
        float get_output_gamma() { return get_lcd_gamma(); }
    #elif defined(SIMULATE_GBA_ON_LCD)
        float get_input_gamma() { return get_gba_gamma(); }
        float get_output_gamma() { return get_lcd_gamma(); }
    #elif defined(SIMULATE_LCD_ON_CRT)
        float get_input_gamma() { return get_lcd_gamma(); }
        float get_output_gamma() { return get_crt_gamma(); }
    #elif defined(SIMULATE_GBA_ON_CRT)
        float get_input_gamma() { return get_gba_gamma(); }
        float get_output_gamma() { return get_crt_gamma(); }
    #else   // Don't simulate anything:
        float get_input_gamma() { return ntsc_gamma; }
        float get_output_gamma() { return ntsc_gamma; }
    #endif
#endif // OVERRIDE_FINAL_GAMMA
// Per-pass gamma policy. With GAMMA_ENCODE_EVERY_FBO every pass decodes its
// input and encodes its output (using the intermediate gamma between passes).
// Without it, only the first pass decodes and only the last pass encodes;
// all other conversions use gamma 1.0.
#ifdef GAMMA_ENCODE_EVERY_FBO
    const bool linearize_input = true;
    const bool gamma_encode_output = true;
    #ifdef FIRST_PASS
        float get_pass_input_gamma() { return get_input_gamma(); }
    #else
        float get_pass_input_gamma() { return get_intermediate_gamma(); }
    #endif
    #ifdef LAST_PASS
        float get_pass_output_gamma() { return get_output_gamma(); }
    #else
        float get_pass_output_gamma() { return get_intermediate_gamma(); }
    #endif
#else
    #ifdef FIRST_PASS
        const bool linearize_input = true;
        float get_pass_input_gamma() { return get_input_gamma(); }
    #else
        const bool linearize_input = false;
        float get_pass_input_gamma() { return 1.0; }
    #endif
    #ifdef LAST_PASS
        const bool gamma_encode_output = true;
        float get_pass_output_gamma() { return get_output_gamma(); }
    #else
        const bool gamma_encode_output = false;
        float get_pass_output_gamma() { return 1.0; }
    #endif
#endif

// Raises color.rgb to this pass's input gamma when linearize_input is set;
// otherwise returns color untouched. Alpha is forced to 1.0 only when
// assume_opaque_alpha is set.
vec4 decode_input(const vec4 color)
{
    if(!linearize_input)
    {
        return color;
    }
    vec3 decoded_rgb = pow(color.rgb, vec3(get_pass_input_gamma()));
    return vec4(decoded_rgb, assume_opaque_alpha ? 1.0 : color.a);
}

// Raises color.rgb to 1.0/(this pass's output gamma) when gamma_encode_output
// is set; otherwise returns color untouched. Alpha is handled as in
// decode_input().
vec4 encode_output(const vec4 color)
{
    if(!gamma_encode_output)
    {
        return color;
    }
    vec3 encoded_rgb = pow(color.rgb, vec3(1.0/get_pass_output_gamma()));
    return vec4(encoded_rgb, assume_opaque_alpha ? 1.0 : color.a);
}

// Sample a texture and decode it with decode_input() in one step:
#define tex2D_linearize(C, D) decode_input(vec4(texture(C, D)))
//vec4 tex2D_linearize(const sampler2D tex, const vec2 tex_coords)
//{   return decode_input(vec4(texture(tex, tex_coords)));   }

//#define tex2D_linearize(C, D, E) decode_input(vec4(texture(C, D, E)))
//vec4 tex2D_linearize(const sampler2D tex, const vec2 tex_coords, const int texel_off)
//{   return decode_input(vec4(texture(tex, tex_coords, texel_off)));    }

// end gamma-management.h //
#pragma stage vertex
layout(location = 0) in vec4 Position;
layout(location = 1) in vec2 TexCoord;
layout(location = 0) out vec2 tex_uv;
layout(location = 1) out vec2 uv_step;

// Vertex stage: transform the vertex and pass the texture coordinate and the
// uv distance between neighboring source texels to the fragment stage.
void main()
{
    gl_Position = params.MVP * Position;
    tex_uv = TexCoord;

    // Save the uv distance between texels:
    uv_step = vec2(1.0) * registers.SourceSize.zw;
}

#pragma stage fragment
layout(location = 0) in vec2 tex_uv;
layout(location = 1) in vec2 uv_step;
layout(location = 0) out vec4 FragColor;
layout(set = 0, binding = 2) uniform sampler2D Source;

// Fragment stage: linearize the input based on CRT gamma and bob interlaced
// fields. Bobbing ensures we can immediately blur without getting artifacts.
// Note: TFF/BFF won't matter for sources that double-weave or similar.
void main()
{
    // Detect interlacing from the source height:
    const vec2 video_size = registers.SourceSize.xy;
    const bool interlaced = is_interlaced(video_size.y);

    if(interlace_detect)
    {
        // Sample the current line and an average of the previous/next line;
        // tex2D_linearize will decode CRT gamma. Don't bother branching:
        const vec2 v_step = vec2(0.0, uv_step.y);
        const vec3 curr_line = tex2D_linearize(
            Source, tex_uv).rgb;
        const vec3 last_line = tex2D_linearize(
            Source, tex_uv - v_step).rgb;
        const vec3 next_line = tex2D_linearize(
            Source, tex_uv + v_step).rgb;
        const vec3 interpolated_line = 0.5 * (last_line + next_line);
        // If we're interlacing, determine which field curr_line is in.
        // A modulus of 1.0 makes both mod() calls below return 0.0, so
        // progressive sources always keep curr_line. (The old test,
        // "interlaced = true", was an assignment, so the modulus was always
        // 2.0 and every source got bobbed.)
        const float modulus = interlaced ? 2.0 : 1.0;
        // Only the frame parity matters for either modulus. Take it in
        // integer math: a float cannot hold odd values above 2^24, so
        // converting the raw counter would eventually freeze the field.
        const float frame_parity = float(registers.FrameCount % 2u);
        const float field_offset =
            mod(frame_parity + float(params.interlace_bff), modulus);
        const float curr_line_texel = tex_uv.y * registers.SourceSize.y;
        // Use under_half to fix a rounding bug around exact texel locations.
        const float line_num_last = floor(curr_line_texel - under_half);
        const float wrong_field = mod(line_num_last + field_offset, modulus);
        // Select the correct color, and output the result:
        const vec3 color = mix(curr_line, interpolated_line, wrong_field);
        FragColor = encode_output(vec4(color, 1.0));
    }
    else
    {
        FragColor = encode_output(tex2D_linearize(Source, tex_uv));
    }
}
+ + +////////////////////////////////// INCLUDES ////////////////////////////////// + +#include "../user-settings.h" +#include "user-preset-constants.h" + + +/////////////////////////////// FIXED SETTINGS /////////////////////////////// + +// Avoid dividing by zero; using a macro overloads for float, vec2, etc.: +#define FIX_ZERO(c) (max(abs(c), 0.0000152587890625)) // 2^-16 + +// Ensure the first pass decodes CRT gamma and the last encodes LCD gamma. +#ifndef SIMULATE_CRT_ON_LCD + #define SIMULATE_CRT_ON_LCD +#endif + +// Manually tiling a manually resized texture creates texture coord derivative +// discontinuities and confuses anisotropic filtering, causing discolored tile +// seams in the phosphor mask. Workarounds: +// a.) Using tex2Dlod disables anisotropic filtering for tiled masks. It's +// downgraded to tex2Dbias without DRIVERS_ALLOW_TEX2DLOD #defined and +// disabled without DRIVERS_ALLOW_TEX2DBIAS #defined either. +// b.) "Tile flat twice" requires drawing two full tiles without border padding +// to the resized mask FBO, and it's incompatible with same-pass curvature. +// (Same-pass curvature isn't used but could be in the future...maybe.) +// c.) "Fix discontinuities" requires derivatives and drawing one tile with +// border padding to the resized mask FBO, but it works with same-pass +// curvature. It's disabled without DRIVERS_ALLOW_DERIVATIVES #defined. +// Precedence: a, then, b, then c (if multiple strategies are #defined). + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD // 129.7 FPS, 4x, flat; 101.8 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE // 128.1 FPS, 4x, flat; 101.5 at fullscreen + #define ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES // 124.4 FPS, 4x, flat; 97.4 at fullscreen +// Also, manually resampling the phosphor mask is slightly blurrier with +// anisotropic filtering. (Resampling with mipmapping is even worse: It +// creates artifacts, but only with the fully bloomed shader.) 
The difference +// is subtle with small triads, but you can fix it for a small cost. + //#define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + + +////////////////////////////// DERIVED SETTINGS ////////////////////////////// + +// Intel HD 4000 GPU's can't handle manual mask resizing (for now), setting the +// geometry mode at runtime, or a 4x4 true Gaussian resize. Disable +// incompatible settings ASAP. (INTEGRATED_GRAPHICS_COMPATIBILITY_MODE may be +// #defined by either user-settings.h or a wrapper .cg that #includes the +// current .cg pass.) +#ifdef INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE + #undef PHOSPHOR_MASK_MANUALLY_RESIZE + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + // Mode 2 (4x4 Gaussian resize) won't work, and mode 1 (3x3 blur) is + // inferior in most cases, so replace 2.0 with 0.0: + const float bloom_approx_filter = + bloom_approx_filter_static > 1.5 ? 0.0 : bloom_approx_filter_static; +#else + const float bloom_approx_filter = bloom_approx_filter_static; +#endif + +// Disable slow runtime paths if static parameters are used. Most of these +// won't be a problem anyway once the params are disabled, but some will. +#ifndef RUNTIME_SHADER_PARAMS_ENABLE + #ifdef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #undef RUNTIME_PHOSPHOR_BLOOM_SIGMA + #endif + #ifdef RUNTIME_ANTIALIAS_WEIGHTS + #undef RUNTIME_ANTIALIAS_WEIGHTS + #endif + #ifdef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #undef RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS + #endif + #ifdef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #undef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #endif + #ifdef RUNTIME_GEOMETRY_TILT + #undef RUNTIME_GEOMETRY_TILT + #endif + #ifdef RUNTIME_GEOMETRY_MODE + #undef RUNTIME_GEOMETRY_MODE + #endif + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #undef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// Make tex2Dbias a backup for tex2Dlod for wider compatibility. 
+#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DBIAS +#endif +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #define ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS +#endif +// Rule out unavailable anisotropic compatibility strategies: +#ifndef DRIVERS_ALLOW_DERIVATIVES + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #endif + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + #undef ANTIALIAS_DISABLE_ANISOTROPIC + #endif +#endif +#ifndef DRIVERS_ALLOW_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif +// Prioritize anisotropic tiling compatibility strategies by performance and +// disable unused strategies. This concentrates all the nesting in one place. +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif +#else + #ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #undef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #endif + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #else + // ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE is only compatible with + // flat texture coords in the same pass, but that's all we use. 
+ #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #undef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + #endif + #endif + #endif +#endif +// The tex2Dlod and tex2Dbias strategies share a lot in common, and we can +// reduce some #ifdef nesting in the next section by essentially OR'ing them: +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DLOD + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +#ifdef ANISOTROPIC_TILING_COMPAT_TEX2DBIAS + #define ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY +#endif +// Prioritize anisotropic resampling compatibility strategies the same way: +#ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #undef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS + #endif +#endif + + +/////////////////////// DERIVED PHOSPHOR MASK CONSTANTS ////////////////////// + +// If we can use the large mipmapped LUT without mipmapping artifacts, we +// should: It gives us more options for using fewer samples. +#ifdef DRIVERS_ALLOW_TEX2DLOD + #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD + // TODO: Take advantage of this! + #define PHOSPHOR_MASK_RESIZE_MIPMAPPED_LUT + const vec2 mask_resize_src_lut_size = mask_texture_large_size; + #else + const vec2 mask_resize_src_lut_size = mask_texture_small_size; + #endif +#else + const vec2 mask_resize_src_lut_size = mask_texture_small_size; +#endif + + +// tex2D's sampler2D parameter MUST be a uniform global, a uniform input to +// main_fragment, or a static alias of one of the above. This makes it hard +// to select the phosphor mask at runtime: We can't even assign to a uniform +// global in the vertex shader or select a sampler2D in the vertex shader and +// pass it to the fragment shader (even with explicit TEXUNIT# bindings), +// because it just gives us the input texture or a black screen. 
However, we +// can get around these limitations by calling tex2D three times with different +// uniform samplers (or resizing the phosphor mask three times altogether). +// With dynamic branches, we can process only one of these branches on top of +// quickly discarding fragments we don't need (cgc seems able to overcome +// limitations around dependent texture fetches inside of branches). Without +// dynamic branches, we have to process every branch for every fragment...which +// is slower. Runtime sampling mode selection is slower without dynamic +// branches as well. Let the user's static #defines decide if it's worth it. +#ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT +#else + #ifdef FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #define RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + #endif +#endif + +// We need to render some minimum number of tiles in the resize passes. +// We need at least 1.0 just to repeat a single tile, and we need extra +// padding beyond that for anisotropic filtering, discontinuity fixing, +// antialiasing, same-pass curvature (not currently used), etc. 
First +// determine how many border texels and tiles we need, based on how the result +// will be sampled: +#ifdef GEOMETRY_EARLY + const float max_subpixel_offset = aa_subpixel_r_offset_static.x; + // Most antialiasing filters have a base radius of 4.0 pixels: + const float max_aa_base_pixel_border = 4.0 + + max_subpixel_offset; +#else + const float max_aa_base_pixel_border = 0.0; +#endif +// Anisotropic filtering adds about 0.5 to the pixel border: +#ifndef ANISOTROPIC_TILING_COMPAT_TEX2DLOD_FAMILY + const float max_aniso_pixel_border = max_aa_base_pixel_border + 0.5; +#else + const float max_aniso_pixel_border = max_aa_base_pixel_border; +#endif +// Fixing discontinuities adds 1.0 more to the pixel border: +#ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES + const float max_tiled_pixel_border = max_aniso_pixel_border + 1.0; +#else + const float max_tiled_pixel_border = max_aniso_pixel_border; +#endif +// Convert the pixel border to an integer texel border. Assume same-pass +// curvature about triples the texel frequency: +#ifdef GEOMETRY_EARLY + const float max_mask_texel_border = + ceil(max_tiled_pixel_border * 3.0); +#else + const float max_mask_texel_border = ceil(max_tiled_pixel_border); +#endif +// Convert the texel border to a tile border using worst-case assumptions: + const float max_mask_tile_border = max_mask_texel_border/ + (mask_min_allowed_triad_size * mask_triads_per_tile); + +// Finally, set the number of resized tiles to render to MASK_RESIZE, and set +// the starting texel (inside borders) for sampling it. +#ifndef GEOMETRY_EARLY + #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE + // Special case: Render two tiles without borders. Anisotropic + // filtering doesn't seem to be a problem here. 
+ const float mask_resize_num_tiles = 1.0 + 1.0; + const float mask_start_texels = 0.0; + #else + const float mask_resize_num_tiles = 1.0 + + 2.0 * max_mask_tile_border; + const float mask_start_texels = max_mask_texel_border; + #endif +#else + const float mask_resize_num_tiles = 1.0 + 2.0*max_mask_tile_border; + const float mask_start_texels = max_mask_texel_border; +#endif + +// We have to fit mask_resize_num_tiles into an FBO with a viewport scale of +// mask_resize_viewport_scale. This limits the maximum final triad size. +// Estimate the minimum number of triads we can split the screen into in each +// dimension (we'll be as correct as mask_resize_viewport_scale is): + const float mask_resize_num_triads = + mask_resize_num_tiles * mask_triads_per_tile; + const vec2 min_allowed_viewport_triads = + vec2(mask_resize_num_triads) / mask_resize_viewport_scale; + + +//////////////////////// COMMON MATHEMATICAL CONSTANTS /////////////////////// + + const float pi = 3.141592653589; +// We often want to find the location of the previous texel, e.g.: +// const vec2 curr_texel = uv * texture_size; +// const vec2 prev_texel = floor(curr_texel - vec2(0.5)) + vec2(0.5); +// const vec2 prev_texel_uv = prev_texel / texture_size; +// However, many GPU drivers round incorrectly around exact texel locations. +// We need to subtract a little less than 0.5 before flooring, and some GPU's +// require this value to be farther from 0.5 than others; define it here. 
+// const vec2 prev_texel = +// floor(curr_texel - vec2(under_half)) + vec2(0.5); + const float under_half = 0.4995; + + +#endif // DERIVED_SETTINGS_AND_CONSTANTS_H + diff --git a/crt/shaders/crt-royale/src/geometry-functions.h b/crt/shaders/crt-royale/src/geometry-functions.h new file mode 100644 index 0000000..fff281c --- /dev/null +++ b/crt/shaders/crt-royale/src/geometry-functions.h @@ -0,0 +1,693 @@ +#ifndef GEOMETRY_FUNCTIONS_H +#define GEOMETRY_FUNCTIONS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +#include "../user-settings.h" +#include "derived-settings-and-constants.h" +#include "bind-shader-params.h" + + +//////////////////////////// MACROS AND CONSTANTS //////////////////////////// + +// Curvature-related constants: +#define MAX_POINT_CLOUD_SIZE 9 + + +///////////////////////////// CURVATURE FUNCTIONS ///////////////////////////// + +vec2 quadratic_solve(const float a, const float b_over_2, const float c) +{ + // Requires: 1.) a, b, and c are quadratic formula coefficients + // 2.) b_over_2 = b/2.0 (simplifies terms to factor 2 out) + // 3.) 
vec2 quadratic_solve(const float a, const float b_over_2, const float c)
{
    //  Requires:   1.) a, b, and c are quadratic formula coefficients
    //              2.) b_over_2 = b/2.0 (simplifies terms to factor 2 out)
    //              3.) b_over_2 must be guaranteed < 0.0 (avoids a branch)
    //  Returns:    vec2(first_solution, discriminant). The discriminant is
    //              handed back so the caller can choose how to deal with the
    //              "no intersection" case. The root is computed in the
    //              Kahan/Citardauq form, c/(-b/2 + sqrt(disc)), for numerical
    //              robustness.
    const float disc = b_over_2*b_over_2 - a*c;
    const float first_root = c/(sqrt(disc) - b_over_2);
    return vec2(first_root, disc);
}

vec2 intersect_sphere(const vec3 view_vec, const vec3 eye_pos_vec)
{
    //  Requires:   1.) view_vec and eye_pos_vec are 3D vectors in the sphere's
    //                  local coordinate frame (eye_pos_vec is a position, i.e.
    //                  a vector from the origin to the eye/camera)
    //              2.) geom_radius is a global containing the sphere's radius
    //  Returns:    Cast a ray of direction view_vec from eye_pos_vec at a
    //              sphere of radius geom_radius, and return the distance to
    //              the first intersection in units of length(view_vec),
    //              packed with the discriminant as in quadratic_solve().
    //              http://wiki.cgsociety.org/index.php/Ray_Sphere_Intersection
    //  Quadratic coefficients, with the factor 2.0 taken out of the middle
    //  term (which is guaranteed negative):
    return quadratic_solve(
        dot(view_vec, view_vec),
        dot(view_vec, eye_pos_vec),
        dot(eye_pos_vec, eye_pos_vec) - geom_radius*geom_radius);
}
+ // Arbitrary "cylinder top" reference point for an infinite cylinder: + const vec3 cylinder_top_vec = vec3(0.0, geom_radius, 0.0); + const vec3 cylinder_axis_vec = vec3(0.0, 1.0, 0.0);//vec3(0.0, 2.0*geom_radius, 0.0); + const vec3 top_to_eye_vec = eye_pos_vec - cylinder_top_vec; + const vec3 axis_x_view = cross(cylinder_axis_vec, view_vec); + const vec3 axis_x_top_to_eye = cross(cylinder_axis_vec, top_to_eye_vec); + // Quadratic formula coefficients (b_over_2 is guaranteed negative): + const float a = dot(axis_x_view, axis_x_view); + const float b_over_2 = dot(axis_x_top_to_eye, axis_x_view); + const float c = dot(axis_x_top_to_eye, axis_x_top_to_eye) - + geom_radius*geom_radius;//*dot(cylinder_axis_vec, cylinder_axis_vec); + return quadratic_solve(a, b_over_2, c); +} + +vec2 cylinder_xyz_to_uv(const vec3 intersection_pos_local, + const vec2 geom_aspect) +{ + // Requires: An xyz intersection position on a cylinder. + // Returns: video_uv coords mapped to range [-0.5, 0.5] + // Mapping: Define square_uv.x to be the signed arc length in xz-space, + // and define square_uv.y = -intersection_pos_local.y (+v = -y). + // Start with a numerically robust arc length calculation. + const float angle_from_image_center = atan2(intersection_pos_local.x, + intersection_pos_local.z); + const float signed_arc_len = angle_from_image_center * geom_radius; + // Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide + // by the aspect ratio to stretch the mapping appropriately: + const vec2 square_uv = vec2(signed_arc_len, -intersection_pos_local.y); + const vec2 video_uv = square_uv / geom_aspect; + return video_uv; +} + +vec3 cylinder_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect) +{ + // Requires: video_uv coords mapped to range [-0.5, 0.5] + // Returns: An xyz intersection position on a cylinder. This is the + // inverse of cylinder_xyz_to_uv(). 
+ // Expand video_uv by the aspect ratio to get proportionate x/y lengths, + // then calculate an xyz position for the cylindrical mapping above. + const vec2 square_uv = video_uv * geom_aspect; + const float arc_len = square_uv.x; + const float angle_from_image_center = arc_len / geom_radius; + const float x_pos = sin(angle_from_image_center) * geom_radius; + const float z_pos = cos(angle_from_image_center) * geom_radius; + // Or: z = sqrt(geom_radius**2 - x**2) + // Or: z = geom_radius/sqrt(1.0 + tan(angle)**2), x = z * tan(angle) + const vec3 intersection_pos_local = vec3(x_pos, -square_uv.y, z_pos); + return intersection_pos_local; +} + +vec2 sphere_xyz_to_uv(const vec3 intersection_pos_local, + const vec2 geom_aspect) +{ + // Requires: An xyz intersection position on a sphere. + // Returns: video_uv coords mapped to range [-0.5, 0.5] + // Mapping: First define square_uv.x/square_uv.y == + // intersection_pos_local.x/intersection_pos_local.y. Then, + // length(square_uv) is the arc length from the image center + // at (0.0, 0.0, geom_radius) along the tangent great circle. + // Credit for this mapping goes to cgwg: I never managed to + // understand his code, but he told me his mapping was based on + // great circle distances when I asked him about it, which + // informed this very similar (almost identical) mapping. 
+ // Start with a numerically robust arc length calculation between the ray- + // sphere intersection point and the image center using a method posted by + // Roger Stafford on comp.soft-sys.matlab: + // https://groups.google.com/d/msg/comp.soft-sys.matlab/zNbUui3bjcA/c0HV_bHSx9cJ + const vec3 image_center_pos_local = vec3(0.0, 0.0, geom_radius); + const float cp_len = + length(cross(intersection_pos_local, image_center_pos_local)); + const float dp = dot(intersection_pos_local, image_center_pos_local); + const float angle_from_image_center = atan2(cp_len, dp); + const float arc_len = angle_from_image_center * geom_radius; + // Get a uv-mapping where [-0.5, 0.5] maps to a "square" area, then divide + // by the aspect ratio to stretch the mapping appropriately: + const vec2 square_uv_unit = normalize(vec2(intersection_pos_local.x, + -intersection_pos_local.y)); + const vec2 square_uv = arc_len * square_uv_unit; + const vec2 video_uv = square_uv / geom_aspect; + return video_uv; +} + +vec3 sphere_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect) +{ + // Requires: video_uv coords mapped to range [-0.5, 0.5] + // Returns: An xyz intersection position on a sphere. This is the + // inverse of sphere_xyz_to_uv(). + // Expand video_uv by the aspect ratio to get proportionate x/y lengths, + // then calculate an xyz position for the spherical mapping above. + const vec2 square_uv = video_uv * geom_aspect; + // Using length or sqrt here butchers the framerate on my 8800GTS if + // this function is called too many times, and so does taking the max + // component of square_uv/square_uv_unit (program length threshold?). 
+ //float arc_len = length(square_uv); + const vec2 square_uv_unit = normalize(square_uv); + const float arc_len = square_uv.y/square_uv_unit.y; + const float angle_from_image_center = arc_len / geom_radius; + const float xy_dist_from_sphere_center = + sin(angle_from_image_center) * geom_radius; + //vec2 xy_pos = xy_dist_from_sphere_center * (square_uv/FIX_ZERO(arc_len)); + const vec2 xy_pos = xy_dist_from_sphere_center * square_uv_unit; + const float z_pos = cos(angle_from_image_center) * geom_radius; + const vec3 intersection_pos_local = vec3(xy_pos.x, -xy_pos.y, z_pos); + return intersection_pos_local; +} + +vec2 sphere_alt_xyz_to_uv(const vec3 intersection_pos_local, + const vec2 geom_aspect) +{ + // Requires: An xyz intersection position on a sphere. + // Returns: video_uv coords mapped to range [-0.5, 0.5] + // Mapping: Define square_uv.x to be the signed arc length in xz-space, + // and define square_uv.y == signed arc length in yz-space. + // See cylinder_xyz_to_uv() for implementation details (very similar). + const vec2 angle_from_image_center = atan2( + vec2(intersection_pos_local.x, -intersection_pos_local.y), + intersection_pos_local.zz); + const vec2 signed_arc_len = angle_from_image_center * geom_radius; + const vec2 video_uv = signed_arc_len / geom_aspect; + return video_uv; +} + +vec3 sphere_alt_uv_to_xyz(const vec2 video_uv, const vec2 geom_aspect) +{ + // Requires: video_uv coords mapped to range [-0.5, 0.5] + // Returns: An xyz intersection position on a sphere. This is the + // inverse of sphere_alt_xyz_to_uv(). + // See cylinder_uv_to_xyz() for implementation details (very similar). 
+ const vec2 square_uv = video_uv * geom_aspect; + const vec2 arc_len = square_uv; + const vec2 angle_from_image_center = arc_len / geom_radius; + const vec2 xy_pos = sin(angle_from_image_center) * geom_radius; + const float z_pos = sqrt(geom_radius*geom_radius - dot(xy_pos, xy_pos)); + return vec3(xy_pos.x, -xy_pos.y, z_pos); +} + +inline vec2 intersect(const vec3 view_vec_local, const vec3 eye_pos_local, + const float geom_mode) +{ + return geom_mode < 2.5 ? intersect_sphere(view_vec_local, eye_pos_local) : + intersect_cylinder(view_vec_local, eye_pos_local); +} + +inline vec2 xyz_to_uv(const vec3 intersection_pos_local, + const vec2 geom_aspect, const float geom_mode) +{ + return geom_mode < 1.5 ? + sphere_xyz_to_uv(intersection_pos_local, geom_aspect) : + geom_mode < 2.5 ? + sphere_alt_xyz_to_uv(intersection_pos_local, geom_aspect) : + cylinder_xyz_to_uv(intersection_pos_local, geom_aspect); +} + +inline vec3 uv_to_xyz(const vec2 uv, const vec2 geom_aspect, + const float geom_mode) +{ + return geom_mode < 1.5 ? sphere_uv_to_xyz(uv, geom_aspect) : + geom_mode < 2.5 ? sphere_alt_uv_to_xyz(uv, geom_aspect) : + cylinder_uv_to_xyz(uv, geom_aspect); +} + +vec2 view_vec_to_uv(const vec3 view_vec_local, const vec3 eye_pos_local, + const vec2 geom_aspect, const float geom_mode, out vec3 intersection_pos) +{ + // Get the intersection point on the primitive, given an eye position + // and view vector already in its local coordinate frame: + const vec2 intersect_dist_and_discriminant = intersect(view_vec_local, + eye_pos_local, geom_mode); + const vec3 intersection_pos_local = eye_pos_local + + view_vec_local * intersect_dist_and_discriminant.x; + // Save the intersection position to an output parameter: + intersection_pos = intersection_pos_local; + // Transform into uv coords, but give out-of-range coords if the + // view ray doesn't intersect the primitive in the first place: + return intersect_dist_and_discriminant.y > 0.005 ? 
+ xyz_to_uv(intersection_pos_local, geom_aspect, geom_mode) : vec2(1.0); +} + +vec3 get_ideal_global_eye_pos_for_points(vec3 eye_pos, + const vec2 geom_aspect, const vec3 global_coords[MAX_POINT_CLOUD_SIZE], + const int num_points) +{ + // Requires: Parameters: + // 1.) Starting eye_pos is a global 3D position at which the + // camera contains all points in global_coords[] in its FOV + // 2.) geom_aspect = get_aspect_vector( + // IN.output_size.x / IN.output_size.y); + // 3.) global_coords is a point cloud containing global xyz + // coords of extreme points on the simulated CRT screen. + // Globals: + // 1.) geom_view_dist must be > 0.0. It controls the "near + // plane" used to interpret flat_video_uv as a view + // vector, which controls the field of view (FOV). + // Eyespace coordinate frame: +x = right, +y = up, +z = back + // Returns: Return an eye position at which the point cloud spans as + // much of the screen as possible (given the FOV controlled by + // geom_view_dist) without being cropped or sheared. + // Algorithm: + // 1.) Move the eye laterally to a point which attempts to maximize the + // the amount we can move forward without clipping the CRT screen. + // 2.) Move forward by as much as possible without clipping the CRT. 
+ // Get the allowed movement range by solving for the eye_pos offsets + // that result in each point being projected to a screen edge/corner in + // pseudo-normalized device coords (where xy ranges from [-0.5, 0.5] + // and z = eyespace z): + // pndc_coord = vec3(vec2(eyespace_xyz.x, -eyespace_xyz.y)* + // geom_view_dist / (geom_aspect * -eyespace_xyz.z), eyespace_xyz.z); + // Notes: + // The field of view is controlled by geom_view_dist's magnitude relative to + // the view vector's x and y components: + // view_vec.xy ranges from [-0.5, 0.5] * geom_aspect + // view_vec.z = -geom_view_dist + // But for the purposes of perspective divide, it should be considered: + // view_vec.xy ranges from [-0.5, 0.5] * geom_aspect / geom_view_dist + // view_vec.z = -1.0 + const int max_centering_iters = 1; // Keep for easy testing. + for(int iter = 0; iter < max_centering_iters; iter++) + { + // 0.) Get the eyespace coordinates of our point cloud: + vec3 eyespace_coords[MAX_POINT_CLOUD_SIZE]; + for(int i = 0; i < num_points; i++) + { + eyespace_coords[i] = global_coords[i] - eye_pos; + } + // 1a.)For each point, find out how far we can move eye_pos in each + // lateral direction without the point clipping the frustum. + // Eyespace +y = up, screenspace +y = down, so flip y after + // applying the eyespace offset (on the way to "clip space"). + // Solve for two offsets per point based on: + // (eyespace_xyz.xy - offset_dr) * vec2(1.0, -1.0) * + // geom_view_dist / (geom_aspect * -eyespace_xyz.z) = vec2(-0.5) + // (eyespace_xyz.xy - offset_dr) * vec2(1.0, -1.0) * + // geom_view_dist / (geom_aspect * -eyespace_xyz.z) = vec2(0.5) + // offset_ul and offset_dr represent the farthest we can move the + // eye_pos up-left and down-right. Save the min of all offset_dr's + // and the max of all offset_ul's (since it's negative). + float abs_radius = abs(geom_radius); // In case anyone gets ideas. 
;) + vec2 offset_dr_min = vec2(10.0 * abs_radius, 10.0 * abs_radius); + vec2 offset_ul_max = vec2(-10.0 * abs_radius, -10.0 * abs_radius); + for(int i = 0; i < num_points; i++) + { + const vec2 flipy = vec2(1.0, -1.0); + vec3 eyespace_xyz = eyespace_coords[i]; + vec2 offset_dr = eyespace_xyz.xy - vec2(-0.5) * + (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy); + vec2 offset_ul = eyespace_xyz.xy - vec2(0.5) * + (geom_aspect * -eyespace_xyz.z) / (geom_view_dist * flipy); + offset_dr_min = min(offset_dr_min, offset_dr); + offset_ul_max = max(offset_ul_max, offset_ul); + } + // 1b.)Update eye_pos: Adding the average of offset_ul_max and + // offset_dr_min gives it equal leeway on the top vs. bottom + // and left vs. right. Recalculate eyespace_coords accordingly. + vec2 center_offset = 0.5 * (offset_ul_max + offset_dr_min); + eye_pos.xy += center_offset; + for(int i = 0; i < num_points; i++) + { + eyespace_coords[i] = global_coords[i] - eye_pos; + } + // 2a.)For each point, find out how far we can move eye_pos forward + // without the point clipping the frustum. Flip the y + // direction in advance (matters for a later step, not here). + // Solve for four offsets per point based on: + // eyespace_xyz_flipy.x * geom_view_dist / + // (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) =-0.5 + // eyespace_xyz_flipy.y * geom_view_dist / + // (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) =-0.5 + // eyespace_xyz_flipy.x * geom_view_dist / + // (geom_aspect.x * (offset_z - eyespace_xyz_flipy.z)) = 0.5 + // eyespace_xyz_flipy.y * geom_view_dist / + // (geom_aspect.y * (offset_z - eyespace_xyz_flipy.z)) = 0.5 + // We'll vectorize the actual computation. Take the maximum of + // these four for a single offset, and continue taking the max + // for every point (use max because offset.z is negative). 
+ float offset_z_max = -10.0 * geom_radius * geom_view_dist; + for(int i = 0; i < num_points; i++) + { + vec3 eyespace_xyz_flipy = eyespace_coords[i] * + vec3(1.0, -1.0, 1.0); + vec4 offset_zzzz = eyespace_xyz_flipy.zzzz + + (eyespace_xyz_flipy.xyxy * geom_view_dist) / + (vec4(-0.5, -0.5, 0.5, 0.5) * vec4(geom_aspect, geom_aspect)); + // Ignore offsets that push positive x/y values to opposite + // boundaries, and vice versa, and don't let the camera move + // past a point in the dead center of the screen: + offset_z_max = (eyespace_xyz_flipy.x < 0.0) ? + max(offset_z_max, offset_zzzz.x) : offset_z_max; + offset_z_max = (eyespace_xyz_flipy.y < 0.0) ? + max(offset_z_max, offset_zzzz.y) : offset_z_max; + offset_z_max = (eyespace_xyz_flipy.x > 0.0) ? + max(offset_z_max, offset_zzzz.z) : offset_z_max; + offset_z_max = (eyespace_xyz_flipy.y > 0.0) ? + max(offset_z_max, offset_zzzz.w) : offset_z_max; + offset_z_max = max(offset_z_max, eyespace_xyz_flipy.z); + } + // 2b.)Update eye_pos: Add the maximum (smallest negative) z offset. + eye_pos.z += offset_z_max; + } + return eye_pos; +} + +vec3 get_ideal_global_eye_pos(const vec3x3 local_to_global, + const vec2 geom_aspect, const float geom_mode) +{ + // Start with an initial eye_pos that includes the entire primitive + // (sphere or cylinder) in its field-of-view: + const vec3 high_view = vec3(0.0, geom_aspect.y, -geom_view_dist); + const vec3 low_view = high_view * vec3(1.0, -1.0, 1.0); + const float len_sq = dot(high_view, high_view); + const float fov = abs(acos(dot(high_view, low_view)/len_sq)); + // Trigonometry/similar triangles say distance = geom_radius/sin(fov/2): + const float eye_z_spherical = geom_radius/sin(fov*0.5); + const vec3 eye_pos = geom_mode < 2.5 ? + vec3(0.0, 0.0, eye_z_spherical) : + vec3(0.0, 0.0, max(geom_view_dist, eye_z_spherical)); + + // Get global xyz coords of extreme sample points on the simulated CRT + // screen. Start with the center, edge centers, and corners of the + // video image. 
We can't ignore backfacing points: They're occluded + // by closer points on the primitive, but they may NOT be occluded by + // the convex hull of the remaining samples (i.e. the remaining convex + // hull might not envelop points that do occlude a back-facing point.) + const int num_points = MAX_POINT_CLOUD_SIZE; + vec3 global_coords[MAX_POINT_CLOUD_SIZE]; + global_coords[0] = mul(local_to_global, uv_to_xyz(vec2(0.0, 0.0), geom_aspect, geom_mode)); + global_coords[1] = mul(local_to_global, uv_to_xyz(vec2(0.0, -0.5), geom_aspect, geom_mode)); + global_coords[2] = mul(local_to_global, uv_to_xyz(vec2(0.0, 0.5), geom_aspect, geom_mode)); + global_coords[3] = mul(local_to_global, uv_to_xyz(vec2(-0.5, 0.0), geom_aspect, geom_mode)); + global_coords[4] = mul(local_to_global, uv_to_xyz(vec2(0.5, 0.0), geom_aspect, geom_mode)); + global_coords[5] = mul(local_to_global, uv_to_xyz(vec2(-0.5, -0.5), geom_aspect, geom_mode)); + global_coords[6] = mul(local_to_global, uv_to_xyz(vec2(0.5, -0.5), geom_aspect, geom_mode)); + global_coords[7] = mul(local_to_global, uv_to_xyz(vec2(-0.5, 0.5), geom_aspect, geom_mode)); + global_coords[8] = mul(local_to_global, uv_to_xyz(vec2(0.5, 0.5), geom_aspect, geom_mode)); + // Adding more inner image points could help in extreme cases, but too many + // points will kill the framerate. For safety, default to the initial + // eye_pos if any z coords are negative: + float num_negative_z_coords = 0.0; + for(int i = 0; i < num_points; i++) + { + num_negative_z_coords += float(global_coords[i].z < 0.0); + } + // Outsource the optimized eye_pos calculation: + return num_negative_z_coords > 0.5 ? 
eye_pos : + get_ideal_global_eye_pos_for_points(eye_pos, geom_aspect, + global_coords, num_points); +} + +vec3x3 get_pixel_to_object_matrix(const vec3x3 global_to_local, + const vec3 eye_pos_local, const vec3 view_vec_global, + const vec3 intersection_pos_local, const vec3 normal, + const vec2 output_size_inv) +{ + // Requires: See get_curved_video_uv_coords_and_tangent_matrix for + // descriptions of each parameter. + // Returns: Return a transformation matrix from 2D pixel-space vectors + // (where (+1.0, +1.0) is a vector to one pixel down-right, + // i.e. same directionality as uv texels) to 3D object-space + // vectors in the CRT's local coordinate frame (right-handed) + // ***which are tangent to the CRT surface at the intersection + // position.*** (Basically, we want to convert pixel-space + // vectors to 3D vectors along the CRT's surface, for later + // conversion to uv vectors.) + // Shorthand inputs: + const vec3 pos = intersection_pos_local; + const vec3 eye_pos = eye_pos_local; + // Get a piecewise-linear matrix transforming from "pixelspace" offset + // vectors (1.0 = one pixel) to object space vectors in the tangent + // plane (faster than finding 3 view-object intersections). + // 1.) Get the local view vecs for the pixels to the right and down: + const vec3 view_vec_right_global = view_vec_global + + vec3(output_size_inv.x, 0.0, 0.0); + const vec3 view_vec_down_global = view_vec_global + + vec3(0.0, -output_size_inv.y, 0.0); + const vec3 view_vec_right_local = + mul(global_to_local, view_vec_right_global); + const vec3 view_vec_down_local = + mul(global_to_local, view_vec_down_global); + // 2.) 
Using the true intersection point, intersect the neighboring + // view vectors with the tangent plane: + const float intersection_vec_dot_normal = dot(pos - eye_pos, normal); + const vec3 right_pos = eye_pos + (intersection_vec_dot_normal / + dot(view_vec_right_local, normal))*view_vec_right_local; + const vec3 down_pos = eye_pos + (intersection_vec_dot_normal / + dot(view_vec_down_local, normal))*view_vec_down_local; + // 3.) Subtract the original intersection pos from its neighbors; the + // resulting vectors are object-space vectors tangent to the plane. + // These vectors are the object-space transformations of (1.0, 0.0) + // and (0.0, 1.0) pixel offsets, so they form the first two basis + // vectors of a pixelspace to object space transformation. This + // transformation is 2D to 3D, so use (0, 0, 0) for the third vector. + const vec3 object_right_vec = right_pos - pos; + const vec3 object_down_vec = down_pos - pos; + const vec3x3 pixel_to_object = vec3x3( + object_right_vec.x, object_down_vec.x, 0.0, + object_right_vec.y, object_down_vec.y, 0.0, + object_right_vec.z, object_down_vec.z, 0.0); + return pixel_to_object; +} + +vec3x3 get_object_to_tangent_matrix(const vec3 intersection_pos_local, + const vec3 normal, const vec2 geom_aspect, const float geom_mode) +{ + // Requires: See get_curved_video_uv_coords_and_tangent_matrix for + // descriptions of each parameter. + // Returns: Return a transformation matrix from 3D object-space vectors + // in the CRT's local coordinate frame (right-handed, +y = up) + // to 2D video_uv vectors (+v = down). + // Description: + // The TBN matrix formed by the [tangent, bitangent, normal] basis + // vectors transforms ordinary vectors from tangent->object space. + // The cotangent matrix formed by the [cotangent, cobitangent, normal] + // basis vectors transforms normal vectors (covectors) from + // tangent->object space. It's the inverse-transpose of the TBN matrix. 
+ // We want the inverse of the TBN matrix (transpose of the cotangent + // matrix), which transforms ordinary vectors from object->tangent space. + // Start by calculating the relevant basis vectors in accordance with + // Christian Schüler's blog post "Followup: Normal Mapping Without + // Precomputed Tangents": http://www.thetenthplanet.de/archives/1180 + // With our particular uv mapping, the scale of the u and v directions + // is determined entirely by the aspect ratio for cylindrical and ordinary + // spherical mappings, and so tangent and bitangent lengths are also + // determined by it (the alternate mapping is more complex). Therefore, we + // must ensure appropriate cotangent and cobitangent lengths as well. + // Base these off the uv<=>xyz mappings for each primitive. + const vec3 pos = intersection_pos_local; + const vec3 x_vec = vec3(1.0, 0.0, 0.0); + const vec3 y_vec = vec3(0.0, 1.0, 0.0); + // The tangent and bitangent vectors correspond with increasing u and v, + // respectively. Mathematically we'd base the cotangent/cobitangent on + // those, but we'll compute the cotangent/cobitangent directly when we can. + vec3 cotangent_unscaled, cobitangent_unscaled; + // geom_mode should be constant-folded without RUNTIME_GEOMETRY_MODE. 
+ if(geom_mode < 1.5) + { + // Sphere: + // tangent = normalize(cross(normal, cross(x_vec, pos))) * geom_aspect.x + // bitangent = normalize(cross(cross(y_vec, pos), normal)) * geom_aspect.y + // inv_determinant = 1.0/length(cross(bitangent, tangent)) + // cotangent = cross(normal, bitangent) * inv_determinant + // == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant + // cobitangent = cross(tangent, normal) * inv_determinant + // == normalize(cross(x_vec, pos)) * geom_aspect.x * inv_determinant + // Simplified (scale by inv_determinant below): + cotangent_unscaled = normalize(cross(y_vec, pos)) * geom_aspect.y; + cobitangent_unscaled = normalize(cross(x_vec, pos)) * geom_aspect.x; + } + else if(geom_mode < 2.5) + { + // Sphere, alternate mapping: + // This mapping works a bit like the cylindrical mapping in two + // directions, which makes the lengths and directions more complex. + // Unfortunately, I can't find much of a shortcut: + const vec3 tangent = normalize( + cross(y_vec, vec3(pos.x, 0.0, pos.z))) * geom_aspect.x; + const vec3 bitangent = normalize( + cross(x_vec, vec3(0.0, pos.yz))) * geom_aspect.y; + cotangent_unscaled = cross(normal, bitangent); + cobitangent_unscaled = cross(tangent, normal); + } + else + { + // Cylinder: + // tangent = normalize(cross(y_vec, normal)) * geom_aspect.x; + // bitangent = vec3(0.0, -geom_aspect.y, 0.0); + // inv_determinant = 1.0/length(cross(bitangent, tangent)) + // cotangent = cross(normal, bitangent) * inv_determinant + // == normalize(cross(y_vec, pos)) * geom_aspect.y * inv_determinant + // cobitangent = cross(tangent, normal) * inv_determinant + // == vec3(0.0, -geom_aspect.x, 0.0) * inv_determinant + cotangent_unscaled = cross(y_vec, normal) * geom_aspect.y; + cobitangent_unscaled = vec3(0.0, -geom_aspect.x, 0.0); + } + const vec3 computed_normal = + cross(cobitangent_unscaled, cotangent_unscaled); + const float inv_determinant = rsqrt(dot(computed_normal, computed_normal)); + const vec3 cotangent = 
cotangent_unscaled * inv_determinant; + const vec3 cobitangent = cobitangent_unscaled * inv_determinant; + // The [cotangent, cobitangent, normal] column vecs form the cotangent + // frame, i.e. the inverse-transpose TBN matrix. Get its transpose: + const vec3x3 object_to_tangent = vec3x3(cotangent, cobitangent, normal); + return object_to_tangent; +} + +vec2 get_curved_video_uv_coords_and_tangent_matrix( + const vec2 flat_video_uv, const vec3 eye_pos_local, + const vec2 output_size_inv, const vec2 geom_aspect, + const float geom_mode, const vec3x3 global_to_local, + out vec2x2 pixel_to_tangent_video_uv) +{ + // Requires: Parameters: + // 1.) flat_video_uv coords are in range [0.0, 1.0], where + // (0.0, 0.0) is the top-left corner of the screen and + // (1.0, 1.0) is the bottom-right corner. + // 2.) eye_pos_local is the 3D camera position in the simulated + // CRT's local coordinate frame. For best results, it must + // be computed based on the same geom_view_dist used here. + // 3.) output_size_inv = vec2(1.0)/IN.output_size + // 4.) geom_aspect = get_aspect_vector( + // IN.output_size.x / IN.output_size.y); + // 5.) geom_mode is a static or runtime mode setting: + // 0 = off, 1 = sphere, 2 = sphere alt., 3 = cylinder + // 6.) global_to_local is a 3x3 matrix transforming (ordinary) + // worldspace vectors to the CRT's local coordinate frame + // Globals: + // 1.) geom_view_dist must be > 0.0. It controls the "near + // plane" used to interpret flat_video_uv as a view + // vector, which controls the field of view (FOV). + // Returns: Return final uv coords in [0.0, 1.0], and return a pixel- + // space to video_uv tangent-space matrix in the out parameter. + // (This matrix assumes pixel-space +y = down, like +v = down.) 
+ // We'll transform flat_video_uv into a view vector, project + // the view vector from the camera/eye, intersect with a sphere + // or cylinder representing the simulated CRT, and convert the + // intersection position into final uv coords and a local + // transformation matrix. + // First get the 3D view vector (geom_aspect and geom_view_dist are globals): + // 1.) Center uv around (0.0, 0.0) and make (-0.5, -0.5) and (0.5, 0.5) + // correspond to the top-left/bottom-right output screen corners. + // 2.) Multiply by geom_aspect to preemptively "undo" Retroarch's screen- + // space 2D aspect correction. We'll reapply it in uv-space. + // 3.) (x, y) = (u, -v), because +v is down in 2D screenspace, but +y + // is up in 3D worldspace (enforce a right-handed system). + // 4.) The view vector z controls the "near plane" distance and FOV. + // For the effect of "looking through a window" at a CRT, it should be + // set equal to the user's distance from their physical screen, in + // units of the viewport's physical diagonal size. + const vec2 view_uv = (flat_video_uv - vec2(0.5)) * geom_aspect; + const vec3 view_vec_global = + vec3(view_uv.x, -view_uv.y, -geom_view_dist); + // Transform the view vector into the CRT's local coordinate frame, convert + // to video_uv coords, and get the local 3D intersection position: + const vec3 view_vec_local = mul(global_to_local, view_vec_global); + vec3 pos; + const vec2 centered_uv = view_vec_to_uv( + view_vec_local, eye_pos_local, geom_aspect, geom_mode, pos); + const vec2 video_uv = centered_uv + vec2(0.5); + // Get a pixel-to-tangent-video-uv matrix. The caller could deal with + // all but one of these cases, but that would be more complicated. + #ifdef DRIVERS_ALLOW_DERIVATIVES + // Derivatives obtain a matrix very fast, but the direction of pixel- + // space +y seems to depend on the pass. Enforce the correct direction + // on a best-effort basis (but it shouldn't matter for antialiasing). 
+ const vec2 duv_dx = ddx(video_uv); + const vec2 duv_dy = ddy(video_uv); + #ifdef LAST_PASS + pixel_to_tangent_video_uv = vec2x2( + duv_dx.x, duv_dy.x, + -duv_dx.y, -duv_dy.y); + #else + pixel_to_tangent_video_uv = vec2x2( + duv_dx.x, duv_dy.x, + duv_dx.y, duv_dy.y); + #endif + #else + // Manually define a transformation matrix. We'll assume pixel-space + // +y = down, just like +v = down. + if(geom_force_correct_tangent_matrix) + { + // Get the surface normal based on the local intersection position: + const vec3 normal_base = geom_mode < 2.5 ? pos : + vec3(pos.x, 0.0, pos.z); + const vec3 normal = normalize(normal_base); + // Get pixel-to-object and object-to-tangent matrices and combine + // them into a 2x2 pixel-to-tangent matrix for video_uv offsets: + const vec3x3 pixel_to_object = get_pixel_to_object_matrix( + global_to_local, eye_pos_local, view_vec_global, pos, normal, + output_size_inv); + const vec3x3 object_to_tangent = get_object_to_tangent_matrix( + pos, normal, geom_aspect, geom_mode); + const vec3x3 pixel_to_tangent3x3 = + mul(object_to_tangent, pixel_to_object); + pixel_to_tangent_video_uv = vec2x2( + pixel_to_tangent3x3._m00_m01_m10_m11); + } + else + { + // Ignore curvature, and just consider flat scaling. The + // difference is only apparent with strong curvature: + pixel_to_tangent_video_uv = vec2x2( + output_size_inv.x, 0.0, 0.0, output_size_inv.y); + } + #endif + return video_uv; +} + +float get_border_dim_factor(const vec2 video_uv, const vec2 geom_aspect) +{ + // COPYRIGHT NOTE FOR THIS FUNCTION: + // Copyright (C) 2010-2012 cgwg, 2014 TroggleMonkey + // This function uses an algorithm first coded in several of cgwg's GPL- + // licensed lines in crt-geom-curved.cg and its ancestors. The line + // between algorithm and code is nearly indistinguishable here, so it's + // unclear whether I could even release this project under a non-GPL + // license with this function included. 
+ + // Calculate border_dim_factor from the proximity to uv-space image + // borders; border_size/border_darkness/border_compress are globals: + const vec2 edge_dists = min(video_uv, vec2(1.0) - video_uv) * + geom_aspect; + const vec2 border_penetration = + max(vec2(border_size) - edge_dists, vec2(0.0)); + const float penetration_ratio = length(border_penetration)/border_size; + const float border_escape_ratio = max(1.0 - penetration_ratio, 0.0); + const float border_dim_factor = + pow(border_escape_ratio, border_darkness) * max(1.0, border_compress); + return min(border_dim_factor, 1.0); +} + + + +#endif // GEOMETRY_FUNCTIONS_H + + + diff --git a/crt/shaders/crt-royale/src/phosphor-mask-resizing.h b/crt/shaders/crt-royale/src/phosphor-mask-resizing.h new file mode 100644 index 0000000..be26624 --- /dev/null +++ b/crt/shaders/crt-royale/src/phosphor-mask-resizing.h @@ -0,0 +1,677 @@ +#ifndef PHOSPHOR_MASK_RESIZING_H +#define PHOSPHOR_MASK_RESIZING_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. 
+//
+// You should have received a copy of the GNU General Public License along with
+// this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+// Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+////////////////////////////////// INCLUDES //////////////////////////////////
+
+#include "../user-settings.h"
+#include "derived-settings-and-constants.h"
+
+///////////////////////////// CODEPATH SELECTION /////////////////////////////
+
+// Choose a looping strategy based on what's allowed:
+// Dynamic loops not allowed: Use a flat static loop.
+// Dynamic loops accommodated: Coarsely branch around static loops.
+// Dynamic loops assumed allowed: Use a flat dynamic loop.
+#ifndef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+    #ifdef ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS
+        #define BREAK_LOOPS_INTO_PIECES
+    #else
+        #define USE_SINGLE_STATIC_LOOP
+    #endif
+#endif // No else needed: Dynamic loops assumed.
+
+
+////////////////////////////////// CONSTANTS /////////////////////////////////
+
+// The larger the resized tile, the fewer samples we'll need for downsizing.
+// See if we can get a static min tile size > mask_min_allowed_tile_size:
+const float mask_min_allowed_tile_size = ceil(
+    mask_min_allowed_triad_size * mask_triads_per_tile);
+const float mask_min_expected_tile_size =
+    mask_min_allowed_tile_size;
+// Limit the number of sinc resize taps by the maximum minification factor:
+const float pi_over_lobes = pi/mask_sinc_lobes;
+const float max_sinc_resize_samples_float = 2.0 * mask_sinc_lobes *
+    mask_resize_src_lut_size.x/mask_min_expected_tile_size;
+// Vectorized loops sample in multiples of 4. Round up to be safe:
+const float max_sinc_resize_samples_m4 = ceil(
+    max_sinc_resize_samples_float * 0.25) * 4.0;
+
+
+///////////////////////// RESAMPLING FUNCTION HELPERS ////////////////////////
+
+inline float get_dynamic_loop_size(const float magnification_scale)
+{
+    // Requires:    The following global constants must be defined:
+    //              1.) mask_sinc_lobes
+    //              2.) max_sinc_resize_samples_m4
+    // Returns:     The minimum number of texture samples for a correct downsize
+    //              at magnification_scale.
+    // We're downsizing, so the filter is sized across 2*lobes output pixels
+    // (not 2*lobes input texels). This impacts distance measurements and the
+    // minimum number of input samples needed.
+    const float min_samples_float = 2.0 * mask_sinc_lobes / magnification_scale;
+    const float min_samples_m4 = ceil(min_samples_float * 0.25) * 4.0;
+    #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES
+        const float max_samples_m4 = max_sinc_resize_samples_m4;
+    #else // ifdef BREAK_LOOPS_INTO_PIECES
+        // Simulating loops with branches imposes a 128-sample limit.
+        const float max_samples_m4 = min(128.0, max_sinc_resize_samples_m4);
+    #endif
+    return min(min_samples_m4, max_samples_m4);
+}
+
+vec2 get_first_texel_tile_uv_and_dist(const vec2 tex_uv,
+    const vec2 texture_size, const float dr,
+    const float input_tiles_per_texture_r, const float samples,
+    const bool vertical)
+{
+    // Requires:    1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) input_tiles_per_texture_r is the number of input tiles
+    //                  that can fit in the input texture in the direction we're
+    //                  resampling this pass.
+    //              3.) vertical indicates whether we're resampling vertically
+    //                  this pass (or horizontally).
+    // Returns:     Pack and return the first sample's tile_uv coord in [0, 1]
+    //              and its texel distance from the destination pixel, in the
+    //              resized dimension only.
+    // We'll start with the topmost or leftmost sample and work down or right,
+    // so get the first sample location and distance. Modify both dimensions
+    // as if we're doing a one-pass 2D resize; we'll throw away the unneeded
+    // (and incorrect) dimension at the end.
+    const vec2 curr_texel = tex_uv * texture_size;
+    const vec2 prev_texel =
+        floor(curr_texel - vec2(under_half)) + vec2(0.5);
+    const vec2 first_texel = prev_texel - vec2(samples/2.0 - 1.0);
+    const vec2 first_texel_uv_wrap_2D = first_texel * dr;
+    const vec2 first_texel_dist_2D = curr_texel - first_texel;
+    // Convert from tex_uv to tile_uv coords so we can sub fracs for fmods.
+    const vec2 first_texel_tile_uv_wrap_2D =
+        first_texel_uv_wrap_2D * input_tiles_per_texture_r;
+    // Project wrapped coordinates to the [0, 1] range. We'll do this with all
+    // samples, but the first texel is special, since it might be negative.
+    const vec2 coord_negative = // GLSL has no vector '<'; use lessThan():
+        vec2(lessThan(first_texel_tile_uv_wrap_2D, vec2(0.0)));
+    const vec2 first_texel_tile_uv_2D =
+        frac(first_texel_tile_uv_wrap_2D) + coord_negative;
+    // Pack the first texel's tile_uv coord and texel distance in 1D:
+    const vec2 tile_u_and_dist =
+        vec2(first_texel_tile_uv_2D.x, first_texel_dist_2D.x);
+    const vec2 tile_v_and_dist =
+        vec2(first_texel_tile_uv_2D.y, first_texel_dist_2D.y);
+    return vertical ? tile_v_and_dist : tile_u_and_dist;
+    //return lerp(tile_u_and_dist, tile_v_and_dist, float(vertical));
+}
+
+inline vec4 tex2Dlod0try(const sampler2D tex, const vec2 tex_uv)
+{
+    // Mipmapping and anisotropic filtering get confused by sinc-resampling.
+    // One [slow] workaround is to select the lowest mip level:
+    #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DLOD
+        return tex2Dlod(tex, vec4(tex_uv, 0.0, 0.0));
+    #else
+        #ifdef ANISOTROPIC_RESAMPLING_COMPAT_TEX2DBIAS
+            return tex2Dbias(tex, vec4(tex_uv, 0.0, -16.0));
+        #else
+            return texture(tex, tex_uv);
+        #endif
+    #endif
+}
+
+
+////////////////////////////// LOOP BODY MACROS //////////////////////////////
+
+// Using inline functions can exceed the temporary register limit, so we're
+// stuck with #define macros (I'm TRULY sorry). They're declared here instead
+// of above to be closer to the actual invocation sites. Steps:
+// 1.) Get the exact texel location.
+// 2.) Sample the phosphor mask (already assumed encoded in linear RGB).
+// 3.) Get the distance from the current pixel and sinc weight:
+//         sinc(dist) = sin(pi * dist)/(pi * dist)
+//     We can also use the slower/smoother Lanczos instead:
+//         L(x) = sinc(dist) * sinc(dist / lobes)
+// 4.) Accumulate the weight sum in weight_sum, and accumulate the weighted
+//     texels in pixel_color (we'll normalize outside the loop at the end).
+// We vectorize the loop to help reduce the Lanczos window's cost.
+
+    // The r coord is the coord in the dimension we're resizing along (u or v),
+    // and first_texel_tile_uv_rrrr is a vec4 of the first texel's u or v
+    // tile_uv coord in [0, 1]. tex_uv_r will contain the tile_uv u or v coord
+    // for four new texel samples.
+    #define CALCULATE_R_COORD_FOR_4_SAMPLES \
+        const vec4 true_i = vec4(i_base + i) + vec4(0.0, 1.0, 2.0, 3.0); \
+        const vec4 tile_uv_r = frac( \
+            first_texel_tile_uv_rrrr + true_i * tile_dr); \
+        const vec4 tex_uv_r = tile_uv_r * tile_size_uv_r;
+
+    #ifdef PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS \
+            const vec4 pi_dist_over_lobes = pi_over_lobes * dist; \
+            const vec4 weights = min(sin(pi_dist) * sin(pi_dist_over_lobes) /\
+                (pi_dist*pi_dist_over_lobes), vec4(1.0));
+    #else
+        #define CALCULATE_SINC_RESAMPLE_WEIGHTS \
+            const vec4 weights = min(sin(pi_dist)/pi_dist, vec4(1.0));
+    #endif
+
+    #define UPDATE_COLOR_AND_WEIGHT_SUMS \
+        const vec4 dist = magnification_scale * \
+            abs(first_dist_unscaled - true_i); \
+        const vec4 pi_dist = pi * dist; \
+        CALCULATE_SINC_RESAMPLE_WEIGHTS; \
+        pixel_color += new_sample0 * weights.xxx; \
+        pixel_color += new_sample1 * weights.yyy; \
+        pixel_color += new_sample2 * weights.zzz; \
+        pixel_color += new_sample3 * weights.www; \
+        weight_sum += weights;
+
+    #define VERTICAL_SINC_RESAMPLE_LOOP_BODY \
+        CALCULATE_R_COORD_FOR_4_SAMPLES; \
+        const vec3 new_sample0 = tex2Dlod0try(texture, \
+            vec2(tex_uv.x, tex_uv_r.x)).rgb; \
+        const vec3 new_sample1 = tex2Dlod0try(texture, \
+            vec2(tex_uv.x, tex_uv_r.y)).rgb; \
+        const vec3 new_sample2 = tex2Dlod0try(texture, \
+            vec2(tex_uv.x, tex_uv_r.z)).rgb; \
+        const vec3 new_sample3 = tex2Dlod0try(texture, \
+            vec2(tex_uv.x, tex_uv_r.w)).rgb; \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+    #define HORIZONTAL_SINC_RESAMPLE_LOOP_BODY \
+        CALCULATE_R_COORD_FOR_4_SAMPLES; \
+        const vec3 new_sample0 = tex2Dlod0try(texture, \
+            vec2(tex_uv_r.x, tex_uv.y)).rgb; \
+        const vec3 new_sample1 = tex2Dlod0try(texture, \
+            vec2(tex_uv_r.y, tex_uv.y)).rgb; \
+        const vec3 new_sample2 = tex2Dlod0try(texture, \
+            vec2(tex_uv_r.z, tex_uv.y)).rgb; \
+        const vec3 new_sample3 = tex2Dlod0try(texture, \
+            vec2(tex_uv_r.w, tex_uv.y)).rgb; \
+        UPDATE_COLOR_AND_WEIGHT_SUMS;
+
+
+//////////////////////////// RESAMPLING FUNCTIONS ////////////////////////////
+
+vec3 downsample_vertical_sinc_tiled(const sampler2D texture,
+    const vec2 tex_uv, const vec2 texture_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    // Requires:    1.) dr == du == 1.0/texture_size.x or
+    //                  dr == dv == 1.0/texture_size.y
+    //                  (whichever direction we're resampling in).
+    //                  It's a scalar to save register space.
+    //              2.) tile_size_uv_r is the number of texels an input tile
+    //                  takes up in the input texture, in the direction we're
+    //                  resampling this pass.
+    //              3.) magnification_scale must be <= 1.0.
+    // Returns:     Return a [Lanczos] sinc-resampled pixel of a vertically
+    //              downsized input tile embedded in an input texture. (The
+    //              vertical version is special-cased though: It assumes the
+    //              tile size equals the [static] texture size, since it's used
+    //              on an LUT texture input containing one tile. For more
+    //              generic use, eliminate the "static" in the parameters.)
+    // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    // we're resizing along, e.g. "dy" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        // A static loop can be faster, but it might blur too much from using
+        // more samples than it should.
+        const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    // Get the first sample location (scalar tile uv coord along the resized
+    // dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    // true = vertical resize:
+    const vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, true);
+    const vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    // Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    // Sum up each weight and weighted sample color, varying the looping
+    // strategy based on our expected dynamic loop capabilities. See the
+    // loop body macros above.
+    int i_base = 0;
+    vec4 weight_sum = vec4(0.0);
+    vec3 pixel_color = vec3(0.0);
+    const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        // Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            VERTICAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    // Normalize so the weight_sum == 1.0, and return:
+    const vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const vec3 scalar_weight_sum = vec3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+vec3 downsample_horizontal_sinc_tiled(const sampler2D texture,
+    const vec2 tex_uv, const vec2 texture_size, const float dr,
+    const float magnification_scale, const float tile_size_uv_r)
+{
+    // Differences from downsample_vertical_sinc_tiled:
+    // 1.) The dr and tile_size_uv_r parameters are not static consts.
+    // 2.) The "vertical" parameter to get_first_texel_tile_uv_and_dist is
+    //     set to false instead of true.
+    // 3.) The horizontal version of the loop body is used.
+    // TODO: If we can get guaranteed compile-time dead code elimination,
+    // we can combine the vertical/horizontal downsampling functions by:
+    // 1.) Add an extra static const bool parameter called "vertical."
+    // 2.) Supply it with the result of get_first_texel_tile_uv_and_dist().
+    // 3.) Use a conditional assignment in the loop body macro. This is the
+    //     tricky part: We DO NOT want to incur the extra conditional
+    //     assignment in the inner loop at runtime!
+    // The "r" in "dr," "tile_size_uv_r," etc. refers to the dimension
+    // we're resizing along, e.g. "dx" in this case.
+    #ifdef USE_SINGLE_STATIC_LOOP
+        // If we have to load all samples, we might as well use them.
+        const int samples = int(max_sinc_resize_samples_m4);
+    #else
+        const int samples = int(get_dynamic_loop_size(magnification_scale));
+    #endif
+
+    // Get the first sample location (scalar tile uv coord along resized
+    // dimension) and distance from the output location (in texels):
+    const float input_tiles_per_texture_r = 1.0/tile_size_uv_r;
+    // false = horizontal resize:
+    const vec2 first_texel_tile_r_and_dist = get_first_texel_tile_uv_and_dist(
+        tex_uv, texture_size, dr, input_tiles_per_texture_r, samples, false);
+    const vec4 first_texel_tile_uv_rrrr = first_texel_tile_r_and_dist.xxxx;
+    const vec4 first_dist_unscaled = first_texel_tile_r_and_dist.yyyy;
+    // Get the tile sample offset:
+    const float tile_dr = dr * input_tiles_per_texture_r;
+
+    // Sum up each weight and weighted sample color, varying the looping
+    // strategy based on our expected dynamic loop capabilities. See the
+    // loop body macros above.
+    int i_base = 0;
+    vec4 weight_sum = vec4(0.0);
+    vec3 pixel_color = vec3(0.0);
+    const int i_step = 4;
+    #ifdef BREAK_LOOPS_INTO_PIECES
+        if(samples - i_base >= 64)
+        {
+            for(int i = 0; i < 64; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 64;
+        }
+        if(samples - i_base >= 32)
+        {
+            for(int i = 0; i < 32; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 32;
+        }
+        if(samples - i_base >= 16)
+        {
+            for(int i = 0; i < 16; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 16;
+        }
+        if(samples - i_base >= 8)
+        {
+            for(int i = 0; i < 8; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 8;
+        }
+        if(samples - i_base >= 4)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+            i_base += 4;
+        }
+        // Do another 4-sample block for a total of 128 max samples.
+        if(samples - i_base > 0)
+        {
+            for(int i = 0; i < 4; i += i_step)
+            {
+                HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+            }
+        }
+    #else
+        for(int i = 0; i < samples; i += i_step)
+        {
+            HORIZONTAL_SINC_RESAMPLE_LOOP_BODY;
+        }
+    #endif
+    // Normalize so the weight_sum == 1.0, and return:
+    const vec2 weight_sum_reduce = weight_sum.xy + weight_sum.zw;
+    const vec3 scalar_weight_sum = vec3(weight_sum_reduce.x +
+        weight_sum_reduce.y);
+    return (pixel_color/scalar_weight_sum);
+}
+
+
+//////////////////////////// TILE SIZE CALCULATION ///////////////////////////
+
+vec2 get_resized_mask_tile_size(const vec2 estimated_viewport_size,
+    const vec2 estimated_mask_resize_output_size,
+    const bool solemnly_swear_same_inputs_for_every_pass)
+{
+    // Requires:    The following global constants must be defined according to
+    //              certain constraints:
+    //              1.) mask_resize_num_triads: Must be high enough that our
+    //                  mask sampling method won't have artifacts later
+    //                  (long story; see derived-settings-and-constants.h)
+    //              2.) mask_resize_src_lut_size: Texel size of our mask LUT
+    //              3.) mask_triads_per_tile: Num horizontal triads in our LUT
+    //              4.) mask_min_allowed_triad_size: User setting (the more
+    //                  restrictive it is, the faster the resize will go)
+    //              5.) mask_min_allowed_tile_size_x < mask_resize_src_lut_size.x
+    //              6.) mask_triad_size_desired_{runtime, static}
+    //              7.) mask_num_triads_desired_{runtime, static}
+    //              8.) mask_specify_num_triads must be 0.0/1.0 (false/true)
+    //              The function parameters must be defined as follows:
+    //              1.) estimated_viewport_size == (final viewport size);
+    //                  If mask_specify_num_triads is 1.0/true and the viewport
+    //                  estimate is wrong, the number of triads will differ from
+    //                  the user's preference by about the same factor.
+    //              2.) estimated_mask_resize_output_size: Must equal the
+    //                  output size of the MASK_RESIZE pass.
+    //                  Exception: The x component may be estimated garbage if
+    //                  and only if the caller throws away the x result.
+    //              3.) solemnly_swear_same_inputs_for_every_pass: Set to false,
+    //                  unless you can guarantee that every call across every
+    //                  pass will use the same sizes for the other parameters.
+    //              When calling this across multiple passes, always use the
+    //              same y viewport size/scale, and always use the same x
+    //              viewport size/scale when using the x result.
+    // Returns:     Return the final size of a manually resized mask tile, after
+    //              constraining the desired size to avoid artifacts. Under
+    //              unusual circumstances, tiles may become stretched vertically
+    //              (see wall of text below).
+    // Stated tile properties must be correct:
+    const float tile_aspect_ratio_inv =
+        mask_resize_src_lut_size.y/mask_resize_src_lut_size.x;
+    const float tile_aspect_ratio = 1.0/tile_aspect_ratio_inv;
+    const vec2 tile_aspect = vec2(1.0, tile_aspect_ratio_inv);
+    // If mask_specify_num_triads is 1.0/true and estimated_viewport_size.x is
+    // wrong, the user preference will be misinterpreted:
+    const float desired_tile_size_x = mask_triads_per_tile * lerp(
+        mask_triad_size_desired,
+        estimated_viewport_size.x / mask_num_triads_desired,
+        mask_specify_num_triads);
+    if(get_mask_sample_mode() > 0.5)
+    {
+        // We don't need constraints unless we're sampling MASK_RESIZE.
+        return desired_tile_size_x * tile_aspect;
+    }
+    // Make sure we're not upsizing:
+    const float temp_tile_size_x =
+        min(desired_tile_size_x, mask_resize_src_lut_size.x);
+    // Enforce min_tile_size and max_tile_size in both dimensions:
+    const vec2 temp_tile_size = temp_tile_size_x * tile_aspect;
+    const vec2 min_tile_size =
+        mask_min_allowed_tile_size * tile_aspect;
+    const vec2 max_tile_size =
+        estimated_mask_resize_output_size / mask_resize_num_tiles;
+    const vec2 clamped_tile_size =
+        clamp(temp_tile_size, min_tile_size, max_tile_size);
+    // Try to maintain tile_aspect_ratio. This is the tricky part:
+    // If we're currently resizing in the y dimension, the x components
+    // could be MEANINGLESS. (If estimated_mask_resize_output_size.x is
+    // bogus, then so is max_tile_size.x and clamped_tile_size.x.)
+    // We can't adjust the y size based on clamped_tile_size.x. If it
+    // clamps when it shouldn't, it won't clamp again when later passes
+    // call this function with the correct sizes, and the discrepancy will
+    // break the sampling coords in MASKED_SCANLINES. Instead, we'll limit
+    // the x size based on the y size, but not vice versa, unless the
+    // caller swears the parameters were the same (correct) in every pass.
+    // As a result, triads could appear vertically stretched if:
+    // a.) mask_resize_src_lut_size.x > mask_resize_src_lut_size.y: Wide
+    //     LUT's might clamp x more than y (all provided LUT's are square)
+    // b.) true_viewport_size.x < true_viewport_size.y: The user is playing
+    //     with a vertically oriented screen (not accounted for anyway)
+    // c.) mask_resize_viewport_scale.x < mask_resize_viewport_scale.y:
+    //     Viewport scales are equal by default.
+    // If any of these are the case, you can fix the stretching by setting:
+    //     mask_resize_viewport_scale.x = mask_resize_viewport_scale.y *
+    //         (1.0 / min_expected_aspect_ratio) *
+    //         (mask_resize_src_lut_size.x / mask_resize_src_lut_size.y)
+    const float x_tile_size_from_y =
+        clamped_tile_size.y * tile_aspect_ratio;
+    const float y_tile_size_from_x = lerp(clamped_tile_size.y,
+        clamped_tile_size.x * tile_aspect_ratio_inv,
+        float(solemnly_swear_same_inputs_for_every_pass));
+    const vec2 reclamped_tile_size = vec2(
+        min(clamped_tile_size.x, x_tile_size_from_y),
+        min(clamped_tile_size.y, y_tile_size_from_x));
+    // We need integer tile sizes in both directions for tiled sampling to
+    // work correctly. Use floor (to make sure we don't round up), but be
+    // careful to avoid a rounding bug where floor decreases whole numbers:
+    const vec2 final_resized_tile_size =
+        floor(reclamped_tile_size + vec2(FIX_ZERO(0.0)));
+    return final_resized_tile_size;
+}
+
+
+///////////////////////// FINAL MASK SAMPLING HELPERS ////////////////////////
+
+vec4 get_mask_sampling_parameters(const vec2 mask_resize_texture_size,
+    const vec2 mask_resize_video_size, const vec2 true_viewport_size,
+    out vec2 mask_tiles_per_screen)
+{
+    // Requires:    1.) Requirements of get_resized_mask_tile_size() must be
+    //                  met, particularly regarding global constants.
+    //              The function parameters must be defined as follows:
+    //              1.) mask_resize_texture_size == MASK_RESIZE.texture_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              2.) mask_resize_video_size == MASK_RESIZE.video_size
+    //                  if get_mask_sample_mode() is 0 (otherwise anything)
+    //              3.) true_viewport_size == IN.output_size for a pass set to
+    //                  1.0 viewport scale (i.e. it must be correct)
+    // Returns:     Return a vec4 containing:
+    //                  xy: tex_uv coords for the start of the mask tile
+    //                  zw: tex_uv size of the mask tile from start to end
+    //              mask_tiles_per_screen is an out parameter containing the
+    //              number of mask tiles that will fit on the screen.
+    // First get the final resized tile size. The viewport size and mask
+    // resize viewport scale must be correct, but don't solemnly swear they
+    // were correct in both mask resize passes unless you know it's true.
+    // (We can better ensure a correct tile aspect ratio if the parameters are
+    // guaranteed correct in all passes...but if we lie, we'll get inconsistent
+    // sizes across passes, resulting in broken texture coordinates.)
+    const float mask_sample_mode = get_mask_sample_mode();
+    const vec2 mask_resize_tile_size = get_resized_mask_tile_size(
+        true_viewport_size, mask_resize_video_size, false);
+    if(mask_sample_mode < 0.5)
+    {
+        // Sample MASK_RESIZE: The resized tile is a fraction of the texture
+        // size and starts at a nonzero offset to allow for border texels:
+        const vec2 mask_tile_uv_size = mask_resize_tile_size /
+            mask_resize_texture_size;
+        const vec2 skipped_tiles = mask_start_texels/mask_resize_tile_size;
+        const vec2 mask_tile_start_uv = skipped_tiles * mask_tile_uv_size;
+        // mask_tiles_per_screen must be based on the *true* viewport size:
+        mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        return vec4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+    else
+    {
+        // If we're tiling at the original size (1:1 pixel:texel), redefine a
+        // "tile" to be the full texture containing many triads. Otherwise,
+        // we're hardware-resampling an LUT, and the texture truly contains a
+        // single unresized phosphor mask tile anyway.
+        const vec2 mask_tile_uv_size = vec2(1.0);
+        const vec2 mask_tile_start_uv = vec2(0.0);
+        if(mask_sample_mode > 1.5)
+        {
+            // Repeat the full LUT at a 1:1 pixel:texel ratio without resizing:
+            mask_tiles_per_screen = true_viewport_size/mask_texture_large_size;
+        }
+        else
+        {
+            // Hardware-resize the original LUT:
+            mask_tiles_per_screen = true_viewport_size / mask_resize_tile_size;
+        }
+        return vec4(mask_tile_start_uv, mask_tile_uv_size);
+    }
+}
+
+vec2 fix_tiling_discontinuities_normalized(const vec2 tile_uv,
+    vec2 duv_dx, vec2 duv_dy)
+{
+    // Requires:    1.) duv_dx == ddx(tile_uv)
+    //              2.) duv_dy == ddy(tile_uv)
+    //              3.) tile_uv contains tile-relative uv coords in [0, 1],
+    //                  such that (0.5, 0.5) is the center of a tile, etc.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    // Returns:     Return new tile_uv coords that contain no discontinuities
+    //              across a 2x2 pixel quad.
+    // Description:
+    // When uv coords wrap from 1.0 to 0.0, they create a discontinuity in the
+    // derivatives, which we assume happened if the absolute difference between
+    // any fragment in a 2x2 block is > ~half a tile. If the current block has
+    // a u or v discontinuity and the current fragment is in the first half of
+    // the tile along that axis (i.e. it wrapped from 1.0 to 0.0), add a tile
+    // to that coord to make the 2x2 block continuous. (It will now have a
+    // coord > 1.0 in the padding area beyond the tile.) This function takes
+    // derivatives as parameters so the caller can reuse them.
+    // In case we're using high-quality (nVidia-style) derivatives, ensure
+    // diagonally opposite fragments see each other for correctness:
+    duv_dx = abs(duv_dx) + abs(ddy(duv_dx));
+    duv_dy = abs(duv_dy) + abs(ddx(duv_dy));
+    const vec2 pixel_in_first_half_tile = vec2(lessThan(tile_uv, vec2(0.5)));
+    const vec2 jump_exists = vec2(greaterThan(duv_dx + duv_dy, vec2(0.5)));
+    return tile_uv + jump_exists * pixel_in_first_half_tile;
+}
+
+vec2 convert_phosphor_tile_uv_wrap_to_tex_uv(const vec2 tile_uv_wrap,
+    const vec4 mask_tile_start_uv_and_size)
+{
+    // Requires:    1.) tile_uv_wrap contains tile-relative uv coords, where the
+    //                  tile spans from [0, 1], such that (0.5, 0.5) is at the
+    //                  tile center. The input coords can range from [0, inf],
+    //                  and their fractional parts map to a repeated tile.
+    //                  ("Tile" can mean texture, the video embedded in the
+    //                  texture, or some other "tile" embedded in a texture.)
+    //              2.) mask_tile_start_uv_and_size.xy contains tex_uv coords
+    //                  for the start of the embedded tile in the full texture.
+    //              3.) mask_tile_start_uv_and_size.zw contains the [fractional]
+    //                  tex_uv size of the embedded tile in the full texture.
+    // Returns:     Return tex_uv coords (used for texture sampling)
+    //              corresponding to tile_uv_wrap.
+    if(get_mask_sample_mode() < 0.5)
+    {
+        // Manually repeat the resized mask tile to fill the screen:
+        // First get fractional tile_uv coords. Using frac/fmod on coords
+        // confuses anisotropic filtering; fix it as user options dictate.
+        // derived-settings-and-constants.h disables incompatible options.
+        #ifdef ANISOTROPIC_TILING_COMPAT_TILE_FLAT_TWICE
+            vec2 tile_uv = frac(tile_uv_wrap * 0.5) * 2.0;
+        #else
+            vec2 tile_uv = frac(tile_uv_wrap);
+        #endif
+        #ifdef ANISOTROPIC_TILING_COMPAT_FIX_DISCONTINUITIES
+            const vec2 tile_uv_dx = ddx(tile_uv);
+            const vec2 tile_uv_dy = ddy(tile_uv);
+            tile_uv = fix_tiling_discontinuities_normalized(tile_uv,
+                tile_uv_dx, tile_uv_dy);
+        #endif
+        // The tile is embedded in a padded FBO, and it may start at a
+        // nonzero offset if border texels are used to avoid artifacts:
+        const vec2 mask_tex_uv = mask_tile_start_uv_and_size.xy +
+            tile_uv * mask_tile_start_uv_and_size.zw;
+        return mask_tex_uv;
+    }
+    else
+    {
+        // Sample from the input phosphor mask texture with hardware tiling.
+        // If we're tiling at the original size (mode 2), the "tile" is the
+        // whole texture, and it contains a large number of triads mapped with
+        // a 1:1 pixel:texel ratio. OTHERWISE, the texture contains a single
+        // unresized tile. tile_uv_wrap already has correct coords for both!
+        return tile_uv_wrap;
+    }
+}
+
+
+#endif // PHOSPHOR_MASK_RESIZING_H
+
diff --git a/crt/shaders/crt-royale/src/scanline-functions.h b/crt/shaders/crt-royale/src/scanline-functions.h
new file mode 100644
index 0000000..d71a500
--- /dev/null
+++ b/crt/shaders/crt-royale/src/scanline-functions.h
@@ -0,0 +1,572 @@
+#ifndef SCANLINE_FUNCTIONS_H
+#define SCANLINE_FUNCTIONS_H
+
+///////////////////////////// GPL LICENSE NOTICE /////////////////////////////
+
+// crt-royale: A full-featured CRT shader, with cheese.
+// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +#include "../user-settings.h" +#include "derived-settings-and-constants.h" +#include "../../../../include/special-functions.h" +#include "../../../../include/gamma-management.h" + + +///////////////////////////// SCANLINE FUNCTIONS ///////////////////////////// +/* +inline float3 get_gaussian_sigma(const float3 color, const float sigma_range) +{ + // Requires: Globals: + // 1.) beam_min_sigma and beam_max_sigma are global floats + // containing the desired minimum and maximum beam standard + // deviations, for dim and bright colors respectively. + // 2.) beam_max_sigma must be > 0.0 + // 3.) beam_min_sigma must be in (0.0, beam_max_sigma] + // 4.) beam_spot_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) 
sigma_range = beam_max_sigma - beam_min_sigma; we take + // sigma_range as a parameter to avoid repeated computation + // when beam_{min, max}_sigma are runtime shader parameters + // Optional: Users may set beam_spot_shape_function to 1 to define the + // inner f(color) subfunction (see below) as: + // f(color) = sqrt(1.0 - (color - 1.0)*(color - 1.0)) + // Otherwise (technically, if beam_spot_shape_function < 0.5): + // f(color) = pow(color, beam_spot_power) + // Returns: The standard deviation of the Gaussian beam for "color:" + // sigma = beam_min_sigma + sigma_range * f(color) + // Details/Discussion: + // The beam's spot shape vaguely resembles an aspect-corrected f() in the + // range [0, 1] (not quite, but it's related). f(color) = color makes + // spots look like diamonds, and a spherical function or cube balances + // between variable width and a soft/realistic shape. A beam_spot_power + // > 1.0 can produce an ugly spot shape and more initial clipping, but the + // final shape also differs based on the horizontal resampling filter and + // the phosphor bloom. For instance, resampling horizontally in nonlinear + // light and/or with a sharp (e.g. Lanczos) filter will sharpen the spot + // shape, but a sixth root is still quite soft. A power function (default + // 1.0/3.0 beam_spot_power) is most flexible, but a fixed spherical curve + // has the highest variability without an awful spot shape. + // + // beam_min_sigma affects scanline sharpness/aliasing in dim areas, and its + // difference from beam_max_sigma affects beam width variability. It only + // affects clipping [for pure Gaussians] if beam_spot_power > 1.0 (which is + // a conservative estimate for a more complex constraint). + // + // beam_max_sigma affects clipping and increasing scanline width/softness + // as color increases. The wider this is, the more scanlines need to be + // evaluated to avoid distortion. 
For a pure Gaussian, the max_beam_sigma + // at which the first unused scanline always has a weight < 1.0/255.0 is: + // num scanlines = 2, max_beam_sigma = 0.2089; distortions begin ~0.34 + // num scanlines = 3, max_beam_sigma = 0.3879; distortions begin ~0.52 + // num scanlines = 4, max_beam_sigma = 0.5723; distortions begin ~0.70 + // num scanlines = 5, max_beam_sigma = 0.7591; distortions begin ~0.89 + // num scanlines = 6, max_beam_sigma = 0.9483; distortions begin ~1.08 + // Generalized Gaussians permit more leeway here as steepness increases. + if(beam_spot_shape_function < 0.5) + { + // Use a power function: + return float3(beam_min_sigma) + sigma_range * + pow(color, beam_spot_power); + } + else + { + // Use a spherical function: + const float3 color_minus_1 = color - float3(1.0); + return float3(beam_min_sigma) + sigma_range * + sqrt(float3(1.0) - color_minus_1*color_minus_1); + } +} + +inline float3 get_generalized_gaussian_beta(const float3 color, + const float shape_range) +{ + // Requires: Globals: + // 1.) beam_min_shape and beam_max_shape are global floats + // containing the desired min/max generalized Gaussian + // beta parameters, for dim and bright colors respectively. + // 2.) beam_max_shape must be >= 2.0 + // 3.) beam_min_shape must be in [2.0, beam_max_shape] + // 4.) beam_shape_power must be defined as a global float. + // Parameters: + // 1.) color is the underlying source color along a scanline + // 2.) shape_range = beam_max_shape - beam_min_shape; we take + // shape_range as a parameter to avoid repeated computation + // when beam_{min, max}_shape are runtime shader parameters + // Returns: The type-I generalized Gaussian "shape" parameter beta for + // the given color. + // Details/Discussion: + // Beta affects the scanline distribution as follows: + // a.) beta < 2.0 narrows the peak to a spike with a discontinuous slope + // b.) beta == 2.0 just degenerates to a Gaussian + // c.) 
beta > 2.0 flattens and widens the peak, then drops off more steeply + // than a Gaussian. Whereas high sigmas widen and soften peaks, high + // beta widen and sharpen peaks at the risk of aliasing. + // Unlike high beam_spot_powers, high beam_shape_powers actually soften shape + // transitions, whereas lower ones sharpen them (at the risk of aliasing). + return beam_min_shape + shape_range * pow(color, beam_shape_power); +} + +float3 scanline_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range) +{ + // Requires: 1.) dist is the distance of the [potentially separate R/G/B] + // point(s) from a scanline in units of scanlines, where + // 1.0 means the sample point straddles the next scanline. + // 2.) color is the underlying source color along a scanline. + // 3.) pixel_height is the output pixel height in scanlines. + // 4.) Requirements of get_gaussian_sigma() must be met. + // Returns: Return a scanline's light output over a given pixel. + // Details: + // The CRT beam profile follows a roughly Gaussian distribution which is + // wider for bright colors than dark ones. The integral over the full + // range of a Gaussian function is always 1.0, so we can vary the beam + // with a standard deviation without affecting brightness. 'x' = distance: + // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + // gaussian integral = 0.5 (1.0 + erf(x/(sigma * sqrt(2)))) + // Use a numerical approximation of the "error function" (the Gaussian + // indefinite integral) to find the definite integral of the scanline's + // average brightness over a given pixel area. Even if curved coords were + // used in this pass, a flat scalar pixel height works almost as well as a + // pixel height computed from a full pixel-space to scanline-space matrix. 
+ const float3 sigma = get_gaussian_sigma(color, sigma_range); + const float3 ph_offset = float3(pixel_height * 0.5); + const float3 denom_inv = 1.0/(sigma*sqrt(2.0)); + const float3 integral_high = erf((dist + ph_offset)*denom_inv); + const float3 integral_low = erf((dist - ph_offset)*denom_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_generalized_gaussian_integral_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range, + const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel. + // A generalized Gaussian distribution allows the shape (beta) to vary + // as well as the width (alpha). "gamma" refers to the gamma function: + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + // ligamma(s, z) is the lower incomplete gamma function, for which we only + // implement two of four branches (because we keep 1/beta <= 0.5): + // generalized integral = 0.5 + 0.5* sign(x) * + // ligamma(1/beta, (|x|/alpha)**beta)/gamma(1/beta) + // See get_generalized_gaussian_beta() for a discussion of beta. + // We base alpha on the intended Gaussian sigma, but it only strictly + // models standard deviation at beta == 2, because the standard + // deviation depends on both alpha and beta (keeping alpha independent is + // faster and preserves intuitive behavior and a full spectrum of results).
+ const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + const float3 alpha_inv = float3(1.0)/alpha; + const float3 s = float3(1.0)/beta; + const float3 ph_offset = float3(pixel_height * 0.5); + // Pass beta to gamma_impl to avoid repeated divides. Similarly pass + // beta (i.e. 1/s) and 1/gamma(s) to normalized_ligamma_impl. + const float3 gamma_s_inv = float3(1.0)/gamma_impl(s, beta); + const float3 dist1 = dist + ph_offset; + const float3 dist0 = dist - ph_offset; + const float3 integral_high = sign(dist1) * normalized_ligamma_impl( + s, pow(abs(dist1)*alpha_inv, beta), beta, gamma_s_inv); + const float3 integral_low = sign(dist0) * normalized_ligamma_impl( + s, pow(abs(dist0)*alpha_inv, beta), beta, gamma_s_inv); + return color * 0.5*(integral_high - integral_low)/pixel_height; +} + +float3 scanline_gaussian_sampled_contrib(const float3 dist, const float3 color, + const float pixel_height, const float sigma_range) +{ + // See scanline_gaussian_integral_contrib() for detailed comments!
+ // gaussian sample = 1/(sigma*sqrt(2*pi)) * e**(-(x**2)/(2*sigma**2)) + const float3 sigma = get_gaussian_sigma(color, sigma_range); + // Avoid repeated divides: + const float3 sigma_inv = float3(1.0)/sigma; + const float3 inner_denom_inv = 0.5 * sigma_inv * sigma_inv; + const float3 outer_denom_inv = sigma_inv/sqrt(2.0*pi); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel away in each direction as well: + const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three pure Gaussian samples: + const float3 scale = color/3.0 * outer_denom_inv; + const float3 weight1 = exp(-(dist*dist)*inner_denom_inv); + const float3 weight2 = exp(-(dist2*dist2)*inner_denom_inv); + const float3 weight3 = exp(-(dist3*dist3)*inner_denom_inv); + return scale * (weight1 + weight2 + weight3); + } + else + { + return color*exp(-(dist*dist)*inner_denom_inv)*outer_denom_inv; + } +} + +float3 scanline_generalized_gaussian_sampled_contrib(const float3 dist, + const float3 color, const float pixel_height, const float sigma_range, + const float shape_range) +{ + // See scanline_generalized_gaussian_integral_contrib() for details! + // generalized sample = + // beta/(2*alpha*gamma(1/beta)) * e**(-(|x|/alpha)**beta) + const float3 alpha = sqrt(2.0) * get_gaussian_sigma(color, sigma_range); + const float3 beta = get_generalized_gaussian_beta(color, shape_range); + // Avoid repeated divides: + const float3 alpha_inv = float3(1.0)/alpha; + const float3 beta_inv = float3(1.0)/beta; + const float3 scale = color * beta * 0.5 * alpha_inv / + gamma_impl(beta_inv, beta); + if(beam_antialias_level > 0.5) + { + // Sample 1/3 pixel closer to and farther from the scanline too. 
+ const float3 sample_offset = float3(pixel_height/3.0); + const float3 dist2 = dist + sample_offset; + const float3 dist3 = abs(dist - sample_offset); + // Average three generalized Gaussian samples: + const float3 weight1 = exp(-pow(abs(dist*alpha_inv), beta)); + const float3 weight2 = exp(-pow(abs(dist2*alpha_inv), beta)); + const float3 weight3 = exp(-pow(abs(dist3*alpha_inv), beta)); + return scale/3.0 * (weight1 + weight2 + weight3); + } + else + { + return scale * exp(-pow(abs(dist*alpha_inv), beta)); + } +} + +inline float3 scanline_contrib(float3 dist, float3 color, + float pixel_height, const float sigma_range, const float shape_range) +{ + // Requires: 1.) Requirements of scanline_gaussian_integral_contrib() + // must be met. + // 2.) Requirements of get_gaussian_sigma() must be met. + // 3.) Requirements of get_generalized_gaussian_beta() must be + // met. + // Returns: Return a scanline's light output over a given pixel, using + // a generalized or pure Gaussian distribution and sampling or + // integrals as desired by user codepath choices. 
+ if(beam_generalized_gaussian) + { + if(beam_antialias_level > 1.5) + { + return scanline_generalized_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + else + { + return scanline_generalized_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range, shape_range); + } + } + else + { + if(beam_antialias_level > 1.5) + { + return scanline_gaussian_integral_contrib( + dist, color, pixel_height, sigma_range); + } + else + { + return scanline_gaussian_sampled_contrib( + dist, color, pixel_height, sigma_range); + } + } +} + +inline float3 get_raw_interpolated_color(const float3 color0, + const float3 color1, const float3 color2, const float3 color3, + const float4 weights) +{ + // Use max to avoid bizarre artifacts from negative colors: + return max(mul(weights, float4x3(color0, color1, color2, color3)), 0.0); +} + +float3 get_interpolated_linear_color(const float3 color0, const float3 color1, + const float3 color2, const float3 color3, const float4 weights) +{ + // Requires: 1.) Requirements of include/gamma-management.h must be met: + // intermediate_gamma must be globally defined, and input + // colors are interpreted as linear RGB unless you #define + // GAMMA_ENCODE_EVERY_FBO (in which case they are + // interpreted as gamma-encoded with intermediate_gamma). + // 2.) color0-3 are colors sampled from a texture with tex2D(). + // They are interpreted as defined in requirement 1. + // 3.) weights contains weights for each color, summing to 1.0. + // 4.) beam_horiz_linear_rgb_weight must be defined as a global + // float in [0.0, 1.0] describing how much blending should + // be done in linear RGB (rest is gamma-corrected RGB). + // 5.) RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE must be #defined + // if beam_horiz_linear_rgb_weight is anything other than a + // static constant, or we may try branching at runtime + // without dynamic branches allowed (slow). 
+ // Returns: Return an interpolated color lookup between the four input + // colors based on the weights in weights. The final color will + // be a linear RGB value, but the blending will be done as + // indicated above. + const float intermediate_gamma = get_intermediate_gamma(); + // Branch if beam_horiz_linear_rgb_weight is static (for free) or if the + // profile allows dynamic branches (faster than computing extra pows): + #ifndef RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #else + #ifdef DRIVERS_ALLOW_DYNAMIC_BRANCHES + #define SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + #endif + #endif + #ifdef SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT + // beam_horiz_linear_rgb_weight is static, so we can branch: + #ifdef GAMMA_ENCODE_EVERY_FBO + const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), intermediate_gamma); + if(beam_horiz_linear_rgb_weight > 0.0) + { + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, intermediate_gamma), + pow(color1, intermediate_gamma), + pow(color2, intermediate_gamma), + pow(color3, intermediate_gamma), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return gamma_mixed_color; + } + #else + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + if(beam_horiz_linear_rgb_weight < 1.0) + { + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, 1.0/intermediate_gamma), + pow(color1, 1.0/intermediate_gamma), + pow(color2, 1.0/intermediate_gamma), + pow(color3, 1.0/intermediate_gamma), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + } + else + { + return linear_mixed_color; + } + #endif // GAMMA_ENCODE_EVERY_FBO + #else + #ifdef GAMMA_ENCODE_EVERY_FBO + // Inputs: color0-3 are colors in gamma-encoded RGB. 
+ const float3 gamma_mixed_color = pow(get_raw_interpolated_color( + color0, color1, color2, color3, weights), intermediate_gamma); + const float3 linear_mixed_color = get_raw_interpolated_color( + pow(color0, intermediate_gamma), + pow(color1, intermediate_gamma), + pow(color2, intermediate_gamma), + pow(color3, intermediate_gamma), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #else + // Inputs: color0-3 are colors in linear RGB. + const float3 linear_mixed_color = get_raw_interpolated_color( + color0, color1, color2, color3, weights); + const float3 gamma_mixed_color = get_raw_interpolated_color( + pow(color0, 1.0/intermediate_gamma), + pow(color1, 1.0/intermediate_gamma), + pow(color2, 1.0/intermediate_gamma), + pow(color3, 1.0/intermediate_gamma), + weights); + return lerp(gamma_mixed_color, linear_mixed_color, + beam_horiz_linear_rgb_weight); + #endif // GAMMA_ENCODE_EVERY_FBO + #endif // SCANLINES_BRANCH_FOR_LINEAR_RGB_WEIGHT +} + +float3 get_scanline_color(const sampler2D texture, const float2 scanline_uv, + const float2 uv_step_x, const float4 weights) +{ + // Requires: 1.) scanline_uv must be vertically snapped to the caller's + // desired line or scanline and horizontally snapped to the + // texel just left of the output pixel (color1) + // 2.) uv_step_x must contain the horizontal uv distance + // between texels. + // 3.) weights must contain interpolation filter weights for + // color0, color1, color2, and color3, where color1 is just + // left of the output pixel. + // Returns: Return a horizontally interpolated texture lookup using 2-4 + // nearby texels, according to weights and the conventions of + // get_interpolated_linear_color(). + // We can ignore the outside texture lookups for Quilez resampling. 
+ const float3 color1 = tex2D(texture, scanline_uv).rgb; + const float3 color2 = tex2D(texture, scanline_uv + uv_step_x).rgb; + float3 color0 = float3(0.0); + float3 color3 = float3(0.0); + if(beam_horiz_filter > 0.5) + { + color0 = tex2D(texture, scanline_uv - uv_step_x).rgb; + color3 = tex2D(texture, scanline_uv + 2.0 * uv_step_x).rgb; + } + // Sample the texture as-is, whether it's linear or gamma-encoded: + // get_interpolated_linear_color() will handle the difference. + return get_interpolated_linear_color(color0, color1, color2, color3, weights); +} + +float3 sample_single_scanline_horizontal(const sampler2D texture, + const float2 tex_uv, const float2 texture_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Snap to the previous texel and get sample dists from 2/4 nearby texels: + const float2 curr_texel = tex_uv * texture_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel = + floor(curr_texel - float2(under_half)) + float2(0.5); + const float2 prev_texel_hor = float2(prev_texel.x, curr_texel.y); + const float2 prev_texel_hor_uv = prev_texel_hor * texture_size_inv; + const float prev_dist = curr_texel.x - prev_texel_hor.x; + const float4 sample_dists = float4(1.0 + prev_dist, prev_dist, + 1.0 - prev_dist, 2.0 - prev_dist); + // Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels: + float4 weights; + if(beam_horiz_filter < 0.5) + { + // Quilez: + const float x = sample_dists.y; + const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0); + weights = float4(0.0, 1.0 - w2, w2, 0.0); + } + else if(beam_horiz_filter < 1.5) + { + // Gaussian: + float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma); + weights = exp(-(sample_dists*sample_dists)*inner_denom_inv); + } + else + { + // Lanczos2: + const float4 pi_dists = FIX_ZERO(sample_dists * pi); + weights = 2.0 * sin(pi_dists) * sin(pi_dists * 0.5) / + (pi_dists * pi_dists); + } + // Ensure the 
weight sum == 1.0: + const float4 final_weights = weights/dot(weights, float4(1.0)); + // Get the interpolated horizontal scanline color: + const float2 uv_step_x = float2(texture_size_inv.x, 0.0); + return get_scanline_color( + texture, prev_texel_hor_uv, uv_step_x, final_weights); +} + +float3 sample_rgb_scanline_horizontal(const sampler2D texture, + const float2 tex_uv, const float2 texture_size, + const float2 texture_size_inv) +{ + // TODO: Add function requirements. + // Rely on a helper to make convergence easier. + if(beam_misconvergence) + { + const float3 convergence_offsets_rgb = + get_convergence_offsets_x_vector(); + const float3 offset_u_rgb = + convergence_offsets_rgb * texture_size_inv.xxx; + const float2 scanline_uv_r = tex_uv - float2(offset_u_rgb.r, 0.0); + const float2 scanline_uv_g = tex_uv - float2(offset_u_rgb.g, 0.0); + const float2 scanline_uv_b = tex_uv - float2(offset_u_rgb.b, 0.0); + const float3 sample_r = sample_single_scanline_horizontal( + texture, scanline_uv_r, texture_size, texture_size_inv); + const float3 sample_g = sample_single_scanline_horizontal( + texture, scanline_uv_g, texture_size, texture_size_inv); + const float3 sample_b = sample_single_scanline_horizontal( + texture, scanline_uv_b, texture_size, texture_size_inv); + return float3(sample_r.r, sample_g.g, sample_b.b); + } + else + { + return sample_single_scanline_horizontal(texture, tex_uv, texture_size, + texture_size_inv); + } +} + +float2 get_last_scanline_uv(const float2 tex_uv, const float2 texture_size, + const float2 texture_size_inv, const float2 il_step_multiple, + const float frame_count, out float dist) +{ + // Compute texture coords for the last/upper scanline, accounting for + // interlacing: With interlacing, only consider even/odd scanlines every + // other frame. Top-field first (TFF) order puts even scanlines on even + // frames, and BFF order puts them on odd frames. 
Texels are centered at: + // frac(tex_uv * texture_size) == x.5 + // Caution: If these coordinates ever seem incorrect, first make sure it's + // not because anisotropic filtering is blurring across field boundaries. + // Note: TFF/BFF won't matter for sources that double-weave or similar. + const float field_offset = floor(il_step_multiple.y * 0.75) * + fmod(frame_count + float(interlace_bff), 2.0); + const float2 curr_texel = tex_uv * texture_size; + // Use under_half to fix a rounding bug right around exact texel locations. + const float2 prev_texel_num = floor(curr_texel - float2(under_half)); + const float wrong_field = fmod( + prev_texel_num.y + field_offset, il_step_multiple.y); + const float2 scanline_texel_num = prev_texel_num - float2(0.0, wrong_field); + // Snap to the center of the previous scanline in the current field: + const float2 scanline_texel = scanline_texel_num + float2(0.5); + const float2 scanline_uv = scanline_texel * texture_size_inv; + // Save the sample's distance from the scanline, in units of scanlines: + dist = (curr_texel.y - scanline_texel.y)/il_step_multiple.y; + return scanline_uv; +} +*/
+bool is_interlaced(float num_lines)
+{
+    // Detect interlacing based on the number of lines in the source.
+    // Requires: interlace_detect must be defined globally (it gates detection).
+    // Parameters: num_lines is the number of lines in the source video.
+    // Returns: true when detection is enabled and num_lines falls in the
+    //          range treated as interlaced below; false otherwise.
+    if(interlace_detect)
+    {
+        // NTSC: 525 lines, 262.5/field; 486 active (2 half-lines), 243/field
+        // NTSC Emulators: Typically 224 or 240 lines
+        // PAL: 625 lines, 312.5/field; 576 active (typical), 288/field
+        // PAL Emulators: ?
+        // ATSC: 720p, 1080i, 1080p
+        // Where do we place our cutoffs? Assumptions:
+        // 1.) We only need to care about active lines.
+        // 2.) Anything > 288 and <= 576 lines is probably interlaced.
+        // 3.) Anything > 576 lines is probably not interlaced...
+        // 4.) ...except 1080 lines, which is a crapshoot (user decision).
+        // 5.) Just in case the main program uses calculated video sizes,
+        //     we should nudge the float thresholds a bit.
+        bool sd_interlace = (num_lines > 288.5) && (num_lines < 576.5);
+        // The previous code never returned from this branch and read
+        // hd_interlace before assigning it. Its only explicit decision for
+        // 1080-line sources was "not interlaced", so keep that behavior.
+        // NOTE(review): if a user-facing 1080i toggle exists elsewhere, test
+        // (num_lines > 1079.5 && num_lines < 1080.5) against it here.
+        bool hd_interlace = false;
+        return sd_interlace || hd_interlace;
+    }
+    else
+    {
+        return false;
+    }
+}
+ + +#endif // SCANLINE_FUNCTIONS_H + diff --git a/crt/shaders/crt-royale/src/tex2Dantialias.h b/crt/shaders/crt-royale/src/tex2Dantialias.h new file mode 100644 index 0000000..0a5f834 --- /dev/null +++ b/crt/shaders/crt-royale/src/tex2Dantialias.h @@ -0,0 +1,1393 @@ +#ifndef TEX2DANTIALIAS_H +#define TEX2DANTIALIAS_H + +///////////////////////////// GPL LICENSE NOTICE ///////////////////////////// + +// crt-royale: A full-featured CRT shader, with cheese. +// Copyright (C) 2014 TroggleMonkey +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 2 of the License, or any later version. +// +// This program is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +// more details. +// +// You should have received a copy of the GNU General Public License along with +// this program; if not, write to the Free Software Foundation, Inc., 59 Temple +// Place, Suite 330, Boston, MA 02111-1307 USA + + +///////////////////////////////// DESCRIPTION //////////////////////////////// + +// This file provides antialiased and subpixel-aware tex2D lookups. +// Requires: All functions share these requirements: +// 1.) All requirements of gamma-management.h must be satisfied! +// 2.) pixel_to_tex_uv must be a 2x2 matrix that transforms pixe- +// space offsets to texture uv offsets.
You can get this with: +// const vec2 duv_dx = ddx(tex_uv); +// const vec2 duv_dy = ddy(tex_uv); +// const vec2x2 pixel_to_tex_uv = vec2x2( +// duv_dx.x, duv_dy.x, +// duv_dx.y, duv_dy.y); +// This is left to the user in case the current Cg profile +// doesn't support ddx()/ddy(). Ideally, the user could +// calculate a distorted tangent-space mapping analytically. +// If not, a simple flat mapping can be obtained with: +// const vec2 xy_to_uv_scale = IN.output_size * +// IN.video_size/IN.texture_size; +// const vec2x2 pixel_to_tex_uv = vec2x2( +// xy_to_uv_scale.x, 0.0, +// 0.0, xy_to_uv_scale.y); +// Optional: To set basic AA settings, #define ANTIALIAS_OVERRIDE_BASICS and: +// 1.) Set an antialiasing level: +// static const float aa_level = {0 (none), +// 1 (sample subpixels), 4, 5, 6, 7, 8, 12, 16, 20, 24} +// 2.) Set a filter type: +// static const float aa_filter = { +// 0 (Box, Separable), 1 (Box, Cylindrical), +// 2 (Tent, Separable), 3 (Tent, Cylindrical) +// 4 (Gaussian, Separable), 5 (Gaussian, Cylindrical) +// 6 (Cubic, Separable), 7 (Cubic, Cylindrical) +// 8 (Lanczos Sinc, Separable), +// 9 (Lanczos Jinc, Cylindrical)} +// If the input is unknown, a separable box filter is used. +// Note: Lanczos Jinc is terrible for sparse sampling, and +// using aa_xy_axis_importance (see below) defeats the purpose. +// 3.) Mirror the sample pattern on odd frames? +// static const bool aa_temporal = {true, false} +// This helps rotational invariance but can look "fluttery." +// The user may #define ANTIALIAS_OVERRIDE_PARAMETERS to override +// (all of) the following default parameters with static or uniform +// constants (or an accessor function for subpixel offsets): +// 1.) Cubic parameters: +// static const float aa_cubic_c = 0.5; +// See http://www.imagemagick.org/Usage/filter/#mitchell +// 2.) Gaussian parameters: +// static const float aa_gauss_sigma = +// 0.5/aa_pixel_diameter; +// 3.) Set subpixel offsets.
This requires an accessor function +// for compatibility with scalar runtime shader params. Return +// a vec2 pixel offset in [-0.5, 0.5] for the red subpixel: +// vec2 get_aa_subpixel_r_offset() +// The user may also #define ANTIALIAS_OVERRIDE_STATIC_CONSTANTS to +// override (all of) the following default static values. However, +// the file's structure requires them to be declared static const: +// 1.) static const float aa_lanczos_lobes = 3.0; +// 2.) static const float aa_gauss_support = 1.0/aa_pixel_diameter; +// Note the default tent/Gaussian support radii may appear +// arbitrary, but extensive testing found them nearly optimal +// for tough cases like strong distortion at low AA levels. +// (The Gaussian default is only best for practical gauss_sigma +// values; much larger gauss_sigmas ironically prefer slightly +// smaller support given sparse sampling, and vice versa.) +// 3.) static const float aa_tent_support = 1.0 / aa_pixel_diameter; +// 4.) static const vec2 aa_xy_axis_importance: +// The sparse N-queens sampling grid interacts poorly with +// negative-lobed 2D filters. However, if aliasing is much +// stronger in one direction (e.g. horizontally with a phosphor +// mask), it can be useful to downplay sample offsets along the +// other axis. The support radius in each direction scales with +// aa_xy_axis_importance down to a minimum of 0.5 (box support), +// after which point only the offsets used for calculating +// weights continue to scale downward. This works as follows: +// If aa_xy_axis_importance = vec2(1.0, 1.0/support_radius), +// the vertical support radius will drop to 1.0, and we'll just +// filter vertical offsets with the first filter lobe, while +// horizontal offsets go through the full multi-lobe filter. +// If aa_xy_axis_importance = vec2(1.0, 0.0), the vertical +// support radius will drop to box support, and the vertical +// offsets will be ignored entirely (essentially giving us a +// box filter vertically). 
The former is potentially smoother +// (but less predictable) and the default behavior of Lanczos +// jinc, whereas the latter is sharper and the default behavior +// of cubics and Lanczos sinc. +// 5.) static const float aa_pixel_diameter: You can expand the +// pixel diameter to e.g. sqrt(2.0), which may be a better +// support range for cylindrical filters (they don't +// currently discard out-of-circle samples though). +// Finally, there are two miscellaneous options: +// 1.) If you want to antialias a manually tiled texture, you can +// #define ANTIALIAS_DISABLE_ANISOTROPIC to use tex2Dlod() to +// fix incompatibilities with anisotropic filtering. This is +// slower, and the Cg profile must support tex2Dlod(). +// 2.) If aa_cubic_c is a runtime uniform, you can #define +// RUNTIME_ANTIALIAS_WEIGHTS to evaluate cubic weights once per +// fragment instead of at the usage site (which is used by +// default, because it enables static evaluation). +// Description: +// Each antialiased lookup follows these steps: +// 1.) Define a sample pattern of pixel offsets in the range of [-0.5, 0.5] +// pixels, spanning the diameter of a rectangular box filter. +// 2.) Scale these offsets by the support diameter of the user's chosen filter. +// 3.) Using these pixel offsets from the pixel center, compute the offsets to +// predefined subpixel locations. +// 4.) Compute filter weights based on subpixel offsets. +// Much of that can often be done at compile-time. At runtime: +// 1.) Project pixel-space offsets into uv-space with a matrix multiplication +// to get the uv offsets for each sample. Rectangular pixels have a +// diameter of 1.0. Circular pixels are not currently supported, but they +// might be better with a diameter of sqrt(2.0) to ensure there are no gaps +// between them. +// 2.) Load, weight, and sum samples. +// We use a sparse bilinear sampling grid, so there are two major implications: +// 1.) 
We can directly project the pixel-space support box into uv-space even +// if we're upsizing. This wouldn't be the case for nearest neighbor, +// where we'd have to expand the uv-space diameter to at least the support +// size to ensure sufficient filter support. In our case, this allows us +// to treat upsizing the same as downsizing and use static weighting. :) +// 2.) For decent results, negative-lobed filters must be computed based on +// separable weights, not radial distances, because the sparse sampling +// makes no guarantees about radial distributions. Even then, it's much +// better to set aa_xy_axis_importance to e.g. vec2(1.0, 0.0) to use e.g. +// Lanczos2 horizontally and a box filter vertically. This is mainly due +// to the sparse N-queens sampling and a statistically enormous positive or +// negative covariance between horizontal and vertical weights. +// +// Design Decision Comments: +// "aa_temporal" mirrors the sample pattern on odd frames along the axis that +// keeps subpixel weights constant. This helps with rotational invariance, but +// it can cause distracting fluctuations, and horizontal and vertical edges +// will look the same. Using a different pattern on a shifted grid would +// exploit temporal AA better, but it would require a dynamic branch or a lot +// of conditional moves, so it's prohibitively slow for the minor benefit. + + +///////////////////////////// SETTINGS MANAGEMENT //////////////////////////// + +#ifndef ANTIALIAS_OVERRIDE_BASICS + // The following settings must be static constants: + const float aa_level = 12.0; + const float aa_filter = 0.0; + const bool aa_temporal = false; +#endif + +#ifndef ANTIALIAS_OVERRIDE_STATIC_CONSTANTS + // Users may override these parameters, but the file structure requires + // them to be static constants; see the descriptions above. 
+ const float aa_pixel_diameter = 1.0; + const float aa_lanczos_lobes = 3.0; + const float aa_gauss_support = 1.0 / aa_pixel_diameter; + const float aa_tent_support = 1.0 / aa_pixel_diameter; + + // If we're using a negative-lobed filter, default to using it horizontally + // only, and use only the first lobe vertically or a box filter, over a + // correspondingly smaller range. This compensates for the sparse sampling + // grid's typically large positive/negative x/y covariance. + const vec2 aa_xy_axis_importance = + aa_filter < 5.5 ? vec2(1.0) : // Box, tent, Gaussian + aa_filter < 8.5 ? vec2(1.0, 0.0) : // Cubic and Lanczos sinc + aa_filter < 9.5 ? vec2(1.0, 1.0/aa_lanczos_lobes) : // Lanczos jinc + vec2(1.0); // Default to box +#endif + +#ifndef ANTIALIAS_OVERRIDE_PARAMETERS + // Users may override these values with their own uniform or static consts. + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + const float aa_cubic_c = 0.5; + const float aa_gauss_sigma = 0.5 / aa_pixel_diameter; + // Users may override the subpixel offset accessor function with their own. + // A function is used for compatibility with scalar runtime shader params. + vec2 get_aa_subpixel_r_offset() + { + return vec2(0.0, 0.0); + } +#endif + + +////////////////////////////////// INCLUDES ////////////////////////////////// + +#include "../../../../include/gamma-management.h" + + +////////////////////////////////// CONSTANTS ///////////////////////////////// + +const float aa_box_support = 0.5; +const float aa_cubic_support = 2.0; + + +//////////////////////////// GLOBAL NON-CONSTANTS //////////////////////////// + +// We'll want to define these only once per fragment at most. 
+#ifdef RUNTIME_ANTIALIAS_WEIGHTS
+    // These coefficients are assigned by assign_aa_cubic_constants(), so they
+    // must be plain writable globals: GLSL uniforms are read-only in a shader.
+    float aa_cubic_b;
+    float cubic_branch1_x3_coeff;
+    float cubic_branch1_x2_coeff;
+    float cubic_branch1_x0_coeff;
+    float cubic_branch2_x3_coeff;
+    float cubic_branch2_x2_coeff;
+    float cubic_branch2_x1_coeff;
+    float cubic_branch2_x0_coeff;
+#endif
+
+
+/////////////////////////////////// HELPERS //////////////////////////////////
+
+void assign_aa_cubic_constants()
+{
+    // Compute cubic coefficients on demand at runtime, and save them to the
+    // global variables declared above. The B parameter is computed from C,
+    // because "Keys cubics" with B = 1 - 2C are considered the highest quality.
+    // Requires: aa_filter and aa_cubic_c must be defined globally.
+    // This is a no-op unless RUNTIME_ANTIALIAS_WEIGHTS is #defined and
+    // aa_filter selects a cubic filter (5.5 < aa_filter < 7.5).
+    #ifdef RUNTIME_ANTIALIAS_WEIGHTS
+        if(aa_filter > 5.5 && aa_filter < 7.5)
+        {
+            aa_cubic_b = 1.0 - 2.0*aa_cubic_c;
+            // Coefficients for the |x| < 1 branch of the cubic:
+            cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c;
+            cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c;
+            cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b;
+            // Coefficients for the 1 <= |x| < 2 branch of the cubic:
+            cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c;
+            cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c;
+            cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c;
+            cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c;
+        }
+    #endif
+}
+ +vec4 get_subpixel_support_diam_and_final_axis_importance() +{ + // Statically select the base support radius: + const float base_support_radius = + aa_filter < 1.5 ? aa_box_support : + aa_filter < 3.5 ? aa_tent_support : + aa_filter < 5.5 ? aa_gauss_support : + aa_filter < 7.5 ? aa_cubic_support : + aa_filter < 9.5 ? aa_lanczos_lobes : + aa_box_support; // Default to box + // Expand the filter support for subpixel filtering. + const vec2 subpixel_support_radius_raw = + vec2(base_support_radius) + abs(get_aa_subpixel_r_offset()); + if(aa_filter < 1.5) + { + // Ignore aa_xy_axis_importance for box filtering.
+ const vec2 subpixel_support_diam = + 2.0 * subpixel_support_radius_raw; + const vec2 final_axis_importance = vec2(1.0); + return vec4(subpixel_support_diam, final_axis_importance); + } + else + { + // Scale the support window by aa_xy_axis_importance, but don't narrow + // it further than box support. This allows decent vertical AA without + // messing up horizontal weights or using something silly like Lanczos4 + // horizontally with a huge vertical average over an 8-pixel radius. + const vec2 subpixel_support_radius = max(vec2(aa_box_support), + subpixel_support_radius_raw * aa_xy_axis_importance); + // Adjust aa_xy_axis_importance to compensate for what's already done: + const vec2 final_axis_importance = aa_xy_axis_importance * + subpixel_support_radius_raw/subpixel_support_radius; + const vec2 subpixel_support_diam = 2.0 * subpixel_support_radius; + return vec4(subpixel_support_diam, final_axis_importance); + } +} + + +/////////////////////////// FILTER WEIGHT FUNCTIONS ////////////////////////// + +float eval_box_filter(const float dist) +{ + return float(abs(dist) <= aa_box_support); +}
+
+float eval_separable_box_filter(const vec2 offset)
+{
+    // Returns: 1.0 if both components of offset lie within the box support
+    //          radius (aa_box_support), and 0.0 otherwise.
+    // GLSL's <= operator only accepts scalar operands, so the per-component
+    // comparison must use lessThanEqual(), which yields the bvec2 that all()
+    // expects.
+    return float(all(lessThanEqual(abs(offset), vec2(aa_box_support))));
+}
+ +float eval_tent_filter(const float dist) +{ + return clamp((aa_tent_support - dist)/ + aa_tent_support, 0.0, 1.0); +} + +float eval_gaussian_filter(const float dist) +{ + return exp(-(dist*dist) / (2.0*aa_gauss_sigma*aa_gauss_sigma)); +} + +float eval_cubic_filter(const float dist) +{ + // Compute coefficients like assign_aa_cubic_constants(), but statically. + #ifndef RUNTIME_ANTIALIAS_WEIGHTS + // When runtime weights are used, these values are instead written to + // global uniforms at the beginning of each tex2Daa* call.
+ const float aa_cubic_b = 1.0 - 2.0*aa_cubic_c; + const float cubic_branch1_x3_coeff = 12.0 - 9.0*aa_cubic_b - 6.0*aa_cubic_c; + const float cubic_branch1_x2_coeff = -18.0 + 12.0*aa_cubic_b + 6.0*aa_cubic_c; + const float cubic_branch1_x0_coeff = 6.0 - 2.0 * aa_cubic_b; + const float cubic_branch2_x3_coeff = -aa_cubic_b - 6.0 * aa_cubic_c; + const float cubic_branch2_x2_coeff = 6.0*aa_cubic_b + 30.0*aa_cubic_c; + const float cubic_branch2_x1_coeff = -12.0*aa_cubic_b - 48.0*aa_cubic_c; + const float cubic_branch2_x0_coeff = 8.0*aa_cubic_b + 24.0*aa_cubic_c; + #endif + const float abs_dist = abs(dist); + // Compute the cubic based on the Horner's method formula in: + // http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/Mitchell.pdf + return (abs_dist < 1.0 ? + (cubic_branch1_x3_coeff*abs_dist + + cubic_branch1_x2_coeff)*abs_dist*abs_dist + + cubic_branch1_x0_coeff : + abs_dist < 2.0 ? + ((cubic_branch2_x3_coeff*abs_dist + + cubic_branch2_x2_coeff)*abs_dist + + cubic_branch2_x1_coeff)*abs_dist + cubic_branch2_x0_coeff : + 0.0)/6.0; +} + +float eval_separable_cubic_filter(const vec2 offset) +{ + // This is faster than using a specific vec2 version: + return eval_cubic_filter(offset.x) * + eval_cubic_filter(offset.y); +} + +vec2 eval_sinc_filter(const vec2 offset) +{ + // It's faster to let the caller handle the zero case, or at least it + // was when I used macros and the shader preset took a full minute to load. + const vec2 pi_offset = pi * offset; + return sin(pi_offset)/pi_offset; +} + +float eval_separable_lanczos_sinc_filter(const vec2 offset_unsafe) +{ + // Note: For sparse sampling, you really need to pick an axis to use + // Lanczos along (e.g. set aa_xy_axis_importance = vec2(1.0, 0.0)). 
+ const vec2 offset = FIX_ZERO(offset_unsafe); + const vec2 xy_weights = eval_sinc_filter(offset) * + eval_sinc_filter(offset/aa_lanczos_lobes); + return xy_weights.x * xy_weights.y; +} + +float eval_jinc_filter_unorm(const float x) +{ + // This is a Jinc approximation for x in [0, 45). We'll use x in range + // [0, 4*pi) or so. There are faster/closer approximations based on + // piecewise cubics from [0, 45) and asymptotic approximations beyond that, + // but this has a maximum absolute error < 1/512, and it's simpler/faster + // for shaders...not that it's all that useful for sparse sampling anyway. + const float point3845_x = 0.38448566093564*x; + const float exp_term = exp(-(point3845_x*point3845_x)); + const float point8154_plus_x = 0.815362332840791 + x; + const float cos_term = cos(point8154_plus_x); + return ( + 0.0264727330997042*min(x, 6.83134964622778) + + 0.680823557250528*exp_term + + -0.0597255978950933*min(7.41043194481873, x)*cos_term / + (point8154_plus_x + 0.0646074538634482*(x*x) + + cos(x)*max(exp_term, cos(x) + cos_term)) - + 0.180837503591406); +} + +// Evaluates the unnormalized Jinc approximation above at pi * dist. +float eval_jinc_filter(const float dist) +{ + return eval_jinc_filter_unorm(pi * dist); +} + +// Jinc at dist multiplied by a second Jinc at dist/aa_lanczos_lobes, which +// acts as the window term. +float eval_lanczos_jinc_filter(const float dist) +{ + return eval_jinc_filter(dist) * eval_jinc_filter(dist/aa_lanczos_lobes); +} + + +// Computes unnormalized filter weights for the R, G and B subpixels at one +// sample offset, using the filter selected by aa_filter. +vec3 eval_unorm_rgb_weights(const vec2 offset, + const vec2 final_axis_importance) +{ + // Requires: 1.) final_axis_importance must be computed according to + // get_subpixel_support_diam_and_final_axis_importance(). + // 2.) aa_filter must be a global constant. + // 3.) offset must be an xy pixel offset in the range: + // ([-subpixel_support_diameter.x/2, + // subpixel_support_diameter.x/2], + // [-subpixel_support_diameter.y/2, + // subpixel_support_diameter.y/2]) + // Returns: Sample weights at R/G/B destination subpixels for the + // given xy pixel offset. 
+ const vec2 offset_g = offset * final_axis_importance; + const vec2 aa_r_offset = get_aa_subpixel_r_offset(); + const vec2 offset_r = offset_g - aa_r_offset * final_axis_importance; + const vec2 offset_b = offset_g + aa_r_offset * final_axis_importance; + // Statically select a filter: + if(aa_filter < 0.5) + { + return vec3(eval_separable_box_filter(offset_r), + eval_separable_box_filter(offset_g), + eval_separable_box_filter(offset_b)); + } + else if(aa_filter < 1.5) + { + return vec3(eval_box_filter(length(offset_r)), + eval_box_filter(length(offset_g)), + eval_box_filter(length(offset_b))); + } + else if(aa_filter < 2.5) + { + return vec3( + eval_tent_filter(offset_r.x) * eval_tent_filter(offset_r.y), + eval_tent_filter(offset_g.x) * eval_tent_filter(offset_g.y), + eval_tent_filter(offset_b.x) * eval_tent_filter(offset_b.y)); + } + else if(aa_filter < 3.5) + { + return vec3(eval_tent_filter(length(offset_r)), + eval_tent_filter(length(offset_g)), + eval_tent_filter(length(offset_b))); + } + else if(aa_filter < 4.5) + { + return vec3( + eval_gaussian_filter(offset_r.x) * eval_gaussian_filter(offset_r.y), + eval_gaussian_filter(offset_g.x) * eval_gaussian_filter(offset_g.y), + eval_gaussian_filter(offset_b.x) * eval_gaussian_filter(offset_b.y)); + } + else if(aa_filter < 5.5) + { + return vec3(eval_gaussian_filter(length(offset_r)), + eval_gaussian_filter(length(offset_g)), + eval_gaussian_filter(length(offset_b))); + } + else if(aa_filter < 6.5) + { + return vec3( + eval_cubic_filter(offset_r.x) * eval_cubic_filter(offset_r.y), + eval_cubic_filter(offset_g.x) * eval_cubic_filter(offset_g.y), + eval_cubic_filter(offset_b.x) * eval_cubic_filter(offset_b.y)); + } + else if(aa_filter < 7.5) + { + return vec3(eval_cubic_filter(length(offset_r)), + eval_cubic_filter(length(offset_g)), + eval_cubic_filter(length(offset_b))); + } + else if(aa_filter < 8.5) + { + return vec3(eval_separable_lanczos_sinc_filter(offset_r), + eval_separable_lanczos_sinc_filter(offset_g), + 
eval_separable_lanczos_sinc_filter(offset_b)); + } + else if(aa_filter < 9.5) + { + return vec3(eval_lanczos_jinc_filter(length(offset_r)), + eval_lanczos_jinc_filter(length(offset_g)), + eval_lanczos_jinc_filter(length(offset_b))); + } + else + { + // Default to a box, because Lanczos Jinc is so bad. ;) + return vec3(eval_separable_box_filter(offset_r), + eval_separable_box_filter(offset_g), + eval_separable_box_filter(offset_b)); + } +} + + +////////////////////////////// HELPER FUNCTIONS ////////////////////////////// + +// Looks up samp at s through tex2D_linearize(), or through +// tex2Dlod_linearize() with the trailing coordinates set to 0.0 when +// ANTIALIAS_DISABLE_ANISOTROPIC is defined (presumably mip level 0 -- +// confirm against tex2Dlod_linearize). +vec4 tex2Daa_tiled_linearize(const sampler2D samp, const vec2 s) +{ + // If we're manually tiling a texture, anisotropic filtering can get + // confused. This is one workaround: + #ifdef ANTIALIAS_DISABLE_ANISOTROPIC + // TODO: Use tex2Dlod_linearize with a calculated mip level. + return tex2Dlod_linearize(samp, vec4(s, 0.0, 0.0)); + #else + return tex2D_linearize(samp, s); + #endif +} + +// Returns the per-axis factor that the tex2Daa*x functions multiply into +// their xy sample offsets: vec2(1.0) unless aa_temporal is set. +vec2 get_frame_sign(const float frame) +{ + if(aa_temporal) + { + // Mirror the sampling pattern for odd frames in a direction that + // lets us keep the same subpixel sample weights: + // NOTE(review): frame_odd is computed but never used below, so the + // returned value does not depend on frame. The mirror expression also + // appears to yield -1.0 only on axes where |aa_r_offset| is below + // FIX_ZERO(0.0) and 0.0 on the others, which would zero those sample + // offsets. Confirm the intended blend before relying on aa_temporal. + const float frame_odd = float(fmod(frame, 2.0) > 0.5); + const vec2 aa_r_offset = get_aa_subpixel_r_offset(); + const vec2 mirror = -vec2(abs(aa_r_offset) < vec2(FIX_ZERO(0.0))); + return mirror; + } + else + { + return vec2(1.0); + } +} + + +///////////////////////// ANTIALIASED TEXTURE LOOKUPS //////////////////////// + +vec3 tex2Daa_subpixel_weights_only(const sampler2D texture, + const vec2 tex_uv, const vec2x2 pixel_to_tex_uv) +{ + // This function is unlike the others: Just perform a single independent + // lookup for each subpixel. It may be very aliased. 
+ const vec2 aa_r_offset = get_aa_subpixel_r_offset(); + const vec2 aa_r_offset_uv_offset = mul(pixel_to_tex_uv, aa_r_offset); + const float color_g = tex2D_linearize(texture, tex_uv).g; + const float color_r = tex2D_linearize(texture, tex_uv + aa_r_offset_uv_offset).r; + const float color_b = tex2D_linearize(texture, tex_uv - aa_r_offset_uv_offset).b; + return vec3(color_r, color_g, color_b); +} + +// The tex2Daa* functions compile very slowly due to all the macros and +// compile-time math, so only include the ones we'll actually use! +vec3 tex2Daa4x(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Use an RGMS4 pattern (4-queens): + // . . Q . : off =(-1.5, -1.5)/4 + (2.0, 0.0)/4 + // Q . . . : off =(-1.5, -1.5)/4 + (0.0, 1.0)/4 + // . . . Q : off =(-1.5, -1.5)/4 + (3.0, 2.0)/4 + // . Q . . : off =(-1.5, -1.5)/4 + (1.0, 3.0)/4 + // Static screenspace sample offsets (compute some implicitly): + const float grid_size = 4.0; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; + const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const vec2 xy_offset0 = xy_start_offset + vec2(2.0, 0.0) * xy_step; + const vec2 xy_offset1 = xy_start_offset + vec2(0.0, 1.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const vec3 w2 = w1.bgr; + const vec3 w3 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const vec3 half_sum = w0 + w1; + const vec3 w_sum = half_sum + half_sum.bgr; + const vec3 w_sum_inv = vec3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. + const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const vec2 frame_sign = get_frame_sign(frame); + const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb; + const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb; + const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb; + const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * (w0 * sample0 + w1 * sample1 + + w2 * sample2 + w3 * sample3); +} + +vec3 tex2Daa5x(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 5-queens pattern: + // . Q . . . : off =(-2.0, -2.0)/5 + (1.0, 0.0)/5 + // . . . . Q : off =(-2.0, -2.0)/5 + (4.0, 1.0)/5 + // . . Q . . : off =(-2.0, -2.0)/5 + (2.0, 2.0)/5 + // Q . . . . : off =(-2.0, -2.0)/5 + (0.0, 3.0)/5 + // . . . Q . 
: off =(-2.0, -2.0)/5 + (3.0, 4.0)/5 + // Static screenspace sample offsets (compute some implicitly): + const float grid_size = 5.0; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; + const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const vec2 xy_offset0 = xy_start_offset + vec2(1.0, 0.0) * xy_step; + const vec2 xy_offset1 = xy_start_offset + vec2(4.0, 1.0) * xy_step; + const vec2 xy_offset2 = xy_start_offset + vec2(2.0, 2.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const vec3 w3 = w1.bgr; + const vec3 w4 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const vec3 w_sum_inv = vec3(1.0)/(w0 + w1 + w2 + w3 + w4); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const vec2 frame_sign = get_frame_sign(frame); + const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb; + const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb; + const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv).rgb; + const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb; + const vec3 sample4 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * (w0 * sample0 + w1 * sample1 + + w2 * sample2 + w3 * sample3 + w4 * sample4); +} + +vec3 tex2Daa6x(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 6-queens pattern with a stronger horizontal + // than vertical slant: + // . . . . Q . : off =(-2.5, -2.5)/6 + (4.0, 0.0)/6 + // . . Q . . . : off =(-2.5, -2.5)/6 + (2.0, 1.0)/6 + // Q . . . . . : off =(-2.5, -2.5)/6 + (0.0, 2.0)/6 + // . . . . . Q : off =(-2.5, -2.5)/6 + (5.0, 3.0)/6 + // . . . Q . . : off =(-2.5, -2.5)/6 + (3.0, 4.0)/6 + // . Q . . . . 
: off =(-2.5, -2.5)/6 + (1.0, 5.0)/6 + // Static screenspace sample offsets (compute some implicitly): + const float grid_size = 6.0; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; + const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const vec2 xy_offset0 = xy_start_offset + vec2(4.0, 0.0) * xy_step; + const vec2 xy_offset1 = xy_start_offset + vec2(2.0, 1.0) * xy_step; + const vec2 xy_offset2 = xy_start_offset + vec2(0.0, 2.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const vec3 w3 = w2.bgr; + const vec3 w4 = w1.bgr; + const vec3 w5 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const vec3 half_sum = w0 + w1 + w2; + const vec3 w_sum = half_sum + half_sum.bgr; + const vec3 w_sum_inv = vec3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const vec2 frame_sign = get_frame_sign(frame); + const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const vec2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb; + const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb; + const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset2).rgb; + const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset2).rgb; + const vec3 sample4 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb; + const vec3 sample5 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * (w0 * sample0 + w1 * sample1 + w2 * sample2 + + w3 * sample3 + w4 * sample4 + w5 * sample5); +} + +vec3 tex2Daa7x(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 7-queens pattern with a queen in the center: + // . Q . . . . . : off =(-3.0, -3.0)/7 + (1.0, 0.0)/7 + // . . . . Q . . : off =(-3.0, -3.0)/7 + (4.0, 1.0)/7 + // Q . . . . . . : off =(-3.0, -3.0)/7 + (0.0, 2.0)/7 + // . . . Q . . . : off =(-3.0, -3.0)/7 + (3.0, 3.0)/7 + // . . . . . . Q : off =(-3.0, -3.0)/7 + (6.0, 4.0)/7 + // . . Q . . . . : off =(-3.0, -3.0)/7 + (2.0, 5.0)/7 + // . . . . . Q . 
: off =(-3.0, -3.0)/7 + (5.0, 6.0)/7 + const float grid_size = 7.0; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; + const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const vec2 xy_offset0 = xy_start_offset + vec2(1.0, 0.0) * xy_step; + const vec2 xy_offset1 = xy_start_offset + vec2(4.0, 1.0) * xy_step; + const vec2 xy_offset2 = xy_start_offset + vec2(0.0, 2.0) * xy_step; + const vec2 xy_offset3 = xy_start_offset + vec2(3.0, 3.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const vec3 w4 = w2.bgr; + const vec3 w5 = w1.bgr; + const vec3 w6 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const vec3 half_sum = w0 + w1 + w2; + const vec3 w_sum = half_sum + half_sum.bgr + w3; + const vec3 w_sum_inv = vec3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const vec2 frame_sign = get_frame_sign(frame); + const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const vec2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb; + const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb; + const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset2).rgb; + const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv).rgb; + const vec3 sample4 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset2).rgb; + const vec3 sample5 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb; + const vec3 sample6 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6); +} + +vec3 tex2Daa8x(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 8-queens pattern. + // . . Q . . . . . : off =(-3.5, -3.5)/8 + (2.0, 0.0)/8 + // . . . . Q . . . : off =(-3.5, -3.5)/8 + (4.0, 1.0)/8 + // . Q . . . . . . : off =(-3.5, -3.5)/8 + (1.0, 2.0)/8 + // . . . . . . . Q : off =(-3.5, -3.5)/8 + (7.0, 3.0)/8 + // Q . . . . . . . : off =(-3.5, -3.5)/8 + (0.0, 4.0)/8 + // . . . . . . Q . : off =(-3.5, -3.5)/8 + (6.0, 5.0)/8 + // . . . Q . . . . : off =(-3.5, -3.5)/8 + (3.0, 6.0)/8 + // . . . . . Q . . 
: off =(-3.5, -3.5)/8 + (5.0, 7.0)/8 + const float grid_size = 8.0; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; + const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const vec2 xy_offset0 = xy_start_offset + vec2(2.0, 0.0) * xy_step; + const vec2 xy_offset1 = xy_start_offset + vec2(4.0, 1.0) * xy_step; + const vec2 xy_offset2 = xy_start_offset + vec2(1.0, 2.0) * xy_step; + const vec2 xy_offset3 = xy_start_offset + vec2(7.0, 3.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const vec3 w4 = w3.bgr; + const vec3 w5 = w2.bgr; + const vec3 w6 = w1.bgr; + const vec3 w7 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const vec3 half_sum = w0 + w1 + w2 + w3; + const vec3 w_sum = half_sum + half_sum.bgr; + const vec3 w_sum_inv = vec3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + // Get uv sample offsets, and mirror on odd frames if directed: + const vec2 frame_sign = get_frame_sign(frame); + const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const vec2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const vec2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb; + const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb; + const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset2).rgb; + const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset3).rgb; + const vec3 sample4 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset3).rgb; + const vec3 sample5 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset2).rgb; + const vec3 sample6 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb; + const vec3 sample7 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7); +} + +vec3 tex2Daa12x(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 12-superqueens pattern where no 3 points are + // exactly collinear. + // . . . Q . . . . . . . . : off =(-5.5, -5.5)/12 + (3.0, 0.0)/12 + // . . . . . . . . . Q . . : off =(-5.5, -5.5)/12 + (9.0, 1.0)/12 + // . . . . . . Q . . . . . : off =(-5.5, -5.5)/12 + (6.0, 2.0)/12 + // . Q . . . . . . . . . . : off =(-5.5, -5.5)/12 + (1.0, 3.0)/12 + // . . . . . . . . . . . 
Q : off =(-5.5, -5.5)/12 + (11.0, 4.0)/12 + // . . . . Q . . . . . . . : off =(-5.5, -5.5)/12 + (4.0, 5.0)/12 + // . . . . . . . Q . . . . : off =(-5.5, -5.5)/12 + (7.0, 6.0)/12 + // Q . . . . . . . . . . . : off =(-5.5, -5.5)/12 + (0.0, 7.0)/12 + // . . . . . . . . . . Q . : off =(-5.5, -5.5)/12 + (10.0, 8.0)/12 + // . . . . . Q . . . . . . : off =(-5.5, -5.5)/12 + (5.0, 9.0)/12 + // . . Q . . . . . . . . . : off =(-5.5, -5.5)/12 + (2.0, 10.0)/12 + // . . . . . . . . Q . . . : off =(-5.5, -5.5)/12 + (8.0, 11.0)/12 + const float grid_size = 12.0; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; + const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const vec2 xy_offset0 = xy_start_offset + vec2(3.0, 0.0) * xy_step; + const vec2 xy_offset1 = xy_start_offset + vec2(9.0, 1.0) * xy_step; + const vec2 xy_offset2 = xy_start_offset + vec2(6.0, 2.0) * xy_step; + const vec2 xy_offset3 = xy_start_offset + vec2(1.0, 3.0) * xy_step; + const vec2 xy_offset4 = xy_start_offset + vec2(11.0, 4.0) * xy_step; + const vec2 xy_offset5 = xy_start_offset + vec2(4.0, 5.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const vec3 w6 = w5.bgr; + const vec3 w7 = w4.bgr; + const vec3 w8 = w3.bgr; + const vec3 w9 = w2.bgr; + const vec3 w10 = w1.bgr; + const vec3 w11 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5; + const vec3 w_sum = half_sum + half_sum.bgr; + const vec3 w_sum_inv = vec3(1.0)/w_sum; + // Scale the pixel-space to texture offset matrix by the pixel diameter. + const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const vec2 frame_sign = get_frame_sign(frame); + const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const vec2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const vec2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const vec2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const vec2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb; + const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb; + const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset2).rgb; + const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset3).rgb; + const vec3 
sample4 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset4).rgb; + const vec3 sample5 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset5).rgb; + const vec3 sample6 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset5).rgb; + const vec3 sample7 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset4).rgb; + const vec3 sample8 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset3).rgb; + const vec3 sample9 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset2).rgb; + const vec3 sample10 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb; + const vec3 sample11 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11); +} + +vec3 tex2Daa16x(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 16-superqueens pattern where no 3 points are + // exactly collinear. + // . . Q . . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (2.0, 0.0)/16 + // . . . . . . . . . Q . . . . . . : off =(-7.5, -7.5)/16 + (9.0, 1.0)/16 + // . . . . . . . . . . . . Q . . . : off =(-7.5, -7.5)/16 + (12.0, 2.0)/16 + // . . . . Q . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (4.0, 3.0)/16 + // . . . . . . . . Q . . . . . . . : off =(-7.5, -7.5)/16 + (8.0, 4.0)/16 + // . . . . . . . . . . . . . . Q . : off =(-7.5, -7.5)/16 + (14.0, 5.0)/16 + // Q . . . . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (0.0, 6.0)/16 + // . . . . . . . . . . Q . . . . . : off =(-7.5, -7.5)/16 + (10.0, 7.0)/16 + // . . . . . Q . . . . . . . . . . : off =(-7.5, -7.5)/16 + (5.0, 8.0)/16 + // . . . . . . . . . . . . . . . Q : off =(-7.5, -7.5)/16 + (15.0, 9.0)/16 + // . Q . . . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (1.0, 10.0)/16 + // . 
. . . . . . Q . . . . . . . . : off =(-7.5, -7.5)/16 + (7.0, 11.0)/16 + // . . . . . . . . . . . Q . . . . : off =(-7.5, -7.5)/16 + (11.0, 12.0)/16 + // . . . Q . . . . . . . . . . . . : off =(-7.5, -7.5)/16 + (3.0, 13.0)/16 + // . . . . . . Q . . . . . . . . . : off =(-7.5, -7.5)/16 + (6.0, 14.0)/16 + // . . . . . . . . . . . . . Q . . : off =(-7.5, -7.5)/16 + (13.0, 15.0)/16 + const float grid_size = 16.0; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; + const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: + const vec2 xy_offset0 = xy_start_offset + vec2(2.0, 0.0) * xy_step; + const vec2 xy_offset1 = xy_start_offset + vec2(9.0, 1.0) * xy_step; + const vec2 xy_offset2 = xy_start_offset + vec2(12.0, 2.0) * xy_step; + const vec2 xy_offset3 = xy_start_offset + vec2(4.0, 3.0) * xy_step; + const vec2 xy_offset4 = xy_start_offset + vec2(8.0, 4.0) * xy_step; + const vec2 xy_offset5 = xy_start_offset + vec2(14.0, 5.0) * xy_step; + const vec2 xy_offset6 = xy_start_offset + vec2(0.0, 6.0) * xy_step; + const vec2 xy_offset7 = xy_start_offset + vec2(10.0, 7.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. 
+ const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const vec3 w8 = w7.bgr; + const vec3 w9 = w6.bgr; + const vec3 w10 = w5.bgr; + const vec3 w11 = w4.bgr; + const vec3 w12 = w3.bgr; + const vec3 w13 = w2.bgr; + const vec3 w14 = w1.bgr; + const vec3 w15 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later: + const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7; + const vec3 w_sum = half_sum + half_sum.bgr; + const vec3 w_sum_inv = vec3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter. 
+ const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const vec2 frame_sign = get_frame_sign(frame); + const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const vec2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const vec2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const vec2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const vec2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const vec2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const vec2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + // Load samples, linearizing if necessary, etc.: + const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb; + const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb; + const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset2).rgb; + const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset3).rgb; + const vec3 sample4 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset4).rgb; + const vec3 sample5 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset5).rgb; + const vec3 sample6 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset6).rgb; + const vec3 sample7 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset7).rgb; + const vec3 sample8 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset7).rgb; + const vec3 sample9 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset6).rgb; + const vec3 sample10 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset5).rgb; + const vec3 sample11 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset4).rgb; + const vec3 sample12 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset3).rgb; + const vec3 
sample13 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset2).rgb; + const vec3 sample14 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb; + const vec3 sample15 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15); +} + +vec3 tex2Daa20x(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 20-superqueens pattern where no 3 points are + // exactly collinear and superqueens have a squared attack radius of 13. Returns the weight-normalized per-channel sum of 20 samples fetched with tex2Daa_tiled_linearize() around tex_uv; each row below marks one sample (Q) and gives its xy offset as a fraction of subpixel_support_diameter. + // . . . . . . . Q . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (7.0, 0.0)/20 + // . . . . . . . . . . . . . . . . Q . . . : off =(-9.5, -9.5)/20 + (16.0, 1.0)/20 + // . . . . . . . . . . . Q . . . . . . . . : off =(-9.5, -9.5)/20 + (11.0, 2.0)/20 + // . Q . . . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (1.0, 3.0)/20 + // . . . . . Q . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (5.0, 4.0)/20 + // . . . . . . . . . . . . . . . Q . . . . : off =(-9.5, -9.5)/20 + (15.0, 5.0)/20 + // . . . . . . . . . . Q . . . . . . . . . : off =(-9.5, -9.5)/20 + (10.0, 6.0)/20 + // . . . . . . . . . . . . . . . . . . . Q : off =(-9.5, -9.5)/20 + (19.0, 7.0)/20 + // . . Q . . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (2.0, 8.0)/20 + // . . . . . . Q . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (6.0, 9.0)/20 + // . . . . . . . . . . . . . Q . . . . . . : off =(-9.5, -9.5)/20 + (13.0, 10.0)/20 + // . . . . . . . . . . . . . . . . . Q . . : off =(-9.5, -9.5)/20 + (17.0, 11.0)/20 + // Q . . . . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (0.0, 12.0)/20 + // . . . . . . . . . Q . . . . .
. . . . . : off =(-9.5, -9.5)/20 + (9.0, 13.0)/20 + // . . . . Q . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (4.0, 14.0)/20 + // . . . . . . . . . . . . . . Q . . . . . : off =(-9.5, -9.5)/20 + (14.0, 15.0)/20 + // . . . . . . . . . . . . . . . . . . Q . : off =(-9.5, -9.5)/20 + (18.0, 16.0)/20 + // . . . . . . . . Q . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (8.0, 17.0)/20 + // . . . Q . . . . . . . . . . . . . . . . : off =(-9.5, -9.5)/20 + (3.0, 18.0)/20 + // . . . . . . . . . . . . Q . . . . . . . : off =(-9.5, -9.5)/20 + (12.0, 19.0)/20 + const float grid_size = 20.0; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; + const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: only rows 0-9 are computed, because the offset of row 19-k is the negation of the offset of row k. + const vec2 xy_offset0 = xy_start_offset + vec2(7.0, 0.0) * xy_step; + const vec2 xy_offset1 = xy_start_offset + vec2(16.0, 1.0) * xy_step; + const vec2 xy_offset2 = xy_start_offset + vec2(11.0, 2.0) * xy_step; + const vec2 xy_offset3 = xy_start_offset + vec2(1.0, 3.0) * xy_step; + const vec2 xy_offset4 = xy_start_offset + vec2(5.0, 4.0) * xy_step; + const vec2 xy_offset5 = xy_start_offset + vec2(15.0, 5.0) * xy_step; + const vec2 xy_offset6 = xy_start_offset + vec2(10.0, 6.0) * xy_step; + const vec2 xy_offset7 = xy_start_offset + vec2(19.0, 7.0) * xy_step; + const vec2 xy_offset8 = xy_start_offset + vec2(2.0, 8.0) * xy_step; + const vec2 xy_offset9 = xy_start_offset + vec2(6.0, 9.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. The weight of mirrored sample 19-k is w[k] with its red and blue components swapped (.bgr).
+ const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const vec3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance); + const vec3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance); + const vec3 w10 = w9.bgr; + const vec3 w11 = w8.bgr; + const vec3 w12 = w7.bgr; + const vec3 w13 = w6.bgr; + const vec3 w14 = w5.bgr; + const vec3 w15 = w4.bgr; + const vec3 w16 = w3.bgr; + const vec3 w17 = w2.bgr; + const vec3 w18 = w1.bgr; + const vec3 w19 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later (half_sum.bgr equals w10 + ... + w19): + const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9; + const vec3 w_sum = half_sum + half_sum.bgr; + const vec3 w_sum_inv = vec3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter.
+ const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const vec2 frame_sign = get_frame_sign(frame); + const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const vec2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const vec2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const vec2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const vec2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const vec2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const vec2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + const vec2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign); + const vec2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign); + // Load samples, linearizing if necessary, etc. (samples 10-19 reuse uv offsets 9-0 with the opposite sign): + const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb; + const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb; + const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset2).rgb; + const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset3).rgb; + const vec3 sample4 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset4).rgb; + const vec3 sample5 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset5).rgb; + const vec3 sample6 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset6).rgb; + const vec3 sample7 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset7).rgb; + const vec3 sample8 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset8).rgb; + const vec3 sample9 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset9).rgb; + const vec3 sample10 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset9).rgb; + const vec3 sample11 =
tex2Daa_tiled_linearize(texture, tex_uv - uv_offset8).rgb; + const vec3 sample12 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset7).rgb; + const vec3 sample13 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset6).rgb; + const vec3 sample14 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset5).rgb; + const vec3 sample15 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset4).rgb; + const vec3 sample16 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset3).rgb; + const vec3 sample17 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset2).rgb; + const vec3 sample18 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb; + const vec3 sample19 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 + + w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19); +} + +vec3 tex2Daa24x(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Use a diagonally symmetric 24-superqueens pattern where no 3 points are + // exactly collinear and superqueens have a squared attack radius of 13. Returns the weight-normalized per-channel sum of 24 samples fetched with tex2Daa_tiled_linearize() around tex_uv; each row below marks one sample (Q) and gives its xy offset as a fraction of subpixel_support_diameter. + // . . . . . . Q . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (6.0, 0.0)/24 + // . . . . . . . . . . . . . . . . Q . . . . . . . : off =(-11.5, -11.5)/24 + (16.0, 1.0)/24 + // . . . . . . . . . . Q . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (10.0, 2.0)/24 + // . . . . . . . . . . . . . . . . . . . . . Q . . : off =(-11.5, -11.5)/24 + (21.0, 3.0)/24 + // . . . . . Q . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (5.0, 4.0)/24 + // . . . . . . . . . . . . . . . Q . . . . . . . . : off =(-11.5, -11.5)/24 + (15.0, 5.0)/24 + // . Q .
. . . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (1.0, 6.0)/24 + // . . . . . . . . . . . Q . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (11.0, 7.0)/24 + // . . . . . . . . . . . . . . . . . . . Q . . . . : off =(-11.5, -11.5)/24 + (19.0, 8.0)/24 + // . . . . . . . . . . . . . . . . . . . . . . . Q : off =(-11.5, -11.5)/24 + (23.0, 9.0)/24 + // . . . Q . . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (3.0, 10.0)/24 + // . . . . . . . . . . . . . . Q . . . . . . . . . : off =(-11.5, -11.5)/24 + (14.0, 11.0)/24 + // . . . . . . . . . Q . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (9.0, 12.0)/24 + // . . . . . . . . . . . . . . . . . . . . Q . . . : off =(-11.5, -11.5)/24 + (20.0, 13.0)/24 + // Q . . . . . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (0.0, 14.0)/24 + // . . . . Q . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (4.0, 15.0)/24 + // . . . . . . . . . . . . Q . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (12.0, 16.0)/24 + // . . . . . . . . . . . . . . . . . . . . . . Q . : off =(-11.5, -11.5)/24 + (22.0, 17.0)/24 + // . . . . . . . . Q . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (8.0, 18.0)/24 + // . . . . . . . . . . . . . . . . . . Q . . . . . : off =(-11.5, -11.5)/24 + (18.0, 19.0)/24 + // . . Q . . . . . . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (2.0, 20.0)/24 + // . . . . . . . . . . . . . Q . . . . . . . . . . : off =(-11.5, -11.5)/24 + (13.0, 21.0)/24 + // . . . . . . . Q . . . . . . . . . . . . . . . . : off =(-11.5, -11.5)/24 + (7.0, 22.0)/24 + // . . . . . . . . . . . . . . . . . Q . . . . . .
: off =(-11.5, -11.5)/24 + (17.0, 23.0)/24 + const float grid_size = 24.0; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; + const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample. Exploit diagonal symmetry: only rows 0-11 are computed, because the offset of row 23-k is the negation of the offset of row k. + const vec2 xy_offset0 = xy_start_offset + vec2(6.0, 0.0) * xy_step; + const vec2 xy_offset1 = xy_start_offset + vec2(16.0, 1.0) * xy_step; + const vec2 xy_offset2 = xy_start_offset + vec2(10.0, 2.0) * xy_step; + const vec2 xy_offset3 = xy_start_offset + vec2(21.0, 3.0) * xy_step; + const vec2 xy_offset4 = xy_start_offset + vec2(5.0, 4.0) * xy_step; + const vec2 xy_offset5 = xy_start_offset + vec2(15.0, 5.0) * xy_step; + const vec2 xy_offset6 = xy_start_offset + vec2(1.0, 6.0) * xy_step; + const vec2 xy_offset7 = xy_start_offset + vec2(11.0, 7.0) * xy_step; + const vec2 xy_offset8 = xy_start_offset + vec2(19.0, 8.0) * xy_step; + const vec2 xy_offset9 = xy_start_offset + vec2(23.0, 9.0) * xy_step; + const vec2 xy_offset10 = xy_start_offset + vec2(3.0, 10.0) * xy_step; + const vec2 xy_offset11 = xy_start_offset + vec2(14.0, 11.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. The weight of mirrored sample 23-k is w[k] with its red and blue components swapped (.bgr).
+ const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const vec3 w8 = eval_unorm_rgb_weights(xy_offset8, final_axis_importance); + const vec3 w9 = eval_unorm_rgb_weights(xy_offset9, final_axis_importance); + const vec3 w10 = eval_unorm_rgb_weights(xy_offset10, final_axis_importance); + const vec3 w11 = eval_unorm_rgb_weights(xy_offset11, final_axis_importance); + const vec3 w12 = w11.bgr; + const vec3 w13 = w10.bgr; + const vec3 w14 = w9.bgr; + const vec3 w15 = w8.bgr; + const vec3 w16 = w7.bgr; + const vec3 w17 = w6.bgr; + const vec3 w18 = w5.bgr; + const vec3 w19 = w4.bgr; + const vec3 w20 = w3.bgr; + const vec3 w21 = w2.bgr; + const vec3 w22 = w1.bgr; + const vec3 w23 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later (half_sum.bgr equals w12 + ... + w23): + const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + + w5 + w6 + w7 + w8 + w9 + w10 + w11; + const vec3 w_sum = half_sum + half_sum.bgr; + const vec3 w_sum_inv = vec3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter.
+ const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + // Get uv sample offsets, mirror on odd frames if directed, and exploit + // diagonal symmetry: + const vec2 frame_sign = get_frame_sign(frame); + const vec2 uv_offset0 = mul(true_pixel_to_tex_uv, xy_offset0 * frame_sign); + const vec2 uv_offset1 = mul(true_pixel_to_tex_uv, xy_offset1 * frame_sign); + const vec2 uv_offset2 = mul(true_pixel_to_tex_uv, xy_offset2 * frame_sign); + const vec2 uv_offset3 = mul(true_pixel_to_tex_uv, xy_offset3 * frame_sign); + const vec2 uv_offset4 = mul(true_pixel_to_tex_uv, xy_offset4 * frame_sign); + const vec2 uv_offset5 = mul(true_pixel_to_tex_uv, xy_offset5 * frame_sign); + const vec2 uv_offset6 = mul(true_pixel_to_tex_uv, xy_offset6 * frame_sign); + const vec2 uv_offset7 = mul(true_pixel_to_tex_uv, xy_offset7 * frame_sign); + const vec2 uv_offset8 = mul(true_pixel_to_tex_uv, xy_offset8 * frame_sign); + const vec2 uv_offset9 = mul(true_pixel_to_tex_uv, xy_offset9 * frame_sign); + const vec2 uv_offset10 = mul(true_pixel_to_tex_uv, xy_offset10 * frame_sign); + const vec2 uv_offset11 = mul(true_pixel_to_tex_uv, xy_offset11 * frame_sign); + // Load samples, linearizing if necessary, etc. (samples 12-23 reuse uv offsets 11-0 with the opposite sign): + const vec3 sample0 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset0).rgb; + const vec3 sample1 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset1).rgb; + const vec3 sample2 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset2).rgb; + const vec3 sample3 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset3).rgb; + const vec3 sample4 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset4).rgb; + const vec3 sample5 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset5).rgb; + const vec3 sample6 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset6).rgb; + const vec3 sample7 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset7).rgb; + const vec3 sample8 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset8).rgb; + const vec3 sample9 =
tex2Daa_tiled_linearize(texture, tex_uv + uv_offset9).rgb; + const vec3 sample10 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset10).rgb; + const vec3 sample11 = tex2Daa_tiled_linearize(texture, tex_uv + uv_offset11).rgb; + const vec3 sample12 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset11).rgb; + const vec3 sample13 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset10).rgb; + const vec3 sample14 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset9).rgb; + const vec3 sample15 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset8).rgb; + const vec3 sample16 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset7).rgb; + const vec3 sample17 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset6).rgb; + const vec3 sample18 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset5).rgb; + const vec3 sample19 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset4).rgb; + const vec3 sample20 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset3).rgb; + const vec3 sample21 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset2).rgb; + const vec3 sample22 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset1).rgb; + const vec3 sample23 = tex2Daa_tiled_linearize(texture, tex_uv - uv_offset0).rgb; + // Sum weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15 + + w16 * sample16 + w17 * sample17 + w18 * sample18 + w19 * sample19 + + w20 * sample20 + w21 * sample21 + w22 * sample22 + w23 * sample23); +} + +vec3 tex2Daa_debug_16x_regular(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Sample on a regular 4x4 grid. This is mainly for testing. Samples are numbered row-major (sample k sits in column k%4, row k/4) and the result is their weight-normalized per-channel sum. The frame argument is not used by this function.
+ const float grid_size = 4.0; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const vec2 xy_step = vec2(1.0)/grid_size * subpixel_support_diameter; + const vec2 xy_start_offset = vec2(0.5 - grid_size*0.5) * xy_step; + // Get the xy offset of each sample (only grid rows 0 and 1; rows 2 and 3 are covered by the symmetry used for the weights below): + const vec2 xy_offset0 = xy_start_offset + vec2(0.0, 0.0) * xy_step; + const vec2 xy_offset1 = xy_start_offset + vec2(1.0, 0.0) * xy_step; + const vec2 xy_offset2 = xy_start_offset + vec2(2.0, 0.0) * xy_step; + const vec2 xy_offset3 = xy_start_offset + vec2(3.0, 0.0) * xy_step; + const vec2 xy_offset4 = xy_start_offset + vec2(0.0, 1.0) * xy_step; + const vec2 xy_offset5 = xy_start_offset + vec2(1.0, 1.0) * xy_step; + const vec2 xy_offset6 = xy_start_offset + vec2(2.0, 1.0) * xy_step; + const vec2 xy_offset7 = xy_start_offset + vec2(3.0, 1.0) * xy_step; + // Compute subpixel weights, and exploit diagonal symmetry for speed. + // (We can't exploit vertical or horizontal symmetry due to uncertain + // subpixel offsets. We could fix that by rotating xy offsets with the + // subpixel structure, but...no.) Sample 15-k sits at the negated offset of sample k, so its weight is w[k] with red and blue swapped (.bgr).
+ const vec3 w0 = eval_unorm_rgb_weights(xy_offset0, final_axis_importance); + const vec3 w1 = eval_unorm_rgb_weights(xy_offset1, final_axis_importance); + const vec3 w2 = eval_unorm_rgb_weights(xy_offset2, final_axis_importance); + const vec3 w3 = eval_unorm_rgb_weights(xy_offset3, final_axis_importance); + const vec3 w4 = eval_unorm_rgb_weights(xy_offset4, final_axis_importance); + const vec3 w5 = eval_unorm_rgb_weights(xy_offset5, final_axis_importance); + const vec3 w6 = eval_unorm_rgb_weights(xy_offset6, final_axis_importance); + const vec3 w7 = eval_unorm_rgb_weights(xy_offset7, final_axis_importance); + const vec3 w8 = w7.bgr; + const vec3 w9 = w6.bgr; + const vec3 w10 = w5.bgr; + const vec3 w11 = w4.bgr; + const vec3 w12 = w3.bgr; + const vec3 w13 = w2.bgr; + const vec3 w14 = w1.bgr; + const vec3 w15 = w0.bgr; + // Get the weight sum to normalize the total to 1.0 later (half_sum.bgr equals w8 + ... + w15): + const vec3 half_sum = w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7; + const vec3 w_sum = half_sum + half_sum.bgr; + const vec3 w_sum_inv = vec3(1.0)/(w_sum); + // Scale the pixel-space to texture offset matrix by the pixel diameter.
+ const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + // Get uv sample offsets, taking advantage of row alignment (sample 0 is 1.5 steps before the center on both axes; every other sample is reached by adding whole x/y steps): + const vec2 uv_step_x = mul(true_pixel_to_tex_uv, vec2(xy_step.x, 0.0)); + const vec2 uv_step_y = mul(true_pixel_to_tex_uv, vec2(0.0, xy_step.y)); + const vec2 uv_offset0 = -1.5 * (uv_step_x + uv_step_y); + const vec2 sample0_uv = tex_uv + uv_offset0; + const vec2 sample4_uv = sample0_uv + uv_step_y; + const vec2 sample8_uv = sample0_uv + uv_step_y * 2.0; + const vec2 sample12_uv = sample0_uv + uv_step_y * 3.0; + // Load samples, linearizing if necessary, etc.: + const vec3 sample0 = tex2Daa_tiled_linearize(texture, sample0_uv).rgb; + const vec3 sample1 = tex2Daa_tiled_linearize(texture, sample0_uv + uv_step_x).rgb; + const vec3 sample2 = tex2Daa_tiled_linearize(texture, sample0_uv + uv_step_x * 2.0).rgb; + const vec3 sample3 = tex2Daa_tiled_linearize(texture, sample0_uv + uv_step_x * 3.0).rgb; + const vec3 sample4 = tex2Daa_tiled_linearize(texture, sample4_uv).rgb; + const vec3 sample5 = tex2Daa_tiled_linearize(texture, sample4_uv + uv_step_x).rgb; + const vec3 sample6 = tex2Daa_tiled_linearize(texture, sample4_uv + uv_step_x * 2.0).rgb; + const vec3 sample7 = tex2Daa_tiled_linearize(texture, sample4_uv + uv_step_x * 3.0).rgb; + const vec3 sample8 = tex2Daa_tiled_linearize(texture, sample8_uv).rgb; + const vec3 sample9 = tex2Daa_tiled_linearize(texture, sample8_uv + uv_step_x).rgb; + const vec3 sample10 = tex2Daa_tiled_linearize(texture, sample8_uv + uv_step_x * 2.0).rgb; + const vec3 sample11 = tex2Daa_tiled_linearize(texture, sample8_uv + uv_step_x * 3.0).rgb; + const vec3 sample12 = tex2Daa_tiled_linearize(texture, sample12_uv).rgb; + const vec3 sample13 = tex2Daa_tiled_linearize(texture, sample12_uv + uv_step_x).rgb; + const vec3 sample14 = tex2Daa_tiled_linearize(texture, sample12_uv + uv_step_x * 2.0).rgb; + const vec3 sample15 = tex2Daa_tiled_linearize(texture, sample12_uv + uv_step_x * 3.0).rgb; + // Sum
weighted samples (weight sum must equal 1.0 for each channel): + return w_sum_inv * ( + w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 + + w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 + + w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 + + w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15); +} + +vec3 tex2Daa_debug_dynamic(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // This function is for testing only: Use an NxN grid with dynamic weights. Every grid cell gets its own eval_unorm_rgb_weights() weight, and the weighted sample sum is divided by the weight total, so the result is a per-channel weighted average around tex_uv. The frame argument is not used by this function. + const int grid_size = 8; + assign_aa_cubic_constants(); + const vec4 ssd_fai = get_subpixel_support_diam_and_final_axis_importance(); + const vec2 subpixel_support_diameter = ssd_fai.xy; + const vec2 final_axis_importance = ssd_fai.zw; + const float grid_radius_in_samples = (float(grid_size) - 1.0)/2.0; + const vec2 filter_space_offset_step = + subpixel_support_diameter/vec2(grid_size); + const vec2 sample0_filter_space_offset = + -grid_radius_in_samples * filter_space_offset_step; + // Compute xy sample offsets and subpixel weights (accumulators are built with explicit vec3(0.0) constructors because GLSL does not implicitly convert a float to a vec3): + vec3 weights[grid_size * grid_size]; + vec3 weight_sum = vec3(0.0); + for(int i = 0; i < grid_size; ++i) + { + for(int j = 0; j < grid_size; ++j) + { + // Weights based on xy distances: + const vec2 offset = sample0_filter_space_offset + + vec2(j, i) * filter_space_offset_step; + const vec3 weight = eval_unorm_rgb_weights(offset, final_axis_importance); + weights[i*grid_size + j] = weight; + weight_sum += weight; + } + } + // Get uv offset vectors along x and y directions: + const vec2x2 true_pixel_to_tex_uv = + vec2x2(vec4(pixel_to_tex_uv * aa_pixel_diameter)); + const vec2 uv_offset_step_x = mul(true_pixel_to_tex_uv, + vec2(filter_space_offset_step.x, 0.0)); + const vec2 uv_offset_step_y = mul(true_pixel_to_tex_uv, + vec2(0.0, filter_space_offset_step.y)); + // Get a starting sample location: + const vec2 sample0_uv_offset = -grid_radius_in_samples * + (uv_offset_step_x + uv_offset_step_y);
+ const vec2 sample0_uv = tex_uv + sample0_uv_offset; + // Load, weight, and sum [linearized] samples (loop indices are converted with float() before scaling the uv steps, matching the explicit float(grid_size) above): + vec3 sum = vec3(0.0); + const vec3 weight_sum_inv = vec3(1.0)/weight_sum; + for(int i = 0; i < grid_size; ++i) + { + const vec2 row_i_first_sample_uv = + sample0_uv + float(i) * uv_offset_step_y; + for(int j = 0; j < grid_size; ++j) + { + const vec2 sample_uv = + row_i_first_sample_uv + float(j) * uv_offset_step_x; + sum += weights[i*grid_size + j] * + tex2Daa_tiled_linearize(texture, sample_uv).rgb; + } + } + return sum * weight_sum_inv; +} + + +/////////////////////// ANTIALIASING CODEPATH SELECTION ////////////////////// + +vec3 tex2Daa(const sampler2D texture, const vec2 tex_uv, + const vec2x2 pixel_to_tex_uv, const float frame) +{ + // Statically switch between antialiasing modes/levels: + return aa_level < 0.5 ? tex2D_linearize(texture, tex_uv).rgb : + aa_level < 3.5 ? tex2Daa_subpixel_weights_only( + texture, tex_uv, pixel_to_tex_uv) : + aa_level < 4.5 ? tex2Daa4x(texture, tex_uv, pixel_to_tex_uv, frame) : + aa_level < 5.5 ? tex2Daa5x(texture, tex_uv, pixel_to_tex_uv, frame) : + aa_level < 6.5 ? tex2Daa6x(texture, tex_uv, pixel_to_tex_uv, frame) : + aa_level < 7.5 ? tex2Daa7x(texture, tex_uv, pixel_to_tex_uv, frame) : + aa_level < 11.5 ? tex2Daa8x(texture, tex_uv, pixel_to_tex_uv, frame) : + aa_level < 15.5 ? tex2Daa12x(texture, tex_uv, pixel_to_tex_uv, frame) : + aa_level < 19.5 ? tex2Daa16x(texture, tex_uv, pixel_to_tex_uv, frame) : + aa_level < 23.5 ? tex2Daa20x(texture, tex_uv, pixel_to_tex_uv, frame) : + aa_level < 253.5 ? tex2Daa24x(texture, tex_uv, pixel_to_tex_uv, frame) : + aa_level < 254.5 ?
tex2Daa_debug_16x_regular( + texture, tex_uv, pixel_to_tex_uv, frame) : + tex2Daa_debug_dynamic(texture, tex_uv, pixel_to_tex_uv, frame); +} + + +#endif // TEX2DANTIALIAS_H + diff --git a/crt/shaders/crt-royale/src/user-preset-constants.h b/crt/shaders/crt-royale/src/user-preset-constants.h new file mode 100644 index 0000000..ad70a9a --- /dev/null +++ b/crt/shaders/crt-royale/src/user-preset-constants.h @@ -0,0 +1,58 @@ +#ifndef USER_CGP_CONSTANTS_H +#define USER_CGP_CONSTANTS_H + +// IMPORTANT: +// These constants MUST be set appropriately for the settings in crt-royale.cgp +// (or whatever related .cgp file you're using). If they aren't, you're likely +// to get artifacts, the wrong phosphor mask size, etc. I wish these could be +// set directly in the .cgp file to make things easier, but...they can't. + +// PASS SCALES AND RELATED CONSTANTS: +// Copy the absolute scale_x for BLOOM_APPROX. There are two major versions of +// this shader: One does a viewport-scale bloom, and the other skips it. The +// latter benefits from a higher bloom_approx_scale_x, so save both separately: +const float bloom_approx_size_x = 320.0; +const float bloom_approx_size_x_for_fake = 400.0; +// Copy the viewport-relative scales of the phosphor mask resize passes +// (MASK_RESIZE and the pass immediately preceding it): +const vec2 mask_resize_viewport_scale = vec2(0.0625, 0.0625); +// Copy the geom_max_aspect_ratio used to calculate the MASK_RESIZE scales, etc.: +const float geom_max_aspect_ratio = 4.0/3.0; + +// PHOSPHOR MASK TEXTURE CONSTANTS: +// Set the following constants to reflect the properties of the phosphor mask +// texture named in crt-royale.cgp. The shader optionally resizes a mask tile +// based on user settings, then repeats a single tile until filling the screen. +// The shader must know the input texture size (default 64x64), and to manually +// resize, it must also know the horizontal triads per tile (default 8). 
+const vec2 mask_texture_small_size = vec2(64.0); +const vec2 mask_texture_large_size = vec2(512.0); +const float mask_triads_per_tile = 8.0; +// We need the average brightness of the phosphor mask to compensate for the +// dimming it causes. The following four values are roughly correct for the +// masks included with the shader. Update the value for any LUT texture you +// change. [Un]comment "#define PHOSPHOR_MASK_GRILLE14" depending on whether +// the loaded aperture grille uses 14-pixel or 15-pixel stripes (default 15). +//#define PHOSPHOR_MASK_GRILLE14 +const float mask_grille14_avg_color = 50.6666666/255.0; + // TileableLinearApertureGrille14Wide7d33Spacing*.png + // TileableLinearApertureGrille14Wide10And6Spacing*.png +const float mask_grille15_avg_color = 53.0/255.0; + // TileableLinearApertureGrille15Wide6d33Spacing*.png + // TileableLinearApertureGrille15Wide8And5d5Spacing*.png +const float mask_slot_avg_color = 46.0/255.0; + // TileableLinearSlotMask15Wide9And4d5Horizontal8VerticalSpacing*.png + // TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing*.png +const float mask_shadow_avg_color = 41.0/255.0; + // TileableLinearShadowMask*.png + // TileableLinearShadowMaskEDP*.png + +#ifdef PHOSPHOR_MASK_GRILLE14 + const float mask_grille_avg_color = mask_grille14_avg_color; +#else + const float mask_grille_avg_color = mask_grille15_avg_color; +#endif + + +#endif // USER_CGP_CONSTANTS_H + diff --git a/crt/shaders/crt-royale/user-settings.h b/crt/shaders/crt-royale/user-settings.h new file mode 100644 index 0000000..b631311 --- /dev/null +++ b/crt/shaders/crt-royale/user-settings.h @@ -0,0 +1,359 @@ +#ifndef USER_SETTINGS_H +#define USER_SETTINGS_H + +///////////////////////////// DRIVER CAPABILITIES //////////////////////////// + +// The Cg compiler uses different "profiles" with different capabilities. +// This shader requires a Cg compilation profile >= arbfp1, but a few options +// require higher profiles like fp30 or fp40. 
The shader can't detect profile +// or driver capabilities, so instead you must comment or uncomment the lines +// below with "//" before "#define." Disable an option if you get compilation +// errors resembling those listed. Generally speaking, all of these options +// will run on nVidia cards, but only DRIVERS_ALLOW_TEX2DBIAS (if that) is +// likely to run on ATI/AMD, due to the Cg compiler's profile limitations. + +// Derivatives: Unsupported on fp20, ps_1_1, ps_1_2, ps_1_3, and arbfp1. +// Among other things, derivatives help us fix anisotropic filtering artifacts +// with curved manually tiled phosphor mask coords. Related errors: +// error C3004: function "vec2 ddx(vec2);" not supported in this profile +// error C3004: function "vec2 ddy(vec2);" not supported in this profile + //#define DRIVERS_ALLOW_DERIVATIVES + +// Fine derivatives: Unsupported on older ATI cards. +// Fine derivatives enable 2x2 fragment block communication, letting us perform +// fast single-pass blur operations. If your card uses coarse derivatives and +// these are enabled, blurs could look broken. Derivatives are a prerequisite. + #ifdef DRIVERS_ALLOW_DERIVATIVES + #define DRIVERS_ALLOW_FINE_DERIVATIVES + #endif + +// Dynamic looping: Requires an fp30 or newer profile. +// This makes phosphor mask resampling faster in some cases. Related errors: +// error C5013: profile does not support "for" statements and "for" could not +// be unrolled + //#define DRIVERS_ALLOW_DYNAMIC_BRANCHES + +// Without DRIVERS_ALLOW_DYNAMIC_BRANCHES, we need to use unrollable loops. +// Using one static loop avoids overhead if the user is right, but if the user +// is wrong (loops are allowed), breaking a loop into if-blocked pieces with a +// binary search can potentially save some iterations. However, it may fail: +// error C6001: Temporary register limit of 32 exceeded; 35 registers +// needed to compile program + //#define ACCOMODATE_POSSIBLE_DYNAMIC_LOOPS + +// tex2Dlod: Requires an fp40 or newer profile. 
This can be used to disable +// anisotropic filtering, thereby fixing related artifacts. Related errors: +// error C3004: function "vec4 tex2Dlod(sampler2D, vec4);" not supported in +// this profile + //#define DRIVERS_ALLOW_TEX2DLOD + +// tex2Dbias: Requires an fp30 or newer profile. This can be used to alleviate +// artifacts from anisotropic filtering and mipmapping. Related errors: +// error C3004: function "vec4 tex2Dbias(sampler2D, vec4);" not supported +// in this profile + //#define DRIVERS_ALLOW_TEX2DBIAS + +// Integrated graphics compatibility: Integrated graphics like Intel HD 4000 +// impose stricter limitations on register counts and instructions. Enable +// INTEGRATED_GRAPHICS_COMPATIBILITY_MODE if you still see error C6001 or: +// error C6002: Instruction limit of 1024 exceeded: 1523 instructions needed +// to compile program. +// Enabling integrated graphics compatibility mode will automatically disable: +// 1.) PHOSPHOR_MASK_MANUALLY_RESIZE: The phosphor mask will be softer. +// (This may be reenabled in a later release.) +// 2.) RUNTIME_GEOMETRY_MODE +// 3.) The high-quality 4x4 Gaussian resize for the bloom approximation + //#define INTEGRATED_GRAPHICS_COMPATIBILITY_MODE + + +//////////////////////////// USER CODEPATH OPTIONS /////////////////////////// + +// To disable a #define option, turn its line into a comment with "//." + +// RUNTIME VS. COMPILE-TIME OPTIONS (Major Performance Implications): +// Enable runtime shader parameters in the Retroarch (etc.) GUI? They override +// many of the options in this file and allow real-time tuning, but many of +// them are slower. Disabling them and using this text file will boost FPS. +#define RUNTIME_SHADER_PARAMS_ENABLE +// Specify the phosphor bloom sigma at runtime? This option is 10% slower, but +// it's the only way to do a wide-enough full bloom with a runtime dot pitch. +#define RUNTIME_PHOSPHOR_BLOOM_SIGMA +// Specify antialiasing weight parameters at runtime? 
(Costs ~20% with cubics) +#define RUNTIME_ANTIALIAS_WEIGHTS +// Specify subpixel offsets at runtime? (WARNING: EXTREMELY EXPENSIVE!) +//#define RUNTIME_ANTIALIAS_SUBPIXEL_OFFSETS +// Make beam_horiz_filter and beam_horiz_linear_rgb_weight into runtime shader +// parameters? This will require more math or dynamic branching. +#define RUNTIME_SCANLINES_HORIZ_FILTER_COLORSPACE +// Specify the tilt at runtime? This makes things about 3% slower. +#define RUNTIME_GEOMETRY_TILT +// Specify the geometry mode at runtime? +#define RUNTIME_GEOMETRY_MODE +// Specify the phosphor mask type (aperture grille, slot mask, shadow mask) and +// mode (Lanczos-resize, hardware resize, or tile 1:1) at runtime, even without +// dynamic branches? This is cheap if mask_resize_viewport_scale is small. +#define FORCE_RUNTIME_PHOSPHOR_MASK_MODE_TYPE_SELECT + +// PHOSPHOR MASK: +// Manually resize the phosphor mask for best results (slower)? Disabling this +// removes the option to do so, but it may be faster without dynamic branches. + #define PHOSPHOR_MASK_MANUALLY_RESIZE +// If we sinc-resize the mask, should we Lanczos-window it (slower but better)? + #define PHOSPHOR_MASK_RESIZE_LANCZOS_WINDOW +// Larger blurs are expensive, but we need them to blur larger triads. 
We can +// detect the right blur if the triad size is static or our profile allows +// dynamic branches, but otherwise we use the largest blur the user indicates +// they might need: + #define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_3_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_6_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_9_PIXELS + //#define PHOSPHOR_BLOOM_TRIADS_LARGER_THAN_12_PIXELS + // Here's a helpful chart: + // MaxTriadSize BlurSize MinTriadCountsByResolution + // 3.0 9.0 480/640/960/1920 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 6.0 17.0 240/320/480/960 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 9.0 25.0 160/213/320/640 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 12.0 31.0 120/160/240/480 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + // 18.0 43.0 80/107/160/320 triads at 1080p/1440p/2160p/4320p, 4:3 aspect + + +/////////////////////////////// USER PARAMETERS ////////////////////////////// + +// Note: Many of these static parameters are overridden by runtime shader +// parameters when those are enabled. However, many others are static codepath +// options that were cleaner or more convenient to code as static constants. + +// GAMMA: + const float crt_gamma_static = 2.5; // range [1, 5] + const float lcd_gamma_static = 2.2; // range [1, 5] + +// LEVELS MANAGEMENT: + // Control the final multiplicative image contrast: + const float levels_contrast_static = 1.0; // range [0, 4) + // We auto-dim to avoid clipping between passes and restore brightness + // later. Control the dim factor here: Lower values clip less but crush + // blacks more (static only for now). + const float levels_autodim_temp = 0.5; // range (0, 1] + +// HALATION/DIFFUSION/BLOOM: + // Halation weight: How much energy should be lost to electrons bouncing + // around under the CRT glass and exciting random phosphors? 
+ const float halation_weight_static = 0.0; // range [0, 1] + // Refractive diffusion weight: How much light should spread/diffuse from + // refracting through the CRT glass? + const float diffusion_weight_static = 0.075; // range [0, 1] + // Underestimate brightness: Bright areas bloom more, but we can base the + // bloom brightpass on a lower brightness to sharpen phosphors, or a higher + // brightness to soften them. Low values clip, but >= 0.8 looks okay. + const float bloom_underestimate_levels_static = 0.8; // range [0, 5] + // Blur all colors more than necessary for a softer phosphor bloom? + const float bloom_excess_static = 0.0; // range [0, 1] + // The BLOOM_APPROX pass approximates a phosphor blur early on with a small + // blurred resize of the input (convergence offsets are applied as well). + // There are three filter options (static option only for now): + // 0.) Bilinear resize: A fast, close approximation to a 4x4 resize + // if min_allowed_viewport_triads and the BLOOM_APPROX resolution are sane + // and beam_max_sigma is low. + // 1.) 3x3 resize blur: Medium speed, soft/smeared from bilinear blurring, + // always uses a static sigma regardless of beam_max_sigma or + // mask_num_triads_desired. + // 2.) True 4x4 Gaussian resize: Slowest, technically correct. + // These options are more pronounced for the fast, unbloomed shader version. + const float bloom_approx_filter_static = 2.0; + +// ELECTRON BEAM SCANLINE DISTRIBUTION: + // How many scanlines should contribute light to each pixel? Using more + // scanlines is slower (especially for a generalized Gaussian) but less + // distorted with larger beam sigmas (especially for a pure Gaussian). 
The + // max_beam_sigma at which the closest unused weight is guaranteed < + // 1.0/255.0 (for a 3x antialiased pure Gaussian) is: + // 2 scanlines, max_beam_sigma = 0.2089; distortions begin ~0.34; 141.7 FPS pure; 131.9 FPS generalized + // 3 scanlines, max_beam_sigma = 0.3879; distortions begin ~0.52; 137.5 FPS pure; 123.8 FPS generalized + // 4 scanlines, max_beam_sigma = 0.5723; distortions begin ~0.70; 134.7 FPS pure; 117.2 FPS generalized + // 5 scanlines, max_beam_sigma = 0.7591; distortions begin ~0.89; 131.6 FPS pure; 112.1 FPS generalized + // 6 scanlines, max_beam_sigma = 0.9483; distortions begin ~1.08; 127.9 FPS pure; 105.6 FPS generalized + const float beam_num_scanlines = 3.0; // range [2, 6] + // A generalized Gaussian beam varies shape with color too, not just width. + // It's slower but more flexible (static option only for now). + const bool beam_generalized_gaussian = true; + // What kind of scanline antialiasing do you want? + // 0: Sample weights at 1x; 1: Sample weights at 3x; 2: Compute an integral + // Integrals are slow (especially for generalized Gaussians) and rarely any + // better than 3x antialiasing (static option only for now). + const float beam_antialias_level = 1.0; // range [0, 2] + // Min/max standard deviations for scanline beams: Higher values widen and + // soften scanlines. Depending on other options, low min sigmas can alias. + const float beam_min_sigma_static = 0.02; // range (0, 1] + const float beam_max_sigma_static = 0.3; // range (0, 1] + // Beam width varies as a function of color: A power function (0) is more + // configurable, but a spherical function (1) gives the widest beam + // variability without aliasing (static option only for now). + const float beam_spot_shape_function = 0.0; + // Spot shape power: Powers <= 1 give smoother spot shapes but lower + // sharpness. Powers >= 1.0 are awful unless min/max sigmas are close. 
+ const float beam_spot_power_static = 1.0/3.0; // range (0, 16] + // Generalized Gaussian max shape parameters: Higher values give flatter + // scanline plateaus and steeper dropoffs, simultaneously widening and + // sharpening scanlines at the cost of aliasing. 2.0 is pure Gaussian, and + // values > ~40.0 cause artifacts with integrals. + const float beam_min_shape_static = 2.0; // range [2, 32] + const float beam_max_shape_static = 4.0; // range [2, 32] + // Generalized Gaussian shape power: Affects how quickly the distribution + // changes shape from Gaussian to steep/plateaued as color increases from 0 + // to 1.0. Higher powers appear softer for most colors, and lower powers + // appear sharper for most colors. + const float beam_shape_power_static = 1.0/4.0; // range (0, 16] + // What filter should be used to sample scanlines horizontally? + // 0: Quilez (fast), 1: Gaussian (configurable), 2: Lanczos2 (sharp) + const float beam_horiz_filter_static = 0.0; + // Standard deviation for horizontal Gaussian resampling: + const float beam_horiz_sigma_static = 0.35; // range (0, 2/3] + // Do horizontal scanline sampling in linear RGB (correct light mixing), + // gamma-encoded RGB (darker, hard spot shape, may better match bandwidth- + // limiting circuitry in some CRT's), or a weighted avg.? + const float beam_horiz_linear_rgb_weight_static = 1.0; // range [0, 1] + // Simulate scanline misconvergence? This needs 3x horizontal texture + // samples and 3x texture samples of BLOOM_APPROX and HALATION_BLUR in + // later passes (static option only for now). + const bool beam_misconvergence = true; + // Convergence offsets in x/y directions for R/G/B scanline beams in units + // of scanlines. Positive offsets go right/down; ranges [-2, 2] + const vec2 convergence_offsets_r_static = vec2(0.1, 0.2); + const vec2 convergence_offsets_g_static = vec2(0.3, 0.4); + const vec2 convergence_offsets_b_static = vec2(0.5, 0.6); + // Detect interlacing (static option only for now)? 
+ const bool interlace_detect = true; + // Assume 1080-line sources are interlaced? + const bool interlace_1080i_static = false; + // For interlaced sources, assume TFF (top-field first) or BFF order? + // (Whether this matters depends on the nature of the interlaced input.) + const bool interlace_bff_static = false; + +// ANTIALIASING: + // What AA level do you want for curvature/overscan/subpixels? Options: + // 0x (none), 1x (sample subpixels), 4x, 5x, 6x, 7x, 8x, 12x, 16x, 20x, 24x + // (Static option only for now) + const float aa_level = 12.0; // range [0, 24] + // What antialiasing filter do you want (static option only)? Options: + // 0: Box (separable), 1: Box (cylindrical), + // 2: Tent (separable), 3: Tent (cylindrical), + // 4: Gaussian (separable), 5: Gaussian (cylindrical), + // 6: Cubic* (separable), 7: Cubic* (cylindrical, poor) + // 8: Lanczos Sinc (separable), 9: Lanczos Jinc (cylindrical, poor) + // * = Especially slow with RUNTIME_ANTIALIAS_WEIGHTS + const float aa_filter = 6.0; // range [0, 9] + // Flip the sample grid on odd/even frames (static option only for now)? + const bool aa_temporal = false; + // Use RGB subpixel offsets for antialiasing? The pixel is at green, and + // the blue offset is the negative r offset; range [0, 0.5] + const vec2 aa_subpixel_r_offset_static = vec2(-1.0/3.0, 0.0);//vec2(0.0); + // Cubics: See http://www.imagemagick.org/Usage/filter/#mitchell + // 1.) "Keys cubics" with B = 1 - 2C are considered the highest quality. + // 2.) C = 0.5 (default) is Catmull-Rom; higher C's apply sharpening. + // 3.) C = 1.0/3.0 is the Mitchell-Netravali filter. + // 4.) C = 0.0 is a soft spline filter. + const float aa_cubic_c_static = 0.5; // range [0, 4] + // Standard deviation for Gaussian antialiasing: Try 0.5/aa_pixel_diameter. 
+ const float aa_gauss_sigma_static = 0.5; // range [0.0625, 1.0] + +// PHOSPHOR MASK: + // Mask type: 0 = aperture grille, 1 = slot mask, 2 = EDP shadow mask + const float mask_type_static = 1.0; // range [0, 2] + // We can sample the mask three ways. Pick 2/3 from: Pretty/Fast/Flexible. + // 0.) Sinc-resize to the desired dot pitch manually (pretty/slow/flexible). + // This requires PHOSPHOR_MASK_MANUALLY_RESIZE to be #defined. + // 1.) Hardware-resize to the desired dot pitch (ugly/fast/flexible). This + // is halfway decent with LUT mipmapping but atrocious without it. + // 2.) Tile it without resizing at a 1:1 texel:pixel ratio for flat coords + // (pretty/fast/inflexible). Each input LUT has a fixed dot pitch. + // This mode reuses the same masks, so triads will be enormous unless + // you change the mask LUT filenames in your .cgp file. + const float mask_sample_mode_static = 0.0; // range [0, 2] + // Prefer setting the triad size (0.0) or number on the screen (1.0)? + // If RUNTIME_PHOSPHOR_BLOOM_SIGMA isn't #defined, the specified triad size + // will always be used to calculate the full bloom sigma statically. + const float mask_specify_num_triads_static = 0.0; // range [0, 1] + // Specify the phosphor triad size, in pixels. Each tile (usually with 8 + // triads) will be rounded to the nearest integer tile size and clamped to + // obey minimum size constraints (imposed to reduce downsize taps) and + // maximum size constraints (imposed to have a sane MASK_RESIZE FBO size). + // To increase the size limit, double the viewport-relative scales for the + // two MASK_RESIZE passes in crt-royale.cgp and user-cgp-constants.h. 
+ // range [1, mask_texture_small_size/mask_triads_per_tile] + const float mask_triad_size_desired_static = 24.0 / 8.0; + // If mask_specify_num_triads is 1.0/true, we'll go by this instead (the + // final size will be rounded and constrained as above); default 480.0 + const float mask_num_triads_desired_static = 480.0; + // How many lobes should the sinc/Lanczos resizer use? More lobes require + // more samples and avoid moire a bit better, but some is unavoidable + // depending on the destination size (static option for now). + const float mask_sinc_lobes = 3.0; // range [2, 4] + // The mask is resized using a variable number of taps in each dimension, + // but some Cg profiles always fetch a constant number of taps no matter + // what (no dynamic branching). We can limit the maximum number of taps if + // we statically limit the minimum phosphor triad size. Larger values are + // faster, but the limit IS enforced (static option only, forever); + // range [1, mask_texture_small_size/mask_triads_per_tile] + // TODO: Make this 1.0 and compensate with smarter sampling! + const float mask_min_allowed_triad_size = 2.0; + +// GEOMETRY: + // Geometry mode: + // 0: Off (default), 1: Spherical mapping (like cgwg's), + // 2: Alt. spherical mapping (more bulbous), 3: Cylindrical/Trinitron + const float geom_mode_static = 0.0; // range [0, 3] + // Radius of curvature: Measured in units of your viewport's diagonal size. + const float geom_radius_static = 2.0; // range [1/(2*pi), 1024] + // View dist is the distance from the player to their physical screen, in + // units of the viewport's diagonal size. It controls the field of view. 
+ const float geom_view_dist_static = 2.0; // range [0.5, 1024] + // Tilt angle in radians (clockwise around up and right vectors): + const vec2 geom_tilt_angle_static = vec2(0.0, 0.0); // range [-pi, pi] + // Aspect ratio: When the true viewport size is unknown, this value is used + // to help convert between the phosphor triad size and count, along with + // the mask_resize_viewport_scale constant from user-cgp-constants.h. Set + // this equal to Retroarch's display aspect ratio (DAR) for best results; + // range [1, geom_max_aspect_ratio from user-cgp-constants.h]; + // default (256/224)*(54/47) = 1.313069909 (see below) + const float geom_aspect_ratio_static = 1.313069909; + // Before getting into overscan, here's some general aspect ratio info: + // - DAR = display aspect ratio = SAR * PAR; as in your Retroarch setting + // - SAR = storage aspect ratio = DAR / PAR; square pixel emulator frame AR + // - PAR = pixel aspect ratio = DAR / SAR; holds regardless of cropping + // Geometry processing has to "undo" the screen-space 2D DAR to calculate + // 3D view vectors, then reapplies the aspect ratio to the simulated CRT in + // uv-space. To ensure the source SAR is intended for a ~4:3 DAR, either: + // a.) Enable Retroarch's "Crop Overscan" + // b.) Readd horizontal padding: Set overscan to e.g. N*(1.0, 240.0/224.0) + // Real consoles use horizontal black padding in the signal, but emulators + // often crop this without cropping the vertical padding; a 256x224 [S]NES + // frame (8:7 SAR) is intended for a ~4:3 DAR, but a 256x240 frame is not. + // The correct [S]NES PAR is 54:47, found by blargg and NewRisingSun: + // http://board.zsnes.com/phpBB3/viewtopic.php?f=22&t=11928&start=50 + // http://forums.nesdev.com/viewtopic.php?p=24815#p24815 + // For flat output, it's okay to set DAR = [existing] SAR * [correct] PAR + // without doing a. or b., but horizontal image borders will be tighter + // than vertical ones, messing up curvature and overscan. 
Fixing the + // padding first corrects this. + // Overscan: Amount to "zoom in" before cropping. You can zoom uniformly + // or adjust x/y independently to e.g. readd horizontal padding, as noted + // above: Values < 1.0 zoom out; range (0, inf) + const vec2 geom_overscan_static = vec2(1.0, 1.0);// * 1.005 * (1.0, 240/224.0) + // Compute a proper pixel-space to texture-space matrix even without ddx()/ + // ddy()? This is ~8.5% slower but improves antialiasing/subpixel filtering + // with strong curvature (static option only for now). + const bool geom_force_correct_tangent_matrix = true; + +// BORDERS: + // Rounded border size in texture uv coords: + const float border_size_static = 0.015; // range [0, 0.5] + // Border darkness: Moderate values darken the border smoothly, and high + // values make the image very dark just inside the border: + const float border_darkness_static = 2.0; // range [0, inf) + // Border compression: High numbers compress border transitions, narrowing + // the dark border area. + const float border_compress_static = 2.5; // range [1, inf) + + +#endif // USER_SETTINGS_H +