From 6f47464db3ad038b3ba9aa62d1ac0225f39a6b47 Mon Sep 17 00:00:00 2001
From: hunterk <hunter_kaller@yahoo.com>
Date: Thu, 4 May 2017 16:34:39 -0500
Subject: [PATCH] fix beam_horiz_filter, update kurozumi, add fallback to fix
 my AMD crash

---
 crt/crt-royale_fallback.slangp                | 206 ++++++++++
 .../crt-royale/src/bind-shader-params.h       |   2 +-
 .../crt-royale-bloom-approx_fallback.slang    | 354 ++++++++++++++++++
 ...-pass-linearize-crt-gamma-bob-fields.slang |   2 +
 .../crt-royale-mask-resize-horizontal.slang   |   2 +-
 .../crt-royale/src/scanline-functions.h       |   6 +-
 presets/crt-royale-kurozumi.slangp            |   8 +-
 7 files changed, 571 insertions(+), 9 deletions(-)
 create mode 100644 crt/crt-royale_fallback.slangp
 create mode 100644 crt/shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang

diff --git a/crt/crt-royale_fallback.slangp b/crt/crt-royale_fallback.slangp
new file mode 100644
index 0000000..931f032
--- /dev/null
+++ b/crt/crt-royale_fallback.slangp
@@ -0,0 +1,206 @@
+# IMPORTANT:
+# Shader passes need to know details about the image in the mask_texture LUT
+# files, so set the following constants in user-preset-constants.h accordingly:
+# 1.) mask_triads_per_tile = (number of horizontal triads in mask texture LUT's)
+# 2.) mask_texture_small_size = (texture size of mask*texture_small LUT's)
+# 3.) mask_texture_large_size = (texture size of mask*texture_large LUT's)
+# 4.) mask_grille_avg_color = (avg. brightness of mask_grille_texture* LUT's, in [0, 1])
+# 5.) mask_slot_avg_color = (avg. brightness of mask_slot_texture* LUT's, in [0, 1])
+# 6.) mask_shadow_avg_color = (avg. brightness of mask_shadow_texture* LUT's, in [0, 1])
+# Shader passes also need to know certain scales set in this preset, but their
+# compilation model doesn't currently allow the preset file to tell them.  Make
+# sure to set the following constants in user-preset-constants.h accordingly too:
+# 1.) bloom_approx_scale_x = scale_x2
+# 2.) mask_resize_viewport_scale = vec2(scale_x6, scale_y5)
+# Finally, shader passes need to know the value of geom_max_aspect_ratio used to
+# calculate scale_y5 (among other values):
+# 1.) geom_max_aspect_ratio = (geom_max_aspect_ratio used to calculate scale_y5)
+
+shaders = "12"
+
+# Set an identifier, filename, and sampling traits for the phosphor mask texture.
+# Load an aperture grille, slot mask, and an EDP shadow mask, and load a small
+# non-mipmapped version and a large mipmapped version.
+# TODO: Test masks in other directories.
+textures = "mask_grille_texture_small;mask_grille_texture_large;mask_slot_texture_small;mask_slot_texture_large;mask_shadow_texture_small;mask_shadow_texture_large"
+mask_grille_texture_small = "shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5SpacingResizeTo64.png"
+mask_grille_texture_large = "shaders/crt-royale/TileableLinearApertureGrille15Wide8And5d5Spacing.png"
+mask_slot_texture_small = "shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacingResizeTo64.png"
+mask_slot_texture_large = "shaders/crt-royale/TileableLinearSlotMaskTall15Wide9And4d5Horizontal9d14VerticalSpacing.png"
+mask_shadow_texture_small = "shaders/crt-royale/TileableLinearShadowMaskEDPResizeTo64.png"
+mask_shadow_texture_large = "shaders/crt-royale/TileableLinearShadowMaskEDP.png"
+mask_grille_texture_small_wrap_mode = "repeat"
+mask_grille_texture_large_wrap_mode = "repeat"
+mask_slot_texture_small_wrap_mode = "repeat"
+mask_slot_texture_large_wrap_mode = "repeat"
+mask_shadow_texture_small_wrap_mode = "repeat"
+mask_shadow_texture_large_wrap_mode = "repeat"
+mask_grille_texture_small_linear = "true"
+mask_grille_texture_large_linear = "true"
+mask_slot_texture_small_linear = "true"
+mask_slot_texture_large_linear = "true"
+mask_shadow_texture_small_linear = "true"
+mask_shadow_texture_large_linear = "true"
+mask_grille_texture_small_mipmap = "false"  # Mipmapping causes artifacts with manually resized masks without tex2Dlod
+mask_grille_texture_large_mipmap = "true"   # Essential for hardware-resized masks
+mask_slot_texture_small_mipmap = "false"    # Mipmapping causes artifacts with manually resized masks without tex2Dlod
+mask_slot_texture_large_mipmap = "true"     # Essential for hardware-resized masks
+mask_shadow_texture_small_mipmap = "false"  # Mipmapping causes artifacts with manually resized masks without tex2Dlod
+mask_shadow_texture_large_mipmap = "true"   # Essential for hardware-resized masks
+
+
+# Pass0: Linearize the input based on CRT gamma and bob interlaced fields.
+# (Bobbing ensures we can immediately blur without getting artifacts.)
+shader0 = "shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang"
+alias0 = "ORIG_LINEARIZED"
+filter_linear0 = "false"
+scale_type0 = "source"
+scale0 = "1.0"
+srgb_framebuffer0 = "true"
+
+# Pass1: Resample interlaced (and misconverged) scanlines vertically.
+# Separating vertical/horizontal scanline sampling is faster: It lets us
+# consider more scanlines while calculating weights for fewer pixels, and
+# it reduces our samples from vertical*horizontal to vertical+horizontal.
+# This has to come right after ORIG_LINEARIZED, because there's no
+# "original_source" scale_type we can use later.
+shader1 = "shaders/crt-royale/src/crt-royale-scanlines-vertical-interlacing.slang"
+alias1 = "VERTICAL_SCANLINES"
+filter_linear1 = "true"
+scale_type_x1 = "source"
+scale_x1 = "1.0"
+scale_type_y1 = "viewport"
+scale_y1 = "1.0"
+srgb_framebuffer1 = "true"
+
+# Pass2: Do a small resize blur of ORIG_LINEARIZED at an absolute size, and
+# account for convergence offsets.  We want to blur a predictable portion of the
+# screen to match the phosphor bloom, and absolute scale works best for
+# reliable results with a fixed-size bloom.  Picking a scale is tricky:
+# a.) 400x300 is a good compromise for the "fake-bloom" version: It's low enough
+#     to blur high-res/interlaced sources but high enough that resampling
+#     doesn't smear low-res sources too much.
+# b.) 320x240 works well for the "real bloom" version: It's 1-1.5% faster, and
+#     the only noticeable visual difference is a larger halation spread (which
+#     may be a good thing for people who like to crank it up).
+# Note the 4:3 aspect ratio assumes the input has cropped geom_overscan (so it's
+# *intended* for an ~4:3 aspect ratio).
+shader2 = "shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang"
+alias2 = "BLOOM_APPROX"
+filter_linear2 = "true"
+scale_type2 = "absolute"
+scale_x2 = "320"
+scale_y2 = "240"
+srgb_framebuffer2 = "true"
+
+# Pass3: Vertically blur the input for halation and refractive diffusion.
+# Base this on BLOOM_APPROX: This blur should be small and fast, and blurring
+# a constant portion of the screen is probably physically correct if the
+# viewport resolution is proportional to the simulated CRT size.
+shader3 = "../blurs/blur9fast-vertical.slang"
+filter_linear3 = "true"
+scale_type3 = "source"
+scale3 = "1.0"
+srgb_framebuffer3 = "true"
+
+# Pass4: Horizontally blur the input for halation and refractive diffusion.
+# Note: Using a one-pass 9x9 blur is about 1% slower.
+shader4 = "../blurs/blur9fast-horizontal.slang"
+alias4 = "HALATION_BLUR"
+filter_linear4 = "true"
+scale_type4 = "source"
+scale4 = "1.0"
+srgb_framebuffer4 = "true"
+
+# Pass5: Lanczos-resize the phosphor mask vertically.  Set the absolute
+# scale_x5 == mask_texture_small_size.x (see IMPORTANT above).  Larger scales
+# will blur, and smaller scales could get nasty.  The vertical size must be
+# based on the viewport size and calculated carefully to avoid artifacts later.
+# First calculate the minimum number of mask tiles we need to draw.
+# Since curvature is computed after the scanline masking pass:
+#   num_resized_mask_tiles = 2.0;
+# If curvature were computed in the scanline masking pass (it's not):
+#   max_mask_texel_border = ~3.0 * (1/3.0 + 4.0*sqrt(2.0) + 0.5 + 1.0);
+#   max_mask_tile_border = max_mask_texel_border/
+#       (min_resized_phosphor_triad_size * mask_triads_per_tile);
+#   num_resized_mask_tiles = max(2.0, 1.0 + max_mask_tile_border * 2.0);
+#   At typical values (triad_size >= 2.0, mask_triads_per_tile == 8):
+#       num_resized_mask_tiles = ~3.8
+# Triad sizes are given in horizontal terms, so we need geom_max_aspect_ratio
+# to relate them to vertical resolution.  The widest we expect is:
+#   geom_max_aspect_ratio = 4.0/3.0  # Note: Shader passes need to know this!
+# The fewer triads we tile across the screen, the larger each triad will be as a
+# fraction of the viewport size, and the larger scale_y5 must be to draw a full
+# num_resized_mask_tiles.  Therefore, we must decide the smallest number of
+# triads we'll guarantee can be displayed on screen.  We'll set this according
+# to 3-pixel triads at 768p resolution (the lowest anyone's likely to use):
+#   min_allowed_viewport_triads = 768.0*geom_max_aspect_ratio / 3.0 = 341.333333
+# Now calculate the viewport scale that ensures we can draw num_resized_mask_tiles:
+#   min_scale_x = num_resized_mask_tiles * mask_triads_per_tile /
+#       min_allowed_viewport_triads
+#   scale_y5 = geom_max_aspect_ratio * min_scale_x
+#   # Some code might depend on equal scales:
+#   scale_x6 = scale_y5
+# Given our default geom_max_aspect_ratio and min_allowed_viewport_triads:
+#   scale_y5 = 4.0/3.0 * 2.0/(341.33333 / 8.0) = 0.0625
+# IMPORTANT: The scales MUST be calculated in this way.  If you wish to change
+# geom_max_aspect_ratio, update that constant in user-preset-constants.h!
+shader5 = "shaders/crt-royale/src/crt-royale-mask-resize-vertical.slang"
+filter_linear5 = "true"
+scale_type_x5 = "absolute"
+scale_x5 = "64"
+scale_type_y5 = "viewport"
+scale_y5 = "0.0625" # Safe for >= 341.333 horizontal triads at viewport size
+#srgb_framebuffer5 = "false" # mask_texture is already assumed linear
+
+# Pass6: Lanczos-resize the phosphor mask horizontally.  scale_x6 = scale_y5.
+# TODO: Check again if the shaders actually require equal scales.
+shader6 = "shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang"
+alias6 = "MASK_RESIZE"
+filter_linear6 = "false"
+scale_type_x6 = "viewport"
+scale_x6 = "0.0625"
+scale_type_y6 = "source"
+scale_y6 = "1.0"
+#srgb_framebuffer6 = "false" # mask_texture is already assumed linear
+
+# Pass7: Resample (misconverged) scanlines horizontally, apply halation, and
+# apply the phosphor mask.
+shader7 = "shaders/crt-royale/src/crt-royale-scanlines-horizontal-apply-mask.slang"
+alias7 = "MASKED_SCANLINES"
+filter_linear7 = "true" # This could just as easily be nearest neighbor.
+scale_type7 = "viewport"
+scale7 = "1.0"
+srgb_framebuffer7 = "true"
+
+# Pass 8: Compute a brightpass.  This will require reading the final mask.
+shader8 = "shaders/crt-royale/src/crt-royale-brightpass.slang"
+alias8 = "BRIGHTPASS"
+filter_linear8 = "true" # This could just as easily be nearest neighbor.
+scale_type8 = "viewport"
+scale8 = "1.0"
+srgb_framebuffer8 = "true"
+
+# Pass 9: Blur the brightpass vertically
+shader9 = "shaders/crt-royale/src/crt-royale-bloom-vertical.slang"
+filter_linear9 = "true" # This could just as easily be nearest neighbor.
+scale_type9 = "source"
+scale9 = "1.0"
+srgb_framebuffer9 = "true"
+
+# Pass 10: Blur the brightpass horizontally and combine it with the dimpass:
+shader10 = "shaders/crt-royale/src/crt-royale-bloom-horizontal-reconstitute.slang"
+filter_linear10 = "true"
+scale_type10 = "source"
+scale10 = "1.0"
+srgb_framebuffer10 = "true"
+
+# Pass 11: Compute curvature/AA:
+shader11 = "shaders/crt-royale/src/crt-royale-geometry-aa-last-pass.slang"
+filter_linear11 = "true"
+scale_type11 = "viewport"
+mipmap_input11 = "true"
+texture_wrap_mode11 = "clamp_to_edge"
+
+parameters = "beam_num_scanlines"
+beam_num_scanlines = 3.0
\ No newline at end of file
diff --git a/crt/shaders/crt-royale/src/bind-shader-params.h b/crt/shaders/crt-royale/src/bind-shader-params.h
index 76145b0..5a1792e 100644
--- a/crt/shaders/crt-royale/src/bind-shader-params.h
+++ b/crt/shaders/crt-royale/src/bind-shader-params.h
@@ -113,7 +113,7 @@ const float gba_gamma = 3.5; //  Irrelevant but necessary to define.
     const float beam_min_shape = max(2.0, beam_min_shape_static);
     const float beam_max_shape = max(beam_min_shape, beam_max_shape_static);
     const float beam_shape_power = max(0.0, beam_shape_power_static);
-    const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
+//    const float beam_horiz_filter = clamp(beam_horiz_filter_static, 0.0, 2.0);
     const float beam_horiz_sigma = max(FIX_ZERO(0.0), beam_horiz_sigma_static);
     const float beam_horiz_linear_rgb_weight = clamp(beam_horiz_linear_rgb_weight_static, 0.0, 1.0);
     //  Unpack vector elements to match scalar uniforms:
diff --git a/crt/shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang b/crt/shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang
new file mode 100644
index 0000000..0fd6d24
--- /dev/null
+++ b/crt/shaders/crt-royale/src/crt-royale-bloom-approx_fallback.slang
@@ -0,0 +1,354 @@
+#version 450
+
+layout(push_constant) uniform Push
+{
+	vec4 SourceSize;
+	vec4 OriginalSize;
+	vec4 OutputSize;
+	uint FrameCount;
+	vec4 ORIG_LINEARIZEDSize;
+} registers;
+
+#include "params.inc"
+
+/////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
+
+//  crt-royale: A full-featured CRT shader, with cheese.
+//  Copyright (C) 2014 TroggleMonkey <trogglemonkey@gmx.com>
+//
+//  This program is free software; you can redistribute it and/or modify it
+//  under the terms of the GNU General Public License as published by the Free
+//  Software Foundation; either version 2 of the License, or any later version.
+//
+//  This program is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+//  more details.
+//
+//  You should have received a copy of the GNU General Public License along with
+//  this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+//  Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+//////////////////////////////////  INCLUDES  //////////////////////////////////
+
+#include "../user-settings.h"
+#include "derived-settings-and-constants.h"
+#include "bind-shader-params.h"
+#include "../../../../include/gamma-management.h"
+#include "../../../../include/blur-functions.h"
+#include "scanline-functions.h"
+#include "bloom-functions.h"
+
+///////////////////////////////////  HELPERS  //////////////////////////////////
+
+vec3 tex2Dresize_gaussian4x4(const sampler2D tex, const vec2 tex_uv,
+    const vec2 dxdy, const vec2 texture_size, const vec2 texture_size_inv,
+    const vec2 tex_uv_to_pixel_scale, const float sigma)
+{
+    //  Requires:   1.) All requirements of gamma-management.h must be satisfied!
+    //              2.) filter_linearN must == "true" in your .slangp preset.
+    //              3.) mipmap_inputN must == "true" in your .slangp preset if
+    //                  IN.output_size << SRC.video_size.
+    //              4.) dxdy should contain the uv pixel spacing:
+    //                      dxdy = max(vec2(1.0),
+    //                          SRC.video_size/IN.output_size)/SRC.texture_size;
+    //              5.) texture_size == SRC.texture_size
+    //              6.) texture_size_inv == vec2(1.0)/SRC.texture_size
+    //              7.) tex_uv_to_pixel_scale == IN.output_size *
+    //                      SRC.texture_size / SRC.video_size;
+    //              8.) sigma is the desired Gaussian standard deviation, in
+    //                  terms of output pixels.  It should be < ~0.66171875 to
+    //                  ensure the first unused sample (outside the 4x4 box) has
+    //                  a weight < 1.0/256.0.
+    //  Returns:    A true 4x4 Gaussian resize of the input.
+    //  Description:
+    //  Given correct inputs, this Gaussian resizer samples 4 pixel locations
+    //  along each downsized dimension and/or 4 texel locations along each
+    //  upsized dimension.  It computes dynamic weights based on the pixel-space
+    //  distance of each sample from the destination pixel.  It is arbitrarily
+    //  resizable and higher quality than tex2Dblur3x3resize, but it's slower.
+    //  TODO: Move this to a more suitable file once there are others like it.
+    const float denom_inv = 0.5/(sigma*sigma);
+    //  We're taking 4x4 samples, and we're snapping to texels for upsizing.
+    //  Find texture coords for sample 5 (second row, second column):
+    const vec2 curr_texel = tex_uv * texture_size;
+    const vec2 prev_texel =
+        floor(curr_texel - vec2(under_half)) + vec2(0.5);
+    const vec2 prev_texel_uv = prev_texel * texture_size_inv;
+    const bvec2 snap = lessThanEqual(dxdy , texture_size_inv);
+    const vec2 sample5_downsize_uv = tex_uv - 0.5 * dxdy;
+    const vec2 sample5_uv = mix(sample5_downsize_uv, prev_texel_uv, snap);
+    //  Compute texture coords for other samples:
+    const vec2 dx = vec2(dxdy.x, 0.0);
+    const vec2 sample0_uv = sample5_uv - dxdy;
+    const vec2 sample10_uv = sample5_uv + dxdy;
+    const vec2 sample15_uv = sample5_uv + 2.0 * dxdy;
+    const vec2 sample1_uv = sample0_uv + dx;
+    const vec2 sample2_uv = sample0_uv + 2.0 * dx;
+    const vec2 sample3_uv = sample0_uv + 3.0 * dx;
+    const vec2 sample4_uv = sample5_uv - dx;
+    const vec2 sample6_uv = sample5_uv + dx;
+    const vec2 sample7_uv = sample5_uv + 2.0 * dx;
+    const vec2 sample8_uv = sample10_uv - 2.0 * dx;
+    const vec2 sample9_uv = sample10_uv - dx;
+    const vec2 sample11_uv = sample10_uv + dx;
+    const vec2 sample12_uv = sample15_uv - 3.0 * dx;
+    const vec2 sample13_uv = sample15_uv - 2.0 * dx;
+    const vec2 sample14_uv = sample15_uv - dx;
+    //  Load each sample:
+    const vec3 sample0 = tex2D_linearize(tex, sample0_uv).rgb;
+    const vec3 sample1 = tex2D_linearize(tex, sample1_uv).rgb;
+    const vec3 sample2 = tex2D_linearize(tex, sample2_uv).rgb;
+    const vec3 sample3 = tex2D_linearize(tex, sample3_uv).rgb;
+    const vec3 sample4 = tex2D_linearize(tex, sample4_uv).rgb;
+    const vec3 sample5 = tex2D_linearize(tex, sample5_uv).rgb;
+    const vec3 sample6 = tex2D_linearize(tex, sample6_uv).rgb;
+    const vec3 sample7 = tex2D_linearize(tex, sample7_uv).rgb;
+    const vec3 sample8 = tex2D_linearize(tex, sample8_uv).rgb;
+    const vec3 sample9 = tex2D_linearize(tex, sample9_uv).rgb;
+    const vec3 sample10 = tex2D_linearize(tex, sample10_uv).rgb;
+    const vec3 sample11 = tex2D_linearize(tex, sample11_uv).rgb;
+    const vec3 sample12 = tex2D_linearize(tex, sample12_uv).rgb;
+    const vec3 sample13 = tex2D_linearize(tex, sample13_uv).rgb;
+    const vec3 sample14 = tex2D_linearize(tex, sample14_uv).rgb;
+    const vec3 sample15 = tex2D_linearize(tex, sample15_uv).rgb;
+    //  Compute destination pixel offsets for each sample:
+    const vec2 dest_pixel = tex_uv * tex_uv_to_pixel_scale;
+    const vec2 sample0_offset = sample0_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample1_offset = sample1_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample2_offset = sample2_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample3_offset = sample3_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample4_offset = sample4_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample5_offset = sample5_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample6_offset = sample6_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample7_offset = sample7_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample8_offset = sample8_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample9_offset = sample9_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample10_offset = sample10_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample11_offset = sample11_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample12_offset = sample12_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample13_offset = sample13_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample14_offset = sample14_uv * tex_uv_to_pixel_scale - dest_pixel;
+    const vec2 sample15_offset = sample15_uv * tex_uv_to_pixel_scale - dest_pixel;
+    //  Compute Gaussian sample weights:
+    const float w0 = exp(-LENGTH_SQ(sample0_offset) * denom_inv);
+    const float w1 = exp(-LENGTH_SQ(sample1_offset) * denom_inv);
+    const float w2 = exp(-LENGTH_SQ(sample2_offset) * denom_inv);
+    const float w3 = exp(-LENGTH_SQ(sample3_offset) * denom_inv);
+    const float w4 = exp(-LENGTH_SQ(sample4_offset) * denom_inv);
+    const float w5 = exp(-LENGTH_SQ(sample5_offset) * denom_inv);
+    const float w6 = exp(-LENGTH_SQ(sample6_offset) * denom_inv);
+    const float w7 = exp(-LENGTH_SQ(sample7_offset) * denom_inv);
+    const float w8 = exp(-LENGTH_SQ(sample8_offset) * denom_inv);
+    const float w9 = exp(-LENGTH_SQ(sample9_offset) * denom_inv);
+    const float w10 = exp(-LENGTH_SQ(sample10_offset) * denom_inv);
+    const float w11 = exp(-LENGTH_SQ(sample11_offset) * denom_inv);
+    const float w12 = exp(-LENGTH_SQ(sample12_offset) * denom_inv);
+    const float w13 = exp(-LENGTH_SQ(sample13_offset) * denom_inv);
+    const float w14 = exp(-LENGTH_SQ(sample14_offset) * denom_inv);
+    const float w15 = exp(-LENGTH_SQ(sample15_offset) * denom_inv);
+    const float weight_sum_inv = 1.0/(
+        w0 + w1 + w2 + w3 + w4 + w5 + w6 + w7 +
+        w8 +w9 + w10 + w11 + w12 + w13 + w14 + w15);
+    //  Weight and sum the samples:
+    const vec3 sum = w0 * sample0 + w1 * sample1 + w2 * sample2 + w3 * sample3 +
+        w4 * sample4 + w5 * sample5 + w6 * sample6 + w7 * sample7 +
+        w8 * sample8 + w9 * sample9 + w10 * sample10 + w11 * sample11 +
+        w12 * sample12 + w13 * sample13 + w14 * sample14 + w15 * sample15;
+    return sum * weight_sum_inv;
+}
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 tex_uv;
+layout(location = 1) out float estimated_viewport_size_x;
+layout(location = 2) out vec2 blur_dxdy;
+layout(location = 3) out vec2 uv_scanline_step;
+layout(location = 4) out vec2 texture_size_inv;
+layout(location = 5) out vec2 tex_uv_to_pixel_scale;
+
+void main()
+{
+    //  This vertex shader copies blurs/vertex-shader-blur-one-pass-resize.h,
+    //  except we're using a different source image.
+   gl_Position = params.MVP * Position;
+   const vec2 video_uv = TexCoord;
+   tex_uv = video_uv;
+    //  The previous pass (vertical scanlines) had a viewport y scale, so its
+    //  height (SourceSize.y) lets us estimate the viewport width for sigma:
+	estimated_viewport_size_x = registers.SourceSize.y * params.geom_aspect_ratio_x / params.geom_aspect_ratio_y;
+   
+    //  Get the uv sample distance between output pixels.  We're using a resize
+    //  blur, so arbitrary upsizing will be acceptable if filter_linearN =
+    //  "true," and arbitrary downsizing will be acceptable if mipmap_inputN =
+    //  "true" too.  The blur will be much more accurate if a true 4x4 Gaussian
+    //  resize is used instead of tex2Dblur3x3resize (which samples between
+    //  texels even for upsizing).
+	const vec2 dxdy_min_scale = registers.ORIG_LINEARIZEDSize.xy * registers.OutputSize.zw;
+    texture_size_inv = registers.ORIG_LINEARIZEDSize.zw;
+    if(bloom_approx_filter > 1.5)   //  4x4 true Gaussian resize
+    {
+        //  For upsizing, we'll snap to texels and sample the nearest 4.
+        const vec2 dxdy_scale = max(dxdy_min_scale, vec2(1.0));
+        blur_dxdy = dxdy_scale * texture_size_inv;
+    }
+    else
+    {
+        const vec2 dxdy_scale = dxdy_min_scale;
+        blur_dxdy = dxdy_scale * texture_size_inv;
+	}
+	
+	tex_uv_to_pixel_scale = registers.OutputSize.xy;
+//  texture_size_inv = texture_size_inv; <- commented out because it's pointless in slang
+
+    //  Detecting interlacing again here lets us apply convergence offsets in
+    //  this pass.  il_step_multiple contains the (texel, scanline) step
+    //  multiple: 1 for progressive, 2 for interlaced.
+    const vec2 orig_video_size = registers.ORIG_LINEARIZEDSize.xy;
+	float interlace_check = 0.0;
+	if (is_interlaced(orig_video_size.y) == true) interlace_check = 1.0;
+    const float y_step = 1.0 + interlace_check;
+    const vec2 il_step_multiple = vec2(1.0, y_step);
+    //  Get the uv distance between (texels, same-field scanlines):
+    uv_scanline_step = il_step_multiple * registers.ORIG_LINEARIZEDSize.zw;
+}
+
+#pragma stage fragment
+#pragma format R8G8B8A8_SRGB
+layout(location = 0) in vec2 tex_uv;
+layout(location = 1) in float estimated_viewport_size_x;
+layout(location = 2) in vec2 blur_dxdy;
+layout(location = 3) in vec2 uv_scanline_step;
+layout(location = 4) in vec2 texture_size_inv;
+layout(location = 5) in vec2 tex_uv_to_pixel_scale;
+layout(location = 0) out vec4 FragColor;
+layout(set = 0, binding = 2) uniform sampler2D Source;
+layout(set = 0, binding = 3) uniform sampler2D ORIG_LINEARIZED;
+
+void main()
+{
+    //  Would a viewport-relative size work better for this pass?  (No.)
+    //  PROS:
+    //  1.) Instead of writing an absolute size to user-cgp-constants.h, we'd
+    //      write a viewport scale.  That number could be used to directly scale
+    //      the viewport-resolution bloom sigma and/or triad size to a smaller
+    //      scale.  This way, we could calculate an optimal dynamic sigma no
+    //      matter how the dot pitch is specified.
+    //  CONS:
+    //  1.) Texel smearing would be much worse at small viewport sizes, but
+    //      performance would be much worse at large viewport sizes, so there
+    //      would be no easy way to calculate a decent scale.
+    //  2.) Worse, we could no longer get away with using a constant-size blur!
+    //      Instead, we'd have to face all the same difficulties as the real
+    //      phosphor bloom, which requires static #ifdefs to decide the blur
+    //      size based on the expected triad size...a dynamic value.
+    //  3.) Like the phosphor bloom, we'd have less control over making the blur
+    //      size correct for an optical blur.  That said, we likely overblur (to
+    //      maintain brightness) more than the eye would do by itself: 20/20
+    //      human vision distinguishes ~1 arc minute, or 1/60 of a degree.  The
+    //      highest viewing angle recommendation I know of is THX's 40.04 degree
+    //      recommendation, at which 20/20 vision can distinguish about 2402.4
+    //      lines.  Assuming the "TV lines" definition, that means 1201.2
+    //      distinct light lines and 1201.2 distinct dark lines can be told
+    //      apart, i.e. 1201.2 pairs of lines.  This would correspond to 1201.2
+    //      pairs of alternating lit/unlit phosphors, so 2402.4 phosphors total
+    //      (if they're alternately lit).  That's a max of 800.8 triads.  Using
+    //      a more popular 30 degree viewing angle recommendation, 20/20 vision
+    //      can distinguish 1800 lines, or 600 triads of alternately lit
+    //      phosphors.  In contrast, we currently blur phosphors all the way
+    //      down to 341.3 triads to ensure full brightness.
+    //  4.) Realistically speaking, we're usually just going to use bilinear
+    //      filtering in this pass anyway, but it only works well to limit
+    //      bandwidth if it's done at a small constant scale.
+    
+    //  Get the constants we need to sample:
+	const vec2 texture_size = registers.ORIG_LINEARIZEDSize.xy;
+	vec2 tex_uv_r, tex_uv_g, tex_uv_b;
+	
+	if(beam_misconvergence == true)
+    {
+        const vec2 convergence_offsets_r = vec2(params.convergence_offset_x_r, params.convergence_offset_y_r);//get_convergence_offsets_r_vector();
+        const vec2 convergence_offsets_g = vec2(params.convergence_offset_x_g, params.convergence_offset_y_g);//get_convergence_offsets_g_vector();
+        const vec2 convergence_offsets_b = vec2(params.convergence_offset_x_b, params.convergence_offset_y_b);//get_convergence_offsets_b_vector();
+        tex_uv_r = tex_uv - convergence_offsets_r * uv_scanline_step;
+        tex_uv_g = tex_uv - convergence_offsets_g * uv_scanline_step;
+        tex_uv_b = tex_uv - convergence_offsets_b * uv_scanline_step;
+    }
+	//  Get the blur sigma:
+    const float bloom_approx_sigma = get_bloom_approx_sigma(registers.OutputSize.x, estimated_viewport_size_x);
+	
+	//  Sample the resized and blurred texture, and apply convergence offsets if
+    //  necessary.  Applying convergence offsets here triples our samples from
+    //  16/9/1 to 48/27/3, but faster and easier than sampling BLOOM_APPROX and
+    //  HALATION_BLUR 3 times at full resolution every time they're used.
+    vec3 color_r, color_g, color_b, color;
+	if(bloom_approx_filter > 1.5)
+    {
+        //  Use a 4x4 Gaussian resize.  This is slower but technically correct.
+        if(beam_misconvergence == true)
+        {
+            color_r = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_r,
+                blur_dxdy, texture_size, texture_size_inv,
+                tex_uv_to_pixel_scale, bloom_approx_sigma);
+            color_g = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_g,
+                blur_dxdy, texture_size, texture_size_inv,
+                tex_uv_to_pixel_scale, bloom_approx_sigma);
+            color_b = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv_b,
+                blur_dxdy, texture_size, texture_size_inv,
+                tex_uv_to_pixel_scale, bloom_approx_sigma);
+        }
+        else
+        {
+            color = tex2Dresize_gaussian4x4(ORIG_LINEARIZED, tex_uv,
+                blur_dxdy, texture_size, texture_size_inv,
+                tex_uv_to_pixel_scale, bloom_approx_sigma);
+        }
+    }
+    else if(bloom_approx_filter > 0.5)
+    {
+        //  Use a 3x3 resize blur.  This is the softest option, because we're
+        //  blurring already blurry bilinear samples.  It doesn't play quite as
+        //  nicely with convergence offsets, but it has its charms.
+        if(beam_misconvergence == true)
+        {
+            color_r = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_r,
+                blur_dxdy, bloom_approx_sigma);
+            color_g = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_g,
+                blur_dxdy, bloom_approx_sigma);
+            color_b = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv_b,
+                blur_dxdy, bloom_approx_sigma);
+        }
+        else
+        {
+            color = tex2Dblur3x3resize(ORIG_LINEARIZED, tex_uv, blur_dxdy, bloom_approx_sigma);
+        }
+    }
+    else
+    {
+        //  Use bilinear sampling.  This approximates a 4x4 Gaussian resize MUCH
+        //  better than tex2Dblur3x3resize for the very small sigmas we're
+        //  likely to use at small output resolutions.  (This estimate becomes
+        //  too sharp above ~400x300, but the blurs break down above that
+        //  resolution too, unless min_allowed_viewport_triads is high enough to
+        //  keep bloom_approx_scale_x/min_allowed_viewport_triads < ~1.1658025.)
+        if(beam_misconvergence == true)
+        {
+            color_r = tex2D_linearize(ORIG_LINEARIZED, tex_uv_r).rgb;
+            color_g = tex2D_linearize(ORIG_LINEARIZED, tex_uv_g).rgb;
+            color_b = tex2D_linearize(ORIG_LINEARIZED, tex_uv_b).rgb;
+        }
+        else
+        {
+            color = tex2D_linearize(ORIG_LINEARIZED, tex_uv).rgb;
+        }
+    }
+	//  Pack the colors from the red/green/blue beams into a single vector:
+    if(beam_misconvergence == true)
+    {
+        color = vec3(color_r.r, color_g.g, color_b.b);
+    }
+    //  Fallback: skip the blurred `color` and output the raw ORIG_LINEARIZED sample:
+   FragColor = vec4(texture(ORIG_LINEARIZED, tex_uv));//vec4(color, 1.0);//
+}
diff --git a/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang b/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang
index fd67019..02ec577 100755
--- a/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-first-pass-linearize-crt-gamma-bob-fields.slang
@@ -10,9 +10,11 @@ layout(std140, set = 0, binding = 0) uniform UBO
 {
 	mat4 MVP;
     float interlace_bff;
+	float beam_horiz_filter;
 } params;
 
 #pragma parameter interlace_bff "interlace_bff" 1.0 0.0 1.0 1.0
+#pragma parameter beam_horiz_filter "beam_horiz_filter" 0.0 0.0 2.0 1.0
 
 /////////////////////////////  GPL LICENSE NOTICE  /////////////////////////////
 
diff --git a/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang b/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang
index d5b6fcb..0fcdc2f 100755
--- a/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang
+++ b/crt/shaders/crt-royale/src/crt-royale-mask-resize-horizontal.slang
@@ -108,7 +108,7 @@ void main()
     //  easier tiled sampling later.
     #ifdef PHOSPHOR_MASK_MANUALLY_RESIZE
         //  Discard unneeded fragments in case our profile allows real branches.
-        const vec2 tile_uv_wrap = tile_uv_wrap;
+//        const vec2 tile_uv_wrap = tile_uv_wrap;
         if(params.mask_sample_mode_desired < 0.5 &&
             max(tile_uv_wrap.x, tile_uv_wrap.y) <= mask_resize_num_tiles)
         {
diff --git a/crt/shaders/crt-royale/src/scanline-functions.h b/crt/shaders/crt-royale/src/scanline-functions.h
index ede23d8..5169b3d 100644
--- a/crt/shaders/crt-royale/src/scanline-functions.h
+++ b/crt/shaders/crt-royale/src/scanline-functions.h
@@ -156,7 +156,7 @@ vec3 get_scanline_color(const sampler2D tex, const vec2 scanline_uv,
     const vec3 color2 = texture(tex, scanline_uv + uv_step_x).rgb;
     vec3 color0 = vec3(0.0);
     vec3 color3 = vec3(0.0);
-    if(beam_horiz_filter > 0.5)
+    if(params.beam_horiz_filter > 0.5)
     {
         color0 = texture(tex, scanline_uv - uv_step_x).rgb;
         color3 = texture(tex, scanline_uv + 2.0 * uv_step_x).rgb;
@@ -183,14 +183,14 @@ vec3 sample_single_scanline_horizontal(const sampler2D texture,
         1.0 - prev_dist, 2.0 - prev_dist);
     //  Get Quilez, Lanczos2, or Gaussian resize weights for 2/4 nearby texels:
     vec4 weights;
-    if(beam_horiz_filter < 0.5)
+    if(params.beam_horiz_filter < 0.5)
     {
         //  Quilez:
         const float x = sample_dists.y;
         const float w2 = x*x*x*(x*(x*6.0 - 15.0) + 10.0);
         weights = vec4(0.0, 1.0 - w2, w2, 0.0);
     }
-    else if(beam_horiz_filter < 1.5)
+    else if(params.beam_horiz_filter < 1.5)
     {
         //  Gaussian:
         float inner_denom_inv = 1.0/(2.0*beam_horiz_sigma*beam_horiz_sigma);
diff --git a/presets/crt-royale-kurozumi.slangp b/presets/crt-royale-kurozumi.slangp
index 8364cfc..a412b85 100644
--- a/presets/crt-royale-kurozumi.slangp
+++ b/presets/crt-royale-kurozumi.slangp
@@ -219,12 +219,12 @@ beam_max_shape = "4.000000"
 beam_shape_power = "0.250000"
 beam_horiz_filter = "0.000000"
 beam_horiz_sigma = "0.545000"
-convergence_offset_x_r = "0.000000"
+convergence_offset_x_r = "-0.050000"
 convergence_offset_x_g = "0.000000"
 convergence_offset_x_b = "0.000000"
-convergence_offset_y_r = "0.000000"
-convergence_offset_y_g = "0.000000"
-convergence_offset_y_b = "0.000000"
+convergence_offset_y_r = "0.100000"
+convergence_offset_y_g = "-0.050000"
+convergence_offset_y_b = "0.100000"
 mask_type = "0.000000"
 mask_sample_mode_desired = "1.000000"
 mask_specify_num_triads = "0.000000"