add more blurs, ss-gamma-ramp, updated nds color (#176)

* add updated NDS color, Super Sleuth gamma ramp and more trogglemonkey blurs
2025-02-19 08:17:43 +11:00 · 2021-05-27 21:41:42 -05:00 · 2021-05-27 21:41:42 -05:00 · 70256e15f6
commit 70256e15f6
parent 4b9d005bc2
11 changed files with 894 additions and 61 deletions
--- a/blurs/README.md
+++ b/blurs/README.md
@ -0,0 +1,129 @@
 This info pertains to the Blurs by TroggleMonkey:
 DESCRIPTION:
 Gaussian blurs are common building blocks in multi-pass shaders, and this
 library of optimized and tested blurs should make it easier to use whatever size
 blur you need.  All of these shaders are based on the tex2Dblur* functions in
 include/blur-functions.h, so you can use those directly if you ever need to
 add more processing to the same pass as a Gaussian blur.
 PICK THE RIGHT BLUR FOR YOUR USE CASE:
 There are several different types of blurs, ranging in size from 3-12 texels:
 a.) "Resize" separable blurs use vertical and horizontal passes and require N
    taps for an Nx blur.  These are arbitrarily resizable.
 b.) "Fast" separable blurs use vertical and horizontal passes and require N taps
    for an (N*2 - 1)x blur.  They exploit bilinear filtering to reduce the
    required taps from e.g. 9 to 5.  These are always faster, but they have
    strict image scale requirements.
 c.) "Resize" one-pass blurs combine the vertical/horizontal passes of the
    "resize" separable blurs, and they require NxN taps for an NxN blur.  These
    perform slowly enough that only tex2Dblur3x3resize is useful/included.
 d.) Other one-pass blurs combine the vertical/horizontal passes of the "fast"
    separable blurs, and they exploit bilinear filtering the same way.  They're
    faster than separable blurs at 3x3, competitive at 5x5 depending on options,
    and slower at 7x7 and above...but larger blurs may still be useful if you're
    hurting for passes.
 e.) "Shared" one-pass blurs go a step further: They also use quad-pixel
    communication with fine-grained derivatives to distribute texture samples
    across a 2x2 pixel quad.  (ddx() and ddy() are required, as well as a GPU
    that uses fine-grained derivatives).  These blurs are faster than the other
    one-pass blurs, but they have some artifacts from combining sample-sharing
    with bilinear sampling, so they're best reserved for reblurring an already-
    blurred input.
 Every blur expects linear filtering.  Except for resize separable blurs, all
 require a pass scale of (1/(2^M)) for some M >= 0.  That is, the output image
 has to have a 1:1 pixel:texel ratio with some mipmap of the input image, so use
 e.g. scaleN = "1.0" or scaleN = "0.25", not scaleN = "0.33" or scaleN = "2.0".
 Note: mipmap_inputN must = "true" in your .slangp file for scales other than 1.0.
 There are two suffixes on the .slang files relating to gamma correction:
 * Blurs with no suffix assume linear RGB input and output.
 * Blurs with a "-last-pass" suffix use pow() to gamma-correct their output.
 * Blurs with a "-gamma-encode-every-fbo" suffix use pow() to linearize each
  input sample and again to gamma-correct the output.  These blurs are MUCH
  slower than blurs without this suffix, but they're provided in case you want
  to be [almost] gamma-correct on platforms without sRGB FBO's.  (The "almost"
  is because bilinear filtering still won't be gamma-correct without sRGB.)
 * There are also blurs with both suffixes.  This may seem redundant, but they
  make it easier to use a different output gamma for the last pass than for
  the rest of the pipeline (such as when simulating another display device like
  a Game Boy Advance or CRT).  See srgb-helpers/README.txt for more information.
 BENCHMARK RESULTS:
 Blurs have different performance characteristics depending on whether the input
 is mipmapped and depending on whether they're gamma-encoding every FBO.  Here's
 an excerpt from the blur-functions.h description with a comparison.  Note that
 benchmarks without an sRGB heading use "-gamma-encode-every-fbo" suffixes, and
 you can just look at the sRGB performance figures if you don't care about gamma:
 //  Here are some framerates from a GeForce 8800GTS.  The first pass resizes to
 //  viewport size (4x in this test) and linearizes for sRGB codepaths, and the
 //  remaining passes perform 6 full blurs.  Mipmapped tests are performed at the
 //  same scale, so they just measure the cost of mipmapping each FBO (only every
 //  other FBO is mipmapped for separable blurs, to mimic realistic usage).
 //  Mipmap      Neither     sRGB+Mipmap sRGB        Function
 //  76.0        92.3        131.3       193.7       tex2Dblur3fast
 //  63.2        74.4        122.4       175.5       tex2Dblur3resize
 //  93.7        121.2       159.3       263.2       tex2Dblur3x3
 //  59.7        68.7        115.4       162.1       tex2Dblur3x3resize
 //  63.2        74.4        122.4       175.5       tex2Dblur5fast
 //  49.3        54.8        100.0       132.7       tex2Dblur5resize
 //  59.7        68.7        115.4       162.1       tex2Dblur5x5
 //  64.9        77.2        99.1        137.2       tex2Dblur6x6shared
 //  55.8        63.7        110.4       151.8       tex2Dblur7fast
 //  39.8        43.9        83.9        105.8       tex2Dblur7resize
 //  40.0        44.2        83.2        104.9       tex2Dblur7x7
 //  56.4        65.5        71.9        87.9        tex2Dblur8x8shared
 //  49.3        55.1        99.9        132.5       tex2Dblur9fast
 //  33.3        36.2        72.4        88.0        tex2Dblur9resize
 //  27.8        29.7        61.3        72.2        tex2Dblur9x9
 //  37.2        41.1        52.6        60.2        tex2Dblur10x10shared
 //  44.4        49.5        91.3        117.8       tex2Dblur11fast
 //  28.8        30.8        63.6        75.4        tex2Dblur11resize
 //  33.6        36.5        40.9        45.5        tex2Dblur12x12shared
 BASIC USAGE:
 The .slangp presets in the quality-test-presets folder provide usage examples
 for basically every .slang blur shader.  The "-srgb" suffix on some .slangp
 presets is an explicit notice that they use sRGB FBO's.  Note how and when the
 "-last-pass" suffix is used for each .slang file, etc.
 The provided .slangp files with the "-mipmap" suffix exist for quality testing
 and benchmarking with mipmapping enabled, but none of them actually use
 mipmapping as a feature in and of itself.  The following contrived .slangp would do that:
    shaders = "5"
    # Pass0: Linearize RGB:
    shader0 = ../../srgb-helpers/first-pass-linearize.slang
    filter_linear0 = "true"
    scale_type0 = "source"
    scale0 = "1.0"
    srgb_framebuffer0 = "true"
    # Pass1: Upsize to 4x.  Pretend this pass does significant processing at 4x.
    shader1 = ../../stock.slang
    filter_linear1 = "true"
    scale_type1 = "source"
    scale1 = "4.0"
    srgb_framebuffer1 = "true"
    # Pass2: Blur a source-sized mipmap 9x vertically; just shrink horizontally.
    shader2 = ../blur9fast-vertical.slang
    filter_linear2 = "true"
    scale_type2 = "source"
    scale2 = "0.25"
    srgb_framebuffer2 = "true"
    mipmap_input2 = "true"
    # Pass3: Blur 9x horizontally
    shader3 = ../blur9fast-horizontal.slang
    filter_linear3 = "true"
    scale_type3 = "source"
    scale3 = "1.0"
    srgb_framebuffer3 = "true"
    # Pass4: Scale to the screen size and gamma-correct the output:
    shader4 = ../../srgb-helpers/last-pass-gamma-correct.slang
    filter_linear4 = "true"
    scale_type4 = "viewport"
    scale4 = "1.0"
--- a/blurs/blur10x10shared-gamma-encode-every-fbo.slang
+++ b/blurs/blur10x10shared-gamma-encode-every-fbo.slang
@ -0,0 +1,87 @@
 #version 450
 /////////////////////////////////  MIT LICENSE  ////////////////////////////////
 //  Copyright (C) 2014 TroggleMonkey
 //
 //  Permission is hereby granted, free of charge, to any person obtaining a copy
 //  of this software and associated documentation files (the "Software"), to
 //  deal in the Software without restriction, including without limitation the
 //  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 //  sell copies of the Software, and to permit persons to whom the Software is
 //  furnished to do so, subject to the following conditions:
 //  
 //  The above copyright notice and this permission notice shall be included in
 //  all copies or substantial portions of the Software.
 //
 //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 //  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 //  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 //  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 //  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 //  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 //  IN THE SOFTWARE.
 layout(push_constant) uniform Push
 {
 	vec4 SourceSize;
 	vec4 OriginalSize;
 	vec4 OutputSize;
 	uint FrameCount;
 } params;
 layout(std140, set = 0, binding = 0) uniform UBO
 {
 	mat4 MVP;
 } global;
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 //  PASS SETTINGS:
 //  gamma-management.h needs to know what kind of pipeline we're using and
 //  what pass this is in that pipeline.  This will become obsolete if/when we
 //  can #define things like this in the preset file.
 #define GAMMA_ENCODE_EVERY_FBO
 //#define FIRST_PASS
 //#define LAST_PASS
 //#define SIMULATE_CRT_ON_LCD
 //#define SIMULATE_GBA_ON_LCD
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 //  blur-functions.h needs to know our profile's capabilities:
 //  1.) DRIVERS_ALLOW_DERIVATIVES is mandatory for one-pass shared-sample blurs.
 //  2.) DRIVERS_ALLOW_TEX2DLOD is optional, but mipmapped blurs will have awful
 //      artifacts without it due to funky texture sampling derivatives.
 #define DRIVERS_ALLOW_DERIVATIVES
 #define DRIVERS_ALLOW_TEX2DLOD
 ///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 #include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-shared-sample.h"
 #pragma stage fragment
 layout(location = 0) in vec4 tex_uv;
 layout(location = 1) in vec4 output_pixel_num;
 layout(location = 2) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 #define input_texture Source
 /////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
 #include "../include/gamma-management.h"
 #include "../include/blur-functions.h"
 void main()
 {
    //  output_pixel_num carries the pixel number measured from two origins
    //  (uv and screen); floor it to whole pixels and derive this fragment's
    //  position within its pixel quad:
    float4 quad_vector = get_quad_vector(floor(output_pixel_num));
    //  Run the shared-sample 10x10 blur on the input texture:
    float3 blurred = tex2Dblur10x10shared(input_texture, tex_uv, blur_dxdy,
        quad_vector);
    //  Encode the blurred color (fourth component fixed at 1.0) and emit it:
    FragColor = encode_output(float4(blurred, 1.0));
 }
--- a/blurs/blur10x10shared-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur10x10shared-last-pass-gamma-encode-every-fbo.slang
@ -0,0 +1,87 @@
 #version 450
 /////////////////////////////////  MIT LICENSE  ////////////////////////////////
 //  Copyright (C) 2014 TroggleMonkey
 //
 //  Permission is hereby granted, free of charge, to any person obtaining a copy
 //  of this software and associated documentation files (the "Software"), to
 //  deal in the Software without restriction, including without limitation the
 //  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 //  sell copies of the Software, and to permit persons to whom the Software is
 //  furnished to do so, subject to the following conditions:
 //  
 //  The above copyright notice and this permission notice shall be included in
 //  all copies or substantial portions of the Software.
 //
 //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 //  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 //  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 //  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 //  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 //  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 //  IN THE SOFTWARE.
 layout(push_constant) uniform Push
 {
 	vec4 SourceSize;
 	vec4 OriginalSize;
 	vec4 OutputSize;
 	uint FrameCount;
 } params;
 layout(std140, set = 0, binding = 0) uniform UBO
 {
 	mat4 MVP;
 } global;
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 //  PASS SETTINGS:
 //  gamma-management.h needs to know what kind of pipeline we're using and
 //  what pass this is in that pipeline.  This will become obsolete if/when we
 //  can #define things like this in the preset file.
 #define GAMMA_ENCODE_EVERY_FBO
 //#define FIRST_PASS
 #define LAST_PASS
 //#define SIMULATE_CRT_ON_LCD
 //#define SIMULATE_GBA_ON_LCD
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 //  blur-functions.h needs to know our profile's capabilities:
 //  1.) DRIVERS_ALLOW_DERIVATIVES is mandatory for one-pass shared-sample blurs.
 //  2.) DRIVERS_ALLOW_TEX2DLOD is optional, but mipmapped blurs will have awful
 //      artifacts without it due to funky texture sampling derivatives.
 #define DRIVERS_ALLOW_DERIVATIVES
 #define DRIVERS_ALLOW_TEX2DLOD
 ///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 #include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-shared-sample.h"
 #pragma stage fragment
 layout(location = 0) in vec4 tex_uv;
 layout(location = 1) in vec4 output_pixel_num;
 layout(location = 2) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 #define input_texture Source
 /////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
 #include "../include/gamma-management.h"
 #include "../include/blur-functions.h"
 void main()
 {
    //  output_pixel_num carries the pixel number measured from two origins
    //  (uv and screen); floor it to whole pixels and derive this fragment's
    //  position within its pixel quad:
    float4 quad_vector = get_quad_vector(floor(output_pixel_num));
    //  Run the shared-sample 10x10 blur on the input texture:
    float3 blurred = tex2Dblur10x10shared(input_texture, tex_uv, blur_dxdy,
        quad_vector);
    //  Encode the blurred color (fourth component fixed at 1.0) and emit it:
    FragColor = encode_output(float4(blurred, 1.0));
 }
--- a/blurs/blur10x10shared-last-pass.slang
+++ b/blurs/blur10x10shared-last-pass.slang
@ -0,0 +1,87 @@
 #version 450
 /////////////////////////////////  MIT LICENSE  ////////////////////////////////
 //  Copyright (C) 2014 TroggleMonkey
 //
 //  Permission is hereby granted, free of charge, to any person obtaining a copy
 //  of this software and associated documentation files (the "Software"), to
 //  deal in the Software without restriction, including without limitation the
 //  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 //  sell copies of the Software, and to permit persons to whom the Software is
 //  furnished to do so, subject to the following conditions:
 //  
 //  The above copyright notice and this permission notice shall be included in
 //  all copies or substantial portions of the Software.
 //
 //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 //  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 //  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 //  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 //  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 //  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 //  IN THE SOFTWARE.
 layout(push_constant) uniform Push
 {
 	vec4 SourceSize;
 	vec4 OriginalSize;
 	vec4 OutputSize;
 	uint FrameCount;
 } params;
 layout(std140, set = 0, binding = 0) uniform UBO
 {
 	mat4 MVP;
 } global;
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 //  PASS SETTINGS:
 //  gamma-management.h needs to know what kind of pipeline we're using and
 //  what pass this is in that pipeline.  This will become obsolete if/when we
 //  can #define things like this in the preset file.
 //#define GAMMA_ENCODE_EVERY_FBO
 //#define FIRST_PASS
 #define LAST_PASS
 //#define SIMULATE_CRT_ON_LCD
 //#define SIMULATE_GBA_ON_LCD
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 //  blur-functions.h needs to know our profile's capabilities:
 //  1.) DRIVERS_ALLOW_DERIVATIVES is mandatory for one-pass shared-sample blurs.
 //  2.) DRIVERS_ALLOW_TEX2DLOD is optional, but mipmapped blurs will have awful
 //      artifacts without it due to funky texture sampling derivatives.
 #define DRIVERS_ALLOW_DERIVATIVES
 #define DRIVERS_ALLOW_TEX2DLOD
 ///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 #include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-shared-sample.h"
 #pragma stage fragment
 layout(location = 0) in vec4 tex_uv;
 layout(location = 1) in vec4 output_pixel_num;
 layout(location = 2) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 #define input_texture Source
 /////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
 #include "../include/gamma-management.h"
 #include "../include/blur-functions.h"
 void main()
 {
    //  output_pixel_num carries the pixel number measured from two origins
    //  (uv and screen); floor it to whole pixels and derive this fragment's
    //  position within its pixel quad:
    float4 quad_vector = get_quad_vector(floor(output_pixel_num));
    //  Run the shared-sample 10x10 blur on the input texture:
    float3 blurred = tex2Dblur10x10shared(input_texture, tex_uv, blur_dxdy,
        quad_vector);
    //  Encode the blurred color (fourth component fixed at 1.0) and emit it:
    FragColor = encode_output(float4(blurred, 1.0));
 }
--- a/blurs/blur10x10shared.slang
+++ b/blurs/blur10x10shared.slang
@ -0,0 +1,87 @@
 #version 450
 /////////////////////////////////  MIT LICENSE  ////////////////////////////////
 //  Copyright (C) 2014 TroggleMonkey
 //
 //  Permission is hereby granted, free of charge, to any person obtaining a copy
 //  of this software and associated documentation files (the "Software"), to
 //  deal in the Software without restriction, including without limitation the
 //  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 //  sell copies of the Software, and to permit persons to whom the Software is
 //  furnished to do so, subject to the following conditions:
 //  
 //  The above copyright notice and this permission notice shall be included in
 //  all copies or substantial portions of the Software.
 //
 //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 //  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 //  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 //  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 //  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 //  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 //  IN THE SOFTWARE.
 layout(push_constant) uniform Push
 {
 	vec4 SourceSize;
 	vec4 OriginalSize;
 	vec4 OutputSize;
 	uint FrameCount;
 } params;
 layout(std140, set = 0, binding = 0) uniform UBO
 {
 	mat4 MVP;
 } global;
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 //  PASS SETTINGS:
 //  gamma-management.h needs to know what kind of pipeline we're using and
 //  what pass this is in that pipeline.  This will become obsolete if/when we
 //  can #define things like this in the preset file.
 //#define GAMMA_ENCODE_EVERY_FBO
 //#define FIRST_PASS
 //#define LAST_PASS
 //#define SIMULATE_CRT_ON_LCD
 //#define SIMULATE_GBA_ON_LCD
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 //  blur-functions.h needs to know our profile's capabilities:
 //  1.) DRIVERS_ALLOW_DERIVATIVES is mandatory for one-pass shared-sample blurs.
 //  2.) DRIVERS_ALLOW_TEX2DLOD is optional, but mipmapped blurs will have awful
 //      artifacts without it due to funky texture sampling derivatives.
 #define DRIVERS_ALLOW_DERIVATIVES
 #define DRIVERS_ALLOW_TEX2DLOD
 ///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 #include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-shared-sample.h"
 #pragma stage fragment
 layout(location = 0) in vec4 tex_uv;
 layout(location = 1) in vec4 output_pixel_num;
 layout(location = 2) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 #define input_texture Source
 /////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
 #include "../include/gamma-management.h"
 #include "../include/blur-functions.h"
 void main()
 {
    //  output_pixel_num carries the pixel number measured from two origins
    //  (uv and screen); floor it to whole pixels and derive this fragment's
    //  position within its pixel quad:
    float4 quad_vector = get_quad_vector(floor(output_pixel_num));
    //  Run the shared-sample 10x10 blur on the input texture:
    float3 blurred = tex2Dblur10x10shared(input_texture, tex_uv, blur_dxdy,
        quad_vector);
    //  Encode the blurred color (fourth component fixed at 1.0) and emit it:
    FragColor = encode_output(float4(blurred, 1.0));
 }
--- a/blurs/blur12x12shared-gamma-encode-every-fbo.slang
+++ b/blurs/blur12x12shared-gamma-encode-every-fbo.slang
@ -0,0 +1,87 @@
 #version 450
 /////////////////////////////////  MIT LICENSE  ////////////////////////////////
 //  Copyright (C) 2014 TroggleMonkey
 //
 //  Permission is hereby granted, free of charge, to any person obtaining a copy
 //  of this software and associated documentation files (the "Software"), to
 //  deal in the Software without restriction, including without limitation the
 //  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 //  sell copies of the Software, and to permit persons to whom the Software is
 //  furnished to do so, subject to the following conditions:
 //  
 //  The above copyright notice and this permission notice shall be included in
 //  all copies or substantial portions of the Software.
 //
 //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 //  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 //  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 //  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 //  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 //  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 //  IN THE SOFTWARE.
 layout(push_constant) uniform Push
 {
 	vec4 SourceSize;
 	vec4 OriginalSize;
 	vec4 OutputSize;
 	uint FrameCount;
 } params;
 layout(std140, set = 0, binding = 0) uniform UBO
 {
 	mat4 MVP;
 } global;
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 //  PASS SETTINGS:
 //  gamma-management.h needs to know what kind of pipeline we're using and
 //  what pass this is in that pipeline.  This will become obsolete if/when we
 //  can #define things like this in the preset file.
 #define GAMMA_ENCODE_EVERY_FBO
 //#define FIRST_PASS
 //#define LAST_PASS
 //#define SIMULATE_CRT_ON_LCD
 //#define SIMULATE_GBA_ON_LCD
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 //  blur-functions.h needs to know our profile's capabilities:
 //  1.) DRIVERS_ALLOW_DERIVATIVES is mandatory for one-pass shared-sample blurs.
 //  2.) DRIVERS_ALLOW_TEX2DLOD is optional, but mipmapped blurs will have awful
 //      artifacts without it due to funky texture sampling derivatives.
 #define DRIVERS_ALLOW_DERIVATIVES
 #define DRIVERS_ALLOW_TEX2DLOD
 ///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 #include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-shared-sample.h"
 #pragma stage fragment
 layout(location = 0) in vec4 tex_uv;
 layout(location = 1) in vec4 output_pixel_num;
 layout(location = 2) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 #define input_texture Source
 /////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
 #include "../include/gamma-management.h"
 #include "../include/blur-functions.h"
 void main()
 {
    //  output_pixel_num carries the pixel number measured from two origins
    //  (uv and screen); floor it to whole pixels and derive this fragment's
    //  position within its pixel quad:
    float4 quad_vector = get_quad_vector(floor(output_pixel_num));
    //  Run the shared-sample 12x12 blur on the input texture:
    float3 blurred = tex2Dblur12x12shared(input_texture, tex_uv, blur_dxdy,
        quad_vector);
    //  Encode the blurred color (fourth component fixed at 1.0) and emit it:
    FragColor = encode_output(float4(blurred, 1.0));
 }
--- a/blurs/blur12x12shared-last-pass-gamma-encode-every-fbo.slang
+++ b/blurs/blur12x12shared-last-pass-gamma-encode-every-fbo.slang
@ -0,0 +1,87 @@
 #version 450
 /////////////////////////////////  MIT LICENSE  ////////////////////////////////
 //  Copyright (C) 2014 TroggleMonkey
 //
 //  Permission is hereby granted, free of charge, to any person obtaining a copy
 //  of this software and associated documentation files (the "Software"), to
 //  deal in the Software without restriction, including without limitation the
 //  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 //  sell copies of the Software, and to permit persons to whom the Software is
 //  furnished to do so, subject to the following conditions:
 //  
 //  The above copyright notice and this permission notice shall be included in
 //  all copies or substantial portions of the Software.
 //
 //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 //  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 //  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 //  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 //  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 //  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 //  IN THE SOFTWARE.
 layout(push_constant) uniform Push
 {
 	vec4 SourceSize;
 	vec4 OriginalSize;
 	vec4 OutputSize;
 	uint FrameCount;
 } params;
 layout(std140, set = 0, binding = 0) uniform UBO
 {
 	mat4 MVP;
 } global;
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 //  PASS SETTINGS:
 //  gamma-management.h needs to know what kind of pipeline we're using and
 //  what pass this is in that pipeline.  This will become obsolete if/when we
 //  can #define things like this in the preset file.
 #define GAMMA_ENCODE_EVERY_FBO
 //#define FIRST_PASS
 #define LAST_PASS
 //#define SIMULATE_CRT_ON_LCD
 //#define SIMULATE_GBA_ON_LCD
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 //  blur-functions.h needs to know our profile's capabilities:
 //  1.) DRIVERS_ALLOW_DERIVATIVES is mandatory for one-pass shared-sample blurs.
 //  2.) DRIVERS_ALLOW_TEX2DLOD is optional, but mipmapped blurs will have awful
 //      artifacts without it due to funky texture sampling derivatives.
 #define DRIVERS_ALLOW_DERIVATIVES
 #define DRIVERS_ALLOW_TEX2DLOD
 ///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 #include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-shared-sample.h"
 #pragma stage fragment
 layout(location = 0) in vec4 tex_uv;
 layout(location = 1) in vec4 output_pixel_num;
 layout(location = 2) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 #define input_texture Source
 /////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
 #include "../include/gamma-management.h"
 #include "../include/blur-functions.h"
 void main()
 {
    //  output_pixel_num carries the pixel number measured from two origins
    //  (uv and screen); floor it to whole pixels and derive this fragment's
    //  position within its pixel quad:
    float4 quad_vector = get_quad_vector(floor(output_pixel_num));
    //  Run the shared-sample 12x12 blur on the input texture:
    float3 blurred = tex2Dblur12x12shared(input_texture, tex_uv, blur_dxdy,
        quad_vector);
    //  Encode the blurred color (fourth component fixed at 1.0) and emit it:
    FragColor = encode_output(float4(blurred, 1.0));
 }
--- a/blurs/blur12x12shared-last-pass.slang
+++ b/blurs/blur12x12shared-last-pass.slang
@ -0,0 +1,87 @@
 #version 450
 /////////////////////////////////  MIT LICENSE  ////////////////////////////////
 //  Copyright (C) 2014 TroggleMonkey
 //
 //  Permission is hereby granted, free of charge, to any person obtaining a copy
 //  of this software and associated documentation files (the "Software"), to
 //  deal in the Software without restriction, including without limitation the
 //  rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 //  sell copies of the Software, and to permit persons to whom the Software is
 //  furnished to do so, subject to the following conditions:
 //  
 //  The above copyright notice and this permission notice shall be included in
 //  all copies or substantial portions of the Software.
 //
 //  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 //  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 //  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 //  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 //  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 //  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 //  IN THE SOFTWARE.
 layout(push_constant) uniform Push
 {
 	vec4 SourceSize;
 	vec4 OriginalSize;
 	vec4 OutputSize;
 	uint FrameCount;
 } params;
 layout(std140, set = 0, binding = 0) uniform UBO
 {
 	mat4 MVP;
 } global;
 /////////////////////////////  SETTINGS MANAGEMENT  ////////////////////////////
 //  PASS SETTINGS:
 //  gamma-management.h needs to know what kind of pipeline we're using and
 //  what pass this is in that pipeline.  This will become obsolete if/when we
 //  can #define things like this in the preset file.
 //#define GAMMA_ENCODE_EVERY_FBO
 //#define FIRST_PASS
 #define LAST_PASS
 //#define SIMULATE_CRT_ON_LCD
 //#define SIMULATE_GBA_ON_LCD
 //#define SIMULATE_LCD_ON_CRT
 //#define SIMULATE_GBA_ON_CRT
 //  blur-functions.h needs to know our profile's capabilities:
 //  1.) DRIVERS_ALLOW_DERIVATIVES is mandatory for one-pass shared-sample blurs.
 //  2.) DRIVERS_ALLOW_TEX2DLOD is optional, but mipmapped blurs will have awful
 //      artifacts without it due to funky texture sampling derivatives.
 #define DRIVERS_ALLOW_DERIVATIVES
 #define DRIVERS_ALLOW_TEX2DLOD
 ///////////////////////////////  VERTEX INCLUDES  ///////////////////////////////
 #include "../include/compat_macros.inc"
 #pragma stage vertex
 #include "vertex-shader-blur-one-pass-shared-sample.h"
 #pragma stage fragment
 layout(location = 0) in vec4 tex_uv;
 layout(location = 1) in vec4 output_pixel_num;
 layout(location = 2) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 #define input_texture Source
 /////////////////////////////  FRAGMENT INCLUDES  /////////////////////////////
 #include "../include/gamma-management.h"
 #include "../include/blur-functions.h"
 void main()
 {
    //  output_pixel_num carries the pixel number measured from two origins
    //  (uv and screen); floor it to whole pixels and derive this fragment's
    //  position within its pixel quad:
    float4 quad_vector = get_quad_vector(floor(output_pixel_num));
    //  Run the shared-sample 12x12 blur on the input texture:
    float3 blurred = tex2Dblur12x12shared(input_texture, tex_uv, blur_dxdy,
        quad_vector);
    //  Encode the blurred color (fourth component fixed at 1.0) and emit it:
    FragColor = encode_output(float4(blurred, 1.0));
 }
--- a/blurs/blur12x12shared.slang
+++ b/blurs/blur12x12shared.slang
@ -40,7 +40,7 @@ layout(std140, set = 0, binding = 0) uniform UBO
 //  PASS SETTINGS:
 //  gamma-management.h needs to know what kind of pipeline we're using and
 //  what pass this is in that pipeline.  This will become obsolete if/when we
-//  can #define things like this in the .cgp preset file.
+//  can #define things like this in the preset file.
 //#define GAMMA_ENCODE_EVERY_FBO
 //#define FIRST_PASS
 //#define LAST_PASS
@ -64,8 +64,8 @@ layout(std140, set = 0, binding = 0) uniform UBO
 #pragma stage fragment
 layout(location = 0) in vec4 tex_uv;
-layout(location = 1) in vec2 blur_dxdy;
+layout(location = 1) in vec4 output_pixel_num;
-layout(location = 2) in vec4 output_pixel_num;
+layout(location = 2) in vec2 blur_dxdy;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 #define input_texture Source
@ -84,4 +84,4 @@ void main()
        blur_dxdy, quad_vector);
    //  Encode and output the blurred image:
    FragColor = encode_output(float4(color, 1.0));
-}
+}
--- a/handheld/shaders/color/nds-color.slang
+++ b/handheld/shaders/color/nds-color.slang
@ -1,13 +1,5 @@
 #version 450
 layout(std140, set = 0, binding = 0) uniform UBO
 {
   mat4 MVP;
   vec4 OutputSize;
   vec4 OriginalSize;
   vec4 SourceSize;
 } global;
 /*
   Shader Modified: Pokefan531
   Color Mangler
@ -16,74 +8,105 @@ layout(std140, set = 0, binding = 0) uniform UBO
 */
 // Shader that replicates the LCD dynamics from a Nintendo DS Phat --
-#define target_gamma 1.91
+layout(std140, set = 0, binding = 0) uniform UBO
-#define display_gamma 1.91
+{
-#define sat 1.0
+	mat4 MVP;
-#define lum 0.89
+	vec4 OutputSize;
-#define contrast 1.0
+	vec4 OriginalSize;
-#define blr 0.0
+	vec4 SourceSize;
-#define blg 0.0
+	float mode, white_toggle;
-#define blb 0.0
+} global;
-#define r 0.87
+
-#define g 0.645
+#pragma parameter mode "Color Profile (1=sRGB, 2=DCI, 3=Rec2020)" 1.0 1.0 3.0 1.0
-#define b 0.73
+int color_mode = int(global.mode);
-#define rg 0.10
+
-#define rb 0.10
+#pragma parameter white_toggle "Toggle White Balance" 0.0 0.0 1.0 1.0
-#define gr 0.255
+bool white = bool(global.white_toggle);
-#define gb 0.17
+
-#define br -0.125
+#define target_gamma 2.2
-#define bg 0.255
+#define display_gamma 2.2
 #define overscan_percent_x 0.0
 #define overscan_percent_y 0.0
 /*
-White-NDS
+We'll define our color weights in this pattern:
-#define lum 0.98
+	r,   rg,  rb,  0.0,  //red channel
-#define r 0.83
+	gr,  g,   gb,  0.0,  //green channel
-#define g 0.625
+	br,  bg,  b,   0.0,  //blue channel
-#define b 0.785
+	blr, blg, blb, lum   //alpha channel; we'll hide luma at the end, too
 #define rg 0.07
 #define rb 0.075
 #define gr 0.22
 #define gb 0.155
 #define br -0.13
 #define bg 0.22
 */
 // Colour-correction profiles: one per target gamut (Rec2020, DCI, sRGB), each
 // with a plain and a "_white" variant. The vertex stage picks one of them from
 // color_mode and the white toggle.
 // GLSL fills a mat4 constructor column by column, so each row written below is
 // one column of the resulting matrix.
 // The last element ([3].w) is not a colour weight: the fragment stage reads it
 // back as the luminance scale.
 const mat4 NDS_Rec2020 = mat4(
 	0.58, 0.13,  0.09, 0.0,
 	0.38, 0.645, 0.20, 0.0,
 	0.04, 0.225, 0.71, 0.0,
 	0.0,  0.0,   0.0,  1.0
 ); 
 // Rec2020 profile used when the white-balance toggle is on.
 const mat4 NDS_Rec2020_white = mat4(
 	0.535, 0.12,  0.09, 0.0,
 	0.345, 0.585, 0.20, 0.0,
 	0.04,  0.215, 0.71, 0.0,
 	0.0,   0.0,   0.0,  1.0
 ); 
 // DCI profile (luminance 0.95).
 const mat4 NDS_DCI = mat4(
 	0.745, 0.10,  0.09,  0.0,
 	0.315, 0.665, 0.195, 0.0,
 	-0.06, 0.235, 0.715, 0.0,
 	0.0,   0.0,   0.0,   0.95
 ); 
 // DCI profile used when the white-balance toggle is on.
 const mat4 NDS_DCI_white = mat4(
 	0.685, 0.095, 0.09,  0.0,
 	0.29,  0.605, 0.19,  0.0,
 	-0.06, 0.215, 0.715, 0.0,
 	0.0,   0.0,   0.0,   1.0
 ); 
 // sRGB profile (luminance 0.89); selected when color_mode is 1.
 const mat4 NDS_sRGB = mat4(
 	0.815, 0.07, 0.075, 0.0,
 	0.215, 0.62, 0.155, 0.0,
 	-0.12, 0.22, 0.77,  0.0,
 	0.0,   0.0,  0.0,   0.89
 ); 
 // sRGB profile used when the white-balance toggle is on.
 // NOTE(review): the nine colour weights here are identical to NDS_sRGB; only
 // the luminance differs (0.97 vs 0.89). The other two gamuts change their
 // colour weights for the white variant -- confirm this one is intended.
 const mat4 NDS_sRGB_white = mat4(
 	0.815, 0.07, 0.075, 0.0,
 	0.215, 0.62, 0.155, 0.0,
 	-0.12, 0.22, 0.77,  0.0,
 	0.0,   0.0,  0.0,   0.97
 ); 
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
 layout(location = 1) in vec2 TexCoord;
 layout(location = 0) out vec2 vTexCoord;
 layout(location = 1) out mat4 profile;
 void main()
 {
-   gl_Position = global.MVP * Position;
+	gl_Position = global.MVP * Position;
-   vTexCoord = TexCoord;
+	vTexCoord = TexCoord;
 	// Choose the colour profile for the fragment stage from the requested
 	// gamut (3 = Rec2020, 2 = DCI) and the white-balance toggle.
 	// Every path has to write `profile`: an output that is never assigned
 	// is undefined in the fragment stage. sRGB is therefore both mode 1 and
 	// the fallback for any value outside 1..3.
 	if (color_mode == 3) profile = (!white) ? NDS_Rec2020 : NDS_Rec2020_white;
 	else if (color_mode == 2) profile = (!white) ? NDS_DCI : NDS_DCI_white;
 	else profile = (!white) ? NDS_sRGB : NDS_sRGB_white;
 }
 #pragma stage fragment
 layout(location = 0) in vec2 vTexCoord;
 layout(location = 1) in mat4 profile;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 void main()
 {
-   vec4 screen = pow(texture(Source, vTexCoord), vec4(target_gamma)).rgba;
+	// bring out our stored luminance value
-   vec4 avglum = vec4(0.5);
+	float lum = profile[3].w;
-   screen = mix(screen, avglum, (1.0 - contrast));
+
-   
+	// our adjustments need to happen in linear gamma
- //				r   g    b   black
+	vec4 screen = pow(texture(Source, vTexCoord), vec4(target_gamma)).rgba;
-mat4 color = mat4(r,  rg,  rb, 0.0,  //red channel
+
 			   gr,  g,   gb, 0.0,  //green channel
 			   br,  bg,  b,  0.0,  //blue channel
 			  blr, blg, blb,    0.0); //alpha channel; these numbers do nothing for our purposes.
 mat4 adjust = mat4((1.0 - sat) * 0.3086 + sat, (1.0 - sat) * 0.3086, (1.0 - sat) * 0.3086, 1.0,
 (1.0 - sat) * 0.6094, (1.0 - sat) * 0.6094 + sat, (1.0 - sat) * 0.6094, 1.0,
 (1.0 - sat) * 0.0820, (1.0 - sat) * 0.0820, (1.0 - sat) * 0.0820 + sat, 1.0,
 0.0, 0.0, 0.0, 1.0);
 	color *= adjust;
 	screen = clamp(screen * lum, 0.0, 1.0);
-	screen = color * screen;
+	screen = profile * screen;
 	FragColor = pow(screen, vec4(1.0 / display_gamma));
-}
+}
--- a/misc/ss-gamma-ramp.slang
+++ b/misc/ss-gamma-ramp.slang
@ -0,0 +1,72 @@
 #version 450
 // Super Sleuth Gamma Ramp
 // based on Overload's ramp as implemented in bsnes v073
 // ported by hunterk
 // license: GPLv2
 // Values delivered through the push-constant block. Of these, the fragment
 // stage in this file only reads `mixer`.
 layout(push_constant) uniform Push
 {
 	vec4 SourceSize;
 	vec4 OriginalSize;
 	vec4 OutputSize;
 	uint FrameCount;
 	// user parameter declared by the #pragma below; treated as a percentage
 	float mixer;
 } params;
 // mixer: default 150, minimum 100, maximum 200, step 1. The fragment stage
 // converts it to a blend weight of (mixer / 100) - 1, so 100 gives the
 // unmodified image and 200 gives the fully ramped one.
 #pragma parameter mixer "Gamma Boost (%)" 150.0 100.0 200.0 1.0
 // Uniform block holding the matrix the vertex stage multiplies Position by.
 layout(std140, set = 0, binding = 0) uniform UBO
 {
 	mat4 MVP;
 } global;
 #pragma stage vertex
 layout(location = 0) in vec4 Position;
 layout(location = 1) in vec2 TexCoord;
 layout(location = 0) out vec2 vTexCoord;
 // Pass-through vertex stage: no per-vertex work beyond the MVP transform.
 void main()
 {
 	// hand the texture coordinate to the fragment stage untouched
 	vTexCoord = TexCoord;
 	// transform the incoming vertex position by the MVP matrix
 	gl_Position = global.MVP * Position;
 }
 #pragma stage fragment
 layout(location = 0) in vec2 vTexCoord;
 layout(location = 0) out vec4 FragColor;
 layout(set = 0, binding = 2) uniform sampler2D Source;
 // Overload's gamma ramp from Super Sleuth
 // Apparently not really based on anything but it looks nice
 // 32 entries, one per 5-bit input level; each entry is an 8-bit output level.
 const uint gammaRamp[32] = uint[32](
 	0x00u, 0x01u, 0x03u, 0x06u, 0x0au, 0x0fu, 0x15u, 0x1cu,
 	0x24u, 0x2du, 0x37u, 0x42u, 0x4eu, 0x5bu, 0x69u, 0x78u,
 	0x88u, 0x90u, 0x98u, 0xa0u, 0xa8u, 0xb0u, 0xb8u, 0xc0u,
 	0xc8u, 0xd0u, 0xd8u, 0xe0u, 0xe8u, 0xf0u, 0xf8u, 0xffu
 );
 // Convert a normalised float channel to an integer level in 0..255:
 // values >= 1.0 give 255, values <= 0.0 give 0, anything in between gives
 // floor(f * 256). The argument is parenthesised at every use so that an
 // expression such as conv(a + b) expands with the intended precedence.
 #define conv(f) (((f) >= 1.0) ? 255 : (((f) <= 0.0) ? 0 : int(floor((f) * 256.0))))
 // Fragment stage: push each colour channel through the Super Sleuth gamma
 // ramp and blend the result with the untouched source by the user's boost.
 void main()
 {
 	vec4 img = texture(Source, vTexCoord);
 	// Quantise the colour channels to 8-bit levels (conv() already limits
 	// them to 0..255; the mask just mirrors the original byte handling).
 	uvec3 level = uvec3(conv(img.r), conv(img.g), conv(img.b)) & 0xffu;
 	// The ramp has 32 entries, so the top five bits of each level index it.
 	uvec3 slot = level >> 3u;
 	// apply the ramp and bring the result back to the 0..1 range
 	uvec3 output_i = uvec3(gammaRamp[slot.r], gammaRamp[slot.g], gammaRamp[slot.b]);
 	vec3  output_f = vec3(output_i) * vec3(1./255.);
 	// params.mixer is a percentage; (mixer / 100) - 1 is the blend weight
 	float boost = (params.mixer / 100.0) - 1.0;
 	// Write all four components. Assigning only .rgb leaves the output's
 	// alpha undefined, so the source alpha is passed through unchanged.
 	FragColor = vec4(mix(img.rgb, output_f, boost), img.a);
 }