Move FIR filters to their own module

2022-06-07 15:19:18 +02:00 · 2022-06-07 15:19:18 +02:00 · b32cd27e8c
parent ac5796ee59
commit b32cd27e8c
3 changed files with 211 additions and 188 deletions
--- a/plugins/crossover/src/crossover/fir.rs
+++ b/plugins/crossover/src/crossover/fir.rs
@ -17,11 +17,14 @@
 use nih_plug::buffer::ChannelSamples;
 use nih_plug::debug::*;
 use std::f32;
-use std::simd::{f32x2, StdFloat};
+use std::simd::f32x2;
-use crate::crossover::iir::biquad::{Biquad, BiquadCoefficients, NEUTRAL_Q};
+use self::filter::{FirCoefficients, FirFilter};
 use crate::crossover::iir::biquad::{BiquadCoefficients, NEUTRAL_Q};
 use crate::NUM_BANDS;
 pub mod filter;
 // TODO: Move this to FFT convolution so we can increase the filter size and improve low latency performance
 /// The size of the FIR filter window, or the number of taps. The low frequency performance is
@ -52,53 +55,6 @@ pub enum FirCrossoverType {
    LinkwitzRiley24LinearPhase,
 }
 /// A single FIR filter that may be configured in any way. In this plugin this will be a
 /// linear-phase low-pass, band-pass, or high-pass filter.
 #[derive(Debug, Clone)]
 struct FirFilter {
    /// The coefficients for this filter. The filters for both channels should be equivalent, this
    /// just avoids broadcasts in the filter process.
    ///
    /// TODO: Profile to see if storing this as f32x2 rather than f32s plus splatting makes any
    ///       difference in performance at all
    pub coefficients: FirCoefficients,
    /// A ring buffer storing the last `FILTER_SIZE - 1` samples. The capacity is `FILTER_SIZE`
    /// rounded up to the next power of two.
    delay_buffer: [f32x2; RING_BUFFER_SIZE],
    /// The index in `delay_buffer` to write the next sample to. Wrapping negative indices back to
    /// the end, the previous sample can be found at `delay_buffer[delay_buffer_next_idx - 1]`, the
    /// one before that at `delay_buffer[delay_buffer_next_idx - 2]`, and so on.
    delay_buffer_next_idx: usize,
 }
 /// Coefficients for an FIR filter. This struct includes ways to design the filter. Parameterized
 /// over `f32x2` only for the time being since that's what we need here.
 #[repr(transparent)]
 #[derive(Debug, Clone)]
 struct FirCoefficients([f32x2; FILTER_SIZE]);
 impl Default for FirFilter {
    fn default() -> Self {
        Self {
            coefficients: FirCoefficients::default(),
            delay_buffer: [f32x2::default(); RING_BUFFER_SIZE],
            delay_buffer_next_idx: 0,
        }
    }
 }
 impl Default for FirCoefficients {
    fn default() -> Self {
        // Initialize this to a delay with the same amount of latency as we'd introduce with our
        // linear-phase filters
        let mut coefficients = [f32x2::default(); FILTER_SIZE];
        coefficients[FILTER_SIZE / 2] = f32x2::splat(1.0);
        Self(coefficients)
    }
 }
 impl FirCrossover {
    /// Create a new multiband crossover processor. All filters will be configured to pass audio
    /// through as is, albeit with a delay. `.update()` needs to be called first to set up the
@ -258,141 +214,3 @@ impl FirCrossover {
        }
    }
 }
 impl FirFilter {
    /// Process left and right audio samples through the filter.
    pub fn process(&mut self, samples: f32x2) -> f32x2 {
        // TODO: Replace direct convolution with FFT convolution, would make the implementation much
        //       more complex though because of the multi output part
        let coefficients = &self.coefficients.0;
        let mut result = coefficients[0] * samples;
        // Now multiply `self.coefficients[1..]` with the delay buffer starting at
        // `self.delay_buffer_next_idx - 1`, wrapping around to the end when that is reached
        // The end index is exclusive, and we already did the multiply+add for the first coefficient.
        let before_wraparound_start_idx = self
            .delay_buffer_next_idx
            .saturating_sub(coefficients.len() - 1);
        let before_wraparound_end_idx = self.delay_buffer_next_idx;
        let num_before_wraparound = before_wraparound_end_idx - before_wraparound_start_idx;
        for (coefficient, delayed_sample) in coefficients[1..1 + num_before_wraparound].iter().zip(
            self.delay_buffer[before_wraparound_start_idx..before_wraparound_end_idx]
                .iter()
                .rev(),
        ) {
            // `result += coefficient * sample`, but with explicit FMA
            result = coefficient.mul_add(*delayed_sample, result);
        }
        let after_wraparound_begin_idx =
            self.delay_buffer.len() - (coefficients.len() - num_before_wraparound);
        let after_wraparound_end_idx = self.delay_buffer.len();
        for (coefficient, delayed_sample) in coefficients[1 + num_before_wraparound..].iter().zip(
            self.delay_buffer[after_wraparound_begin_idx..after_wraparound_end_idx]
                .iter()
                .rev(),
        ) {
            result = coefficient.mul_add(*delayed_sample, result);
        }
        // And finally write the samples to the delay buffer for the enxt sample
        self.delay_buffer[self.delay_buffer_next_idx] = samples;
        self.delay_buffer_next_idx = (self.delay_buffer_next_idx + 1) % self.delay_buffer.len();
        result
    }
    /// Reset the internal filter state.
    pub fn reset(&mut self) {
        self.delay_buffer.fill(f32x2::default());
        self.delay_buffer_next_idx = 0;
    }
 }
 impl FirCoefficients {
    /// A somewhat crude but very functional and relatively fast way create linear phase FIR
    /// **low-pass** filter that matches the frequency response of a fourth order biquad low-pass
    /// filter. As in, this matches the frequency response magnitudes of applying those biquads to a
    /// signal twice. This only works for low-pass filters, as the function normalizes the result to
    /// hae unity gain at the DC bin. The algorithm works as follows:
    ///
    /// - An impulse function (so all zeroes except for the first element) of length `FILTER_LEN / 2
    ///   + 1` is filtered with the biquad.
    /// - The biquad's state is reset, and the impulse response is filtered in the opposite
    ///   direction.
    /// - At this point the bidirectionally filtered impulse response contains the **right** half of
    ///   a truncated linear phase FIR kernel.
    ///
    /// Since the FIR filter will be a symmetrical version of this impulse response, we can optimize
    /// the post-processing work slightly by windowing and normalizing this bidirectionally filtered
    /// impulse response instead.
    ///
    /// - A half Blackman window is applied to the impulse response. Since this is the right half,
    ///   this starts at unity gain for the first sample and then tapers off towards the right.
    /// - The impulse response is then normalized such that the final linear-phase FIR kernel has a
    ///   sum of 1.0. Since it will be symmetrical around the IRs first sample, the would-be final
    ///   sum can be computed as `ir.sum() * 2 - ir[0]`.
    ///
    /// Lastly the linear phase FIR filter simply needs to be constructed from this right half:
    ///
    /// - This bidirectionally filtered impulse response is then reversed, and placed at the start
    ///   of the `FILTER_LEN` size FIR coefficient array.
    /// - The non-reversed bidirectionally filtered impulse response is copied to the second half of
    ///   the coefficients. (one of the copies doesn't need to include the centermost coefficient)
    ///
    /// The corresponding high-pass filter can be computed through spectral inversion.
    pub fn design_fourth_order_linear_phase_low_pass_from_biquad(
        biquad_coefs: BiquadCoefficients<f32x2>,
    ) -> Self {
        const CENTER_IDX: usize = FILTER_SIZE / 2;
        // We'll start with an impulse (at exactly half of this odd sized buffer)...
        let mut impulse_response = [f32x2::default(); FILTER_SIZE];
        impulse_response[CENTER_IDX] = f32x2::splat(1.0);
        // ...and filter that in both directions
        let mut biquad = Biquad::default();
        biquad.coefficients = biquad_coefs;
        for sample in impulse_response.iter_mut().skip(CENTER_IDX - 1) {
            *sample = biquad.process(*sample);
        }
        biquad.reset();
        for sample in impulse_response.iter_mut().skip(CENTER_IDX - 1).rev() {
            *sample = biquad.process(*sample);
        }
        // Now the right half of `impulse_response` contains a truncated right half of the
        // linear-phase FIR filter. We can apply the window function here, and then fianlly
        // normalize it so that the the final FIR filter kernel sums to 1.
        // Adopted from `nih_plug::util::window`. We only end up applying the right half of the
        // window, starting at the top of the window.
        let blackman_scale_1 = (2.0 * f32::consts::PI) / (impulse_response.len() - 1) as f32;
        let blackman_scale_2 = blackman_scale_1 * 2.0;
        for (sample_idx, sample) in impulse_response.iter_mut().enumerate().skip(CENTER_IDX - 1) {
            let cos_1 = (blackman_scale_1 * sample_idx as f32).cos();
            let cos_2 = (blackman_scale_2 * sample_idx as f32).cos();
            *sample *= f32x2::splat(0.42 - (0.5 * cos_1) + (0.08 * cos_2));
        }
        // Since this final filter will be symmetrical around `impulse_response[CENTER_IDX]`, we
        // can simply normalize based on that fact:
        let would_be_impulse_response_sum =
            (impulse_response.iter().skip(CENTER_IDX).sum::<f32x2>() * f32x2::splat(2.0))
                - impulse_response[CENTER_IDX];
        let would_be_impulse_response_recip = would_be_impulse_response_sum.recip();
        for sample in &mut impulse_response {
            *sample *= would_be_impulse_response_recip;
        }
        // And finally we can simply copy the right half of the filter kernel to the left half
        // around the `CENTER_IDX`.
        for source_idx in CENTER_IDX + 1..impulse_response.len() {
            let target_idx = CENTER_IDX - (source_idx - CENTER_IDX);
            impulse_response[target_idx] = impulse_response[source_idx];
        }
        Self(impulse_response)
    }
 }
--- a/plugins/crossover/src/crossover/fir/filter.rs
+++ b/plugins/crossover/src/crossover/fir/filter.rs
@ -0,0 +1,206 @@
 // Crossover: clean crossovers as a multi-out plugin
 // Copyright (C) 2022 Robbert van der Helm
 //
 // This program is free software: you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation, either version 3 of the License, or
 // (at your option) any later version.
 //
 // This program is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.
 //
 // You should have received a copy of the GNU General Public License
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
 use std::f32;
 use std::simd::{f32x2, StdFloat};
 use super::{FILTER_SIZE, RING_BUFFER_SIZE};
 use crate::crossover::iir::biquad::{Biquad, BiquadCoefficients};
 /// A single FIR filter that may be configured in any way. In this plugin this will be a
 /// linear-phase low-pass, band-pass, or high-pass filter.
 #[derive(Debug, Clone)]
 pub struct FirFilter {
    /// The coefficients for this filter. The filters for both channels should be equivalent, this
    /// just avoids broadcasts in the filter process.
    ///
    /// TODO: Profile to see if storing this as f32x2 rather than f32s plus splatting makes any
    ///       difference in performance at all
    pub coefficients: FirCoefficients,
    /// A ring buffer storing the last `FILTER_SIZE - 1` samples. The capacity is `FILTER_SIZE`
    /// rounded up to the next power of two.
    delay_buffer: [f32x2; RING_BUFFER_SIZE],
    /// The index in `delay_buffer` to write the next sample to. Wrapping negative indices back to
    /// the end, the previous sample can be found at `delay_buffer[delay_buffer_next_idx - 1]`, the
    /// one before that at `delay_buffer[delay_buffer_next_idx - 2]`, and so on.
    delay_buffer_next_idx: usize,
 }
 /// Coefficients for an FIR filter. This struct includes ways to design the filter. Parameterized
 /// over `f32x2` only for the time being since that's what we need here.
 #[repr(transparent)]
 #[derive(Debug, Clone)]
 pub struct FirCoefficients(pub [f32x2; FILTER_SIZE]);
 impl Default for FirFilter {
    fn default() -> Self {
        Self {
            coefficients: FirCoefficients::default(),
            delay_buffer: [f32x2::default(); RING_BUFFER_SIZE],
            delay_buffer_next_idx: 0,
        }
    }
 }
 impl Default for FirCoefficients {
    fn default() -> Self {
        // Initialize this to a delay with the same amount of latency as we'd introduce with our
        // linear-phase filters
        let mut coefficients = [f32x2::default(); FILTER_SIZE];
        coefficients[FILTER_SIZE / 2] = f32x2::splat(1.0);
        Self(coefficients)
    }
 }
 impl FirFilter {
    /// Process left and right audio samples through the filter.
    pub fn process(&mut self, samples: f32x2) -> f32x2 {
        // TODO: Replace direct convolution with FFT convolution, would make the implementation much
        //       more complex though because of the multi output part
        let coefficients = &self.coefficients.0;
        let mut result = coefficients[0] * samples;
        // Now multiply `self.coefficients[1..]` with the delay buffer starting at
        // `self.delay_buffer_next_idx - 1`, wrapping around to the end when that is reached
        // The end index is exclusive, and we already did the multiply+add for the first coefficient.
        let before_wraparound_start_idx = self
            .delay_buffer_next_idx
            .saturating_sub(coefficients.len() - 1);
        let before_wraparound_end_idx = self.delay_buffer_next_idx;
        let num_before_wraparound = before_wraparound_end_idx - before_wraparound_start_idx;
        for (coefficient, delayed_sample) in coefficients[1..1 + num_before_wraparound].iter().zip(
            self.delay_buffer[before_wraparound_start_idx..before_wraparound_end_idx]
                .iter()
                .rev(),
        ) {
            // `result += coefficient * sample`, but with explicit FMA
            result = coefficient.mul_add(*delayed_sample, result);
        }
        let after_wraparound_begin_idx =
            self.delay_buffer.len() - (coefficients.len() - num_before_wraparound);
        let after_wraparound_end_idx = self.delay_buffer.len();
        for (coefficient, delayed_sample) in coefficients[1 + num_before_wraparound..].iter().zip(
            self.delay_buffer[after_wraparound_begin_idx..after_wraparound_end_idx]
                .iter()
                .rev(),
        ) {
            result = coefficient.mul_add(*delayed_sample, result);
        }
        // And finally write the samples to the delay buffer for the enxt sample
        self.delay_buffer[self.delay_buffer_next_idx] = samples;
        self.delay_buffer_next_idx = (self.delay_buffer_next_idx + 1) % self.delay_buffer.len();
        result
    }
    /// Reset the internal filter state.
    pub fn reset(&mut self) {
        self.delay_buffer.fill(f32x2::default());
        self.delay_buffer_next_idx = 0;
    }
 }
 impl FirCoefficients {
    /// A somewhat crude but very functional and relatively fast way create linear phase FIR
    /// **low-pass** filter that matches the frequency response of a fourth order biquad low-pass
    /// filter. As in, this matches the frequency response magnitudes of applying those biquads to a
    /// signal twice. This only works for low-pass filters, as the function normalizes the result to
    /// hae unity gain at the DC bin. The algorithm works as follows:
    ///
    /// - An impulse function (so all zeroes except for the first element) of length `FILTER_LEN / 2
    ///   + 1` is filtered with the biquad.
    /// - The biquad's state is reset, and the impulse response is filtered in the opposite
    ///   direction.
    /// - At this point the bidirectionally filtered impulse response contains the **right** half of
    ///   a truncated linear phase FIR kernel.
    ///
    /// Since the FIR filter will be a symmetrical version of this impulse response, we can optimize
    /// the post-processing work slightly by windowing and normalizing this bidirectionally filtered
    /// impulse response instead.
    ///
    /// - A half Blackman window is applied to the impulse response. Since this is the right half,
    ///   this starts at unity gain for the first sample and then tapers off towards the right.
    /// - The impulse response is then normalized such that the final linear-phase FIR kernel has a
    ///   sum of 1.0. Since it will be symmetrical around the IRs first sample, the would-be final
    ///   sum can be computed as `ir.sum() * 2 - ir[0]`.
    ///
    /// Lastly the linear phase FIR filter simply needs to be constructed from this right half:
    ///
    /// - This bidirectionally filtered impulse response is then reversed, and placed at the start
    ///   of the `FILTER_LEN` size FIR coefficient array.
    /// - The non-reversed bidirectionally filtered impulse response is copied to the second half of
    ///   the coefficients. (one of the copies doesn't need to include the centermost coefficient)
    ///
    /// The corresponding high-pass filter can be computed through spectral inversion.
    pub fn design_fourth_order_linear_phase_low_pass_from_biquad(
        biquad_coefs: BiquadCoefficients<f32x2>,
    ) -> Self {
        const CENTER_IDX: usize = FILTER_SIZE / 2;
        // We'll start with an impulse (at exactly half of this odd sized buffer)...
        let mut impulse_response = [f32x2::default(); FILTER_SIZE];
        impulse_response[CENTER_IDX] = f32x2::splat(1.0);
        // ...and filter that in both directions
        let mut biquad = Biquad::default();
        biquad.coefficients = biquad_coefs;
        for sample in impulse_response.iter_mut().skip(CENTER_IDX - 1) {
            *sample = biquad.process(*sample);
        }
        biquad.reset();
        for sample in impulse_response.iter_mut().skip(CENTER_IDX - 1).rev() {
            *sample = biquad.process(*sample);
        }
        // Now the right half of `impulse_response` contains a truncated right half of the
        // linear-phase FIR filter. We can apply the window function here, and then fianlly
        // normalize it so that the the final FIR filter kernel sums to 1.
        // Adopted from `nih_plug::util::window`. We only end up applying the right half of the
        // window, starting at the top of the window.
        let blackman_scale_1 = (2.0 * f32::consts::PI) / (impulse_response.len() - 1) as f32;
        let blackman_scale_2 = blackman_scale_1 * 2.0;
        for (sample_idx, sample) in impulse_response.iter_mut().enumerate().skip(CENTER_IDX - 1) {
            let cos_1 = (blackman_scale_1 * sample_idx as f32).cos();
            let cos_2 = (blackman_scale_2 * sample_idx as f32).cos();
            *sample *= f32x2::splat(0.42 - (0.5 * cos_1) + (0.08 * cos_2));
        }
        // Since this final filter will be symmetrical around `impulse_response[CENTER_IDX]`, we
        // can simply normalize based on that fact:
        let would_be_impulse_response_sum =
            (impulse_response.iter().skip(CENTER_IDX).sum::<f32x2>() * f32x2::splat(2.0))
                - impulse_response[CENTER_IDX];
        let would_be_impulse_response_recip = would_be_impulse_response_sum.recip();
        for sample in &mut impulse_response {
            *sample *= would_be_impulse_response_recip;
        }
        // And finally we can simply copy the right half of the filter kernel to the left half
        // around the `CENTER_IDX`.
        for source_idx in CENTER_IDX + 1..impulse_response.len() {
            let target_idx = CENTER_IDX - (source_idx - CENTER_IDX);
            impulse_response[target_idx] = impulse_response[source_idx];
        }
        Self(impulse_response)
    }
 }
--- a/src/event_loop/linux.rs
+++ b/src/event_loop/linux.rs
@ -7,7 +7,6 @@ use std::sync::Weak;
 use std::thread::{self, JoinHandle, ThreadId};
 use super::{EventLoop, MainThreadExecutor};
 use crate::nih_log;
 use crate::util::permit_alloc;
 /// See [`EventLoop`][super::EventLoop].