Rework FIR crossover to use FFT convolution

2022-06-07 20:32:27 +02:00 · 2022-06-07 20:32:27 +02:00 · 5a51dce00d
parent 2c48ceb392
commit 5a51dce00d
5 changed files with 323 additions and 118 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -711,6 +711,7 @@ name = "crossover"
 version = "0.1.0"
 dependencies = [
 "nih_plug",
+ "realfft",
 ]

 [[package]]
--- a/plugins/crossover/Cargo.toml
+++ b/plugins/crossover/Cargo.toml
@ -16,3 +16,4 @@ simd = ["nih_plug/simd"]

 [dependencies]
 nih_plug = { path = "../../", features = ["assert_process_allocs"] }
+realfft = "3.0.0"
--- a/plugins/crossover/src/crossover/fir.rs
+++ b/plugins/crossover/src/crossover/fir.rs
@ -14,36 +14,60 @@
 // You should have received a copy of the GNU General Public License
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.

-use nih_plug::buffer::ChannelSamples;
 use nih_plug::debug::*;
+use realfft::num_complex::Complex32;
+use realfft::{ComplexToReal, RealFftPlanner, RealToComplex};
 use std::f32;
-use std::simd::f32x2;
+use std::sync::Arc;

-use self::filter::{FirCoefficients, FirFilter};
+use self::filter::{FftFirFilter, FirCoefficients, FFT_INPUT_SIZE, FFT_SIZE};
+use crate::crossover::fir::filter::FILTER_SIZE;
 use crate::crossover::iir::biquad::{BiquadCoefficients, NEUTRAL_Q};
-use crate::NUM_BANDS;
+use crate::{NUM_BANDS, NUM_CHANNELS};

 pub mod filter;

-// TODO: Move this to FFT convolution so we can increase the filter size and improve low latency performance
-
-/// The size of the FIR filter window, or the number of taps. The low frequency performance is
-/// greatly limited by this.
-const FILTER_SIZE: usize = 121;
-/// The size of the FIR filter's ring buffer. This is `FILTER_SIZE` rounded up to the next power of
-/// two.
-const RING_BUFFER_SIZE: usize = FILTER_SIZE.next_power_of_two();
-
-#[derive(Debug)]
 pub struct FirCrossover {
    /// The kind of crossover to use. `.update_filters()` must be called after changing this.
    mode: FirCrossoverType,

    /// Filters for each of the bands. Depending on the number of bands argument passed to
-    /// `.process()` two to five of these may be used. The first one always contains a low-pass
+    /// `.process()`, two to five of these may be used. The first one always contains a low-pass
    /// filter, the last one always contains a high-pass filter, while the other bands will contain
    /// band-pass filters.
-    band_filters: [FirFilter; NUM_BANDS],
+    ///
+    /// These filters will be fed the FFT from the main input to produce output samples for the next
+    /// period. Everything could be a bit nicer to read if the filter did the entire STFT process,
+    /// but that would mean duplicating the input ring buffer and forward DFT up to five times.
+    band_filters: [FftFirFilter; NUM_BANDS],
+
+    /// A ring buffer that is used to store inputs for the next FFT. Until it is time to take the
+    /// next FFT, samples are copied from the inputs to this buffer, while simultaneously copying
+    /// the already processed output samples from the output buffers to the output. Once
+    /// `io_buffers_next_indices` wrap back around to 0, the next buffer should be produced.
+    input_buffers: [[f32; FFT_INPUT_SIZE]; NUM_CHANNELS as usize],
+    /// A ring that contains the next period's outputs for each of the five bands. This is written
+    /// to and read from in lockstep with `input_buffers`.
+    band_output_buffers: [[[f32; FFT_INPUT_SIZE]; NUM_CHANNELS as usize]; NUM_BANDS],
+    /// The index in the inner input and output buffers the next sample should be read from. After
+    /// samples are written to the band's output, this is incremented by the number of samples
+    /// copied. Once `self.io_buffers_next_indices[channel_idx] == FFT_INPUT_SIZE`, the next block
+    /// should be processed.
+    ///
+    /// This is stored as an array since each channel is processed individually. While this should
+    /// of course stay in sync, this makes it much simpler to process both channels in sequence.
+    io_buffers_next_indices: [usize; NUM_CHANNELS as usize],
+
+    /// The algorithm for the FFT operation.
+    r2c_plan: Arc<dyn RealToComplex<f32>>,
+    /// The algorithm for the IFFT operation.
+    c2r_plan: Arc<dyn ComplexToReal<f32>>,
+
+    /// A real buffer that may be written to in place during the FFT and IFFT operations.
+    real_scratch_buffer: [f32; FFT_SIZE],
+    /// A complex buffer corresponding to `real_scratch_buffer` that may be written to in place
+    /// during the FFT and IFFT operations.
+    complex_scratch_buffer: [Complex32; FFT_SIZE / 2 + 1],
 }

 /// The type of FIR crossover to use.
@ -63,9 +87,19 @@ impl FirCrossover {
    /// Make sure to add the latency reported by [`latency()`][Self::latency()] to the plugin's
    /// reported latency.
    pub fn new(mode: FirCrossoverType) -> Self {
+        let mut fft_planner = RealFftPlanner::new();
+
        Self {
            mode,
            band_filters: Default::default(),
+
+            input_buffers: [[0.0; FFT_INPUT_SIZE]; NUM_CHANNELS as usize],
+            band_output_buffers: [[[0.0; FFT_INPUT_SIZE]; NUM_CHANNELS as usize]; NUM_BANDS],
+            io_buffers_next_indices: [0; NUM_CHANNELS as usize],
+            r2c_plan: fft_planner.plan_fft_forward(FFT_SIZE),
+            c2r_plan: fft_planner.plan_fft_inverse(FFT_SIZE),
+            real_scratch_buffer: [0.0; FFT_SIZE],
+            complex_scratch_buffer: [Complex32::default(); FFT_SIZE / 2 + 1],
        }
    }

@ -74,43 +108,96 @@ impl FirCrossover {
        // Actually, that's a lie, since we currently only do linear-phase filters with a constant
        // size
        match self.mode {
-            FirCrossoverType::LinkwitzRiley24LinearPhase => (FILTER_SIZE / 2) as u32,
+            FirCrossoverType::LinkwitzRiley24LinearPhase => FFT_INPUT_SIZE as u32,
        }
    }

    /// Split the signal into bands using the crossovers previously configured through `.update()`.
-    /// The split bands will be written to `band_outputs`. `main_io` is not written to, and should
-    /// be cleared separately.
+    /// The split bands will be written to `band_outputs`. The main output should be cleared
+    /// separately. For efficiency's sake this processes an entire channel at once to minimize the
+    /// number of FFT operations needed. Since this process delays the signal by `FFT_INPUT_SIZE`
+    /// samples, the latency should be reported to the host.
    pub fn process(
        &mut self,
        num_bands: usize,
-        main_io: &ChannelSamples,
-        band_outputs: [ChannelSamples; NUM_BANDS],
+        main_input: &[f32],
+        mut band_outputs: [&mut &mut [f32]; NUM_BANDS],
+        channel_idx: usize,
    ) {
-        nih_debug_assert!(num_bands >= 2);
-        nih_debug_assert!(num_bands <= NUM_BANDS);
-        // Required for the SIMD, so we'll just do a hard assert or the unchecked conversions will
-        // be unsound
-        assert!(main_io.len() == 2);
+        nih_debug_assert!(main_input.len() == band_outputs[0].len());
+        nih_debug_assert!(channel_idx < NUM_CHANNELS as usize);

-        let samples: f32x2 = unsafe { main_io.to_simd_unchecked() };
-        match self.mode {
-            FirCrossoverType::LinkwitzRiley24LinearPhase => {
-                // TODO: Everything is structured to be fast to compute for the IIR filters. Instead
-                //       of doing two channels at the same time, it would probably be faster to use
-                //       SIMD for the actual convolution so we can do 4 or 8 multiply-adds at the
-                //       same time. Or perhaps a better way to spend the time, use FFT convolution
-                //       for this.
-                for (filter, mut output) in self
-                    .band_filters
+        // We'll copy already processed output to `band_outputs` while storing input for the next
+        // FFT operation. This is a modified version of what's going on in `StftHelper`.
+        let mut current_sample_idx = 0;
+        while current_sample_idx < main_input.len() {
+            {
+                // When this channel's index reaches `FFT_INPUT_SIZE`, the next block should be processed
+                let io_buffers_next_indices = self.io_buffers_next_indices[channel_idx];
+                let process_num_samples = (FFT_INPUT_SIZE - io_buffers_next_indices)
+                    .min(main_input.len() - current_sample_idx);
+
+                // Since we can't do this in-place (without unnecessarily duplicating a ton of data),
+                // copying data from and to the ring buffers can be done with simple memcpys
+                self.input_buffers[channel_idx]
+                    [io_buffers_next_indices..io_buffers_next_indices + process_num_samples]
+                    .copy_from_slice(
+                        &main_input[current_sample_idx..current_sample_idx + process_num_samples],
+                    );
+                for (band_output, band_output_buffers) in band_outputs
                    .iter_mut()
-                    .zip(band_outputs)
+                    .zip(self.band_output_buffers.iter())
                    .take(num_bands)
                {
-                    let filtered_samples = filter.process(samples);
-
-                    unsafe { output.from_simd_unchecked(filtered_samples) };
+                    band_output[current_sample_idx..current_sample_idx + process_num_samples]
+                        .copy_from_slice(
+                            &band_output_buffers[channel_idx][io_buffers_next_indices
+                                ..io_buffers_next_indices + process_num_samples],
+                        );
                }
+
+                // This is tracked per-channel because both channels are processed individually
+                self.io_buffers_next_indices[channel_idx] += process_num_samples;
+                current_sample_idx += process_num_samples;
+            }
+
+            // At this point we either reached the end of the buffer (`current_sample_idx ==
+            // main_input.len()`), or we filled up the `io_buffer` and we can process the next block
+            if self.io_buffers_next_indices[channel_idx] == FFT_INPUT_SIZE {
+                // Zero pad the input for the FFT
+                self.real_scratch_buffer[..FFT_INPUT_SIZE]
+                    .copy_from_slice(&self.input_buffers[channel_idx]);
+                self.real_scratch_buffer[FFT_INPUT_SIZE..].fill(0.0);
+
+                self.r2c_plan
+                    .process_with_scratch(
+                        &mut self.real_scratch_buffer,
+                        &mut self.complex_scratch_buffer,
+                        &mut [],
+                    )
+                    .unwrap();
+
+                // The input can then be used to produce each band's output. Since realfft expects
+                // to be able to modify the input, we need to make a copy of this first:
+                let input_fft = self.complex_scratch_buffer;
+
+                for (band_output_buffers, band_filter) in self
+                    .band_output_buffers
+                    .iter_mut()
+                    .zip(self.band_filters.iter_mut())
+                    .take(num_bands)
+                {
+                    band_filter.process(
+                        &input_fft,
+                        &mut band_output_buffers[channel_idx],
+                        channel_idx,
+                        &*self.c2r_plan,
+                        &mut self.real_scratch_buffer,
+                        &mut self.complex_scratch_buffer,
+                    )
+                }
+
+                self.io_buffers_next_indices[channel_idx] = 0;
            }
        }
    }
@ -150,11 +237,16 @@ impl FirCrossover {
                    FirCoefficients::design_fourth_order_linear_phase_low_pass_from_biquad(
                        iir_coefs,
                    );
-                self.band_filters[0].coefficients = lp_fir_coefs;
+                self.band_filters[0].recompute_coefficients(
+                    lp_fir_coefs.clone(),
+                    &*self.r2c_plan,
+                    &mut self.real_scratch_buffer,
+                    &mut self.complex_scratch_buffer,
+                );

                // For the band-pass filters and the final high-pass filter, we need to keep track
                // of the accumulated impulse response
-                let mut accumulated_ir = self.band_filters[0].coefficients.clone();
+                let mut accumulated_ir = lp_fir_coefs;
                for (split_frequency, band_filter) in frequencies
                    .iter()
                    .zip(self.band_filters.iter_mut())
@ -191,7 +283,12 @@ impl FirCrossover {
                        *accumulated_coef += *bp_coef;
                    }

-                    band_filter.coefficients = fir_bp_coefs;
+                    band_filter.recompute_coefficients(
+                        fir_bp_coefs,
+                        &*self.r2c_plan,
+                        &mut self.real_scratch_buffer,
+                        &mut self.complex_scratch_buffer,
+                    );
                }

                // And finally we can do a spectral inversion of the accumulated IR to the the last
@ -202,7 +299,12 @@ impl FirCrossover {
                }
                fir_hp_coefs.0[FILTER_SIZE / 2] += 1.0;

-                self.band_filters[num_bands - 1].coefficients = fir_hp_coefs;
+                self.band_filters[num_bands - 1].recompute_coefficients(
+                    fir_hp_coefs,
+                    &*self.r2c_plan,
+                    &mut self.real_scratch_buffer,
+                    &mut self.complex_scratch_buffer,
+                );
            }
        }
    }
@ -212,5 +314,16 @@ impl FirCrossover {
        for filter in &mut self.band_filters {
            filter.reset();
        }
+
+        // The inputs don't need to be reset as they'll be overwritten immediately
+        for band_buffers in &mut self.band_output_buffers {
+            for buffer in band_buffers {
+                buffer.fill(0.0);
+            }
+        }
+
+        // This being 0 means that the very first period will simply output the silence from above
+        // and gather input for the next FFT
+        self.io_buffers_next_indices.fill(0);
    }
 }
--- a/plugins/crossover/src/crossover/fir/filter.rs
+++ b/plugins/crossover/src/crossover/fir/filter.rs
@ -14,27 +14,50 @@
 // You should have received a copy of the GNU General Public License
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.

+use realfft::num_complex::Complex32;
+use realfft::{ComplexToReal, RealToComplex};
 use std::f32;
-use std::simd::{f32x2, StdFloat};

-use super::{FILTER_SIZE, RING_BUFFER_SIZE};
 use crate::crossover::iir::biquad::{Biquad, BiquadCoefficients};
+use crate::NUM_CHANNELS;
+
+/// We're doing FFT convolution here since otherwise there's no way to get decent low-frequency
+/// accuracy while still having acceptable performance. The input going into the STFT will be
+/// smaller since it will be padded with zeroes to compensate for the otherwise overlapping tail
+/// caused by the convolution.
+pub const FFT_SIZE: usize = 4096;
+/// The input chunk size the FFT convolution is processing. This is also the latency. By having this
+/// be exactly half of FFT_SIZE, we can make the overlap-add part of the FFT convolution a lot
+/// simpler for ourselves. (check the `StftHelper` struct in NIH-plug itself for an example that
+/// can handle arbitrary padding)
+pub const FFT_INPUT_SIZE: usize = FFT_SIZE / 2;
+/// The size of the FIR filter window, or the number of taps. Convolving `FFT_INPUT_SIZE` samples
+/// with this filter should fit exactly in `FFT_SIZE`, and it should be an odd number.
+pub const FILTER_SIZE: usize = FFT_SIZE - FFT_INPUT_SIZE + 1;

 /// A single FIR filter that may be configured in any way. In this plugin this will be a
-/// linear-phase low-pass, band-pass, or high-pass filter.
+/// linear-phase low-pass, band-pass, or high-pass filter. Implemented using FFT convolution. `git
+/// blame` this for a version that uses direct convolution.
+///
+/// `N_INPUT` is the size of the input that will be processed. The size of the FFT window becomes
+/// `N_INPUT * 2`. That makes handling the overlap easy, as each IDFT after multiplying the padded
+/// input and the padded impulse response FFTs will result in one `N_INPUT` period of output that can
+/// be taken as is, followed by one `N_INPUT` period of samples that need to be added to the next
+/// period's outputs as part of the overlap-add process.
 #[derive(Debug, Clone)]
-pub struct FirFilter {
-    /// The coefficients for this filter. The filters for both channels should be equivalent, this
-    /// just avoids broadcasts in the filter process.
-    pub coefficients: FirCoefficients<FILTER_SIZE>,
+pub struct FftFirFilter {
+    /// An `N_INPUT + 1` sized IR. Padded, run through the DFT, and then normalized by dividing by
+    /// `FFT_SIZE`.
+    padded_ir_fft: [Complex32; FFT_SIZE / 2 + 1],

-    /// A ring buffer storing the last `FILTER_SIZE - 1` samples. The capacity is `FILTER_SIZE`
-    /// rounded up to the next power of two.
-    delay_buffer: [f32x2; RING_BUFFER_SIZE],
-    /// The index in `delay_buffer` to write the next sample to. Wrapping negative indices back to
-    /// the end, the previous sample can be found at `delay_buffer[delay_buffer_next_idx - 1]`, the
-    /// one before that at `delay_buffer[delay_buffer_next_idx - 2]`, and so on.
-    delay_buffer_next_idx: usize,
+    /// The padding from the previous IDFT operation that needs to be added to the next output
+    /// buffer. After the IDFT process there will be an `FFT_SIZE` real scratch buffer containing
+    /// the output. At that point the first `FFT_INPUT_SIZE` samples of those will be copied to
+    /// `output_buffers` in the FIR crossover, `unapplied_padding_buffer` will be added to that
+    /// output buffer, and then finally the last `FFT_INPUT_SIZE` samples of the scratch buffer are
+    /// copied to `unapplied_padding_buffer`. This thus makes sure the tail gets delayed by another
+    /// period so that everything matches up.
+    unapplied_padding_buffers: [[f32; FFT_INPUT_SIZE]; NUM_CHANNELS as usize],
 }

 /// Coefficients for a (linear-phase) FIR filter. This struct includes ways to design the filter.
@ -43,12 +66,14 @@ pub struct FirFilter {
 #[derive(Debug, Clone)]
 pub struct FirCoefficients<const N: usize>(pub [f32; N]);

-impl Default for FirFilter {
+impl Default for FftFirFilter {
    fn default() -> Self {
        Self {
-            coefficients: FirCoefficients::default(),
-            delay_buffer: [f32x2::default(); RING_BUFFER_SIZE],
-            delay_buffer_next_idx: 0,
+            // Would be nicer to initialize this to an impulse response that actually had the
+            // correct position wrt the usual linear-phase latency, but this is fine since it should
+            // never be used anyways
+            padded_ir_fft: [Complex32::new(1.0 / FFT_SIZE as f32, 0.0); FFT_SIZE / 2 + 1],
+            unapplied_padding_buffers: [[0.0; FFT_INPUT_SIZE]; NUM_CHANNELS as usize],
        }
    }
 }
@ -64,53 +89,87 @@ impl<const N: usize> Default for FirCoefficients<N> {
    }
 }

-impl FirFilter {
-    /// Process left and right audio samples through the filter.
-    pub fn process(&mut self, samples: f32x2) -> f32x2 {
-        // TODO: Replace direct convolution with FFT convolution, would make the implementation much
-        //       more complex though because of the multi output part
-        let coefficients = &self.coefficients.0;
-        let mut result = f32x2::splat(coefficients[0]) * samples;
+impl FftFirFilter {
+    /// Filter `FFT_INPUT_SIZE` samples padded to `FFT_SIZE` through this filter, and write the
+    /// outputs to `output_samples` (belonging to channel `channel_idx`), at an `FFT_INPUT_SIZE`
+    /// delay. This is a bit weird and probably difficult to follow because as an optimization the
+    /// DFT is taken only once, and then the IDFT is taken once for every filtered band. This
+    /// function is thus called inside of the overlap-add loop to avoid duplicate work.
+    pub fn process(
+        &mut self,
+        input_fft: &[Complex32; FFT_SIZE / 2 + 1],
+        output_samples: &mut [f32; FFT_INPUT_SIZE],
+        output_channel_idx: usize,
+        c2r_plan: &dyn ComplexToReal<f32>,
+        real_scratch_buffer: &mut [f32; FFT_SIZE],
+        complex_scratch_buffer: &mut [Complex32; FFT_SIZE / 2 + 1],
+    ) {
+        // The padded input FFT has already been taken, so we only need to copy it to the scratch
+        // buffer (the input cannot change as the next band might need it as well).
+        complex_scratch_buffer.copy_from_slice(input_fft);

-        // Now multiply `self.coefficients[1..]` with the delay buffer starting at
-        // `self.delay_buffer_next_idx - 1`, wrapping around to the end when that is reached
-        // The end index is exclusive, and we already did the multiply+add for the first coefficient.
-        let before_wraparound_start_idx = self
-            .delay_buffer_next_idx
-            .saturating_sub(coefficients.len() - 1);
-        let before_wraparound_end_idx = self.delay_buffer_next_idx;
-        let num_before_wraparound = before_wraparound_end_idx - before_wraparound_start_idx;
-        for (coefficient, delayed_sample) in coefficients[1..1 + num_before_wraparound].iter().zip(
-            self.delay_buffer[before_wraparound_start_idx..before_wraparound_end_idx]
-                .iter()
-                .rev(),
-        ) {
-            // `result += coefficient * sample`, but with explicit FMA
-            result = f32x2::splat(*coefficient).mul_add(*delayed_sample, result);
+        // The FFT of the impulse response has already been normalized, so we just need to
+        // multiply the two buffers
+        for (output_bin, ir_bin) in complex_scratch_buffer
+            .iter_mut()
+            .zip(self.padded_ir_fft.iter())
+        {
+            *output_bin *= ir_bin;
        }
+        c2r_plan
+            .process_with_scratch(complex_scratch_buffer, real_scratch_buffer, &mut [])
+            .unwrap();

-        let after_wraparound_begin_idx =
-            self.delay_buffer.len() - (coefficients.len() - num_before_wraparound);
-        let after_wraparound_end_idx = self.delay_buffer.len();
-        for (coefficient, delayed_sample) in coefficients[1 + num_before_wraparound..].iter().zip(
-            self.delay_buffer[after_wraparound_begin_idx..after_wraparound_end_idx]
-                .iter()
-                .rev(),
-        ) {
-            result = f32x2::splat(*coefficient).mul_add(*delayed_sample, result);
+        // At this point the first `FFT_INPUT_SIZE` elements in `real_scratch_buffer`
+        // contain the output for the next period, while the last `FFT_INPUT_SIZE` elements
+        // contain output that needs to be added to the period after that. Since previous
+        // period also produced similar delayed output, we'll need to copy that to the
+        // results as well.
+        output_samples.copy_from_slice(&real_scratch_buffer[..FFT_INPUT_SIZE]);
+        for (output_sample, padding_sample) in output_samples
+            .iter_mut()
+            .zip(self.unapplied_padding_buffers[output_channel_idx].iter())
+        {
+            *output_sample += *padding_sample;
        }
+        self.unapplied_padding_buffers[output_channel_idx]
+            .copy_from_slice(&real_scratch_buffer[FFT_INPUT_SIZE..]);
+    }

-        // And finally write the samples to the delay buffer for the enxt sample
-        self.delay_buffer[self.delay_buffer_next_idx] = samples;
-        self.delay_buffer_next_idx = (self.delay_buffer_next_idx + 1) % self.delay_buffer.len();
+    /// Set the filter's coefficients based on raw FIR filter coefficients. These will be padded,
+    /// run through the DFT, and normalized.
+    pub fn recompute_coefficients(
+        &mut self,
+        coefficients: FirCoefficients<FILTER_SIZE>,
+        r2c_plan: &dyn RealToComplex<f32>,
+        real_scratch_buffer: &mut [f32; FFT_SIZE],
+        complex_scratch_buffer: &mut [Complex32; FFT_SIZE / 2 + 1],
+    ) {
+        // This needs to be padded with zeroes
+        real_scratch_buffer[..FILTER_SIZE].copy_from_slice(&coefficients.0);
+        real_scratch_buffer[FILTER_SIZE..].fill(0.0);

-        result
+        r2c_plan
+            .process_with_scratch(real_scratch_buffer, complex_scratch_buffer, &mut [])
+            .unwrap();
+
+        // The resulting buffer needs to be normalized and written to `self.padded_ir_fft`. That way
+        // we don't need to do anything but multiplying and writing the results back when
+        // processing.
+        let normalization_factor = 1.0 / FFT_SIZE as f32;
+        for (filter_bin, target_bin) in complex_scratch_buffer
+            .iter()
+            .zip(self.padded_ir_fft.iter_mut())
+        {
+            *target_bin = *filter_bin * normalization_factor;
+        }
    }

    /// Reset the internal filter state.
    pub fn reset(&mut self) {
-        self.delay_buffer.fill(f32x2::default());
-        self.delay_buffer_next_idx = 0;
+        for buffer in &mut self.unapplied_padding_buffers {
+            buffer.fill(0.0);
+        }
    }
 }

--- a/plugins/crossover/src/lib.rs
+++ b/plugins/crossover/src/lib.rs
@ -27,6 +27,9 @@ use std::sync::Arc;

 mod crossover;

+/// The number of channels this plugin supports. Hard capped at 2 for SIMD reasons.
+pub const NUM_CHANNELS: u32 = 2;
+
 /// The number of bands. Not used directly here, but this avoids hardcoding some constants in the
 /// crossover implementations.
 pub const NUM_BANDS: usize = 5;
@ -163,13 +166,13 @@ impl Plugin for Crossover {

    const VERSION: &'static str = "0.1.0";

-    const DEFAULT_NUM_INPUTS: u32 = 2;
-    const DEFAULT_NUM_OUTPUTS: u32 = 2;
+    const DEFAULT_NUM_INPUTS: u32 = NUM_CHANNELS;
+    const DEFAULT_NUM_OUTPUTS: u32 = NUM_CHANNELS;

    const DEFAULT_AUX_OUTPUTS: Option<AuxiliaryIOConfig> = Some(AuxiliaryIOConfig {
        // Two to five of these busses will be used at a time
        num_busses: 5,
-        num_channels: 2,
+        num_channels: NUM_CHANNELS,
    });

    const PORT_NAMES: PortNames = PortNames {
@ -186,9 +189,9 @@ impl Plugin for Crossover {

    fn accepts_bus_config(&self, config: &BusConfig) -> bool {
        // Only do stereo
-        config.num_input_channels == 2
-            && config.num_output_channels == 2
-            && config.aux_output_busses.num_channels == 2
+        config.num_input_channels == NUM_CHANNELS
+            && config.num_output_channels == NUM_CHANNELS
+            && config.aux_output_busses.num_channels == NUM_CHANNELS
    }

    fn initialize(
@ -232,18 +235,7 @@ impl Plugin for Crossover {
            CrossoverType::LinkwitzRiley24LinearPhase => {
                context.set_latency_samples(self.fir_crossover.latency());

-                todo!();
-                // Self::do_process(buffer, aux, |main_channel_samples, bands| {
-                //     if self.should_update_filters() {
-                //         self.update_filters(buffer.len() as u32);
-                //     }
-
-                //     self.fir_crossover.process(
-                //         self.params.num_bands.value as usize,
-                //         main_channel_samples,
-                //         bands,
-                //     );
-                // })
+                self.process_fir(buffer, aux);
            }
        }

@ -253,7 +245,7 @@ impl Plugin for Crossover {

 impl Crossover {
    /// Takes care of all of the boilerplate in zipping the outputs together to get a nice iterator
-    /// friendly and SIMD-able interface for the processing function. Prevents havign to branch per
+    /// friendly and SIMD-able interface for the processing function. Prevents having to branch per
    /// sample. The closure receives an input sample and it should write the output samples for each
    /// band to the array.
    fn process_iir(&mut self, buffer: &mut Buffer, aux: &mut AuxiliaryBuffers) {
@ -310,6 +302,45 @@ impl Crossover {
        }
    }

+    /// `process_iir()`, but for the linear-phase FIR crossovers. This processes an entire channel
+    /// at once instead of processing per-sample since we use FFT convolution.
+    fn process_fir(&mut self, buffer: &mut Buffer, aux: &mut AuxiliaryBuffers) {
+        // In theory we could do smoothing in between processed blocks, but this should be fine
+        if self.should_update_filters() {
+            self.update_filters(buffer.len() as u32);
+        }
+
+        let aux_outputs = &mut aux.outputs;
+        let (band_1_buffer, aux_outputs) = aux_outputs.split_first_mut().unwrap();
+        let (band_2_buffer, aux_outputs) = aux_outputs.split_first_mut().unwrap();
+        let (band_3_buffer, aux_outputs) = aux_outputs.split_first_mut().unwrap();
+        let (band_4_buffer, aux_outputs) = aux_outputs.split_first_mut().unwrap();
+        let (band_5_buffer, _) = aux_outputs.split_first_mut().unwrap();
+
+        // We can avoid a lot of hardcoding and conditionals by restoring the original array structure
+        for channel_idx in 0..buffer.channels() {
+            let main_io = &mut buffer.as_slice()[channel_idx];
+            let band_outputs = [
+                &mut band_1_buffer.as_slice()[channel_idx],
+                &mut band_2_buffer.as_slice()[channel_idx],
+                &mut band_3_buffer.as_slice()[channel_idx],
+                &mut band_4_buffer.as_slice()[channel_idx],
+                &mut band_5_buffer.as_slice()[channel_idx],
+            ];
+
+            self.fir_crossover.process(
+                self.params.num_bands.value as usize,
+                main_io,
+                band_outputs,
+                channel_idx,
+            );
+
+            // The main output should be silent as the signal is already evenly split over the other
+            // bands
+            main_io.fill(0.0);
+        }
+    }
+
    /// Returns whether the filters should be updated. There are different updating functions for
    /// the IIR and FIR crossovers.
    fn should_update_filters(&mut self) -> bool {