diff --git a/Cargo.lock b/Cargo.lock index f5a6476b..a12a0b89 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -711,6 +711,7 @@ name = "crossover" version = "0.1.0" dependencies = [ "nih_plug", + "realfft", ] [[package]] diff --git a/plugins/crossover/Cargo.toml b/plugins/crossover/Cargo.toml index b7efa382..77c46114 100644 --- a/plugins/crossover/Cargo.toml +++ b/plugins/crossover/Cargo.toml @@ -16,3 +16,4 @@ simd = ["nih_plug/simd"] [dependencies] nih_plug = { path = "../../", features = ["assert_process_allocs"] } +realfft = "3.0.0" diff --git a/plugins/crossover/src/crossover/fir.rs b/plugins/crossover/src/crossover/fir.rs index c55bcd10..4d903eaa 100644 --- a/plugins/crossover/src/crossover/fir.rs +++ b/plugins/crossover/src/crossover/fir.rs @@ -14,36 +14,60 @@ // You should have received a copy of the GNU General Public License // along with this program. If not, see . -use nih_plug::buffer::ChannelSamples; use nih_plug::debug::*; +use realfft::num_complex::Complex32; +use realfft::{ComplexToReal, RealFftPlanner, RealToComplex}; use std::f32; -use std::simd::f32x2; +use std::sync::Arc; -use self::filter::{FirCoefficients, FirFilter}; +use self::filter::{FftFirFilter, FirCoefficients, FFT_INPUT_SIZE, FFT_SIZE}; +use crate::crossover::fir::filter::FILTER_SIZE; use crate::crossover::iir::biquad::{BiquadCoefficients, NEUTRAL_Q}; -use crate::NUM_BANDS; +use crate::{NUM_BANDS, NUM_CHANNELS}; pub mod filter; -// TODO: Move this to FFT convolution so we can increase the filter size and improve low latency performance - -/// The size of the FIR filter window, or the number of taps. The low frequency performance is -/// greatly limited by this. -const FILTER_SIZE: usize = 121; -/// The size of the FIR filter's ring buffer. This is `FILTER_SIZE` rounded up to the next power of -/// two. -const RING_BUFFER_SIZE: usize = FILTER_SIZE.next_power_of_two(); - -#[derive(Debug)] pub struct FirCrossover { /// The kind of crossover to use. 
`.update_filters()` must be called after changing this. mode: FirCrossoverType, /// Filters for each of the bands. Depending on the number of bands argument passed to - /// `.process()` two to five of these may be used. The first one always contains a low-pass + /// `.process()`, two to five of these may be used. The first one always contains a low-pass /// filter, the last one always contains a high-pass filter, while the other bands will contain /// band-pass filters. - band_filters: [FirFilter; NUM_BANDS], + /// + /// These filters will be fed the FFT from the main input to produce output samples for the next + /// period. Everything could be a bit nicer to read if the filter did the entire STFT process, + /// but that would mean duplicating the input ring buffer and forward DFT up to five times. + band_filters: [FftFirFilter; NUM_BANDS], + + /// A ring buffer that is used to store inputs for the next FFT. Until it is time to take the + /// next FFT, samples are copied from the inputs to this buffer, while simultaneously copying + /// the already processed output samples from the output buffers to the output. Once + /// `io_buffers_next_indices` wrap back around to 0, the next buffer should be produced. + input_buffers: [[f32; FFT_INPUT_SIZE]; NUM_CHANNELS as usize], + /// A ring that contains the next period's outputs for each of the five bands. This is written + /// to and read from in lockstep with `input_buffers`. + band_output_buffers: [[[f32; FFT_INPUT_SIZE]; NUM_CHANNELS as usize]; NUM_BANDS], + /// The index in the inner `io_buffer` the next sample should be read from. After a sample is + /// written to the band's output then this is incremented by one. Once + /// `self.io_buffers_next_indices[channel_idx] == self.io_buffer.len()` then the next block + /// should be processed. + /// + /// This is stored as an array since each channel is processed individually. 
While this should + /// of course stay in sync, this makes it much simpler to process both channels in sequence. + io_buffers_next_indices: [usize; NUM_CHANNELS as usize], + + /// The algorithm for the FFT operation. + r2c_plan: Arc>, + /// The algorithm for the IFFT operation. + c2r_plan: Arc>, + + /// A real buffer that may be written to in place during the FFT and IFFT operations. + real_scratch_buffer: [f32; FFT_SIZE], + /// A complex buffer corresponding to `real_scratch_buffer` that may be written to in place + /// during the FFT and IFFT operations. + complex_scratch_buffer: [Complex32; FFT_SIZE / 2 + 1], } /// The type of FIR crossover to use. @@ -63,9 +87,19 @@ impl FirCrossover { /// Make sure to add the latency reported by [`latency()`][Self::latency()] to the plugin's /// reported latency. pub fn new(mode: FirCrossoverType) -> Self { + let mut fft_planner = RealFftPlanner::new(); + Self { mode, band_filters: Default::default(), + + input_buffers: [[0.0; FFT_INPUT_SIZE]; NUM_CHANNELS as usize], + band_output_buffers: [[[0.0; FFT_INPUT_SIZE]; NUM_CHANNELS as usize]; NUM_BANDS], + io_buffers_next_indices: [0; NUM_CHANNELS as usize], + r2c_plan: fft_planner.plan_fft_forward(FFT_SIZE), + c2r_plan: fft_planner.plan_fft_inverse(FFT_SIZE), + real_scratch_buffer: [0.0; FFT_SIZE], + complex_scratch_buffer: [Complex32::default(); FFT_SIZE / 2 + 1], } } @@ -74,43 +108,99 @@ impl FirCrossover { // Actually, that's a lie, since we currently only do linear-phase filters with a constant // size match self.mode { - FirCrossoverType::LinkwitzRiley24LinearPhase => (FILTER_SIZE / 2) as u32, + FirCrossoverType::LinkwitzRiley24LinearPhase => { + // The overlap-add buffering delay plus the linear-phase filter's own group delay + (FFT_INPUT_SIZE + (FILTER_SIZE / 2)) as u32 + } } } /// Split the signal into bands using the crossovers previously configured through `.update()`. - /// The split bands will be written to `band_outputs`. `main_io` is not written to, and should - /// be cleared separately. 
+ /// The split bands will be written to `band_outputs`. 
The main output should be cleared + /// separately. For efficiency's sake this processes an entire channel at once to minimize the + /// number of FFT operations needed. Since this process delays the signal by `FFT_INPUT_SIZE` + /// samples on top of the filters' own delay, the latency should be reported to the host. pub fn process( &mut self, num_bands: usize, - main_io: &ChannelSamples, - band_outputs: [ChannelSamples; NUM_BANDS], + main_input: &[f32], + mut band_outputs: [&mut &mut [f32]; NUM_BANDS], + channel_idx: usize, ) { - nih_debug_assert!(num_bands >= 2); - nih_debug_assert!(num_bands <= NUM_BANDS); - // Required for the SIMD, so we'll just do a hard assert or the unchecked conversions will - // be unsound - assert!(main_io.len() == 2); + nih_debug_assert!(main_input.len() == band_outputs[0].len()); + nih_debug_assert!(channel_idx < NUM_CHANNELS as usize); - let samples: f32x2 = unsafe { main_io.to_simd_unchecked() }; - match self.mode { - FirCrossoverType::LinkwitzRiley24LinearPhase => { - // TODO: Everything is structured to be fast to compute for the IIR filters. Instead - // of doing two channels at the same time, it would probably be faster to use - // SIMD for the actual convolution so we can do 4 or 8 multiply-adds at the - // same time. Or perhaps a better way to spend the time, use FFT convolution - // for this. - for (filter, mut output) in self - .band_filters + // We'll copy already processed output to `band_outputs` while storing input for the next + // FFT operation. This is a modified version of what's going on in `StftHelper`. 
+ let mut current_sample_idx = 0; + while current_sample_idx < main_input.len() { + { + // When `self.io_buffers_next_indices == FFT_INPUT_SIZE`, the next block should be processed + let io_buffers_next_indices = self.io_buffers_next_indices[channel_idx]; + let process_num_samples = (FFT_INPUT_SIZE - io_buffers_next_indices) + .min(main_input.len() - current_sample_idx); + + // Since we can't do this in-place (without unnecessarily duplicating a ton of data), + // copying data from and to the ring buffers can be done with simple memcpys + self.input_buffers[channel_idx] + [io_buffers_next_indices..io_buffers_next_indices + process_num_samples] + .copy_from_slice( + &main_input[current_sample_idx..current_sample_idx + process_num_samples], + ); + for (band_output, band_output_buffers) in band_outputs .iter_mut() - .zip(band_outputs) + .zip(self.band_output_buffers.iter()) .take(num_bands) { - let filtered_samples = filter.process(samples); - - unsafe { output.from_simd_unchecked(filtered_samples) }; + band_output[current_sample_idx..current_sample_idx + process_num_samples] + .copy_from_slice( + &band_output_buffers[channel_idx][io_buffers_next_indices + ..io_buffers_next_indices + process_num_samples], + ); } + + // This is tracked per-channel because both channels are processed individually + self.io_buffers_next_indices[channel_idx] += process_num_samples; + current_sample_idx += process_num_samples; + } + + // At this point we either reached the end of the buffer (`current_sample_idx == + // main_input.len()`), or we filled up the `io_buffer` and we can process the next block + if self.io_buffers_next_indices[channel_idx] == FFT_INPUT_SIZE { + // Zero pad the input for the FFT + self.real_scratch_buffer[..FFT_INPUT_SIZE] + .copy_from_slice(&self.input_buffers[channel_idx]); + self.real_scratch_buffer[FFT_INPUT_SIZE..].fill(0.0); + + self.r2c_plan + .process_with_scratch( + &mut self.real_scratch_buffer, + &mut self.complex_scratch_buffer, + &mut [], + ) + .unwrap(); + 
+ // The input can then be used to produce each band's output. Since realfft expects + // to be able to modify the input, we need to make a copy of this first: + let input_fft = self.complex_scratch_buffer; + + for (band_output_buffers, band_filter) in self + .band_output_buffers + .iter_mut() + .zip(self.band_filters.iter_mut()) + .take(num_bands) + { + band_filter.process( + &input_fft, + &mut band_output_buffers[channel_idx], + channel_idx, + &*self.c2r_plan, + &mut self.real_scratch_buffer, + &mut self.complex_scratch_buffer, + ) + } + + self.io_buffers_next_indices[channel_idx] = 0; } } } @@ -150,11 +240,16 @@ impl FirCrossover { FirCoefficients::design_fourth_order_linear_phase_low_pass_from_biquad( iir_coefs, ); - self.band_filters[0].coefficients = lp_fir_coefs; + self.band_filters[0].recompute_coefficients( + lp_fir_coefs.clone(), + &*self.r2c_plan, + &mut self.real_scratch_buffer, + &mut self.complex_scratch_buffer, + ); // For the band-pass filters and the final high-pass filter, we need to keep track // of the accumulated impulse response - let mut accumulated_ir = self.band_filters[0].coefficients.clone(); + let mut accumulated_ir = lp_fir_coefs; for (split_frequency, band_filter) in frequencies .iter() .zip(self.band_filters.iter_mut()) @@ -191,7 +286,12 @@ impl FirCrossover { *accumulated_coef += *bp_coef; } - band_filter.coefficients = fir_bp_coefs; + band_filter.recompute_coefficients( + fir_bp_coefs, + &*self.r2c_plan, + &mut self.real_scratch_buffer, + &mut self.complex_scratch_buffer, + ); } // And finally we can do a spectral inversion of the accumulated IR to the the last @@ -202,7 +302,12 @@ impl FirCrossover { } fir_hp_coefs.0[FILTER_SIZE / 2] += 1.0; - self.band_filters[num_bands - 1].coefficients = fir_hp_coefs; + self.band_filters[num_bands - 1].recompute_coefficients( + fir_hp_coefs, + &*self.r2c_plan, + &mut self.real_scratch_buffer, + &mut self.complex_scratch_buffer, + ); } } } @@ -212,5 +317,16 @@ impl FirCrossover { for filter in 
&mut self.band_filters { filter.reset(); } + + // The inputs don't need to be reset as they'll be overwritten immediately + for band_buffers in &mut self.band_output_buffers { + for buffer in band_buffers { + buffer.fill(0.0); + } + } + + // This being 0 means that the very first period will simply output the silence from above + // and gather input for the next FFT + self.io_buffers_next_indices.fill(0); } } diff --git a/plugins/crossover/src/crossover/fir/filter.rs b/plugins/crossover/src/crossover/fir/filter.rs index ae99c04d..d1e0626f 100644 --- a/plugins/crossover/src/crossover/fir/filter.rs +++ b/plugins/crossover/src/crossover/fir/filter.rs @@ -14,27 +14,50 @@ // You should have received a copy of the GNU General Public License // along with this program. If not, see . +use realfft::num_complex::Complex32; +use realfft::{ComplexToReal, RealToComplex}; use std::f32; -use std::simd::{f32x2, StdFloat}; -use super::{FILTER_SIZE, RING_BUFFER_SIZE}; use crate::crossover::iir::biquad::{Biquad, BiquadCoefficients}; +use crate::NUM_CHANNELS; + +/// We're doing FFT convolution here since otherwise there's no way to get decent low-frequency +/// accuracy while still having acceptable performance. The input going into the STFT will be +/// smaller since it will be padded with zeroes to compensate for the otherwise overlapping tail +/// caused by the convolution. +pub const FFT_SIZE: usize = 4096; +/// The input chunk size the FFT convolution is processing. This is also the latency. By having this +/// be exactly half of FFT_SIZE, we can make the overlap-add part of the FFT convolution a lot +/// simpler for ourselves. (check the `StftHelper` struct in NIH-plug itself for an example that +/// can handle arbitrary padding) +pub const FFT_INPUT_SIZE: usize = FFT_SIZE / 2; +/// The size of the FIR filter window, or the number of taps. Convolving `FFT_INPUT_SIZE` samples +/// with this filter should fit exactly in `FFT_SIZE`, and it should be an odd number. 
+pub const FILTER_SIZE: usize = FFT_SIZE - FFT_INPUT_SIZE + 1; /// A single FIR filter that may be configured in any way. In this plugin this will be a -/// linear-phase low-pass, band-pass, or high-pass filter. +/// linear-phase low-pass, band-pass, or high-pass filter. Implemented using FFT convolution. `git +/// blame` this for a version that uses direct convolution. +/// +/// `N_INPUT` is the size of the input that will be processed. The size of the FFT window becomes +/// `N_INPUT * 2`. That makes handling the overlap easy, as each IDFT after multiplying the padded +/// input and the padded impulse response FFTs will result in one `N_INPUT` period of output that can +/// be taken as is, followed by one `N_INPUT` period of samples that need to be added to the next +/// period's outputs as part of the overlap-add process. #[derive(Debug, Clone)] -pub struct FirFilter { - /// The coefficients for this filter. The filters for both channels should be equivalent, this - /// just avoids broadcasts in the filter process. - pub coefficients: FirCoefficients, +pub struct FftFirFilter { + /// An `N_INPUT + 1` sized IR. Padded, run through the DFT, and then normalized by dividing by + /// `FFT_SIZE`. + padded_ir_fft: [Complex32; FFT_SIZE / 2 + 1], - /// A ring buffer storing the last `FILTER_SIZE - 1` samples. The capacity is `FILTER_SIZE` - /// rounded up to the next power of two. - delay_buffer: [f32x2; RING_BUFFER_SIZE], - /// The index in `delay_buffer` to write the next sample to. Wrapping negative indices back to - /// the end, the previous sample can be found at `delay_buffer[delay_buffer_next_idx - 1]`, the - /// one before that at `delay_buffer[delay_buffer_next_idx - 2]`, and so on. - delay_buffer_next_idx: usize, + /// The padding from the previous IDFT operation that needs to be added to the next output + /// buffer. After the IDFT process there will be an `FFT_SIZE` real scratch buffer containing + /// the output. 
At that point the first `FFT_INPUT_SIZE` samples of those will be copied to + /// `output_buffers` in the FIR crossover, `unapplied_padding_buffer` will be added to that + /// output buffer, and then finally the last `FFT_INPUT_SIZE` samples of the scratch buffer are + /// copied to `unapplied_padding_buffer`. This thus makes sure the tail gets delayed by another + /// period so that everything matches up. + unapplied_padding_buffers: [[f32; FFT_INPUT_SIZE]; NUM_CHANNELS as usize], } /// Coefficients for a (linear-phase) FIR filter. This struct includes ways to design the filter. @@ -43,12 +66,14 @@ pub struct FirFilter { #[derive(Debug, Clone)] pub struct FirCoefficients(pub [f32; N]); -impl Default for FirFilter { +impl Default for FftFirFilter { fn default() -> Self { Self { - coefficients: FirCoefficients::default(), - delay_buffer: [f32x2::default(); RING_BUFFER_SIZE], - delay_buffer_next_idx: 0, + // Would be nicer to initialize this to an impulse response that actually had the + // correct position wrt the usual linear-phase latency, but this is fine since it should + // never be used anyways + padded_ir_fft: [Complex32::new(1.0 / FFT_SIZE as f32, 0.0); FFT_SIZE / 2 + 1], + unapplied_padding_buffers: [[0.0; FFT_INPUT_SIZE]; NUM_CHANNELS as usize], } } } @@ -64,53 +89,87 @@ impl Default for FirCoefficients { } } -impl FirFilter { - /// Process left and right audio samples through the filter. - pub fn process(&mut self, samples: f32x2) -> f32x2 { - // TODO: Replace direct convolution with FFT convolution, would make the implementation much - // more complex though because of the multi output part - let coefficients = &self.coefficients.0; - let mut result = f32x2::splat(coefficients[0]) * samples; +impl FftFirFilter { + /// Filter `FFT_INPUT_SIZE` samples padded to `FFT_SIZE` through this filter, and write the + /// outputs to `output_samples` (belonging to channel `channel_idx`), at an `FFT_INPUT_SIZE` + /// delay. 
This is a bit weird and probably difficult to follow because as an optimization the + /// DFT is taken only once, and then the IDFT is taken once for every filtered band. This + /// function is thus called inside of the overlap-add loop to avoid duplicate work. + pub fn process( + &mut self, + input_fft: &[Complex32; FFT_SIZE / 2 + 1], + output_samples: &mut [f32; FFT_INPUT_SIZE], + output_channel_idx: usize, + c2r_plan: &dyn ComplexToReal, + real_scratch_buffer: &mut [f32; FFT_SIZE], + complex_scratch_buffer: &mut [Complex32; FFT_SIZE / 2 + 1], + ) { + // The padded input FFT has already been taken, so we only need to copy it to the scratch + // buffer (the input cannot change as the next band might need it as well). + complex_scratch_buffer.copy_from_slice(input_fft); - // Now multiply `self.coefficients[1..]` with the delay buffer starting at - // `self.delay_buffer_next_idx - 1`, wrapping around to the end when that is reached - // The end index is exclusive, and we already did the multiply+add for the first coefficient. 
- let before_wraparound_start_idx = self - .delay_buffer_next_idx - .saturating_sub(coefficients.len() - 1); - let before_wraparound_end_idx = self.delay_buffer_next_idx; - let num_before_wraparound = before_wraparound_end_idx - before_wraparound_start_idx; - for (coefficient, delayed_sample) in coefficients[1..1 + num_before_wraparound].iter().zip( - self.delay_buffer[before_wraparound_start_idx..before_wraparound_end_idx] - .iter() - .rev(), - ) { - // `result += coefficient * sample`, but with explicit FMA - result = f32x2::splat(*coefficient).mul_add(*delayed_sample, result); + // The FFT of the impulse response has already been normalized, so we just need to + // multiply the two buffers + for (output_bin, ir_bin) in complex_scratch_buffer + .iter_mut() + .zip(self.padded_ir_fft.iter()) + { + *output_bin *= ir_bin; } + c2r_plan + .process_with_scratch(complex_scratch_buffer, real_scratch_buffer, &mut []) + .unwrap(); - let after_wraparound_begin_idx = - self.delay_buffer.len() - (coefficients.len() - num_before_wraparound); - let after_wraparound_end_idx = self.delay_buffer.len(); - for (coefficient, delayed_sample) in coefficients[1 + num_before_wraparound..].iter().zip( - self.delay_buffer[after_wraparound_begin_idx..after_wraparound_end_idx] - .iter() - .rev(), - ) { - result = f32x2::splat(*coefficient).mul_add(*delayed_sample, result); + // At this point the first `FFT_INPUT_SIZE` elements in `real_scratch_buffer` + // contain the output for the next period, while the last `FFT_INPUT_SIZE` elements + // contain output that needs to be added to the period after that. Since previous + // period also produced similar delayed output, we'll need to copy that to the + // results as well. 
+ output_samples.copy_from_slice(&real_scratch_buffer[..FFT_INPUT_SIZE]); + for (output_sample, padding_sample) in output_samples + .iter_mut() + .zip(self.unapplied_padding_buffers[output_channel_idx].iter()) + { + *output_sample += *padding_sample; } + self.unapplied_padding_buffers[output_channel_idx] + .copy_from_slice(&real_scratch_buffer[FFT_INPUT_SIZE..]); + } - // And finally write the samples to the delay buffer for the enxt sample - self.delay_buffer[self.delay_buffer_next_idx] = samples; - self.delay_buffer_next_idx = (self.delay_buffer_next_idx + 1) % self.delay_buffer.len(); + /// Set the filter's coefficients based on raw FIR filter coefficients. These will be padded, + /// run through the DFT, and normalized. + pub fn recompute_coefficients( + &mut self, + coefficients: FirCoefficients, + r2c_plan: &dyn RealToComplex, + real_scratch_buffer: &mut [f32; FFT_SIZE], + complex_scratch_buffer: &mut [Complex32; FFT_SIZE / 2 + 1], + ) { + // This needs to be padded with zeroes + real_scratch_buffer[..FILTER_SIZE].copy_from_slice(&coefficients.0); + real_scratch_buffer[FILTER_SIZE..].fill(0.0); - result + r2c_plan + .process_with_scratch(real_scratch_buffer, complex_scratch_buffer, &mut []) + .unwrap(); + + // The resulting buffer needs to be normalized and written to `self.padded_ir_fft`. That way + // we don't need to do anything but multiplying and writing the results back when + // processing. + let normalization_factor = 1.0 / FFT_SIZE as f32; + for (filter_bin, target_bin) in complex_scratch_buffer + .iter() + .zip(self.padded_ir_fft.iter_mut()) + { + *target_bin = *filter_bin * normalization_factor; + } } /// Reset the internal filter state. 
pub fn reset(&mut self) { - self.delay_buffer.fill(f32x2::default()); - self.delay_buffer_next_idx = 0; + for buffer in &mut self.unapplied_padding_buffers { + buffer.fill(0.0); + } } } diff --git a/plugins/crossover/src/lib.rs b/plugins/crossover/src/lib.rs index d18a22fb..9174a683 100644 --- a/plugins/crossover/src/lib.rs +++ b/plugins/crossover/src/lib.rs @@ -27,6 +27,9 @@ use std::sync::Arc; mod crossover; +/// The number of channels this plugin supports. Hard capped at 2 for SIMD reasons. +pub const NUM_CHANNELS: u32 = 2; + /// The number of bands. Not used directly here, but this avoids hardcoding some constants in the /// crossover implementations. pub const NUM_BANDS: usize = 5; @@ -163,13 +166,13 @@ impl Plugin for Crossover { const VERSION: &'static str = "0.1.0"; - const DEFAULT_NUM_INPUTS: u32 = 2; - const DEFAULT_NUM_OUTPUTS: u32 = 2; + const DEFAULT_NUM_INPUTS: u32 = NUM_CHANNELS; + const DEFAULT_NUM_OUTPUTS: u32 = NUM_CHANNELS; const DEFAULT_AUX_OUTPUTS: Option = Some(AuxiliaryIOConfig { // Two to five of these busses will be used at a time num_busses: 5, - num_channels: 2, + num_channels: NUM_CHANNELS, }); const PORT_NAMES: PortNames = PortNames { @@ -186,9 +189,9 @@ impl Plugin for Crossover { fn accepts_bus_config(&self, config: &BusConfig) -> bool { // Only do stereo - config.num_input_channels == 2 - && config.num_output_channels == 2 - && config.aux_output_busses.num_channels == 2 + config.num_input_channels == NUM_CHANNELS + && config.num_output_channels == NUM_CHANNELS + && config.aux_output_busses.num_channels == NUM_CHANNELS } fn initialize( @@ -232,18 +235,7 @@ impl Plugin for Crossover { CrossoverType::LinkwitzRiley24LinearPhase => { context.set_latency_samples(self.fir_crossover.latency()); - todo!(); - // Self::do_process(buffer, aux, |main_channel_samples, bands| { - // if self.should_update_filters() { - // self.update_filters(buffer.len() as u32); - // } - - // self.fir_crossover.process( - // self.params.num_bands.value as usize, - 
// main_channel_samples, - // bands, - // ); - // }) + self.process_fir(buffer, aux); } } @@ -253,7 +245,7 @@ impl Plugin for Crossover { impl Crossover { /// Takes care of all of the boilerplate in zipping the outputs together to get a nice iterator - /// friendly and SIMD-able interface for the processing function. Prevents havign to branch per + /// friendly and SIMD-able interface for the processing function. Prevents having to branch per /// sample. The closure receives an input sample and it should write the output samples for each /// band to the array. fn process_iir(&mut self, buffer: &mut Buffer, aux: &mut AuxiliaryBuffers) { @@ -310,6 +302,45 @@ impl Crossover { } } + /// `process_iir()`, but for the linear-phase FIR crossovers. This processes an entire channel + /// at once instead of processing per-sample since we use FFT convolution. + fn process_fir(&mut self, buffer: &mut Buffer, aux: &mut AuxiliaryBuffers) { + // In theory we could do smoothing in between processed blocks, but this should be fine + if self.should_update_filters() { + self.update_filters(buffer.len() as u32); + } + + let aux_outputs = &mut aux.outputs; + let (band_1_buffer, aux_outputs) = aux_outputs.split_first_mut().unwrap(); + let (band_2_buffer, aux_outputs) = aux_outputs.split_first_mut().unwrap(); + let (band_3_buffer, aux_outputs) = aux_outputs.split_first_mut().unwrap(); + let (band_4_buffer, aux_outputs) = aux_outputs.split_first_mut().unwrap(); + let (band_5_buffer, _) = aux_outputs.split_first_mut().unwrap(); + + // We can avoid a lot of hardcoding and conditionals by restoring the original array structure + for channel_idx in 0..buffer.channels() { + let main_io = &mut buffer.as_slice()[channel_idx]; + let band_outputs = [ + &mut band_1_buffer.as_slice()[channel_idx], + &mut band_2_buffer.as_slice()[channel_idx], + &mut band_3_buffer.as_slice()[channel_idx], + &mut band_4_buffer.as_slice()[channel_idx], + &mut band_5_buffer.as_slice()[channel_idx], + ]; + 
self.fir_crossover.process( + self.params.num_bands.value as usize, + main_io, + band_outputs, + channel_idx, + ); + + // The main output should be silent as the signal is already evenly split over the other + // bands + main_io.fill(0.0); + } + } + /// Returns whether the filters should be updated. There are different updating functions for /// the IIR and FIR crossovers. fn should_update_filters(&mut self) -> bool {