mirror of
https://github.com/italicsjenga/agb.git
synced 2024-12-23 08:11:33 +11:00
Even faster mixer (#447)
If you use a single ldmia to load lots of samples at once, the mixer uses significantly less CPU (about 10% less than before: 19800 cycles per frame -> 17701 cycles per frame at 32768Hz). I've also added a really simple unit test for the `collapse` function to at least gain some confidence in it. - [x] Changelog updated / no changelog update needed
This commit is contained in:
commit
c3268e3de2
|
@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||||
- Changed the default template game.
|
- Changed the default template game.
|
||||||
- `DynamicSprite` has a new API which changes the constructor and adds a `set_pixel` method.
|
- `DynamicSprite` has a new API which changes the constructor and adds a `set_pixel` method.
|
||||||
- You no longer need to install arm-none-eabi-binutils. In order to write games using `agb`, you now only need to install rust nightly.
|
- You no longer need to install arm-none-eabi-binutils. In order to write games using `agb`, you now only need to install rust nightly.
|
||||||
|
- 10% performance improvement with the software mixer.
|
||||||
|
|
||||||
## [0.15.0] - 2023/04/25
|
## [0.15.0] - 2023/04/25
|
||||||
|
|
||||||
|
|
|
@ -98,18 +98,15 @@ agb_arm_func agb_rs__mixer_add_stereo
|
||||||
@ r2 - volume to play the sound at
|
@ r2 - volume to play the sound at
|
||||||
@
|
@
|
||||||
@ The sound buffer must be SOUND_BUFFER_SIZE * 2 in size = 176 * 2
|
@ The sound buffer must be SOUND_BUFFER_SIZE * 2 in size = 176 * 2
|
||||||
push {{r4-r9}}
|
push {{r4-r11}}
|
||||||
|
|
||||||
mov r9, r2
|
|
||||||
ldr r5, =0x00000FFF
|
ldr r5, =0x00000FFF
|
||||||
|
|
||||||
ldr r8, =agb_rs__buffer_size
|
ldr r8, =agb_rs__buffer_size
|
||||||
ldr r8, [r8]
|
ldr r8, [r8]
|
||||||
1:
|
|
||||||
.rept 4
|
|
||||||
ldrsh r6, [r0], #2 @ load the current sound sample to r6
|
|
||||||
|
|
||||||
ldr r4, [r1] @ read the current value
|
.macro add_stereo_sample sample_reg:req
|
||||||
|
ldrsh r6, [r0], #2 @ load the current sound sample to r6
|
||||||
|
|
||||||
@ This is slightly convoluted, but is mainly done for performance reasons. It is better
|
@ This is slightly convoluted, but is mainly done for performance reasons. It is better
|
||||||
@ to hit ROM just once and then do 3 really simple instructions than do 2 ldrsbs however annoying
|
@ to hit ROM just once and then do 3 really simple instructions than do 2 ldrsbs however annoying
|
||||||
|
@ -130,15 +127,23 @@ agb_arm_func agb_rs__mixer_add_stereo
|
||||||
lsl r6, r6, #24 @ r6 = | R | 0 | 0 | 0 | drop everything except the right sample
|
lsl r6, r6, #24 @ r6 = | R | 0 | 0 | 0 | drop everything except the right sample
|
||||||
orr r6, r7, r6, asr #8 @ r6 = | 1 | R | 1 | L | now we have it perfectly set up
|
orr r6, r7, r6, asr #8 @ r6 = | 1 | R | 1 | L | now we have it perfectly set up
|
||||||
|
|
||||||
mla r4, r6, r9, r4 @ r4 += r6 * r9 (calculating both the left and right samples together)
|
mla \sample_reg, r6, r2, \sample_reg @ \sample_reg += r6 * r2 (calculating both the left and right samples together)
|
||||||
|
.endm
|
||||||
|
|
||||||
str r4, [r1], #4 @ store the new value, and increment the pointer
|
1:
|
||||||
.endr
|
ldmia r1, {{r9-r12}} @ read the current values
|
||||||
|
|
||||||
|
add_stereo_sample r9
|
||||||
|
add_stereo_sample r10
|
||||||
|
add_stereo_sample r11
|
||||||
|
add_stereo_sample r12
|
||||||
|
|
||||||
|
stmia r1!, {{r9-r12}} @ store the new values, and increment the pointer
|
||||||
|
|
||||||
subs r8, r8, #4 @ loop counter
|
subs r8, r8, #4 @ loop counter
|
||||||
bne 1b @ jump back if we're not done with the loop
|
bne 1b @ jump back if we're not done with the loop
|
||||||
|
|
||||||
pop {{r4-r9}}
|
pop {{r4-r11}}
|
||||||
bx lr
|
bx lr
|
||||||
|
|
||||||
agb_arm_end agb_rs__mixer_add_stereo
|
agb_arm_end agb_rs__mixer_add_stereo
|
||||||
|
@ -147,22 +152,19 @@ agb_arm_func agb_rs__mixer_collapse
|
||||||
@ Arguments:
|
@ Arguments:
|
||||||
@ r0 = target buffer (i8)
|
@ r0 = target buffer (i8)
|
||||||
@ r1 = input buffer (i16) of fixnums with 4 bits of precision (read in sets of i16 in an i32)
|
@ r1 = input buffer (i16) of fixnums with 4 bits of precision (read in sets of i16 in an i32)
|
||||||
|
@ r2 = loop counter
|
||||||
|
|
||||||
push {{r4-r11}}
|
push {{r4-r11,lr}}
|
||||||
|
|
||||||
CONST_0 .req r7
|
CONST_0 .req r7
|
||||||
CONST_FF .req r8
|
CONST_128 .req r8
|
||||||
CONST_127 .req r9
|
|
||||||
TEMP .req r10
|
TEMP .req r10
|
||||||
SWAP_SIGN .req r11
|
SWAP_SIGN .req r11
|
||||||
|
|
||||||
ldr CONST_0, =0
|
ldr CONST_0, =0
|
||||||
ldr CONST_FF, =0xff
|
ldr CONST_128, =128
|
||||||
ldr CONST_127, =127
|
|
||||||
ldr SWAP_SIGN, =0x80808080
|
ldr SWAP_SIGN, =0x80808080
|
||||||
|
|
||||||
ldr r2, =agb_rs__buffer_size @ loop counter
|
|
||||||
ldr r2, [r2]
|
|
||||||
mov r4, r2
|
mov r4, r2
|
||||||
|
|
||||||
@ The idea for this solution came from pimpmobile:
|
@ The idea for this solution came from pimpmobile:
|
||||||
|
@ -171,7 +173,7 @@ SWAP_SIGN .req r11
|
||||||
@ The register should be 127 bigger than what you actually want, and we'll correct for that later. Hence the
|
@ The register should be 127 bigger than what you actually want, and we'll correct for that later. Hence the
|
||||||
@ add instructions in `load_sample`.
|
@ add instructions in `load_sample`.
|
||||||
@
|
@
|
||||||
@ The idea behind this is in the bit patterns of -128 and 127 which are 10000000 and 01111111 respectively,
|
@ The idea behind this is in the bit patterns of -128 and 127 which are 10000000 and 01111111 respectively (note -x = !x + 1 => !x = -x-1),
|
||||||
@ and we want to clamp the value between them.
|
@ and we want to clamp the value between them.
|
||||||
@
|
@
|
||||||
@ The first instruction calculates `-((sample + 128) >> 8)`. If sample is between -128 and 127, then
|
@ The first instruction calculates `-((sample + 128) >> 8)`. If sample is between -128 and 127, then
|
||||||
|
@ -185,16 +187,13 @@ SWAP_SIGN .req r11
|
||||||
@ So (-1 logical >> 24) gives 11111111 and (1 logical >> 24) gives 00000000 so register is clamped between these two values.
|
@ So (-1 logical >> 24) gives 11111111 and (1 logical >> 24) gives 00000000 so register is clamped between these two values.
|
||||||
.macro clamp_s8 reg:req
|
.macro clamp_s8 reg:req
|
||||||
subs TEMP, CONST_0, \reg, asr #8
|
subs TEMP, CONST_0, \reg, asr #8
|
||||||
andne \reg, CONST_FF, TEMP, lsr #24
|
movne \reg, TEMP, lsr #24
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro load_sample left_reg:req right_reg:req
|
.macro load_sample left_reg:req right_reg:req
|
||||||
@ left_reg = *r1; r1++
|
|
||||||
ldr \left_reg, [r1], #4
|
|
||||||
|
|
||||||
mov \right_reg, \left_reg, lsl #16 @ push the sample 16 bits first
|
mov \right_reg, \left_reg, lsl #16 @ push the sample 16 bits first
|
||||||
add \right_reg, CONST_127, \right_reg, asr #20 @ move right sample back to being the correct value
|
add \right_reg, CONST_128, \right_reg, asr #20 @ move right sample back to being the correct value
|
||||||
add \left_reg, CONST_127, \left_reg, asr #20 @ now we only have the left sample
|
add \left_reg, CONST_128, \left_reg, asr #20 @ now we only have the left sample
|
||||||
|
|
||||||
clamp_s8 \left_reg @ clamp the audio to 8 bit values
|
clamp_s8 \left_reg @ clamp the audio to 8 bit values
|
||||||
clamp_s8 \right_reg
|
clamp_s8 \right_reg
|
||||||
|
@ -202,19 +201,21 @@ SWAP_SIGN .req r11
|
||||||
|
|
||||||
1:
|
1:
|
||||||
.rept 4
|
.rept 4
|
||||||
|
ldmia r1!, {{r3,r5,r6,r9}}
|
||||||
|
|
||||||
load_sample r3, r12
|
load_sample r3, r12
|
||||||
|
|
||||||
load_sample r5, r6
|
load_sample r5, lr
|
||||||
orr r3, r3, r5, lsl #8
|
orr r3, r3, r5, lsl #8
|
||||||
orr r12, r12, r6, lsl #8
|
orr r12, r12, lr, lsl #8
|
||||||
|
|
||||||
load_sample r5, r6
|
load_sample r6, lr
|
||||||
orr r3, r3, r5, lsl #16
|
orr r3, r3, r6, lsl #16
|
||||||
orr r12, r12, r6, lsl #16
|
orr r12, r12, lr, lsl #16
|
||||||
|
|
||||||
load_sample r5, r6
|
load_sample r9, lr
|
||||||
orr r3, r3, r5, lsl #24
|
orr r3, r3, r9, lsl #24
|
||||||
orr r12, r12, r6, lsl #24
|
orr r12, r12, lr, lsl #24
|
||||||
|
|
||||||
eor r3, r3, SWAP_SIGN
|
eor r3, r3, SWAP_SIGN
|
||||||
eor r12, r12, SWAP_SIGN
|
eor r12, r12, SWAP_SIGN
|
||||||
|
@ -226,6 +227,6 @@ SWAP_SIGN .req r11
|
||||||
subs r2, r2, #16 @ r2 -= 16
|
subs r2, r2, #16 @ r2 -= 16
|
||||||
bne 1b @ loop if not 0
|
bne 1b @ loop if not 0
|
||||||
|
|
||||||
pop {{r4-r11}}
|
pop {{r4-r11,lr}}
|
||||||
bx lr
|
bx lr
|
||||||
agb_arm_end agb_rs__mixer_collapse
|
agb_arm_end agb_rs__mixer_collapse
|
||||||
|
|
|
@ -35,7 +35,11 @@ extern "C" {
|
||||||
volume: Num<i16, 4>,
|
volume: Num<i16, 4>,
|
||||||
);
|
);
|
||||||
|
|
||||||
fn agb_rs__mixer_collapse(sound_buffer: *mut i8, input_buffer: *const Num<i16, 4>);
|
fn agb_rs__mixer_collapse(
|
||||||
|
sound_buffer: *mut i8,
|
||||||
|
input_buffer: *const Num<i16, 4>,
|
||||||
|
num_samples: usize,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The main software mixer struct.
|
/// The main software mixer struct.
|
||||||
|
@ -458,7 +462,85 @@ impl MixerBuffer {
|
||||||
let write_buffer = free(|cs| self.state.borrow(cs).borrow_mut().active_advanced());
|
let write_buffer = free(|cs| self.state.borrow(cs).borrow_mut().active_advanced());
|
||||||
|
|
||||||
unsafe {
|
unsafe {
|
||||||
agb_rs__mixer_collapse(write_buffer, working_buffer.as_ptr());
|
agb_rs__mixer_collapse(
|
||||||
|
write_buffer,
|
||||||
|
working_buffer.as_ptr(),
|
||||||
|
self.frequency.buffer_size(),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use crate::fixnum::num;
|
||||||
|
use alloc::vec;
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test_case]
|
||||||
|
fn collapse_should_correctly_reduce_size_of_input(_: &mut crate::Gba) {
|
||||||
|
#[repr(align(4))]
|
||||||
|
struct AlignedNumbers<const N: usize>([Num<i16, 4>; N]);
|
||||||
|
|
||||||
|
let input = &AlignedNumbers([
|
||||||
|
num!(10.0),
|
||||||
|
num!(10.0),
|
||||||
|
num!(5.0),
|
||||||
|
num!(5.0),
|
||||||
|
num!(-10.0),
|
||||||
|
num!(-10.5),
|
||||||
|
num!(-5.9),
|
||||||
|
num!(-5.2),
|
||||||
|
num!(0.0),
|
||||||
|
num!(1.1),
|
||||||
|
num!(2.2),
|
||||||
|
num!(3.3),
|
||||||
|
num!(155.4),
|
||||||
|
num!(-230.5),
|
||||||
|
num!(400.6),
|
||||||
|
num!(-700.7),
|
||||||
|
num!(10.0),
|
||||||
|
num!(10.0),
|
||||||
|
num!(5.0),
|
||||||
|
num!(5.0),
|
||||||
|
num!(-10.0),
|
||||||
|
num!(-10.5),
|
||||||
|
num!(-5.9),
|
||||||
|
num!(-5.2),
|
||||||
|
num!(0.0),
|
||||||
|
num!(1.1),
|
||||||
|
num!(2.2),
|
||||||
|
num!(3.3),
|
||||||
|
num!(155.4),
|
||||||
|
num!(-230.5),
|
||||||
|
num!(400.6),
|
||||||
|
num!(-700.7),
|
||||||
|
]);
|
||||||
|
|
||||||
|
let input = &input.0;
|
||||||
|
|
||||||
|
let mut output_buffer = vec![0i32; input.len() / 4];
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
agb_rs__mixer_collapse(
|
||||||
|
output_buffer.as_mut_ptr().cast(),
|
||||||
|
input.as_ptr(),
|
||||||
|
input.len() / 2,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// output will be unzipped, so input is LRLRLRLRLRLRLR... and output is LLLLLLRRRRRR
|
||||||
|
assert_eq!(
|
||||||
|
output_buffer
|
||||||
|
.iter()
|
||||||
|
.flat_map(|x| x.to_le_bytes())
|
||||||
|
.map(|x| x as i8)
|
||||||
|
.collect::<alloc::vec::Vec<_>>(),
|
||||||
|
&[
|
||||||
|
10, 5, -10, -6, 0, 2, 127, 127, 10, 5, -10, -6, 0, 2, 127, 127, 10, 5, -11, -6, 1,
|
||||||
|
3, -128, -128, 10, 5, -11, -6, 1, 3, -128, -128
|
||||||
|
]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue