Use hand-written fast fill to save a few more cycles

2024-12-24 00:31:34 +11:00 · 2022-06-12 16:53:46 +01:00 · 2022-06-12 16:53:46 +01:00 · 4969672c73
parent e051a17836
commit 4969672c73
3 changed files with 26 additions and 28 deletions
--- a/agb/src/sound/mixer/mixer.s
+++ b/agb/src/sound/mixer/mixer.s
@@ -136,12 +136,34 @@ agb_arm_func agb_rs__mixer_add_stereo

 agb_arm_end agb_rs__mixer_add_stereo

+.section .iwram                 @ emit the following data into the .iwram section
+    .balign 4                   @ word-align the block so it can be read with ldm
+constant_zero:                  @ four zero words; agb_rs__mixer_collapse loads them to clear r4-r7
+.rept 4                         @ repeat the enclosed directive four times (4 * 4 = 16 bytes)
+    .word 0                     @ one 32-bit zero
+.endr
+
 agb_arm_func agb_rs__mixer_collapse
    @ Arguments:
    @ r0 = target buffer (i8)
    @ r1 = input buffer (i16) of fixnums with 4 bits of precision (read in sets of i16 in an i32)
    push {r4-r11}

+    @ zero registers r4-r7 (4 of them) by loading the four zero words at constant_zero
+    ldr r8, =constant_zero      @ a symbol address cannot be encoded as a mov immediate; use the literal pool
+    ldm r8, {r4-r7}
+
+    @ get the size of the buffer (presumably a byte count that is a multiple of 16 - TODO confirm)
+    ldr r9, agb_rs__buffer_size
+    @ make a copy of the output buffer pointer so r0 itself is left untouched
+    mov r10, r0
+1:
+    @ zero 4 words (16 bytes) worth of the output buffer and step r10 past them
+    stmia r10!, {r4-r7}         @ `!` = writeback; without it every pass re-zeroes the first 16 bytes
+    subs r9, r9, #16
+    @ loop if we haven't zeroed everything (terminates only when the count hits exactly 0)
+    bne 1b
+
 CONST_0   .req r7
 CONST_FF  .req r8
 CONST_127 .req r9
--- a/agb/src/sound/mixer/sw_mixer.rs
+++ b/agb/src/sound/mixer/sw_mixer.rs
@@ -5,7 +5,7 @@ use bare_metal::{CriticalSection, Mutex};
 use super::hw;
 use super::hw::LeftOrRight;
 use super::{SoundChannel, SoundPriority};
-use crate::syscall::cpu_fast_fill_i8;
+
 use crate::{
    fixnum::Num,
    interrupt::free,
@@ -245,9 +245,6 @@ impl MixerBuffer {
                channel.playback_speed
            };

-            let right_amount = ((channel.panning + 1) / 2) * channel.volume;
-            let left_amount = ((-channel.panning + 1) / 2) * channel.volume;
-
            if (channel.pos + playback_speed * constants::SOUND_BUFFER_SIZE).floor()
                >= channel.data.len()
            {
@@ -268,6 +265,9 @@ impl MixerBuffer {
                    );
                }
            } else {
+                let right_amount = ((channel.panning + 1) / 2) * channel.volume;
+                let left_amount = ((-channel.panning + 1) / 2) * channel.volume;
+
                unsafe {
                    agb_rs__mixer_add(
                        channel.data.as_ptr().add(channel.pos.floor()),
@@ -285,7 +285,6 @@ impl MixerBuffer {
        let write_buffer_index = free(|cs| self.state.borrow(cs).borrow_mut().active_advanced());

        let write_buffer = &mut self.buffers[write_buffer_index].0;
-        cpu_fast_fill_i8(write_buffer, 0);

        unsafe {
            agb_rs__mixer_collapse(write_buffer.as_mut_ptr(), buffer.as_ptr());
--- a/agb/src/syscall.rs
+++ b/agb/src/syscall.rs
@@ -120,29 +120,6 @@ pub fn arc_tan2(x: i16, y: i32) -> i16 {
    result
 }

-pub(crate) fn cpu_fast_fill_i8(input: &mut [i8], new_content: i32) {
-    assert_eq!(
-        input.len() % (4 * 8),
-        0,
-        "Input length must be divisible by 32"
-    );
-
-    let input_ptr = [new_content].as_ptr();
-    let output_ptr = input.as_mut_ptr();
-    let length_mode = (1 << 24) | // copy
-        (input.len() / 4);
-
-    unsafe {
-        asm!(
-            "swi 0x0c",
-            in("r0") input_ptr,
-            in("r1") output_ptr,
-            in("r2") length_mode,
-            lateout("r3") _,
-        );
-    }
-}
-
 // pub fn affine_matrix(
 //     x_scale: Num<i16, 8>,
 //     y_scale: Num<i16, 8>,