//! This module holds the assembly runtime that supports your Rust program. //! //! Most importantly, you can set the [`RUST_IRQ_HANDLER`] variable to assign //! which function should be run during a hardware interrupt. //! * When a hardware interrupt occurs, control first goes to the BIOS, which //! will then call the assembly runtime's handler. //! * The assembly runtime handler will properly acknowledge the interrupt //! within the system on its own without you having to do anything. //! * If a function is set in the `RUST_IRQ_HANDLER` variable then that function //! will be called and passed the bits for which interrupt(s) occurred. use core::ffi::c_void; use crate::{ dma::DmaControl, gba_cell::GbaCell, interrupts::IrqFn, mgba::MGBA_LOGGING_ENABLE_REQUEST, mmio::{DMA3_SRC, IME, MGBA_LOG_ENABLE}, }; /// Builds an assembly string that puts the contained code in the section /// specified. /// /// ```txt /// put_code_in_section!( ".example.section", { /// "lines" /// "go" /// "here" /// }); /// ``` macro_rules! put_code_in_section { ($section_name:expr, { $($asm_line:expr),+ $(,)? }) => { concat!( concat!(".section ", $section_name, "\n"), $( concat!($asm_line, "\n") ),+ , concat!(".previous\n"), ) } } /// Builds an assembly string wrapped in `.code 32` and `.code 16` as necessary /// /// ```txt /// emit_a32_code!{ /// "lines" /// "go" /// "here" /// }; /// ``` #[cfg(target_feature = "thumb-mode")] macro_rules! emit_a32_code { ($($asm_line:expr),+ $(,)?) => { concat!( concat!(".code 32\n"), $( concat!($asm_line, "\n") ),+ , concat!(".code 16\n"), ) } } /// Builds an assembly string wrapped in `.code 32` and `.code 16` as necessary /// /// ```txt /// emit_a32_code!{ /// "lines" /// "go" /// "here" /// }; /// ``` #[cfg(not(target_feature = "thumb-mode"))] macro_rules! emit_a32_code { ($($asm_line:expr),+ $(,)?) => { concat!( $( concat!($asm_line, "\n") ),+ , ) } } /// Builds an assembly string that pushes some regs, does the body, then pops /// the regs. /// /// The `reglist` expression should include the appropriate level of braces for /// the enclosing assembly block (two for normal asm, or one for raw asm). /// /// ```txt /// with_pushed_registers!( "reglist", { /// "lines" /// "go" /// "here" /// }); /// ``` macro_rules! with_pushed_registers { ($reglist:expr, { $($asm_line:expr),* $(,)? }) => { concat!( concat!("push ", $reglist, "\n"), $( concat!($asm_line, "\n") ),* , concat!("pop ", $reglist, "\n"), ) } } /// Reads SPSR into the register named, does the block, and writes the same /// register back to SPSR. macro_rules! with_spsr_held_in { ($reg:literal { $($asm_line:expr),* $(,)? }) => { concat!( concat!("mrs ", $reg, ", SPSR\n"), $( concat!($asm_line, "\n") ),* , concat!("msr SPSR, ", $reg, "\n"), ) } } /// Sets `lr` to just after the `bx`, then uses `bx` with the given register. /// /// This generates a label, so pick a `label_id` that won't interfere with any /// nearby code. macro_rules! adr_lr_then_bx_to { (reg=$reg_name:expr, label_id=$label:expr) => { concat!( concat!("adr lr, ", $label, "f\n"), concat!("bx ", $reg_name, "\n"), concat!($label, ":\n"), ) }; } /// Expands to the asm line to set the control bits of CPSR. /// /// * Can only be used in `a32` /// * Only sets the control bits, all other bits (eg: flag bits) are unchanged. /// /// Currently, not all possible patterns are covered by this macro, just the /// patterns needed by this runtime when it was written. In general, any of the /// five CPU modes can be combined with irq and fiq masking each being either /// off or on. If a desired combination is missing just add it. macro_rules! set_cpu_control { // CPSR low bits are: `I F T MMMMM`, and T must always be left as 0. // * 0b10011: Supervisor (SVC) // * 0b11111: System (SYS) (System, irq_masked: false, fiq_masked: false) => { "msr CPSR_c, #0b00011111\n" }; (Supervisor, irq_masked: true, fiq_masked: false) => { "msr CPSR_c, #0b10010010\n" }; } /// Performs the appropriate test, then either runs the block or jumps past it, /// depending on the test result. /// /// Currently supports: /// * `$reg == $op2` /// * `$reg != $op2` macro_rules! when { ($reg:literal == $op2:literal [label_id=$label:literal] { $($asm_line:expr),* $(,)? }) => { concat!( concat!("cmp ", $reg, ", ", $op2, "\n"), concat!("bne ", $label, "f\n"), $( concat!($asm_line, "\n") ),* , concat!($label, ":\n"), ) }; ($reg:literal != $op2:literal [label_id=$label:literal] { $($asm_line:expr),* $(,)? }) => { concat!( concat!("cmp ", $reg, ", ", $op2, "\n"), concat!("beq ", $label, "f\n"), $( concat!($asm_line, "\n") ),* , concat!($label, ":\n"), ) }; } /// The function pointer that the assembly runtime calls when an interrupt /// occurs. pub static RUST_IRQ_HANDLER: GbaCell> = GbaCell::new(None); const DMA_32_BIT_MEMCPY: DmaControl = DmaControl::new().with_transfer_32bit(true).with_enabled(true); const DMA3_OFFSET: usize = DMA3_SRC.as_usize() - 0x0400_0000; const IME_OFFSET: usize = IME.as_usize() - 0x0400_0000; #[naked] #[no_mangle] #[instruction_set(arm::a32)] #[link_section = ".text.gba_rom_header"] unsafe extern "C" fn __start() -> ! { core::arch::asm!( "b 1f", ".space 0xE0", "1:", /* post header */ "mov r12, #{mmio_base}", "add r0, r12, #{waitcnt_offset}", "ldr r1, ={waitcnt_setting}", "strh r1, [r0]", /* iwram copy */ "ldr r4, =__iwram_word_copy_count", when!("r4" != "#0" [label_id=1] { "add r3, r12, #{dma3_offset}", "mov r5, #{dma3_setting}", "ldr r0, =__iwram_start", "ldr r2, =__iwram_position_in_rom", "str r2, [r3]", /* source */ "str r0, [r3, #4]", /* destination */ "strh r4, [r3, #8]", /* word count */ "strh r5, [r3, #10]", /* set control bits */ }), /* ewram copy */ "ldr r4, =__ewram_word_copy_count", when!("r4" != "#0" [label_id=1] { "add r3, r12, #{dma3_offset}", "mov r5, #{dma3_setting}", "ldr r0, =__ewram_start", "ldr r2, =__ewram_position_in_rom", "str r2, [r3]", /* source */ "str r0, [r3, #4]", /* destination */ "strh r4, [r3, #8]", /* word count */ "strh r5, [r3, #10]", /* set control bits */ }), /* bss zero */ "ldr r4, =__bss_word_clear_count", when!("r4" != "#0" [label_id=1] { "ldr r0, =__bss_start", "mov r2, #0", "2:", "str r2, [r0], #4", "subs r4, r4, #1", "bne 2b", }), /* assign the runtime irq handler */ "ldr r1, ={runtime_irq_handler}", "str r1, [r12, #-4]", /* ask for mGBA logging to be enabled. This should be harmless if we're not using mgba. */ "ldr r0, ={mgba_log_enable}", "ldr r1, ={mgba_logging_enable_request}", "str r1, [r0]", /* call to rust main */ "ldr r0, =main", "bx r0", // main shouldn't return, but if it does just SoftReset "swi #0", mmio_base = const 0x0400_0000, waitcnt_offset = const 0x204, waitcnt_setting = const 0x4317 /*sram8,r0:3.1,r1:4.2,r2:8.2,no_phi,prefetch*/, dma3_offset = const DMA3_OFFSET, dma3_setting = const DMA_32_BIT_MEMCPY.to_u16(), runtime_irq_handler = sym runtime_irq_handler, mgba_log_enable = const MGBA_LOG_ENABLE.as_usize(), mgba_logging_enable_request = const MGBA_LOGGING_ENABLE_REQUEST, options(noreturn) ) } #[naked] #[no_mangle] #[instruction_set(arm::a32)] #[link_section = ".iwram.runtime.irq.handler"] unsafe extern "C" fn runtime_irq_handler() { // On Entry: r0 = 0x0400_0000 (mmio_base) core::arch::asm!( /* swap IME off, user can turn it back on if they want */ "add r12, r0, #{ime_offset}", "mov r3, #0", "swp r3, r3, [r12]", /* Read/Update IE and IF */ "ldr r0, [r12, #-8]", "and r0, r0, r0, LSR #16", "strh r0, [r12, #-6]", /* Read/Update BIOS_IF */ "sub r2, r12, #(0x208+8)", "ldrh r1, [r2]", "orr r1, r1, r0", "strh r1, [r2]", /* Call the Rust fn pointer (if set), using System mode */ "ldr r1, ={RUST_IRQ_HANDLER}", "ldr r1, [r1]", when!("r1" != "#0" [label_id=9] { with_spsr_held_in!("r2" { set_cpu_control!(System, irq_masked: false, fiq_masked: false), // Note(Lokathor): We are *SKIPPING* the part where we ensure that the // System stack pointer is aligned to 8 during the call to the rust // function. This is *technically* against the AAPCS ABI, but the GBA's // ARMv4T CPU does not even support any instructions that require an // alignment of 8. By not bothering to align the stack, we save about 5 // cycles total. Which is neat, but if this were on the DS (which has an // ARMv5TE CPU) you'd want to ensure the aligned stack. with_pushed_registers!("{{r2, r3, r12, lr}}", { adr_lr_then_bx_to!(reg="r1", label_id=1) }), set_cpu_control!(Supervisor, irq_masked: true, fiq_masked: false), }), }), /* Restore initial IME setting and return */ "swp r3, r3, [r12]", "bx lr", ime_offset = const IME_OFFSET, RUST_IRQ_HANDLER = sym RUST_IRQ_HANDLER, options(noreturn) ) } /// Returns 0 in `r0`, while placing the `numerator` into `r1`. /// /// This is written in that slightly strange way so that `div` function and /// `divmod` functions can share the same code path. /// /// See: [__aeabi_idiv0][aeabi-division-by-zero] /// /// [aeabi-division-by-zero]: https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#division-by-zero #[naked] #[no_mangle] #[instruction_set(arm::a32)] // this should literally never get called for real, so we leave it in ROM extern "C" fn __aeabi_idiv0(numerator: i32) -> i32 { unsafe { core::arch::asm!( // this comment stops rustfmt from making this a one-liner "mov r1, r0", "mov r0, #0", "bx lr", options(noreturn) ) } } /// Returns `u32 / u32` /// /// This implementation is *not* the fastest possible division, but it is /// extremely compact. /// /// See: [__aeabi_uidiv][aeabi-integer-32-32-division] /// /// [aeabi-integer-32-32-division]: /// https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#integer-32-32-32-division-functions #[naked] #[no_mangle] #[instruction_set(arm::a32)] #[link_section = ".iwram.aeabi.uidiv"] extern "C" fn __aeabi_uidiv(numerator: u32, denominator: u32) -> u32 { // Note(Lokathor): Other code in this module relies on being able to call this // function without affecting r12, so any future implementations of this code // **must not** destroy r12. unsafe { core::arch::asm!( // Check for divide by 0 "cmp r1, #0", "beq __aeabi_idiv0", // r3(shifted_denom) = denom "mov r3, r1", // while shifted_denom < (num>>1): shifted_denom =<< 1; "cmp r3, r0, lsr #1", "2:", "lslls r3, r3, #1", "cmp r3, r0, lsr #1", "bls 2b", // r0=quot(init 0), r1=denom, r2=num, r3=shifted_denom "mov r2, r0", "mov r0, #0", // subtraction loop "3:", "cmp r2, r3", "subcs r2, r2, r3", "adc r0, r0, r0", "mov r3, r3, lsr #1", "cmp r3, r1", "bcs 3b", "bx lr", options(noreturn) ) } } /// Returns `i32 / i32` /// /// See: [__aeabi_idiv][aeabi-integer-32-32-division] /// /// [aeabi-integer-32-32-division]: /// https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#integer-32-32-32-division-functions #[naked] #[no_mangle] #[instruction_set(arm::a32)] #[link_section = ".iwram.aeabi.idiv"] extern "C" fn __aeabi_idiv(numerator: i32, denominator: i32) -> u32 { unsafe { core::arch::asm!( // determine if `numerator` and `denominator` are the same sign "eor r12, r1, r0", // convert both values to their unsigned absolute value. "cmp r0, #0", "rsblt r0, r0, #0", "cmp r1, #0", "rsclt r1, r1, #0", with_pushed_registers!("{{lr}}", { // divide them using `u32` division (this will check for divide by 0) "bl __aeabi_uidiv", }), // if they started as different signs, flip the output's sign. "cmp r12, #0", "rsblt r0, r0, #0", "bx lr", options(noreturn) ) } } /// Returns `(u32 / u32, u32 % u32)` in `(r0, r1)`. /// /// The `u64` return value is a mild lie that gets Rust to grab up both the `r0` /// and `r1` values when the function returns. If you transmute the return value /// into `[u32; 2]` then you can separate the two parts of the return value, and /// it will have no runtime cost. /// /// See: [__aeabi_uidivmod][aeabi-integer-32-32-division] /// /// [aeabi-integer-32-32-division]: /// https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#integer-32-32-32-division-functions #[naked] #[no_mangle] #[instruction_set(arm::a32)] #[link_section = ".iwram.aeabi.uidivmod"] extern "C" fn __aeabi_uidivmod(numerator: u32, denominator: u32) -> u64 { unsafe { core::arch::asm!( // We need to save *both* input args until after the uidiv call. One of // them can be saved in `r12` because we know our uidiv doesn't actually // touch `r12`, while the other will be pushed onto the stack along with // `lr`. Since the function's output will be in `r0`, we push/pop `r1`. "mov r12, r0", with_pushed_registers!("{{r1, lr}}", { "bl __aeabi_uidiv", }), // Now r0 holds the `quot`, and we use it along with the input args to // calculate the `rem`. "mul r2, r0, r1", "sub r1, r12, r2", "bx lr", options(noreturn) ) } } /// Returns `(i32 / i32, i32 % i32)` in `(r0, r1)`. /// /// The `u64` return value is a mild lie that gets Rust to grab up both the `r0` /// and `r1` values when the function returns. If you transmute the return value /// into `[i32; 2]` then you can separate the two parts of the return value, and /// it will have no runtime cost. /// /// See: [__aeabi_idivmod][aeabi-integer-32-32-division] /// /// [aeabi-integer-32-32-division]: /// https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#integer-32-32-32-division-functions #[naked] #[no_mangle] #[instruction_set(arm::a32)] #[link_section = ".iwram.aeabi.idivmod"] extern "C" fn __aeabi_idivmod(numerator: i32, denominator: i32) -> u64 { unsafe { core::arch::asm!( with_pushed_registers!("{{r4, r5, lr}}", { // store old numerator then make it the unsigned absolute "movs r4, r0", "rsblt r0, r0, #0", // store old denominator then make it the unsigned absolute "movs r5, r1", "rsblt r1, r1, #0", // divmod using unsigned. "bl __aeabi_uidivmod", // if signs started opposite, quot becomes negative "eors r12, r4, r5", "rsblt r0, r0, #0", // if numerator started negative, rem is negative "cmp r4, #0", "rsblt r1, r1, #0", }), "bx lr", options(noreturn) ) } } /// Reads 4 bytes, starting at the address given. /// /// See [__aeabi_uread4] /// /// [__aeabi_uread4]: https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#unaligned-memory-access #[naked] #[no_mangle] #[instruction_set(arm::a32)] #[link_section = ".iwram.aeabi.uread4"] unsafe extern "C" fn __aeabi_uread4(address: *const c_void) -> u32 { core::arch::asm!( "ldrb r2, [r0]", "ldrb r3, [r0, #1]", "orr r2, r2, r3, lsl #8", "ldrb r3, [r0, #2]", "orr r2, r2, r3, lsl #16", "ldrb r3, [r0, #3]", "orr r2, r2, r3, lsl #24", "mov r0, r2", "bx lr", options(noreturn), ) } /// Writes 4 bytes, starting at the address given. /// /// See [__aeabi_uwrite4] /// /// [__aeabi_uwrite4]: https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#unaligned-memory-access #[naked] #[no_mangle] #[instruction_set(arm::a32)] #[link_section = ".iwram.aeabi.uwrite4"] unsafe extern "C" fn __aeabi_uwrite4(value: u32, address: *mut c_void) { core::arch::asm!( "strb r0, [r1]", "lsr r2, r0, #8", "strb r2, [r1, #1]", "lsr r2, r2, #8", "strb r2, [r1, #2]", "lsr r2, r2, #8", "strb r2, [r1, #3]", "bx lr", options(noreturn), ) } /// Reads 8 bytes, starting at the address given. /// /// See [__aeabi_uread8] /// /// [__aeabi_uread8]: https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#unaligned-memory-access #[naked] #[no_mangle] #[instruction_set(arm::a32)] #[link_section = ".iwram.aeabi.uread8"] unsafe extern "C" fn __aeabi_uread8(address: *const c_void) -> u64 { core::arch::asm!( "ldrb r1, [r0, #4]", "ldrb r2, [r0, #5]", "orr r1, r1, r2, lsl #8", "ldrb r2, [r0, #6]", "orr r1, r1, r2, lsl #16", "ldrb r2, [r0, #7]", "orr r1, r1, r2, lsl #24", "b __aeabi_uread4", options(noreturn), ) } /// Writes 8 bytes, starting at the address given. /// /// See [__aeabi_uwrite8] /// /// [__aeabi_uwrite8]: https://github.com/ARM-software/abi-aa/blob/main/rtabi32/rtabi32.rst#unaligned-memory-access #[naked] #[no_mangle] #[instruction_set(arm::a32)] #[link_section = ".iwram.aeabi.uwrite8"] unsafe extern "C" fn __aeabi_uwrite8(value: u64, address: *mut c_void) { core::arch::asm!( "strb r0, [r2]", "lsr r3, r0, #8", "strb r3, [r2, #1]", "lsr r3, r3, #8", "strb r3, [r2, #2]", "lsr r3, r3, #8", "strb r3, [r2, #3]", "strb r1, [r2, #4]", "lsr r3, r1, #8", "strb r3, [r2, #5]", "lsr r3, r3, #8", "strb r3, [r2, #6]", "lsr r3, r3, #8", "strb r3, [r2, #7]", "bx lr", options(noreturn), ) } /// Provides a `libc` styled memory copy (transfer between exclusive regions). /// /// This has mild overhead compared to calling [`__aeabi_memcpy`], prefer that /// function when possible. /// /// * **Returns:** the original `dest` pointer. #[inline] #[no_mangle] pub unsafe extern "C" fn memcpy( dest: *mut u8, src: *const u8, byte_count: usize, ) -> *mut u8 { __aeabi_memcpy(dest, src, byte_count); dest } /// Provides a `libc` styled memory move (transfer between non-exclusive /// regions). /// /// This has mild overhead compared to calling [`__aeabi_memmove`], prefer that /// function when possible. /// /// * **Returns:** the original `dest` pointer. #[inline] #[no_mangle] pub unsafe extern "C" fn memmove( dest: *mut u8, src: *const u8, byte_count: usize, ) -> *mut u8 { __aeabi_memmove(dest, src, byte_count); dest } /// Provides a `libc` styled memory set (assign `u8` in `byte` to the entire /// region). /// /// This has mild overhead compared to calling [`__aeabi_memset`], prefer that /// function when possible. Note that this function and that function have /// slightly different argument ordering, though the compiler won't let you mess /// it up like might happen in C. /// /// * **Returns:** the original `dest` pointer. #[inline] #[no_mangle] pub unsafe extern "C" fn memset( dest: *mut u8, byte: i32, byte_count: usize, ) -> *mut u8 { __aeabi_memset(dest, byte_count, byte); dest } extern "C" { /// Memory transfer between *exclusive* regions. /// /// There are no alignment requirements for the pointers. This will /// automatically detect when pointers are sufficiently aligned to use `u16` /// or `u32` transfers, instead of always using `u8` transfers. /// /// This follows the AEABI convention of not returning the original `dest` /// pointer at the end of the function. This actually allows a minor /// optimization, so if you're going to call a memory copy function at all, /// prefer this over [`memcpy`]. pub fn __aeabi_memcpy(dest: *mut u8, src: *const u8, byte_count: usize); /// As [`__aeabi_memcpy`], but both pointers are assumed to be aligned to 4. pub fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, byte_count: usize); /// As [`__aeabi_memcpy`], but both pointers are assumed to be aligned to 8. pub fn __aeabi_memcpy8(dest: *mut u8, src: *const u8, byte_count: usize); /// As [`__aeabi_memcpy`], but *only* performs `u8` transfers. /// /// Importantly, this means that this function can be used to get data to/from /// the SRAM region. pub fn gba_sram_memcpy(dest: *mut u8, src: *const u8, byte_count: usize); /// Memory transfer between *non-exclusive* regions. /// /// As [`__aeabi_memcpy`], but the regions don't need to be exclusive. pub fn __aeabi_memmove(dest: *mut u8, src: *const u8, byte_count: usize); /// As [`__aeabi_memmove`], but both pointers are assumed to be aligned to 4. pub fn __aeabi_memmove4(dest: *mut u8, src: *const u8, byte_count: usize); /// As [`__aeabi_memmove`], but both pointers are assumed to be aligned to 8. pub fn __aeabi_memmove8(dest: *mut u8, src: *const u8, byte_count: usize); /// Sets all bytes in the region to the value given. /// /// For historical reasons, the "byte" passed in is passed as an `i32`. Still, /// only the low 8 bits of the value are kept and written to the region. /// /// There are no alignment requirements for the pointer. This will /// automatically detect when pointer is sufficiently aligned to use `u16` or /// `u32` writes, instead of always using `u8` writes. /// /// This follows the AEABI convention of not returning the original `dest` /// pointer at the end of the function. This actually allows a minor /// optimization, so if you're going to call a memory copy function at all, /// prefer this over [`memcpy`]. pub fn __aeabi_memset(dest: *mut u8, byte_count: usize, byte: i32); /// As [`__aeabi_memset`], but both pointers are assumed to be aligned to 4. pub fn __aeabi_memset4(dest: *mut u8, byte_count: usize, byte: i32); /// As [`__aeabi_memset`], but both pointers are assumed to be aligned to 8. pub fn __aeabi_memset8(dest: *mut u8, byte_count: usize, byte: i32); /// Sets all bytes in the region to 0. /// /// There are no alignment requirements for the pointer. This will /// automatically detect when the pointer is sufficiently aligned to use `u16` /// or `u32` writes, instead of always using `u8` writes. pub fn __aeabi_memclr(dest: *mut u8, byte_count: usize); /// As [`__aeabi_memclr`], but the pointer is assumed to be aligned to 4. pub fn __aeabi_memclr4(dest: *mut u8, byte_count: usize); /// As [`__aeabi_memclr`], but the pointer is assumed to be aligned to 8. pub fn __aeabi_memclr8(dest: *mut u8, byte_count: usize); } core::arch::global_asm! { emit_a32_code!{ put_code_in_section!(".iwram.aeabi.memory.copy.and.move", { "__aeabi_memmove8:", "__aeabi_memmove4:", "__aeabi_memmove:", "cmp r0, r1", // if d > s, reverse copy "bgt .L_r_copy_gain_align", // else fallthrough "__aeabi_memcpy:", ".L_f_copy_gain_align:", "eor r3, r0, r1", "lsls r3, r3, #31", "bmi .L_f_copy_max_coalign1", "bcs .L_f_copy_max_coalign2", // else fallthrough ".L_f_copy_max_coalign4:", "tst r0, #3", "bne .L_f_copy_fixup4", // else fallthrough "__aeabi_memcpy8:", "__aeabi_memcpy4:", ".L_f_copy_coalign4_assured:", "cmp r2, #32", "bge .L_f_copy_block", ".L_f_copy_post_block:", // copy 4 words, two at a time "tst r2, #0b10000", "ldmne r1!, {r3, r12}", "stmne r0!, {r3, r12}", "ldmne r1!, {r3, r12}", "stmne r0!, {r3, r12}", "bics r2, r2, #0b10000", "bxeq lr", // copy 2 and/or 1 words "lsls r3, r2, #29", "ldmcs r1!, {r3, r12}", "stmcs r0!, {r3, r12}", "ldrmi r3, [r1], #4", "strmi r3, [r0], #4", "bics r2, r2, #0b1100", "bxeq lr", // copy halfword and/or byte "lsls r3, r2, #31", "ldrhcs r3, [r1], #2", "strhcs r3, [r0], #2", "ldrbmi r3, [r1], #1", "strbmi r3, [r0], #1", "bx lr", ".L_f_copy_block:", with_pushed_registers!("{r4-r9}", { "1:", "subs r2, r2, #32", "ldmge r1!, {r3-r9, r12}", "stmge r0!, {r3-r9, r12}", "bgt 1b", }), "bxeq lr", "b .L_f_copy_post_block", ".L_f_copy_fixup4:", "cmp r2, #7", // if count <= (fix+word): just byte copy "ble .L_f_copy_max_coalign1", "lsls r3, r0, #31", "submi r2, r2, #1", "ldrbmi r3, [r1], #1", "strbmi r3, [r0], #1", "subcs r2, r2, #2", "ldrhcs r3, [r1], #2", "strhcs r3, [r0], #2", "b .L_f_copy_coalign4_assured", ".L_f_copy_max_coalign2:", "tst r0, #1", "bne .L_f_copy_fixup2", ".L_f_copy_coalign2_assured:", "1:", "subs r2, r2, #2", "ldrhge r3, [r1], #2", "strhge r3, [r0], #2", "bgt 1b", "bxeq lr", "tst r2, #1", "ldrbne r3, [r1], #1", "strbne r3, [r0], #1", "bx lr", ".L_f_copy_fixup2:", "cmp r2, #3", // if count <= (fix+halfword): just byte copy "ble .L_f_copy_max_coalign1", "sub r2, r2, #1", "ldrb r3, [r1], #1", "strb r3, [r0], #1", "b .L_f_copy_coalign2_assured", "gba_sram_memcpy:", ".L_f_copy_max_coalign1:", "1:", "subs r2, r2, #1", "ldrbge r3, [r1], #1", "strbge r3, [r0], #1", "bgt 1b", "bx lr", ".L_r_copy_gain_align:", "add r0, r0, r2", "add r1, r1, r2", "eor r3, r0, r1", "lsls r3, r3, #31", "bmi .L_r_copy_max_coalign1", "bcs .L_r_copy_max_coalign2", // else fallthrough ".L_r_copy_max_coalign4:", "tst r0, #3", "bne .L_r_copy_fixup4", ".L_r_copy_coalign4_assured:", "cmp r2, #32", "bge .L_r_copy_block", ".L_r_copy_post_block:", // copy 4 words, two at a time "tst r2, #0b10000", "ldmdbne r1!, {r3, r12}", "stmdbne r0!, {r3, r12}", "ldmdbne r1!, {r3, r12}", "stmdbne r0!, {r3, r12}", "bics r2, r2, #0b10000", "bxeq lr", // copy 2 and/or 1 words "lsls r3, r2, #29", "ldmdbcs r1!, {r3, r12}", "stmdbcs r0!, {r3, r12}", "ldrmi r3, [r1, #-4]!", "strmi r3, [r0, #-4]!", "bxeq lr", "lsls r2, r2, #31", "ldrhcs r3, [r1, #-2]!", "strhcs r3, [r0, #-2]!", "ldrbmi r3, [r1, #-1]!", "strbmi r3, [r0, #-1]!", "bx lr", ".L_r_copy_block:", with_pushed_registers!("{r4-r9}", { "1:", "subs r2, r2, #32", "ldmdbcs r1!, {r3-r9, r12}", "stmdbcs r0!, {r3-r9, r12}", "bgt 1b", }), "bxeq lr", "b .L_r_copy_post_block", ".L_r_copy_fixup4:", "cmp r2, #7", // if count <= (fix+word): just byte copy "ble .L_r_copy_max_coalign1", "lsls r3, r0, #31", "submi r2, r2, #1", "ldrbmi r3, [r1, #-1]!", "strbmi r3, [r0, #-1]!", "subcs r2, r2, #2", "ldrhcs r3, [r1, #-2]!", "strhcs r3, [r0, #-2]!", "b .L_r_copy_coalign4_assured", ".L_r_copy_max_coalign2:", "tst r0, #1", "bne .L_r_copy_fixup2", ".L_r_copy_coalign2_assured:", "1:", "subs r2, r2, #2", "ldrhge r3, [r1, #-2]!", "strhge r3, [r0, #-2]!", "bgt 1b", "bxeq lr", "tst r2, #1", "ldrbne r3, [r1, #-1]!", "strbne r3, [r0, #-1]!", "bx lr", ".L_r_copy_fixup2:", "cmp r2, #3", // if count <= (fix+halfword): just byte copy "ble .L_r_copy_max_coalign1", "sub r2, r2, #1", "ldrb r3, [r1, #-1]!", "strb r3, [r0, #-1]!", "b .L_r_copy_coalign2_assured", ".L_r_copy_max_coalign1:", "1:", "subs r2, r2, #1", "ldrbge r3, [r1, #-1]!", "strbge r3, [r0, #-1]!", "bgt 1b", "bx lr", }), }, options(raw) } core::arch::global_asm! { emit_a32_code!{ put_code_in_section!(".iwram.aeabi.memory.clear.and.set", { "__aeabi_memclr8:", "__aeabi_memclr4:", "mov r2, #0", "mov r3, #0", "b .L_memset_check_for_block_work", "__aeabi_memclr:", "mov r2, #0", "__aeabi_memset8:", "__aeabi_memset4:", "__aeabi_memset:", // r0(dest), r1(count), r2(byte) // duplicate the byte across all of r2 and r3 "and r2, r2, #0xFF", "orr r2, r2, r2, lsl #8", "orr r2, r2, r2, lsl #16", "mov r3, r2", // for 'sets' too small to fixup we just byte loop "cmp r1, #3", "ble .L_memset_byte_loop", // carry/sign test on the address, then do fixup "lsls r12, r0, #31", "submi r1, r1, #1", "strbmi r2, [r0], #1", "subcs r1, r1, #2", "strhcs r2, [r0], #2", ".L_memset_check_for_block_work:", "cmp r1, #32", "bge .L_memset_block_work", ".L_memset_post_block_work:", // set 4 words "tst r1, #0b10000", "stmne r0!, {r2, r3}", "stmne r0!, {r2, r3}", // set 2 and/or 1 words "lsls r12, r1, #29", "stmcs r0!, {r2, r3}", "strmi r2, [r0], #4", // set halfword and/or byte "lsls r12, r1, #31", "strhcs r2, [r0], #2", "strbmi r2, [r0], #1", "bx lr", ".L_memset_block_work:", with_pushed_registers!("{r4-r9}", { "mov r4, r2", "mov r5, r2", "mov r6, r2", "mov r7, r2", "mov r8, r2", "mov r9, r2", "1:", "subs r1, r1, #32", "stmge r0!, {r2-r9}", "bgt 1b", }), "bxeq lr", "b .L_memset_post_block_work", ".L_memset_byte_loop:", "1:", "subs r1, r1, #1", "strbcs r2, [r0], #1", "bgt 1b", "bx lr", }), }, options(raw), }