From a9da4a65f986cbb354d50b1cc2fdf6273d44c1c4 Mon Sep 17 00:00:00 2001 From: Gwilym Kuiper Date: Thu, 14 Jul 2022 22:43:41 +0100 Subject: [PATCH] Update agbabi's memcpy --- agb/build.rs | 1 + agb/src/agbabi/macros.inc | 43 ++++++++++++++ agb/src/agbabi/memcpy.s | 117 +++++++++++++++++++------------------- 3 files changed, 103 insertions(+), 58 deletions(-) create mode 100644 agb/src/agbabi/macros.inc diff --git a/agb/build.rs b/agb/build.rs index 9c8a3a66..e7b43045 100644 --- a/agb/build.rs +++ b/agb/build.rs @@ -12,6 +12,7 @@ fn main() { println!("cargo:rerun-if-changed=gba.ld"); println!("cargo:rerun-if-changed=gba_mb.ld"); println!("cargo:rerun-if-changed=src/asm_include.s"); + println!("cargo:rerun-if-changed=src/agbabi/macros.inc"); println!("cargo:rerun-if-changed=gfx/test_logo.png"); println!("cargo:rerun-if-changed=build.rs"); diff --git a/agb/src/agbabi/macros.inc b/agb/src/agbabi/macros.inc new file mode 100644 index 00000000..8663ab1c --- /dev/null +++ b/agb/src/agbabi/macros.inc @@ -0,0 +1,43 @@ +/* +=============================================================================== + + ARM assembly support macros + + Copyright (C) 2021-2022 agbabi contributors + For conditions of distribution and use, see copyright notice in LICENSE.md + +=============================================================================== +*/ + +// Shift and test upper two bits, clobbering \reg +// Use mi for first bit, cs for second bit +.macro joaobapt_test_lsl reg shift = #0 + movs \reg, \reg, lsl \shift +.endm + +// Test lowest two bits, clobbering \reg +// Use mi for low bit, cs for high bit +.macro joaobapt_test reg + joaobapt_test_lsl \reg, #31 +.endm + +// Test lowest two bits of \src, result stored in \dst +// Use mi for low bit, cs for high bit +.macro joaobapt_test_into dst, src + movs \dst, \src, lsl #31 +.endm + +// Branches depending on lowest two bits, clobbering \reg +// b_mi = low bit case, b_cs = high bit case +.macro joaobapt_switch reg, b_mi, b_cs + joaobapt_test \reg + bmi \b_mi + bcs \b_cs +.endm + +// Branches depending on alignment of \a and \b, clobbering \scratch +// b_byte = off-by-byte case, b_half = off-by-half case +.macro align_switch a, b, scratch, b_byte, b_half + eor \scratch, \a, \b + joaobapt_switch \scratch, \b_byte, \b_half +.endm diff --git a/agb/src/agbabi/memcpy.s b/agb/src/agbabi/memcpy.s index b043c325..08baab52 100644 --- a/agb/src/agbabi/memcpy.s +++ b/agb/src/agbabi/memcpy.s @@ -1,19 +1,18 @@ /* =============================================================================== - ABI: __aeabi_memcpy, __aeabi_memcpy4, __aeabi_memcpy8 Standard: memcpy Support: - __agbabi_memcpy2 - + __agbabi_memcpy2, __agbabi_memcpy1 Copyright (C) 2021-2022 agbabi contributors For conditions of distribution and use, see copyright notice in LICENSE.md - =============================================================================== */ +.include "src/agbabi/macros.inc" + .arm .align 2 @@ -22,95 +21,97 @@ __agbabi_memcpy: .global __aeabi_memcpy __aeabi_memcpy: - // Check pointer alignment - eor r3, r1, r0 - // JoaoBapt carry & sign bit test - movs r3, r3, lsl #31 - bmi .Lcopy1 - bcs .Lcopy2 + // >6-bytes is roughly the threshold when byte-by-byte copy is slower + cmp r2, #6 + ble __agbabi_memcpy1 -.Lcopy4: - // Handle <= 2 byte copies byte-by-byte - cmp r2, #2 - ble .Lcopy1 + align_switch r0, r1, r3, __agbabi_memcpy1, .Lcopy_halves - // Copy half and byte head - rsb r3, r0, #4 - // JoaoBapt carry & sign bit test - movs r3, r3, lsl #31 + // Check if r0 (or r1) needs word aligning + rsbs r3, r0, #4 + joaobapt_test r3 + + // Copy byte head to align ldrmib r3, [r1], #1 strmib r3, [r0], #1 submi r2, r2, #1 + // r0, r1 are now half aligned + + // Copy half head to align ldrcsh r3, [r1], #2 strcsh r3, [r0], #2 subcs r2, r2, #2 - // Fallthrough + // r0, r1 are now word aligned .global __aeabi_memcpy8 __aeabi_memcpy8: .global __aeabi_memcpy4 __aeabi_memcpy4: - // Copy 8 words - movs r12, r2, lsr #5 - beq .Lskip32 - lsl r3, r12, #5 - sub r2, r2, r3 + cmp r2, #32 + blt .Lcopy_words + + // Word aligned, 32-byte copy push {r4-r10} -.LcopyWords8: - ldmia r1!, {r3-r10} - stmia r0!, {r3-r10} - subs r12, r12, #1 - bne .LcopyWords8 +.Lloop_32: + subs r2, r2, #32 + ldmgeia r1!, {r3-r10} + stmgeia r0!, {r3-r10} + bgt .Lloop_32 pop {r4-r10} -.Lskip32: + bxeq lr - // Copy words - movs r12, r2, lsr #2 -.LcopyWords: - subs r12, r12, #1 - ldrhs r3, [r1], #4 - strhs r3, [r0], #4 - bhs .LcopyWords + // < 32 bytes remaining to be copied + add r2, r2, #32 - // Copy half and byte tail - // JoaoBapt carry & sign bit test - movs r3, r2, lsl #31 +.Lcopy_words: + cmp r2, #4 + blt .Lcopy_halves +.Lloop_4: + subs r2, r2, #4 + ldrge r3, [r1], #4 + strge r3, [r0], #4 + bgt .Lloop_4 + bxeq lr + + // Copy byte & half tail + // This test still works when r2 is negative + joaobapt_test r2 + // Copy half ldrcsh r3, [r1], #2 strcsh r3, [r0], #2 + // Copy byte ldrmib r3, [r1] strmib r3, [r0] bx lr -.Lcopy2: - // Copy byte head +.Lcopy_halves: + // Copy byte head to align tst r0, #1 - cmpne r2, #0 ldrneb r3, [r1], #1 strneb r3, [r0], #1 subne r2, r2, #1 - // Fallthrough + // r0, r1 are now half aligned .global __agbabi_memcpy2 __agbabi_memcpy2: - // Copy halves - movs r12, r2, lsr #1 -.LcopyHalves: - subs r12, r12, #1 - ldrhsh r3, [r1], #2 - strhsh r3, [r0], #2 - bhs .LcopyHalves + subs r2, r2, #2 + ldrgeh r3, [r1], #2 + strgeh r3, [r0], #2 + bgt __agbabi_memcpy2 + bxeq lr // Copy byte tail - tst r2, #1 - ldrneb r3, [r1] - strneb r3, [r0] + adds r2, r2, #1 + ldreqb r3, [r1] + streqb r3, [r0] bx lr -.Lcopy1: + .global __agbabi_memcpy1 +__agbabi_memcpy1: subs r2, r2, #1 - ldrhsb r3, [r1], #1 - strhsb r3, [r0], #1 - bhs .Lcopy1 + ldrgeb r3, [r1], #1 + strgeb r3, [r0], #1 + bgt __agbabi_memcpy1 bx lr .section .iwram.memcpy, "ax", %progbits