Update agbabi's memcpy

This commit is contained in:
Gwilym Kuiper 2022-07-14 22:43:41 +01:00
parent 6a8aeeb3e8
commit a9da4a65f9
3 changed files with 103 additions and 58 deletions

View file

@ -12,6 +12,7 @@ fn main() {
println!("cargo:rerun-if-changed=gba.ld"); println!("cargo:rerun-if-changed=gba.ld");
println!("cargo:rerun-if-changed=gba_mb.ld"); println!("cargo:rerun-if-changed=gba_mb.ld");
println!("cargo:rerun-if-changed=src/asm_include.s"); println!("cargo:rerun-if-changed=src/asm_include.s");
println!("cargo:rerun-if-changed=src/agbabi/macros.inc");
println!("cargo:rerun-if-changed=gfx/test_logo.png"); println!("cargo:rerun-if-changed=gfx/test_logo.png");
println!("cargo:rerun-if-changed=build.rs"); println!("cargo:rerun-if-changed=build.rs");

43
agb/src/agbabi/macros.inc Normal file
View file

@ -0,0 +1,43 @@
/*
===============================================================================
ARM assembly support macros
Copyright (C) 2021-2022 agbabi contributors
For conditions of distribution and use, see copyright notice in LICENSE.md
===============================================================================
*/
// Shift and test upper two bits, clobbering \reg
// Expands to a single flag-setting move: \reg is shifted left by \shift in
// place and the condition flags are taken from that shift.
// Use mi for first bit, cs for second bit:
//   mi - the top bit of the shifted value is set
//   cs - the last bit shifted out of \reg was set
// NOTE(review): with the default \shift of #0 nothing is shifted out, so the
// carry flag is presumably left at whatever the caller had before - confirm
// against the ARM shifter-operand rules before relying on cs with the default.
.macro joaobapt_test_lsl reg shift = #0
movs \reg, \reg, lsl \shift
.endm
// Flag test on the two lowest bits of \reg. \reg is overwritten with its
// own value shifted left by 31, so treat it as destroyed afterwards.
// After the macro:
//   mi - bit 0 (low bit) of the original value was set
//   cs - bit 1 (high bit) of the original value was set
.macro joaobapt_test reg
    movs    \reg, \reg, lsl #31
.endm
// Test lowest two bits of \src, result stored in \dst
// \src is shifted left by 31 and the shifted value is written to \dst, so
// \src itself is only modified when the same register is passed for both.
// Use mi for low bit, cs for high bit
.macro joaobapt_test_into dst, src
movs \dst, \src, lsl #31
.endm
// Two-way dispatch on the lowest two bits of \reg (which is destroyed).
//   bit 0 set          -> branch to \b_mi (checked first, so it wins when
//                         both bits are set)
//   bit 1 set, bit 0 clear -> branch to \b_cs
//   neither bit set    -> execution falls through past the macro
.macro joaobapt_switch reg, b_mi, b_cs
    movs    \reg, \reg, lsl #31     // N = bit 0, C = bit 1
    bmi     \b_mi
    bcs     \b_cs
.endm
// Dispatch on how \a and \b differ in their lowest two bits. \scratch is
// overwritten; \a and \b are only read.
//   \a and \b differ in bit 0           -> branch to \b_byte (off-by-byte)
//   differ in bit 1 but not in bit 0    -> branch to \b_half (off-by-half)
//   lowest two bits agree               -> execution falls through
.macro align_switch a, b, scratch, b_byte, b_half
    eor     \scratch, \a, \b                // set bits mark where a and b differ
    movs    \scratch, \scratch, lsl #31     // N = bit 0, C = bit 1
    bmi     \b_byte
    bcs     \b_half
.endm

View file

@ -1,19 +1,18 @@
/* /*
=============================================================================== ===============================================================================
ABI: ABI:
__aeabi_memcpy, __aeabi_memcpy4, __aeabi_memcpy8 __aeabi_memcpy, __aeabi_memcpy4, __aeabi_memcpy8
Standard: Standard:
memcpy memcpy
Support: Support:
__agbabi_memcpy2 __agbabi_memcpy2, __agbabi_memcpy1
Copyright (C) 2021-2022 agbabi contributors Copyright (C) 2021-2022 agbabi contributors
For conditions of distribution and use, see copyright notice in LICENSE.md For conditions of distribution and use, see copyright notice in LICENSE.md
=============================================================================== ===============================================================================
*/ */
.include "src/agbabi/macros.inc"
.arm .arm
.align 2 .align 2
@ -22,95 +21,97 @@
__agbabi_memcpy: __agbabi_memcpy:
.global __aeabi_memcpy .global __aeabi_memcpy
__aeabi_memcpy: __aeabi_memcpy:
// Check pointer alignment // >6-bytes is roughly the threshold when byte-by-byte copy is slower
eor r3, r1, r0 cmp r2, #6
// JoaoBapt carry & sign bit test ble __agbabi_memcpy1
movs r3, r3, lsl #31
bmi .Lcopy1
bcs .Lcopy2
.Lcopy4: align_switch r0, r1, r3, __agbabi_memcpy1, .Lcopy_halves
// Handle <= 2 byte copies byte-by-byte
cmp r2, #2
ble .Lcopy1
// Copy half and byte head // Check if r0 (or r1) needs word aligning
rsb r3, r0, #4 rsbs r3, r0, #4
// JoaoBapt carry & sign bit test joaobapt_test r3
movs r3, r3, lsl #31
// Copy byte head to align
ldrmib r3, [r1], #1 ldrmib r3, [r1], #1
strmib r3, [r0], #1 strmib r3, [r0], #1
submi r2, r2, #1 submi r2, r2, #1
// r0, r1 are now half aligned
// Copy half head to align
ldrcsh r3, [r1], #2 ldrcsh r3, [r1], #2
strcsh r3, [r0], #2 strcsh r3, [r0], #2
subcs r2, r2, #2 subcs r2, r2, #2
// Fallthrough // r0, r1 are now word aligned
.global __aeabi_memcpy8 .global __aeabi_memcpy8
__aeabi_memcpy8: __aeabi_memcpy8:
.global __aeabi_memcpy4 .global __aeabi_memcpy4
__aeabi_memcpy4: __aeabi_memcpy4:
// Copy 8 words cmp r2, #32
movs r12, r2, lsr #5 blt .Lcopy_words
beq .Lskip32
lsl r3, r12, #5 // Word aligned, 32-byte copy
sub r2, r2, r3
push {r4-r10} push {r4-r10}
.LcopyWords8: .Lloop_32:
ldmia r1!, {r3-r10} subs r2, r2, #32
stmia r0!, {r3-r10} ldmgeia r1!, {r3-r10}
subs r12, r12, #1 stmgeia r0!, {r3-r10}
bne .LcopyWords8 bgt .Lloop_32
pop {r4-r10} pop {r4-r10}
.Lskip32: bxeq lr
// Copy words // < 32 bytes remaining to be copied
movs r12, r2, lsr #2 add r2, r2, #32
.LcopyWords:
subs r12, r12, #1
ldrhs r3, [r1], #4
strhs r3, [r0], #4
bhs .LcopyWords
// Copy half and byte tail .Lcopy_words:
// JoaoBapt carry & sign bit test cmp r2, #4
movs r3, r2, lsl #31 blt .Lcopy_halves
.Lloop_4:
subs r2, r2, #4
ldrge r3, [r1], #4
strge r3, [r0], #4
bgt .Lloop_4
bxeq lr
// Copy byte & half tail
// This test still works when r2 is negative
joaobapt_test r2
// Copy half
ldrcsh r3, [r1], #2 ldrcsh r3, [r1], #2
strcsh r3, [r0], #2 strcsh r3, [r0], #2
// Copy byte
ldrmib r3, [r1] ldrmib r3, [r1]
strmib r3, [r0] strmib r3, [r0]
bx lr bx lr
.Lcopy2: .Lcopy_halves:
// Copy byte head // Copy byte head to align
tst r0, #1 tst r0, #1
cmpne r2, #0
ldrneb r3, [r1], #1 ldrneb r3, [r1], #1
strneb r3, [r0], #1 strneb r3, [r0], #1
subne r2, r2, #1 subne r2, r2, #1
// Fallthrough // r0, r1 are now half aligned
.global __agbabi_memcpy2 .global __agbabi_memcpy2
__agbabi_memcpy2: __agbabi_memcpy2:
// Copy halves subs r2, r2, #2
movs r12, r2, lsr #1 ldrgeh r3, [r1], #2
.LcopyHalves: strgeh r3, [r0], #2
subs r12, r12, #1 bgt __agbabi_memcpy2
ldrhsh r3, [r1], #2 bxeq lr
strhsh r3, [r0], #2
bhs .LcopyHalves
// Copy byte tail // Copy byte tail
tst r2, #1 adds r2, r2, #1
ldrneb r3, [r1] ldreqb r3, [r1]
strneb r3, [r0] streqb r3, [r0]
bx lr bx lr
.Lcopy1: .global __agbabi_memcpy1
__agbabi_memcpy1:
subs r2, r2, #1 subs r2, r2, #1
ldrhsb r3, [r1], #1 ldrgeb r3, [r1], #1
strhsb r3, [r0], #1 strgeb r3, [r0], #1
bhs .Lcopy1 bgt __agbabi_memcpy1
bx lr bx lr
.section .iwram.memcpy, "ax", %progbits .section .iwram.memcpy, "ax", %progbits