Merge pull request #256 from gwilymk/update-agbabi

Update agbabi's memcpy
This commit is contained in:
Gwilym Kuiper 2022-07-14 22:54:52 +01:00 committed by GitHub
commit bb0a88973f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 103 additions and 58 deletions

View file

@ -12,6 +12,7 @@ fn main() {
println!("cargo:rerun-if-changed=gba.ld");
println!("cargo:rerun-if-changed=gba_mb.ld");
println!("cargo:rerun-if-changed=src/asm_include.s");
println!("cargo:rerun-if-changed=src/agbabi/macros.inc");
println!("cargo:rerun-if-changed=gfx/test_logo.png");
println!("cargo:rerun-if-changed=build.rs");

43
agb/src/agbabi/macros.inc Normal file
View file

@ -0,0 +1,43 @@
/*
===============================================================================
ARM assembly support macros
Copyright (C) 2021-2022 agbabi contributors
For conditions of distribution and use, see copyright notice in LICENSE.md
===============================================================================
*/
// joaobapt_test_lsl reg, shift
// Shift and test upper two bits, clobbering \reg
//   \reg   - register shifted left in place; its previous value is lost
//   \shift - operand placed after lsl, defaults to #0 when omitted
// The movs form is used so the shift result also sets the condition flags
// Use mi for first bit, cs for second bit
// NOTE(review): with the default shift of #0 nothing is shifted out, so cs
// presumably reflects the carry flag from before the macro rather than a bit
// of \reg - confirm against the ARM shifter-operand documentation
.macro joaobapt_test_lsl reg shift = #0
movs \reg, \reg, lsl \shift
.endm
// joaobapt_test reg
// Moves the two lowest bits of \reg into the condition flags.
// \reg is overwritten by the shifted value, so it is clobbered.
//   mi -> low bit (bit 0) was set
//   cs -> high bit (bit 1) was set
.macro joaobapt_test reg
    // Shifting left by 31 leaves bit 0 in the sign position
    movs    \reg, \reg, lsl #31
.endm
// joaobapt_test_into dst, src
// Test lowest two bits of \src, result stored in \dst
//   \dst - receives \src shifted left by 31
//   \src - register whose low two bits are tested; only written when it is
//          the same register as \dst
// The movs form is used so the shift result also sets the condition flags
// Use mi for low bit, cs for high bit
.macro joaobapt_test_into dst, src
movs \dst, \src, lsl #31
.endm
// joaobapt_switch reg, b_mi, b_cs
// Two-way dispatch on the lowest two bits of \reg; \reg is clobbered.
//   \b_mi - label taken when the low bit is set
//   \b_cs - label taken when the low bit is clear and the high bit is set
// Execution falls through when neither branch is taken.
.macro joaobapt_switch reg, b_mi, b_cs
    // Put bit 0 in the sign flag and bit 1 in the carry flag
    joaobapt_test_lsl \reg, #31
    // The low bit is checked first, so it wins when both bits are set
    bmi     \b_mi
    bcs     \b_cs
.endm
// align_switch a, b, scratch, b_byte, b_half
// Dispatches on how the low two bits of \a and \b differ.
//   \a, \b    - registers compared; neither is modified
//   \scratch  - clobbered; receives \a eor \b and is then shifted
//   \b_byte   - label for the off-by-byte case (bit 0 differs)
//   \b_half   - label for the off-by-half case (bit 0 matches, bit 1 differs)
// Execution falls through when both low bits of \a and \b match.
.macro align_switch a, b, scratch, b_byte, b_half
    // A set bit in the exclusive-or marks a position where \a and \b differ
    eor     \scratch, \a, \b
    // mi = bit 0 differs, cs = bit 1 differs
    joaobapt_test \scratch
    bmi     \b_byte
    bcs     \b_half
.endm

View file

@ -1,19 +1,18 @@
/*
===============================================================================
ABI:
__aeabi_memcpy, __aeabi_memcpy4, __aeabi_memcpy8
Standard:
memcpy
Support:
__agbabi_memcpy2
__agbabi_memcpy2, __agbabi_memcpy1
Copyright (C) 2021-2022 agbabi contributors
For conditions of distribution and use, see copyright notice in LICENSE.md
===============================================================================
*/
.include "src/agbabi/macros.inc"
.arm
.align 2
@ -22,95 +21,97 @@
__agbabi_memcpy:
.global __aeabi_memcpy
__aeabi_memcpy:
// Check pointer alignment
eor r3, r1, r0
// JoaoBapt carry & sign bit test
movs r3, r3, lsl #31
bmi .Lcopy1
bcs .Lcopy2
// >6-bytes is roughly the threshold when byte-by-byte copy is slower
cmp r2, #6
ble __agbabi_memcpy1
.Lcopy4:
// Handle <= 2 byte copies byte-by-byte
cmp r2, #2
ble .Lcopy1
align_switch r0, r1, r3, __agbabi_memcpy1, .Lcopy_halves
// Copy half and byte head
rsb r3, r0, #4
// JoaoBapt carry & sign bit test
movs r3, r3, lsl #31
// Check if r0 (or r1) needs word aligning
rsbs r3, r0, #4
joaobapt_test r3
// Copy byte head to align
ldrmib r3, [r1], #1
strmib r3, [r0], #1
submi r2, r2, #1
// r0, r1 are now half aligned
// Copy half head to align
ldrcsh r3, [r1], #2
strcsh r3, [r0], #2
subcs r2, r2, #2
// Fallthrough
// r0, r1 are now word aligned
.global __aeabi_memcpy8
__aeabi_memcpy8:
.global __aeabi_memcpy4
__aeabi_memcpy4:
// Copy 8 words
movs r12, r2, lsr #5
beq .Lskip32
lsl r3, r12, #5
sub r2, r2, r3
cmp r2, #32
blt .Lcopy_words
// Word aligned, 32-byte copy
push {r4-r10}
.LcopyWords8:
ldmia r1!, {r3-r10}
stmia r0!, {r3-r10}
subs r12, r12, #1
bne .LcopyWords8
.Lloop_32:
subs r2, r2, #32
ldmgeia r1!, {r3-r10}
stmgeia r0!, {r3-r10}
bgt .Lloop_32
pop {r4-r10}
.Lskip32:
bxeq lr
// Copy words
movs r12, r2, lsr #2
.LcopyWords:
subs r12, r12, #1
ldrhs r3, [r1], #4
strhs r3, [r0], #4
bhs .LcopyWords
// < 32 bytes remaining to be copied
add r2, r2, #32
// Copy half and byte tail
// JoaoBapt carry & sign bit test
movs r3, r2, lsl #31
.Lcopy_words:
cmp r2, #4
blt .Lcopy_halves
.Lloop_4:
subs r2, r2, #4
ldrge r3, [r1], #4
strge r3, [r0], #4
bgt .Lloop_4
bxeq lr
// Copy byte & half tail
// This test still works when r2 is negative
joaobapt_test r2
// Copy half
ldrcsh r3, [r1], #2
strcsh r3, [r0], #2
// Copy byte
ldrmib r3, [r1]
strmib r3, [r0]
bx lr
.Lcopy2:
// Copy byte head
.Lcopy_halves:
// Copy byte head to align
tst r0, #1
cmpne r2, #0
ldrneb r3, [r1], #1
strneb r3, [r0], #1
subne r2, r2, #1
// Fallthrough
// r0, r1 are now half aligned
.global __agbabi_memcpy2
__agbabi_memcpy2:
// Copy halves
movs r12, r2, lsr #1
.LcopyHalves:
subs r12, r12, #1
ldrhsh r3, [r1], #2
strhsh r3, [r0], #2
bhs .LcopyHalves
subs r2, r2, #2
ldrgeh r3, [r1], #2
strgeh r3, [r0], #2
bgt __agbabi_memcpy2
bxeq lr
// Copy byte tail
tst r2, #1
ldrneb r3, [r1]
strneb r3, [r0]
adds r2, r2, #1
ldreqb r3, [r1]
streqb r3, [r0]
bx lr
.Lcopy1:
.global __agbabi_memcpy1
__agbabi_memcpy1:
subs r2, r2, #1
ldrhsb r3, [r1], #1
strhsb r3, [r0], #1
bhs .Lcopy1
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
bgt __agbabi_memcpy1
bx lr
.section .iwram.memcpy, "ax", %progbits