mirror of
https://github.com/italicsjenga/rp-hal-boards.git
synced 2025-01-10 04:21:32 +11:00
Merge pull request #299 from Sizurka/divider-global-asm
Use optimized assembly for hardware division
This commit is contained in:
commit
ed0cda5269
|
@ -1,6 +1,71 @@
|
||||||
use super::Float;
|
use super::Float;
|
||||||
use crate::rom_data;
|
use crate::rom_data;
|
||||||
use crate::sio::save_divider;
|
|
||||||
|
// Make sure this stays as a separate call, because when it's inlined the
|
||||||
|
// compiler will move the save of the registers used to contain the divider
|
||||||
|
// state into the function prologue. That save and restore (push/pop) takes
|
||||||
|
// longer than the actual division, so doing it in the common case where
|
||||||
|
// they are not required wastes a lot of time.
|
||||||
|
#[inline(never)]
|
||||||
|
#[cold]
|
||||||
|
fn save_divider_and_call<F, R>(f: F) -> R
|
||||||
|
where
|
||||||
|
F: FnOnce() -> R,
|
||||||
|
{
|
||||||
|
let sio = unsafe { &(*pac::SIO::ptr()) };
|
||||||
|
|
||||||
|
// Since we can't save the signed-ness of the calculation, we have to make
|
||||||
|
// sure that there's at least an 8 cycle delay before we read the result.
|
||||||
|
// The Pico SDK ensures this by using a 6 cycle push and two 1 cycle reads.
|
||||||
|
// Since we can't be sure the Rust implementation will optimize to the same,
|
||||||
|
// just use an explicit wait.
|
||||||
|
while !sio.div_csr.read().ready().bit() {}
|
||||||
|
|
||||||
|
// Read the quotient last, since that's what clears the dirty flag
|
||||||
|
let dividend = sio.div_udividend.read().bits();
|
||||||
|
let divisor = sio.div_udivisor.read().bits();
|
||||||
|
let remainder = sio.div_remainder.read().bits();
|
||||||
|
let quotient = sio.div_quotient.read().bits();
|
||||||
|
|
||||||
|
// If we get interrupted here (before a write sets the DIRTY flag) it's fine, since
|
||||||
|
// we have the full state, so the interruptor doesn't have to restore it. Once the
|
||||||
|
// write happens and the DIRTY flag is set, the interruptor becomes responsible for
|
||||||
|
// restoring our state.
|
||||||
|
let result = f();
|
||||||
|
|
||||||
|
// If we are interrupted here, then the interruptor will start an incorrect calculation
|
||||||
|
// using a wrong divisor, but we'll restore the divisor and result ourselves correctly.
|
||||||
|
// This sets DIRTY, so any interruptor will save the state.
|
||||||
|
sio.div_udividend.write(|w| unsafe { w.bits(dividend) });
|
||||||
|
// If we are interrupted here, the interruptor may start the calculation using
|
||||||
|
// incorrectly signed inputs, but we'll restore the result ourselves.
|
||||||
|
// This sets DIRTY, so any interruptor will save the state.
|
||||||
|
sio.div_udivisor.write(|w| unsafe { w.bits(divisor) });
|
||||||
|
// If we are interrupted here, the interruptor will have restored everything but the
|
||||||
|
// quotient may be wrongly signed. If the calculation started by the above writes is
|
||||||
|
// still ongoing it is stopped, so it won't replace the result we're restoring.
|
||||||
|
// DIRTY and READY set, but only DIRTY matters to make the interruptor save the state.
|
||||||
|
sio.div_remainder.write(|w| unsafe { w.bits(remainder) });
|
||||||
|
// State fully restored after the quotient write. This sets both DIRTY and READY, so
|
||||||
|
// whatever we may have interrupted can read the result.
|
||||||
|
sio.div_quotient.write(|w| unsafe { w.bits(quotient) });
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
fn save_divider<F, R>(f: F) -> R
|
||||||
|
where
|
||||||
|
F: FnOnce() -> R,
|
||||||
|
{
|
||||||
|
let sio = unsafe { &(*pac::SIO::ptr()) };
|
||||||
|
if !sio.div_csr.read().dirty().bit() {
|
||||||
|
// Not dirty, so nothing is waiting for the calculation. So we can just
|
||||||
|
// issue it directly without a save/restore.
|
||||||
|
f()
|
||||||
|
} else {
|
||||||
|
save_divider_and_call(f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
trait ROMDiv {
|
trait ROMDiv {
|
||||||
fn rom_div(self, b: Self) -> Self;
|
fn rom_div(self, b: Self) -> Self;
|
||||||
|
@ -9,14 +74,14 @@ trait ROMDiv {
|
||||||
impl ROMDiv for f32 {
|
impl ROMDiv for f32 {
|
||||||
fn rom_div(self, b: Self) -> Self {
|
fn rom_div(self, b: Self) -> Self {
|
||||||
// ROM implementation uses the hardware divider, so we have to save it
|
// ROM implementation uses the hardware divider, so we have to save it
|
||||||
save_divider(|_sio| rom_data::float_funcs::fdiv(self, b))
|
save_divider(|| rom_data::float_funcs::fdiv(self, b))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ROMDiv for f64 {
|
impl ROMDiv for f64 {
|
||||||
fn rom_div(self, b: Self) -> Self {
|
fn rom_div(self, b: Self) -> Self {
|
||||||
// ROM implementation uses the hardware divider, so we have to save it
|
// ROM implementation uses the hardware divider, so we have to save it
|
||||||
save_divider(|_sio| rom_data::double_funcs::ddiv(self, b))
|
save_divider(|| rom_data::double_funcs::ddiv(self, b))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -13,9 +13,10 @@ macro_rules! intrinsics_aliases {
|
||||||
$($rest:ident)*
|
$($rest:ident)*
|
||||||
) => {
|
) => {
|
||||||
#[cfg(all(target_arch = "arm", not(feature = "disable-intrinsics")))]
|
#[cfg(all(target_arch = "arm", not(feature = "disable-intrinsics")))]
|
||||||
intrinsics! {
|
mod $alias {
|
||||||
extern $abi fn $alias( $($argname: $ty),* ) -> $ret {
|
#[no_mangle]
|
||||||
$name($($argname),*)
|
pub extern $abi fn $alias( $($argname: $ty),* ) -> $ret {
|
||||||
|
super::$name($($argname),*)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -31,9 +32,10 @@ macro_rules! intrinsics_aliases {
|
||||||
$($rest:ident)*
|
$($rest:ident)*
|
||||||
) => {
|
) => {
|
||||||
#[cfg(all(target_arch = "arm", not(feature = "disable-intrinsics")))]
|
#[cfg(all(target_arch = "arm", not(feature = "disable-intrinsics")))]
|
||||||
intrinsics! {
|
mod $alias {
|
||||||
|
#[no_mangle]
|
||||||
unsafe extern $abi fn $alias( $($argname: $ty),* ) -> $ret {
|
unsafe extern $abi fn $alias( $($argname: $ty),* ) -> $ret {
|
||||||
$name($($argname),*)
|
super::$name($($argname),*)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -43,10 +43,10 @@ pub struct HwDivider {
|
||||||
|
|
||||||
/// Result of divide/modulo operation
|
/// Result of divide/modulo operation
|
||||||
pub struct DivResult<T> {
|
pub struct DivResult<T> {
|
||||||
/// The remainder of divide/modulo operation
|
|
||||||
pub remainder: T,
|
|
||||||
/// The quotient of divide/modulo operation
|
/// The quotient of divide/modulo operation
|
||||||
pub quotient: T,
|
pub quotient: T,
|
||||||
|
/// The remainder of divide/modulo operation
|
||||||
|
pub remainder: T,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Struct containing ownership markers for managing ownership of the SIO registers.
|
/// Struct containing ownership markers for managing ownership of the SIO registers.
|
||||||
|
@ -171,107 +171,159 @@ impl SioFifo {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn save_divider<F, R>(f: F) -> R
|
// This takes advantage of how AAPCS defines a 64-bit return on 32-bit registers
|
||||||
where
|
// by packing it into r0[0:31] and r1[32:63]. So all we need to do is put
|
||||||
F: FnOnce(&pac::sio::RegisterBlock) -> R,
|
// the remainder in the high order 32 bits of a 64 bit result. We can also
|
||||||
{
|
// alias the division operators to these for a similar reason: r0 is the
|
||||||
let sio = unsafe { &(*pac::SIO::ptr()) };
|
// result either way and r1 a scratch register, so the caller can't assume it
|
||||||
if !sio.div_csr.read().dirty().bit() {
|
// retains the argument value.
|
||||||
// Not dirty, so nothing is waiting for the calculation. So we can just
|
#[cfg(target_arch = "arm")]
|
||||||
// issue it directly without a save/restore.
|
core::arch::global_asm!(
|
||||||
f(sio)
|
".macro hwdivider_head",
|
||||||
} else {
|
"ldr r2, =(0xd0000000)", // SIO_BASE
|
||||||
// Since we can't save the signed-ness of the calculation, we have to make
|
// Check the DIRTY state of the divider by shifting it into the C
|
||||||
// sure that there's at least an 8 cycle delay before we read the result.
|
// status bit.
|
||||||
// The Pico SDK ensures this by using a 6 cycle push and two 1 cycle reads.
|
"ldr r3, [r2, #0x078]", // DIV_CSR
|
||||||
// Since we can't be sure the Rust implementation will optimize to the same,
|
"lsrs r3, #2", // DIRTY = 1, so shift 2 down
|
||||||
// just use an explicit wait.
|
// We only need to save the state when DIRTY, otherwise we can just do the
|
||||||
while !sio.div_csr.read().ready().bit() {}
|
// division directly.
|
||||||
|
"bcs 2f",
|
||||||
|
"1:",
|
||||||
|
// Do the actual division now, we're either not DIRTY, or we've saved the
|
||||||
|
// state and branched back here so it's safe now.
|
||||||
|
".endm",
|
||||||
|
".macro hwdivider_tail",
|
||||||
|
// 8 cycle delay to wait for the result. Each branch takes two cycles
|
||||||
|
// and fits into a 2-byte Thumb instruction, so this is smaller than
|
||||||
|
// 8 NOPs.
|
||||||
|
"b 3f",
|
||||||
|
"3: b 3f",
|
||||||
|
"3: b 3f",
|
||||||
|
"3: b 3f",
|
||||||
|
"3:",
|
||||||
|
// Read the quotient last, since that's what clears the dirty flag.
|
||||||
|
"ldr r1, [r2, #0x074]", // DIV_REMAINDER
|
||||||
|
"ldr r0, [r2, #0x070]", // DIV_QUOTIENT
|
||||||
|
// Either return to the caller or back to the state restore.
|
||||||
|
"bx lr",
|
||||||
|
"2:",
|
||||||
|
// Since we can't save the signed-ness of the calculation, we have to make
|
||||||
|
// sure that there's at least an 8 cycle delay before we read the result.
|
||||||
|
// The push takes 5 cycles, and we've already spent at least 7 checking
|
||||||
|
// the DIRTY state to get here.
|
||||||
|
"push {{r4-r6, lr}}",
|
||||||
|
// Read the quotient last, since that's what clears the dirty flag.
|
||||||
|
"ldr r3, [r2, #0x060]", // DIV_UDIVIDEND
|
||||||
|
"ldr r4, [r2, #0x064]", // DIV_UDIVISOR
|
||||||
|
"ldr r5, [r2, #0x074]", // DIV_REMAINDER
|
||||||
|
"ldr r6, [r2, #0x070]", // DIV_QUOTIENT
|
||||||
|
// If we get interrupted here (before a write sets the DIRTY flag) it's
|
||||||
|
// fine, since we have the full state, so the interruptor doesn't have to
|
||||||
|
// restore it. Once the write happens and the DIRTY flag is set, the
|
||||||
|
// interruptor becomes responsible for restoring our state.
|
||||||
|
"bl 1b",
|
||||||
|
// If we are interrupted here, then the interruptor will start an incorrect
|
||||||
|
// calculation using a wrong divisor, but we'll restore the divisor and
|
||||||
|
// result ourselves correctly. This sets DIRTY, so any interruptor will
|
||||||
|
// save the state.
|
||||||
|
"str r3, [r2, #0x060]", // DIV_UDIVIDEND
|
||||||
|
// If we are interrupted here, the interruptor may start the
|
||||||
|
// calculation using incorrectly signed inputs, but we'll restore the
|
||||||
|
// result ourselves. This sets DIRTY, so any interruptor will save the
|
||||||
|
// state.
|
||||||
|
"str r4, [r2, #0x064]", // DIV_UDIVISOR
|
||||||
|
// If we are interrupted here, the interruptor will have restored
|
||||||
|
// everything but the quotient may be wrongly signed. If the calculation
|
||||||
|
// started by the above writes is still ongoing it is stopped, so it won't
|
||||||
|
// replace the result we're restoring. DIRTY and READY set, but only
|
||||||
|
// DIRTY matters to make the interruptor save the state.
|
||||||
|
"str r5, [r2, #0x074]", // DIV_REMAINDER
|
||||||
|
// State fully restored after the quotient write. This sets both DIRTY
|
||||||
|
// and READY, so whatever we may have interrupted can read the result.
|
||||||
|
"str r6, [r2, #0x070]", // DIV_QUOTIENT
|
||||||
|
"pop {{r4-r6, pc}}",
|
||||||
|
".endm",
|
||||||
|
);
|
||||||
|
|
||||||
// Read the quotient last, since that's what clears the dirty flag
|
macro_rules! division_function {
|
||||||
let dividend = sio.div_udividend.read().bits();
|
(
|
||||||
let divisor = sio.div_udivisor.read().bits();
|
$name:ident $($intrinsic:ident)* ( $argty:ty ) {
|
||||||
let remainder = sio.div_remainder.read().bits();
|
$($begin:literal),+
|
||||||
let quotient = sio.div_quotient.read().bits();
|
}
|
||||||
|
) => {
|
||||||
|
#[cfg(all(target_arch = "arm", not(feature = "disable-intrinsics")))]
|
||||||
|
core::arch::global_asm!(
|
||||||
|
// Mangle the name slightly, since this is a global symbol.
|
||||||
|
concat!(".global _rphal_", stringify!($name)),
|
||||||
|
concat!(".type _rphal_", stringify!($name), ", %function"),
|
||||||
|
".align 2",
|
||||||
|
concat!("_rphal_", stringify!($name), ":"),
|
||||||
|
$(
|
||||||
|
concat!(".global ", stringify!($intrinsic)),
|
||||||
|
concat!(".type ", stringify!($intrinsic), ", %function"),
|
||||||
|
concat!(stringify!($intrinsic), ":"),
|
||||||
|
)*
|
||||||
|
|
||||||
// If we get interrupted here (before a write sets the DIRTY flag) its fine, since
|
"hwdivider_head",
|
||||||
// we have the full state, so the interruptor doesn't have to restore it. Once the
|
$($begin),+ ,
|
||||||
// write happens and the DIRTY flag is set, the interruptor becomes responsible for
|
"hwdivider_tail",
|
||||||
// restoring our state.
|
);
|
||||||
let result = f(sio);
|
|
||||||
|
|
||||||
// If we are interrupted here, then the interruptor will start an incorrect calculation
|
#[cfg(all(target_arch = "arm", feature = "disable-intrinsics"))]
|
||||||
// using a wrong divisor, but we'll restore the divisor and result ourselves correctly.
|
core::arch::global_asm!(
|
||||||
// This sets DIRTY, so any interruptor will save the state.
|
// Mangle the name slightly, since this is a global symbol.
|
||||||
sio.div_udividend.write(|w| unsafe { w.bits(dividend) });
|
concat!(".global _rphal_", stringify!($name)),
|
||||||
// If we are interrupted here, the the interruptor may start the calculation using
|
concat!(".type _rphal_", stringify!($name), ", %function"),
|
||||||
// incorrectly signed inputs, but we'll restore the result ourselves.
|
".align 2",
|
||||||
// This sets DIRTY, so any interruptor will save the state.
|
concat!("_rphal_", stringify!($name), ":"),
|
||||||
sio.div_udivisor.write(|w| unsafe { w.bits(divisor) });
|
|
||||||
// If we are interrupted here, the interruptor will have restored everything but the
|
|
||||||
// quotient may be wrongly signed. If the calculation started by the above writes is
|
|
||||||
// still ongoing it is stopped, so it won't replace the result we're restoring.
|
|
||||||
// DIRTY and READY set, but only DIRTY matters to make the interruptor save the state.
|
|
||||||
sio.div_remainder.write(|w| unsafe { w.bits(remainder) });
|
|
||||||
// State fully restored after the quotient write. This sets both DIRTY and READY, so
|
|
||||||
// whatever we may have interrupted can read the result.
|
|
||||||
sio.div_quotient.write(|w| unsafe { w.bits(quotient) });
|
|
||||||
|
|
||||||
result
|
"hwdivider_head",
|
||||||
|
$($begin),+ ,
|
||||||
|
"hwdivider_tail",
|
||||||
|
);
|
||||||
|
|
||||||
|
#[cfg(target_arch = "arm")]
|
||||||
|
extern "aapcs" {
|
||||||
|
// Connect a local name to global symbol above through FFI.
|
||||||
|
#[link_name = concat!("_rphal_", stringify!($name)) ]
|
||||||
|
fn $name(n: $argty, d: $argty) -> u64;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(target_arch = "arm"))]
|
||||||
|
#[allow(unused_variables)]
|
||||||
|
unsafe fn $name(n: $argty, d: $argty) -> u64 { 0 }
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
division_function! {
|
||||||
|
unsigned_divmod __aeabi_uidivmod __aeabi_uidiv ( u32 ) {
|
||||||
|
"str r0, [r2, #0x060]", // DIV_UDIVIDEND
|
||||||
|
"str r1, [r2, #0x064]" // DIV_UDIVISOR
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Don't use cortex_m::asm::delay(8) because that ends up delaying 15 cycles
|
division_function! {
|
||||||
// on Cortex-M0. Each iteration of the inner loop is 3 cycles and it adds
|
signed_divmod __aeabi_idivmod __aeabi_idiv ( i32 ) {
|
||||||
// one extra iteration.
|
"str r0, [r2, #0x068]", // DIV_SDIVIDEND
|
||||||
#[inline(always)]
|
"str r1, [r2, #0x06c]" // DIV_SDIVISOR
|
||||||
fn divider_delay() {
|
}
|
||||||
cortex_m::asm::nop();
|
|
||||||
cortex_m::asm::nop();
|
|
||||||
cortex_m::asm::nop();
|
|
||||||
cortex_m::asm::nop();
|
|
||||||
cortex_m::asm::nop();
|
|
||||||
cortex_m::asm::nop();
|
|
||||||
cortex_m::asm::nop();
|
|
||||||
cortex_m::asm::nop();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn divider_unsigned(dividend: u32, divisor: u32) -> DivResult<u32> {
|
fn divider_unsigned(n: u32, d: u32) -> DivResult<u32> {
|
||||||
save_divider(|sio| {
|
let packed = unsafe { unsigned_divmod(n, d) };
|
||||||
sio.div_udividend.write(|w| unsafe { w.bits(dividend) });
|
DivResult {
|
||||||
sio.div_udivisor.write(|w| unsafe { w.bits(divisor) });
|
quotient: packed as u32,
|
||||||
|
remainder: (packed >> 32) as u32,
|
||||||
divider_delay();
|
}
|
||||||
|
|
||||||
// Note: quotient must be read last
|
|
||||||
let remainder = sio.div_remainder.read().bits();
|
|
||||||
let quotient = sio.div_quotient.read().bits();
|
|
||||||
|
|
||||||
DivResult {
|
|
||||||
remainder,
|
|
||||||
quotient,
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn divider_signed(dividend: i32, divisor: i32) -> DivResult<i32> {
|
fn divider_signed(n: i32, d: i32) -> DivResult<i32> {
|
||||||
save_divider(|sio| {
|
let packed = unsafe { signed_divmod(n, d) };
|
||||||
sio.div_sdividend
|
// Double casts to avoid sign extension
|
||||||
.write(|w| unsafe { w.bits(dividend as u32) });
|
DivResult {
|
||||||
sio.div_sdivisor
|
quotient: packed as u32 as i32,
|
||||||
.write(|w| unsafe { w.bits(divisor as u32) });
|
remainder: (packed >> 32) as u32 as i32,
|
||||||
|
}
|
||||||
divider_delay();
|
|
||||||
|
|
||||||
// Note: quotient must be read last
|
|
||||||
let remainder = sio.div_remainder.read().bits() as i32;
|
|
||||||
let quotient = sio.div_quotient.read().bits() as i32;
|
|
||||||
|
|
||||||
DivResult {
|
|
||||||
remainder,
|
|
||||||
quotient,
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl HwDivider {
|
impl HwDivider {
|
||||||
|
@ -287,7 +339,6 @@ impl HwDivider {
|
||||||
}
|
}
|
||||||
|
|
||||||
intrinsics! {
|
intrinsics! {
|
||||||
#[aeabi = __aeabi_uidiv]
|
|
||||||
extern "C" fn __udivsi3(n: u32, d: u32) -> u32 {
|
extern "C" fn __udivsi3(n: u32, d: u32) -> u32 {
|
||||||
divider_unsigned(n, d).quotient
|
divider_unsigned(n, d).quotient
|
||||||
}
|
}
|
||||||
|
@ -304,7 +355,6 @@ intrinsics! {
|
||||||
quo_rem.quotient
|
quo_rem.quotient
|
||||||
}
|
}
|
||||||
|
|
||||||
#[aeabi = __aeabi_idiv]
|
|
||||||
extern "C" fn __divsi3(n: i32, d: i32) -> i32 {
|
extern "C" fn __divsi3(n: i32, d: i32) -> i32 {
|
||||||
divider_signed(n, d).quotient
|
divider_signed(n, d).quotient
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue