summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGiuseppe Cavallaro <peppe.cavallaro@st.com>2009-12-14 16:45:49 +0100
committerCarmelo Amoroso <carmelo.amoroso@st.com>2009-12-14 17:03:34 +0100
commit8cd420c223f1a39c01835189669928ea8df9d221 (patch)
tree9698aabfd30235d568e823ecab91811688f1df94
parent6732cb1ae137d7af17eb911004ba904badba1b85 (diff)
sh: fix endianess and optimise the SH4 memcpy
This patch fixes the big-endian code and adds a new optimization only for little endian mode. This optimization is based on prefetching and 64bit data transfer via FPU. Tests shows that ---------------------------------------- Memory bandwidth | Gain | sh4-300 | sh4-200 ---------------------------------------- 512 bytes to 16KiB | ~20% | ~25% from 32KiB to 16MiB | ~190% | ~5% ---------------------------------------- Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com> Signed-off-by: Carmelo Amoroso <carmelo.amoroso@st.com>
-rw-r--r--libc/string/sh/sh4/memcpy.S109
1 files changed, 107 insertions, 2 deletions
diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S
index 0954bce85..c03c18c73 100644
--- a/libc/string/sh/sh4/memcpy.S
+++ b/libc/string/sh/sh4/memcpy.S
@@ -6,6 +6,9 @@
* Modified from memcpy.S and micro-optimised for SH4
* Stuart Menefy (stuart.menefy@st.com)
*
+ * Copyright (c) 2009 STMicroelectronics Ltd
+ * Optimised using prefetching and 64bit data transfer via FPU
+ * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
*/
/*
@@ -17,6 +20,22 @@
#include <endian.h>
+#ifdef __LITTLE_ENDIAN__
+#define MEMCPY_USES_FPU
+/* Use paired single precision load or store mode for 64-bit tranfering.
+ * FPSCR.SZ=1,FPSCR.SZ=0 is well defined on both SH4-200 and SH4-300.
+ * Currenlty it has been only implemented and tested for little endian mode. */
+.macro FPU_SET_PAIRED_PREC
+ sts fpscr, r7
+ mov #0x10, r6 ! PR=0 SZ=1
+ shll16 r6
+ lds r6, fpscr
+.endm
+.macro RESTORE_FPSCR
+ lds r7, fpscr
+.endm
+#endif
+
!
! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
!
@@ -189,9 +208,7 @@ memcpy:
mov r4, r0 ! 5 MT (0 cycle latency)
add r6, r0 ! 49 EX
- mov #16, r1 ! 6 EX
bt/s .Lcase00 ! 111 BR (aligned)
-
sub r4, r5 ! 75 EX
! Arguments are not nicely long word aligned or zero len.
@@ -207,6 +224,7 @@ memcpy:
! However the penalty for getting it 'wrong' is much higher for long word
! aligned data (and this is more common), so use a value of 16.
+ mov #16, r1 ! 6 EX
cmp/gt r6,r1 ! 56 MT
add #-1,r5 ! 50 EX
@@ -447,6 +465,92 @@ memcpy:
mov.l r7, @-r0 ! 30 LS
+#ifdef MEMCPY_USES_FPU
+ ! Copy the cache line aligned blocks by using the FPU registers.
+ ! If src and dst are well aligned adopt 64-bit data transfer.
+ ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+ ! r5: src (was r0+r5)
+ ! r1: dest (was r0)
+1:
+ add r0, r5
+ mov r0, r1
+
+ add #-0x1c, r5
+ mov r5, r0
+
+ tst #7, r0 ! src is 8byte aligned
+ mov r5, r3
+
+ add #-64, r3 ! To pefetch head
+ bt/s 3f
+
+ pref @r3
+
+2: fmov.s @r5+, fr0
+ mov r1, r6
+ fmov.s @r5+, fr1
+ add #-32, r6
+ fmov.s @r5+, fr2
+ fmov.s @r5+, fr3
+ fmov.s @r5+, fr4
+ fmov.s @r5+, fr5
+ fmov.s @r5+, fr6
+ fmov.s @r5+, fr7
+ add #-0x40, r5
+
+ movca.l r0, @r6 ! Cache allocate + store on dst-32.
+
+ fmov.s fr7, @-r1
+ fmov.s fr6, @-r1
+ fmov.s fr5, @-r1
+ fmov.s fr4, @-r1
+ fmov.s fr3, @-r1
+ fmov.s fr2, @-r1
+ fmov.s fr1, @-r1
+ fmov.s fr0, @-r1
+
+ add #-32, r3
+ cmp/eq r2,r1
+
+ bf/s 2b
+ pref @r3 ! Prefetch the next cache line.
+
+ bra 5f
+
+3: FPU_SET_PAIRED_PREC
+
+4: fmov @r5+, dr0
+ mov r1, r6
+ fmov @r5+, dr2
+ add #-32, r6
+ fmov @r5+, dr4
+ fmov @r5+, dr6
+ add #-0x40, r5
+
+ movca.l r0, @r6
+
+ fmov dr6, @-r1
+ fmov dr4, @-r1
+ fmov dr2, @-r1
+ fmov dr0, @-r1
+ add #-32, r3
+ cmp/eq r2,r1
+
+ bf/s 4b
+ pref @r3
+
+ RESTORE_FPSCR
+
+5: mov r1, r0
+
+ cmp/eq r4, r0 ! 54 MT
+ bf/s 1f ! 109 BR
+ sub r1, r5 ! 75 EX
+
+ rts
+ nop
+1:
+#else
! Copy the cache line aligned blocks
!
! In use: r0, r2, r4, r5
@@ -512,6 +616,7 @@ memcpy:
rts
1: mov.l @r15+, r8 ! 15 LS
+#endif
sub r4, r1 ! 75 EX (len remaining)
! number of trailing bytes is non-zero