summaryrefslogtreecommitdiff
path: root/libc
diff options
context:
space:
mode:
authorSalvatore Cro <salvatore.cro@st.com>2010-09-09 16:10:21 +0200
committerCarmelo Amoroso <carmelo.amoroso@st.com>2010-09-15 12:51:04 +0200
commita27dd6924e7964d92b49f4d5ebe2e68cfb2742dd (patch)
tree627befce36a43bffdb02bacca575a2c01256fd7a /libc
parent599c74a4d7e9bbe68b946d65aef2725821ea3fe9 (diff)
sh: update the memcpy adding a new loop with aggressive prefetching
After exploring different combinations of prefetch distance and degree, this update of the memcpy function adds a new loop that moves many cache lines using an aggressive prefetching scheme. Prefetching has been removed when moving only a few cache-line-aligned blocks. As a result, this memcpy gives the same performance as before for small sizes and better numbers for big copies. On the SH4-300 CPU series, benchmarks show a gain of ~20% for sizes from 4KiB to 256KiB. On the SH4-200, there is a gain of ~40% for sizes bigger than 32KiB. Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com> Signed-off-by: Carmelo Amoroso <carmelo.amoroso@st.com>
Diffstat (limited to 'libc')
-rw-r--r--libc/string/sh/sh4/memcpy.S128
1 files changed, 107 insertions, 21 deletions
diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S
index 252ef36eb..5be770a59 100644
--- a/libc/string/sh/sh4/memcpy.S
+++ b/libc/string/sh/sh4/memcpy.S
@@ -28,13 +28,20 @@
* Currenlty it has been only implemented and tested for little endian mode. */
.macro FPU_SET_PAIRED_PREC
sts fpscr, r7
- mov #0x10, r6 ! PR=0 SZ=1
- shll16 r6
- lds r6, fpscr
+ mov #0x10, r0 ! PR=0 SZ=1
+ shll16 r0
+ lds r0, fpscr
.endm
.macro RESTORE_FPSCR
lds r7, fpscr
.endm
+.macro DALLOC
+ ! Cache allocate + store on dst-32.
+ add #-32, r1
+ movca.l r0, @r1
+ add #32, r1
+.endm
+
#endif
!
@@ -471,30 +478,111 @@ ENTRY(memcpy)
add r0, r5
mov r0, r1
- add #-0x1c, r5
- mov r5, r0
+ mov r1, r3 ! MT
+ sub r2, r3 ! EX (r3 - r2 -> r3)
+ mov #-5, r0
+ shld r0, r3 ! number of the cache lines
+ mov #8, r0
+ cmp/ge r0, r3 ! Check if there are many cache lines to copy.
+ bf 45f ! Copy cache line aligned blocks without pref.
+ mov r5, r0
+ add #-0x7c, r0
tst #7, r0 ! src is 8byte aligned
- mov r5, r3
+ bf 45f
+
+ ! Many cache lines have to be copied and the buffers are well aligned.
+ ! Aggressive prefetching and FPU in single paired precision.
+ mov r0, r5
+ mov r5, r6
+ add #-0x80, r6 ! prefetch head
- add #-64, r3 ! To pefetch head
- bt/s 3f
+ FPU_SET_PAIRED_PREC
- pref @r3
+ mov #4, r0
+67:
+ add #-0x20, r6
+ pref @r6
+ add #-0x20, r6
+ pref @r6
+
+ fmov @r5+, dr0
+ fmov @r5+, dr2
+ fmov @r5+, dr4
+ fmov @r5+, dr6
+ fmov @r5+, dr8
+ fmov @r5+, dr10
+ fmov @r5+, dr12
+ fmov @r5+, dr14
+ fmov @r5+, xd0
+ fmov @r5+, xd2
+ fmov @r5+, xd4
+ fmov @r5+, xd6
+ fmov @r5+, xd8
+ fmov @r5+, xd10
+ fmov @r5+, xd12
+ fmov @r5+, xd14
+
+ DALLOC
+ fmov xd14, @-r1
+ fmov xd12, @-r1
+ fmov xd10, @-r1
+ fmov xd8, @-r1
+ DALLOC
+ fmov xd6, @-r1
+ fmov xd4, @-r1
+ fmov xd2, @-r1
+ fmov xd0, @-r1
+ DALLOC
+ fmov dr14, @-r1
+ fmov dr12, @-r1
+ fmov dr10, @-r1
+ fmov dr8, @-r1
+ DALLOC
+ fmov dr6, @-r1
+ add #-0x80, r5
+ fmov dr4, @-r1
+ add #-0x80, r5
+ fmov dr2, @-r1
+ add #-0x20, r6
+ fmov dr0, @-r1
+ add #-4, r3
+ pref @r6
+ add #-0x20, r6
+ cmp/ge r0, r3
+ bt/s 67b
+ pref @r6
+
+ ! Other cache lines could be copied: so use the FPU in single paired
+ ! precision without prefetching. No check for alignment is necessary.
+
+ mov #1, r0
+ cmp/ge r0, r3
+ bt/s 4f
+ add #0x60, r5
+
+ RESTORE_FPSCR
+
+ bra 5f
+ nop
+
+ ! No prefetch and FPU in single precision.
+45:
+ add #-0x1c, r5
+ mov r5, r0
+ tst #7, r0
+ bt 3f
2: fmov.s @r5+, fr0
- mov r1, r6
fmov.s @r5+, fr1
- add #-32, r6
fmov.s @r5+, fr2
fmov.s @r5+, fr3
fmov.s @r5+, fr4
fmov.s @r5+, fr5
fmov.s @r5+, fr6
fmov.s @r5+, fr7
- add #-0x40, r5
- movca.l r0, @r6 ! Cache allocate + store on dst-32.
+ DALLOC
fmov.s fr7, @-r1
fmov.s fr6, @-r1
@@ -505,35 +593,33 @@ ENTRY(memcpy)
fmov.s fr1, @-r1
fmov.s fr0, @-r1
- add #-32, r3
cmp/eq r2,r1
bf/s 2b
- pref @r3 ! Prefetch the next cache line.
+ add #-0x40, r5
bra 5f
+ nop
+
+ ! No prefetch and FPU in single paired precision.
3: FPU_SET_PAIRED_PREC
4: fmov @r5+, dr0
- mov r1, r6
fmov @r5+, dr2
- add #-32, r6
fmov @r5+, dr4
fmov @r5+, dr6
- add #-0x40, r5
- movca.l r0, @r6
+ DALLOC
fmov dr6, @-r1
fmov dr4, @-r1
fmov dr2, @-r1
fmov dr0, @-r1
- add #-32, r3
cmp/eq r2,r1
bf/s 4b
- pref @r3
+ add #-0x40, r5
RESTORE_FPSCR