diff options
author | Salvatore Cro <salvatore.cro@st.com> | 2010-09-09 16:08:54 +0200 |
---|---|---|
committer | Carmelo Amoroso <carmelo.amoroso@st.com> | 2010-09-15 12:42:09 +0200 |
commit | 599c74a4d7e9bbe68b946d65aef2725821ea3fe9 (patch) | |
tree | 1eed7e5868ace26b9a08910fb36deaf95b513f99 | |
parent | 4b88e6e858b55def2ef0392278ddf81835f2ac45 (diff) |
sh: move data without fetching cache block within the memset
With this patch the movca.l instruction is used within the memset.
The current memset implementation only uses the FPU and there is
a real gain for all sizes.
With the movca.l instruction added, the numbers are always better than the generic code.
There is a big gain for sizes greater than 64 KiB, but the numbers are worse for 4-32 KiB
sizes compared with the implementation without movca.l.
Time Memory Bandwidth (Mbytes)

Size | Generic | SH4 (FPU) | SH4 (FPU+movca.l) |
---|---|---|---|
512 | 1143 | 1998 | 1596 |
1 KiB | 1273 | 2567 | 1915 |
2 KiB | 1350 | 2993 | 2128 |
4-32KiB | 1391 | 3262 | 2252 |
64KiB-16MiB | 170 | 186 | *830* |

Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: Carmelo Amoroso <carmelo.amoroso@st.com>
-rw-r--r-- | libc/string/sh/sh4/memset.S | 62 |
1 files changed, 34 insertions, 28 deletions
diff --git a/libc/string/sh/sh4/memset.S b/libc/string/sh/sh4/memset.S index 83f874612..eb83355ce 100644 --- a/libc/string/sh/sh4/memset.S +++ b/libc/string/sh/sh4/memset.S @@ -5,7 +5,7 @@ * Copyright (C) 1999 Niibe Yutaka * * Copyright (c) 2009 STMicroelectronics Ltd - * Optimised using 64bit data transfer via FPU + * Optimised using 64bit data transfer (via FPU) and the movca.l inst. * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> * * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. @@ -24,9 +24,9 @@ * Currenlty it has been only implemented and tested for little endian mode. */ .macro FPU_SET_PAIRED_PREC sts fpscr, r3 - mov #0x10, r0 ! PR=0 SZ=1 - shll16 r0 - lds r0, fpscr + mov #0x10, r1 ! PR=0 SZ=1 + shll16 r1 + lds r1, fpscr .endm .macro RESTORE_FPSCR lds r3, fpscr @@ -34,12 +34,10 @@ #endif ENTRY(memset) - tst r6,r6 - bt/s 5f ! if n=0, do nothing - add r6,r4 mov #12,r0 + add r6,r4 cmp/gt r6,r0 - bt/s 4f ! if it's too small, set a byte at once + bt/s 40f ! if it's too small, set a byte at once mov r4,r0 and #3,r0 cmp/eq #0,r0 @@ -56,7 +54,7 @@ ENTRY(memset) swap.w r5,r0 ! VV00 or r0,r5 ! VVVV - ! Enough bytes need to be copied + ! Check if enough bytes need to be copied to be worth the big loop mov #0x40, r0 ! (MT) cmp/gt r6,r0 ! (MT) 64 > len => slow loop @@ -84,6 +82,9 @@ ENTRY(memset) mov #-5,r0 shld r0,r2 ! number of loops + add #-32, r4 + mov r5, r0 + #ifdef MEMSET_USES_FPU lds r5, fpul ! (CO) fsts fpul, fr0 ! 
Dr0 will be 'VVVVVVVV' @@ -91,36 +92,40 @@ ENTRY(memset) FPU_SET_PAIRED_PREC 12: - add #-0x20, r6 !(MT) + movca.l r0, @r4 + mov.l r5, @(4, r4) + add #32, r4 fmov dr0, @-r4 fmov dr0, @-r4 + add #-0x20, r6 fmov dr0, @-r4 dt r2 - bf/s 12b !(BR) - fmov dr0, @-r4 + bf/s 12b + add #-40, r4 RESTORE_FPSCR #else 12: - mov.l r5,@-r4 - mov.l r5,@-r4 - mov.l r5,@-r4 - mov.l r5,@-r4 - mov.l r5,@-r4 - mov.l r5,@-r4 + movca.l r0,@r4 + mov.l r5,@(4, r4) + mov.l r5,@(8, r4) + mov.l r5,@(12,r4) + mov.l r5,@(16,r4) + mov.l r5,@(20,r4) add #-0x20, r6 - mov.l r5,@-r4 + mov.l r5,@(24,r4) dt r2 + mov.l r5,@(28,r4) bf/s 12b - mov.l r5,@-r4 -#endif - tst r6,r6 - bt/s 5f - mov #8, r0 + add #-32, r4 +#endif + add #32, r4 + mov #8, r0 cmp/ge r0, r6 - bf/s 4f - mov r6,r0 + bf 40f + + mov r6,r0 22: shlr2 r0 shlr r0 ! r0 = r6 >> 3 @@ -132,9 +137,10 @@ ENTRY(memset) ! mov #7,r0 and r0,r6 - tst r6,r6 + + ! fill bytes (length may be zero) +40: tst r6,r6 bt 5f - ! fill bytes 4: dt r6 bf/s 4b |