diff options
author | Austin Foxley <austinf@cetoncorp.com> | 2009-11-22 12:17:38 -0800 |
---|---|---|
committer | Austin Foxley <austinf@cetoncorp.com> | 2009-11-22 12:17:38 -0800 |
commit | 5c9ef58ec4bcb2def9e30f0b156f9cfcb1d0d163 (patch) | |
tree | f8f889678b653d5275c285a037b9b43f27a91192 /libc/string/sh/sh4 | |
parent | f757db2d319ccc5f7034165046fb2bb58901afb1 (diff) |
sh: Add new optimisation to the SH4 memcpy
This optimization is based on prefetching and 64bit data transfer via FPU
(only for the little endianess)
Tests shows that:
----------------------------------------
Memory bandwidth | Gain
| sh4-300 | sh4-200
----------------------------------------
512 bytes to 16KiB | ~20% | ~25%
from 32KiB to 16MiB | ~190% | ~5%
----------------------------------------
Signed-off-by: Austin Foxley <austinf@cetoncorp.com>
Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: Carmelo Amoroso <carmelo.amoroso@st.com>
Diffstat (limited to 'libc/string/sh/sh4')
-rw-r--r-- | libc/string/sh/sh4/memcpy.S | 120 | ||||
-rw-r--r-- | libc/string/sh/sh4/memmove.c | 117 | ||||
-rw-r--r-- | libc/string/sh/sh4/memset.S | 146 | ||||
-rw-r--r-- | libc/string/sh/sh4/strcpy.S | 28 | ||||
-rw-r--r-- | libc/string/sh/sh4/strncpy.S | 43 |
5 files changed, 444 insertions, 10 deletions
diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S index 0954bce85..efdaf8bba 100644 --- a/libc/string/sh/sh4/memcpy.S +++ b/libc/string/sh/sh4/memcpy.S @@ -6,6 +6,9 @@ * Modified from memcpy.S and micro-optimised for SH4 * Stuart Menefy (stuart.menefy@st.com) * + * Copyright (c) 2009 STMicroelectronics Ltd + * Optimised using prefetching and 64bit data transfer via FPU + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> */ /* @@ -15,8 +18,25 @@ * If there is an overlap, then the results are undefined. */ +#include <sysdep.h> #include <endian.h> +#ifdef __LITTLE_ENDIAN__ +#define MEMCPY_USES_FPU +/* Use paired single precision load or store mode for 64-bit tranfering. + * FPSCR.SZ=1,FPSCR.SZ=0 is well defined on both SH4-200 and SH4-300. + * Currenlty it has been only implemented and tested for little endian mode. */ +.macro FPU_SET_PAIRED_PREC + sts fpscr, r7 + mov #0x10, r6 ! PR=0 SZ=1 + shll16 r6 + lds r6, fpscr +.endm +.macro RESTORE_FPSCR + lds r7, fpscr +.endm +#endif + ! ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. ! @@ -157,12 +177,7 @@ 9: rts nop -/* void * memcpy(void *dst, const void *src, size_t len) */ -.text -.align 4 -.type memcpy,@function -.globl memcpy; -memcpy: +ENTRY(memcpy) ! Calculate the invariants which will be used in the remainder ! of the code: @@ -189,9 +204,7 @@ memcpy: mov r4, r0 ! 5 MT (0 cycle latency) add r6, r0 ! 49 EX - mov #16, r1 ! 6 EX bt/s .Lcase00 ! 111 BR (aligned) - sub r4, r5 ! 75 EX ! Arguments are not nicely long word aligned or zero len. @@ -207,6 +220,7 @@ memcpy: ! However the penalty for getting it 'wrong' is much higher for long word ! aligned data (and this is more common), so use a value of 16. + mov #16, r1 ! 6 EX cmp/gt r6,r1 ! 56 MT add #-1,r5 ! 50 EX @@ -447,6 +461,92 @@ memcpy: mov.l r7, @-r0 ! 30 LS +#ifdef MEMCPY_USES_FPU + ! Copy the cache line aligned blocks by using the FPU registers. + ! If src and dst are well aligned adopt 64-bit data transfer. + ! We also need r0 as a temporary (for movca), so 'undo' the invariant: + ! r5: src (was r0+r5) + ! r1: dest (was r0) +1: + add r0, r5 + mov r0, r1 + + add #-0x1c, r5 + mov r5, r0 + + tst #7, r0 ! src is 8byte aligned + mov r5, r3 + + add #-64, r3 ! To pefetch head + bt/s 3f + + pref @r3 + +2: fmov.s @r5+, fr0 + mov r1, r6 + fmov.s @r5+, fr1 + add #-32, r6 + fmov.s @r5+, fr2 + fmov.s @r5+, fr3 + fmov.s @r5+, fr4 + fmov.s @r5+, fr5 + fmov.s @r5+, fr6 + fmov.s @r5+, fr7 + add #-0x40, r5 + + movca.l r0, @r6 ! Cache allocate + store on dst-32. + + fmov.s fr7, @-r1 + fmov.s fr6, @-r1 + fmov.s fr5, @-r1 + fmov.s fr4, @-r1 + fmov.s fr3, @-r1 + fmov.s fr2, @-r1 + fmov.s fr1, @-r1 + fmov.s fr0, @-r1 + + add #-32, r3 + cmp/eq r2,r1 + + bf/s 2b + pref @r3 ! Prefetch the next cache line. + + bra 5f + +3: FPU_SET_PAIRED_PREC + +4: fmov @r5+, dr0 + mov r1, r6 + fmov @r5+, dr2 + add #-32, r6 + fmov @r5+, dr4 + fmov @r5+, dr6 + add #-0x40, r5 + + movca.l r0, @r6 + + fmov dr6, @-r1 + fmov dr4, @-r1 + fmov dr2, @-r1 + fmov dr0, @-r1 + add #-32, r3 + cmp/eq r2,r1 + + bf/s 4b + pref @r3 + + RESTORE_FPSCR + +5: mov r1, r0 + + cmp/eq r4, r0 ! 54 MT + bf/s 1f ! 109 BR + sub r1, r5 ! 75 EX + + rts + nop +1: +#else ! Copy the cache line aligned blocks ! ! In use: r0, r2, r4, r5 @@ -512,6 +612,7 @@ memcpy: rts 1: mov.l @r15+, r8 ! 15 LS +#endif sub r4, r1 ! 75 EX (len remaining) ! number of trailing bytes is non-zero @@ -803,6 +904,5 @@ memcpy: rts mov.b r1,@-r0 -.size memcpy,.-memcpy; - +END(memcpy) libc_hidden_def (memcpy) diff --git a/libc/string/sh/sh4/memmove.c b/libc/string/sh/sh4/memmove.c new file mode 100644 index 000000000..4d52db2ca --- /dev/null +++ b/libc/string/sh/sh4/memmove.c @@ -0,0 +1,117 @@ +/* memmove implementation for SH4 + * + * Copyright (C) 2009 STMicroelectronics Ltd. + * + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +#include <string.h> + + +#define FPSCR_SR (1 << 20) +#define STORE_FPSCR(x) __asm__ volatile("sts fpscr, %0" : "=r"(x)) +#define LOAD_FPSCR(x) __asm__ volatile("lds %0, fpscr" : : "r"(x)) + +static void fpu_optimised_copy_fwd(void *dest, const void *src, size_t len) +{ + char *d = (char *)dest; + char *s = (char *)src; + + if (len >= 64) { + unsigned long fpscr; + int *s1; + int *d1; + + /* Align the dest to 4 byte boundary. */ + while ((unsigned)d & 0x7) { + *d++ = *s++; + len--; + } + + s1 = (int *)s; + d1 = (int *)d; + + /* check if s is well aligned to use FPU */ + if (!((unsigned)s1 & 0x7)) { + + /* Align the dest to cache-line boundary */ + while ((unsigned)d1 & 0x1c) { + *d1++ = *s1++; + len -= 4; + } + + /* Use paired single precision load or store mode for + * 64-bit tranfering.*/ + STORE_FPSCR(fpscr); + LOAD_FPSCR(FPSCR_SR); + + while (len >= 32) { + __asm__ volatile ("fmov @%0+,dr0":"+r" (s1)); + __asm__ volatile ("fmov @%0+,dr2":"+r" (s1)); + __asm__ volatile ("fmov @%0+,dr4":"+r" (s1)); + __asm__ volatile ("fmov @%0+,dr6":"+r" (s1)); + __asm__ + volatile ("fmov dr0,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + volatile ("fmov dr2,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + volatile ("fmov dr4,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + volatile ("fmov dr6,@%0"::"r" + (d1):"memory"); + d1 += 2; + len -= 32; + } + LOAD_FPSCR(fpscr); + } + s = (char *)s1; + d = (char *)d1; + /*TODO: other subcases could be covered here?!?*/ + } + /* Go to per-byte copy */ + while (len > 0) { + *d++ = *s++; + len--; + } + return; +} + +void *memmove(void *dest, const void *src, size_t len) +{ + unsigned long int d = (long int)dest; + unsigned long int s = (long int)src; + unsigned long int res; + + if (d >= s) + res = d - s; + else + res = s - d; + /* + * 1) dest and src are not overlap ==> memcpy (BWD/FDW) + * 2) dest and src are 100% overlap ==> memcpy (BWD/FDW) + * 3) left-to-right overlap ==> Copy from the beginning to the end + * 4) right-to-left overlap ==> Copy from the end to the beginning + */ + + if (res == 0) /* 100% overlap */ + memcpy(dest, src, len); /* No overlap */ + else if (res >= len) + memcpy(dest, src, len); + else { + if (d > s) /* right-to-left overlap */ + memcpy(dest, src, len); /* memcpy is BWD */ + else /* cannot use SH4 memcpy for this case */ + fpu_optimised_copy_fwd(dest, src, len); + } + return (dest); +} + +libc_hidden_def(memmove) diff --git a/libc/string/sh/sh4/memset.S b/libc/string/sh/sh4/memset.S new file mode 100644 index 000000000..1a57cb969 --- /dev/null +++ b/libc/string/sh/sh4/memset.S @@ -0,0 +1,146 @@ +/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $ + * + * "memset" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + * Copyright (c) 2009 STMicroelectronics Ltd + * Optimised using 64bit data transfer via FPU + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +/* + * void *memset(void *s, int c, size_t n); + */ + +#include <sysdep.h> + +#ifdef __LITTLE_ENDIAN__ +#define MEMSET_USES_FPU +/* Use paired single precision load or store mode for 64-bit tranfering. + * FPSCR.SZ=1,FPSCR.SZ=0 is well defined on both SH4-200 and SH4-300. + * Currenlty it has been only implemented and tested for little endian mode. */ +.macro FPU_SET_PAIRED_PREC + sts fpscr, r3 + mov #0x10, r0 ! PR=0 SZ=1 + shll16 r0 + lds r0, fpscr +.endm +.macro RESTORE_FPSCR + lds r3, fpscr +.endm +#endif + +ENTRY(memset) + tst r6,r6 + bt/s 5f ! if n=0, do nothing + add r6,r4 + mov #12,r0 + cmp/gt r6,r0 + bt/s 4f ! if it's too small, set a byte at once + mov r4,r0 + and #3,r0 + cmp/eq #0,r0 + bt/s 2f ! It's aligned + sub r0,r6 +1: + dt r0 + bf/s 1b + mov.b r5,@-r4 +2: ! make VVVV + extu.b r5,r5 + swap.b r5,r0 ! V0 + or r0,r5 ! VV + swap.w r5,r0 ! VV00 + or r0,r5 ! VVVV + + ! Enough bytes need to be copied + mov #0x40, r0 ! (MT) + cmp/gt r6,r0 ! (MT) 64 > len => slow loop + + bt/s 22f + mov r6,r0 + + ! align the dst to the cache block size if necessary + mov r4, r3 + mov #~(0x1f), r1 + + and r3, r1 + cmp/eq r3, r1 + + bt/s 11f ! dst is already aligned + sub r1, r3 ! r3-r1 -> r3 + shlr2 r3 ! number of loops + +10: mov.l r5,@-r4 + dt r3 + bf/s 10b + add #-4, r6 + +11: ! dst is 32byte aligned + mov r6,r2 + mov #-5,r0 + shld r0,r2 ! number of loops + +#ifdef MEMSET_USES_FPU + lds r5, fpul ! (CO) + fsts fpul, fr0 ! Dr0 will be 'VVVVVVVV' + fsts fpul, fr1 + + FPU_SET_PAIRED_PREC +12: + add #-0x20, r6 !(MT) + fmov dr0, @-r4 + fmov dr0, @-r4 + fmov dr0, @-r4 + dt r2 + bf/s 12b !(BR) + fmov dr0, @-r4 + + RESTORE_FPSCR +#else +12: + mov.l r5,@-r4 + mov.l r5,@-r4 + mov.l r5,@-r4 + mov.l r5,@-r4 + mov.l r5,@-r4 + mov.l r5,@-r4 + add #-0x20, r6 + mov.l r5,@-r4 + dt r2 + bf/s 12b + mov.l r5,@-r4 +#endif + tst r6,r6 + bt/s 5f + mov #8, r0 + + cmp/ge r0, r6 + bf/s 4f + mov r6,r0 +22: + shlr2 r0 + shlr r0 ! r0 = r6 >> 3 +3: + dt r0 + mov.l r5,@-r4 ! set 8-byte at once + bf/s 3b + mov.l r5,@-r4 + ! + mov #7,r0 + and r0,r6 + tst r6,r6 + bt 5f + ! fill bytes +4: + dt r6 + bf/s 4b + mov.b r5,@-r4 +5: + rts + mov r4,r0 +END(memset) +libc_hidden_def (memset) diff --git a/libc/string/sh/sh4/strcpy.S b/libc/string/sh/sh4/strcpy.S new file mode 100644 index 000000000..0f8278017 --- /dev/null +++ b/libc/string/sh/sh4/strcpy.S @@ -0,0 +1,28 @@ +/* strcpy implementation for SUPERH + * + * Copyright (C) 2009 STMicroelectronics Ltd. + * + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +/* + char *strcpy(char *dest, const char *src); + */ + +#include <sysdep.h> + +ENTRY(strcpy) + mov r4,r2 +1: + mov.b @r5+,r1 + tst r1,r1 + mov.b r1,@r2 + bf/s 1b + add #1,r2 + + rts + mov r4,r0 +END(strcpy) +libc_hidden_def (strcpy) diff --git a/libc/string/sh/sh4/strncpy.S b/libc/string/sh/sh4/strncpy.S new file mode 100644 index 000000000..8a16f39d4 --- /dev/null +++ b/libc/string/sh/sh4/strncpy.S @@ -0,0 +1,43 @@ +/* strncpy implementation for SUPERH + * + * Copyright (C) 2009 STMicroelectronics Ltd. + * + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +/* + char *strncpy(char *dest, const char *src, size_t n); + */ + +#include <sysdep.h> + +ENTRY(strncpy) + mov #0,r0 + bra 2f + mov r4,r2 +1: + mov.b r1,@(r0,r2) + add #1,r0 +2: + cmp/hs r6,r0 + bt 5f + mov.b @(r0,r5),r1 + tst r1,r1 + bf/s 1b + cmp/hs r6,r0 + bra 4f + nop +3: + mov.b r1,@(r0,r2) + add #1,r0 + cmp/hs r6,r0 +4: + bf/s 3b + mov #0,r1 +5: + rts + mov r2,r0 +END(strncpy) +libc_hidden_def(strncpy) |