From 5c9ef58ec4bcb2def9e30f0b156f9cfcb1d0d163 Mon Sep 17 00:00:00 2001 From: Austin Foxley Date: Sun, 22 Nov 2009 12:17:38 -0800 Subject: sh: Add new optimisation to the SH4 memcpy This optimization is based on prefetching and 64bit data transfer via FPU (only for the little endianess) Tests shows that: ---------------------------------------- Memory bandwidth | Gain | sh4-300 | sh4-200 ---------------------------------------- 512 bytes to 16KiB | ~20% | ~25% from 32KiB to 16MiB | ~190% | ~5% ---------------------------------------- Signed-off-by: Austin Foxley Signed-off-by: Giuseppe Cavallaro Signed-off-by: Carmelo Amoroso --- libc/string/sh/sh4/memmove.c | 117 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 libc/string/sh/sh4/memmove.c (limited to 'libc/string/sh/sh4/memmove.c') diff --git a/libc/string/sh/sh4/memmove.c b/libc/string/sh/sh4/memmove.c new file mode 100644 index 000000000..4d52db2ca --- /dev/null +++ b/libc/string/sh/sh4/memmove.c @@ -0,0 +1,117 @@ +/* memmove implementation for SH4 + * + * Copyright (C) 2009 STMicroelectronics Ltd. + * + * Author: Giuseppe Cavallaro + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +#include + + +#define FPSCR_SR (1 << 20) +#define STORE_FPSCR(x) __asm__ volatile("sts fpscr, %0" : "=r"(x)) +#define LOAD_FPSCR(x) __asm__ volatile("lds %0, fpscr" : : "r"(x)) + +static void fpu_optimised_copy_fwd(void *dest, const void *src, size_t len) +{ + char *d = (char *)dest; + char *s = (char *)src; + + if (len >= 64) { + unsigned long fpscr; + int *s1; + int *d1; + + /* Align the dest to 4 byte boundary. */ + while ((unsigned)d & 0x7) { + *d++ = *s++; + len--; + } + + s1 = (int *)s; + d1 = (int *)d; + + /* check if s is well aligned to use FPU */ + if (!((unsigned)s1 & 0x7)) { + + /* Align the dest to cache-line boundary */ + while ((unsigned)d1 & 0x1c) { + *d1++ = *s1++; + len -= 4; + } + + /* Use paired single precision load or store mode for + * 64-bit tranfering.*/ + STORE_FPSCR(fpscr); + LOAD_FPSCR(FPSCR_SR); + + while (len >= 32) { + __asm__ volatile ("fmov @%0+,dr0":"+r" (s1)); + __asm__ volatile ("fmov @%0+,dr2":"+r" (s1)); + __asm__ volatile ("fmov @%0+,dr4":"+r" (s1)); + __asm__ volatile ("fmov @%0+,dr6":"+r" (s1)); + __asm__ + volatile ("fmov dr0,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + volatile ("fmov dr2,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + volatile ("fmov dr4,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + volatile ("fmov dr6,@%0"::"r" + (d1):"memory"); + d1 += 2; + len -= 32; + } + LOAD_FPSCR(fpscr); + } + s = (char *)s1; + d = (char *)d1; + /*TODO: other subcases could be covered here?!?*/ + } + /* Go to per-byte copy */ + while (len > 0) { + *d++ = *s++; + len--; + } + return; +} + +void *memmove(void *dest, const void *src, size_t len) +{ + unsigned long int d = (long int)dest; + unsigned long int s = (long int)src; + unsigned long int res; + + if (d >= s) + res = d - s; + else + res = s - d; + /* + * 1) dest and src are not overlap ==> memcpy (BWD/FDW) + * 2) dest and src are 100% overlap ==> memcpy (BWD/FDW) + * 3) left-to-right overlap ==> Copy from the beginning to the end + * 4) right-to-left overlap ==> Copy from the end to the beginning + */ + + if (res == 0) /* 100% overlap */ + memcpy(dest, src, len); /* No overlap */ + else if (res >= len) + memcpy(dest, src, len); + else { + if (d > s) /* right-to-left overlap */ + memcpy(dest, src, len); /* memcpy is BWD */ + else /* cannot use SH4 memcpy for this case */ + fpu_optimised_copy_fwd(dest, src, len); + } + return (dest); +} + +libc_hidden_def(memmove) -- cgit v1.2.3