From 8cd420c223f1a39c01835189669928ea8df9d221 Mon Sep 17 00:00:00 2001 From: Giuseppe Cavallaro Date: Mon, 14 Dec 2009 16:45:49 +0100 Subject: sh: fix endianess and optimise the SH4 memcpy This patch fixes the big-endian code and adds a new optimization only for little endian mode. This optimization is based on prefetching and 64bit data transfer via FPU. Tests shows that ---------------------------------------- Memory bandwidth | Gain | sh4-300 | sh4-200 ---------------------------------------- 512 bytes to 16KiB | ~20% | ~25% from 32KiB to 16MiB | ~190% | ~5% ---------------------------------------- Signed-off-by: Giuseppe Cavallaro Signed-off-by: Carmelo Amoroso --- libc/string/sh/sh4/memcpy.S | 109 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 2 deletions(-) (limited to 'libc') diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S index 0954bce85..c03c18c73 100644 --- a/libc/string/sh/sh4/memcpy.S +++ b/libc/string/sh/sh4/memcpy.S @@ -6,6 +6,9 @@ * Modified from memcpy.S and micro-optimised for SH4 * Stuart Menefy (stuart.menefy@st.com) * + * Copyright (c) 2009 STMicroelectronics Ltd + * Optimised using prefetching and 64bit data transfer via FPU + * Author: Giuseppe Cavallaro */ /* @@ -17,6 +20,22 @@ #include +#ifdef __LITTLE_ENDIAN__ +#define MEMCPY_USES_FPU +/* Use paired single precision load or store mode for 64-bit tranfering. + * FPSCR.SZ=1,FPSCR.SZ=0 is well defined on both SH4-200 and SH4-300. + * Currenlty it has been only implemented and tested for little endian mode. */ +.macro FPU_SET_PAIRED_PREC + sts fpscr, r7 + mov #0x10, r6 ! PR=0 SZ=1 + shll16 r6 + lds r6, fpscr +.endm +.macro RESTORE_FPSCR + lds r7, fpscr +.endm +#endif + ! ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. ! @@ -189,9 +208,7 @@ memcpy: mov r4, r0 ! 5 MT (0 cycle latency) add r6, r0 ! 49 EX - mov #16, r1 ! 6 EX bt/s .Lcase00 ! 111 BR (aligned) - sub r4, r5 ! 75 EX ! Arguments are not nicely long word aligned or zero len. @@ -207,6 +224,7 @@ memcpy: ! However the penalty for getting it 'wrong' is much higher for long word ! aligned data (and this is more common), so use a value of 16. + mov #16, r1 ! 6 EX cmp/gt r6,r1 ! 56 MT add #-1,r5 ! 50 EX @@ -447,6 +465,92 @@ memcpy: mov.l r7, @-r0 ! 30 LS +#ifdef MEMCPY_USES_FPU + ! Copy the cache line aligned blocks by using the FPU registers. + ! If src and dst are well aligned adopt 64-bit data transfer. + ! We also need r0 as a temporary (for movca), so 'undo' the invariant: + ! r5: src (was r0+r5) + ! r1: dest (was r0) +1: + add r0, r5 + mov r0, r1 + + add #-0x1c, r5 + mov r5, r0 + + tst #7, r0 ! src is 8byte aligned + mov r5, r3 + + add #-64, r3 ! To pefetch head + bt/s 3f + + pref @r3 + +2: fmov.s @r5+, fr0 + mov r1, r6 + fmov.s @r5+, fr1 + add #-32, r6 + fmov.s @r5+, fr2 + fmov.s @r5+, fr3 + fmov.s @r5+, fr4 + fmov.s @r5+, fr5 + fmov.s @r5+, fr6 + fmov.s @r5+, fr7 + add #-0x40, r5 + + movca.l r0, @r6 ! Cache allocate + store on dst-32. + + fmov.s fr7, @-r1 + fmov.s fr6, @-r1 + fmov.s fr5, @-r1 + fmov.s fr4, @-r1 + fmov.s fr3, @-r1 + fmov.s fr2, @-r1 + fmov.s fr1, @-r1 + fmov.s fr0, @-r1 + + add #-32, r3 + cmp/eq r2,r1 + + bf/s 2b + pref @r3 ! Prefetch the next cache line. + + bra 5f + +3: FPU_SET_PAIRED_PREC + +4: fmov @r5+, dr0 + mov r1, r6 + fmov @r5+, dr2 + add #-32, r6 + fmov @r5+, dr4 + fmov @r5+, dr6 + add #-0x40, r5 + + movca.l r0, @r6 + + fmov dr6, @-r1 + fmov dr4, @-r1 + fmov dr2, @-r1 + fmov dr0, @-r1 + add #-32, r3 + cmp/eq r2,r1 + + bf/s 4b + pref @r3 + + RESTORE_FPSCR + +5: mov r1, r0 + + cmp/eq r4, r0 ! 54 MT + bf/s 1f ! 109 BR + sub r1, r5 ! 75 EX + + rts + nop +1: +#else ! Copy the cache line aligned blocks ! ! In use: r0, r2, r4, r5 @@ -512,6 +616,7 @@ memcpy: rts 1: mov.l @r15+, r8 ! 15 LS +#endif sub r4, r1 ! 75 EX (len remaining) ! number of trailing bytes is non-zero -- cgit v1.2.3