From a27dd6924e7964d92b49f4d5ebe2e68cfb2742dd Mon Sep 17 00:00:00 2001
From: Salvatore Cro
Date: Thu, 9 Sep 2010 16:10:21 +0200
Subject: sh: update the memcpy adding a new loop with aggressive prefetching

After exploring different prefetch distance/degree combinations, this
update of the memcpy function adds a new loop that moves many cache
lines with an aggressive prefetching scheme. Prefetching has been
removed when moving only a few cache-line-aligned blocks.

As a final result, this memcpy keeps the same performance we already
had for small sizes and gives better numbers for big copies. On the
SH4-300 CPU series, benchmarks show a gain of ~20% for sizes from
4KiB to 256KiB. On the SH4-200, there is a gain of ~40% for sizes
bigger than 32KiB.

Signed-off-by: Giuseppe Cavallaro
Signed-off-by: Carmelo Amoroso
---
 libc/string/sh/sh4/memcpy.S | 128 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 107 insertions(+), 21 deletions(-)

diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S
index 252ef36eb..5be770a59 100644
--- a/libc/string/sh/sh4/memcpy.S
+++ b/libc/string/sh/sh4/memcpy.S
@@ -28,13 +28,20 @@
  * Currenlty it has been only implemented and tested for little endian mode.
  */
 .macro FPU_SET_PAIRED_PREC
 	sts	fpscr, r7
-	mov	#0x10, r6	! PR=0 SZ=1
-	shll16	r6
-	lds	r6, fpscr
+	mov	#0x10, r0	! PR=0 SZ=1
+	shll16	r0
+	lds	r0, fpscr
 .endm
 .macro RESTORE_FPSCR
 	lds	r7, fpscr
 .endm
+.macro DALLOC
+	! Cache allocate + store on dst-32.
+	add	#-32, r1
+	movca.l	r0, @r1
+	add	#32, r1
+.endm
+
 #endif
 !
@@ -471,30 +478,111 @@ ENTRY(memcpy)
 	add	r0, r5
 	mov	r0, r1
 
-	add	#-0x1c, r5
-	mov	r5, r0
+	mov	r1, r3		! MT
+	sub	r2, r3		! EX (r3 - r2 -> r3)
+	mov	#-5, r0
+	shld	r0, r3		! number of the cache lines
+	mov	#8, r0
+	cmp/ge	r0, r3		! Check if there are many cache lines to copy.
+	bf	45f		! Copy cache line aligned blocks without pref.
+	mov	r5, r0
+	add	#-0x7c, r0
 	tst	#7, r0		! src is 8byte aligned
-	mov	r5, r3
+	bf	45f
+
+	! Many cache lines have to be copied and the buffers are well aligned.
+	! Aggressive prefetching and FPU in single paired precision.
+	mov	r0, r5
+	mov	r5, r6
+	add	#-0x80, r6	! prefetch head
 
-	add	#-64, r3	! To pefetch head
-	bt/s	3f
+	FPU_SET_PAIRED_PREC
 
-	pref	@r3
+	mov	#4, r0
+67:
+	add	#-0x20, r6
+	pref	@r6
+	add	#-0x20, r6
+	pref	@r6
+
+	fmov	@r5+, dr0
+	fmov	@r5+, dr2
+	fmov	@r5+, dr4
+	fmov	@r5+, dr6
+	fmov	@r5+, dr8
+	fmov	@r5+, dr10
+	fmov	@r5+, dr12
+	fmov	@r5+, dr14
+	fmov	@r5+, xd0
+	fmov	@r5+, xd2
+	fmov	@r5+, xd4
+	fmov	@r5+, xd6
+	fmov	@r5+, xd8
+	fmov	@r5+, xd10
+	fmov	@r5+, xd12
+	fmov	@r5+, xd14
+
+	DALLOC
+	fmov	xd14, @-r1
+	fmov	xd12, @-r1
+	fmov	xd10, @-r1
+	fmov	xd8, @-r1
+	DALLOC
+	fmov	xd6, @-r1
+	fmov	xd4, @-r1
+	fmov	xd2, @-r1
+	fmov	xd0, @-r1
+	DALLOC
+	fmov	dr14, @-r1
+	fmov	dr12, @-r1
+	fmov	dr10, @-r1
+	fmov	dr8, @-r1
+	DALLOC
+	fmov	dr6, @-r1
+	add	#-0x80, r5
+	fmov	dr4, @-r1
+	add	#-0x80, r5
+	fmov	dr2, @-r1
+	add	#-0x20, r6
+	fmov	dr0, @-r1
+	add	#-4, r3
+	pref	@r6
+	add	#-0x20, r6
+	cmp/ge	r0, r3
+	bt/s	67b
+	pref	@r6
+
+	! Other cache lines could be copied: so use the FPU in single paired
+	! precision without prefetching. No check for alignment is necessary.
+
+	mov	#1, r0
+	cmp/ge	r0, r3
+	bt/s	4f
+	add	#0x60, r5
+
+	RESTORE_FPSCR
+
+	bra	5f
+	nop
+
+	! No prefetch and FPU in single precision.
+45:
+	add	#-0x1c, r5
+	mov	r5, r0
+	tst	#7, r0
+	bt	3f
 
 2:	fmov.s	@r5+, fr0
-	mov	r1, r6
 	fmov.s	@r5+, fr1
-	add	#-32, r6
 	fmov.s	@r5+, fr2
 	fmov.s	@r5+, fr3
 	fmov.s	@r5+, fr4
 	fmov.s	@r5+, fr5
 	fmov.s	@r5+, fr6
 	fmov.s	@r5+, fr7
-	add	#-0x40, r5
 
-	movca.l	r0, @r6	! Cache allocate + store on dst-32.
+	DALLOC
 
 	fmov.s	fr7, @-r1
 	fmov.s	fr6, @-r1
@@ -505,35 +593,33 @@ ENTRY(memcpy)
 	fmov.s	fr1, @-r1
 	fmov.s	fr0, @-r1
 
-	add	#-32, r3
 	cmp/eq	r2,r1
 	bf/s	2b
-	pref	@r3	! Prefetch the next cache line.
+	add	#-0x40, r5
 
 	bra	5f
+	nop
+
+	! No prefetch and FPU in single paired precision.
 
 3:	FPU_SET_PAIRED_PREC
 4:	fmov	@r5+, dr0
-	mov	r1, r6
 	fmov	@r5+, dr2
-	add	#-32, r6
 	fmov	@r5+, dr4
 	fmov	@r5+, dr6
-	add	#-0x40, r5
 
-	movca.l	r0, @r6
+	DALLOC
 
 	fmov	dr6, @-r1
 	fmov	dr4, @-r1
 	fmov	dr2, @-r1
 	fmov	dr0, @-r1
 
-	add	#-32, r3
 	cmp/eq	r2,r1
 	bf/s	4b
-	pref	@r3
+	add	#-0x40, r5
 
 	RESTORE_FPSCR
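
For readers less familiar with SH4 assembly, the copy scheme above can be
summarised in C. This is a minimal illustrative sketch, not part of the
patch: it assumes 32-byte cache lines and one 128-byte block (four lines)
per iteration, uses GCC's __builtin_prefetch as a stand-in for the SH4
pref instruction, and copies ascending with plain stores for clarity,
whereas the assembly copies descending and uses movca.l to allocate
destination lines without fetching them. The function name and block
constants are mine, chosen only to mirror the structure of the loop.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define CACHE_LINE 32	/* SH4 operand cache line size */
#define BLOCK      128	/* bytes moved per loop iteration (4 lines) */

static void *memcpy_prefetch_sketch(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Main loop: taken only while a full block remains beyond the
	 * one being prefetched, mirroring the "many cache lines" check
	 * (cmp/ge r0, r3) that guards the 67: loop above. */
	while (n >= 2 * BLOCK) {
		/* Prefetch the next block, one full block (0x80 bytes)
		 * ahead of the loads, like the pref @r6 pairs above. */
		__builtin_prefetch(s + BLOCK);
		__builtin_prefetch(s + BLOCK + CACHE_LINE);
		__builtin_prefetch(s + BLOCK + 2 * CACHE_LINE);
		__builtin_prefetch(s + BLOCK + 3 * CACHE_LINE);

		/* Stands in for the sixteen paired-precision fmov loads
		 * (dr0..dr14, xd0..xd14) and the DALLOC'd stores. */
		memcpy(d, s, BLOCK);
		d += BLOCK;
		s += BLOCK;
		n -= BLOCK;
	}

	/* Tail: whatever is left is copied without prefetching, as in
	 * the 45: path above. */
	memcpy(d, s, n);
	return dst;
}

int main(void)
{
	char src[1024], dst[1024];
	for (size_t i = 0; i < sizeof(src); i++)
		src[i] = (char)i;
	memcpy_prefetch_sketch(dst, src, sizeof(src));
	printf("copy ok: %d\n", memcmp(dst, src, sizeof(src)) == 0);
	return 0;
}

The point of the structure is that line fills for block N+1 overlap the
FPU moves for block N; as the commit message notes, the exact prefetch
distance/degree pair was chosen empirically, and other SH4 cores may
prefer different values.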