diff options
Diffstat (limited to 'libc/string/sh/sh4/memcpy.S')
-rw-r--r-- | libc/string/sh/sh4/memcpy.S | 128 |
1 files changed, 107 insertions, 21 deletions
diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S index 252ef36eb..5be770a59 100644 --- a/libc/string/sh/sh4/memcpy.S +++ b/libc/string/sh/sh4/memcpy.S @@ -28,13 +28,20 @@ * Currently it has been only implemented and tested for little endian mode. */ .macro FPU_SET_PAIRED_PREC sts fpscr, r7 - mov #0x10, r6 ! PR=0 SZ=1 - shll16 r6 - lds r6, fpscr + mov #0x10, r0 ! PR=0 SZ=1 + shll16 r0 + lds r0, fpscr .endm .macro RESTORE_FPSCR lds r7, fpscr .endm +.macro DALLOC + ! Cache allocate + store on dst-32. + add #-32, r1 + movca.l r0, @r1 + add #32, r1 +.endm + #endif ! @@ -471,30 +478,111 @@ ENTRY(memcpy) add r0, r5 mov r0, r1 - add #-0x1c, r5 - mov r5, r0 + mov r1, r3 ! MT + sub r2, r3 ! EX (r3 - r2 -> r3) + mov #-5, r0 + shld r0, r3 ! number of the cache lines + mov #8, r0 + cmp/ge r0, r3 ! Check if there are many cache lines to copy. + bf 45f ! Copy cache line aligned blocks without pref. + mov r5, r0 + add #-0x7c, r0 tst #7, r0 ! src is 8byte aligned - mov r5, r3 + bf 45f + + ! Many cache lines have to be copied and the buffers are well aligned. + ! Aggressive prefetching and FPU in single paired precision. + mov r0, r5 + mov r5, r6 + add #-0x80, r6 ! prefetch head - add #-64, r3 ! 
To prefetch head - bt/s 3f + FPU_SET_PAIRED_PREC - pref @r3 + mov #4, r0 +67: + add #-0x20, r6 + pref @r6 + add #-0x20, r6 + pref @r6 + + fmov @r5+, dr0 + fmov @r5+, dr2 + fmov @r5+, dr4 + fmov @r5+, dr6 + fmov @r5+, dr8 + fmov @r5+, dr10 + fmov @r5+, dr12 + fmov @r5+, dr14 + fmov @r5+, xd0 + fmov @r5+, xd2 + fmov @r5+, xd4 + fmov @r5+, xd6 + fmov @r5+, xd8 + fmov @r5+, xd10 + fmov @r5+, xd12 + fmov @r5+, xd14 + + DALLOC + fmov xd14, @-r1 + fmov xd12, @-r1 + fmov xd10, @-r1 + fmov xd8, @-r1 + DALLOC + fmov xd6, @-r1 + fmov xd4, @-r1 + fmov xd2, @-r1 + fmov xd0, @-r1 + DALLOC + fmov dr14, @-r1 + fmov dr12, @-r1 + fmov dr10, @-r1 + fmov dr8, @-r1 + DALLOC + fmov dr6, @-r1 + add #-0x80, r5 + fmov dr4, @-r1 + add #-0x80, r5 + fmov dr2, @-r1 + add #-0x20, r6 + fmov dr0, @-r1 + add #-4, r3 + pref @r6 + add #-0x20, r6 + cmp/ge r0, r3 + bt/s 67b + pref @r6 + + ! Other cache lines could be copied: so use the FPU in single paired + ! precision without prefetching. No check for alignment is necessary. + + mov #1, r0 + cmp/ge r0, r3 + bt/s 4f + add #0x60, r5 + + RESTORE_FPSCR + + bra 5f + nop + + ! No prefetch and FPU in single precision. +45: + add #-0x1c, r5 + mov r5, r0 + tst #7, r0 + bt 3f 2: fmov.s @r5+, fr0 - mov r1, r6 fmov.s @r5+, fr1 - add #-32, r6 fmov.s @r5+, fr2 fmov.s @r5+, fr3 fmov.s @r5+, fr4 fmov.s @r5+, fr5 fmov.s @r5+, fr6 fmov.s @r5+, fr7 - add #-0x40, r5 - movca.l r0, @r6 ! Cache allocate + store on dst-32. + DALLOC fmov.s fr7, @-r1 fmov.s fr6, @-r1 @@ -505,35 +593,33 @@ ENTRY(memcpy) fmov.s fr1, @-r1 fmov.s fr0, @-r1 - add #-32, r3 cmp/eq r2,r1 bf/s 2b - pref @r3 ! Prefetch the next cache line. + add #-0x40, r5 bra 5f + nop + + ! No prefetch and FPU in single paired precision. 
3: FPU_SET_PAIRED_PREC 4: fmov @r5+, dr0 - mov r1, r6 fmov @r5+, dr2 - add #-32, r6 fmov @r5+, dr4 fmov @r5+, dr6 - add #-0x40, r5 - movca.l r0, @r6 + DALLOC fmov dr6, @-r1 fmov dr4, @-r1 fmov dr2, @-r1 fmov dr0, @-r1 - add #-32, r3 cmp/eq r2,r1 bf/s 4b - pref @r3 + add #-0x40, r5 RESTORE_FPSCR |