diff options
| author | Salvatore Cro <salvatore.cro@st.com> | 2010-09-09 16:10:21 +0200 | 
|---|---|---|
| committer | Carmelo Amoroso <carmelo.amoroso@st.com> | 2010-09-15 12:51:04 +0200 | 
| commit | a27dd6924e7964d92b49f4d5ebe2e68cfb2742dd (patch) | |
| tree | 627befce36a43bffdb02bacca575a2c01256fd7a /libc/string | |
| parent | 599c74a4d7e9bbe68b946d65aef2725821ea3fe9 (diff) | |
sh: update the memcpy adding a new loop with aggressive prefetching
After exploring different combinations of prefetch distance and degree,
this update of the memcpy function adds a new loop that moves many
cache lines using an aggressive prefetching scheme.
Prefetching has been removed when moving only a few cache-line-aligned blocks.
As a final result, this memcpy gives us the same performance for small
sizes (which we already had) and better numbers for big copies.
In the case of the SH4-300 CPU series, benchmarks show a gain of ~20% for
sizes from 4KiB to 256KiB.
In the case of the SH4-200, there is a gain of ~40% for sizes larger than
32KiB.
Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: Carmelo Amoroso <carmelo.amoroso@st.com>
Diffstat (limited to 'libc/string')
| -rw-r--r-- | libc/string/sh/sh4/memcpy.S | 128 | 
1 files changed, 107 insertions, 21 deletions
| diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S index 252ef36eb..5be770a59 100644 --- a/libc/string/sh/sh4/memcpy.S +++ b/libc/string/sh/sh4/memcpy.S @@ -28,13 +28,20 @@   * Currenlty it has been only implemented and tested for little endian mode. */  .macro FPU_SET_PAIRED_PREC  	sts	fpscr, r7 -	mov	#0x10, r6	! PR=0 SZ=1 -	shll16	r6 -	lds	r6, fpscr +	mov	#0x10, r0	! PR=0 SZ=1 +	shll16	r0 +	lds	r0, fpscr  .endm  .macro RESTORE_FPSCR  	lds	r7, fpscr  .endm +.macro DALLOC +	! Cache allocate + store on dst-32. +	add	#-32, r1 +	movca.l	r0, @r1 +	add	#32, r1 +.endm +  #endif  	! @@ -471,30 +478,111 @@ ENTRY(memcpy)  	add	r0, r5  	mov	r0, r1 -	add	#-0x1c, r5 -	mov	r5, r0 +	mov	r1, r3		! MT +	sub	r2, r3		! EX (r3 - r2 -> r3) +	mov	#-5, r0 +	shld	r0, r3		! number of the cache lines +	mov	#8, r0 +	cmp/ge	r0, r3		! Check if there are many cache lines to copy. +	bf	45f		! Copy cache line aligned blocks without pref. +	mov	r5, r0 +	add	#-0x7c, r0  	tst	#7, r0		! src is 8byte aligned -	mov	r5, r3 +	bf	45f + +	! Many cache lines have to be copied and the buffers are well aligned. +	! Aggressive prefetching and FPU in single paired precision. +	mov	r0, r5 +	mov	r5, r6 +	add	#-0x80, r6	! prefetch head -	add	#-64, r3	! 
To pefetch head -	bt/s	3f +	FPU_SET_PAIRED_PREC -	 pref	@r3 +	mov	#4, r0 +67: +	add	#-0x20, r6 +	pref	@r6 +	add	#-0x20, r6 +	pref	@r6 + +	fmov	@r5+, dr0 +	fmov	@r5+, dr2 +	fmov	@r5+, dr4 +	fmov	@r5+, dr6 +	fmov	@r5+, dr8 +	fmov	@r5+, dr10 +	fmov	@r5+, dr12 +	fmov	@r5+, dr14 +	fmov	@r5+, xd0 +	fmov	@r5+, xd2 +	fmov	@r5+, xd4 +	fmov	@r5+, xd6 +	fmov	@r5+, xd8 +	fmov	@r5+, xd10 +	fmov	@r5+, xd12 +	fmov	@r5+, xd14 + +	DALLOC +	fmov	xd14, @-r1 +	fmov	xd12, @-r1 +	fmov	xd10, @-r1 +	fmov	xd8, @-r1 +	DALLOC +	fmov	xd6, @-r1 +	fmov	xd4, @-r1 +	fmov	xd2, @-r1 +	fmov	xd0, @-r1 +	DALLOC +	fmov	dr14, @-r1 +	fmov	dr12, @-r1 +	fmov	dr10, @-r1 +	fmov	dr8, @-r1 +	DALLOC +	fmov	dr6, @-r1 +	add	#-0x80, r5 +	fmov	dr4, @-r1 +	add	#-0x80, r5 +	fmov	dr2, @-r1 +	add	#-0x20, r6 +	fmov	dr0, @-r1 +	add	#-4, r3 +	pref	@r6 +	add	#-0x20, r6 +	cmp/ge	r0, r3 +	bt/s	67b +	 pref	@r6 + +	! Other cache lines could be copied: so use the FPU in single paired +	! precision without prefetching. No check for alignment is necessary. + +	mov	#1, r0 +	cmp/ge	r0, r3 +	bt/s	4f +	 add	#0x60, r5 + +	RESTORE_FPSCR + +	bra	5f +	 nop + +	! No prefetch and FPU in single precision. +45: +	add	#-0x1c, r5 +	mov	r5, r0 +	tst	#7, r0 +	bt	3f  2:	fmov.s	@r5+, fr0 -	mov	r1, r6  	fmov.s	@r5+, fr1 -	add	#-32, r6  	fmov.s	@r5+, fr2  	fmov.s	@r5+, fr3  	fmov.s	@r5+, fr4  	fmov.s	@r5+, fr5  	fmov.s	@r5+, fr6  	fmov.s	@r5+, fr7 -	add	#-0x40, r5 -	movca.l	r0, @r6		! Cache allocate + store on dst-32. +	DALLOC  	fmov.s	fr7, @-r1  	fmov.s	fr6, @-r1 @@ -505,35 +593,33 @@ ENTRY(memcpy)  	fmov.s	fr1, @-r1  	fmov.s	fr0, @-r1 -	add	#-32, r3  	cmp/eq	r2,r1  	bf/s	2b -	 pref	@r3		! Prefetch the next cache line. +	 add	#-0x40, r5  	bra	5f +	 nop + +	! No prefetch and FPU in single paired precision.  
3:	FPU_SET_PAIRED_PREC  4:	fmov	@r5+, dr0 -	mov	r1, r6  	fmov	@r5+, dr2 -	add	#-32, r6  	fmov	@r5+, dr4  	fmov	@r5+, dr6 -	add	#-0x40, r5 -	movca.l	r0, @r6 +	DALLOC  	fmov	dr6, @-r1  	fmov	dr4, @-r1  	fmov	dr2, @-r1  	fmov	dr0, @-r1 -	add	#-32, r3  	cmp/eq	r2,r1  	bf/s	4b -	 pref	@r3 +	 add	#-0x40, r5  	RESTORE_FPSCR | 
