sh: update the memcpy adding a new loop with aggressive prefetching

After exploring different combinations of prefetch distance and degree, this update of the memcpy function adds a new loop that moves many cache lines per iteration using an aggressive prefetching scheme. Prefetching has been removed from the path that moves only a few cache-line-aligned blocks. As a result, this memcpy gives the same performance as before for small sizes and better numbers for big copies. On the SH4-300 CPU series, benchmarks show a gain of ~20% for sizes from 4KiB to 256KiB. On the SH4-200, there is a gain of ~40% for sizes bigger than 32KiB.

Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Signed-off-by: Carmelo Amoroso <carmelo.amoroso@st.com>
author: Salvatore Cro <salvatore.cro@st.com> 2010-09-09 16:10:21 +0200
committer: Carmelo Amoroso <carmelo.amoroso@st.com> 2010-09-15 12:51:04 +0200
commit: a27dd6924e7964d92b49f4d5ebe2e68cfb2742dd (patch)
tree: 627befce36a43bffdb02bacca575a2c01256fd7a /libc/string/sh
parent: 599c74a4d7e9bbe68b946d65aef2725821ea3fe9 (diff)
1 files changed, 107 insertions, 21 deletions
diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S
index 252ef36eb..5be770a59 100644
--- a/libc/string/sh/sh4/memcpy.S
+++ b/libc/string/sh/sh4/memcpy.S
@@ -28,13 +28,20 @@
  * Currenlty it has been only implemented and tested for little endian mode. */
 .macro FPU_SET_PAIRED_PREC
 	sts	fpscr, r7
-	mov	#0x10, r6	! PR=0 SZ=1
-	shll16	r6
-	lds	r6, fpscr
+	mov	#0x10, r0	! PR=0 SZ=1
+	shll16	r0
+	lds	r0, fpscr
 .endm
 .macro RESTORE_FPSCR
 	lds	r7, fpscr
 .endm
+.macro DALLOC
+	! Cache allocate + store on dst-32.
+	add	#-32, r1
+	movca.l	r0, @r1
+	add	#32, r1
+.endm
+
 #endif
 
 	!
@@ -471,30 +478,111 @@ ENTRY(memcpy)
 	add	r0, r5
 	mov	r0, r1
 
-	add	#-0x1c, r5
-	mov	r5, r0
+	mov	r1, r3		! MT
+	sub	r2, r3		! EX (r3 - r2 -> r3)
+	mov	#-5, r0
+	shld	r0, r3		! number of the cache lines
 
+	mov	#8, r0
+	cmp/ge	r0, r3		! Check if there are many cache lines to copy.
+	bf	45f		! Copy cache line aligned blocks without pref.
+	mov	r5, r0
+	add	#-0x7c, r0
 	tst	#7, r0		! src is 8byte aligned
-	mov	r5, r3
+	bf	45f
+
+	! Many cache lines have to be copied and the buffers are well aligned.
+	! Aggressive prefetching and FPU in single paired precision.
+	mov	r0, r5
+	mov	r5, r6
+	add	#-0x80, r6	! prefetch head
 
-	add	#-64, r3	! To pefetch head
-	bt/s	3f
+	FPU_SET_PAIRED_PREC
 
-	 pref	@r3
+	mov	#4, r0
+67:
+	add	#-0x20, r6
+	pref	@r6
+	add	#-0x20, r6
+	pref	@r6
+
+	fmov	@r5+, dr0
+	fmov	@r5+, dr2
+	fmov	@r5+, dr4
+	fmov	@r5+, dr6
+	fmov	@r5+, dr8
+	fmov	@r5+, dr10
+	fmov	@r5+, dr12
+	fmov	@r5+, dr14
+	fmov	@r5+, xd0
+	fmov	@r5+, xd2
+	fmov	@r5+, xd4
+	fmov	@r5+, xd6
+	fmov	@r5+, xd8
+	fmov	@r5+, xd10
+	fmov	@r5+, xd12
+	fmov	@r5+, xd14
+
+	DALLOC
+	fmov	xd14, @-r1
+	fmov	xd12, @-r1
+	fmov	xd10, @-r1
+	fmov	xd8, @-r1
+	DALLOC
+	fmov	xd6, @-r1
+	fmov	xd4, @-r1
+	fmov	xd2, @-r1
+	fmov	xd0, @-r1
+	DALLOC
+	fmov	dr14, @-r1
+	fmov	dr12, @-r1
+	fmov	dr10, @-r1
+	fmov	dr8, @-r1
+	DALLOC
+	fmov	dr6, @-r1
+	add	#-0x80, r5
+	fmov	dr4, @-r1
+	add	#-0x80, r5
+	fmov	dr2, @-r1
+	add	#-0x20, r6
+	fmov	dr0, @-r1
+	add	#-4, r3
+	pref	@r6
+	add	#-0x20, r6
+	cmp/ge	r0, r3
+	bt/s	67b
+	 pref	@r6
+
+	! Other cache lines could be copied: so use the FPU in single paired
+	! precision without prefetching. No check for alignment is necessary.
+
+	mov	#1, r0
+	cmp/ge	r0, r3
+	bt/s	4f
+	 add	#0x60, r5
+
+	RESTORE_FPSCR
+
+	bra	5f
+	 nop
+
+	! No prefetch and FPU in single precision.
+45:
+	add	#-0x1c, r5
+	mov	r5, r0
+	tst	#7, r0
+	bt	3f
 
 2:	fmov.s	@r5+, fr0
-	mov	r1, r6
 	fmov.s	@r5+, fr1
-	add	#-32, r6
 	fmov.s	@r5+, fr2
 	fmov.s	@r5+, fr3
 	fmov.s	@r5+, fr4
 	fmov.s	@r5+, fr5
 	fmov.s	@r5+, fr6
 	fmov.s	@r5+, fr7
-	add	#-0x40, r5
 
-	movca.l	r0, @r6		! Cache allocate + store on dst-32.
+	DALLOC
 
 	fmov.s	fr7, @-r1
 	fmov.s	fr6, @-r1
@@ -505,35 +593,33 @@ ENTRY(memcpy)
 	fmov.s	fr1, @-r1
 	fmov.s	fr0, @-r1
 
-	add	#-32, r3
 	cmp/eq	r2,r1
 
 	bf/s	2b
-	 pref	@r3		! Prefetch the next cache line.
+	 add	#-0x40, r5
 
 	bra	5f
+	 nop
+
+	! No prefetch and FPU in single paired precision.
 
 3:	FPU_SET_PAIRED_PREC
 
 4:	fmov	@r5+, dr0
-	mov	r1, r6
 	fmov	@r5+, dr2
-	add	#-32, r6
 	fmov	@r5+, dr4
 	fmov	@r5+, dr6
-	add	#-0x40, r5
 
-	movca.l	r0, @r6
+	DALLOC
 
 	fmov	dr6, @-r1
 	fmov	dr4, @-r1
 	fmov	dr2, @-r1
 	fmov	dr0, @-r1
-	add	#-32, r3
 	cmp/eq	r2,r1
 
 	bf/s	4b
-	 pref	@r3
+	 add	#-0x40, r5
 
 	RESTORE_FPSCR
author	Salvatore Cro <salvatore.cro@st.com>	2010-09-09 16:10:21 +0200
committer	Carmelo Amoroso <carmelo.amoroso@st.com>	2010-09-15 12:51:04 +0200
commit	a27dd6924e7964d92b49f4d5ebe2e68cfb2742dd (patch)
tree	627befce36a43bffdb02bacca575a2c01256fd7a /libc/string/sh
parent	599c74a4d7e9bbe68b946d65aef2725821ea3fe9 (diff)