diff options
| author | Eric Andersen <andersen@codepoet.org> | 2003-12-17 07:57:32 +0000 | 
|---|---|---|
| committer | Eric Andersen <andersen@codepoet.org> | 2003-12-17 07:57:32 +0000 | 
| commit | e2ec80efb27158e6a367238227ddca876054ae9e (patch) | |
| tree | 5a44feeb563f8660e44526d18dbc8b62c37fa4f1 /libc | |
| parent | f3651e4e206598a4d39dbdab76d4c066ab646188 (diff) | |
Patch from Paul Mundt <lethal@linux-sh.org>:
The subject says it all.. optimized memset/memcpy/strcpy, lifted from SuperH's
glibc tree.
Diffstat (limited to 'libc')
| -rw-r--r-- | libc/string/Makefile | 2 | ||||
| -rw-r--r-- | libc/string/sh64/Makefile | 38 | ||||
| -rw-r--r-- | libc/string/sh64/memcpy.S | 202 | ||||
| -rw-r--r-- | libc/string/sh64/memset.S | 92 | ||||
| -rw-r--r-- | libc/string/sh64/strcpy.S | 98 | 
5 files changed, 431 insertions, 1 deletions
diff --git a/libc/string/Makefile b/libc/string/Makefile index 464a07e74..2ec8a43be 100644 --- a/libc/string/Makefile +++ b/libc/string/Makefile @@ -23,7 +23,7 @@ DIRS=  ifeq ($(TARGET_ARCH),$(wildcard $(TARGET_ARCH)))  DIRS = $(TARGET_ARCH)  endif -ALL_SUBDIRS = i386 arm +ALL_SUBDIRS = i386 arm sh64  MSRC= wstring.c  MOBJ=  basename.o bcopy.o bzero.o dirname.o ffs.o memccpy.o memchr.o memcmp.o \ diff --git a/libc/string/sh64/Makefile b/libc/string/sh64/Makefile new file mode 100644 index 000000000..da2c28416 --- /dev/null +++ b/libc/string/sh64/Makefile @@ -0,0 +1,38 @@ +# Makefile for uClibc's sh64 optimized string routines +# +# Copyright (C) 2003 Paul Mundt <lethal@linux-sh.org> +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU Library General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more +# details. 
+# +# You should have received a copy of the GNU Library General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +TOPDIR=../../../ +include $(TOPDIR)Rules.mak + +SSRC= memcpy.S memset.S strcpy.S +SOBJS=$(patsubst %.S,%.o, $(SSRC)) + +all: $(SOBJS) $(LIBC) + +$(LIBC): ar-target + +ar-target: $(SOBJS) +	$(AR) $(ARFLAGS) $(LIBC) $(SOBJS) + +$(SOBJS): %.o : %.S +	$(CC) $(CFLAGS) -c $< -o $@ +	$(STRIPTOOL) -x -R .note -R .comment $*.o + +clean: +	$(RM) *.[oa] *~ core + diff --git a/libc/string/sh64/memcpy.S b/libc/string/sh64/memcpy.S new file mode 100644 index 000000000..361e050db --- /dev/null +++ b/libc/string/sh64/memcpy.S @@ -0,0 +1,202 @@ +/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ +/* Modified by SuperH, Inc. September 2003 */ +! +! Fast SH memcpy +! +! by Toshiyasu Morita (tm@netcom.com) +! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut) +! SH5 code Copyright 2002 SuperH Ltd. +! +! Entry: ARG0: destination pointer +!        ARG1: source pointer +!        ARG2: byte count +! +! Exit:  RESULT: destination pointer +!        any other registers in the range r0-r7: trashed +! +! Notes: Usually one wants to do small reads and write a longword, but +!        unfortunately it is difficult in some cases to concatenate bytes +!        into a longword on the SH, so this does a longword read and small +!        writes. +! +! This implementation makes two assumptions about how it is called: +! +! 1.: If the byte count is nonzero, the address of the last byte to be +!     copied is unsigned greater than the address of the first byte to +!     be copied.  This could be easily swapped for a signed comparison, +!     but the algorithm used needs some comparison. +! +! 2.: When there are two or three bytes in the last word of an 11-or-more +!     bytes memory chunk to be copied, the rest of the word can be read +!     
without side effects. +!     This could be easily changed by increasing the minimum size of +!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2, +!     however, this would cost a few extra cycles on average. +!     For SHmedia, the assumption is that any quadword can be read in its +!     entirety if at least one byte is included in the copy. +! + +	.section .text..SHmedia32,"ax" +	.globl	memcpy +	.type	memcpy, @function + +	.align	5 +memcpy: + +#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1 +#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1 +#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1 +#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1 + +	ld.b r3,0,r63 +	pta/l Large,tr0 +	movi 25,r0 +	bgeu/u r4,r0,tr0 +	nsb r4,r0 +	shlli r0,5,r0 +	movi (L1-L0+63*32 + 1) & 0xffff,r1 +	sub r1, r0, r0 +L0:	ptrel r0,tr0 +	add r2,r4,r5 +	ptabs r18,tr1 +	add r3,r4,r6 +	blink tr0,r63 +	 +/* Rearranged to make cut2 safe */ +	.balign 8 +L4_7:	/* 4..7 byte memcpy cntd. */ +	stlo.l r2, 0, r0 +	or r6, r7, r6 +	sthi.l r5, -1, r6 +	stlo.l r5, -4, r6 +	blink tr1,r63 + +	.balign 8 +L1:	/* 0 byte memcpy */ +	nop +	blink tr1,r63 +	nop +	nop +	nop +	nop + +L2_3:	/* 2 or 3 byte memcpy cntd. */ +	st.b r5,-1,r6 +	blink tr1,r63 + +	/* 1 byte memcpy */ +	ld.b r3,0,r0 +	st.b r2,0,r0 +	blink tr1,r63 + +L8_15:	/* 8..15 byte memcpy cntd. */ +	stlo.q r2, 0, r0 +	or r6, r7, r6 +	sthi.q r5, -1, r6 +	stlo.q r5, -8, r6 +	blink tr1,r63 +	 +	/* 2 or 3 byte memcpy */ +	ld.b r3,0,r0 +	ld.b r2,0,r63 +	ld.b r3,1,r1 +	st.b r2,0,r0 +	pta/l L2_3,tr0 +	ld.b r6,-1,r6 +	st.b r2,1,r1 +	blink tr0, r63 + +	/* 4 .. 7 byte memcpy */ +	LDUAL (r3, 0, r0, r1) +	pta L4_7, tr0 +	ldlo.l r6, -4, r7 +	or r0, r1, r0 +	sthi.l r2, 3, r0 +	ldhi.l r6, -1, r6 +	blink tr0, r63 + +	/* 8 .. 15 byte memcpy */ +	LDUAQ (r3, 0, r0, r1) +	pta L8_15, tr0 +	ldlo.q r6, -8, r7 +	or r0, r1, r0 +	sthi.q r2, 7, r0 +	ldhi.q r6, -1, r6 +	blink tr0, r63 + +	/* 16 .. 
24 byte memcpy */ +	LDUAQ (r3, 0, r0, r1) +	LDUAQ (r3, 8, r8, r9) +	or r0, r1, r0 +	sthi.q r2, 7, r0 +	or r8, r9, r8 +	sthi.q r2, 15, r8 +	ldlo.q r6, -8, r7 +	ldhi.q r6, -1, r6 +	stlo.q r2, 8, r8 +	stlo.q r2, 0, r0 +	or r6, r7, r6 +	sthi.q r5, -1, r6 +	stlo.q r5, -8, r6 +	blink tr1,r63 + +Large: +	ld.b r2, 0, r63 +	pta/l  Loop_ua, tr1 +	ori r3, -8, r7 +	sub r2, r7, r22 +	sub r3, r2, r6 +	add r2, r4, r5 +	ldlo.q r3, 0, r0 +	addi r5, -16, r5 +	movi 64+8, r27 // could subtract r7 from that. +	stlo.q r2, 0, r0 +	sthi.q r2, 7, r0 +	ldx.q r22, r6, r0 +	bgtu/l r27, r4, tr1 + +	addi r5, -48, r27 +	pta/l Loop_line, tr0 +	addi r6, 64, r36 +	addi r6, -24, r19 +	addi r6, -16, r20 +	addi r6, -8, r21 + +Loop_line: +	ldx.q r22, r36, r63 +	alloco r22, 32 +	addi r22, 32, r22 +	ldx.q r22, r19, r23 +	sthi.q r22, -25, r0 +	ldx.q r22, r20, r24 +	ldx.q r22, r21, r25 +	stlo.q r22, -32, r0 +	ldx.q r22, r6,  r0 +	sthi.q r22, -17, r23 +	sthi.q r22,  -9, r24 +	sthi.q r22,  -1, r25 +	stlo.q r22, -24, r23 +	stlo.q r22, -16, r24 +	stlo.q r22,  -8, r25 +	bgeu r27, r22, tr0 + +Loop_ua: +	addi r22, 8, r22 +	sthi.q r22, -1, r0 +	stlo.q r22, -8, r0 +	ldx.q r22, r6, r0 +	bgtu/l r5, r22, tr1 + +	add r3, r4, r7 +	ldlo.q r7, -8, r1 +	sthi.q r22, 7, r0 +	ldhi.q r7, -1, r7 +	ptabs r18,tr1 +	stlo.q r22, 0, r0 +	or r1, r7, r1 +	sthi.q r5, 15, r1 +	stlo.q r5, 8, r1 +	blink tr1, r63 + +	.size memcpy, . - memcpy + diff --git a/libc/string/sh64/memset.S b/libc/string/sh64/memset.S new file mode 100644 index 000000000..714fa3fb3 --- /dev/null +++ b/libc/string/sh64/memset.S @@ -0,0 +1,92 @@ +/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ +/* Modified by SuperH, Inc. September 2003 */ +! +! Fast SH memset +! +! by Toshiyasu Morita (tm@netcom.com) +! +! SH5 code by J"orn Rennecke (joern.rennecke@superh.com) +! Copyright 2002 SuperH Ltd. +! 
+ +#ifdef __LITTLE_ENDIAN__ +#define SHHI shlld +#define SHLO shlrd +#else +#define SHHI shlrd +#define SHLO shlld +#endif + +	.section .text..SHmedia32,"ax" +	.globl	memset +	.type	memset, @function + +	.align 5 + +memset: +	pta/l multiquad, tr0 +	andi r2, 7, r22 +	ptabs r18, tr2 +	mshflo.b r3,r3,r3 +	add r4, r22, r23 +	mperm.w r3, r63, r3	// Fill pattern now in every byte of r3 + +	movi 8, r9 +	bgtu/u r23, r9, tr0 // multiquad + +	beqi/u r4, 0, tr2       // Return with size 0 - ensures no mem accesses +	ldlo.q r2, 0, r7 +	shlli r4, 2, r4 +	movi -1, r8 +	SHHI r8, r4, r8 +	SHHI r8, r4, r8 +	mcmv r7, r8, r3 +	stlo.q r2, 0, r3 +	blink tr2, r63 + +multiquad: +	pta/l lastquad, tr0 +	stlo.q r2, 0, r3 +	shlri r23, 3, r24 +	add r2, r4, r5 +	beqi/u r24, 1, tr0 // lastquad +	pta/l loop, tr1 +	sub r2, r22, r25 +	andi r5, -8, r20   // calculate end address and +	addi r20, -7*8, r8 // loop end address; This might overflow, so we need +	                   // to use a different test before we start the loop +	bge/u r24, r9, tr1 // loop +	st.q r25, 8, r3 +	st.q r20, -8, r3 +	shlri r24, 1, r24 +	beqi/u r24, 1, tr0 // lastquad +	st.q r25, 16, r3 +	st.q r20, -16, r3 +	beqi/u r24, 2, tr0 // lastquad +	st.q r25, 24, r3 +	st.q r20, -24, r3 +lastquad: +	sthi.q r5, -1, r3 +	blink tr2,r63 + +loop: +!!!	alloco r25, 32	// QQQ comment out for short-term fix to SHUK #3895. +			// QQQ commenting out is logically correct, but sub-optimal +			// QQQ Sean McGoogan - 4th April 2003. +	st.q r25, 8, r3 +	st.q r25, 16, r3 +	st.q r25, 24, r3 +	st.q r25, 32, r3 +	addi r25, 32, r25 +	bgeu/l r8, r25, tr1 // loop + +	st.q r20, -40, r3 +	st.q r20, -32, r3 +	st.q r20, -24, r3 +	st.q r20, -16, r3 +	st.q r20, -8, r3 +	sthi.q r5, -1, r3 +	blink tr2,r63 + +	.size	memset, . 
- memset + diff --git a/libc/string/sh64/strcpy.S b/libc/string/sh64/strcpy.S new file mode 100644 index 000000000..273e9147c --- /dev/null +++ b/libc/string/sh64/strcpy.S @@ -0,0 +1,98 @@ +/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ +/* Modified by SuperH, Inc. September 2003 */ +! Entry: arg0: destination +!        arg1: source +! Exit:  result: destination +! +! SH5 code Copyright 2002 SuperH Ltd. + +#ifdef __LITTLE_ENDIAN__ +#define SHHI shlld +#define SHLO shlrd +#else +#define SHHI shlrd +#define SHLO shlld +#endif + +	.section .text..SHmedia32,"ax" +	.globl	strcpy +	.type	strcpy, @function + +	.align 5 +strcpy: + +	pta/l shortstring,tr1 +	ldlo.q r3,0,r4 +	ptabs r18,tr4 +	shlli r3,3,r7 +	addi r2, 8, r0 +	mcmpeq.b r4,r63,r6 +	SHHI r6,r7,r6 +	bnei/u r6,0,tr1 // shortstring +	pta/l no_lddst, tr2 +	ori r3,-8,r23 +	sub r2, r23, r0 +	sub r3, r2, r21 +	addi r21, 8, r20 +	ldx.q r0, r21, r5 +	pta/l loop, tr0 +	ori r2,-8,r22 +	mcmpeq.b r5, r63, r6 +	bgt/u r22, r23, tr2 // no_lddst + +	// r22 < r23 :  Need to do a load from the destination. +	// r22 == r23 : Doesn't actually need to load from destination, +	//              but still can be handled here. +	ldlo.q r2, 0, r9 +	movi -1, r8 +	SHLO r8, r7, r8 +	mcmv r4, r8, r9 +	stlo.q r2, 0, r9 +	beqi/l r6, 0, tr0 // loop + +	add r5, r63, r4 +	addi r0, 8, r0 +	blink tr1, r63 // shortstring +no_lddst: +	// r22 > r23: note that for r22 == r23 the sthi.q would clobber +	//            bytes before the destination region. 
+	stlo.q r2, 0, r4 +	SHHI r4, r7, r4 +	sthi.q r0, -1, r4 +	beqi/l r6, 0, tr0 // loop + +	add r5, r63, r4 +	addi r0, 8, r0 +shortstring: +#ifndef __LITTLE_ENDIAN__ +	pta/l shortstring2,tr1 +	byterev r4,r4 +#endif +shortstring2: +	st.b r0,-8,r4 +	andi r4,0xff,r5 +	shlri r4,8,r4 +	addi r0,1,r0 +	bnei/l r5,0,tr1 +	blink tr4,r63 // return +	 +	.balign 8 +loop: +	stlo.q r0, 0, r5 +	ldx.q r0, r20, r4 +	addi r0, 16, r0 +	sthi.q r0, -9, r5 +	mcmpeq.b r4, r63, r6 +	bnei/u r6, 0, tr1 // shortstring +	ldx.q r0, r21, r5 +	stlo.q r0, -8, r4 +	sthi.q r0, -1, r4 +	mcmpeq.b r5, r63, r6 +	beqi/l r6, 0, tr0 // loop + +	add r5, r63, r4 +	addi r0, 8, r0 +	blink tr1, r63 // shortstring + +	.size	strcpy, . - strcpy +  | 
