-rwxr-xr-x  libc/string/arc/Makefile |  13
-rw-r--r--  libc/string/arc/memcmp.S | 128
-rw-r--r--  libc/string/arc/memcpy.S |  71
-rw-r--r--  libc/string/arc/memset.S |  51
-rw-r--r--  libc/string/arc/strchr.S | 138
-rw-r--r--  libc/string/arc/strcmp.S | 102
-rw-r--r--  libc/string/arc/strcpy.S |  71
-rw-r--r--  libc/string/arc/strlen.S |  84
8 files changed, 658 insertions, 0 deletions
| diff --git a/libc/string/arc/Makefile b/libc/string/arc/Makefile new file mode 100755 index 000000000..523cf6842 --- /dev/null +++ b/libc/string/arc/Makefile @@ -0,0 +1,13 @@ +# Makefile for uClibc +# +# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org> +# +# Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. +# + +top_srcdir:=../../../ +top_builddir:=../../../ +all: objs +include $(top_builddir)Rules.mak +include ../Makefile.in +include $(top_srcdir)Makerules diff --git a/libc/string/arc/memcmp.S b/libc/string/arc/memcmp.S new file mode 100644 index 000000000..4c0e39143 --- /dev/null +++ b/libc/string/arc/memcmp.S @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + +#include <sysdep.h> +#include <features.h> + +#ifdef __LITTLE_ENDIAN__ +#define WORD2 r2 +#define SHIFT r3 +#else /* BIG ENDIAN */ +#define WORD2 r3 +#define SHIFT r2 +#endif + +ENTRY(memcmp) +	or	r12,r0,r1 +	asl_s	r12,r12,30 +	sub	r3,r2,1 +	brls	r2,r12,.Lbytewise +	ld	r4,[r0,0] +	ld	r5,[r1,0] +	lsr.f	lp_count,r3,3 +	lpne	.Loop_end +	ld_s	WORD2,[r0,4] +	ld_s	r12,[r1,4] +	brne	r4,r5,.Leven +	ld.a	r4,[r0,8] +	ld.a	r5,[r1,8] +	brne	WORD2,r12,.Lodd +.Loop_end: +	asl_s	SHIFT,SHIFT,3 +	bhs_s	.Last_cmp +	brne	r4,r5,.Leven +	ld	r4,[r0,4] +	ld	r5,[r1,4] +#ifdef __LITTLE_ENDIAN__ +	nop_s +	; one more load latency cycle +.Last_cmp: +	xor	r0,r4,r5 +	bset	r0,r0,SHIFT +	sub_s	r1,r0,1 +	bic_s	r1,r1,r0 +	norm	r1,r1 +	b.d	.Leven_cmp +	and	r1,r1,24 +.Leven: +	xor	r0,r4,r5 +	sub_s	r1,r0,1 +	bic_s	r1,r1,r0 +	norm	r1,r1 +	; slow track insn +	and	r1,r1,24 +.Leven_cmp: +	asl	r2,r4,r1 +	asl	r12,r5,r1 +	lsr_s	r2,r2,1 +	lsr_s	r12,r12,1 +	j_s.d	[blink] +	sub	r0,r2,r12 +	.balign	4 +.Lodd: +	xor	r0,WORD2,r12 +	sub_s	r1,r0,1 +	bic_s	r1,r1,r0 +	norm	r1,r1 +	; slow track insn +	and	r1,r1,24 +	asl_s	r2,r2,r1 +	asl_s	r12,r12,r1 +	lsr_s	r2,r2,1 +	lsr_s	r12,r12,1 +	j_s.d	[blink] +	sub	r0,r2,r12 +#else /* BIG ENDIAN */ +.Last_cmp: +	neg_s	SHIFT,SHIFT +	lsr	r4,r4,SHIFT +	lsr	r5,r5,SHIFT +	; slow track insn +.Leven: +	sub.f	r0,r4,r5 +	mov.ne	r0,1 +	j_s.d	[blink] +	bset.cs	r0,r0,31 +.Lodd: +	cmp_s	WORD2,r12 +	mov_s	r0,1 +	j_s.d	[blink] +	bset.cs	r0,r0,31 +#endif /* ENDIAN */ +	.balign	4 +.Lbytewise: +	breq	r2,0,.Lnil +	ldb	r4,[r0,0] +	ldb	r5,[r1,0] +	lsr.f	lp_count,r3 +	lpne	.Lbyte_end +	ldb_s	r3,[r0,1] +	ldb	r12,[r1,1] +	brne	r4,r5,.Lbyte_even +	ldb.a	r4,[r0,2] +	ldb.a	r5,[r1,2] +	brne	r3,r12,.Lbyte_odd +.Lbyte_end: +	bcc	.Lbyte_even +	brne	r4,r5,.Lbyte_even +	ldb_s	r3,[r0,1] +	ldb_s	r12,[r1,1] +.Lbyte_odd: +	j_s.d	[blink] +	sub	r0,r3,r12 +.Lbyte_even: +	j_s.d	[blink] +	sub	r0,r4,r5 +.Lnil: +	j_s.d	[blink] +	mov	r0,0 +END(memcmp) +libc_hidden_def(memcmp) + +#ifdef __UCLIBC_SUSV3_LEGACY__ +strong_alias(memcmp,bcmp) +#endif diff --git a/libc/string/arc/memcpy.S b/libc/string/arc/memcpy.S new file mode 100644 index 000000000..1c11951e4 --- /dev/null +++ b/libc/string/arc/memcpy.S @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + +#include <sysdep.h> + +/* This memcpy implementation does not support objects of 1GB or larger - +   the check for alignment does not work then.  
*/ +/* We assume that most sources and destinations are aligned, and +   that also lengths are mostly a multiple of four, although to a lesser +   extent.  */ +ENTRY(memcpy) +	or	r3,r0,r1 +	asl_s	r3,r3,30 +	mov_s	r5,r0 +	brls.d	r2,r3,.Lcopy_bytewise +	sub.f	r3,r2,1 +	ld_s	r12,[r1,0] +	asr.f	lp_count,r3,3 +	bbit0.d	r3,2,.Lnox4 +	bmsk_s	r2,r2,1 +	st.ab	r12,[r5,4] +	ld.a	r12,[r1,4] +.Lnox4: +	lppnz	.Lendloop +	ld_s	r3,[r1,4] +	st.ab	r12,[r5,4] +	ld.a	r12,[r1,8] +	st.ab	r3,[r5,4] +.Lendloop: +	breq	r2,0,.Last_store +	ld	r3,[r5,0] +#ifdef __LITTLE_ENDIAN__ +	add3	r2,-1,r2 +	; uses long immediate +	xor_s	r12,r12,r3 +	bmsk	r12,r12,r2 +        xor_s	r12,r12,r3 +#else /* BIG ENDIAN */ +	sub3	r2,31,r2 +	; uses long immediate +        xor_s	r3,r3,r12 +        bmsk	r3,r3,r2 +        xor_s	r12,r12,r3 +#endif /* ENDIAN */ +.Last_store: +	j_s.d	[blink] +	st	r12,[r5,0] + +	.balign	4 +.Lcopy_bytewise: +	jcs	[blink] +	ldb_s	r12,[r1,0] +	lsr.f	lp_count,r3 +	bhs_s	.Lnox1 +	stb.ab	r12,[r5,1] +	ldb.a	r12,[r1,1] +.Lnox1: +	lppnz	.Lendbloop +	ldb_s	r3,[r1,1] +	stb.ab	r12,[r5,1] +	ldb.a	r12,[r1,2] +	stb.ab	r3,[r5,1] +.Lendbloop: +	j_s.d	[blink] +	stb	r12,[r5,0] +END(memcpy) +libc_hidden_def(memcpy) diff --git a/libc/string/arc/memset.S b/libc/string/arc/memset.S new file mode 100644 index 000000000..f4048455a --- /dev/null +++ b/libc/string/arc/memset.S @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + +#include <sysdep.h> + +#define SMALL	7 /* Must be at least 6 to deal with alignment/loop issues.  */ + +ENTRY(memset) + +	mov_s	r4,r0 +	or	r12,r0,r2 +	bmsk.f	r12,r12,1 +	extb_s	r1,r1 +	asl	r3,r1,8 +	beq.d	.Laligned +	or_s	r1,r1,r3 +	brls	r2,SMALL,.Ltiny +	add	r3,r2,r0 +	stb	r1,[r3,-1] +	bclr_s	r3,r3,0 +	stw	r1,[r3,-2] +	bmsk.f	r12,r0,1 +	add_s	r2,r2,r12 +	sub.ne	r2,r2,4 +	stb.ab	r1,[r4,1] +	and	r4,r4,-2 +	stw.ab	r1,[r4,2] +	and	r4,r4,-4 +.Laligned:	; This code address should be aligned for speed. +	asl	r3,r1,16 +	lsr.f	lp_count,r2,2 +	or_s	r1,r1,r3 +	lpne	.Loop_end +	st.ab	r1,[r4,4] +.Loop_end: +	j_s	[blink] + + +	.balign	4 +.Ltiny: +	mov.f	lp_count,r2 +	lpne	.Ltiny_end +	stb.ab	r1,[r4,1] +.Ltiny_end: +	j_s	[blink] +END(memset) +libc_hidden_def(memset) diff --git a/libc/string/arc/strchr.S b/libc/string/arc/strchr.S new file mode 100644 index 000000000..443993589 --- /dev/null +++ b/libc/string/arc/strchr.S @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + +#include <sysdep.h> +#include <features.h> + +/* ARC700 has a relatively long pipeline and branch prediction, so we want +   to avoid branches that are hard to predict.  On the other hand, the +   presence of the norm instruction makes it easier to operate on whole +   words branch-free.  */ + +ENTRY(strchr) +	extb_s	r1,r1 +	asl	r5,r1,8 +	bmsk	r2,r0,1 +	or	r5,r5,r1 +	mov_s	r3,0x01010101 +	breq.d	r2,r0,.Laligned +	asl	r4,r5,16 +	sub_s	r0,r0,r2 +	asl	r7,r2,3 +	ld_s	r2,[r0] +#ifdef __LITTLE_ENDIAN__ +	asl	r7,r3,r7 +#else +	lsr	r7,r3,r7 +#endif +	or	r5,r5,r4 +	ror	r4,r3 +	sub	r12,r2,r7 +	bic_s	r12,r12,r2 +	and	r12,r12,r4 +	brne.d	r12,0,.Lfound0_ua +	xor	r6,r2,r5 +	ld.a	r2,[r0,4] +	sub	r12,r6,r7 +	bic	r12,r12,r6 +#ifdef __LITTLE_ENDIAN__ +	and	r7,r12,r4 +	breq	r7,0,.Loop ; For speed, we want this branch to be unaligned. 
+	b	.Lfound_char ; Likewise this one. +#else +	and	r12,r12,r4 +	breq	r12,0,.Loop ; For speed, we want this branch to be unaligned. +	lsr_s	r12,r12,7 +	bic 	r2,r7,r6 +	b.d	.Lfound_char_b +	and_s	r2,r2,r12 +#endif +; /* We require this code address to be unaligned for speed...  */ +.Laligned: +	ld_s	r2,[r0] +	or	r5,r5,r4 +	ror	r4,r3 +; /* ... so that this code address is aligned, for itself and ...  */ +.Loop: +	sub	r12,r2,r3 +	bic_s	r12,r12,r2 +	and	r12,r12,r4 +	brne.d	r12,0,.Lfound0 +	xor	r6,r2,r5 +	ld.a	r2,[r0,4] +	sub	r12,r6,r3 +	bic	r12,r12,r6 +	and	r7,r12,r4 +	breq	r7,0,.Loop /* ... so that this branch is unaligned.  */ +	; Found searched-for character.  r0 has already advanced to next word. +#ifdef __LITTLE_ENDIAN__ +/* We only need the information about the first matching byte +   (i.e. the least significant matching byte) to be exact, +   hence there is no problem with carry effects.  */ +.Lfound_char: +	sub	r3,r7,1 +	bic	r3,r3,r7 +	norm	r2,r3 +	sub_s	r0,r0,1 +	asr_s	r2,r2,3 +	j.d	[blink] +	sub_s	r0,r0,r2 + +	.balign	4 +.Lfound0_ua: +	mov	r3,r7 +.Lfound0: +	sub	r3,r6,r3 +	bic	r3,r3,r6 +	and	r2,r3,r4 +	or_s	r12,r12,r2 +	sub_s	r3,r12,1 +	bic_s	r3,r3,r12 +	norm	r3,r3 +	add_s	r0,r0,3 +	asr_s	r12,r3,3 +	asl.f	0,r2,r3 +	sub_s	r0,r0,r12 +	j_s.d	[blink] +	mov.pl	r0,0 +#else /* BIG ENDIAN */ +.Lfound_char: +	lsr	r7,r7,7 + +	bic	r2,r7,r6 +.Lfound_char_b: +	norm	r2,r2 +	sub_s	r0,r0,4 +	asr_s	r2,r2,3 +	j.d	[blink] +	add_s	r0,r0,r2 + +.Lfound0_ua: +	mov_s	r3,r7 +.Lfound0: +	asl_s	r2,r2,7 +	or	r7,r6,r4 +	bic_s	r12,r12,r2 +	sub	r2,r7,r3 +	or	r2,r2,r6 +	bic	r12,r2,r12 +	bic.f	r3,r4,r12 +	norm	r3,r3 + +	add.pl	r3,r3,1 +	asr_s	r12,r3,3 +	asl.f	0,r2,r3 +	add_s	r0,r0,r12 +	j_s.d	[blink] +	mov.mi	r0,0 +#endif /* ENDIAN */ +END(strchr) +libc_hidden_def(strchr) + +#ifdef __UCLIBC_SUSV3_LEGACY__ +strong_alias(strchr,index) +#endif diff --git a/libc/string/arc/strcmp.S b/libc/string/arc/strcmp.S new file mode 100644 index 000000000..5a0e56045 --- /dev/null +++ b/libc/string/arc/strcmp.S @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + +#include <features.h> +#include <sysdep.h> + +/* This is optimized primarily for the ARC700. +   It would be possible to speed up the loops by one cycle / word +   respective one cycle / byte by forcing double source 1 alignment, unrolling +   by a factor of two, and speculatively loading the second word / byte of +   source 1; however, that would increase the overhead for loop setup / finish, +   and strcmp might often terminate early.  
*/ + +ENTRY(strcmp) +	or	r2,r0,r1 +	bmsk_s	r2,r2,1 +	brne	r2,0,.Lcharloop +	mov_s	r12,0x01010101 +	ror	r5,r12 +.Lwordloop: +	ld.ab	r2,[r0,4] +	ld.ab	r3,[r1,4] +	nop_s +	sub	r4,r2,r12 +	bic	r4,r4,r2 +	and	r4,r4,r5 +	brne	r4,0,.Lfound0 +	breq	r2,r3,.Lwordloop +#ifdef	__LITTLE_ENDIAN__ +	xor	r0,r2,r3	; mask for difference +	sub_s	r1,r0,1 +	bic_s	r0,r0,r1	; mask for least significant difference bit +	sub	r1,r5,r0 +	xor	r0,r5,r1	; mask for least significant difference byte +	and_s	r2,r2,r0 +	and_s	r3,r3,r0 +#endif /* LITTLE ENDIAN */ +	cmp_s	r2,r3 +	mov_s	r0,1 +	j_s.d	[blink] +	bset.lo	r0,r0,31 + +	.balign	4 +#ifdef __LITTLE_ENDIAN__ +.Lfound0: +	xor	r0,r2,r3	; mask for difference +	or	r0,r0,r4	; or in zero indicator +	sub_s	r1,r0,1 +	bic_s	r0,r0,r1	; mask for least significant difference bit +	sub	r1,r5,r0 +	xor	r0,r5,r1	; mask for least significant difference byte +	and_s	r2,r2,r0 +	and_s	r3,r3,r0 +	sub.f	r0,r2,r3 +	mov.hi	r0,1 +	j_s.d	[blink] +	bset.lo	r0,r0,31 +#else /* BIG ENDIAN */ +	/* The zero-detection above can mis-detect 0x01 bytes as zeroes +	   because of carry-propagateion from a lower significant zero byte. +	   We can compensate for this by checking that bit0 is zero. +	   This compensation is not necessary in the step where we +	   get a low estimate for r2, because in any affected bytes +	   we already have 0x00 or 0x01, which will remain unchanged +	   when bit 7 is cleared.  */ +	.balign	4 +.Lfound0: +	lsr	r0,r4,8 +	lsr_s	r1,r2 +	bic_s	r2,r2,r0	; get low estimate for r2 and get ... +	bic_s	r0,r0,r1	; <this is the adjusted mask for zeros> +	or_s	r3,r3,r0	; ... high estimate r3 so that r2 > r3 will ... +	cmp_s	r3,r2		; ... be independent of trailing garbage +	or_s	r2,r2,r0	; likewise for r3 > r2 +	bic_s	r3,r3,r0 +	rlc	r0,0		; r0 := r2 > r3 ? 1 : 0 +	cmp_s	r2,r3 +	j_s.d	[blink] +	bset.lo	r0,r0,31 +#endif /* ENDIAN */ + +	.balign	4 +.Lcharloop: +	ldb.ab	r2,[r0,1] +	ldb.ab	r3,[r1,1] +	nop_s +	breq	r2,0,.Lcmpend +	breq	r2,r3,.Lcharloop +.Lcmpend: +	j_s.d	[blink] +	sub	r0,r2,r3 +END(strcmp) +libc_hidden_def(strcmp) + +#ifndef __UCLIBC_HAS_LOCALE__ +strong_alias(strcmp,strcoll) +libc_hidden_def(strcoll) +#endif diff --git a/libc/string/arc/strcpy.S b/libc/string/arc/strcpy.S new file mode 100644 index 000000000..241bf3ee6 --- /dev/null +++ b/libc/string/arc/strcpy.S @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + + +#include <sysdep.h> + +/* If dst and src are 4 byte aligned, copy 8 bytes at a time. +   If the src is 4, but not 8 byte aligned, we first read 4 bytes to get +   it 8 byte aligned.  Thus, we can do a little read-ahead, without +   dereferencing a cache line that we should not touch. +   Note that short and long instructions have been scheduled to avoid +   branch stalls. +   The beq_s to r3z could be made unaligned & long to avoid a stall +   there, but the it is not likely to be taken often, and it +   would also be likey to cost an unaligned mispredict at the next call.  
*/ + +ENTRY(strcpy) +	or	r2,r0,r1 +	bmsk_s	r2,r2,1 +	brne.d	r2,0,charloop +	mov_s	r10,r0 +	ld_s	r3,[r1,0] +	mov	r8,0x01010101 +	bbit0.d	r1,2,loop_start +	ror	r12,r8 +	sub	r2,r3,r8 +	bic_s	r2,r2,r3 +	tst_s	r2,r12 +	bne	r3z +	mov_s	r4,r3 +	.balign 4 +loop: +	ld.a	r3,[r1,4] +	st.ab	r4,[r10,4] +loop_start: +	ld.a	r4,[r1,4] +	sub	r2,r3,r8 +	bic_s	r2,r2,r3 +	tst_s	r2,r12 +	bne_s	r3z +	st.ab	r3,[r10,4] +	sub	r2,r4,r8 +	bic	r2,r2,r4 +	tst	r2,r12 +	beq	loop +	mov_s	r3,r4 +#ifdef __LITTLE_ENDIAN__ +r3z:	bmsk.f	r1,r3,7 +	lsr_s	r3,r3,8 +#else +r3z:	lsr.f	r1,r3,24 +	asl_s	r3,r3,8 +#endif +	bne.d	r3z +	stb.ab	r1,[r10,1] +	j_s	[blink] + +	.balign	4 +charloop: +	ldb.ab	r3,[r1,1] + + +	brne.d	r3,0,charloop +	stb.ab	r3,[r10,1] +	j	[blink] +END(strcpy) +libc_hidden_def(strcpy) diff --git a/libc/string/arc/strlen.S b/libc/string/arc/strlen.S new file mode 100644 index 000000000..0b9b93815 --- /dev/null +++ b/libc/string/arc/strlen.S @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + + +#include <sysdep.h> + +ENTRY(strlen) +	or	r3,r0,7 +	ld	r2,[r3,-7] +	ld.a	r6,[r3,-3] +	mov	r4,0x01010101 +	; uses long immediate +#ifdef __LITTLE_ENDIAN__ +	asl_s	r1,r0,3 +	btst_s	r0,2 +	asl	r7,r4,r1 +	ror	r5,r4 +	sub	r1,r2,r7 +	bic_s	r1,r1,r2 +	mov.eq	r7,r4 +	sub	r12,r6,r7 +	bic	r12,r12,r6 +	or.eq	r12,r12,r1 +	and	r12,r12,r5 +	brne	r12,0,.Learly_end +#else /* BIG ENDIAN */ +	ror	r5,r4 +	btst_s	r0,2 +	mov_s	r1,31 +	sub3	r7,r1,r0 +	sub	r1,r2,r4 +	bic_s	r1,r1,r2 +	bmsk	r1,r1,r7 +	sub	r12,r6,r4 +	bic	r12,r12,r6 +	bmsk.ne	r12,r12,r7 +	or.eq	r12,r12,r1 +	and	r12,r12,r5 +	brne	r12,0,.Learly_end +#endif /* ENDIAN */ + +.Loop: +	ld_s	r2,[r3,4] +	ld.a	r6,[r3,8] +	; stall for load result +	sub	r1,r2,r4 +	bic_s	r1,r1,r2 +	sub	r12,r6,r4 +	bic	r12,r12,r6 +	or	r12,r12,r1 +	and	r12,r12,r5 +	breq r12,0,.Loop +.Lend: +	and.f	r1,r1,r5 +	sub.ne	r3,r3,4 +	mov.eq	r1,r12 +#ifdef __LITTLE_ENDIAN__ +	sub_s	r2,r1,1 +	bic_s	r2,r2,r1 +	norm	r1,r2 +	sub_s	r0,r0,3 +	lsr_s	r1,r1,3 +	sub	r0,r3,r0 +	j_s.d	[blink] +	sub	r0,r0,r1 +#else /* BIG ENDIAN */ +	lsr_s	r1,r1,7 +	mov.eq	r2,r6 +	bic_s	r1,r1,r2 +	norm	r1,r1 +	sub	r0,r3,r0 +	lsr_s	r1,r1,3 +	j_s.d	[blink] +	add	r0,r0,r1 +#endif /* ENDIAN */ +.Learly_end: +	b.d	.Lend +	sub_s.ne r1,r1,r1 +END(strlen) +libc_hidden_def(strlen) | 
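
The string routines in this patch (strchr, strcmp, strcpy and strlen) all detect a NUL byte one word at a time with the same three-operation test the assembly spells as sub/bic/and against the constants 0x01010101 and 0x80808080 (the second constant is materialized by the ror of the first). A minimal C sketch of that test, with an illustrative function name that is not part of the patch:

#include <stdint.h>

/* Illustrative only -- the word-at-a-time NUL test that strchr, strcmp,
   strcpy and strlen implement with sub/bic/and.  Returns nonzero iff some
   byte of w is 0x00.  */
static inline uint32_t has_zero_byte(uint32_t w)
{
	return (w - 0x01010101u) & ~w & 0x80808080u;
}

As a yes/no test this is exact, and the least significant flagged byte is always the least significant zero byte, which is what the little-endian paths rely on. Bytes above a zero byte can be flagged spuriously (a 0x01 byte is turned into 0x00 by the borrow), which is the carry-propagation case the big-endian strcmp comment describes and compensates for.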

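The word-versus-byte dispatch at the top of memcmp and memcpy is also worth spelling out, since it explains the 1GB limitation noted in the memcpy comment: the two pointers are OR-ed together and the low two (alignment) bits are shifted to the top of the word, so a single unsigned compare against the length selects the byte-wise path whenever either pointer is misaligned and the count is below 1 GiB. A C sketch under the same caveat (illustrative names, not part of the patch):

#include <stdint.h>

/* Illustrative only -- mirrors "or r3,r0,r1; asl_s r3,r3,30;
   brls r2,r3,.Lcopy_bytewise" at the top of memcpy (memcmp is analogous).
   Returns nonzero when the byte-wise path should be taken.  */
static inline int take_bytewise_path(const void *dst, const void *src,
				     uint32_t n)
{
	uint32_t t = (uint32_t)((uintptr_t)dst | (uintptr_t)src) << 30;

	return n <= t;	/* misaligned pointers make t at least 1 GiB */
}

A count of 1 GiB or more can compare higher than the shifted alignment bits and slip through to the word-wise code even when a pointer is misaligned, which is exactly the unsupported case the memcpy comment rules out.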