diff options
| -rw-r--r-- | libc/string/arc/memcmp.S | 94 | ||||
| -rw-r--r-- | libc/string/arc/memcpy.S | 65 | ||||
| -rw-r--r-- | libc/string/arc/memset.S | 61 | ||||
| -rw-r--r-- | libc/string/arc/strchr.S | 25 | ||||
| -rw-r--r-- | libc/string/arc/strcmp.S | 21 | ||||
| -rw-r--r-- | libc/string/arc/strlen.S | 7 | ||||
| -rw-r--r-- | libc/sysdeps/linux/arc/asm.h | 39 | 
7 files changed, 267 insertions, 45 deletions
diff --git a/libc/string/arc/memcmp.S b/libc/string/arc/memcmp.S index a60757e7a..20122a296 100644 --- a/libc/string/arc/memcmp.S +++ b/libc/string/arc/memcmp.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -17,6 +17,8 @@  #endif  ENTRY(memcmp) + +#if defined(__ARC700__) || defined(__ARCHS__)  	or	r12,r0,r1  	asl_s	r12,r12,30  	sub	r3,r2,1 @@ -149,6 +151,96 @@ ENTRY(memcmp)  .Lnil:  	j_s.d	[blink]  	mov	r0,0 + +#elif (__ARC64_ARCH32__) +	;; Based on Synopsys code from newlib's arc64/memcmp.S +	cmp		r2, 32 +	bls.d	@.L_compare_1_bytes +	mov		r3, r0	; "r0" will be used as return value + +	lsr		r12, r2, 4	; counter for 16-byte chunks +	xor		r13, r13, r13	; the mask showing inequal registers + +.L_compare_16_bytes: +	ld.ab	r4, [r3, +4] +	ld.ab	r5, [r1, +4] +	ld.ab	r6, [r3, +4] +	ld.ab	r7, [r1, +4] +	ld.ab	r8, [r3, +4] +	ld.ab	r9, [r1, +4] +	ld.ab	r10, [r3, +4] +	ld.ab	r11, [r1, +4] +	xor.f	0, r4, r5 +	xor.ne	r13, r13, 0b0001 +	xor.f	0, r6, r7 +	xor.ne	r13, r13, 0b0010 +	xor.f	0, r8, r9 +	xor.ne	r13, r13, 0b0100 +	xor.f	0, r10, r11 +	xor.ne	r13, r13, 0b1000 +	brne	r13, 0, @.L_unequal_find +	dbnz	r12, @.L_compare_16_bytes + +	;; Adjusting the pointers because of the extra loads in the end +	sub		r1, r1, 4 +	sub		r3, r3, 4 +	bmsk_s	  r2, r2, 3	; any remaining bytes to compare + +.L_compare_1_bytes: +	cmp		r2, 0 +	jeq.d	[blink] +	xor_s	r0, r0, r0 + +2: +	ldb.ab	r4, [r3, +1] +	ldb.ab	r5, [r1, +1] +	sub.f	r0, r4, r5 +	jne		[blink] +	dbnz	r2, @2b +	j_s		[blink] + +	;; At this point, we want to find the _first_ comparison that marked the +	;; inequality of "lhs" and "rhs" +.L_unequal_find: +	ffs		r13, r13 +	asl		r13, r13, 2 +	bi		[r13] +.L_unequal_r4r5: +	mov		r1, r4 +	b.d		@.L_diff_byte_in_regs +	mov		r2, r5 +	nop +.L_unequal_r6r7: +	mov		r1, r6 
+	b.d		@.L_diff_byte_in_regs +	mov		r2, r7 +	nop +.L_unequal_r8r9: +	mov		r1, r8 +	b.d		@.L_diff_byte_in_regs +	mov		r2, r9 +	nop +.L_unequal_r10r11: +	mov		r1, r10 +	mov		r2, r11 + +	;; fall-through +	;; If we're here, that means the two operands are not equal. +.L_diff_byte_in_regs: +	xor		r0, r1, r2 +	ffs		r0, r0 +	and		r0, r0, 0x18 +	lsr		r1, r1, r0 +	lsr		r2, r2, r0 +	bmsk_s	r1, r1, 7 +	bmsk_s	r2, r2, 7 +	j_s.d	[blink] +	sub		r0, r1, r2 + +#else +#error "Unsupported ARC CPU type" +#endif +  END(memcmp)  libc_hidden_def(memcmp) diff --git a/libc/string/arc/memcpy.S b/libc/string/arc/memcpy.S index 69d7220b8..153083765 100644 --- a/libc/string/arc/memcpy.S +++ b/libc/string/arc/memcpy.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -7,13 +7,9 @@  #include <sysdep.h> -#if !defined(__ARC700__) && !defined(__ARCHS__) -#error "Neither ARC700 nor ARCHS is defined!" -#endif -  ENTRY(memcpy) -#ifdef __ARC700__ +#if defined(__ARC700__)  /* This memcpy implementation does not support objects of 1GB or larger -     the check for alignment does not work then.  
*/  /* We assume that most sources and destinations are aligned, and @@ -73,9 +69,9 @@ ENTRY(memcpy)  .Lendbloop:  	j_s.d	[blink]  	stb	r12,[r5,0] -#endif /* __ARC700__ */ -#ifdef __ARCHS__ +#elif defined(__ARCHS__) +  #ifdef __LITTLE_ENDIAN__  # define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<  # define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >> @@ -299,7 +295,58 @@ ENTRY(memcpy)  	stb.ab	r6, [r3,1]  .Lcopybytewise_3:  	j	[blink] -#endif /* __ARCHS__ */ + +#elif defined(__ARC64_ARCH32__) +	;; Based on Synopsys code from newlib's arc64/memcpy.S +	lsr.f	r11, r2, 4		; counter for 16-byte chunks +	beq.d	@.L_write_15_bytes +	mov	r3, r0			; work on a copy of "r0" + +.L_write_16_bytes: +#if defined(__ARC64_LL64__) +	ldd.ab	r4, [r1, 8] +	ldd.ab	r6, [r1, 8] +	std.ab	r4, [r3, 8] +	std.ab	r6, [r3, 8] +	dbnz	r11, @.L_write_16_bytes +#else +	ld.ab	r4, [r1, 4] +	ld.ab	r5, [r1, 4] +	ld.ab	r6, [r1, 4] +	ld.ab	r7, [r1, 4] +	st.ab	r4, [r3, 4] +	st.ab	r5, [r3, 4] +	st.ab	r6, [r3, 4] +	dbnz.d	r11, @.L_write_16_bytes +	st.ab	r7, [r3, 4] +#endif +	bmsk_s	r2, r2, 3 + +.L_write_15_bytes: +	bbit0.d	r2, 1, @1f +	lsr	r11, r2, 2 +	ldh.ab	r4, [r1, 2] +	sth.ab	r4, [r3, 2] +1: +	bbit0.d	r2, 0, @1f +	xor	r11, r11, 3 +	ldb.ab	r4, [r1, 1] +	stb.ab	r4, [r3, 1] +1: +	asl	r11, r11, 1 +	bi	[r11] +	ld.ab	r4,[r1, 4] +	st.ab	r4,[r3, 4] +	ld.ab	r4,[r1, 4] +	st.ab	r4,[r3, 4] +	ld	r4,[r1] +	st	r4,[r3] + +	j_s	[blink] + +#else +#error "Unsupported ARC CPU type" +#endif  END(memcpy)  libc_hidden_def(memcpy) diff --git a/libc/string/arc/memset.S b/libc/string/arc/memset.S index 0b74ddc7f..5aa5d6c65 100644 --- a/libc/string/arc/memset.S +++ b/libc/string/arc/memset.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. 
@@ -7,13 +7,9 @@  #include <sysdep.h> -#if !defined(__ARC700__) && !defined(__ARCHS__) -#error "Neither ARC700 nor ARCHS is defined!" -#endif -  ENTRY(memset) -#ifdef __ARC700__ +#if defined(__ARC700__)  #define SMALL	7 /* Must be at least 6 to deal with alignment/loop issues.  */  	mov_s	r4,r0 @@ -52,9 +48,8 @@ ENTRY(memset)  	stb.ab	r1,[r4,1]  .Ltiny_end:  	j_s	[blink] -#endif /* __ARC700__ */ -#ifdef __ARCHS__ +#elif defined(__ARCHS__)  #ifdef DONT_USE_PREALLOC  #define PREWRITE(A,B)	prefetchw [(A),(B)]  #else @@ -156,7 +151,55 @@ ENTRY(memset)  .Lcopy3bytes:  	j	[blink] -#endif /* __ARCHS__ */ + +#elif defined(__ARC64_ARCH32__) +	;; Based on Synopsys code from newlib's arc64/memset.S + +	;; Assemble the bytes to 32bit words +	bmsk_s	r1, r1, 7		; treat it like unsigned char +	lsl8	r3, r1 +	or_s	r1, r1, r3 +	lsl16	r3, r1 +	or	r6, r1, r3 +	mov r7,r6 + +	lsr.f	r5, r2, 4		; counter for 16-byte chunks +	beq.d	@.L_write_15_bytes +	mov	r4, r0			; work on a copy of "r0" + +.L_write_16_bytes: +#if defined(__ARC64_LL64__) +	std.ab	r6, [r4, 8] +	std.ab	r6, [r4, 8] +	dbnz	r5, @.L_write_16_bytes +#else +	st.ab	r6, [r4, 4] +	st.ab	r6, [r4, 4] +	st.ab	r6, [r4, 4] +	dbnz.d	r5, @.L_write_16_bytes +	st.ab	r6, [r4, 4] +#endif +	bmsk_s	r2, r2, 3 + +.L_write_15_bytes: +	bbit0.d	r2, 1, @1f +	lsr	r3, r2, 2 +	sth.ab	r6, [r4, 2] +1: +	bbit0.d	r2, 0, @1f +	xor	r3, r3, 3 +	stb.ab	r6, [r4, 1] +1: +	bi	[r3] +	st.ab	r6,[r4, 4] +	st.ab	r6,[r4, 4] +	st.ab	r6,[r4, 4] + +	j_s	[blink] + +#else +#error "Unsupported ARC CPU type" +#endif  END(memset)  libc_hidden_def(memset) diff --git a/libc/string/arc/strchr.S b/libc/string/arc/strchr.S index 443993589..df25eb3be 100644 --- a/libc/string/arc/strchr.S +++ b/libc/string/arc/strchr.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2022 Synopsys, Inc. 
(www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -7,6 +7,7 @@  #include <sysdep.h>  #include <features.h> +#include <asm.h>  /* ARC700 has a relatively long pipeline and branch prediction, so we want     to avoid branches that are hard to predict.  On the other hand, the @@ -21,7 +22,7 @@ ENTRY(strchr)  	mov_s	r3,0x01010101  	breq.d	r2,r0,.Laligned  	asl	r4,r5,16 -	sub_s	r0,r0,r2 +	SUBR_S	r0,r0,r2  	asl	r7,r2,3  	ld_s	r2,[r0]  #ifdef __LITTLE_ENDIAN__ @@ -77,10 +78,10 @@ ENTRY(strchr)  	sub	r3,r7,1  	bic	r3,r3,r7  	norm	r2,r3 -	sub_s	r0,r0,1 -	asr_s	r2,r2,3 +	SUBR_S	r0,r0,1 +	ASRR_S	r2,r2,3  	j.d	[blink] -	sub_s	r0,r0,r2 +	SUBR_S	r0,r0,r2  	.balign	4  .Lfound0_ua: @@ -90,13 +91,13 @@ ENTRY(strchr)  	bic	r3,r3,r6  	and	r2,r3,r4  	or_s	r12,r12,r2 -	sub_s	r3,r12,1 +	SUBR_S	r3,r12,1  	bic_s	r3,r3,r12  	norm	r3,r3 -	add_s	r0,r0,3 -	asr_s	r12,r3,3 +	ADDR_S	r0,r0,3 +	ASRR_S	r12,r3,3  	asl.f	0,r2,r3 -	sub_s	r0,r0,r12 +	SUBR_S	r0,r0,r12  	j_s.d	[blink]  	mov.pl	r0,0  #else /* BIG ENDIAN */ @@ -106,10 +107,10 @@ ENTRY(strchr)  	bic	r2,r7,r6  .Lfound_char_b:  	norm	r2,r2 -	sub_s	r0,r0,4 +	SUBR_S	r0,r0,4  	asr_s	r2,r2,3  	j.d	[blink] -	add_s	r0,r0,r2 +	ADDR_S	r0,r0,r2  .Lfound0_ua:  	mov_s	r3,r7 @@ -126,7 +127,7 @@ ENTRY(strchr)  	add.pl	r3,r3,1  	asr_s	r12,r3,3  	asl.f	0,r2,r3 -	add_s	r0,r0,r12 +	ADDR_S	r0,r0,r12  	j_s.d	[blink]  	mov.mi	r0,0  #endif /* ENDIAN */ diff --git a/libc/string/arc/strcmp.S b/libc/string/arc/strcmp.S index ad38d9e00..3f64ac421 100644 --- a/libc/string/arc/strcmp.S +++ b/libc/string/arc/strcmp.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. 
@@ -7,14 +7,11 @@  #include <features.h>  #include <sysdep.h> - -#if !defined(__ARC700__) && !defined(__ARCHS__) -#error "Neither ARC700 nor ARCHS is defined!" -#endif +#include <asm.h>  ENTRY(strcmp) -#ifdef __ARC700__ +#if defined(__ARC700__) || defined(__ARC64_ARCH32__)  /* This is optimized primarily for the ARC700.     It would be possible to speed up the loops by one cycle / word     respective one cycle / byte by forcing double source 1 alignment, unrolling @@ -38,7 +35,7 @@ ENTRY(strcmp)  	breq	r2,r3,.Lwordloop  #ifdef	__LITTLE_ENDIAN__  	xor	r0,r2,r3	; mask for difference -	sub_s	r1,r0,1 +	SUBR_S	r1,r0,1  	bic_s	r0,r0,r1	; mask for least significant difference bit  	sub	r1,r5,r0  	xor	r0,r5,r1	; mask for least significant difference byte @@ -55,7 +52,7 @@ ENTRY(strcmp)  .Lfound0:  	xor	r0,r2,r3	; mask for difference  	or	r0,r0,r4	; or in zero indicator -	sub_s	r1,r0,1 +	SUBR_S	r1,r0,1  	bic_s	r0,r0,r1	; mask for least significant difference bit  	sub	r1,r5,r0  	xor	r0,r5,r1	; mask for least significant difference byte @@ -99,9 +96,8 @@ ENTRY(strcmp)  .Lcmpend:  	j_s.d	[blink]  	sub	r0,r2,r3 -#endif /* __ARC700__ */ -#ifdef __ARCHS__ +#elif defined(__ARCHS__)  	or	r2, r0, r1  	bmsk_s	r2, r2, 1  	brne	r2, 0, @.Lcharloop @@ -168,7 +164,10 @@ ENTRY(strcmp)  .Lcmpend:  	j_s.d	[blink]  	sub	r0, r2, r3 -#endif /* __ARCHS__ */ + +#else +#error "Unsupported ARC CPU type" +#endif  END(strcmp)  libc_hidden_def(strcmp) diff --git a/libc/string/arc/strlen.S b/libc/string/arc/strlen.S index 0b9b93815..0d1d3aa4e 100644 --- a/libc/string/arc/strlen.S +++ b/libc/string/arc/strlen.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. 
@@ -7,6 +7,7 @@  #include <sysdep.h> +#include <asm.h>  ENTRY(strlen)  	or	r3,r0,7 @@ -15,7 +16,7 @@  	mov	r4,0x01010101  	; uses long immediate  #ifdef __LITTLE_ENDIAN__ -	asl_s	r1,r0,3 +	ASLR_S	r1,r0,3  	btst_s	r0,2  	asl	r7,r4,r1  	ror	r5,r4 @@ -59,7 +60,7 @@ ENTRY(strlen)  	sub.ne	r3,r3,4  	mov.eq	r1,r12  #ifdef __LITTLE_ENDIAN__ -	sub_s	r2,r1,1 +	SUBR_S	r2,r1,1  	bic_s	r2,r2,r1  	norm	r1,r2  	sub_s	r0,r0,3 diff --git a/libc/sysdeps/linux/arc/asm.h b/libc/sysdeps/linux/arc/asm.h index f15dff841..f83075ea1 100644 --- a/libc/sysdeps/linux/arc/asm.h +++ b/libc/sysdeps/linux/arc/asm.h @@ -7,6 +7,13 @@  #ifndef _ARC_ASM_H  #define _ARC_ASM_H +/* + * Some 16-bit instructions were excluded from the ARCv3 ISA + * the following macros are introduced to handle these changes in one place. + * This will allow not to change existing ARCv2 code and use 16-bit versions + * of instructions for ARCv2 and replace them with 32-bit versions for ARCv3 + */ +  #if defined (__ARC64_ARCH32__) .macro PUSHR reg @@ -25,6 +32,22 @@  	pop	\reg  .endm +.macro SUBR_S dst,src1,src2 +	sub	\dst, \src1, \src2 +.endm + +.macro ADDR_S dst,src1,src2 +	add	\dst, \src1, \src2 +.endm + +.macro ASRR_S dst,src1,src2 +	asr	\dst, \src1, \src2 +.endm + +.macro ASLR_S dst,src1,src2 +	asl	\dst, \src1, \src2 +.endm +  #elif defined (__ARC64_ARCH64__)  # error ARCv3 64-bit is not supported by uClibc-ng @@ -47,6 +70,22 @@  	pop_s	\reg  .endm +.macro SUBR_S dst,src1,src2 +	sub_s	\dst, \src1, \src2 +.endm + +.macro ADDR_S dst,src1,src2 +	add_s	\dst, \src1, \src2 +.endm + +.macro ASRR_S dst,src1,src2 +	asr_s	\dst, \src1, \src2 +.endm + +.macro ASLR_S dst,src1,src2 +	asl_s	\dst, \src1, \src2 +.endm +  #endif  #endif /* _ARC_ASM_H  */  | 
