Diffstat (limited to 'libc/string')
| -rw-r--r-- | libc/string/arc/memcmp.S | 94 |
| -rw-r--r-- | libc/string/arc/memcpy.S | 65 |
| -rw-r--r-- | libc/string/arc/memset.S | 61 |
| -rw-r--r-- | libc/string/arc/strchr.S | 25 |
| -rw-r--r-- | libc/string/arc/strcmp.S | 29 |
| -rw-r--r-- | libc/string/arc/strlen.S | 7 |
| -rw-r--r-- | libc/string/arm/memset.S | 2 |
| -rw-r--r-- | libc/string/explicit_bzero.c | 30 |
| -rw-r--r-- | libc/string/generic/strchr.c | 23 |
| -rw-r--r-- | libc/string/generic/strchrnul.c | 23 |
| -rw-r--r-- | libc/string/generic/strlen.c | 21 |
| -rw-r--r-- | libc/string/generic/strnlen.c | 6 |
| -rw-r--r-- | libc/string/kvx/Makefile | 13 |
| -rw-r--r-- | libc/string/kvx/memcpy.S | 221 |
| -rw-r--r-- | libc/string/kvx/memset.S | 146 |
| -rw-r--r-- | libc/string/strcasestr.c | 2 |
| -rw-r--r-- | libc/string/strstr.c | 2 |
| -rw-r--r-- | libc/string/x86_64/strcat.S | 2 |
| -rw-r--r-- | libc/string/x86_64/strcspn.S | 2 |
| -rw-r--r-- | libc/string/x86_64/strlen.S | 2 |
| -rw-r--r-- | libc/string/x86_64/strspn.S | 2 |
21 files changed, 675 insertions, 103 deletions
| diff --git a/libc/string/arc/memcmp.S b/libc/string/arc/memcmp.S index a60757e7a..20122a296 100644 --- a/libc/string/arc/memcmp.S +++ b/libc/string/arc/memcmp.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -17,6 +17,8 @@  #endif  ENTRY(memcmp) + +#if defined(__ARC700__) || defined(__ARCHS__)  	or	r12,r0,r1  	asl_s	r12,r12,30  	sub	r3,r2,1 @@ -149,6 +151,96 @@ ENTRY(memcmp)  .Lnil:  	j_s.d	[blink]  	mov	r0,0 + +#elif (__ARC64_ARCH32__) +	;; Based on Synopsys code from newlib's arc64/memcmp.S +	cmp		r2, 32 +	bls.d	@.L_compare_1_bytes +	mov		r3, r0	; "r0" will be used as return value + +	lsr		r12, r2, 4	; counter for 16-byte chunks +	xor		r13, r13, r13	; the mask showing inequal registers + +.L_compare_16_bytes: +	ld.ab	r4, [r3, +4] +	ld.ab	r5, [r1, +4] +	ld.ab	r6, [r3, +4] +	ld.ab	r7, [r1, +4] +	ld.ab	r8, [r3, +4] +	ld.ab	r9, [r1, +4] +	ld.ab	r10, [r3, +4] +	ld.ab	r11, [r1, +4] +	xor.f	0, r4, r5 +	xor.ne	r13, r13, 0b0001 +	xor.f	0, r6, r7 +	xor.ne	r13, r13, 0b0010 +	xor.f	0, r8, r9 +	xor.ne	r13, r13, 0b0100 +	xor.f	0, r10, r11 +	xor.ne	r13, r13, 0b1000 +	brne	r13, 0, @.L_unequal_find +	dbnz	r12, @.L_compare_16_bytes + +	;; Adjusting the pointers because of the extra loads in the end +	sub		r1, r1, 4 +	sub		r3, r3, 4 +	bmsk_s	  r2, r2, 3	; any remaining bytes to compare + +.L_compare_1_bytes: +	cmp		r2, 0 +	jeq.d	[blink] +	xor_s	r0, r0, r0 + +2: +	ldb.ab	r4, [r3, +1] +	ldb.ab	r5, [r1, +1] +	sub.f	r0, r4, r5 +	jne		[blink] +	dbnz	r2, @2b +	j_s		[blink] + +	;; At this point, we want to find the _first_ comparison that marked the +	;; inequality of "lhs" and "rhs" +.L_unequal_find: +	ffs		r13, r13 +	asl		r13, r13, 2 +	bi		[r13] +.L_unequal_r4r5: +	mov		r1, r4 +	b.d		@.L_diff_byte_in_regs +	mov		r2, r5 +	nop +.L_unequal_r6r7: +	mov		r1, r6 +	b.d		@.L_diff_byte_in_regs +	mov		r2, r7 +	nop +.L_unequal_r8r9: +	mov		r1, r8 +	b.d		@.L_diff_byte_in_regs +	mov		r2, r9 +	nop +.L_unequal_r10r11: +	mov		r1, r10 +	mov		r2, r11 + +	;; fall-through +	;; If we're here, that means the two operands are not equal. +.L_diff_byte_in_regs: +	xor		r0, r1, r2 +	ffs		r0, r0 +	and		r0, r0, 0x18 +	lsr		r1, r1, r0 +	lsr		r2, r2, r0 +	bmsk_s	r1, r1, 7 +	bmsk_s	r2, r2, 7 +	j_s.d	[blink] +	sub		r0, r1, r2 + +#else +#error "Unsupported ARC CPU type" +#endif +  END(memcmp)  libc_hidden_def(memcmp) diff --git a/libc/string/arc/memcpy.S b/libc/string/arc/memcpy.S index 69d7220b8..153083765 100644 --- a/libc/string/arc/memcpy.S +++ b/libc/string/arc/memcpy.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -7,13 +7,9 @@  #include <sysdep.h> -#if !defined(__ARC700__) && !defined(__ARCHS__) -#error "Neither ARC700 nor ARCHS is defined!" -#endif -  ENTRY(memcpy) -#ifdef __ARC700__ +#if defined(__ARC700__)  /* This memcpy implementation does not support objects of 1GB or larger -     the check for alignment does not work then.  
*/  /* We assume that most sources and destinations are aligned, and @@ -73,9 +69,9 @@ ENTRY(memcpy)  .Lendbloop:  	j_s.d	[blink]  	stb	r12,[r5,0] -#endif /* __ARC700__ */ -#ifdef __ARCHS__ +#elif defined(__ARCHS__) +  #ifdef __LITTLE_ENDIAN__  # define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<  # define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >> @@ -299,7 +295,58 @@ ENTRY(memcpy)  	stb.ab	r6, [r3,1]  .Lcopybytewise_3:  	j	[blink] -#endif /* __ARCHS__ */ + +#elif defined(__ARC64_ARCH32__) +	;; Based on Synopsys code from newlib's arc64/memcpy.S +	lsr.f	r11, r2, 4		; counter for 16-byte chunks +	beq.d	@.L_write_15_bytes +	mov	r3, r0			; work on a copy of "r0" + +.L_write_16_bytes: +#if defined(__ARC64_LL64__) +	ldd.ab	r4, [r1, 8] +	ldd.ab	r6, [r1, 8] +	std.ab	r4, [r3, 8] +	std.ab	r6, [r3, 8] +	dbnz	r11, @.L_write_16_bytes +#else +	ld.ab	r4, [r1, 4] +	ld.ab	r5, [r1, 4] +	ld.ab	r6, [r1, 4] +	ld.ab	r7, [r1, 4] +	st.ab	r4, [r3, 4] +	st.ab	r5, [r3, 4] +	st.ab	r6, [r3, 4] +	dbnz.d	r11, @.L_write_16_bytes +	st.ab	r7, [r3, 4] +#endif +	bmsk_s	r2, r2, 3 + +.L_write_15_bytes: +	bbit0.d	r2, 1, @1f +	lsr	r11, r2, 2 +	ldh.ab	r4, [r1, 2] +	sth.ab	r4, [r3, 2] +1: +	bbit0.d	r2, 0, @1f +	xor	r11, r11, 3 +	ldb.ab	r4, [r1, 1] +	stb.ab	r4, [r3, 1] +1: +	asl	r11, r11, 1 +	bi	[r11] +	ld.ab	r4,[r1, 4] +	st.ab	r4,[r3, 4] +	ld.ab	r4,[r1, 4] +	st.ab	r4,[r3, 4] +	ld	r4,[r1] +	st	r4,[r3] + +	j_s	[blink] + +#else +#error "Unsupported ARC CPU type" +#endif  END(memcpy)  libc_hidden_def(memcpy) diff --git a/libc/string/arc/memset.S b/libc/string/arc/memset.S index 0b74ddc7f..5aa5d6c65 100644 --- a/libc/string/arc/memset.S +++ b/libc/string/arc/memset.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -7,13 +7,9 @@  #include <sysdep.h> -#if !defined(__ARC700__) && !defined(__ARCHS__) -#error "Neither ARC700 nor ARCHS is defined!" -#endif -  ENTRY(memset) -#ifdef __ARC700__ +#if defined(__ARC700__)  #define SMALL	7 /* Must be at least 6 to deal with alignment/loop issues.  
*/  	mov_s	r4,r0 @@ -52,9 +48,8 @@ ENTRY(memset)  	stb.ab	r1,[r4,1]  .Ltiny_end:  	j_s	[blink] -#endif /* __ARC700__ */ -#ifdef __ARCHS__ +#elif defined(__ARCHS__)  #ifdef DONT_USE_PREALLOC  #define PREWRITE(A,B)	prefetchw [(A),(B)]  #else @@ -156,7 +151,55 @@ ENTRY(memset)  .Lcopy3bytes:  	j	[blink] -#endif /* __ARCHS__ */ + +#elif defined(__ARC64_ARCH32__) +	;; Based on Synopsys code from newlib's arc64/memset.S + +	;; Assemble the bytes to 32bit words +	bmsk_s	r1, r1, 7		; treat it like unsigned char +	lsl8	r3, r1 +	or_s	r1, r1, r3 +	lsl16	r3, r1 +	or	r6, r1, r3 +	mov r7,r6 + +	lsr.f	r5, r2, 4		; counter for 16-byte chunks +	beq.d	@.L_write_15_bytes +	mov	r4, r0			; work on a copy of "r0" + +.L_write_16_bytes: +#if defined(__ARC64_LL64__) +	std.ab	r6, [r4, 8] +	std.ab	r6, [r4, 8] +	dbnz	r5, @.L_write_16_bytes +#else +	st.ab	r6, [r4, 4] +	st.ab	r6, [r4, 4] +	st.ab	r6, [r4, 4] +	dbnz.d	r5, @.L_write_16_bytes +	st.ab	r6, [r4, 4] +#endif +	bmsk_s	r2, r2, 3 + +.L_write_15_bytes: +	bbit0.d	r2, 1, @1f +	lsr	r3, r2, 2 +	sth.ab	r6, [r4, 2] +1: +	bbit0.d	r2, 0, @1f +	xor	r3, r3, 3 +	stb.ab	r6, [r4, 1] +1: +	bi	[r3] +	st.ab	r6,[r4, 4] +	st.ab	r6,[r4, 4] +	st.ab	r6,[r4, 4] + +	j_s	[blink] + +#else +#error "Unsupported ARC CPU type" +#endif  END(memset)  libc_hidden_def(memset) diff --git a/libc/string/arc/strchr.S b/libc/string/arc/strchr.S index 443993589..df25eb3be 100644 --- a/libc/string/arc/strchr.S +++ b/libc/string/arc/strchr.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -7,6 +7,7 @@  #include <sysdep.h>  #include <features.h> +#include <asm.h>  /* ARC700 has a relatively long pipeline and branch prediction, so we want     to avoid branches that are hard to predict.  On the other hand, the @@ -21,7 +22,7 @@ ENTRY(strchr)  	mov_s	r3,0x01010101  	breq.d	r2,r0,.Laligned  	asl	r4,r5,16 -	sub_s	r0,r0,r2 +	SUBR_S	r0,r0,r2  	asl	r7,r2,3  	ld_s	r2,[r0]  #ifdef __LITTLE_ENDIAN__ @@ -77,10 +78,10 @@ ENTRY(strchr)  	sub	r3,r7,1  	bic	r3,r3,r7  	norm	r2,r3 -	sub_s	r0,r0,1 -	asr_s	r2,r2,3 +	SUBR_S	r0,r0,1 +	ASRR_S	r2,r2,3  	j.d	[blink] -	sub_s	r0,r0,r2 +	SUBR_S	r0,r0,r2  	.balign	4  .Lfound0_ua: @@ -90,13 +91,13 @@ ENTRY(strchr)  	bic	r3,r3,r6  	and	r2,r3,r4  	or_s	r12,r12,r2 -	sub_s	r3,r12,1 +	SUBR_S	r3,r12,1  	bic_s	r3,r3,r12  	norm	r3,r3 -	add_s	r0,r0,3 -	asr_s	r12,r3,3 +	ADDR_S	r0,r0,3 +	ASRR_S	r12,r3,3  	asl.f	0,r2,r3 -	sub_s	r0,r0,r12 +	SUBR_S	r0,r0,r12  	j_s.d	[blink]  	mov.pl	r0,0  #else /* BIG ENDIAN */ @@ -106,10 +107,10 @@ ENTRY(strchr)  	bic	r2,r7,r6  .Lfound_char_b:  	norm	r2,r2 -	sub_s	r0,r0,4 +	SUBR_S	r0,r0,4  	asr_s	r2,r2,3  	j.d	[blink] -	add_s	r0,r0,r2 +	ADDR_S	r0,r0,r2  .Lfound0_ua:  	mov_s	r3,r7 @@ -126,7 +127,7 @@ ENTRY(strchr)  	add.pl	r3,r3,1  	asr_s	r12,r3,3  	asl.f	0,r2,r3 -	add_s	r0,r0,r12 +	ADDR_S	r0,r0,r12  	j_s.d	[blink]  	mov.mi	r0,0  #endif /* ENDIAN */ diff --git a/libc/string/arc/strcmp.S b/libc/string/arc/strcmp.S index ad38d9e00..48d2d7ec1 100644 --- a/libc/string/arc/strcmp.S +++ b/libc/string/arc/strcmp.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. 
@@ -7,14 +7,11 @@  #include <features.h>  #include <sysdep.h> - -#if !defined(__ARC700__) && !defined(__ARCHS__) -#error "Neither ARC700 nor ARCHS is defined!" -#endif +#include <asm.h>  ENTRY(strcmp) -#ifdef __ARC700__ +#if defined(__ARC700__) || defined(__ARC64_ARCH32__)  /* This is optimized primarily for the ARC700.     It would be possible to speed up the loops by one cycle / word     respective one cycle / byte by forcing double source 1 alignment, unrolling @@ -38,7 +35,7 @@ ENTRY(strcmp)  	breq	r2,r3,.Lwordloop  #ifdef	__LITTLE_ENDIAN__  	xor	r0,r2,r3	; mask for difference -	sub_s	r1,r0,1 +	SUBR_S	r1,r0,1  	bic_s	r0,r0,r1	; mask for least significant difference bit  	sub	r1,r5,r0  	xor	r0,r5,r1	; mask for least significant difference byte @@ -55,7 +52,7 @@ ENTRY(strcmp)  .Lfound0:  	xor	r0,r2,r3	; mask for difference  	or	r0,r0,r4	; or in zero indicator -	sub_s	r1,r0,1 +	SUBR_S	r1,r0,1  	bic_s	r0,r0,r1	; mask for least significant difference bit  	sub	r1,r5,r0  	xor	r0,r5,r1	; mask for least significant difference byte @@ -99,31 +96,28 @@ ENTRY(strcmp)  .Lcmpend:  	j_s.d	[blink]  	sub	r0,r2,r3 -#endif /* __ARC700__ */ -#ifdef __ARCHS__ +#elif defined(__ARCHS__)  	or	r2, r0, r1  	bmsk_s	r2, r2, 1  	brne	r2, 0, @.Lcharloop  ;;; s1 and s2 are word aligned -	ld.ab	r2, [r0, 4]  	mov_s	r12, 0x01010101  	ror	r11, r12  	.align  4  .LwordLoop: +	ld.ab	r2, [r0, 4] +	sub	r4, r2, r12  	ld.ab	r3, [r1, 4]  	;; Detect NULL char in str1 -	sub	r4, r2, r12 -	ld.ab	r5, [r0, 4]  	bic	r4, r4, r2  	and	r4, r4, r11  	brne.d.nt	r4, 0, .LfoundNULL  	;; Check if the read locations are the same  	cmp	r2, r3 -	beq.d	.LwordLoop -	mov.eq	r2, r5 +	beq	.LwordLoop  	;; A match is found, spot it out  #ifdef __LITTLE_ENDIAN__ @@ -168,7 +162,10 @@ ENTRY(strcmp)  .Lcmpend:  	j_s.d	[blink]  	sub	r0, r2, r3 -#endif /* __ARCHS__ */ + +#else +#error "Unsupported ARC CPU type" +#endif  END(strcmp)  libc_hidden_def(strcmp) diff --git a/libc/string/arc/strlen.S b/libc/string/arc/strlen.S index 0b9b93815..0d1d3aa4e 100644 --- a/libc/string/arc/strlen.S +++ b/libc/string/arc/strlen.S @@ -1,5 +1,5 @@  /* - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)   * Copyright (C) 2007 ARC International (UK) LTD   *   * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -7,6 +7,7 @@  #include <sysdep.h> +#include <asm.h>  ENTRY(strlen)  	or	r3,r0,7 @@ -15,7 +16,7 @@ ENTRY(strlen)  	mov	r4,0x01010101  	; uses long immediate  #ifdef __LITTLE_ENDIAN__ -	asl_s	r1,r0,3 +	ASLR_S	r1,r0,3  	btst_s	r0,2  	asl	r7,r4,r1  	ror	r5,r4 @@ -59,7 +60,7 @@ ENTRY(strlen)  	sub.ne	r3,r3,4  	mov.eq	r1,r12  #ifdef __LITTLE_ENDIAN__ -	sub_s	r2,r1,1 +	SUBR_S	r2,r1,1  	bic_s	r2,r2,r1  	norm	r1,r2  	sub_s	r0,r0,3 diff --git a/libc/string/arm/memset.S b/libc/string/arm/memset.S index 412270f50..29c583f16 100644 --- a/libc/string/arm/memset.S +++ b/libc/string/arm/memset.S @@ -32,6 +32,7 @@ memset:  	cmp	r2, #8		@ at least 8 bytes to do?  	bcc	2f +	and	r1, r1, #0xFF  	lsl	r3, r1, #8  	orr	r1, r3  	lsl	r3, r1, #16 @@ -68,6 +69,7 @@ memset:  	mov	a4, a1  	cmp	a3, $8		@ at least 8 bytes to do?  	blo	2f +	and	a2, a2, #0xFF  	orr	a2, a2, a2, lsl $8  	orr	a2, a2, a2, lsl $16  1: diff --git a/libc/string/explicit_bzero.c b/libc/string/explicit_bzero.c new file mode 100644 index 000000000..b09e4c1f4 --- /dev/null +++ b/libc/string/explicit_bzero.c @@ -0,0 +1,30 @@ +/* +Copyright © 2005-2020 Rich Felker, et al. 
+ +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ +#define _BSD_SOURCE +#include <string.h> + +void explicit_bzero(void *d, size_t n) +{ +	d = memset(d, 0, n); +	__asm__ __volatile__ ("" : : "r"(d) : "memory"); +} diff --git a/libc/string/generic/strchr.c b/libc/string/generic/strchr.c index 321d2b8c3..b34884d67 100644 --- a/libc/string/generic/strchr.c +++ b/libc/string/generic/strchr.c @@ -60,22 +60,19 @@ char *strchr (const char *s, int c_in)       The 1-bits make sure that carries propagate to the next 0-bit.       The 0-bits provide holes for carries to fall into.  */ -  switch (sizeof (longword)) -    { -    case 4: magic_bits = 0x7efefeffL; break; -    case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break; -    default: -      abort (); -    } -    /* Set up a longword, each of whose bytes is C.  */ +#if __WORDSIZE == 32 +  magic_bits = 0x7efefeffL;    charmask = c | (c << 8);    charmask |= charmask << 16; -  if (sizeof (longword) > 4) -    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */ -    charmask |= (charmask << 16) << 16; -  if (sizeof (longword) > 8) -    abort (); +#elif __WORDSIZE == 64 +  magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; +  charmask = c | (c << 8); +  charmask |= charmask << 16; +  charmask |= (charmask << 16) << 16; +#else +  #error unexpected integer size strchr() +#endif    /* Instead of the traditional loop which tests each character,       we will test a longword at a time.  The tricky part is testing diff --git a/libc/string/generic/strchrnul.c b/libc/string/generic/strchrnul.c index d11d9e00d..d9fadc776 100644 --- a/libc/string/generic/strchrnul.c +++ b/libc/string/generic/strchrnul.c @@ -59,22 +59,19 @@ char *strchrnul (const char *s, int c_in)       The 1-bits make sure that carries propagate to the next 0-bit.       The 0-bits provide holes for carries to fall into.  */ -  switch (sizeof (longword)) -    { -    case 4: magic_bits = 0x7efefeffL; break; -    case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break; -    default: -      abort (); -    } -  /* Set up a longword, each of whose bytes is C.  */ +#if __WORDSIZE == 32 +  magic_bits = 0x7efefeffL;    charmask = c | (c << 8);    charmask |= charmask << 16; -  if (sizeof (longword) > 4) -    /* Do the shift in two steps to avoid a warning if long has 32 bits.  
*/ -    charmask |= (charmask << 16) << 16; -  if (sizeof (longword) > 8) -    abort (); +#elif __WORDSIZE == 64 +  magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; +  charmask = c | (c << 8); +  charmask |= charmask << 16; +  charmask |= (charmask << 16) << 16; +#else +  #error unexpected integer size strchr() +#endif    /* Instead of the traditional loop which tests each character,       we will test a longword at a time.  The tricky part is testing diff --git a/libc/string/generic/strlen.c b/libc/string/generic/strlen.c index dc383398b..dcc032ddc 100644 --- a/libc/string/generic/strlen.c +++ b/libc/string/generic/strlen.c @@ -28,7 +28,7 @@ size_t strlen (const char *str)  {    const char *char_ptr;    const unsigned long int *longword_ptr; -  unsigned long int longword, magic_bits, himagic, lomagic; +  unsigned long int longword, himagic, lomagic;    /* Handle the first few characters by reading one character at a time.       Do this until CHAR_PTR is aligned on a longword boundary.  */ @@ -52,14 +52,12 @@ size_t strlen (const char *str)       The 1-bits make sure that carries propagate to the next 0-bit.       The 0-bits provide holes for carries to fall into.  */ -  magic_bits = 0x7efefeffL;    himagic = 0x80808080L;    lomagic = 0x01010101L;    if (sizeof (longword) > 4)      {        /* 64-bit version of the magic.  */        /* Do the shift in two steps to avoid a warning if long has 32 bits.  */ -      magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL;        himagic = ((himagic << 16) << 16) | himagic;        lomagic = ((lomagic << 16) << 16) | lomagic;      } @@ -102,22 +100,7 @@ size_t strlen (const char *str)        longword = *longword_ptr++; -      if ( -#if 0 -	  /* Add MAGIC_BITS to LONGWORD.  */ -	  (((longword + magic_bits) - -	    /* Set those bits that were unchanged by the addition.  */ -	    ^ ~longword) - -	   /* Look at only the hole bits.  If any of the hole bits -	      are unchanged, most likely one of the bytes was a -	      zero.  */ -	   & ~magic_bits) -#else -	  ((longword - lomagic) & himagic) -#endif -	  != 0) +      if (((longword - lomagic) & himagic) != 0)  	{  	  /* Which of the bytes was the zero?  If none of them were, it was  	     a misfire; continue the search.  */ diff --git a/libc/string/generic/strnlen.c b/libc/string/generic/strnlen.c index 4d4cde84f..82d4122ec 100644 --- a/libc/string/generic/strnlen.c +++ b/libc/string/generic/strnlen.c @@ -29,15 +29,17 @@     '\0' terminator is found in that many characters, return MAXLEN.  */  size_t strnlen (const char *str, size_t maxlen)  { -  const char *char_ptr, *end_ptr = str + maxlen; +  const char *char_ptr, *end_ptr;    const unsigned long int *longword_ptr;    unsigned long int longword, himagic, lomagic;    if (maxlen == 0)      return 0; -  if (__builtin_expect (end_ptr < str, 0)) +  if (__builtin_expect ((uintptr_t)str + maxlen < (uintptr_t)str, 0))      end_ptr = (const char *) ~0UL; +  else +    end_ptr = str + maxlen;    /* Handle the first few characters by reading one character at a time.       Do this until CHAR_PTR is aligned on a longword boundary.  */ diff --git a/libc/string/kvx/Makefile b/libc/string/kvx/Makefile new file mode 100644 index 000000000..0a95346fd --- /dev/null +++ b/libc/string/kvx/Makefile @@ -0,0 +1,13 @@ +# Makefile for uClibc +# +# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org> +# +# Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. 
+# + +top_srcdir:=../../../ +top_builddir:=../../../ +all: objs +include $(top_builddir)Rules.mak +include ../Makefile.in +include $(top_srcdir)Makerules diff --git a/libc/string/kvx/memcpy.S b/libc/string/kvx/memcpy.S new file mode 100644 index 000000000..70e8db910 --- /dev/null +++ b/libc/string/kvx/memcpy.S @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2020 Kalray Inc. + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB + * in this tarball. + */ + +#include <sysdep.h> + +.align 16 +ENTRY(memcpy) +	cb.deqz $r2? .Lreturn +	compd.geu $r3 = $r2, 256 +	copyd $r6 = $r0 +	;; +	cb.deqz $r3? .Lremaining_256 +	;; +	lq.u $r32r33 = 0[$r1] +	addd $r2 = $r2, -256 +	;; +	lq.u $r34r35 = 16[$r1] +	;; +	lq.u $r36r37 = 32[$r1] +	srld $r7 = $r2, 8 +	;; +	lq.u $r38r39 = 48[$r1] +	;; +	lq.u $r40r41 = 64[$r1] +	;; +	lq.u $r42r43 = 80[$r1] +	;; +	lq.u $r44r45 = 96[$r1] +	;; +	lq.u $r46r47 = 112[$r1] +	;; +	lq.u $r48r49 = 128[$r1] +	;; +	lq.u $r50r51 = 144[$r1] +	;; +	lq.u $r52r53 = 160[$r1] +	;; +	lq.u $r54r55 = 176[$r1] +	;; +	lq.u $r56r57 = 192[$r1] +	;; +	lq.u $r58r59 = 208[$r1] +	compd.geu $r3 = $r2, 256 +	;; +	lq.u $r60r61 = 224[$r1] +	;; +	lq.u $r62r63 = 240[$r1] +	addd $r1 = $r1, 256 +	;; +	cb.deqz $r7? .Lstreaming_loop_end +	;; +	loopdo $r7, .Lstreaming_loop_end +		;; +		sq 0[$r0] = $r32r33 +		addd $r2 = $r2, -256 +		;; +		lq.u $r32r33 = 0[$r1] +		;; +		sq 16[$r0] = $r34r35 +		;; +		lq.u $r34r35 = 16[$r1] +		;; +		sq 32[$r0] = $r36r37 +		;; +		lq.u $r36r37 = 32[$r1] +		;; +		sq 48[$r0] = $r38r39 +		;; +		lq.u $r38r39 = 48[$r1] +		;; +		sq 64[$r0] = $r40r41 +		;; +		lq.u $r40r41 = 64[$r1] +		;; +		sq 80[$r0] = $r42r43 +		;; +		lq.u $r42r43 = 80[$r1] +		;; +		sq 96[$r0] = $r44r45 +		;; +		lq.u $r44r45 = 96[$r1] +		;; +		sq 112[$r0] = $r46r47 +		;; +		lq.u $r46r47 = 112[$r1] +		;; +		sq 128[$r0] = $r48r49 +		;; +		lq.u $r48r49 = 128[$r1] +		;; +		sq 144[$r0] = $r50r51 +		;; +		lq.u $r50r51 = 144[$r1] +		;; +		sq 160[$r0] = $r52r53 +		;; +		lq.u $r52r53 = 160[$r1] +		;; +		sq 176[$r0] = $r54r55 +		;; +		lq.u $r54r55 = 176[$r1] +		;; +		sq 192[$r0] = $r56r57 +		;; +		lq.u $r56r57 = 192[$r1] +		;; +		sq 208[$r0] = $r58r59 +		;; +		lq.u $r58r59 = 208[$r1] +		;; +		sq 224[$r0] = $r60r61 +		;; +		lq.u $r60r61 = 224[$r1] +		;; +		sq 240[$r0] = $r62r63 +		addd $r0 = $r0, 256 +		;; +		lq.u $r62r63 = 240[$r1] +		addd $r1 = $r1, 256 +		;; +	.Lstreaming_loop_end: +	sq 0[$r0] = $r32r33 +	;; +	sq 16[$r0] = $r34r35 +	;; +	sq 32[$r0] = $r36r37 +	;; +	sq 48[$r0] = $r38r39 +	;; +	sq 64[$r0] = $r40r41 +	;; +	sq 80[$r0] = $r42r43 +	;; +	sq 96[$r0] = $r44r45 +	;; +	sq 112[$r0] = $r46r47 +	;; +	sq 128[$r0] = $r48r49 +	;; +	sq 144[$r0] = $r50r51 +	;; +	sq 160[$r0] = $r52r53 +	;; +	sq 176[$r0] = $r54r55 +	;; +	sq 192[$r0] = $r56r57 +	;; +	sq 208[$r0] = $r58r59 +	;; +	sq 224[$r0] = $r60r61 +	;; +	sq 240[$r0] = $r62r63 +	addd $r0 = $r0, 256 +	;; +.Lremaining_256: +	andd $r11 = $r2, 16 +	srld $r7 = $r2, 5 +	;; +	cb.deqz $r7? .Lloop_32_end +	;; +	loopdo $r7, .Lloop_32_end +		;; +		lo $r32r33r34r35 = 0[$r1] +		addd $r1 = $r1, 32 +		addd $r2 = $r2, -32 +		;; +		so 0[$r0] = $r32r33r34r35 +		addd $r0 = $r0, 32 +		;; +	.Lloop_32_end: +	andd $r10 = $r2, 8 +	andd $r9 = $r2, 4 +	cb.deqz $r11? .Lloop_remaining_16 +	lq.u.dnez $r11? $r32r33 = 0[$r1] +	;; +	sq 0[$r0] = $r32r33 +	addd $r1 = $r1, 16 +	addd $r0 = $r0, 16 +	;; +.Lloop_remaining_16: +	andd $r8 = $r2, 2 +	andd $r7 = $r2, 1 +	cb.deqz $r10? .Lloop_remaining_8 +	ld.dnez $r10? 
$r32 = 0[$r1] +	;; +	sd 0[$r0] = $r32 +	addd $r1 = $r1, 8 +	addd $r0 = $r0, 8 +	;; +.Lloop_remaining_8: +	cb.deqz $r9? .Lloop_remaining_4 +	lwz.dnez $r9? $r32 = 0[$r1] +	;; +	sw 0[$r0] = $r32 +	addd $r1 = $r1, 4 +	addd $r0 = $r0, 4 +	;; +.Lloop_remaining_4: +	cb.deqz $r8? .Lloop_remaining_2 +	lhz.dnez $r8? $r32 = 0[$r1] +	;; +	sh 0[$r0] = $r32 +	addd $r1 = $r1, 2 +	addd $r0 = $r0, 2 +	;; +.Lloop_remaining_2: +	lbz.dnez $r7? $r32 = 0[$r1] +	;; +	sb.dnez $r7? 0[$r0] = $r32 +	;; +.Lreturn: +	copyd $r0 = $r6 +	ret +	;; +END(memcpy) + +libc_hidden_def(memcpy) diff --git a/libc/string/kvx/memset.S b/libc/string/kvx/memset.S new file mode 100644 index 000000000..45023a68f --- /dev/null +++ b/libc/string/kvx/memset.S @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2019 Kalray Inc. + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB + * in this tarball. + */ + +#define REPLICATE_BYTE_MASK	0x0101010101010101 +#define MIN_SIZE_FOR_ALIGN	128 + +/* + * Optimized memset for kvx architecture + * + * In order to optimize memset on kvx, we can use various things: + * - conditionnal store which avoid branch penalty + * - store half/word/double/quad/octuple to store up to 16 bytes at a time + * - hardware loop for steady cases. + * + * First,  we start by checking if the size is below a minimum size. If so, we + * skip the alignment part. Indeed, the kvx supports misalignment and the + * penalty for letting it do unaligned accesses is lower than trying to + * realigning us. So for small sizes, we don't even bother to realign. + * In order to create the 64 bits pattern, we use sbmm to replicate the pattern + * on all bits on a register in one call. + * Once alignment has been reached, we can do the hardware loop using store + * octuple in order to optimize throughput. Care must be taken to align hardware + * loops on at least 8 bytes for performances. + * Once the main loop has been done, we finish the copy by checking length to do + * the necessary calls to store remaining bytes. + */ + +#include <sysdep.h> + +.align 16 +ENTRY(memset) +	/* Preserve return value */ +	copyd $r3 = $r0 +	/* Replicate the first pattern byte on all bytes */ +	sbmm8 $r32 = $r1, REPLICATE_BYTE_MASK +	/* Check if length < MIN_SIZE_FOR_ALIGN */ +	compd.geu $r7 = $r2, MIN_SIZE_FOR_ALIGN +	/* Invert address to compute what we need to copy to be aligned on 32 bytes */ +	negd $r5 = $r0 +	;; +	/* Check if we are aligned on 32 bytes */ +	andw $r9 = $r0, 0x1F +	/* Compute the length that will be copied to align on 32 bytes boundary */ +	andw $r6 = $r5, 0x1F +	/* +	 * If size < MIN_SIZE_FOR_ALIGN bits, directly go to so, it will be done +	 * unaligned but that is still better that what we can do with sb +	 */ +	cb.deqz $r7? .Laligned_32 +	;; +	/* Remove unaligned part from length */ +	sbfd $r2 = $r6, $r2 +	/* If we are already aligned on 32 bytes, jump to main "so" loop */ +	cb.deqz $r9? .Laligned_32 +	/* Check if we need to copy 1 byte */ +	andw $r4 = $r5, (1 << 0) +	;; +	/* If we are not aligned, store byte */ +	sb.dnez $r4? [$r0] = $r32 +	/* Check if we need to copy 2 bytes */ +	andw $r4 = $r5, (1 << 1) +	/* Add potentially copied part for next store offset */ +	addd $r0 = $r0, $r4 +	;; +	sh.dnez $r4? [$r0] = $r32 +	/* Check if we need to copy 4 bytes */ +	andw $r4 = $r5, (1 << 2) +	addd $r0 = $r0, $r4 +	;; +	sw.dnez $r4? [$r0] = $r32 +	/* Check if we need to copy 8 bytes */ +	andw $r4 = $r5, (1 << 3) +	addd $r0 = $r0, $r4 +	/* Copy second part of pattern for sq */ +	copyd $r33 = $r32 +	;; +	sd.dnez $r4? 
[$r0] = $r32 +	/* Check if we need to copy 16 bytes */ +	andw $r4 = $r5, (1 << 4) +	addd $r0 = $r0, $r4 +	;; +	sq.dnez $r4? [$r0] = $r32r33 +	addd $r0 = $r0, $r4 +	;; +.Laligned_32: +	/* Copy second part of pattern for sq */ +	copyd $r33 = $r32 +	/* Prepare amount of data for 32 bytes store */ +	srld $r10 = $r2, 5 +	nop +	nop +	;; +	copyq $r34r35 = $r32, $r33 +	/* Remaining bytes for 16 bytes store */ +	andw $r8 = $r2, (1 << 4) +	make $r11 = 32 +	/* Check if there are enough data for 32 bytes store */ +	cb.deqz $r10? .Laligned_32_done +	;; +	loopdo $r10, .Laligned_32_done +		;; +		so 0[$r0] = $r32r33r34r35 +		addd $r0 = $r0, $r11 +		;; +	.Laligned_32_done: +	/* +	 * Now that we have handled every aligned bytes using 'so', we can +	 * handled the remainder of length using store by decrementing size +	 * We also exploit the fact we are aligned to simply check remaining +	 * size */ +	sq.dnez $r8? [$r0] = $r32r33 +	addd $r0 = $r0, $r8 +	/* Remaining bytes for 8 bytes store */ +	andw $r8 = $r2, (1 << 3) +	cb.deqz $r2? .Lmemset_done +	;; +	sd.dnez $r8? [$r0] = $r32 +	addd $r0 = $r0, $r8 +	/* Remaining bytes for 4 bytes store */ +	andw $r8 = $r2, (1 << 2) +	;; +	sw.dnez $r8? [$r0] = $r32 +	addd $r0 = $r0, $r8 +	/* Remaining bytes for 2 bytes store */ +	andw $r8 = $r2, (1 << 1) +	;; +	sh.dnez $r8? [$r0] = $r32 +	addd $r0 = $r0, $r8 +	;; +	sb.odd $r2? [$r0] = $r32 +	/* Restore original value */ +	copyd $r0 = $r3 +	ret +	;; +.Lmemset_done: +	/* Restore original value */ +	copyd $r0 = $r3 +	ret +	;; +END(memset) + +libc_hidden_def(memset) diff --git a/libc/string/strcasestr.c b/libc/string/strcasestr.c index 3334086bf..8f57cc0a3 100644 --- a/libc/string/strcasestr.c +++ b/libc/string/strcasestr.c @@ -16,7 +16,7 @@ char *strcasestr(const char *s1, const char *s2)  #if 1  	do {  		if (!*p) { -			return (char *) s1;; +			return (char *) s1;  		}  		if ((*p == *s)  			|| (tolower(*((unsigned char *)p)) == tolower(*((unsigned char *)s))) diff --git a/libc/string/strstr.c b/libc/string/strstr.c index 7e2a64e7d..bf56b9c12 100644 --- a/libc/string/strstr.c +++ b/libc/string/strstr.c @@ -22,7 +22,7 @@ Wchar *Wstrstr(const Wchar *s1, const Wchar *s2)  	do {  		if (!*p) { -			return (Wchar *) s1;; +			return (Wchar *) s1;  		}  		if (*p == *s) {  			++p; diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S index 55e09e5f1..209e19062 100644 --- a/libc/string/x86_64/strcat.S +++ b/libc/string/x86_64/strcat.S @@ -106,7 +106,7 @@ ENTRY (BP_SYM (strcat))  	/* Align, it is a jump target.  */  	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */ -	.p2align 3,,8 +	.p2align 3,,7  3:  	subq $8,%rax		/* correct pointer increment.  */ diff --git a/libc/string/x86_64/strcspn.S b/libc/string/x86_64/strcspn.S index 7a06c8867..5ef565db7 100644 --- a/libc/string/x86_64/strcspn.S +++ b/libc/string/x86_64/strcspn.S @@ -94,7 +94,7 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */  	/* but it will also align entire function to 16 bytes, */  	/* potentially creating largish padding at link time. */  	/* We are aligning to 8 bytes instead: */ -	.p2align 3,,8 +	.p2align 3,,7  L(3):	addq $4, %rax		/* adjust pointer for full loop round */ diff --git a/libc/string/x86_64/strlen.S b/libc/string/x86_64/strlen.S index 9e84326c2..2fe2f58b2 100644 --- a/libc/string/x86_64/strlen.S +++ b/libc/string/x86_64/strlen.S @@ -102,7 +102,7 @@ ENTRY (strlen)  	/* Align, it is a jump target.  
*/  	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */ -	.p2align 3,,8 +	.p2align 3,,7  3:  	subq $8,%rax		/* correct pointer increment.  */ diff --git a/libc/string/x86_64/strspn.S b/libc/string/x86_64/strspn.S index 366377649..8dc42656b 100644 --- a/libc/string/x86_64/strspn.S +++ b/libc/string/x86_64/strspn.S @@ -89,7 +89,7 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */  	/* but it will also align entire function to 16 bytes, */  	/* potentially creating largish padding at link time. */  	/* We are aligning to 8 bytes instead: */ -	.p2align 3,,8 +	.p2align 3,,7  L(3):  	addq $4, %rax		/* adjust pointer for full loop round */ | 
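
A note on the new libc/string/explicit_bzero.c added above: the empty inline asm with a "memory" clobber is what stops the compiler from deleting the memset() as a dead store. A minimal stand-alone sketch of the same idea (the wipe() helper and the password buffer are illustrative only, not part of the patch):

	#include <stdio.h>
	#include <string.h>

	/* Same trick as the implementation in the patch: do the memset, then
	   feed the pointer to an empty asm with a "memory" clobber so the
	   stores cannot be optimized away even though the buffer is never
	   read again. */
	static void wipe(void *d, size_t n)
	{
		d = memset(d, 0, n);
		__asm__ __volatile__("" : : "r"(d) : "memory");
	}

	int main(void)
	{
		char password[64] = "hunter2";

		/* A plain memset(password, 0, sizeof password) here may be
		   elided by the optimizer, since 'password' is dead afterwards. */
		wipe(password, sizeof password);

		printf("first byte after wipe: %d\n", password[0]);
		return 0;
	}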
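
The simplified generic/strlen.c above keeps only the ((longword - lomagic) & himagic) test for spotting a possible zero byte in a word. The subtraction borrows through a zero byte and leaves its top bit set, which the mask then catches; high (>= 0x80) bytes can also trigger it, which is why the code still re-checks the individual bytes afterwards. A small illustration for a 32-bit word (stand-alone, not the library code):

	#include <stdint.h>
	#include <stdio.h>

	/* Word-at-a-time test used by generic/strlen.c and strnlen.c:
	   non-zero whenever the word may contain a zero byte.  It can give
	   false positives for high bytes, so callers re-check byte by byte. */
	static int may_have_zero_byte(uint32_t w)
	{
		const uint32_t lomagic = 0x01010101UL;
		const uint32_t himagic = 0x80808080UL;
		return ((w - lomagic) & himagic) != 0;
	}

	int main(void)
	{
		printf("%d\n", may_have_zero_byte(0x41424344u)); /* "ABCD", no zero byte: 0 */
		printf("%d\n", may_have_zero_byte(0x41420044u)); /* contains a zero byte: 1  */
		printf("%d\n", may_have_zero_byte(0x41FF4344u)); /* false positive on 0xFF: 1 */
		return 0;
	}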
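
The strnlen change above is also worth a word: the old code computed end_ptr = str + maxlen first and only then tested end_ptr < str, so the overflowing pointer arithmetic had already happened; the patch performs the wrap check on integer values before forming the pointer. A rough sketch of that guard (the clamp_end() helper is illustrative, not the library code):

	#include <stdint.h>
	#include <stddef.h>

	/* Clamp (str, maxlen) the way the patched strnlen does: if str + maxlen
	   would wrap around the address space, scan to the top of memory
	   instead of computing an out-of-range pointer. */
	static const char *clamp_end(const char *str, size_t maxlen)
	{
		if ((uintptr_t)str + maxlen < (uintptr_t)str)
			return (const char *)~0UL;
		return str + maxlen;
	}

	int main(void)
	{
		char buf[8] = "abc";
		const char *end = clamp_end(buf, sizeof buf);
		return end == buf + sizeof buf ? 0 : 1;
	}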
