22 files changed, 682 insertions, 106 deletions
diff --git a/libc/string/arc/memcmp.S b/libc/string/arc/memcmp.S
index a60757e7a..20122a296 100644
--- a/libc/string/arc/memcmp.S
+++ b/libc/string/arc/memcmp.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -17,6 +17,8 @@
 #endif
 
 ENTRY(memcmp)
+
+#if defined(__ARC700__) || defined(__ARCHS__)
 	or	r12,r0,r1
 	asl_s	r12,r12,30
 	sub	r3,r2,1
@@ -149,6 +151,96 @@ ENTRY(memcmp)
 .Lnil:
 	j_s.d	[blink]
 	mov	r0,0
+
+#elif defined(__ARC64_ARCH32__)
+	;; Based on Synopsys code from newlib's arc64/memcmp.S
+	cmp		r2, 32
+	bls.d	@.L_compare_1_bytes	; 32 bytes or fewer: compare bytewise
+	mov		r3, r0	; "r0" will be used as return value
+
+	lsr		r12, r2, 4	; counter for 16-byte chunks
+	xor		r13, r13, r13	; the mask showing inequal registers
+
+.L_compare_16_bytes:
+	ld.ab	r4, [r3, +4]
+	ld.ab	r5, [r1, +4]
+	ld.ab	r6, [r3, +4]
+	ld.ab	r7, [r1, +4]
+	ld.ab	r8, [r3, +4]
+	ld.ab	r9, [r1, +4]
+	ld.ab	r10, [r3, +4]
+	ld.ab	r11, [r1, +4]
+	xor.f	0, r4, r5
+	xor.ne	r13, r13, 0b0001	; bit 0: first word pair differs
+	xor.f	0, r6, r7
+	xor.ne	r13, r13, 0b0010	; bit 1: second word pair differs
+	xor.f	0, r8, r9
+	xor.ne	r13, r13, 0b0100	; bit 2: third word pair differs
+	xor.f	0, r10, r11
+	xor.ne	r13, r13, 0b1000	; bit 3: fourth word pair differs
+	brne	r13, 0, @.L_unequal_find
+	dbnz	r12, @.L_compare_16_bytes
+
+	;; Every load above post-increments (.ab), so "r3" and "r1" already
+	;; point just past the last compared chunk: the pointers must not be
+	;; moved back, or the final bytes of the buffers would be skipped.
+	bmsk_s	  r2, r2, 3	; any remaining bytes to compare
+
+.L_compare_1_bytes:
+	cmp		r2, 0
+	jeq.d	[blink]		; nothing left: return 0
+	xor_s	r0, r0, r0
+
+2:
+	ldb.ab	r4, [r3, +1]
+	ldb.ab	r5, [r1, +1]
+	sub.f	r0, r4, r5	; result is the difference of the two bytes
+	jne		[blink]
+	dbnz	r2, @2b
+	j_s		[blink]
+
+	;; At this point, we want to find the _first_ comparison that marked the
+	;; inequality of "lhs" and "rhs"
+.L_unequal_find:
+	ffs		r13, r13	; index of the lowest set mask bit
+	asl		r13, r13, 2	; each case below is four instructions
+	bi		[r13]
+.L_unequal_r4r5:
+	mov		r1, r4
+	b.d		@.L_diff_byte_in_regs
+	mov		r2, r5
+	nop
+.L_unequal_r6r7:
+	mov		r1, r6
+	b.d		@.L_diff_byte_in_regs
+	mov		r2, r7
+	nop
+.L_unequal_r8r9:
+	mov		r1, r8
+	b.d		@.L_diff_byte_in_regs
+	mov		r2, r9
+	nop
+.L_unequal_r10r11:
+	mov		r1, r10
+	mov		r2, r11
+
+	;; fall-through
+	;; If we're here, that means the two operands are not equal.
+.L_diff_byte_in_regs:
+	xor		r0, r1, r2
+	ffs		r0, r0		; lowest differing bit
+	and		r0, r0, 0x18	; round down to the start of its byte
+	lsr		r1, r1, r0
+	lsr		r2, r2, r0
+	bmsk_s	r1, r1, 7
+	bmsk_s	r2, r2, 7
+	j_s.d	[blink]
+	sub		r0, r1, r2
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
+
 END(memcmp)
 libc_hidden_def(memcmp)
 
diff --git a/libc/string/arc/memcpy.S b/libc/string/arc/memcpy.S
index 69d7220b8..153083765 100644
--- a/libc/string/arc/memcpy.S
+++ b/libc/string/arc/memcpy.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,13 +7,9 @@
 
 #include <sysdep.h>
 
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
-
 ENTRY(memcpy)
 
-#ifdef __ARC700__
+#if defined(__ARC700__)
 /* This memcpy implementation does not support objects of 1GB or larger -
    the check for alignment does not work then.  */
 /* We assume that most sources and destinations are aligned, and
@@ -73,9 +69,9 @@ ENTRY(memcpy)
 .Lendbloop:
 	j_s.d	[blink]
 	stb	r12,[r5,0]
-#endif /* __ARC700__ */
 
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
+
 #ifdef __LITTLE_ENDIAN__
 # define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
 # define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
@@ -299,7 +295,58 @@ ENTRY(memcpy)
 	stb.ab	r6, [r3,1]
 .Lcopybytewise_3:
 	j	[blink]
-#endif /* __ARCHS__ */
+
+#elif defined(__ARC64_ARCH32__)
+	;; Based on Synopsys code from newlib's arc64/memcpy.S
+	lsr.f	r11, r2, 4		; counter for 16-byte chunks
+	beq.d	@.L_write_15_bytes	; fewer than 16 bytes: only the tail is needed
+	mov	r3, r0			; work on a copy of "r0"
+
+.L_write_16_bytes:
+#if defined(__ARC64_LL64__)
+	ldd.ab	r4, [r1, 8]		; two 8-byte loads, source pointer post-incremented
+	ldd.ab	r6, [r1, 8]
+	std.ab	r4, [r3, 8]		; two 8-byte stores, destination pointer post-incremented
+	std.ab	r6, [r3, 8]
+	dbnz	r11, @.L_write_16_bytes
+#else
+	ld.ab	r4, [r1, 4]		; four 4-byte loads, source pointer post-incremented
+	ld.ab	r5, [r1, 4]
+	ld.ab	r6, [r1, 4]
+	ld.ab	r7, [r1, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r5, [r3, 4]
+	st.ab	r6, [r3, 4]
+	dbnz.d	r11, @.L_write_16_bytes
+	st.ab	r7, [r3, 4]		; fourth store sits in the delay slot of dbnz.d
+#endif
+	bmsk_s	r2, r2, 3		; keep the low 4 bits: bytes left after the chunks
+
+.L_write_15_bytes:
+	bbit0.d	r2, 1, @1f		; bit 1 clear: no halfword to copy
+	lsr	r11, r2, 2		; r11 = number of whole words left (0..3)
+	ldh.ab	r4, [r1, 2]
+	sth.ab	r4, [r3, 2]
+1:
+	bbit0.d	r2, 0, @1f		; bit 0 clear: no single byte to copy
+	xor	r11, r11, 3		; r11 = 3 - words = word copies to skip below
+	ldb.ab	r4, [r1, 1]
+	stb.ab	r4, [r3, 1]
+1:
+	asl	r11, r11, 1		; each word copy below is two instructions
+	bi	[r11]			; presumably skips r11 instructions -- confirm "bi" scaling
+	ld.ab	r4,[r1, 4]
+	st.ab	r4,[r3, 4]
+	ld.ab	r4,[r1, 4]
+	st.ab	r4,[r3, 4]
+	ld	r4,[r1]
+	st	r4,[r3]
+
+	j_s	[blink]			; "r0" was never modified, so it still holds dest
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
 
 END(memcpy)
 libc_hidden_def(memcpy)
diff --git a/libc/string/arc/memset.S b/libc/string/arc/memset.S
index 0b74ddc7f..5aa5d6c65 100644
--- a/libc/string/arc/memset.S
+++ b/libc/string/arc/memset.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,13 +7,9 @@
 
 #include <sysdep.h>
 
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
-
 ENTRY(memset)
 
-#ifdef __ARC700__
+#if defined(__ARC700__)
 #define SMALL	7 /* Must be at least 6 to deal with alignment/loop issues.  */
 
 	mov_s	r4,r0
@@ -52,9 +48,8 @@ ENTRY(memset)
 	stb.ab	r1,[r4,1]
 .Ltiny_end:
 	j_s	[blink]
-#endif /* __ARC700__ */
 
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
 #ifdef DONT_USE_PREALLOC
 #define PREWRITE(A,B)	prefetchw [(A),(B)]
 #else
@@ -156,7 +151,55 @@ ENTRY(memset)
 .Lcopy3bytes:
 
 	j	[blink]
-#endif /* __ARCHS__ */
+
+#elif defined(__ARC64_ARCH32__)
+	;; Based on Synopsys code from newlib's arc64/memset.S
+
+	;; Assemble the bytes to 32bit words
+	bmsk_s	r1, r1, 7		; treat it like unsigned char
+	lsl8	r3, r1
+	or_s	r1, r1, r3		; fill byte now in the low two bytes
+	lsl16	r3, r1
+	or	r6, r1, r3		; fill byte now in all four bytes of r6
+	mov r7,r6			; presumably the second half of the pair stored by "std" -- confirm
+
+	lsr.f	r5, r2, 4		; counter for 16-byte chunks
+	beq.d	@.L_write_15_bytes	; fewer than 16 bytes: only the tail is needed
+	mov	r4, r0			; work on a copy of "r0"
+
+.L_write_16_bytes:
+#if defined(__ARC64_LL64__)
+	std.ab	r6, [r4, 8]		; two 8-byte stores per iteration
+	std.ab	r6, [r4, 8]
+	dbnz	r5, @.L_write_16_bytes
+#else
+	st.ab	r6, [r4, 4]		; four 4-byte stores per iteration
+	st.ab	r6, [r4, 4]
+	st.ab	r6, [r4, 4]
+	dbnz.d	r5, @.L_write_16_bytes
+	st.ab	r6, [r4, 4]		; fourth store sits in the delay slot of dbnz.d
+#endif
+	bmsk_s	r2, r2, 3		; keep the low 4 bits: bytes left after the chunks
+
+.L_write_15_bytes:
+	bbit0.d	r2, 1, @1f		; bit 1 clear: no halfword to store
+	lsr	r3, r2, 2		; r3 = number of whole words left (0..3)
+	sth.ab	r6, [r4, 2]
+1:
+	bbit0.d	r2, 0, @1f		; bit 0 clear: no single byte to store
+	xor	r3, r3, 3		; r3 = 3 - words = word stores to skip below
+	stb.ab	r6, [r4, 1]
+1:
+	bi	[r3]			; presumably skips r3 of the stores below -- confirm "bi" scaling
+	st.ab	r6,[r4, 4]
+	st.ab	r6,[r4, 4]
+	st.ab	r6,[r4, 4]
+
+	j_s	[blink]			; "r0" was never modified, so it still holds the buffer
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
 
 END(memset)
 libc_hidden_def(memset)
diff --git a/libc/string/arc/strchr.S b/libc/string/arc/strchr.S
index 443993589..df25eb3be 100644
--- a/libc/string/arc/strchr.S
+++ b/libc/string/arc/strchr.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,6 +7,7 @@
 
 #include <sysdep.h>
 #include <features.h>
+#include <asm.h>
 
 /* ARC700 has a relatively long pipeline and branch prediction, so we want
    to avoid branches that are hard to predict.  On the other hand, the
@@ -21,7 +22,7 @@ ENTRY(strchr)
 	mov_s	r3,0x01010101
 	breq.d	r2,r0,.Laligned
 	asl	r4,r5,16
-	sub_s	r0,r0,r2
+	SUBR_S	r0,r0,r2
 	asl	r7,r2,3
 	ld_s	r2,[r0]
 #ifdef __LITTLE_ENDIAN__
@@ -77,10 +78,10 @@ ENTRY(strchr)
 	sub	r3,r7,1
 	bic	r3,r3,r7
 	norm	r2,r3
-	sub_s	r0,r0,1
-	asr_s	r2,r2,3
+	SUBR_S	r0,r0,1
+	ASRR_S	r2,r2,3
 	j.d	[blink]
-	sub_s	r0,r0,r2
+	SUBR_S	r0,r0,r2
 
 	.balign	4
 .Lfound0_ua:
@@ -90,13 +91,13 @@ ENTRY(strchr)
 	bic	r3,r3,r6
 	and	r2,r3,r4
 	or_s	r12,r12,r2
-	sub_s	r3,r12,1
+	SUBR_S	r3,r12,1
 	bic_s	r3,r3,r12
 	norm	r3,r3
-	add_s	r0,r0,3
-	asr_s	r12,r3,3
+	ADDR_S	r0,r0,3
+	ASRR_S	r12,r3,3
 	asl.f	0,r2,r3
-	sub_s	r0,r0,r12
+	SUBR_S	r0,r0,r12
 	j_s.d	[blink]
 	mov.pl	r0,0
 #else /* BIG ENDIAN */
@@ -106,10 +107,10 @@ ENTRY(strchr)
 	bic	r2,r7,r6
 .Lfound_char_b:
 	norm	r2,r2
-	sub_s	r0,r0,4
+	SUBR_S	r0,r0,4
 	asr_s	r2,r2,3
 	j.d	[blink]
-	add_s	r0,r0,r2
+	ADDR_S	r0,r0,r2
 
 .Lfound0_ua:
 	mov_s	r3,r7
@@ -126,7 +127,7 @@ ENTRY(strchr)
 	add.pl	r3,r3,1
 	asr_s	r12,r3,3
 	asl.f	0,r2,r3
-	add_s	r0,r0,r12
+	ADDR_S	r0,r0,r12
 	j_s.d	[blink]
 	mov.mi	r0,0
 #endif /* ENDIAN */
diff --git a/libc/string/arc/strcmp.S b/libc/string/arc/strcmp.S
index ad38d9e00..48d2d7ec1 100644
--- a/libc/string/arc/strcmp.S
+++ b/libc/string/arc/strcmp.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,14 +7,11 @@
 
 #include <features.h>
 #include <sysdep.h>
-
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
+#include <asm.h>
 
 ENTRY(strcmp)
 
-#ifdef __ARC700__
+#if defined(__ARC700__) || defined(__ARC64_ARCH32__)
 /* This is optimized primarily for the ARC700.
    It would be possible to speed up the loops by one cycle / word
    respective one cycle / byte by forcing double source 1 alignment, unrolling
@@ -38,7 +35,7 @@ ENTRY(strcmp)
 	breq	r2,r3,.Lwordloop
 #ifdef	__LITTLE_ENDIAN__
 	xor	r0,r2,r3	; mask for difference
-	sub_s	r1,r0,1
+	SUBR_S	r1,r0,1
 	bic_s	r0,r0,r1	; mask for least significant difference bit
 	sub	r1,r5,r0
 	xor	r0,r5,r1	; mask for least significant difference byte
@@ -55,7 +52,7 @@ ENTRY(strcmp)
 .Lfound0:
 	xor	r0,r2,r3	; mask for difference
 	or	r0,r0,r4	; or in zero indicator
-	sub_s	r1,r0,1
+	SUBR_S	r1,r0,1
 	bic_s	r0,r0,r1	; mask for least significant difference bit
 	sub	r1,r5,r0
 	xor	r0,r5,r1	; mask for least significant difference byte
@@ -99,31 +96,28 @@ ENTRY(strcmp)
 .Lcmpend:
 	j_s.d	[blink]
 	sub	r0,r2,r3
-#endif /* __ARC700__ */
 
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
 	or	r2, r0, r1
 	bmsk_s	r2, r2, 1
 	brne	r2, 0, @.Lcharloop
 
 ;;; s1 and s2 are word aligned
-	ld.ab	r2, [r0, 4]
 
 	mov_s	r12, 0x01010101
 	ror	r11, r12
 	.align  4
 .LwordLoop:
+	ld.ab	r2, [r0, 4]
+	sub	r4, r2, r12
 	ld.ab	r3, [r1, 4]
 	;; Detect NULL char in str1
-	sub	r4, r2, r12
-	ld.ab	r5, [r0, 4]
 	bic	r4, r4, r2
 	and	r4, r4, r11
 	brne.d.nt	r4, 0, .LfoundNULL
 	;; Check if the read locations are the same
 	cmp	r2, r3
-	beq.d	.LwordLoop
-	mov.eq	r2, r5
+	beq	.LwordLoop
 
 	;; A match is found, spot it out
 #ifdef __LITTLE_ENDIAN__
@@ -168,7 +162,10 @@ ENTRY(strcmp)
 .Lcmpend:
 	j_s.d	[blink]
 	sub	r0, r2, r3
-#endif /* __ARCHS__ */
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
 
 END(strcmp)
 libc_hidden_def(strcmp)
diff --git a/libc/string/arc/strlen.S b/libc/string/arc/strlen.S
index 0b9b93815..0d1d3aa4e 100644
--- a/libc/string/arc/strlen.S
+++ b/libc/string/arc/strlen.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,6 +7,7 @@
 
 
 #include <sysdep.h>
+#include <asm.h>
 
 ENTRY(strlen)
 	or	r3,r0,7
@@ -15,7 +16,7 @@ ENTRY(strlen)
 	mov	r4,0x01010101
 	; uses long immediate
 #ifdef __LITTLE_ENDIAN__
-	asl_s	r1,r0,3
+	ASLR_S	r1,r0,3
 	btst_s	r0,2
 	asl	r7,r4,r1
 	ror	r5,r4
@@ -59,7 +60,7 @@ ENTRY(strlen)
 	sub.ne	r3,r3,4
 	mov.eq	r1,r12
 #ifdef __LITTLE_ENDIAN__
-	sub_s	r2,r1,1
+	SUBR_S	r2,r1,1
 	bic_s	r2,r2,r1
 	norm	r1,r2
 	sub_s	r0,r0,3
diff --git a/libc/string/arm/memset.S b/libc/string/arm/memset.S
index 412270f50..29c583f16 100644
--- a/libc/string/arm/memset.S
+++ b/libc/string/arm/memset.S
@@ -32,6 +32,7 @@ memset:
 	cmp	r2, #8		@ at least 8 bytes to do?
 	bcc	2f
 
+	and	r1, r1, #0xFF
 	lsl	r3, r1, #8
 	orr	r1, r3
 	lsl	r3, r1, #16
@@ -68,6 +69,7 @@ memset:
 	mov	a4, a1
 	cmp	a3, $8		@ at least 8 bytes to do?
 	blo	2f
+	and	a2, a2, #0xFF
 	orr	a2, a2, a2, lsl $8
 	orr	a2, a2, a2, lsl $16
 1:
diff --git a/libc/string/explicit_bzero.c b/libc/string/explicit_bzero.c
new file mode 100644
index 000000000..b09e4c1f4
--- /dev/null
+++ b/libc/string/explicit_bzero.c
@@ -0,0 +1,30 @@
+/*
+Copyright © 2005-2020 Rich Felker, et al.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+#define _BSD_SOURCE
+#include <string.h>
+
+void explicit_bzero(void *d, size_t n)
+{
+	d = memset(d, 0, n);
+	__asm__ __volatile__ ("" : : "r"(d) : "memory");
+}
diff --git a/libc/string/generic/memmove.c b/libc/string/generic/memmove.c
index 1ac018013..5389cc029 100644
--- a/libc/string/generic/memmove.c
+++ b/libc/string/generic/memmove.c
@@ -23,8 +23,9 @@
 #include "memcopy.h"
 #include "pagecopy.h"
 
-#ifndef __ARCH_HAS_BWD_MEMCPY__
+#if defined(__ARCH_HAS_BWD_MEMCPY__) || defined(__mips__)
 /* generic-opt memmove assumes memcpy does forward copying! */
+/* also needed for MIPS as its memcpy does not support overlapping regions */
 #include "_memcpy_fwd.c"
 #endif
 
@@ -224,8 +225,11 @@ void *memmove (void *dest, const void *src, size_t len)
      Reduces the working set.  */
   if (dstp - srcp >= len)	/* *Unsigned* compare!  */
     {
-#ifdef __ARCH_HAS_BWD_MEMCPY__
-      /* Backward memcpy implementation can be used */
+      /*  Calling memcpy() from memmove() must be skipped in two cases:
+       *  a) if the arch's memcpy copies backward (SH4)
+       *  b) if the arch's memcpy is not fully safe for overlapping regions (MIPS)
+       */
+#if !defined(__ARCH_HAS_BWD_MEMCPY__) && !defined(__mips__)
       memcpy(dest, src, len);
 #else
       /* Copy from the beginning to the end.  */
diff --git a/libc/string/generic/strchr.c b/libc/string/generic/strchr.c
index 321d2b8c3..b34884d67 100644
--- a/libc/string/generic/strchr.c
+++ b/libc/string/generic/strchr.c
@@ -60,22 +60,19 @@ char *strchr (const char *s, int c_in)
 
      The 1-bits make sure that carries propagate to the next 0-bit.
      The 0-bits provide holes for carries to fall into.  */
-  switch (sizeof (longword))
-    {
-    case 4: magic_bits = 0x7efefeffL; break;
-    case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break;
-    default:
-      abort ();
-    }
-
   /* Set up a longword, each of whose bytes is C.  */
+#if __WORDSIZE == 32
+  magic_bits = 0x7efefeffL;
   charmask = c | (c << 8);
   charmask |= charmask << 16;
-  if (sizeof (longword) > 4)
-    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-    charmask |= (charmask << 16) << 16;
-  if (sizeof (longword) > 8)
-    abort ();
+#elif __WORDSIZE == 64
+  magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL;
+  charmask = c | (c << 8);
+  charmask |= charmask << 16;
+  charmask |= (charmask << 16) << 16;
+#else
+  #error unexpected integer size strchr()
+#endif
 
   /* Instead of the traditional loop which tests each character,
      we will test a longword at a time.  The tricky part is testing
diff --git a/libc/string/generic/strchrnul.c b/libc/string/generic/strchrnul.c
index d11d9e00d..d9fadc776 100644
--- a/libc/string/generic/strchrnul.c
+++ b/libc/string/generic/strchrnul.c
@@ -59,22 +59,19 @@ char *strchrnul (const char *s, int c_in)
 
      The 1-bits make sure that carries propagate to the next 0-bit.
      The 0-bits provide holes for carries to fall into.  */
-  switch (sizeof (longword))
-    {
-    case 4: magic_bits = 0x7efefeffL; break;
-    case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break;
-    default:
-      abort ();
-    }
 
-  /* Set up a longword, each of whose bytes is C.  */
+#if __WORDSIZE == 32
+  magic_bits = 0x7efefeffL;
   charmask = c | (c << 8);
   charmask |= charmask << 16;
-  if (sizeof (longword) > 4)
-    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-    charmask |= (charmask << 16) << 16;
-  if (sizeof (longword) > 8)
-    abort ();
+#elif __WORDSIZE == 64
+  magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL;
+  charmask = c | (c << 8);
+  charmask |= charmask << 16;
+  charmask |= (charmask << 16) << 16;
+#else
+  #error unexpected integer size strchrnul()
+#endif
 
   /* Instead of the traditional loop which tests each character,
      we will test a longword at a time.  The tricky part is testing
diff --git a/libc/string/generic/strlen.c b/libc/string/generic/strlen.c
index dc383398b..dcc032ddc 100644
--- a/libc/string/generic/strlen.c
+++ b/libc/string/generic/strlen.c
@@ -28,7 +28,7 @@ size_t strlen (const char *str)
 {
   const char *char_ptr;
   const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, himagic, lomagic;
+  unsigned long int longword, himagic, lomagic;
 
   /* Handle the first few characters by reading one character at a time.
      Do this until CHAR_PTR is aligned on a longword boundary.  */
@@ -52,14 +52,12 @@ size_t strlen (const char *str)
 
      The 1-bits make sure that carries propagate to the next 0-bit.
      The 0-bits provide holes for carries to fall into.  */
-  magic_bits = 0x7efefeffL;
   himagic = 0x80808080L;
   lomagic = 0x01010101L;
   if (sizeof (longword) > 4)
     {
       /* 64-bit version of the magic.  */
       /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-      magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL;
       himagic = ((himagic << 16) << 16) | himagic;
       lomagic = ((lomagic << 16) << 16) | lomagic;
     }
@@ -102,22 +100,7 @@ size_t strlen (const char *str)
 
       longword = *longword_ptr++;
 
-      if (
-#if 0
-	  /* Add MAGIC_BITS to LONGWORD.  */
-	  (((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits)
-#else
-	  ((longword - lomagic) & himagic)
-#endif
-	  != 0)
+      if (((longword - lomagic) & himagic) != 0)
 	{
 	  /* Which of the bytes was the zero?  If none of them were, it was
 	     a misfire; continue the search.  */
diff --git a/libc/string/generic/strnlen.c b/libc/string/generic/strnlen.c
index 4d4cde84f..82d4122ec 100644
--- a/libc/string/generic/strnlen.c
+++ b/libc/string/generic/strnlen.c
@@ -29,15 +29,17 @@
    '\0' terminator is found in that many characters, return MAXLEN.  */
 size_t strnlen (const char *str, size_t maxlen)
 {
-  const char *char_ptr, *end_ptr = str + maxlen;
+  const char *char_ptr, *end_ptr;
   const unsigned long int *longword_ptr;
   unsigned long int longword, himagic, lomagic;
 
   if (maxlen == 0)
     return 0;
 
-  if (__builtin_expect (end_ptr < str, 0))
+  if (__builtin_expect ((uintptr_t)str + maxlen < (uintptr_t)str, 0))
     end_ptr = (const char *) ~0UL;
+  else
+    end_ptr = str + maxlen;
 
   /* Handle the first few characters by reading one character at a time.
      Do this until CHAR_PTR is aligned on a longword boundary.  */
diff --git a/libc/string/kvx/Makefile b/libc/string/kvx/Makefile
new file mode 100644
index 000000000..0a95346fd
--- /dev/null
+++ b/libc/string/kvx/Makefile
@@ -0,0 +1,13 @@
+# Makefile for uClibc
+#
+# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org>
+#
+# Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+#
+
+top_srcdir:=../../../
+top_builddir:=../../../
+all: objs
+include $(top_builddir)Rules.mak
+include ../Makefile.in
+include $(top_srcdir)Makerules
diff --git a/libc/string/kvx/memcpy.S b/libc/string/kvx/memcpy.S
new file mode 100644
index 000000000..70e8db910
--- /dev/null
+++ b/libc/string/kvx/memcpy.S
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2020 Kalray Inc.
+ *
+ * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
+ * in this tarball.
+ */
+
+#include <sysdep.h>
+
+.align 16
+ENTRY(memcpy)
+	cb.deqz $r2? .Lreturn
+	compd.geu $r3 = $r2, 256
+	copyd $r6 = $r0
+	;;
+	cb.deqz $r3? .Lremaining_256
+	;;
+	lq.u $r32r33 = 0[$r1]
+	addd $r2 = $r2, -256
+	;;
+	lq.u $r34r35 = 16[$r1]
+	;;
+	lq.u $r36r37 = 32[$r1]
+	srld $r7 = $r2, 8
+	;;
+	lq.u $r38r39 = 48[$r1]
+	;;
+	lq.u $r40r41 = 64[$r1]
+	;;
+	lq.u $r42r43 = 80[$r1]
+	;;
+	lq.u $r44r45 = 96[$r1]
+	;;
+	lq.u $r46r47 = 112[$r1]
+	;;
+	lq.u $r48r49 = 128[$r1]
+	;;
+	lq.u $r50r51 = 144[$r1]
+	;;
+	lq.u $r52r53 = 160[$r1]
+	;;
+	lq.u $r54r55 = 176[$r1]
+	;;
+	lq.u $r56r57 = 192[$r1]
+	;;
+	lq.u $r58r59 = 208[$r1]
+	compd.geu $r3 = $r2, 256
+	;;
+	lq.u $r60r61 = 224[$r1]
+	;;
+	lq.u $r62r63 = 240[$r1]
+	addd $r1 = $r1, 256
+	;;
+	cb.deqz $r7? .Lstreaming_loop_end
+	;;
+	loopdo $r7, .Lstreaming_loop_end
+		;;
+		sq 0[$r0] = $r32r33
+		addd $r2 = $r2, -256
+		;;
+		lq.u $r32r33 = 0[$r1]
+		;;
+		sq 16[$r0] = $r34r35
+		;;
+		lq.u $r34r35 = 16[$r1]
+		;;
+		sq 32[$r0] = $r36r37
+		;;
+		lq.u $r36r37 = 32[$r1]
+		;;
+		sq 48[$r0] = $r38r39
+		;;
+		lq.u $r38r39 = 48[$r1]
+		;;
+		sq 64[$r0] = $r40r41
+		;;
+		lq.u $r40r41 = 64[$r1]
+		;;
+		sq 80[$r0] = $r42r43
+		;;
+		lq.u $r42r43 = 80[$r1]
+		;;
+		sq 96[$r0] = $r44r45
+		;;
+		lq.u $r44r45 = 96[$r1]
+		;;
+		sq 112[$r0] = $r46r47
+		;;
+		lq.u $r46r47 = 112[$r1]
+		;;
+		sq 128[$r0] = $r48r49
+		;;
+		lq.u $r48r49 = 128[$r1]
+		;;
+		sq 144[$r0] = $r50r51
+		;;
+		lq.u $r50r51 = 144[$r1]
+		;;
+		sq 160[$r0] = $r52r53
+		;;
+		lq.u $r52r53 = 160[$r1]
+		;;
+		sq 176[$r0] = $r54r55
+		;;
+		lq.u $r54r55 = 176[$r1]
+		;;
+		sq 192[$r0] = $r56r57
+		;;
+		lq.u $r56r57 = 192[$r1]
+		;;
+		sq 208[$r0] = $r58r59
+		;;
+		lq.u $r58r59 = 208[$r1]
+		;;
+		sq 224[$r0] = $r60r61
+		;;
+		lq.u $r60r61 = 224[$r1]
+		;;
+		sq 240[$r0] = $r62r63
+		addd $r0 = $r0, 256
+		;;
+		lq.u $r62r63 = 240[$r1]
+		addd $r1 = $r1, 256
+		;;
+	.Lstreaming_loop_end:
+	sq 0[$r0] = $r32r33
+	;;
+	sq 16[$r0] = $r34r35
+	;;
+	sq 32[$r0] = $r36r37
+	;;
+	sq 48[$r0] = $r38r39
+	;;
+	sq 64[$r0] = $r40r41
+	;;
+	sq 80[$r0] = $r42r43
+	;;
+	sq 96[$r0] = $r44r45
+	;;
+	sq 112[$r0] = $r46r47
+	;;
+	sq 128[$r0] = $r48r49
+	;;
+	sq 144[$r0] = $r50r51
+	;;
+	sq 160[$r0] = $r52r53
+	;;
+	sq 176[$r0] = $r54r55
+	;;
+	sq 192[$r0] = $r56r57
+	;;
+	sq 208[$r0] = $r58r59
+	;;
+	sq 224[$r0] = $r60r61
+	;;
+	sq 240[$r0] = $r62r63
+	addd $r0 = $r0, 256
+	;;
+.Lremaining_256:
+	andd $r11 = $r2, 16
+	srld $r7 = $r2, 5
+	;;
+	cb.deqz $r7? .Lloop_32_end
+	;;
+	loopdo $r7, .Lloop_32_end
+		;;
+		lo $r32r33r34r35 = 0[$r1]
+		addd $r1 = $r1, 32
+		addd $r2 = $r2, -32
+		;;
+		so 0[$r0] = $r32r33r34r35
+		addd $r0 = $r0, 32
+		;;
+	.Lloop_32_end:
+	andd $r10 = $r2, 8
+	andd $r9 = $r2, 4
+	cb.deqz $r11? .Lloop_remaining_16
+	lq.u.dnez $r11? $r32r33 = 0[$r1]
+	;;
+	sq 0[$r0] = $r32r33
+	addd $r1 = $r1, 16
+	addd $r0 = $r0, 16
+	;;
+.Lloop_remaining_16:
+	andd $r8 = $r2, 2
+	andd $r7 = $r2, 1
+	cb.deqz $r10? .Lloop_remaining_8
+	ld.dnez $r10? $r32 = 0[$r1]
+	;;
+	sd 0[$r0] = $r32
+	addd $r1 = $r1, 8
+	addd $r0 = $r0, 8
+	;;
+.Lloop_remaining_8:
+	cb.deqz $r9? .Lloop_remaining_4
+	lwz.dnez $r9? $r32 = 0[$r1]
+	;;
+	sw 0[$r0] = $r32
+	addd $r1 = $r1, 4
+	addd $r0 = $r0, 4
+	;;
+.Lloop_remaining_4:
+	cb.deqz $r8? .Lloop_remaining_2
+	lhz.dnez $r8? $r32 = 0[$r1]
+	;;
+	sh 0[$r0] = $r32
+	addd $r1 = $r1, 2
+	addd $r0 = $r0, 2
+	;;
+.Lloop_remaining_2:
+	lbz.dnez $r7? $r32 = 0[$r1]
+	;;
+	sb.dnez $r7? 0[$r0] = $r32
+	;;
+.Lreturn:
+	copyd $r0 = $r6
+	ret
+	;;
+END(memcpy)
+
+libc_hidden_def(memcpy)
diff --git a/libc/string/kvx/memset.S b/libc/string/kvx/memset.S
new file mode 100644
index 000000000..45023a68f
--- /dev/null
+++ b/libc/string/kvx/memset.S
@@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2019 Kalray Inc.
+ *
+ * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
+ * in this tarball.
+ */
+
+#define REPLICATE_BYTE_MASK	0x0101010101010101
+#define MIN_SIZE_FOR_ALIGN	128
+
+/*
+ * Optimized memset for kvx architecture
+ *
+ * In order to optimize memset on kvx, we can use various things:
+ * - conditional stores, which avoid the branch penalty
+ * - store half/word/double/quad/octuple to store up to 32 bytes at a time
+ * - hardware loop for steady cases.
+ *
+ * First, we start by checking if the size is below a minimum size. If so, we
+ * skip the alignment part. Indeed, the kvx supports misalignment and the
+ * penalty for letting it do unaligned accesses is lower than that of trying
+ * to realign. So for small sizes, we don't even bother to realign.
+ * In order to create the 64-bit pattern, we use sbmm to replicate the pattern
+ * byte across a whole register in one instruction.
+ * Once alignment has been reached, we can do the hardware loop using store
+ * octuple in order to optimize throughput. Care must be taken to align hardware
+ * loops on at least 8 bytes for performance.
+ * Once the main loop has been done, we finish the fill by checking the length
+ * and issuing the stores needed for the remaining bytes.
+ */
+
+#include <sysdep.h>
+
+.align 16
+ENTRY(memset)
+	/* Preserve return value */
+	copyd $r3 = $r0
+	/* Replicate the first pattern byte on all bytes */
+	sbmm8 $r32 = $r1, REPLICATE_BYTE_MASK
+	/* Check if length < MIN_SIZE_FOR_ALIGN */
+	compd.geu $r7 = $r2, MIN_SIZE_FOR_ALIGN
+	/* Invert address to compute what we need to copy to be aligned on 32 bytes */
+	negd $r5 = $r0
+	;;
+	/* Check if we are aligned on 32 bytes */
+	andw $r9 = $r0, 0x1F
+	/* Compute the length that will be copied to align on 32 bytes boundary */
+	andw $r6 = $r5, 0x1F
+	/*
+	 * If size < MIN_SIZE_FOR_ALIGN bytes, directly go to so, it will be done
+	 * unaligned but that is still better than what we can do with sb
+	 */
+	cb.deqz $r7? .Laligned_32
+	;;
+	/* Remove unaligned part from length */
+	sbfd $r2 = $r6, $r2
+	/* If we are already aligned on 32 bytes, jump to main "so" loop */
+	cb.deqz $r9? .Laligned_32
+	/* Check if we need to copy 1 byte */
+	andw $r4 = $r5, (1 << 0)
+	;;
+	/* If we are not aligned, store byte */
+	sb.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 2 bytes */
+	andw $r4 = $r5, (1 << 1)
+	/* Add potentially copied part for next store offset */
+	addd $r0 = $r0, $r4
+	;;
+	sh.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 4 bytes */
+	andw $r4 = $r5, (1 << 2)
+	addd $r0 = $r0, $r4
+	;;
+	sw.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 8 bytes */
+	andw $r4 = $r5, (1 << 3)
+	addd $r0 = $r0, $r4
+	/* Copy second part of pattern for sq */
+	copyd $r33 = $r32
+	;;
+	sd.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 16 bytes */
+	andw $r4 = $r5, (1 << 4)
+	addd $r0 = $r0, $r4
+	;;
+	sq.dnez $r4? [$r0] = $r32r33
+	addd $r0 = $r0, $r4
+	;;
+.Laligned_32:
+	/* Copy second part of pattern for sq */
+	copyd $r33 = $r32
+	/* Prepare amount of data for 32 bytes store */
+	srld $r10 = $r2, 5
+	nop
+	nop
+	;;
+	copyq $r34r35 = $r32, $r33
+	/* Remaining bytes for 16 bytes store */
+	andw $r8 = $r2, (1 << 4)
+	make $r11 = 32
+	/* Check if there are enough data for 32 bytes store */
+	cb.deqz $r10? .Laligned_32_done
+	;;
+	loopdo $r10, .Laligned_32_done
+		;;
+		so 0[$r0] = $r32r33r34r35
+		addd $r0 = $r0, $r11
+		;;
+	.Laligned_32_done:
+	/*
+	 * Now that we have handled all full 32-byte chunks using 'so', we can
+	 * handle the remainder of the length using smaller stores.
+	 * We also exploit the fact we are aligned to simply check the bits of
+	 * the remaining size */
+	sq.dnez $r8? [$r0] = $r32r33
+	addd $r0 = $r0, $r8
+	/* Remaining bytes for 8 bytes store */
+	andw $r8 = $r2, (1 << 3)
+	cb.deqz $r2? .Lmemset_done
+	;;
+	sd.dnez $r8? [$r0] = $r32
+	addd $r0 = $r0, $r8
+	/* Remaining bytes for 4 bytes store */
+	andw $r8 = $r2, (1 << 2)
+	;;
+	sw.dnez $r8? [$r0] = $r32
+	addd $r0 = $r0, $r8
+	/* Remaining bytes for 2 bytes store */
+	andw $r8 = $r2, (1 << 1)
+	;;
+	sh.dnez $r8? [$r0] = $r32
+	addd $r0 = $r0, $r8
+	;;
+	sb.odd $r2? [$r0] = $r32
+	/* Restore original value */
+	copyd $r0 = $r3
+	ret
+	;;
+.Lmemset_done:
+	/* Restore original value */
+	copyd $r0 = $r3
+	ret
+	;;
+END(memset)
+
+libc_hidden_def(memset)
diff --git a/libc/string/strcasestr.c b/libc/string/strcasestr.c
index 3334086bf..8f57cc0a3 100644
--- a/libc/string/strcasestr.c
+++ b/libc/string/strcasestr.c
@@ -16,7 +16,7 @@ char *strcasestr(const char *s1, const char *s2)
 #if 1
 	do {
 		if (!*p) {
-			return (char *) s1;;
+			return (char *) s1;
 		}
 		if ((*p == *s)
 			|| (tolower(*((unsigned char *)p)) == tolower(*((unsigned char *)s)))
diff --git a/libc/string/strstr.c b/libc/string/strstr.c
index 7e2a64e7d..bf56b9c12 100644
--- a/libc/string/strstr.c
+++ b/libc/string/strstr.c
@@ -22,7 +22,7 @@ Wchar *Wstrstr(const Wchar *s1, const Wchar *s2)
 
 	do {
 		if (!*p) {
-			return (Wchar *) s1;;
+			return (Wchar *) s1;
 		}
 		if (*p == *s) {
 			++p;
diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S
index 55e09e5f1..209e19062 100644
--- a/libc/string/x86_64/strcat.S
+++ b/libc/string/x86_64/strcat.S
@@ -106,7 +106,7 @@ ENTRY (BP_SYM (strcat))
 
 	/* Align, it is a jump target.  */
 	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
-	.p2align 3,,8
+	.p2align 3,,7
 3:
 	subq $8,%rax		/* correct pointer increment.  */
 
diff --git a/libc/string/x86_64/strcspn.S b/libc/string/x86_64/strcspn.S
index 7a06c8867..5ef565db7 100644
--- a/libc/string/x86_64/strcspn.S
+++ b/libc/string/x86_64/strcspn.S
@@ -94,7 +94,7 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	/* but it will also align entire function to 16 bytes, */
 	/* potentially creating largish padding at link time. */
 	/* We are aligning to 8 bytes instead: */
-	.p2align 3,,8
+	.p2align 3,,7
 
 L(3):	addq $4, %rax		/* adjust pointer for full loop round */
 
diff --git a/libc/string/x86_64/strlen.S b/libc/string/x86_64/strlen.S
index 9e84326c2..2fe2f58b2 100644
--- a/libc/string/x86_64/strlen.S
+++ b/libc/string/x86_64/strlen.S
@@ -102,7 +102,7 @@ ENTRY (strlen)
 
 	/* Align, it is a jump target.  */
 	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
-	.p2align 3,,8
+	.p2align 3,,7
 3:
 	subq $8,%rax		/* correct pointer increment.  */
 
diff --git a/libc/string/x86_64/strspn.S b/libc/string/x86_64/strspn.S
index 366377649..8dc42656b 100644
--- a/libc/string/x86_64/strspn.S
+++ b/libc/string/x86_64/strspn.S
@@ -89,7 +89,7 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	/* but it will also align entire function to 16 bytes, */
 	/* potentially creating largish padding at link time. */
 	/* We are aligning to 8 bytes instead: */
-	.p2align 3,,8
+	.p2align 3,,7
 L(3):
 	addq $4, %rax		/* adjust pointer for full loop round */