Diffstat (limited to 'libc/string')
204 files changed, 7290 insertions, 7373 deletions
diff --git a/libc/string/Makefile.in b/libc/string/Makefile.in index 2f14cc0e6..e7f2ccde1 100644 --- a/libc/string/Makefile.in +++ b/libc/string/Makefile.in @@ -1,10 +1,12 @@ # Makefile for uClibc # -# Copyright (C) 2000-2006 Erik Andersen <andersen@uclibc.org> +# Copyright (C) 2000-2008 Erik Andersen <andersen@uclibc.org> # # Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. # +subdirs += libc/string/$(TARGET_ARCH) libc/string/generic + # # Arch specific fun # @@ -16,7 +18,10 @@ STRING_SUBARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH STRING_SUBARCH_SSRC := $(wildcard $(STRING_SUBARCH_OUT)/*.S) STRING_SUBARCH_SOBJ := $(patsubst $(STRING_SUBARCH_DIR)/%.S,$(STRING_SUBARCH_OUT)/%.o,$(STRING_SUBARCH_SSRC)) -STRING_SUBARCH_OBJS := $(STRING_SUBARCH_SOBJ) +STRING_SUBARCH_CSRC := $(wildcard $(STRING_SUBARCH_OUT)/*.c) +STRING_SUBARCH_COBJ := $(patsubst $(STRING_SUBARCH_DIR)/%.c,$(STRING_SUBARCH_OUT)/%.o,$(STRING_SUBARCH_CSRC)) + +STRING_SUBARCH_OBJS := $(STRING_SUBARCH_SOBJ) $(STRING_SUBARCH_COBJ) endif # Collect the arch specific implementation (asm, c files) @@ -133,7 +138,7 @@ libc-y += $(STRING_COBJ) libc-nomulti-$(UCLIBC_HAS_XLOCALE) += $(STRING_OUT)/wcsxfrm_l.o libc-nomulti-y += $(STRING_OUT)/__xpg_strerror_r.o -objclean-y += string_objclean +objclean-y += CLEAN_libc/string -string_objclean: - $(RM) $(STRING_OUT)/{,*/}{,*/}*.{o,os,oS} +CLEAN_libc/string: + $(do_rm) $(addprefix $(STRING_OUT)/,$(addprefix *., o os oS) $(addprefix */*., o os oS) $(addprefix */*/*., o os oS)) diff --git a/libc/string/__glibc_strerror_r.c b/libc/string/__glibc_strerror_r.c index 0f9cd16a9..96b881700 100644 --- a/libc/string/__glibc_strerror_r.c +++ b/libc/string/__glibc_strerror_r.c @@ -5,11 +5,13 @@ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. */ +/* get rid of REDIRECT */ +#define strerror_r __hide_strerror_r + #include <features.h> #include <string.h> -libc_hidden_proto(__glibc_strerror_r) -libc_hidden_proto(__xpg_strerror_r) +#undef strerror_r char *__glibc_strerror_r(int errnum, char *strerrbuf, size_t buflen) { @@ -18,3 +20,6 @@ char *__glibc_strerror_r(int errnum, char *strerrbuf, size_t buflen) return strerrbuf; } libc_hidden_def(__glibc_strerror_r) +#if !defined __USE_XOPEN2K || defined __USE_GNU +strong_alias(__glibc_strerror_r,strerror_r) +#endif diff --git a/libc/string/__xpg_basename.c b/libc/string/__xpg_basename.c index 2449d1d42..2e7ade913 100644 --- a/libc/string/__xpg_basename.c +++ b/libc/string/__xpg_basename.c @@ -5,7 +5,6 @@ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. */ -#include "_string.h" #include <libgen.h> char *__xpg_basename(register char *path) @@ -34,3 +33,7 @@ char *__xpg_basename(register char *path) return first; } +#ifndef __USE_GNU +# undef basename +weak_alias(__xpg_basename,basename) +#endif diff --git a/libc/string/__xpg_strerror_r.c b/libc/string/__xpg_strerror_r.c index ff41192e5..3e78da1be 100644 --- a/libc/string/__xpg_strerror_r.c +++ b/libc/string/__xpg_strerror_r.c @@ -5,8 +5,8 @@ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. 
*/ -/* Make sure we get proper strerror_r() prototype */ -#define strerror_r _hidestrerror_r +/* get rid of REDIRECT */ +#define strerror_r __hide_strerror_r #include <features.h> #include <errno.h> @@ -15,10 +15,6 @@ #undef strerror_r -libc_hidden_proto(__xpg_strerror_r) -/* Experimentally off - libc_hidden_proto(memcpy) */ -/* Experimentally off - libc_hidden_proto(strlen) */ - #ifdef __UCLIBC_HAS_ERRNO_MESSAGES__ extern const char _string_syserrmsgs[] attribute_hidden; @@ -276,4 +272,6 @@ int __xpg_strerror_r(int errnum, char *strerrbuf, size_t buflen) #endif /* __UCLIBC_HAS_ERRNO_MESSAGES__ */ libc_hidden_def(__xpg_strerror_r) -weak_alias(__xpg_strerror_r, strerror_r) +#if defined __USE_XOPEN2K && !defined __USE_GNU +strong_alias(__xpg_strerror_r,strerror_r) +#endif diff --git a/libc/string/_collate.c b/libc/string/_collate.c index 64b5d9608..93501b85e 100644 --- a/libc/string/_collate.c +++ b/libc/string/_collate.c @@ -19,15 +19,6 @@ #include <errno.h> #include <assert.h> -/* Experimentally off - libc_hidden_proto(memset) */ -/* Experimentally off - libc_hidden_proto(memcpy) */ -/* Experimentally off - libc_hidden_proto(strlcpy) */ -/* Experimentally off - libc_hidden_proto(strcmp) */ -#ifdef WANT_WIDE -libc_hidden_proto(wcsxfrm) -libc_hidden_proto(wcscmp) -#endif - #ifdef __UCLIBC_HAS_LOCALE__ #if defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l) @@ -59,29 +50,24 @@ libc_hidden_proto(wcscmp) #if defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) -libc_hidden_proto(wcscoll_l) -libc_hidden_proto(wcscoll) int wcscoll (const Wchar *s0, const Wchar *s1) { return wcscoll_l(s0, s1, __UCLIBC_CURLOCALE ); } libc_hidden_def(wcscoll) -libc_hidden_proto(wcsxfrm_l) -libc_hidden_proto(wcsxfrm) size_t wcsxfrm(Wchar *__restrict ws1, const Wchar *__restrict ws2, size_t n) { return wcsxfrm_l(ws1, ws2, n, __UCLIBC_CURLOCALE ); } -libc_hidden_def(wcsxfrm) #else /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */ #if 0 -#define CUR_COLLATE (&__UCLIBC_CURLOCALE_DATA.collate) +#define CUR_COLLATE (&__UCLIBC_CURLOCALE->collate) #else #define CUR_COLLATE (& __LOCALE_PTR->collate) #endif @@ -173,7 +159,7 @@ static void next_weight(col_state_t *cs, int pass __LOCALE_PARAM ) #define N (1) #else /* WANT_WIDE */ wchar_t WC; - size_t n0, nx; + size_t n0, nx = 0; #define N n0 #endif /* WANT_WIDE */ @@ -381,7 +367,7 @@ static void next_weight(col_state_t *cs, int pass __LOCALE_PARAM ) if (cs->back_buf == cs->ibb) { /* was using internal buffer */ cs->bp = malloc(cs->bb_size + 128); if (!cs->bp) { - __set_errno(ENOMEM); + /* __set_errno(ENOMEM); */ #ifdef __UCLIBC_MJN3_ONLY__ #warning what to do here? #endif @@ -393,7 +379,7 @@ static void next_weight(col_state_t *cs, int pass __LOCALE_PARAM ) } else { cs->bp = realloc(cs->back_buf, cs->bb_size + 128); if (!cs->bp) { - __set_errno(ENOMEM); + /* __set_errno(ENOMEM); */ #ifdef __UCLIBC_MJN3_ONLY__ #warning what to do here? 
#endif @@ -513,7 +499,6 @@ static void next_weight(col_state_t *cs, int pass __LOCALE_PARAM ) } while (1); } -libc_hidden_proto(__XL_NPP(wcscoll)) int __XL_NPP(wcscoll) (const Wchar *s0, const Wchar *s1 __LOCALE_PARAM ) { col_state_t ws[2]; @@ -522,9 +507,9 @@ int __XL_NPP(wcscoll) (const Wchar *s0, const Wchar *s1 __LOCALE_PARAM ) if (!CUR_COLLATE->num_weights) { /* C locale */ #ifdef WANT_WIDE return wcscmp(s0, s1); -#else /* WANT_WIDE */ +#else return strcmp(s0, s1); -#endif /* WANT_WIDE */ +#endif } pass = 0; @@ -551,10 +536,6 @@ libc_hidden_def(__XL_NPP(wcscoll)) #ifdef WANT_WIDE -extern size_t __wcslcpy(wchar_t *__restrict dst, - const wchar_t *__restrict src, size_t n); - -libc_hidden_proto(__XL_NPP(wcsxfrm)) size_t __XL_NPP(wcsxfrm)(wchar_t *__restrict ws1, const wchar_t *__restrict ws2, size_t n __LOCALE_PARAM ) { @@ -592,7 +573,9 @@ size_t __XL_NPP(wcsxfrm)(wchar_t *__restrict ws1, const wchar_t *__restrict ws2, } return count-1; } +#if defined L_strxfrm_l || defined L_wcsxfrm_l libc_hidden_def(__XL_NPP(wcsxfrm)) +#endif #else /* WANT_WIDE */ @@ -636,7 +619,6 @@ static size_t store(unsigned char *s, size_t count, size_t n, __uwchar_t weight) return r; } -libc_hidden_proto(__XL_NPP(strxfrm)) size_t __XL_NPP(strxfrm)(char *__restrict ws1, const char *__restrict ws2, size_t n __LOCALE_PARAM ) { @@ -674,7 +656,9 @@ size_t __XL_NPP(strxfrm)(char *__restrict ws1, const char *__restrict ws2, size_ } return count-1; } +#ifdef L_strxfrm_l libc_hidden_def(__XL_NPP(strxfrm)) +#endif #endif /* WANT_WIDE */ diff --git a/libc/string/arc/Makefile b/libc/string/arc/Makefile new file mode 100755 index 000000000..523cf6842 --- /dev/null +++ b/libc/string/arc/Makefile @@ -0,0 +1,13 @@ +# Makefile for uClibc +# +# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org> +# +# Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. +# + +top_srcdir:=../../../ +top_builddir:=../../../ +all: objs +include $(top_builddir)Rules.mak +include ../Makefile.in +include $(top_srcdir)Makerules diff --git a/libc/string/arc/arcv2/memcpy.S b/libc/string/arc/arcv2/memcpy.S new file mode 100644 index 000000000..7573daf51 --- /dev/null +++ b/libc/string/arc/arcv2/memcpy.S @@ -0,0 +1,236 @@ +/* + * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. 
+ */ + +#include <features.h> +#include <sysdep.h> + +#ifdef __LITTLE_ENDIAN__ +# define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; << +# define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >> +# define MERGE_1(RX,RY,IMM) asl RX, RY, IMM +# define MERGE_2(RX,RY,IMM) +# define EXTRACT_1(RX,RY,IMM) and RX, RY, 0xFFFF +# define EXTRACT_2(RX,RY,IMM) lsr RX, RY, IMM +#else +# define SHIFT_1(RX,RY,IMM) lsr RX, RY, IMM ; >> +# define SHIFT_2(RX,RY,IMM) asl RX, RY, IMM ; << +# define MERGE_1(RX,RY,IMM) asl RX, RY, IMM ; << +# define MERGE_2(RX,RY,IMM) asl RX, RY, IMM ; << +# define EXTRACT_1(RX,RY,IMM) lsr RX, RY, IMM +# define EXTRACT_2(RX,RY,IMM) lsr RX, RY, 0x08 +#endif + +#ifdef __LL64__ +# define PREFETCH_READ(RX) prefetch [RX, 56] +# define PREFETCH_WRITE(RX) prefetchw [RX, 64] +# define LOADX(DST,RX) ldd.ab DST, [RX, 8] +# define STOREX(SRC,RX) std.ab SRC, [RX, 8] +# define ZOLSHFT 5 +# define ZOLAND 0x1F +#else +# define PREFETCH_READ(RX) prefetch [RX, 28] +# define PREFETCH_WRITE(RX) prefetchw [RX, 32] +# define LOADX(DST,RX) ld.ab DST, [RX, 4] +# define STOREX(SRC,RX) st.ab SRC, [RX, 4] +# define ZOLSHFT 4 +# define ZOLAND 0xF +#endif + +ENTRY(memcpy) + prefetch [r1] ; Prefetch the read location + prefetchw [r0] ; Prefetch the write location + mov.f 0, r2 +;;; if size is zero + jz.d [blink] + mov r3, r0 ; don't clobber ret val + +;;; if size <= 8 + cmp r2, 8 + bls.d @.Lsmallchunk + mov.f lp_count, r2 + + and.f r4, r0, 0x03 + rsub lp_count, r4, 4 + lpnz @.Laligndestination + ;; LOOP BEGIN + ldb.ab r5, [r1,1] + sub r2, r2, 1 + stb.ab r5, [r3,1] +.Laligndestination: + +;;; Check the alignment of the source + and.f r4, r1, 0x03 + bnz.d @.Lsourceunaligned + +;;; CASE 0: Both source and destination are 32bit aligned +;;; Convert len to Dwords, unfold x4 + lsr.f lp_count, r2, ZOLSHFT + lpnz @.Lcopy32_64bytes + ;; LOOP START + LOADX (r6, r1) + PREFETCH_READ (r1) + PREFETCH_WRITE (r3) + LOADX (r8, r1) + LOADX (r10, r1) + LOADX (r4, r1) + STOREX (r6, r3) + STOREX (r8, r3) + STOREX (r10, r3) + STOREX (r4, r3) +.Lcopy32_64bytes: + + and.f lp_count, r2, ZOLAND ;Last remaining 31 bytes +.Lsmallchunk: + lpnz @.Lcopyremainingbytes + ;; LOOP START + ldb.ab r5, [r1,1] + stb.ab r5, [r3,1] +.Lcopyremainingbytes: + + j [blink] +;;; END CASE 0 + +.Lsourceunaligned: + cmp r4, 2 + beq.d @.LunalignedOffby2 + sub r2, r2, 1 + + bhi.d @.LunalignedOffby3 + ldb.ab r5, [r1, 1] + +;;; CASE 1: The source is unaligned, off by 1 + ;; Hence I need to read 1 byte for a 16bit alignment + ;; and 2bytes to reach 32bit alignment + ldh.ab r6, [r1, 2] + sub r2, r2, 2 + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 + MERGE_1 (r6, r6, 8) + MERGE_2 (r5, r5, 24) + or r5, r5, r6 + + ;; Both src and dst are aligned + lpnz @.Lcopy8bytes_1 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetchw [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 24) + or r7, r7, r5 + SHIFT_2 (r5, r6, 8) + + SHIFT_1 (r9, r8, 24) + or r9, r9, r5 + SHIFT_2 (r5, r8, 8) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +.Lcopy8bytes_1: + + ;; Write back the remaining 16bits + EXTRACT_1 (r6, r5, 16) + sth.ab r6, [r3, 2] + ;; Write back the remaining 8bits + EXTRACT_2 (r5, r5, 16) + stb.ab r5, [r3, 1] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @.Lcopybytewise_1 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +.Lcopybytewise_1: + j [blink] + +.LunalignedOffby2: +;;; CASE 2: The source is unaligned, off by 2 + ldh.ab r5, [r1, 2] + sub r2, r2, 1 + + ;; Both src and dst are aligned + ;; 
Convert to words, unfold x2 + lsr.f lp_count, r2, 3 +#ifdef __BIG_ENDIAN__ + asl.nz r5, r5, 16 +#endif + lpnz @.Lcopy8bytes_2 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetchw [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 16) + or r7, r7, r5 + SHIFT_2 (r5, r6, 16) + + SHIFT_1 (r9, r8, 16) + or r9, r9, r5 + SHIFT_2 (r5, r8, 16) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +.Lcopy8bytes_2: + +#ifdef __BIG_ENDIAN__ + lsr.nz r5, r5, 16 +#endif + sth.ab r5, [r3, 2] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @.Lcopybytewise_2 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +.Lcopybytewise_2: + j [blink] + +.LunalignedOffby3: +;;; CASE 3: The source is unaligned, off by 3 +;;; Hence, I need to read 1byte for achieve the 32bit alignment + + ;; Both src and dst are aligned + ;; Convert to words, unfold x2 + lsr.f lp_count, r2, 3 +#ifdef __BIG_ENDIAN__ + asl.ne r5, r5, 24 +#endif + lpnz @.Lcopy8bytes_3 + ;; LOOP START + ld.ab r6, [r1, 4] + prefetch [r1, 28] ;Prefetch the next read location + ld.ab r8, [r1,4] + prefetchw [r3, 32] ;Prefetch the next write location + + SHIFT_1 (r7, r6, 8) + or r7, r7, r5 + SHIFT_2 (r5, r6, 24) + + SHIFT_1 (r9, r8, 8) + or r9, r9, r5 + SHIFT_2 (r5, r8, 24) + + st.ab r7, [r3, 4] + st.ab r9, [r3, 4] +.Lcopy8bytes_3: + +#ifdef __BIG_ENDIAN__ + lsr.nz r5, r5, 24 +#endif + stb.ab r5, [r3, 1] + + and.f lp_count, r2, 0x07 ;Last 8bytes + lpnz @.Lcopybytewise_3 + ;; LOOP START + ldb.ab r6, [r1,1] + stb.ab r6, [r3,1] +.Lcopybytewise_3: + j [blink] + +END(memcpy) +libc_hidden_def(memcpy) diff --git a/libc/string/arc/arcv2/memset.S b/libc/string/arc/arcv2/memset.S new file mode 100644 index 000000000..0918d3774 --- /dev/null +++ b/libc/string/arc/arcv2/memset.S @@ -0,0 +1,115 @@ + +/* + * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. 
+ */ + +#include <features.h> +#include <sysdep.h> + +#ifdef DONT_USE_PREALLOC +#define PREWRITE(A,B) prefetchw [(A),(B)] +#else +#define PREWRITE(A,B) prealloc [(A),(B)] +#endif + +ENTRY(memset) + prefetchw [r0] ; Prefetch the write location + mov.f 0, r2 +;;; if size is zero + jz.d [blink] + mov r3, r0 ; don't clobber ret val + +;;; if length < 8 + brls.d.nt r2, 8, .Lsmallchunk + mov.f lp_count,r2 + + and.f r4, r0, 0x03 + rsub lp_count, r4, 4 + lpnz @.Laligndestination + ;; LOOP BEGIN + stb.ab r1, [r3,1] + sub r2, r2, 1 +.Laligndestination: + +;;; Destination is aligned + and r1, r1, 0xFF + asl r4, r1, 8 + or r4, r4, r1 + asl r5, r4, 16 + or r5, r5, r4 + mov r4, r5 + + sub3 lp_count, r2, 8 + cmp r2, 64 + bmsk.hi r2, r2, 5 + mov.ls lp_count, 0 + add3.hi r2, r2, 8 + +;;; Convert len to Dwords, unfold x8 + lsr.f lp_count, lp_count, 6 + lpnz @.Lset64bytes + ;; LOOP START + PREWRITE(r3, 64) ;Prefetch the next write location +#ifdef __LL64__ + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] +#else + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] +#endif +.Lset64bytes: + + lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes + lpnz .Lset32bytes + ;; LOOP START + prefetchw [r3, 32] ;Prefetch the next write location +#ifdef __LL64__ + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] + std.ab r4, [r3, 8] +#else + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] + st.ab r4, [r3, 4] +#endif +.Lset32bytes: + + and.f lp_count, r2, 0x1F ;Last remaining 31 bytes +.Lsmallchunk: + lpnz .Lcopy3bytes + ;; LOOP START + stb.ab r1, [r3, 1] +.Lcopy3bytes: + + j [blink] + +END(memset) +libc_hidden_def(memset) diff --git a/libc/string/arc/arcv2/strcmp.S b/libc/string/arc/arcv2/strcmp.S new file mode 100644 index 000000000..2e0e64a0c --- /dev/null +++ b/libc/string/arc/arcv2/strcmp.S @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com) + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. 
+ */ + +#include <features.h> +#include <sysdep.h> + +ENTRY(strcmp) + or r2, r0, r1 + bmsk_s r2, r2, 1 + brne r2, 0, @.Lcharloop + +;;; s1 and s2 are word aligned + ld.ab r2, [r0, 4] + + mov_s r12, 0x01010101 + ror r11, r12 + .align 4 +.LwordLoop: + ld.ab r3, [r1, 4] + ;; Detect NULL char in str1 + sub r4, r2, r12 + ld.ab r5, [r0, 4] + bic r4, r4, r2 + and r4, r4, r11 + brne.d.nt r4, 0, .LfoundNULL + ;; Check if the read locations are the same + cmp r2, r3 + beq.d .LwordLoop + mov.eq r2, r5 + + ;; A match is found, spot it out +#ifdef __LITTLE_ENDIAN__ + swape r3, r3 + mov_s r0, 1 + swape r2, r2 +#else + mov_s r0, 1 +#endif + cmp_s r2, r3 + j_s.d [blink] + bset.lo r0, r0, 31 + + .align 4 +.LfoundNULL: +#ifdef __BIG_ENDIAN__ + swape r4, r4 + swape r2, r2 + swape r3, r3 +#endif + ;; Find null byte + ffs r0, r4 + bmsk r2, r2, r0 + bmsk r3, r3, r0 + swape r2, r2 + swape r3, r3 + ;; make the return value + sub.f r0, r2, r3 + mov.hi r0, 1 + j_s.d [blink] + bset.lo r0, r0, 31 + + .align 4 +.Lcharloop: + ldb.ab r2, [r0, 1] + ldb.ab r3, [r1, 1] + nop + breq r2, 0, .Lcmpend + breq r2, r3, .Lcharloop + + .align 4 +.Lcmpend: + j_s.d [blink] + sub r0, r2, r3 +END(strcmp) +libc_hidden_def(strcmp) + +#ifndef __UCLIBC_HAS_LOCALE__ +strong_alias(strcmp,strcoll) +libc_hidden_def(strcoll) +#endif diff --git a/libc/string/arc/memcmp.S b/libc/string/arc/memcmp.S new file mode 100644 index 000000000..a60757e7a --- /dev/null +++ b/libc/string/arc/memcmp.S @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + +#include <sysdep.h> +#include <features.h> + +#ifdef __LITTLE_ENDIAN__ +#define WORD2 r2 +#define SHIFT r3 +#else /* BIG ENDIAN */ +#define WORD2 r3 +#define SHIFT r2 +#endif + +ENTRY(memcmp) + or r12,r0,r1 + asl_s r12,r12,30 + sub r3,r2,1 + brls r2,r12,.Lbytewise + ld r4,[r0,0] + ld r5,[r1,0] + lsr.f lp_count,r3,3 +#ifdef __HS__ + /* In ARCv2 a branch can't be the last instruction in a zero overhead + * loop. + * So we move the branch to the start of the loop, duplicate it + * after the end, and set up r12 so that the branch isn't taken + * initially. 
+ */ + mov_s r12,WORD2 + lpne .Loop_end + brne WORD2,r12,.Lodd + ld WORD2,[r0,4] +#else + lpne .Loop_end + ld_s WORD2,[r0,4] +#endif + ld_s r12,[r1,4] + brne r4,r5,.Leven + ld.a r4,[r0,8] + ld.a r5,[r1,8] +#ifdef __HS__ +.Loop_end: + brne WORD2,r12,.Lodd +#else + brne WORD2,r12,.Lodd +.Loop_end: +#endif + asl_s SHIFT,SHIFT,3 + bhs_s .Last_cmp + brne r4,r5,.Leven + ld r4,[r0,4] + ld r5,[r1,4] +#ifdef __LITTLE_ENDIAN__ + nop_s + ; one more load latency cycle +.Last_cmp: + xor r0,r4,r5 + bset r0,r0,SHIFT + sub_s r1,r0,1 + bic_s r1,r1,r0 + norm r1,r1 + b.d .Leven_cmp + and r1,r1,24 +.Leven: + xor r0,r4,r5 + sub_s r1,r0,1 + bic_s r1,r1,r0 + norm r1,r1 + ; slow track insn + and r1,r1,24 +.Leven_cmp: + asl r2,r4,r1 + asl r12,r5,r1 + lsr_s r2,r2,1 + lsr_s r12,r12,1 + j_s.d [blink] + sub r0,r2,r12 + .balign 4 +.Lodd: + xor r0,WORD2,r12 + sub_s r1,r0,1 + bic_s r1,r1,r0 + norm r1,r1 + ; slow track insn + and r1,r1,24 + asl_s r2,r2,r1 + asl_s r12,r12,r1 + lsr_s r2,r2,1 + lsr_s r12,r12,1 + j_s.d [blink] + sub r0,r2,r12 +#else /* BIG ENDIAN */ +.Last_cmp: + neg_s SHIFT,SHIFT + lsr r4,r4,SHIFT + lsr r5,r5,SHIFT + ; slow track insn +.Leven: + sub.f r0,r4,r5 + mov.ne r0,1 + j_s.d [blink] + bset.cs r0,r0,31 +.Lodd: + cmp_s WORD2,r12 + mov_s r0,1 + j_s.d [blink] + bset.cs r0,r0,31 +#endif /* ENDIAN */ + .balign 4 +.Lbytewise: + breq r2,0,.Lnil + ldb r4,[r0,0] + ldb r5,[r1,0] + lsr.f lp_count,r3 +#ifdef __HS__ + mov r12,r3 + lpne .Lbyte_end + brne r3,r12,.Lbyte_odd +#else + lpne .Lbyte_end +#endif + ldb_s r3,[r0,1] + ldb r12,[r1,1] + brne r4,r5,.Lbyte_even + ldb.a r4,[r0,2] + ldb.a r5,[r1,2] +#ifdef __HS__ +.Lbyte_end: + brne r3,r12,.Lbyte_odd +#else + brne r3,r12,.Lbyte_odd +.Lbyte_end: +#endif + bcc .Lbyte_even + brne r4,r5,.Lbyte_even + ldb_s r3,[r0,1] + ldb_s r12,[r1,1] +.Lbyte_odd: + j_s.d [blink] + sub r0,r3,r12 +.Lbyte_even: + j_s.d [blink] + sub r0,r4,r5 +.Lnil: + j_s.d [blink] + mov r0,0 +END(memcmp) +libc_hidden_def(memcmp) + +#ifdef __UCLIBC_SUSV3_LEGACY__ +strong_alias(memcmp,bcmp) +#endif diff --git a/libc/string/arc/memcpy.S b/libc/string/arc/memcpy.S new file mode 100644 index 000000000..1c11951e4 --- /dev/null +++ b/libc/string/arc/memcpy.S @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + +#include <sysdep.h> + +/* This memcpy implementation does not support objects of 1GB or larger - + the check for alignment does not work then. */ +/* We assume that most sources and destinations are aligned, and + that also lengths are mostly a multiple of four, although to a lesser + extent. 
*/ +ENTRY(memcpy) + or r3,r0,r1 + asl_s r3,r3,30 + mov_s r5,r0 + brls.d r2,r3,.Lcopy_bytewise + sub.f r3,r2,1 + ld_s r12,[r1,0] + asr.f lp_count,r3,3 + bbit0.d r3,2,.Lnox4 + bmsk_s r2,r2,1 + st.ab r12,[r5,4] + ld.a r12,[r1,4] +.Lnox4: + lppnz .Lendloop + ld_s r3,[r1,4] + st.ab r12,[r5,4] + ld.a r12,[r1,8] + st.ab r3,[r5,4] +.Lendloop: + breq r2,0,.Last_store + ld r3,[r5,0] +#ifdef __LITTLE_ENDIAN__ + add3 r2,-1,r2 + ; uses long immediate + xor_s r12,r12,r3 + bmsk r12,r12,r2 + xor_s r12,r12,r3 +#else /* BIG ENDIAN */ + sub3 r2,31,r2 + ; uses long immediate + xor_s r3,r3,r12 + bmsk r3,r3,r2 + xor_s r12,r12,r3 +#endif /* ENDIAN */ +.Last_store: + j_s.d [blink] + st r12,[r5,0] + + .balign 4 +.Lcopy_bytewise: + jcs [blink] + ldb_s r12,[r1,0] + lsr.f lp_count,r3 + bhs_s .Lnox1 + stb.ab r12,[r5,1] + ldb.a r12,[r1,1] +.Lnox1: + lppnz .Lendbloop + ldb_s r3,[r1,1] + stb.ab r12,[r5,1] + ldb.a r12,[r1,2] + stb.ab r3,[r5,1] +.Lendbloop: + j_s.d [blink] + stb r12,[r5,0] +END(memcpy) +libc_hidden_def(memcpy) diff --git a/libc/string/arc/memset.S b/libc/string/arc/memset.S new file mode 100644 index 000000000..f4048455a --- /dev/null +++ b/libc/string/arc/memset.S @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + +#include <sysdep.h> + +#define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */ + +ENTRY(memset) + + mov_s r4,r0 + or r12,r0,r2 + bmsk.f r12,r12,1 + extb_s r1,r1 + asl r3,r1,8 + beq.d .Laligned + or_s r1,r1,r3 + brls r2,SMALL,.Ltiny + add r3,r2,r0 + stb r1,[r3,-1] + bclr_s r3,r3,0 + stw r1,[r3,-2] + bmsk.f r12,r0,1 + add_s r2,r2,r12 + sub.ne r2,r2,4 + stb.ab r1,[r4,1] + and r4,r4,-2 + stw.ab r1,[r4,2] + and r4,r4,-4 +.Laligned: ; This code address should be aligned for speed. + asl r3,r1,16 + lsr.f lp_count,r2,2 + or_s r1,r1,r3 + lpne .Loop_end + st.ab r1,[r4,4] +.Loop_end: + j_s [blink] + + + .balign 4 +.Ltiny: + mov.f lp_count,r2 + lpne .Ltiny_end + stb.ab r1,[r4,1] +.Ltiny_end: + j_s [blink] +END(memset) +libc_hidden_def(memset) diff --git a/libc/string/arc/strchr.S b/libc/string/arc/strchr.S new file mode 100644 index 000000000..443993589 --- /dev/null +++ b/libc/string/arc/strchr.S @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + +#include <sysdep.h> +#include <features.h> + +/* ARC700 has a relatively long pipeline and branch prediction, so we want + to avoid branches that are hard to predict. On the other hand, the + presence of the norm instruction makes it easier to operate on whole + words branch-free. */ + +ENTRY(strchr) + extb_s r1,r1 + asl r5,r1,8 + bmsk r2,r0,1 + or r5,r5,r1 + mov_s r3,0x01010101 + breq.d r2,r0,.Laligned + asl r4,r5,16 + sub_s r0,r0,r2 + asl r7,r2,3 + ld_s r2,[r0] +#ifdef __LITTLE_ENDIAN__ + asl r7,r3,r7 +#else + lsr r7,r3,r7 +#endif + or r5,r5,r4 + ror r4,r3 + sub r12,r2,r7 + bic_s r12,r12,r2 + and r12,r12,r4 + brne.d r12,0,.Lfound0_ua + xor r6,r2,r5 + ld.a r2,[r0,4] + sub r12,r6,r7 + bic r12,r12,r6 +#ifdef __LITTLE_ENDIAN__ + and r7,r12,r4 + breq r7,0,.Loop ; For speed, we want this branch to be unaligned. + b .Lfound_char ; Likewise this one. +#else + and r12,r12,r4 + breq r12,0,.Loop ; For speed, we want this branch to be unaligned. 
+ lsr_s r12,r12,7 + bic r2,r7,r6 + b.d .Lfound_char_b + and_s r2,r2,r12 +#endif +; /* We require this code address to be unaligned for speed... */ +.Laligned: + ld_s r2,[r0] + or r5,r5,r4 + ror r4,r3 +; /* ... so that this code address is aligned, for itself and ... */ +.Loop: + sub r12,r2,r3 + bic_s r12,r12,r2 + and r12,r12,r4 + brne.d r12,0,.Lfound0 + xor r6,r2,r5 + ld.a r2,[r0,4] + sub r12,r6,r3 + bic r12,r12,r6 + and r7,r12,r4 + breq r7,0,.Loop /* ... so that this branch is unaligned. */ + ; Found searched-for character. r0 has already advanced to next word. +#ifdef __LITTLE_ENDIAN__ +/* We only need the information about the first matching byte + (i.e. the least significant matching byte) to be exact, + hence there is no problem with carry effects. */ +.Lfound_char: + sub r3,r7,1 + bic r3,r3,r7 + norm r2,r3 + sub_s r0,r0,1 + asr_s r2,r2,3 + j.d [blink] + sub_s r0,r0,r2 + + .balign 4 +.Lfound0_ua: + mov r3,r7 +.Lfound0: + sub r3,r6,r3 + bic r3,r3,r6 + and r2,r3,r4 + or_s r12,r12,r2 + sub_s r3,r12,1 + bic_s r3,r3,r12 + norm r3,r3 + add_s r0,r0,3 + asr_s r12,r3,3 + asl.f 0,r2,r3 + sub_s r0,r0,r12 + j_s.d [blink] + mov.pl r0,0 +#else /* BIG ENDIAN */ +.Lfound_char: + lsr r7,r7,7 + + bic r2,r7,r6 +.Lfound_char_b: + norm r2,r2 + sub_s r0,r0,4 + asr_s r2,r2,3 + j.d [blink] + add_s r0,r0,r2 + +.Lfound0_ua: + mov_s r3,r7 +.Lfound0: + asl_s r2,r2,7 + or r7,r6,r4 + bic_s r12,r12,r2 + sub r2,r7,r3 + or r2,r2,r6 + bic r12,r2,r12 + bic.f r3,r4,r12 + norm r3,r3 + + add.pl r3,r3,1 + asr_s r12,r3,3 + asl.f 0,r2,r3 + add_s r0,r0,r12 + j_s.d [blink] + mov.mi r0,0 +#endif /* ENDIAN */ +END(strchr) +libc_hidden_def(strchr) + +#ifdef __UCLIBC_SUSV3_LEGACY__ +strong_alias(strchr,index) +#endif diff --git a/libc/string/arc/strcmp.S b/libc/string/arc/strcmp.S new file mode 100644 index 000000000..5a0e56045 --- /dev/null +++ b/libc/string/arc/strcmp.S @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + +#include <features.h> +#include <sysdep.h> + +/* This is optimized primarily for the ARC700. + It would be possible to speed up the loops by one cycle / word + respective one cycle / byte by forcing double source 1 alignment, unrolling + by a factor of two, and speculatively loading the second word / byte of + source 1; however, that would increase the overhead for loop setup / finish, + and strcmp might often terminate early. 
*/ + +ENTRY(strcmp) + or r2,r0,r1 + bmsk_s r2,r2,1 + brne r2,0,.Lcharloop + mov_s r12,0x01010101 + ror r5,r12 +.Lwordloop: + ld.ab r2,[r0,4] + ld.ab r3,[r1,4] + nop_s + sub r4,r2,r12 + bic r4,r4,r2 + and r4,r4,r5 + brne r4,0,.Lfound0 + breq r2,r3,.Lwordloop +#ifdef __LITTLE_ENDIAN__ + xor r0,r2,r3 ; mask for difference + sub_s r1,r0,1 + bic_s r0,r0,r1 ; mask for least significant difference bit + sub r1,r5,r0 + xor r0,r5,r1 ; mask for least significant difference byte + and_s r2,r2,r0 + and_s r3,r3,r0 +#endif /* LITTLE ENDIAN */ + cmp_s r2,r3 + mov_s r0,1 + j_s.d [blink] + bset.lo r0,r0,31 + + .balign 4 +#ifdef __LITTLE_ENDIAN__ +.Lfound0: + xor r0,r2,r3 ; mask for difference + or r0,r0,r4 ; or in zero indicator + sub_s r1,r0,1 + bic_s r0,r0,r1 ; mask for least significant difference bit + sub r1,r5,r0 + xor r0,r5,r1 ; mask for least significant difference byte + and_s r2,r2,r0 + and_s r3,r3,r0 + sub.f r0,r2,r3 + mov.hi r0,1 + j_s.d [blink] + bset.lo r0,r0,31 +#else /* BIG ENDIAN */ + /* The zero-detection above can mis-detect 0x01 bytes as zeroes + because of carry-propagateion from a lower significant zero byte. + We can compensate for this by checking that bit0 is zero. + This compensation is not necessary in the step where we + get a low estimate for r2, because in any affected bytes + we already have 0x00 or 0x01, which will remain unchanged + when bit 7 is cleared. */ + .balign 4 +.Lfound0: + lsr r0,r4,8 + lsr_s r1,r2 + bic_s r2,r2,r0 ; get low estimate for r2 and get ... + bic_s r0,r0,r1 ; <this is the adjusted mask for zeros> + or_s r3,r3,r0 ; ... high estimate r3 so that r2 > r3 will ... + cmp_s r3,r2 ; ... be independent of trailing garbage + or_s r2,r2,r0 ; likewise for r3 > r2 + bic_s r3,r3,r0 + rlc r0,0 ; r0 := r2 > r3 ? 1 : 0 + cmp_s r2,r3 + j_s.d [blink] + bset.lo r0,r0,31 +#endif /* ENDIAN */ + + .balign 4 +.Lcharloop: + ldb.ab r2,[r0,1] + ldb.ab r3,[r1,1] + nop_s + breq r2,0,.Lcmpend + breq r2,r3,.Lcharloop +.Lcmpend: + j_s.d [blink] + sub r0,r2,r3 +END(strcmp) +libc_hidden_def(strcmp) + +#ifndef __UCLIBC_HAS_LOCALE__ +strong_alias(strcmp,strcoll) +libc_hidden_def(strcoll) +#endif diff --git a/libc/string/arc/strcpy.S b/libc/string/arc/strcpy.S new file mode 100644 index 000000000..241bf3ee6 --- /dev/null +++ b/libc/string/arc/strcpy.S @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + + +#include <sysdep.h> + +/* If dst and src are 4 byte aligned, copy 8 bytes at a time. + If the src is 4, but not 8 byte aligned, we first read 4 bytes to get + it 8 byte aligned. Thus, we can do a little read-ahead, without + dereferencing a cache line that we should not touch. + Note that short and long instructions have been scheduled to avoid + branch stalls. + The beq_s to r3z could be made unaligned & long to avoid a stall + there, but the it is not likely to be taken often, and it + would also be likey to cost an unaligned mispredict at the next call. 
*/ + +ENTRY(strcpy) + or r2,r0,r1 + bmsk_s r2,r2,1 + brne.d r2,0,charloop + mov_s r10,r0 + ld_s r3,[r1,0] + mov r8,0x01010101 + bbit0.d r1,2,loop_start + ror r12,r8 + sub r2,r3,r8 + bic_s r2,r2,r3 + tst_s r2,r12 + bne r3z + mov_s r4,r3 + .balign 4 +loop: + ld.a r3,[r1,4] + st.ab r4,[r10,4] +loop_start: + ld.a r4,[r1,4] + sub r2,r3,r8 + bic_s r2,r2,r3 + tst_s r2,r12 + bne_s r3z + st.ab r3,[r10,4] + sub r2,r4,r8 + bic r2,r2,r4 + tst r2,r12 + beq loop + mov_s r3,r4 +#ifdef __LITTLE_ENDIAN__ +r3z: bmsk.f r1,r3,7 + lsr_s r3,r3,8 +#else +r3z: lsr.f r1,r3,24 + asl_s r3,r3,8 +#endif + bne.d r3z + stb.ab r1,[r10,1] + j_s [blink] + + .balign 4 +charloop: + ldb.ab r3,[r1,1] + + + brne.d r3,0,charloop + stb.ab r3,[r10,1] + j [blink] +END(strcpy) +libc_hidden_def(strcpy) diff --git a/libc/string/arc/strlen.S b/libc/string/arc/strlen.S new file mode 100644 index 000000000..0b9b93815 --- /dev/null +++ b/libc/string/arc/strlen.S @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2007 ARC International (UK) LTD + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + */ + + +#include <sysdep.h> + +ENTRY(strlen) + or r3,r0,7 + ld r2,[r3,-7] + ld.a r6,[r3,-3] + mov r4,0x01010101 + ; uses long immediate +#ifdef __LITTLE_ENDIAN__ + asl_s r1,r0,3 + btst_s r0,2 + asl r7,r4,r1 + ror r5,r4 + sub r1,r2,r7 + bic_s r1,r1,r2 + mov.eq r7,r4 + sub r12,r6,r7 + bic r12,r12,r6 + or.eq r12,r12,r1 + and r12,r12,r5 + brne r12,0,.Learly_end +#else /* BIG ENDIAN */ + ror r5,r4 + btst_s r0,2 + mov_s r1,31 + sub3 r7,r1,r0 + sub r1,r2,r4 + bic_s r1,r1,r2 + bmsk r1,r1,r7 + sub r12,r6,r4 + bic r12,r12,r6 + bmsk.ne r12,r12,r7 + or.eq r12,r12,r1 + and r12,r12,r5 + brne r12,0,.Learly_end +#endif /* ENDIAN */ + +.Loop: + ld_s r2,[r3,4] + ld.a r6,[r3,8] + ; stall for load result + sub r1,r2,r4 + bic_s r1,r1,r2 + sub r12,r6,r4 + bic r12,r12,r6 + or r12,r12,r1 + and r12,r12,r5 + breq r12,0,.Loop +.Lend: + and.f r1,r1,r5 + sub.ne r3,r3,4 + mov.eq r1,r12 +#ifdef __LITTLE_ENDIAN__ + sub_s r2,r1,1 + bic_s r2,r2,r1 + norm r1,r2 + sub_s r0,r0,3 + lsr_s r1,r1,3 + sub r0,r3,r0 + j_s.d [blink] + sub r0,r0,r1 +#else /* BIG ENDIAN */ + lsr_s r1,r1,7 + mov.eq r2,r6 + bic_s r1,r1,r2 + norm r1,r1 + sub r0,r3,r0 + lsr_s r1,r1,3 + j_s.d [blink] + add r0,r0,r1 +#endif /* ENDIAN */ +.Learly_end: + b.d .Lend + sub_s.ne r1,r1,r1 +END(strlen) +libc_hidden_def(strlen) diff --git a/libc/string/arm/_memcpy.S b/libc/string/arm/_memcpy.S index 103580a0c..2999e8ee6 100644 --- a/libc/string/arm/_memcpy.S +++ b/libc/string/arm/_memcpy.S @@ -40,6 +40,7 @@ #include <features.h> #include <endian.h> #include <bits/arm_asm.h> +#include <bits/arm_bx.h> #if !defined(THUMB1_ONLY) /* @@ -67,8 +68,9 @@ * a time where possible. * * Note: r12 (aka ip) can be trashed during the function along with - * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out. + * r0-r3 although r0-r2 have defined uses i.e. dest, src, len throughout. * Additional registers are preserved prior to use i.e. r4, r5 & lr + * The return value in r0 must be the destination address. 
* * Apologies for the state of the comments ;-) */ @@ -108,12 +110,8 @@ _memcpy: cmp r1, r0 bcc .Lmemcpy_backwards - IT(tt, eq) /* Quick abort for src=dst */ -#if defined(__USE_BX__) - bxeq lr -#else - moveq pc, lr -#endif + IT(t, eq) /* Quick abort for src=dst */ + BXC(eq, lr) stmdb sp!, {r0, lr} /* memcpy() returns dest addr */ subs r2, r2, #4 blt .Lmemcpy_fl4 /* less than 4 bytes */ @@ -453,11 +451,7 @@ _memcpy: /* less than 4 bytes to go */ adds r2, r2, #4 IT(t, eq) -#if defined(__USE_BX__) - bxeq lr -#else - moveq pc, lr /* done */ -#endif + BXC(eq, lr) /* done */ /* copy the crud byte at a time */ cmp r2, #2 ldrb r3, [r1, #-1]! @@ -475,11 +469,7 @@ _memcpy: ldrgtb r3, [r1, #-1]! strgtb r3, [r0, #-1]! #endif -#if defined(__USE_BX__) - bx lr -#else - mov pc, lr -#endif + BX(lr) /* erg - unaligned destination */ .Lmemcpy_bdestul: cmp r12, #2 diff --git a/libc/string/arm/memcmp.S b/libc/string/arm/memcmp.S index 65409f43a..5b9473cd0 100644 --- a/libc/string/arm/memcmp.S +++ b/libc/string/arm/memcmp.S @@ -31,6 +31,7 @@ #include <features.h> #include <bits/arm_asm.h> +#include <bits/arm_bx.h> .text .global memcmp @@ -66,11 +67,7 @@ memcmp: subs r2, r2, #1 IT(tt, mi) movmi r0, #0 -#if defined(__USE_BX__) - bxmi lr -#else - movmi pc, lr -#endif + BXC(mi, lr) /* ip == last src address to compare */ add ip, r0, r2 1: @@ -81,11 +78,7 @@ memcmp: cmpcs r2, r3 beq 1b sub r0, r2, r3 -#if defined(__USE_BX__) - bx lr -#else - mov pc, lr -#endif + BX(lr) #endif .size memcmp,.-memcmp diff --git a/libc/string/arm/memset.S b/libc/string/arm/memset.S index 66aa6039c..2be4850e4 100644 --- a/libc/string/arm/memset.S +++ b/libc/string/arm/memset.S @@ -13,13 +13,12 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <features.h> -#include <sys/syscall.h> #include <bits/arm_asm.h> +#include <bits/arm_bx.h> .text .global memset @@ -109,11 +108,7 @@ memset: 2: movs a3, a3 @ anything left? IT(t, eq) -#if defined(__USE_BX__) - bxeq lr -#else - moveq pc, lr @ nope -#endif + BXC(eq, lr) @ nope #if defined (__thumb2__) 1: strb a2, [a4], #1 @@ -131,11 +126,7 @@ memset: strb a2, [a4], $1 strb a2, [a4], $1 strb a2, [a4], $1 -#if defined(__USE_BX__) - bx lr -#else - mov pc, lr -#endif + BX(lr) #endif #endif diff --git a/libc/string/arm/strcmp.S b/libc/string/arm/strcmp.S index 97363c1c2..81416a9a5 100644 --- a/libc/string/arm/strcmp.S +++ b/libc/string/arm/strcmp.S @@ -31,6 +31,7 @@ #include <features.h> #include <bits/arm_asm.h> +#include <bits/arm_bx.h> .text .global strcmp @@ -62,11 +63,7 @@ strcmp: cmpcs r2, r3 beq 1b sub r0, r2, r3 -#if defined(__USE_BX__) - bx lr -#else - mov pc, lr -#endif + BX(lr) #endif .size strcmp,.-strcmp diff --git a/libc/string/arm/strlen.S b/libc/string/arm/strlen.S index 949e918f4..9995d768c 100644 --- a/libc/string/arm/strlen.S +++ b/libc/string/arm/strlen.S @@ -13,14 +13,13 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ #include <features.h> #include <endian.h> -#include <sys/syscall.h> #include <bits/arm_asm.h> +#include <bits/arm_bx.h> /* size_t strlen(const char *S) * entry: r0 -> string @@ -99,11 +98,7 @@ Llastword: @ drop through to here once we find a IT(t, ne) addne r0, r0, $1 @ must be zero) #endif -#if defined(__USE_BX__) - bx lr -#else - mov pc,lr -#endif + BX(lr) #endif .size strlen,.-strlen diff --git a/libc/string/arm/strncmp.S b/libc/string/arm/strncmp.S deleted file mode 100644 index 8487639c8..000000000 --- a/libc/string/arm/strncmp.S +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2002 ARM Ltd - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the company may not be used to endorse or promote - * products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Adapted for uClibc from NetBSD strncmp.S, version 1.2 2003/04/05 - * by Erik Andersen <andersen@codepoet.org> - */ - -#include <features.h> -#include <bits/arm_asm.h> - -.text -.global strncmp -.type strncmp,%function -.align 4 - -#if defined(THUMB1_ONLY) -.thumb_func -strncmp: - /* if (len == 0) return 0 */ - cmp r2, #0 - bne 1f - mov r0, #0 - bx lr -1: - push {r4} - - /* ip == last src address to compare */ - add r4, r0, r2 -2: - cmp r4, r0 - beq 3f - ldrb r2, [r0] - add r0, r0, #1 - ldrb r3, [r1] - add r1, r1, #1 - cmp r2, #0 - beq 3f - cmp r2, r3 - beq 2b -3: - sub r0, r2, r3 - pop {r4} - bx lr -#else -strncmp: - /* if (len == 0) return 0 */ - cmp r2, #0 - IT(tt, eq) - moveq r0, #0 -#if defined(__USE_BX__) - bxeq lr -#else - moveq pc, lr -#endif - subs r2, r2, #1 - - /* ip == last src address to compare */ - add ip, r0, r2 -1: - ldrb r2, [r0], #1 - ldrb r3, [r1], #1 - cmp ip, r0 - IT(tt, cs) - cmpcs r2, #1 - cmpcs r2, r3 - beq 1b - sub r0, r2, r3 -#if defined(__USE_BX__) - bx lr -#else - mov pc, lr -#endif -#endif - -.size strncmp,.-strncmp - -libc_hidden_weak(strncmp) diff --git a/libc/string/avr32/Makefile b/libc/string/avr32/Makefile index e19e9d9ec..385cf085e 100644 --- a/libc/string/avr32/Makefile +++ b/libc/string/avr32/Makefile @@ -13,8 +13,7 @@ # details. 
# # You should have received a copy of the GNU Library General Public License -# along with this program; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# along with this program; if not, see <http://www.gnu.org/licenses/>. top_srcdir := ../../../ top_builddir := ../../../ diff --git a/libc/string/basename.c b/libc/string/basename.c index a076c20e9..abc9d89db 100644 --- a/libc/string/basename.c +++ b/libc/string/basename.c @@ -5,10 +5,9 @@ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. */ -#include "_string.h" +#include <string.h> #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(basename) */ char *basename(const char *path) { @@ -25,5 +24,4 @@ char *basename(const char *path) return (char *) p; } -libc_hidden_def(basename) #endif diff --git a/libc/string/bcopy.c b/libc/string/bcopy.c index 3aa7eab1e..e16ba241d 100644 --- a/libc/string/bcopy.c +++ b/libc/string/bcopy.c @@ -5,35 +5,14 @@ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. */ -#include "_string.h" +#include <string.h> #ifdef __UCLIBC_SUSV3_LEGACY__ - -/* Experimentally off - libc_hidden_proto(memmove) */ - void bcopy(const void *s2, void *s1, size_t n) { #if 1 memmove(s1, s2, n); #else -#ifdef __BCC__ - register char *s; - register const char *p; - - s = s1; - p = s2; - if (p >= s) { - while (n--) { - *s++ = *p++; - } - } else { - s += n; - p += n; - while (n--) { - *--s = *--p; - } - } -#else register char *s; register const char *p; @@ -51,6 +30,5 @@ void bcopy(const void *s2, void *s1, size_t n) } } #endif -#endif } #endif diff --git a/libc/string/bfin/memchr.S b/libc/string/bfin/memchr.S index 88e46bef6..26d419f7c 100644 --- a/libc/string/bfin/memchr.S +++ b/libc/string/bfin/memchr.S @@ -25,8 +25,8 @@ .weak _memchr ENTRY(_memchr) - P0 = R0; // P0 = address - P2 = R2; // P2 = count + P0 = R0; /* P0 = address */ + P2 = R2; /* P2 = count */ R1 = R1.B(Z); CC = R2 == 0; IF CC JUMP .Lfailed; diff --git a/libc/string/bfin/strcmp.S b/libc/string/bfin/strcmp.S index 12e8c53c6..ef23aa9ab 100644 --- a/libc/string/bfin/strcmp.S +++ b/libc/string/bfin/strcmp.S @@ -29,66 +29,66 @@ ENTRY(_strcmp) p1 = r0; p2 = r1; - p0 = -1; // (need for loop counter init) + p0 = -1; /* (need for loop counter init) */ - // check if byte aligned - r0 = r0 | r1; // check both pointers at same time - r0 <<= 30; // dump all but last 2 bits - cc = az; // are they zero? - if !cc jump .Lunaligned; // no; use unaligned code. - // fall-thru for aligned case.. + /* check if byte aligned */ + r0 = r0 | r1; /* check both pointers at same time */ + r0 <<= 30; /* dump all but last 2 bits */ + cc = az; /* are they zero? */ + if !cc jump .Lunaligned; /* no; use unaligned code. */ + /* fall-thru for aligned case.. */ - // note that r0 is zero from the previous... - // p0 set to -1 + /* note that r0 is zero from the previous... 
*/ + /* p0 set to -1 */ LSETUP (.Lbeginloop, .Lendloop) lc0=p0; - // pick up first words + /* pick up first words */ r1 = [p1++]; r2 = [p2++]; - // make up mask: 0FF0FF + /* make up mask: 0FF0FF */ r7 = 0xFF; r7.h = 0xFF; - // loop : 9 cycles to check 4 characters + /* loop : 9 cycles to check 4 characters */ cc = r1 == r2; .Lbeginloop: - if !cc jump .Lnotequal4; // compare failure, exit loop + if !cc jump .Lnotequal4; /* compare failure, exit loop */ - // starting with 44332211 - // see if char 3 or char 1 is 0 - r3 = r1 & r7; // form 00330011 - // add to zero, and (r2 is free, reload) + /* starting with 44332211 */ + /* see if char 3 or char 1 is 0 */ + r3 = r1 & r7; /* form 00330011 */ + /* add to zero, and (r2 is free, reload) */ r6 = r3 +|+ r0 || r2 = [p2++] || nop; - cc = az; // true if either is zero - r3 = r1 ^ r3; // form 44002200 (4321^0301 => 4020) - // (trick, saves having another mask) - // add to zero, and (r1 is free, reload) + cc = az; /* true if either is zero */ + r3 = r1 ^ r3; /* form 44002200 (4321^0301 => 4020) */ + /* (trick, saves having another mask) */ + /* add to zero, and (r1 is free, reload) */ r6 = r3 +|+ r0 || r1 = [p1++] || nop; - cc |= az; // true if either is zero - if cc jump .Lzero4; // leave if a zero somewhere + cc |= az; /* true if either is zero */ + if cc jump .Lzero4; /* leave if a zero somewhere */ .Lendloop: cc = r1 == r2; - // loop exits -.Lnotequal4: // compare failure on 4-char compare - // address pointers are one word ahead; - // faster to use zero4 exit code + /* loop exits */ +.Lnotequal4: /* compare failure on 4-char compare */ + /* address pointers are one word ahead; */ + /* faster to use zero4 exit code */ p1 += 4; p2 += 4; -.Lzero4: // one of the bytes in word 1 is zero - // but we've already fetched the next word; so - // backup two to look at failing word again +.Lzero4: /* one of the bytes in word 1 is zero */ + /* but we've already fetched the next word; so */ + /* backup two to look at failing word again */ p1 += -8; p2 += -8; - // here when pointers are unaligned: checks one - // character at a time. Also use at the end of - // the word-check algorithm to figure out what happened + /* here when pointers are unaligned: checks one */ + /* character at a time. Also use at the end of */ + /* the word-check algorithm to figure out what happened */ .Lunaligned: - // R0 is non-zero from before. - // p0 set to -1 + /* R0 is non-zero from before. */ + /* p0 set to -1 */ r0 = 0 (Z); r1 = B[p1++] (Z); @@ -96,18 +96,18 @@ ENTRY(_strcmp) LSETUP (.Lbeginloop1, .Lendloop1) lc0=p0; .Lbeginloop1: - cc = r1; // first char must be non-zero - // chars must be the same + cc = r1; /* first char must be non-zero */ + /* chars must be the same */ r3 = r2 - r1 (NS) || r1 = B[p1++] (Z) || nop; cc &= az; - r3 = r0 - r2; // second char must be non-zero + r3 = r0 - r2; /* second char must be non-zero */ cc &= an; if !cc jump .Lexitloop1; .Lendloop1: r2 = B[p2++] (Z); -.Lexitloop1: // here means we found a zero or a difference. - // we have r2(N), p2(N), r1(N+1), p1(N+2) +.Lexitloop1: /* here means we found a zero or a difference. */ + /* we have r2(N), p2(N), r1(N+1), p1(N+2) */ r1=B[p1+ -2] (Z); r0 = r1 - r2; (r7:4) = [sp++]; diff --git a/libc/string/bzero.c b/libc/string/bzero.c index 7498f795f..32dce416e 100644 --- a/libc/string/bzero.c +++ b/libc/string/bzero.c @@ -5,30 +5,20 @@ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. 
*/ -#include "_string.h" +#include <string.h> #ifdef __UCLIBC_SUSV3_LEGACY__ - -/* Experimentally off - libc_hidden_proto(memset) */ - void bzero(void *s, size_t n) { #if 1 (void)memset(s, 0, n); #else register unsigned char *p = s; -#ifdef __BCC__ - /* bcc can optimize the counter if it thinks it is a pointer... */ - register const char *np = (const char *) n; -#else -#define np n -#endif - while (np) { + while (n) { *p++ = 0; - --np; + --n; } #endif } -#undef np #endif diff --git a/libc/string/cris/memcopy.h b/libc/string/cris/memcopy.h index 449c75641..ccd447c83 100644 --- a/libc/string/cris/memcopy.h +++ b/libc/string/cris/memcopy.h @@ -16,8 +16,7 @@ You should have received a copy of the GNU Library General Public License along with the GNU C Library; see the file COPYING.LIB. If not, - write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. */ + see <http://www.gnu.org/licenses/>. */ #include "../generic/memcopy.h" diff --git a/libc/string/cris/memcpy.c b/libc/string/cris/memcpy.c index cc14188b8..94e576f4f 100644 --- a/libc/string/cris/memcpy.c +++ b/libc/string/cris/memcpy.c @@ -1,264 +1,242 @@ -/* Copyright (C) 2001, 2003 Free Software Foundation, Inc. - Copyright (C) 1994, 1995, 2000 Axis Communications AB. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with the GNU C Library; see the file COPYING.LIB. If not, - write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. */ - -/*#************************************************************************#*/ -/*#-------------------------------------------------------------------------*/ -/*# */ -/*# FUNCTION NAME: memcpy() */ -/*# */ -/*# PARAMETERS: void* dst; Destination address. */ -/*# void* src; Source address. */ -/*# int len; Number of bytes to copy. */ -/*# */ -/*# RETURNS: dst. */ -/*# */ -/*# DESCRIPTION: Copies len bytes of memory from src to dst. No guarantees */ -/*# about copying of overlapping memory areas. This routine is */ -/*# very sensitive to compiler changes in register allocation. */ -/*# Should really be rewritten to avoid this problem. */ -/*# */ -/*#-------------------------------------------------------------------------*/ -/*# */ -/*# HISTORY */ -/*# */ -/*# DATE NAME CHANGES */ -/*# ---- ---- ------- */ -/*# 941007 Kenny R Creation */ -/*# 941011 Kenny R Lots of optimizations and inlining. */ -/*# 941129 Ulf A Adapted for use in libc. */ -/*# 950216 HP N==0 forgotten if non-aligned src/dst. */ -/*# Added some optimizations. */ -/*# 001025 HP Make src and dst char *. Align dst to */ -/*# dword, not just word-if-both-src-and-dst- */ -/*# are-misaligned. */ -/*# 070806 RW Modified for uClibc */ -/*# (__arch_v32 -> __CONFIG_CRISV32__, */ -/*# include features.h to reach it.) 
*/ -/*# */ -/*#-------------------------------------------------------------------------*/ - -#include <features.h> - -#ifdef __CONFIG_CRISV32__ +/* A memcpy for CRIS. + Copyright (C) 1994-2008 Axis Communications. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Neither the name of Axis Communications nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS + COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. */ + +/* FIXME: This file should really only be used for reference, as the + result is somewhat depending on gcc generating what we expect rather + than what we describe. An assembly file should be used instead. */ + +#include <string.h> + +#ifdef __arch_v32 /* For CRISv32, movem is very cheap. */ -#define MEMCPY_BLOCK_THRESHOLD (44) +#define MEMCPY_BY_BLOCK_THRESHOLD (44) #else -/* Break even between movem and move16 is at 38.7*2, but modulo 44. */ -#define MEMCPY_BLOCK_THRESHOLD (44*2) +/* Break even between movem and move16 is really at 38.7 * 2, but + modulo 44, so up to the next multiple of 44, we use ordinary code. */ +#define MEMCPY_BY_BLOCK_THRESHOLD (44 * 2) #endif -void *memcpy(void *, const void *, unsigned int); +/* No name ambiguities in this file. */ +__asm__ (".syntax no_register_prefix"); -/* Experimentally off - libc_hidden_proto(memcpy) */ -void *memcpy(void *pdst, - const void *psrc, - unsigned int pn) +void * +memcpy(void *pdst, const void *psrc, size_t pn) { - /* Ok. Now we want the parameters put in special registers. + /* Now we want the parameters put in special registers. Make sure the compiler is able to make something useful of this. - As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). + As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop). If gcc was allright, it really would need no temporaries, and no - stack space to save stuff on. */ + stack space to save stuff on. */ -#ifndef MEMPCPY register void *return_dst __asm__ ("r10") = pdst; -#else - /* FIXME: Use R10 for something. */ -# define return_dst dst -#endif - - register char *dst __asm__ ("r13") = pdst; - register char *src __asm__ ("r11") = (char *) psrc; + register unsigned char *dst __asm__ ("r13") = pdst; + register unsigned const char *src __asm__ ("r11") = psrc; register int n __asm__ ("r12") = pn; - /* When src is aligned but not dst, this makes a few extra needless cycles. I believe it would take as many to check that the re-alignment was unnecessary. 
*/ if (((unsigned long) dst & 3) != 0 /* Don't align if we wouldn't copy more than a few bytes; so we - don't have to check further for overflows. */ + don't have to check further for overflows. */ && n >= 3) { if ((unsigned long) dst & 1) - { - n--; - *(char*)dst = *(char*)src; - src++; - dst++; - } + { + n--; + *dst = *src; + src++; + dst++; + } if ((unsigned long) dst & 2) - { - n -= 2; - *(short*)dst = *(short*)src; - src += 2; - dst += 2; - } + { + n -= 2; + *(short *) dst = *(short *) src; + src += 2; + dst += 2; + } } - /* Decide which copying method to use. */ - if (n >= MEMCPY_BLOCK_THRESHOLD) - { - /* For large copies we use 'movem' */ - - /* It is not optimal to tell the compiler about clobbering any - registers; that will move the saving/restoring of those registers - to the function prologue/epilogue, and make non-movem sizes - suboptimal. - - This method is not foolproof; it assumes that the "register asm" - declarations at the beginning of the function really are used - here (beware: they may be moved to temporary registers). - This way, we do not have to save/move the registers around into - temporaries; we can safely use them straight away. */ - __asm__ __volatile__ ("\ - .syntax no_register_prefix \n\ - \n\ - ;; Check that the register asm declaration got right. \n\ - ;; The GCC manual explicitly says TRT will happen. \n\ - .ifnc %0-%1-%2,$r13-$r11-$r12 \n\ - .err \n\ - .endif \n\ - \n\ - ;; Save the registers we'll use in the movem process \n\ - ;; on the stack. \n\ - subq 11*4,sp \n\ - movem r10,[sp] \n\ - \n\ - ;; Now we've got this: \n\ - ;; r11 - src \n\ - ;; r13 - dst \n\ - ;; r12 - n \n\ - \n\ - ;; Update n for the first loop \n\ - subq 44,r12 \n\ -0: \n\ - movem [r11+],r10 \n\ - subq 44,r12 \n\ - bge 0b \n\ - movem r10,[r13+] \n\ - \n\ - addq 44,r12 ;; compensate for last loop underflowing n \n\ - \n\ - ;; Restore registers from stack \n\ - movem [sp+],r10" - - /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n) - /* Inputs */ : "0" (dst), "1" (src), "2" (n)); - } + /* Decide which copying method to use. */ + if (n >= MEMCPY_BY_BLOCK_THRESHOLD) + { + /* It is not optimal to tell the compiler about clobbering any + registers; that will move the saving/restoring of those registers + to the function prologue/epilogue, and make non-movem sizes + suboptimal. */ + __asm__ __volatile__ + ("\ + ;; GCC does promise correct register allocations, but let's \n\ + ;; make sure it keeps its promises. \n\ + .ifnc %0-%1-%2,$r13-$r11-$r12 \n\ + .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\ + .endif \n\ + \n\ + ;; Save the registers we'll use in the movem process \n\ + ;; on the stack. \n\ + subq 11*4,sp \n\ + movem r10,[sp] \n\ + \n\ + ;; Now we've got this: \n\ + ;; r11 - src \n\ + ;; r13 - dst \n\ + ;; r12 - n \n\ + \n\ + ;; Update n for the first loop. \n\ + subq 44,r12 \n\ +0: \n\ +" +#ifdef __arch_common_v10_v32 + /* Cater to branch offset difference between v32 and v10. We + assume the branch below has an 8-bit offset. */ +" setf\n" +#endif +" movem [r11+],r10 \n\ + subq 44,r12 \n\ + bge 0b \n\ + movem r10,[r13+] \n\ + \n\ + ;; Compensate for last loop underflowing n. \n\ + addq 44,r12 \n\ + \n\ + ;; Restore registers from stack. \n\ + movem [sp+],r10" + + /* Outputs. */ + : "=r" (dst), "=r" (src), "=r" (n) + + /* Inputs. */ + : "0" (dst), "1" (src), "2" (n)); + } - /* Either we directly starts copying, using dword copying - in a loop, or we copy as much as possible with 'movem' - and then the last block (<44 bytes) is copied here. 
- This will work since 'movem' will have updated src,dst,n. */ + while (n >= 16) + { + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; - while ( n >= 16 ) - { - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - n -= 16; - } + n -= 16; + } - /* A switch() is definitely the fastest although it takes a LOT of code. - * Particularly if you inline code this. - */ switch (n) - { + { case 0: break; + case 1: - *((char*)dst)++ = *((char*)src)++; + *dst = *src; break; + case 2: - *((short*)dst)++ = *((short*)src)++; + *(short *) dst = *(short *) src; break; + case 3: - *((short*)dst)++ = *((short*)src)++; - *((char*)dst)++ = *((char*)src)++; + *(short *) dst = *(short *) src; dst += 2; src += 2; + *dst = *src; break; + case 4: - *((long*)dst)++ = *((long*)src)++; + *(long *) dst = *(long *) src; break; + case 5: - *((long*)dst)++ = *((long*)src)++; - *((char*)dst)++ = *((char*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *dst = *src; break; + case 6: - *((long*)dst)++ = *((long*)src)++; - *((short*)dst)++ = *((short*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; break; + case 7: - *((long*)dst)++ = *((long*)src)++; - *((short*)dst)++ = *((short*)src)++; - *((char*)dst)++ = *((char*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; dst += 2; src += 2; + *dst = *src; break; + case 8: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; break; + case 9: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((char*)dst)++ = *((char*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *dst = *src; break; + case 10: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((short*)dst)++ = *((short*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; break; + case 11: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((short*)dst)++ = *((short*)src)++; - *((char*)dst)++ = *((char*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; dst += 2; src += 2; + *dst = *src; break; + case 12: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; break; + case 13: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((char*)dst)++ = *((char*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *dst = *src; break; + case 14: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((short*)dst)++ = *((short*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; 
dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; break; + case 15: - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((long*)dst)++ = *((long*)src)++; - *((short*)dst)++ = *((short*)src)++; - *((char*)dst)++ = *((char*)src)++; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(long *) dst = *(long *) src; dst += 4; src += 4; + *(short *) dst = *(short *) src; dst += 2; src += 2; + *dst = *src; break; - } + } - return return_dst; /* destination pointer. */ -} /* memcpy() */ + return return_dst; +} libc_hidden_def(memcpy) diff --git a/libc/string/cris/memmove.c b/libc/string/cris/memmove.c index fa495eba4..906ef8e74 100644 --- a/libc/string/cris/memmove.c +++ b/libc/string/cris/memmove.c @@ -18,16 +18,14 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include "memcopy.h" #include "../generic/pagecopy.h" -/* Experimentally off - libc_hidden_proto(memmove) */ void *memmove (void *dest, const void *src, size_t len) { unsigned long int dstp = (long int) dest; diff --git a/libc/string/cris/memset.c b/libc/string/cris/memset.c index b578aac5d..fab4e8b66 100644 --- a/libc/string/cris/memset.c +++ b/libc/string/cris/memset.c @@ -1,271 +1,262 @@ -/* Copyright (C) 2001, 2003 Free Software Foundation, Inc. - Copyright (C) 1999, 2000 Axis Communications AB. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with the GNU C Library; see the file COPYING.LIB. If not, - write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. */ - -/*#************************************************************************#*/ -/*#-------------------------------------------------------------------------*/ -/*# */ -/*# FUNCTION NAME: memset() */ -/*# */ -/*# PARAMETERS: void* dst; Destination address. */ -/*# int c; Value of byte to write. */ -/*# int len; Number of bytes to write. */ -/*# */ -/*# RETURNS: dst. */ -/*# */ -/*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */ -/*# Framework taken from memcpy. This routine is */ -/*# very sensitive to compiler changes in register allocation. */ -/*# Should really be rewritten to avoid this problem. */ -/*# */ -/*#-------------------------------------------------------------------------*/ -/*# */ -/*# HISTORY */ -/*# */ -/*# DATE NAME CHANGES */ -/*# ---- ---- ------- */ -/*# 990713 HP Tired of watching this function (or */ -/*# really, the nonoptimized generic */ -/*# implementation) take up 90% of simulator */ -/*# output. Measurements needed. 
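
The 16-way switch that finishes memcpy above (and the analogous one in memset below) simply enumerates every decomposition of the 0..15 remaining bytes into word, halfword and byte stores. Spelled out as ordinary control flow it is equivalent to the sketch below; illustration only, with the same unaligned-access assumptions the patch itself makes.

#include <stdint.h>

/* Equivalent of the 0..15 byte tail switch: n/4 word stores, then an
   optional halfword, then an optional byte.  Fixed-width types are
   used here; the CRIS code uses "long"/"short" directly. */
static void copy_tail(unsigned char *dst, const unsigned char *src, unsigned n)
{
    while (n >= 4) {
        *(uint32_t *) dst = *(const uint32_t *) src;
        dst += 4; src += 4; n -= 4;
    }
    if (n & 2) {
        *(uint16_t *) dst = *(const uint16_t *) src;
        dst += 2; src += 2;
    }
    if (n & 1)
        *dst = *src;
}
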
*/ -/*# */ -/*#-------------------------------------------------------------------------*/ - -/* No, there's no macro saying 12*4, since it is "hard" to get it into - the asm in a good way. Thus better to expose the problem everywhere. - */ - -/* Assuming 1 cycle per dword written or read (ok, not really true), and - one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1) - so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */ - -#define ZERO_BLOCK_SIZE (1*12*4) - -void *memset(void *, int, unsigned long); - -/* Experimentally off - libc_hidden_proto(memset) */ -void *memset(void *pdst, - int c, - unsigned long plen) +/* A memset for CRIS. + Copyright (C) 1999-2008 Axis Communications. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Neither the name of Axis Communications nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY AXIS COMMUNICATIONS AND ITS CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AXIS + COMMUNICATIONS OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. */ + +/* FIXME: This file should really only be used for reference, as the + result is somewhat depending on gcc generating what we expect rather + than what we describe. An assembly file should be used instead. */ + +#include <string.h> + +/* Note the multiple occurrence of the expression "12*4", including the + asm. It is hard to get it into the asm in a good way. Thus better to + expose the problem everywhere: no macro. */ + +/* Assuming one cycle per dword written or read (ok, not really true; the + world is not ideal), and one cycle per instruction, then 43+3*(n/48-1) + <= 24+24*(n/48-1) so n >= 45.7; n >= 0.9; we win on the first full + 48-byte block to set. */ + +#define MEMSET_BY_BLOCK_THRESHOLD (1 * 48) + +/* No name ambiguities in this file. */ +__asm__ (".syntax no_register_prefix"); + +void *memset(void *pdst, int c, unsigned int plen) { - /* Ok. Now we want the parameters put in special registers. - Make sure the compiler is able to make something useful of this. */ + /* Now we want the parameters in special registers. Make sure the + compiler does something usable with this. */ register char *return_dst __asm__ ("r10") = pdst; - register long n __asm__ ("r12") = plen; + register int n __asm__ ("r12") = plen; register int lc __asm__ ("r11") = c; - /* Most apps use memset sanely. Only those memsetting about 3..4 - bytes or less get penalized compared to the generic implementation - - and that's not really sane use. */ + /* Most apps use memset sanely. 
Memsetting about 3..4 bytes or less get + penalized here compared to the generic implementation. */ - /* Ugh. This is fragile at best. Check with newer GCC releases, if - they compile cascaded "x |= x << 8" sanely! */ - __asm__("movu.b %0,$r13 \n\ - lslq 8,$r13 \n\ - move.b %0,$r13 \n\ - move.d $r13,%0 \n\ - lslq 16,$r13 \n\ - or.d $r13,%0" - : "=r" (lc) : "0" (lc) : "r13"); + /* This is fragile performancewise at best. Check with newer GCC + releases, if they compile cascaded "x |= x << 8" to sane code. */ + __asm__("movu.b %0,r13 \n\ + lslq 8,r13 \n\ + move.b %0,r13 \n\ + move.d r13,%0 \n\ + lslq 16,r13 \n\ + or.d r13,%0" + : "=r" (lc) /* Inputs. */ + : "0" (lc) /* Outputs. */ + : "r13"); /* Trash. */ { register char *dst __asm__ ("r13") = pdst; - if (((unsigned long) pdst & 3) != 0 - /* Oops! n=0 must be a legal call, regardless of alignment. */ - && n >= 3) - { - if ((unsigned long)dst & 1) - { - *dst = (char) lc; - n--; - dst++; - } - - if ((unsigned long)dst & 2) - { - *(short *)dst = lc; - n -= 2; - dst += 2; - } - } + if (((unsigned long) pdst & 3) != 0 + /* Oops! n = 0 must be a valid call, regardless of alignment. */ + && n >= 3) + { + if ((unsigned long) dst & 1) + { + *dst = (char) lc; + n--; + dst++; + } - /* Now the fun part. For the threshold value of this, check the equation - above. */ - /* Decide which copying method to use. */ - if (n >= ZERO_BLOCK_SIZE) - { - /* For large copies we use 'movem' */ - - /* It is not optimal to tell the compiler about clobbering any - registers; that will move the saving/restoring of those registers - to the function prologue/epilogue, and make non-movem sizes - suboptimal. - - This method is not foolproof; it assumes that the "asm reg" - declarations at the beginning of the function really are used - here (beware: they may be moved to temporary registers). - This way, we do not have to save/move the registers around into - temporaries; we can safely use them straight away. */ - __asm__ __volatile__ (" \n\ - .syntax no_register_prefix \n\ - \n\ - ;; Check that the register asm declaration got right. \n\ - ;; The GCC manual explicitly says there's no warranty for that (too). \n\ - .ifnc %0-%1-%4,$r13-$r12-$r11 \n\ - .err \n\ - .endif \n\ - \n\ - ;; Save the registers we'll clobber in the movem process \n\ - ;; on the stack. Don't mention them to gcc, it will only be \n\ - ;; upset. \n\ - subq 11*4,sp \n\ - movem r10,[sp] \n\ - \n\ - move.d r11,r0 \n\ - move.d r11,r1 \n\ - move.d r11,r2 \n\ - move.d r11,r3 \n\ - move.d r11,r4 \n\ - move.d r11,r5 \n\ - move.d r11,r6 \n\ - move.d r11,r7 \n\ - move.d r11,r8 \n\ - move.d r11,r9 \n\ - move.d r11,r10 \n\ - \n\ - ;; Now we've got this: \n\ - ;; r13 - dst \n\ - ;; r12 - n \n\ - \n\ - ;; Update n for the first loop \n\ - subq 12*4,r12 \n\ -0: \n\ - subq 12*4,r12 \n\ - bge 0b \n\ - movem r11,[r13+] \n\ - \n\ - addq 12*4,r12 ;; compensate for last loop underflowing n \n\ - \n\ - ;; Restore registers from stack \n\ - movem [sp+],r10" - - /* Outputs */ : "=r" (dst), "=r" (n) - /* Inputs */ : "0" (dst), "1" (n), "r" (lc)); + if ((unsigned long) dst & 2) + { + *(short *) dst = lc; + n -= 2; + dst += 2; + } + } - } + /* Decide which setting method to use. */ + if (n >= MEMSET_BY_BLOCK_THRESHOLD) + { + /* It is not optimal to tell the compiler about clobbering any + registers; that will move the saving/restoring of those registers + to the function prologue/epilogue, and make non-block sizes + suboptimal. 
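
The inline asm near the top of this hunk replicates the fill byte into all four bytes of a 32-bit word (the cascaded shift-and-or sequence the comment worries about). The same value can be produced in portable C as below; this is a sketch, not part of the patch. The i386 memset later in this patch computes the same word with a single imul by 0x01010101.

#include <stdint.h>

/* Replicate the low byte of c into all four bytes of a word, so whole
   words can be stored at a time. */
static uint32_t fill_word(int c)
{
    uint32_t b = (uint8_t) c;   /* like "movu.b %0,r13"                 */
    b |= b << 8;                /* c in both low bytes                  */
    b |= b << 16;               /* c in all four bytes                  */
    return b;                   /* equals 0x01010101u * (uint8_t) c     */
}
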
*/ + __asm__ __volatile__ + ("\ + ;; GCC does promise correct register allocations, but let's \n\ + ;; make sure it keeps its promises. \n\ + .ifnc %0-%1-%4,$r13-$r12-$r11 \n\ + .error \"GCC reg alloc bug: %0-%1-%4 != $r13-$r12-$r11\" \n\ + .endif \n\ + \n\ + ;; Save the registers we'll clobber in the movem process \n\ + ;; on the stack. Don't mention them to gcc, it will only be \n\ + ;; upset. \n\ + subq 11*4,sp \n\ + movem r10,[sp] \n\ + \n\ + move.d r11,r0 \n\ + move.d r11,r1 \n\ + move.d r11,r2 \n\ + move.d r11,r3 \n\ + move.d r11,r4 \n\ + move.d r11,r5 \n\ + move.d r11,r6 \n\ + move.d r11,r7 \n\ + move.d r11,r8 \n\ + move.d r11,r9 \n\ + move.d r11,r10 \n\ + \n\ + ;; Now we've got this: \n\ + ;; r13 - dst \n\ + ;; r12 - n \n\ + \n\ + ;; Update n for the first loop \n\ + subq 12*4,r12 \n\ +0: \n\ +" +#ifdef __arch_common_v10_v32 + /* Cater to branch offset difference between v32 and v10. We + assume the branch below has an 8-bit offset. */ +" setf\n" +#endif +" subq 12*4,r12 \n\ + bge 0b \n\ + movem r11,[r13+] \n\ + \n\ + ;; Compensate for last loop underflowing n. \n\ + addq 12*4,r12 \n\ + \n\ + ;; Restore registers from stack. \n\ + movem [sp+],r10" + + /* Outputs. */ + : "=r" (dst), "=r" (n) + + /* Inputs. */ + : "0" (dst), "1" (n), "r" (lc)); + } + + /* An ad-hoc unroll, used for 4*12-1..16 bytes. */ + while (n >= 16) + { + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + n -= 16; + } - /* Either we directly starts copying, using dword copying - in a loop, or we copy as much as possible with 'movem' - and then the last block (<44 bytes) is copied here. - This will work since 'movem' will have updated src,dst,n. */ - - while ( n >= 16 ) - { - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - n -= 16; - } - - /* A switch() is definitely the fastest although it takes a LOT of code. - * Particularly if you inline code this. 
- */ switch (n) - { + { case 0: break; + case 1: - *(char*)dst = (char) lc; + *dst = (char) lc; break; + case 2: - *(short*)dst = (short) lc; + *(short *) dst = (short) lc; break; + case 3: - *((short*)dst)++ = (short) lc; - *(char*)dst = (char) lc; + *(short *) dst = (short) lc; dst += 2; + *dst = (char) lc; break; + case 4: - *((long*)dst)++ = lc; + *(long *) dst = lc; break; + case 5: - *((long*)dst)++ = lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *dst = (char) lc; break; + case 6: - *((long*)dst)++ = lc; - *(short*)dst = (short) lc; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; break; + case 7: - *((long*)dst)++ = lc; - *((short*)dst)++ = (short) lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; dst += 2; + *dst = (char) lc; break; + case 8: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; break; + case 9: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *dst = (char) lc; break; + case 10: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *(short*)dst = (short) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; break; + case 11: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((short*)dst)++ = (short) lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; dst += 2; + *dst = (char) lc; break; + case 12: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; break; + case 13: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *dst = (char) lc; break; + case 14: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *(short*)dst = (short) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; break; + case 15: - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((long*)dst)++ = lc; - *((short*)dst)++ = (short) lc; - *(char*)dst = (char) lc; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(long *) dst = lc; dst += 4; + *(short *) dst = (short) lc; dst += 2; + *dst = (char) lc; break; - } + } } - return return_dst; /* destination pointer. 
*/ -} /* memset() */ + return return_dst; +} libc_hidden_def(memset) diff --git a/libc/string/cris/strcpy.c b/libc/string/cris/strcpy.c index 955a990b7..40e6389b9 100644 --- a/libc/string/cris/strcpy.c +++ b/libc/string/cris/strcpy.c @@ -6,7 +6,6 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strcpy) */ char *strcpy(char *dest, const char *src) { char *ret = dest; diff --git a/libc/string/cris/strncpy.c b/libc/string/cris/strncpy.c index 3f2775bdd..8d074071a 100644 --- a/libc/string/cris/strncpy.c +++ b/libc/string/cris/strncpy.c @@ -6,9 +6,7 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(memset) */ -/* Experimentally off - libc_hidden_proto(strncpy) */ char *strncpy(char *dest, const char *src, size_t count) { char *ret = dest; diff --git a/libc/string/dirname.c b/libc/string/dirname.c index 6265e562e..c7f4dec1f 100644 --- a/libc/string/dirname.c +++ b/libc/string/dirname.c @@ -5,7 +5,8 @@ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. */ -#include "_string.h" +#define __need_NULL +#include <stddef.h> #include <libgen.h> char *dirname(char *path) diff --git a/libc/string/ffs.c b/libc/string/ffs.c index 241b7456f..f39d304b7 100644 --- a/libc/string/ffs.c +++ b/libc/string/ffs.c @@ -5,12 +5,9 @@ * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. */ -/* ffsl,ffsll */ - -#include "_string.h" - -/* Experimentally off - libc_hidden_proto(ffs) */ - +#include <limits.h> +#include <string.h> + int ffs(int i) { #if 1 @@ -53,3 +50,6 @@ int ffs(int i) #endif } libc_hidden_def(ffs) +#if ULONG_MAX == UINT_MAX +strong_alias_untyped(ffs, ffsl) +#endif diff --git a/libc/string/ffsll.c b/libc/string/ffsll.c new file mode 100644 index 000000000..967fc5168 --- /dev/null +++ b/libc/string/ffsll.c @@ -0,0 +1,35 @@ +/* Copyright (C) 1991, 1992, 1997, 1998 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Torbjorn Granlund (tege@sics.se). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <limits.h> +#include <string.h> + +/* Find the first bit set in I. */ +int ffsll (long long int i) +{ + unsigned long long int x = i & -i; + + if (x <= 0xffffffff) + return ffs (i); + else + return 32 + ffs (i >> 32); +} + +#if ULONG_MAX != UINT_MAX +strong_alias_untyped(ffsll, ffsl) +#endif diff --git a/libc/string/frv/memcpy.S b/libc/string/frv/memcpy.S index ae843797d..47773726a 100644 --- a/libc/string/frv/memcpy.S +++ b/libc/string/frv/memcpy.S @@ -14,8 +14,8 @@ * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public - * License along with this library; if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * License along with this library; if not, see + * <http://www.gnu.org/licenses/>. 
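
A small usage check for the new ffsll() above (and the ffsl alias set up next to it): the return value is the 1-based position of the least significant set bit, with 0 meaning no bit is set. Sketch only; the prototype is declared directly here to keep it self-contained (on glibc/uClibc it is normally available from <string.h> in a GNU-source build).

#include <stdio.h>

extern int ffsll(long long int i);

int main(void)
{
    printf("%d\n", ffsll(0));           /* 0                     */
    printf("%d\n", ffsll(1));           /* 1                     */
    printf("%d\n", ffsll(12));          /* 3  (binary 1100)      */
    printf("%d\n", ffsll(1LL << 40));   /* 41                    */
    return 0;
}
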
*/ #include <features.h> diff --git a/libc/string/frv/memset.S b/libc/string/frv/memset.S index 477597dcd..17013672e 100644 --- a/libc/string/frv/memset.S +++ b/libc/string/frv/memset.S @@ -14,8 +14,8 @@ * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public - * License along with this library; if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * License along with this library; if not, see + * <http://www.gnu.org/licenses/>. */ #include <features.h> @@ -155,4 +155,4 @@ memset: bralr .size memset, .-memset -/* Experimentally off - libc_hidden_proto(memset) */ +libc_hidden_def(memset) diff --git a/libc/string/generic/bp-checks.h b/libc/string/generic/bp-checks.h deleted file mode 100644 index 08c70aa5d..000000000 --- a/libc/string/generic/bp-checks.h +++ /dev/null @@ -1,129 +0,0 @@ -/* Bounded-pointer checking macros for C. - Copyright (C) 2000 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Greg McGary <greg@mcgary.org> - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#ifndef _bp_checks_h_ -#define _bp_checks_h_ 1 - -#if __BOUNDED_POINTERS__ - -# define BOUNDS_VIOLATED (__builtin_trap (), 0) - -/* Verify that pointer's value >= low. Return pointer value. */ -# define CHECK_BOUNDS_LOW(ARG) \ - (((__ptrvalue (ARG) < __ptrlow (ARG)) && BOUNDS_VIOLATED), \ - __ptrvalue (ARG)) - -/* Verify that pointer's value < high. Return pointer value. */ -# define CHECK_BOUNDS_HIGH(ARG) \ - (((__ptrvalue (ARG) > __ptrhigh (ARG)) && BOUNDS_VIOLATED), \ - __ptrvalue (ARG)) - -# define _CHECK_N(ARG, N, COND) \ - (((COND) \ - && (__ptrvalue (ARG) < __ptrlow (ARG) \ - || __ptrvalue (ARG) + (N) > __ptrhigh (ARG)) \ - && BOUNDS_VIOLATED), \ - __ptrvalue (ARG)) - -extern void *__unbounded __ubp_memchr (const void *__unbounded, int, unsigned); - -# define _CHECK_STRING(ARG, COND) \ - (((COND) \ - && (__ptrvalue (ARG) < __ptrlow (ARG) \ - || !__ubp_memchr (__ptrvalue (ARG), '\0', \ - (__ptrhigh (ARG) - __ptrvalue (ARG)))) \ - && BOUNDS_VIOLATED), \ - __ptrvalue (ARG)) - -/* Check bounds of a pointer seated to an array of N objects. */ -# define CHECK_N(ARG, N) _CHECK_N ((ARG), (N), 1) -/* Same as CHECK_N, but tolerate ARG == NULL. */ -# define CHECK_N_NULL_OK(ARG, N) _CHECK_N ((ARG), (N), __ptrvalue (ARG)) - -/* Check bounds of a pointer seated to a single object. */ -# define CHECK_1(ARG) CHECK_N ((ARG), 1) -/* Same as CHECK_1, but tolerate ARG == NULL. */ -# define CHECK_1_NULL_OK(ARG) CHECK_N_NULL_OK ((ARG), 1) - -/* Check for NUL-terminator within string's bounds. */ -# define CHECK_STRING(ARG) _CHECK_STRING ((ARG), 1) -/* Same as CHECK_STRING, but tolerate ARG == NULL. 
*/ -# define CHECK_STRING_NULL_OK(ARG) _CHECK_STRING ((ARG), __ptrvalue (ARG)) - -/* Check bounds of signal syscall args with type sigset_t. */ -# define CHECK_SIGSET(SET) CHECK_N ((SET), _NSIG / (8 * sizeof *(SET))) -/* Same as CHECK_SIGSET, but tolerate SET == NULL. */ -# define CHECK_SIGSET_NULL_OK(SET) CHECK_N_NULL_OK ((SET), _NSIG / (8 * sizeof *(SET))) - -# if defined (_IOC_SIZESHIFT) && defined (_IOC_SIZEBITS) -/* Extract the size of the ioctl data and check its bounds. */ -# define CHECK_IOCTL(ARG, CMD) \ - CHECK_N ((const char *) (ARG), \ - (((CMD) >> _IOC_SIZESHIFT) & ((1 << _IOC_SIZEBITS) - 1))) -# else -/* We don't know the size of the ioctl data, so the best we can do - is check that the first byte is within bounds. */ -# define CHECK_IOCTL(ARG, CMD) CHECK_1 ((const char *) ARG) -# endif - -/* Check bounds of `struct flock *' for the locking fcntl commands. */ -# define CHECK_FCNTL(ARG, CMD) \ - (((CMD) == F_GETLK || (CMD) == F_SETLK || (CMD) == F_SETLKW) \ - ? CHECK_1 ((struct flock *) ARG) : (unsigned long) (ARG)) - -/* Check bounds of an array of mincore residency-status flags that - cover a region of NBYTES. Such a vector occupies one byte per page - of memory. */ -# define CHECK_N_PAGES(ARG, NBYTES) \ - ({ int _page_size_ = __sysconf (_SC_PAGE_SIZE); \ - CHECK_N ((const char *) (ARG), \ - ((NBYTES) + _page_size_ - 1) / _page_size_); }) - -/* Return a bounded pointer with value PTR that satisfies CHECK_N (PTR, N). */ -# define BOUNDED_N(PTR, N) \ - ({ __typeof (PTR) __bounded _p_; \ - __ptrvalue _p_ = __ptrlow _p_ = __ptrvalue (PTR); \ - __ptrhigh _p_ = __ptrvalue _p_ + (N); \ - _p_; }) - -#else /* !__BOUNDED_POINTERS__ */ - -/* Do nothing if not compiling with -fbounded-pointers. */ - -# define BOUNDS_VIOLATED -# define CHECK_BOUNDS_LOW(ARG) (ARG) -# define CHECK_BOUNDS_HIGH(ARG) (ARG) -# define CHECK_1(ARG) (ARG) -# define CHECK_1_NULL_OK(ARG) (ARG) -# define CHECK_N(ARG, N) (ARG) -# define CHECK_N_NULL_OK(ARG, N) (ARG) -# define CHECK_STRING(ARG) (ARG) -# define CHECK_SIGSET(SET) (SET) -# define CHECK_SIGSET_NULL_OK(SET) (SET) -# define CHECK_IOCTL(ARG, CMD) (ARG) -# define CHECK_FCNTL(ARG, CMD) (ARG) -# define CHECK_N_PAGES(ARG, NBYTES) (ARG) -# define BOUNDED_N(PTR, N) (PTR) - -#endif /* !__BOUNDED_POINTERS__ */ - -#define BOUNDED_1(PTR) BOUNDED_N (PTR, 1) - -#endif /* _bp_checks_h_ */ diff --git a/libc/string/generic/memchr.c b/libc/string/generic/memchr.c index 3c7c997bc..967ae51ea 100644 --- a/libc/string/generic/memchr.c +++ b/libc/string/generic/memchr.c @@ -17,22 +17,19 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include <stdlib.h> #include <limits.h> -/* Experimentally off - libc_hidden_proto(memchr) */ -libc_hidden_proto(abort) - #include "memcopy.h" #define LONG_MAX_32_BITS 2147483647 /* Search no more than N bytes of S for C. */ +#undef memchr void *memchr (const void * s, int c_in, size_t n) { const unsigned char *char_ptr; diff --git a/libc/string/generic/memcmp.c b/libc/string/generic/memcmp.c index fc63a2eae..170c50997 100644 --- a/libc/string/generic/memcmp.c +++ b/libc/string/generic/memcmp.c @@ -14,22 +14,16 @@ Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include "memcopy.h" -/* Experimentally off - libc_hidden_proto(memcmp) */ #include <endian.h> #if __BYTE_ORDER == __BIG_ENDIAN -# define WORDS_BIGENDIAN -#endif - -#ifdef WORDS_BIGENDIAN # define CMP_LT_OR_GT(a, b) ((a) > (b) ? 1 : -1) #else # define CMP_LT_OR_GT(a, b) memcmp_bytes ((a), (b)) @@ -48,17 +42,12 @@ 3. Compare the few remaining bytes. */ -#ifndef WORDS_BIGENDIAN +#if __BYTE_ORDER != __BIG_ENDIAN /* memcmp_bytes -- Compare A and B bytewise in the byte order of the machine. A and B are known to be different. This is needed only on little-endian machines. */ -static int memcmp_bytes __P((op_t, op_t)); - -# ifdef __GNUC__ -__inline -# endif -static int +static __inline__ int memcmp_bytes (op_t a, op_t b) { long int srcp1 = (long int) &a; @@ -77,8 +66,6 @@ memcmp_bytes (op_t a, op_t b) } #endif -static int memcmp_common_alignment __P((long, long, size_t)); - /* memcmp_common_alignment -- Compare blocks at SRCP1 and SRCP2 with LEN `op_t' objects (not LEN bytes!). Both SRCP1 and SRCP2 should be aligned for memory operations on `op_t's. */ @@ -161,8 +148,6 @@ memcmp_common_alignment (long int srcp1, long int srcp2, size_t len) return 0; } -static int memcmp_not_common_alignment __P((long, long, size_t)); - /* memcmp_not_common_alignment -- Compare blocks at SRCP1 and SRCP2 with LEN `op_t' objects (not LEN bytes!). SRCP2 should be aligned for memory operations on `op_t', but SRCP1 *should be unaligned*. */ diff --git a/libc/string/generic/memcopy.h b/libc/string/generic/memcopy.h index fab4da764..031557ac8 100644 --- a/libc/string/generic/memcopy.h +++ b/libc/string/generic/memcopy.h @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* The strategy of the memory functions is: @@ -107,7 +106,6 @@ typedef unsigned char byte; } \ } while (0) -#ifdef __ARCH_HAS_BWD_MEMCPY__ /* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with the assumption that DST_BP is aligned on an OPSIZ multiple. If not all bytes could be easily copied, store remaining number of bytes @@ -126,8 +124,6 @@ typedef unsigned char byte; (nbytes_left) = (nbytes) % OPSIZ; \ } while (0) -#endif - /* Copy *up to* NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR, beginning at the words (of type op_t) right before the pointers and continuing towards smaller addresses. May take advantage of that diff --git a/libc/string/generic/memcpy.c b/libc/string/generic/memcpy.c index 4284f2fe5..ca2e7e0f9 100644 --- a/libc/string/generic/memcpy.c +++ b/libc/string/generic/memcpy.c @@ -15,15 +15,14 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
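
The reason the generic memcmp only returns the word difference directly on big-endian hosts is that little-endian word values do not order the same way as their bytes; the small program below (illustration only, not part of the patch) shows a case where the two orders disagree on a little-endian machine, which is exactly what memcmp_bytes() is there to handle.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
    const unsigned char a[4] = { 0x01, 0xff, 0x00, 0x00 };
    const unsigned char b[4] = { 0x02, 0x00, 0x00, 0x00 };
    uint32_t wa, wb;

    memcpy(&wa, a, 4);
    memcpy(&wb, b, 4);

    /* memcmp orders by bytes: a < b.  On a little-endian host the word
       values order the other way (wa = 0x0000ff01 > wb = 0x00000002). */
    printf("memcmp sign      : %d\n", memcmp(a, b, 4) < 0 ? -1 : 1);
    printf("word-compare sign: %d\n", wa < wb ? -1 : 1);
    return 0;
}
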
*/ #include <string.h> #include "memcopy.h" #include "pagecopy.h" +#include "_memcpy_fwd.c" -/* Experimentally off - libc_hidden_proto(memcpy) */ void *memcpy (void *dstpp, const void *srcpp, size_t len) { diff --git a/libc/string/generic/memmem.c b/libc/string/generic/memmem.c index c75bb2426..753e43ae5 100644 --- a/libc/string/generic/memmem.c +++ b/libc/string/generic/memmem.c @@ -12,16 +12,13 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include <stddef.h> #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(memmem) */ -/* Experimentally off - libc_hidden_proto(memcmp) */ /* Return the first occurrence of NEEDLE in HAYSTACK. */ void *memmem (const void *haystack, size_t haystack_len, @@ -50,5 +47,4 @@ void *memmem (const void *haystack, size_t haystack_len, return NULL; } -libc_hidden_def(memmem) #endif diff --git a/libc/string/generic/memmove.c b/libc/string/generic/memmove.c index 7f945b150..bf78c4778 100644 --- a/libc/string/generic/memmove.c +++ b/libc/string/generic/memmove.c @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> @@ -29,8 +28,6 @@ #include "_memcpy_fwd.c" #endif -/* Experimentally off - libc_hidden_proto(memmove) */ -/* Experimentally off - libc_hidden_proto(memcpy) */ static void _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len) { diff --git a/libc/string/generic/mempcpy.c b/libc/string/generic/mempcpy.c index 8d7356486..bb5563a6a 100644 --- a/libc/string/generic/mempcpy.c +++ b/libc/string/generic/mempcpy.c @@ -8,13 +8,13 @@ #include <string.h> #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(mempcpy) */ -/* Experimentally off - libc_hidden_proto(memcpy) */ +# undef mempcpy void *mempcpy (void *dstpp, const void *srcpp, size_t len) { memcpy(dstpp, srcpp, len); return (void *)(((char *)dstpp) + len); } libc_hidden_weak(mempcpy) +strong_alias(mempcpy,__mempcpy) #endif diff --git a/libc/string/generic/memrchr.c b/libc/string/generic/memrchr.c index 9ab805cf7..b74cf5152 100644 --- a/libc/string/generic/memrchr.c +++ b/libc/string/generic/memrchr.c @@ -18,17 +18,14 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include <stdlib.h> #include <limits.h> #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(memrchr) */ -libc_hidden_proto(abort) #include "memcopy.h" diff --git a/libc/string/generic/memset.c b/libc/string/generic/memset.c index 62cc36fe3..5644e2522 100644 --- a/libc/string/generic/memset.c +++ b/libc/string/generic/memset.c @@ -12,14 +12,12 @@ Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include "memcopy.h" -/* Experimentally off - libc_hidden_proto(memset) */ void *memset (void *dstpp, int c, size_t len) { long int dstp = (long int) dstpp; diff --git a/libc/string/generic/pagecopy.h b/libc/string/generic/pagecopy.h index 5a0ada1fa..16aaacab6 100644 --- a/libc/string/generic/pagecopy.h +++ b/libc/string/generic/pagecopy.h @@ -13,9 +13,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* This file defines the macro: @@ -40,7 +39,7 @@ */ -#if PAGE_COPY_THRESHOLD +#if defined PAGE_COPY_THRESHOLD && PAGE_COPY_THRESHOLD #include <assert.h> @@ -48,7 +47,7 @@ do \ { \ if ((nbytes) >= PAGE_COPY_THRESHOLD && \ - PAGE_OFFSET ((dstp) - (srcp)) == 0) \ + PAGE_OFFSET ((dstp) - (srcp)) == 0) \ { \ /* The amount to copy is past the threshold for copying \ pages virtually with kernel VM operations, and the \ diff --git a/libc/string/generic/rawmemchr.c b/libc/string/generic/rawmemchr.c index f8b97a61d..816589649 100644 --- a/libc/string/generic/rawmemchr.c +++ b/libc/string/generic/rawmemchr.c @@ -17,17 +17,14 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include <stdlib.h> #include <limits.h> #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(rawmemchr) */ -libc_hidden_proto(abort) #include "memcopy.h" diff --git a/libc/string/generic/strcat.c b/libc/string/generic/strcat.c index e00494038..68fc2a289 100644 --- a/libc/string/generic/strcat.c +++ b/libc/string/generic/strcat.c @@ -12,14 +12,12 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include "memcopy.h" -/* Experimentally off - libc_hidden_proto(strcat) */ /* Append SRC on the end of DEST. */ char *strcat (char *dest, const char *src) { diff --git a/libc/string/generic/strchr.c b/libc/string/generic/strchr.c index 66aed1e25..321d2b8c3 100644 --- a/libc/string/generic/strchr.c +++ b/libc/string/generic/strchr.c @@ -17,15 +17,12 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
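
mempcpy(), whose generic version appears a little earlier in this patch along with the new __mempcpy alias, returns the end of the destination rather than its start, so copies can be chained without recomputing offsets. Illustrative usage, not part of the patch:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
    char buf[32];
    char *p = buf;

    p = mempcpy(p, "hello", 5);   /* p now points just past "hello" */
    p = mempcpy(p, ", ", 2);
    p = mempcpy(p, "world", 5);
    *p = '\0';

    puts(buf);                    /* prints "hello, world" */
    return 0;
}
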
*/ #include <string.h> #include <stdlib.h> -/* Experimentally off - libc_hidden_proto(strchr) */ -libc_hidden_proto(abort) #include "memcopy.h" diff --git a/libc/string/generic/strchrnul.c b/libc/string/generic/strchrnul.c index 72cab2891..d11d9e00d 100644 --- a/libc/string/generic/strchrnul.c +++ b/libc/string/generic/strchrnul.c @@ -17,16 +17,13 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include <stdlib.h> #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(strchrnul) */ -libc_hidden_proto(abort) #include "memcopy.h" diff --git a/libc/string/generic/strcmp.c b/libc/string/generic/strcmp.c index 50acd3548..24ad14382 100644 --- a/libc/string/generic/strcmp.c +++ b/libc/string/generic/strcmp.c @@ -12,15 +12,13 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include "memcopy.h" -/* Experimentally off - libc_hidden_proto(strcmp) */ /* Compare S1 and S2, returning less than, equal to or greater than zero if S1 is lexicographically less than, equal to or greater than S2. */ @@ -44,7 +42,6 @@ int strcmp (const char *p1, const char *p2) libc_hidden_weak(strcmp) #ifndef __UCLIBC_HAS_LOCALE__ -/* Experimentally off - libc_hidden_proto(strcoll) */ strong_alias(strcmp,strcoll) libc_hidden_def(strcoll) #endif diff --git a/libc/string/generic/strcpy.c b/libc/string/generic/strcpy.c index 99e077139..924615fca 100644 --- a/libc/string/generic/strcpy.c +++ b/libc/string/generic/strcpy.c @@ -12,36 +12,21 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> -#include <stddef.h> -#include "memcopy.h" -#include "bp-checks.h" - -/* Experimentally off - libc_hidden_proto(strcpy) */ /* Copy SRC to DEST. */ -char *strcpy (char *dest, const char *src) +char *strcpy(char *dest, const char *src) { - reg_char c; - char *__unbounded s = (char *__unbounded) CHECK_BOUNDS_LOW (src); - const ptrdiff_t off = CHECK_BOUNDS_LOW (dest) - s - 1; - size_t n; - - do - { - c = *s++; - s[off] = c; - } - while (c != '\0'); + char *dst = dest; - n = s - src; - (void) CHECK_BOUNDS_HIGH (src + n); - (void) CHECK_BOUNDS_HIGH (dest + n); + while ((*dst = *src) != '\0') { + src++; + dst++; + } - return dest; + return dest; } libc_hidden_def(strcpy) diff --git a/libc/string/generic/strcspn.c b/libc/string/generic/strcspn.c index b65b3b995..ca9506bdd 100644 --- a/libc/string/generic/strcspn.c +++ b/libc/string/generic/strcspn.c @@ -12,14 +12,11 @@ Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> -/* Experimentally off - libc_hidden_proto(strcspn) */ -/* Experimentally off - libc_hidden_proto(strchr) */ /* Return the length of the maximum initial segment of S which contains no characters from REJECT. */ diff --git a/libc/string/generic/strlen.c b/libc/string/generic/strlen.c index 764dae18d..dc383398b 100644 --- a/libc/string/generic/strlen.c +++ b/libc/string/generic/strlen.c @@ -15,15 +15,12 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include <stdlib.h> -/* Experimentally off - libc_hidden_proto(strlen) */ -libc_hidden_proto(abort) /* Return the length of the null-terminated string STR. Scan for the null terminator quickly by testing four bytes at a time. */ diff --git a/libc/string/generic/strncat.c b/libc/string/generic/strncat.c index 8e3423e49..f0cf8f995 100644 --- a/libc/string/generic/strncat.c +++ b/libc/string/generic/strncat.c @@ -12,15 +12,13 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include "memcopy.h" -/* Experimentally off - libc_hidden_proto(strncat) */ char *strncat (char *s1, const char *s2, size_t n) { reg_char c; diff --git a/libc/string/generic/strncmp.c b/libc/string/generic/strncmp.c index c49f36d8b..ca980415e 100644 --- a/libc/string/generic/strncmp.c +++ b/libc/string/generic/strncmp.c @@ -12,14 +12,12 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> #include "memcopy.h" -/* Experimentally off - libc_hidden_proto(strncmp) */ /* Compare no more than N characters of S1 and S2, returning less than, equal to or greater than zero if S1 is lexicographically less than, equal to or diff --git a/libc/string/generic/strncpy.c b/libc/string/generic/strncpy.c index d2d693f2b..0256bcc6b 100644 --- a/libc/string/generic/strncpy.c +++ b/libc/string/generic/strncpy.c @@ -12,14 +12,12 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ #include <string.h> #include "memcopy.h" -/* Experimentally off - libc_hidden_proto(strncpy) */ char *strncpy (char *s1, const char *s2, size_t n) { reg_char c; diff --git a/libc/string/generic/strnlen.c b/libc/string/generic/strnlen.c index d9ba76129..4d4cde84f 100644 --- a/libc/string/generic/strnlen.c +++ b/libc/string/generic/strnlen.c @@ -17,16 +17,13 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; see the file COPYING.LIB. If not, - write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. */ + License along with the GNU C Library; see the file COPYING.LIB. If + not, see <http://www.gnu.org/licenses/>. */ #include <string.h> #include <stdlib.h> #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(strnlen) */ -libc_hidden_proto(abort) /* Find the length of S, but scan at most MAXLEN characters. If no '\0' terminator is found in that many characters, return MAXLEN. */ @@ -34,7 +31,7 @@ size_t strnlen (const char *str, size_t maxlen) { const char *char_ptr, *end_ptr = str + maxlen; const unsigned long int *longword_ptr; - unsigned long int longword, magic_bits, himagic, lomagic; + unsigned long int longword, himagic, lomagic; if (maxlen == 0) return 0; @@ -68,14 +65,12 @@ size_t strnlen (const char *str, size_t maxlen) The 1-bits make sure that carries propagate to the next 0-bit. The 0-bits provide holes for carries to fall into. */ - magic_bits = 0x7efefeffL; himagic = 0x80808080L; lomagic = 0x01010101L; if (sizeof (longword) > 4) { /* 64-bit version of the magic. */ /* Do the shift in two steps to avoid a warning if long has 32 bits. */ - magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; himagic = ((himagic << 16) << 16) | himagic; lomagic = ((lomagic << 16) << 16) | lomagic; } diff --git a/libc/string/generic/strrchr.c b/libc/string/generic/strrchr.c index c85707241..8ca404843 100644 --- a/libc/string/generic/strrchr.c +++ b/libc/string/generic/strrchr.c @@ -12,14 +12,11 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> -/* Experimentally off - libc_hidden_proto(strrchr) */ -/* Experimentally off - libc_hidden_proto(strchr) */ /* Find the last occurrence of C in S. */ char *strrchr (const char *s, int c) diff --git a/libc/string/generic/strsep.c b/libc/string/generic/strsep.c index e02e57068..bbdaf8849 100644 --- a/libc/string/generic/strsep.c +++ b/libc/string/generic/strsep.c @@ -12,18 +12,14 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
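
The himagic/lomagic constants kept in strnlen above implement the usual zero-byte-in-a-word probe; for a 32-bit word it boils down to the helper below (sketch only, not part of the patch). The test never misses a zero byte, but it can report false positives for bytes with the high bit set, which is why the code then inspects the four bytes individually.

#include <stdint.h>

/* (word - 0x01010101) & 0x80808080 is nonzero whenever some byte of
   word is zero -- and possibly also when a byte has its high bit set,
   so a hit still has to be confirmed byte by byte. */
static int may_contain_zero_byte(uint32_t word)
{
    const uint32_t himagic = 0x80808080UL;
    const uint32_t lomagic = 0x01010101UL;

    return ((word - lomagic) & himagic) != 0;
}
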
*/ #include <string.h> #ifdef __USE_BSD -/* Experimentally off - libc_hidden_proto(strchr) */ -/* Experimentally off - libc_hidden_proto(strpbrk) */ -/* Experimentally off - libc_hidden_proto(strsep) */ char *strsep (char **stringp, const char *delim) { char *begin, *end; diff --git a/libc/string/generic/strspn.c b/libc/string/generic/strspn.c index 010567744..86bcdcb70 100644 --- a/libc/string/generic/strspn.c +++ b/libc/string/generic/strspn.c @@ -12,13 +12,11 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> -/* Experimentally off - libc_hidden_proto(strspn) */ /* Return the length of the maximum initial segment of S which contains only characters in ACCEPT. */ size_t strspn (const char *s, const char *accept) diff --git a/libc/string/generic/strstr.c b/libc/string/generic/strstr.c index c12dceb33..dd101768b 100644 --- a/libc/string/generic/strstr.c +++ b/libc/string/generic/strstr.c @@ -13,9 +13,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* * My personal strstr() implementation that beats most other algorithms. @@ -28,7 +27,6 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strstr) */ typedef unsigned chartype; diff --git a/libc/string/generic/strtok_r.c b/libc/string/generic/strtok_r.c index d082d226e..253964395 100644 --- a/libc/string/generic/strtok_r.c +++ b/libc/string/generic/strtok_r.c @@ -13,33 +13,27 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <string.h> -/* Experimentally off - libc_hidden_proto(strtok_r) */ -/* Experimentally off - libc_hidden_proto(strspn) */ -/* Experimentally off - libc_hidden_proto(strpbrk) */ #ifdef __USE_GNU # define __rawmemchr rawmemchr -/* Experimentally off - libc_hidden_proto(rawmemchr) */ #else # define __rawmemchr strchr -/* Experimentally off - libc_hidden_proto(strchr) */ #endif - -/* Parse S into tokens separated by characters in DELIM. +#if 0 + Parse S into tokens separated by characters in DELIM. If S is NULL, the saved pointer in SAVE_PTR is used as the next starting point. 
For example: char s[] = "-abc-=-def"; char *sp; - x = strtok_r(s, "-", &sp); // x = "abc", sp = "=-def" - x = strtok_r(NULL, "-=", &sp); // x = "def", sp = NULL - x = strtok_r(NULL, "=", &sp); // x = NULL - // s = "abc\0-def\0" -*/ + x = strtok_r(s, "-", &sp); /* x = "abc", sp = "=-def" */ + x = strtok_r(NULL, "-=", &sp); /* x = "def", sp = NULL */ + x = strtok_r(NULL, "=", &sp); /* x = NULL */ + /* s = "abc\0-def\0" */ +#endif char *strtok_r (char *s, const char *delim, char **save_ptr) { char *token; diff --git a/libc/string/i386/memchr.c b/libc/string/i386/memchr.c index fe4537914..1960f6ba4 100644 --- a/libc/string/i386/memchr.c +++ b/libc/string/i386/memchr.c @@ -32,20 +32,44 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(memchr) */ -void *memchr(const void *cs, int c, size_t count) +#undef memchr +/*#define memchr TESTING*/ +void *memchr(const void *s, int c, size_t count) { - int d0; - register void * __res; - if (!count) - return NULL; - __asm__ __volatile__( - "repne\n\t" - "scasb\n\t" - "je 1f\n\t" - "movl $1,%0\n" - "1:\tdecl %0" - :"=D" (__res), "=&c" (d0) : "a" (c),"0" (cs),"1" (count)); - return __res; + void *edi; + int ecx; + __asm__ __volatile__( + " jecxz 1f\n" + " repne; scasb\n" + " leal -1(%%edi), %%edi\n" + " je 2f\n" + "1:\n" + " xorl %%edi, %%edi\n" /* NULL */ + "2:\n" + : "=&D" (edi), "=&c" (ecx) + : "a" (c), "0" (s), "1" (count) + /* : no clobbers */ + ); + return edi; } +#ifndef memchr libc_hidden_def(memchr) +#else +/* Uncomment TESTING, gcc -D_GNU_SOURCE -m32 -Os memchr.c -o memchr + * and run ./memchr + */ +int main() +{ + static const char str[] = "abc.def"; + printf((char*)memchr(str, '.',-2) - str == 3 ? "ok\n" : "BAD!\n"); + printf((char*)memchr(str, '.',-1) - str == 3 ? "ok\n" : "BAD!\n"); + printf((char*)memchr(str, '.', 0) == NULL ? "ok\n" : "BAD!\n"); + printf((char*)memchr(str, '.', 1) == NULL ? "ok\n" : "BAD!\n"); + printf((char*)memchr(str, '.', 2) == NULL ? "ok\n" : "BAD!\n"); + printf((char*)memchr(str, '.', 3) == NULL ? "ok\n" : "BAD!\n"); + printf((char*)memchr(str, '.', 4) - str == 3 ? "ok\n" : "BAD!\n"); + printf((char*)memchr(str, '.', 5) - str == 3 ? "ok\n" : "BAD!\n"); + printf((char*)memchr(str+3, '.', 0) == NULL ? "ok\n" : "BAD!\n"); + printf((char*)memchr(str+3, '.', 5) - str == 3 ? "ok\n" : "BAD!\n"); +} +#endif diff --git a/libc/string/i386/memcpy.c b/libc/string/i386/memcpy.c index 285583f3b..697d0bdc2 100644 --- a/libc/string/i386/memcpy.c +++ b/libc/string/i386/memcpy.c @@ -32,22 +32,23 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(memcpy) */ +#undef memcpy void *memcpy(void * to, const void * from, size_t n) { - int d0, d1, d2; - __asm__ __volatile__( - "rep ; movsl\n\t" - "testb $2,%b4\n\t" - "je 1f\n\t" - "movsw\n" - "1:\ttestb $1,%b4\n\t" - "je 2f\n\t" - "movsb\n" - "2:" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - :"0" (n/4), "q" (n),"1" ((long) to),"2" ((long) from) - : "memory"); - return (to); + int d0, d1, d2; + __asm__ __volatile__( + " rep; movsl\n" + " movl %4, %%ecx\n" + " andl $3, %%ecx\n" + /* jz is optional. 
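
For reference, the strtok_r() usage example quoted a little earlier (in the strtok_r.c hunk) as a runnable program, with the token values that comment gives:

#include <stdio.h>
#include <string.h>

int main(void)
{
    char s[] = "-abc-=-def";
    char *sp;
    char *x;

    x = strtok_r(s, "-", &sp);      /* "abc"                */
    printf("%s\n", x);
    x = strtok_r(NULL, "-=", &sp);  /* "def"                */
    printf("%s\n", x);
    x = strtok_r(NULL, "=", &sp);   /* no more tokens: NULL */
    printf("%s\n", x ? x : "(null)");
    return 0;
}
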
avoids "rep; movsb" with ecx == 0, + * but adds a branch, which is currently (2008) faster */ + " jz 1f\n" + " rep; movsb\n" + "1:\n" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from) + : "memory" + ); + return to; } libc_hidden_def(memcpy) diff --git a/libc/string/i386/memmove.c b/libc/string/i386/memmove.c index a924efcbc..0ec8016a5 100644 --- a/libc/string/i386/memmove.c +++ b/libc/string/i386/memmove.c @@ -32,28 +32,40 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(memmove) */ +#undef memmove +/*#define memmove TESTING*/ void *memmove(void *dest, const void *src, size_t n) { - int d0, d1, d2; - if (dest<src) + int eax, ecx, esi, edi; __asm__ __volatile__( - "rep\n\t" - "movsb" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n),"1" (src),"2" (dest) - : "memory"); - else - __asm__ __volatile__( - "std\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (n-1+(const char *)src), - "2" (n-1+(char *)dest) - :"memory"); - return dest; + " movl %%eax, %%edi\n" + " cmpl %%esi, %%eax\n" + " je 2f\n" /* (optional) src == dest -> NOP */ + " jb 1f\n" /* src > dest -> simple copy */ + " leal -1(%%esi,%%ecx), %%esi\n" + " leal -1(%%eax,%%ecx), %%edi\n" + " std\n" + "1: rep; movsb\n" + " cld\n" + "2:\n" + : "=&c" (ecx), "=&S" (esi), "=&a" (eax), "=&D" (edi) + : "0" (n), "1" (src), "2" (dest) + : "memory" + ); + return (void*)eax; } +#ifndef memmove libc_hidden_def(memmove) +#else +/* Uncomment TESTING, gcc -D_GNU_SOURCE -m32 -Os memmove.c -o memmove + * and run ./memmove + */ +int main() +{ + static char str[] = "abcdef.123"; + memmove(str + 1, str, 5); + printf(strcmp(str, "aabcde.123") == 0 ? "ok\n" : "BAD!\n"); + memmove(str, str + 1, 5); + printf(strcmp(str, "abcdee.123") == 0 ? "ok\n" : "BAD!\n"); +} +#endif diff --git a/libc/string/i386/memset.c b/libc/string/i386/memset.c index bbaa45215..9f51f3c60 100644 --- a/libc/string/i386/memset.c +++ b/libc/string/i386/memset.c @@ -28,20 +28,68 @@ * More importantly, these should provide a good example for * others to follow when adding arch specific optimizations. * -Erik + * + * 2009-04: modified by Denys Vlasenko <vda.linux@googlemail.com> + * Fill byte-by-byte is a bit too slow. I prefer 46 byte function + * which fills x4 faster than 21 bytes one. 
*/ #include <string.h> -/* Experimentally off - libc_hidden_proto(memset) */ +#undef memset void *memset(void *s, int c, size_t count) { - int d0, d1; - __asm__ __volatile__( - "rep\n\t" - "stosb" - : "=&c" (d0), "=&D" (d1) - :"a" (c),"1" (s),"0" (count) - :"memory"); - return s; + int reg, edi; + __asm__ __volatile__( + + /* Most of the time, count is divisible by 4 and nonzero */ + /* It's better to make this case faster */ + /* " jecxz 9f\n" - (optional) count == 0: goto ret */ + " mov %%ecx, %1\n" + " shr $2, %%ecx\n" + " jz 1f\n" /* zero words: goto fill_bytes */ + /* extend 8-bit fill to 32 bits */ + " movzx %%al, %%eax\n" /* 3 bytes */ + /* or: " and $0xff, %%eax\n" - 5 bytes */ + " imul $0x01010101, %%eax\n" /* 6 bytes */ + /* fill full words */ + " rep; stosl\n" + /* fill 0-3 bytes */ + "1: and $3, %1\n" + " jz 9f\n" /* (count & 3) == 0: goto end */ + "2: stosb\n" + " dec %1\n" + " jnz 2b\n" + /* end */ + "9:\n" + + : "=&D" (edi), "=&r" (reg) + : "0" (s), "a" (c), "c" (count) + : "memory" + ); + return s; } libc_hidden_def(memset) + +/* +gcc 4.3.1 +========= +57 push %edi +8b 7c 24 08 mov 0x8(%esp),%edi +8b 4c 24 10 mov 0x10(%esp),%ecx +8b 44 24 0c mov 0xc(%esp),%eax +89 ca mov %ecx,%edx +c1 e9 02 shr $0x2,%ecx +74 0b je 1f <__GI_memset+0x1f> +0f b6 c0 movzbl %al,%eax +69 c0 01 01 01 01 imul $0x1010101,%eax,%eax +f3 ab rep stos %eax,%es:(%edi) +83 e2 03 and $0x3,%edx +74 04 je 28 <__GI_memset+0x28> +aa stos %al,%es:(%edi) +4a dec %edx +75 fc jne 24 <__GI_memset+0x24> +8b 44 24 08 mov 0x8(%esp),%eax +5f pop %edi +c3 ret +*/ diff --git a/libc/string/i386/rawmemchr.c b/libc/string/i386/rawmemchr.c new file mode 100644 index 000000000..be0b142c3 --- /dev/null +++ b/libc/string/i386/rawmemchr.c @@ -0,0 +1,24 @@ +/* + * Adapted from strlen.c code + * + * Copyright (C) 2008 Denys Vlasenko <vda.linux@googlemail.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. 
+ */ + +#include <string.h> + +#undef rawmemchr +void *rawmemchr(const void *s, int c) +{ + void *eax; + int ecx, edi; + __asm__ __volatile__( + " repne; scasb\n" + " leal -1(%%edi), %%eax\n" + : "=&c" (ecx), "=&D" (edi), "=&a" (eax) + : "0" (0xffffffff), "1" (s), "2" (c) + ); + return eax; +} +libc_hidden_def(rawmemchr) diff --git a/libc/string/i386/strcat.c b/libc/string/i386/strcat.c index 2cf0237a6..e71aad4f7 100644 --- a/libc/string/i386/strcat.c +++ b/libc/string/i386/strcat.c @@ -32,7 +32,6 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strcat) */ char *strcat(char * dest, const char * src) { int d0, d1, d2, d3; diff --git a/libc/string/i386/strchr.c b/libc/string/i386/strchr.c index 46b1dfb6e..93cc9583e 100644 --- a/libc/string/i386/strchr.c +++ b/libc/string/i386/strchr.c @@ -32,23 +32,25 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strchr) */ +#undef strchr char *strchr(const char *s, int c) { - int d0; - register char * __res; - __asm__ __volatile__( - "movb %%al,%%ah\n" - "1:\tlodsb\n\t" - "cmpb %%ah,%%al\n\t" - "je 2f\n\t" - "testb %%al,%%al\n\t" - "jne 1b\n\t" - "movl $1,%1\n" - "2:\tmovl %1,%0\n\t" - "decl %0" - :"=a" (__res), "=&S" (d0) : "1" (s),"0" (c)); - return __res; + int esi; + register char * eax; + __asm__ __volatile__( + " movb %%al, %%ah\n" + "1: lodsb\n" + " cmpb %%ah, %%al\n" + " je 2f\n" + " testb %%al, %%al\n" + " jnz 1b\n" + " movl $1, %%esi\n" /* can use shorter xor + inc */ + "2: leal -1(%%esi), %%eax\n" + : "=a" (eax), "=&S" (esi) + : "0" (c), "1" (s) + /* no clobbers */ + ); + return eax; } libc_hidden_def(strchr) #ifdef __UCLIBC_SUSV3_LEGACY__ diff --git a/libc/string/i386/strchrnul.c b/libc/string/i386/strchrnul.c new file mode 100644 index 000000000..d48427214 --- /dev/null +++ b/libc/string/i386/strchrnul.c @@ -0,0 +1,47 @@ +/* + * Adapted from strchr.c code + * + * Copyright (C) 2008 Denys Vlasenko <vda.linux@googlemail.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +#include <string.h> + +#undef strchrnul +/*#define strchrnul TESTING*/ +char *strchrnul(const char *s, int c) +{ + int esi; + char *eax; + __asm__ __volatile__( + " movb %%al, %%ah\n" + "1: lodsb\n" + " cmpb %%ah, %%al\n" + " je 2f\n" + " testb %%al, %%al\n" + " jnz 1b\n" + /* with this, we'd get strchr(): */ + /* " movl $1, %%esi\n" */ + "2: leal -1(%%esi), %%eax\n" + : "=a" (eax), "=&S" (esi) + : "0" (c), "1" (s) + /* no clobbers */ + ); + return eax; +} +#ifndef strchrnul +libc_hidden_def(strchrnul) +#else +/* Uncomment TESTING, gcc -D_GNU_SOURCE -m32 -Os strchrnul.c -o strchrnul + * and run ./strchrnul + */ +int main() +{ + static const char str[] = "abc.def"; + printf((char*)strchrnul(str, '.') - str == 3 ? "ok\n" : "BAD!\n"); + printf((char*)strchrnul(str, '*') - str == 7 ? "ok\n" : "BAD!\n"); + printf((char*)strchrnul(str, 0) - str == 7 ? "ok\n" : "BAD!\n"); + printf((char*)strchrnul(str+3, '.') - str == 3 ? 
"ok\n" : "BAD!\n"); +} +#endif diff --git a/libc/string/i386/strcmp.c b/libc/string/i386/strcmp.c index eff230c5c..9621f66f8 100644 --- a/libc/string/i386/strcmp.c +++ b/libc/string/i386/strcmp.c @@ -32,7 +32,6 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strcmp) */ int strcmp(const char *cs, const char *ct) { int d0, d1; @@ -55,7 +54,6 @@ int strcmp(const char *cs, const char *ct) libc_hidden_def(strcmp) #ifndef __UCLIBC_HAS_LOCALE__ -/* Experimentally off - libc_hidden_proto(strcoll) */ strong_alias(strcmp,strcoll) libc_hidden_def(strcoll) #endif diff --git a/libc/string/i386/strcpy.c b/libc/string/i386/strcpy.c index 09065a9b7..fff1bd006 100644 --- a/libc/string/i386/strcpy.c +++ b/libc/string/i386/strcpy.c @@ -32,7 +32,7 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strcpy) */ +#undef strcpy char *strcpy(char * dest, const char * src) { int d0, d1, d2; diff --git a/libc/string/i386/string.h b/libc/string/i386/string.h new file mode 100644 index 000000000..cf4333dec --- /dev/null +++ b/libc/string/i386/string.h @@ -0,0 +1,338 @@ +/* + * Copyright (C) 2008 Denys Vlasenko <vda.linux@googlemail.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball + */ + +#if !defined _STRING_H +#error "Never use <libc-string_i386.h> directly; include <string.h> instead" +#endif + +#ifndef _LIBC_STRING_i386_H +#define _LIBC_STRING_i386_H 1 + +static __always_inline +void *inlined_memset_const_c_count4(void *s, unsigned eax, unsigned count) +{ + int ecx, edi; + + if (count == 0) + return s; + + /* Very small (2 stores or less) are best done with direct + * mov <const>,<mem> instructions (they do not clobber registers) */ + if (count == 1) { + *(char *)(s + 0) = eax; + return s; + } + + /* You wonder why & 0xff is needed? Try memset(p, '\xff', size). + * If char is signed, '\xff' == -1! 
*/ + eax = (eax & 0xff) * 0x01010101; /* done at compile time */ + + if (count == 2) { + *(short *)(s + 0) = eax; + return s; + } + if (count == 3) { + *(short *)(s + 0) = eax; + *(char *) (s + 2) = eax; + return s; + } + if (count == 1*4 + 0) { + *(int *)(s + 0) = eax; + return s; + } + if (count == 1*4 + 1) { + *(int *) (s + 0) = eax; + *(char *)(s + 4) = eax; + return s; + } + if (count == 1*4 + 2) { + *(int *) (s + 0) = eax; + *(short *)(s + 4) = eax; + return s; + } + + /* Small string stores: don't clobber ecx + * (clobbers only eax and edi) */ +#define small_store(arg) { \ + __asm__ __volatile__( \ + arg \ + : "=&D" (edi) \ + : "a" (eax), "0" (s) \ + : "memory" \ + ); \ + return s; \ +} + if (count == 1*4 + 3) small_store("stosl; stosw; stosb"); + if (count == 2*4 + 0) { + ((int *)s)[0] = eax; + ((int *)s)[1] = eax; + return s; + } + if (count == 2*4 + 1) small_store("stosl; stosl; stosb"); + if (count == 2*4 + 2) small_store("stosl; stosl; stosw"); + if (count == 2*4 + 3) small_store("stosl; stosl; stosw; stosb"); + if (count == 3*4 + 0) small_store("stosl; stosl; stosl"); + if (count == 3*4 + 1) small_store("stosl; stosl; stosl; stosb"); + if (count == 3*4 + 2) small_store("stosl; stosl; stosl; stosw"); + if (count == 3*4 + 3) small_store("stosl; stosl; stosl; stosw; stosb"); + if (count == 4*4 + 0) small_store("stosl; stosl; stosl; stosl"); + if (count == 4*4 + 1) small_store("stosl; stosl; stosl; stosl; stosb"); + /* going over 7 bytes is suboptimal */ + /* stosw is 2-byte insn, so this one takes 6 bytes: */ + if (count == 4*4 + 2) small_store("stosl; stosl; stosl; stosl; stosw"); + /* 7 bytes */ + if (count == 4*4 + 3) small_store("stosl; stosl; stosl; stosl; stosw; stosb"); + /* 5 bytes */ + if (count == 5*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl"); + /* 6 bytes */ + if (count == 5*4 + 1) small_store("stosl; stosl; stosl; stosl; stosl; stosb"); + /* 7 bytes */ + if (count == 5*4 + 2) small_store("stosl; stosl; stosl; stosl; stosl; stosw"); + /* 8 bytes, but oh well... */ + if (count == 5*4 + 3) small_store("stosl; stosl; stosl; stosl; stosl; stosw; stosb"); + /* 6 bytes */ + if (count == 6*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl; stosl"); + /* the rest would be 7+ bytes and is handled below instead */ +#undef small_store + + /* Not small, but multiple-of-4 store. + * "mov <const>,%ecx; rep; stosl" sequence is 7 bytes */ + __asm__ __volatile__( + " rep; stosl\n" + : "=&c" (ecx), "=&D" (edi) + : "a" (eax), "0" (count / 4), "1" (s) + : "memory" + ); + return s; +} +#if 1 /* -51 bytes on shared i386 build with gcc 4.3.0 */ +#define memset(s, c, count) ( \ + ( !(__builtin_constant_p(c) && __builtin_constant_p(count)) \ + || ((count) > (6*4 + 0) && ((count) % 4) != 0) \ + ) \ + ? 
memset((s), (c), (count)) \ + : inlined_memset_const_c_count4((s), (c), (count)) \ + ) +#endif + + +static __always_inline +void *inlined_mempcpy_const_count4(void *d, const void *s, unsigned count) +{ + int ecx; + char *esi, *edi; + + if (count == 0) + return d; + + if (count == 1) { + *(char *)d = *(char *)s; + return d + 1; + } + if (count == 2) { + *(short *)d = *(short *)s; + return d + 2; + } + /* Small string moves: don't clobber ecx + * (clobbers only esi and edi) */ +#define small_move(arg) { \ + __asm__ __volatile__( \ + arg \ + : "=&S" (esi), "=&D" (edi) \ + : "0" (s), "1" (d) \ + : "memory" \ + ); \ + return edi; \ +} + if (count == 3) small_move("movsw; movsb"); + if (count == 1*4 + 0) { + *(int *)d = *(int *)s; + return d + 4; + } + if (count == 1*4 + 1) small_move("movsl; movsb"); + if (count == 1*4 + 2) small_move("movsl; movsw"); + if (count == 1*4 + 3) small_move("movsl; movsw; movsb"); + if (count == 2*4 + 0) small_move("movsl; movsl"); + if (count == 2*4 + 1) small_move("movsl; movsl; movsb"); + if (count == 2*4 + 2) small_move("movsl; movsl; movsw"); + if (count == 2*4 + 3) small_move("movsl; movsl; movsw; movsb"); + if (count == 3*4 + 0) small_move("movsl; movsl; movsl"); + if (count == 3*4 + 1) small_move("movsl; movsl; movsl; movsb"); + if (count == 3*4 + 2) small_move("movsl; movsl; movsl; movsw"); + if (count == 3*4 + 3) small_move("movsl; movsl; movsl; movsw; movsb"); + if (count == 4*4 + 0) small_move("movsl; movsl; movsl; movsl"); + if (count == 4*4 + 1) small_move("movsl; movsl; movsl; movsl; movsb"); + /* going over 7 bytes is suboptimal */ + /* movsw is 2-byte insn, so this one takes 6 bytes: */ + if (count == 4*4 + 2) small_move("movsl; movsl; movsl; movsl; movsw"); + /* 7 bytes */ + if (count == 4*4 + 3) small_move("movsl; movsl; movsl; movsl; movsw; movsb"); + /* 5 bytes */ + if (count == 5*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl"); + /* 6 bytes */ + if (count == 5*4 + 1) small_move("movsl; movsl; movsl; movsl; movsl; movsb"); + /* 7 bytes */ + if (count == 5*4 + 2) small_move("movsl; movsl; movsl; movsl; movsl; movsw"); + /* 8 bytes, but oh well... */ + if (count == 5*4 + 3) small_move("movsl; movsl; movsl; movsl; movsl; movsw; movsb"); + /* 6 bytes */ + if (count == 6*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl; movsl"); + /* the rest would be 7+ bytes and is handled below instead */ +#undef small_move + + /* Not small, but multiple-of-4 move. + * "mov <const>,%ecx; rep; movsl" sequence is 7 bytes */ + __asm__ __volatile__( + " rep; movsl\n" + : "=&c" (ecx), "=&S" (esi), "=&D" (edi) + : "0" (count / 4), "1" (s), "2" (d) + : "memory" + ); + return edi; +} +static __always_inline +void *inlined_memcpy_const_count4(void *d, const void *s, unsigned count) +{ + inlined_mempcpy_const_count4(d, s, count); + return d; +} +#if 1 /* +34 bytes on shared i386 build with gcc 4.3.0 */ +#define mempcpy(d, s, count) ( \ + ( !(__builtin_constant_p(count)) \ + || ((count) > (6*4 + 0) && ((count) % 4) != 0) \ + ) \ + ? mempcpy((d), (s), (count)) \ + : inlined_mempcpy_const_count4((d), (s), (count)) \ + ) +#define memcpy(d, s, count) ( \ + ( !(__builtin_constant_p(count)) \ + || ((count) > (6*4 + 0) && ((count) % 4) != 0) \ + ) \ + ? 
memcpy((d), (s), (count)) \ + : inlined_memcpy_const_count4((d), (s), (count)) \ + ) +#endif + + +static __always_inline +size_t inlined_strlen(const char *s) +{ + int edi; + int ecx; + __asm__ __volatile__( + " repne; scasb\n" + /* " notl %0\n" */ + /* " decl %0\n" */ + : "=c" (ecx), "=&D" (edi) + : "1" (s), "a" (0), "0" (0xffffffffu) + /* : no clobbers */ + ); + return -ecx - 1; +} +#if 0 /* +1108 bytes on shared i386 build with gcc 4.3.0 */ +#define strlen(s) inlined_strlen(s) +#endif + + +static __always_inline +char *inlined_stpcpy(char *dest, const char *src) +{ + char *esi, *edi; + int eax; + __asm__ __volatile__( + "1: lodsb\n" + " stosb\n" + " testb %%al, %%al\n" + " jnz 1b\n" + : "=&S" (esi), "=&D" (edi), "=&a" (eax) + : "0" (src), "1" (dest) + : "memory" + ); + return edi - 1; +} +static __always_inline +char *inlined_strcpy(char *dest, const char *src) +{ + inlined_stpcpy(dest, src); + return dest; +} +#if 0 /* +562 bytes on shared i386 build with gcc 4.3.0 */ +#define stpcpy(dest, src) inlined_stpcpy(dest, src) +#define strcpy(dest, src) inlined_strcpy(dest, src) +#endif + + +static __always_inline +void *inlined_memchr(const void *s, int c, size_t count) +{ + void *edi; + int ecx; + /* Unfortunately, c gets loaded to %eax (wide insn), not %al */ + __asm__ __volatile__( + " jecxz 1f\n" + " repne; scasb\n" + " leal -1(%%edi), %%edi\n" + " je 2f\n" + "1:\n" + " xorl %%edi, %%edi\n" + "2:\n" + : "=&D" (edi), "=&c" (ecx) + : "a" (c), "0" (s), "1" (count) + /* : no clobbers */ + ); + return edi; +} +static __always_inline +void *inlined_memchr_const_c(const void *s, int c, size_t count) +{ +#if defined __OPTIMIZE__ + void *edi; + int ecx, eax; + __asm__ __volatile__( + " jecxz 1f\n" + " movb %4, %%al\n" /* const c to %%al */ + " repne; scasb\n" + " leal -1(%%edi), %%edi\n" + " je 2f\n" + "1:\n" + " xorl %%edi, %%edi\n" + "2:\n" + : "=&D" (edi), "=&c" (ecx), "=&a" (eax) + : "0" (s), "i" (c), "1" (count) + /* : no clobbers */ + ); + return edi; +#else + /* With -O0, gcc can't figure out how to encode CONST c + * as an immediate operand. Generating slightly bigger code + * (usually "movl CONST,%eax", 3 bytes bigger than needed): + */ + void *edi; + int ecx, eax; + __asm__ __volatile__( + " jecxz 1f\n" + " repne; scasb\n" + " leal -1(%%edi), %%edi\n" + " je 2f\n" + "1:\n" + " xorl %%edi, %%edi\n" + "2:\n" + : "=&D" (edi), "=&c" (ecx), "=&a" (eax) + : "0" (s), "2" (c), "1" (count) + /* : no clobbers */ + ); + return edi; +#endif +} +#if 1 /* +2 bytes on shared i386 build with gcc 4.3.0 */ +#define memchr(s, c, count) ( \ + __builtin_constant_p(c) \ + ? 
inlined_memchr_const_c(s, (c) & 0xff, count) \ + : inlined_memchr(s, c, count) \ + ) +#endif + +#endif /* _LIBC_STRING_i386_H */ diff --git a/libc/string/i386/strlen.c b/libc/string/i386/strlen.c index 61a178393..ff2baeb38 100644 --- a/libc/string/i386/strlen.c +++ b/libc/string/i386/strlen.c @@ -32,17 +32,17 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strlen) */ +#undef strlen size_t strlen(const char *s) { - int d0; - register int __res; - __asm__ __volatile__( - "repne\n\t" - "scasb\n\t" - "notl %0\n\t" - "decl %0" - :"=c" (__res), "=&D" (d0) :"1" (s),"a" (0), "0" (0xffffffff)); - return __res; + int eax, ecx, edi; + __asm__ __volatile__( + " repne; scasb\n" + " notl %%ecx\n" + " leal -1(%%ecx), %%eax\n" + : "=&c" (ecx), "=&D" (edi), "=&a" (eax) + : "0" (0xffffffff), "1" (s), "2" (0) + ); + return eax; } libc_hidden_def(strlen) diff --git a/libc/string/i386/strncat.c b/libc/string/i386/strncat.c index 3872679d5..12f0a302b 100644 --- a/libc/string/i386/strncat.c +++ b/libc/string/i386/strncat.c @@ -32,30 +32,55 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strncat) */ -char *strncat(char * dest, - const char * src, size_t count) +#undef strncat +/*#define strncat TESTING*/ +char *strncat(char * dest, const char * src, size_t count) { - int d0, d1, d2, d3; - __asm__ __volatile__( - "repne\n\t" - "scasb\n\t" - "decl %1\n\t" - "movl %8,%3\n" - "incl %3\n" - "1:\tdecl %3\n\t" - "jz 2f\n" - "lodsb\n\t" - "stosb\n\t" - "testb %%al,%%al\n\t" - "jne 1b\n" - "jmp 3f\n" - "2:\txorl %2,%2\n\t" - "stosb\n" - "3:" - : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) - : "0" (src),"1" (dest),"2" (0),"3" (0xffffffff), "g" (count) - : "memory"); - return dest; + int esi, edi, eax, ecx, edx; + __asm__ __volatile__( + " xorl %%eax, %%eax\n" + " incl %%edx\n" + " pushl %%edi\n" /* save dest */ + " repne; scasb\n" + " decl %%edi\n" /* edi => NUL in dest */ + /* count-- */ + "1: decl %%edx\n" + /* if count reached 0, store NUL and bail out */ + " movl %%edx, %%eax\n" + " jz 2f\n" + /* else copy a char */ + " lodsb\n" + "2: stosb\n" + " testb %%al, %%al\n" + " jnz 1b\n" + /* end of loop */ + " popl %%eax\n" /* restore dest into eax */ + : "=&S" (esi), "=&D" (edi), "=&a" (eax), "=&c" (ecx), "=&d" (edx) + : "0" (src), "1" (dest), "3" (0xffffffff), "4" (count) + : "memory" + ); + return (char *)eax; } +#ifndef strncat libc_hidden_def(strncat) +#else +/* Uncomment TESTING, gcc -m32 -Os strncat.c -o strncat + * and run ./strncat + */ +int main() +{ + char buf[99]; + + strcpy(buf, "abc"); buf[4] = '*'; strncat(buf, "def", 0); + printf(strcmp(buf, "abc") == 0 && buf[4] == '*' ? "ok\n" : "BAD!\n"); + + strcpy(buf, "abc"); buf[6] = 1; buf[7] = '*'; strncat(buf, "def", 50); + printf(strcmp(buf, "abcdef") == 0 && buf[7] == '*' ? "ok\n" : "BAD!\n"); + + strcpy(buf, "abc"); buf[6] = 1; buf[7] = '*'; strncat(buf, "def", -1); + printf(strcmp(buf, "abcdef") == 0 && buf[7] == '*' ? "ok\n" : "BAD!\n"); + + strcpy(buf, "abc"); buf[6] = 1; buf[7] = '*'; strncat(buf, "def123", 3); + printf(strcmp(buf, "abcdef") == 0 && buf[7] == '*' ? 
"ok\n" : "BAD!\n"); +} +#endif diff --git a/libc/string/i386/strncmp.c b/libc/string/i386/strncmp.c index a14bb503b..bfb20c307 100644 --- a/libc/string/i386/strncmp.c +++ b/libc/string/i386/strncmp.c @@ -32,27 +32,28 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strncmp) */ +#undef strncmp int strncmp(const char *cs, const char *ct, size_t count) { - register int __res; - int d0, d1, d2; - __asm__ __volatile__( - "incl %3\n" - "1:\tdecl %3\n\t" - "jz 2f\n" - "lodsb\n\t" - "scasb\n\t" - "jne 3f\n\t" - "testb %%al,%%al\n\t" - "jne 1b\n" - "2:\txorl %%eax,%%eax\n\t" - "jmp 4f\n" - "3:\tsbbl %%eax,%%eax\n\t" - "orb $1,%%al\n" - "4:" - :"=a" (__res), "=&S" (d0), "=&D" (d1), "=&c" (d2) - :"1" (cs),"2" (ct),"3" (count)); - return __res; + int eax; + int esi, edi, ecx; + __asm__ __volatile__( + " incl %%ecx\n" + "1: decl %%ecx\n" + " jz 2f\n" + " lodsb\n" + " scasb\n" + " jne 3f\n" + " testb %%al, %%al\n" + " jnz 1b\n" + "2: xorl %%eax, %%eax\n" + " jmp 4f\n" + "3: sbbl %%eax, %%eax\n" + " orb $1, %%al\n" + "4:\n" + : "=a" (eax), "=&S" (esi), "=&D" (edi), "=&c" (ecx) + : "1" (cs), "2" (ct), "3" (count) + ); + return eax; } libc_hidden_weak(strncmp) diff --git a/libc/string/i386/strncpy.c b/libc/string/i386/strncpy.c index 76aa6ae1b..99d104b0d 100644 --- a/libc/string/i386/strncpy.c +++ b/libc/string/i386/strncpy.c @@ -32,25 +32,44 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strncpy) */ +#undef strncpy +/*#define strncpy TESTING*/ char *strncpy(char * dest, const char * src, size_t count) { - int d0, d1, d2, d3; - __asm__ __volatile__( - "incl %2\n" - "1:\n" - "decl %2\n" - "jz 2f\n" - "lodsb\n\t" - "stosb\n\t" - "testb %%al,%%al\n\t" - "jne 1b\n\t" - "decl %2\n" - "rep\n\t" - "stosb\n" - "2:" - : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) - :"0" (src),"1" (dest),"2" (count) : "memory"); - return dest; + int esi, edi, ecx, eax; + __asm__ __volatile__( + "1: subl $1, %%ecx\n" /* not dec! it doesnt set CF */ + " jc 2f\n" + " lodsb\n" + " stosb\n" + " testb %%al, %%al\n" + " jnz 1b\n" + " rep; stosb\n" + "2:\n" + : "=&S" (esi), "=&D" (edi), "=&c" (ecx), "=&a" (eax) + : "0" (src), "1" (dest), "2" (count) + : "memory" + ); + return dest; } +#ifndef strncpy libc_hidden_def(strncpy) +#else +/* Uncomment TESTING, gcc -D_GNU_SOURCE -m32 -Os strncpy.c -o strncpy + * and run ./strncpy + */ +int main() +{ + static char str[99]; + + str[3] = '*'; str[4] = 0; strncpy(str, "abc", 3); + printf(strcmp(str, "abc*") == 0 ? "ok\n" : "BAD!\n"); + + str[4] = '*'; str[5] = '+'; strncpy(str, "abc", 5); + printf(strcmp(str, "abc") == 0 && str[4] == 0 && str[5] == '+' ? + "ok\n" : "BAD!\n"); + strncpy(str, "def", 0); /* should do nothing */ + printf(strcmp(str, "abc") == 0 && str[4] == 0 && str[5] == '+' ? 
+ "ok\n" : "BAD!\n"); +} +#endif diff --git a/libc/string/i386/strnlen.c b/libc/string/i386/strnlen.c index 02c72f530..f58f698d1 100644 --- a/libc/string/i386/strnlen.c +++ b/libc/string/i386/strnlen.c @@ -33,24 +33,43 @@ #include <string.h> #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(strnlen) */ + +#undef strnlen +/*#define strnlen TESTING*/ size_t strnlen(const char *s, size_t count) { - int d0; - register int __res; - __asm__ __volatile__( - "movl %2,%0\n\t" - "incl %1\n" - "jmp 2f\n" - "1:\tcmpb $0,(%0)\n\t" - "je 3f\n\t" - "incl %0\n" - "2:\tdecl %1\n\t" - "jne 1b\n" - "3:\tsubl %2,%0" - :"=a" (__res), "=&d" (d0) - :"c" (s),"1" (count)); - return __res; + int edx; + int eax; + __asm__ __volatile__( + " leal -1(%%ecx), %%eax\n" + "1: incl %%eax\n" + " decl %%edx\n" + " jz 3f\n" + " cmpb $0, (%%eax)\n" + " jnz 1b\n" + "3: subl %%ecx, %%eax" + : "=a" (eax), "=&d" (edx) + : "c" (s), "1" (count + 1) + ); + return eax; } +#ifndef strnlen libc_hidden_def(strnlen) +#else +/* Uncomment TESTING, gcc -D_GNU_SOURCE -m32 -Os strnlen.c -o strnlen + * and run ./strnlen + */ +int main() +{ + printf(strnlen("abc\0def", -2) == 3 ? "ok\n" : "BAD!\n"); + printf(strnlen("abc\0def", -1) == 3 ? "ok\n" : "BAD!\n"); + printf(strnlen("abc\0def", 0) == 0 ? "ok\n" : "BAD!\n"); + printf(strnlen("abc\0def", 1) == 1 ? "ok\n" : "BAD!\n"); + printf(strnlen("abc\0def", 2) == 2 ? "ok\n" : "BAD!\n"); + printf(strnlen("abc\0def", 3) == 3 ? "ok\n" : "BAD!\n"); + printf(strnlen("abc\0def", 4) == 3 ? "ok\n" : "BAD!\n"); + printf(strnlen("abc\0def", 5) == 3 ? "ok\n" : "BAD!\n"); +} +#endif + #endif diff --git a/libc/string/i386/strrchr.c b/libc/string/i386/strrchr.c index ef378685b..5c349f683 100644 --- a/libc/string/i386/strrchr.c +++ b/libc/string/i386/strrchr.c @@ -32,21 +32,25 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(strrchr) */ char *strrchr(const char *s, int c) { - int d0, d1; - register char * __res; - __asm__ __volatile__( - "movb %%al,%%ah\n" - "1:\tlodsb\n\t" - "cmpb %%ah,%%al\n\t" - "jne 2f\n\t" - "leal -1(%%esi),%0\n" - "2:\ttestb %%al,%%al\n\t" - "jne 1b" - :"=g" (__res), "=&S" (d0), "=&a" (d1) :"0" (0),"1" (s),"2" (c)); - return __res; + char *eax; + + __asm__ __volatile__( + " movb %%cl, %%ch\n" + "1: movb (%1), %%cl\n" /* load char */ + " cmpb %%cl, %%ch\n" /* char == c? */ + " jne 2f\n" + " movl %1, %%eax\n" + "2: incl %1\n" + " testb %%cl, %%cl\n" /* char == NUL? */ + " jnz 1b\n" + /* "=c": use ecx, not ebx (-fpic uses it). */ + : "=a" (eax), "=r" (s), "=c" (c) + : "0" (0), "1" (s), "2" (c) + /* : no clobbers */ + ); + return eax; } libc_hidden_def(strrchr) #ifdef __UCLIBC_SUSV3_LEGACY__ diff --git a/libc/string/ia64/bcopy.S b/libc/string/ia64/bcopy.S index c5637c369..62da68d74 100644 --- a/libc/string/ia64/bcopy.S +++ b/libc/string/ia64/bcopy.S @@ -1,4 +1,4 @@ -#include "sysdep.h" +#include <sysdep.h> #ifdef __UCLIBC_SUSV3_LEGACY__ diff --git a/libc/string/ia64/bzero.S b/libc/string/ia64/bzero.S index d390838a6..79419579a 100644 --- a/libc/string/ia64/bzero.S +++ b/libc/string/ia64/bzero.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ /* Return: dest @@ -32,7 +31,7 @@ Since a stf.spill f0 can store 16B in one go, we use this instruction to get peak speed. */ -#include "sysdep.h" +#include <sysdep.h> #ifdef __UCLIBC_SUSV3_LEGACY__ @@ -47,13 +46,13 @@ #define ptr1 r28 #define ptr2 r27 #define ptr3 r26 -#define ptr9 r24 +#define ptr9 r24 #define loopcnt r23 #define linecnt r22 #define bytecnt r21 -// This routine uses only scratch predicate registers (p6 - p15) -#define p_scr p6 // default register for same-cycle branches +/* This routine uses only scratch predicate registers (p6 - p15) */ +#define p_scr p6 /* default register for same-cycle branches */ #define p_unalgn p9 #define p_y p11 #define p_n p12 @@ -65,7 +64,7 @@ #define MIN1 15 #define MIN1P1HALF 8 #define LINE_SIZE 128 -#define LSIZE_SH 7 // shift amount +#define LSIZE_SH 7 /* shift amount */ #define PREF_AHEAD 8 #define USE_FLP @@ -87,49 +86,49 @@ ENTRY(bzero) movi0 save_lc = ar.lc } { .mmi .body - mov ret0 = dest // return value + mov ret0 = dest /* return value */ nop.m 0 cmp.eq p_scr, p0 = cnt, r0 ;; } { .mmi - and ptr2 = -(MIN1+1), dest // aligned address - and tmp = MIN1, dest // prepare to check for alignment - tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U) + and ptr2 = -(MIN1+1), dest /* aligned address */ + and tmp = MIN1, dest /* prepare to check for alignment */ + tbit.nz p_y, p_n = dest, 0 /* Do we have an odd address? (M_B_U) */ } { .mib mov ptr1 = dest nop.i 0 -(p_scr) br.ret.dpnt.many rp // return immediately if count = 0 +(p_scr) br.ret.dpnt.many rp /* return immediately if count = 0 */ ;; } { .mib cmp.ne p_unalgn, p0 = tmp, r0 -} { .mib // NB: # of bytes to move is 1 - sub bytecnt = (MIN1+1), tmp // higher than loopcnt - cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task? -(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U) +} { .mib /* NB: # of bytes to move is 1 */ + sub bytecnt = (MIN1+1), tmp /* higher than loopcnt */ + cmp.gt p_scr, p0 = 16, cnt /* is it a minimalistic task? */ +(p_scr) br.cond.dptk.many .move_bytes_unaligned /* go move just a few (M_B_U) */ ;; } { .mmi -(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment -(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment -(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ? +(p_unalgn) add ptr1 = (MIN1+1), ptr2 /* after alignment */ +(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 /* after alignment */ +(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 /* should we do a st8 ? */ ;; } { .mib (p_y) add cnt = -8, cnt -(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ? +(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 /* should we do a st4 ? */ } { .mib (p_y) st8 [ptr2] = r0,-4 (p_n) add ptr2 = 4, ptr2 ;; } { .mib (p_yy) add cnt = -4, cnt -(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ? +(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 /* should we do a st2 ? */ } { .mib (p_yy) st4 [ptr2] = r0,-2 (p_nn) add ptr2 = 2, ptr2 ;; } { .mmi - mov tmp = LINE_SIZE+1 // for compare + mov tmp = LINE_SIZE+1 /* for compare */ (p_y) add cnt = -2, cnt -(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ? +(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 /* should we do a st1 ? */ } { .mmi nop.m 0 (p_y) st2 [ptr2] = r0,-1 @@ -138,44 +137,44 @@ ENTRY(bzero) { .mmi (p_yy) st1 [ptr2] = r0 - cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task? + cmp.gt p_scr, p0 = tmp, cnt /* is it a minimalistic task? 
*/ } { .mbb (p_yy) add cnt = -1, cnt -(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few +(p_scr) br.cond.dpnt.many .fraction_of_line /* go move just a few */ ;; } { .mib - nop.m 0 + nop.m 0 shr.u linecnt = cnt, LSIZE_SH nop.b 0 ;; } .align 32 -.l1b: // ------------------// L1B: store ahead into cache lines; fill later +.l1b: /* ------------------ L1B: store ahead into cache lines; fill later */ { .mmi - and tmp = -(LINE_SIZE), cnt // compute end of range - mov ptr9 = ptr1 // used for prefetching - and cnt = (LINE_SIZE-1), cnt // remainder + and tmp = -(LINE_SIZE), cnt /* compute end of range */ + mov ptr9 = ptr1 /* used for prefetching */ + and cnt = (LINE_SIZE-1), cnt /* remainder */ } { .mmi - mov loopcnt = PREF_AHEAD-1 // default prefetch loop - cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value + mov loopcnt = PREF_AHEAD-1 /* default prefetch loop */ + cmp.gt p_scr, p0 = PREF_AHEAD, linecnt /* check against actual value */ ;; } { .mmi (p_scr) add loopcnt = -1, linecnt - add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores) - add ptr1 = tmp, ptr1 // first address beyond total range + add ptr2 = 16, ptr1 /* start of stores (beyond prefetch stores) */ + add ptr1 = tmp, ptr1 /* first address beyond total range */ ;; } { .mmi - add tmp = -1, linecnt // next loop count + add tmp = -1, linecnt /* next loop count */ movi0 ar.lc = loopcnt ;; } .pref_l1b: { .mib - stf.spill [ptr9] = f0, 128 // Do stores one cache line apart + stf.spill [ptr9] = f0, 128 /* Do stores one cache line apart */ nop.i 0 br.cloop.dptk.few .pref_l1b ;; } { .mmi - add ptr0 = 16, ptr2 // Two stores in parallel + add ptr0 = 16, ptr2 /* Two stores in parallel */ movi0 ar.lc = tmp ;; } .l1bx: @@ -190,7 +189,7 @@ ENTRY(bzero) { .mmi stf.spill [ptr2] = f0, 32 stf.spill [ptr0] = f0, 64 - cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? + cmp.lt p_scr, p0 = ptr9, ptr1 /* do we need more prefetching? */ ;; } { .mmb stf.spill [ptr2] = f0, 32 @@ -198,14 +197,14 @@ ENTRY(bzero) br.cloop.dptk.few .l1bx ;; } { .mib - cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? + cmp.gt p_scr, p0 = 8, cnt /* just a few bytes left ? */ (p_scr) br.cond.dpnt.many .move_bytes_from_alignment ;; } .fraction_of_line: { .mib add ptr2 = 16, ptr1 - shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32 + shr.u loopcnt = cnt, 5 /* loopcnt = cnt / 32 */ ;; } { .mib cmp.eq p_scr, p0 = loopcnt, r0 @@ -213,11 +212,11 @@ ENTRY(bzero) (p_scr) br.cond.dpnt.many .store_words ;; } { .mib - and cnt = 0x1f, cnt // compute the remaining cnt + and cnt = 0x1f, cnt /* compute the remaining cnt */ movi0 ar.lc = loopcnt ;; } .align 32 -.l2: // -----------------------------// L2A: store 32B in 2 cycles +.l2: /* ----------------------------- L2A: store 32B in 2 cycles */ { .mmb store [ptr1] = myval, 8 store [ptr2] = myval, 8 @@ -228,38 +227,38 @@ ENTRY(bzero) ;; } .store_words: { .mib - cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? -(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch + cmp.gt p_scr, p0 = 8, cnt /* just a few bytes left ? 
*/ +(p_scr) br.cond.dpnt.many .move_bytes_from_alignment /* Branch */ ;; } { .mmi - store [ptr1] = myval, 8 // store - cmp.le p_y, p_n = 16, cnt // - add cnt = -8, cnt // subtract + store [ptr1] = myval, 8 /* store */ + cmp.le p_y, p_n = 16, cnt /* */ + add cnt = -8, cnt /* subtract */ ;; } { .mmi -(p_y) store [ptr1] = myval, 8 // store +(p_y) store [ptr1] = myval, 8 /* store */ (p_y) cmp.le.unc p_yy, p_nn = 16, cnt -(p_y) add cnt = -8, cnt // subtract +(p_y) add cnt = -8, cnt /* subtract */ ;; } -{ .mmi // store +{ .mmi /* store */ (p_yy) store [ptr1] = myval, 8 -(p_yy) add cnt = -8, cnt // subtract +(p_yy) add cnt = -8, cnt /* subtract */ ;; } .move_bytes_from_alignment: { .mib cmp.eq p_scr, p0 = cnt, r0 - tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ? + tbit.nz.unc p_y, p0 = cnt, 2 /* should we terminate with a st4 ? */ (p_scr) br.cond.dpnt.few .restore_and_exit ;; } { .mib (p_y) st4 [ptr1] = r0,4 - tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ? + tbit.nz.unc p_yy, p0 = cnt, 1 /* should we terminate with a st2 ? */ ;; } { .mib (p_yy) st2 [ptr1] = r0,2 - tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ? + tbit.nz.unc p_y, p0 = cnt, 0 /* should we terminate with a st1 ? */ ;; } { .mib @@ -281,38 +280,38 @@ ENTRY(bzero) (p_n) add ptr2 = 2, ptr1 } { .mmi (p_y) add ptr2 = 3, ptr1 -(p_y) st1 [ptr1] = r0, 1 // fill 1 (odd-aligned) byte -(p_y) add cnt = -1, cnt // [15, 14 (or less) left] +(p_y) st1 [ptr1] = r0, 1 /* fill 1 (odd-aligned) byte */ +(p_y) add cnt = -1, cnt /* [15, 14 (or less) left] */ ;; } { .mmi (p_yy) cmp.le.unc p_y, p0 = 8, cnt - add ptr3 = ptr1, cnt // prepare last store + add ptr3 = ptr1, cnt /* prepare last store */ movi0 ar.lc = save_lc } { .mmi -(p_yy) st2 [ptr1] = r0, 4 // fill 2 (aligned) bytes -(p_yy) st2 [ptr2] = r0, 4 // fill 2 (aligned) bytes -(p_yy) add cnt = -4, cnt // [11, 10 (o less) left] +(p_yy) st2 [ptr1] = r0, 4 /* fill 2 (aligned) bytes */ +(p_yy) st2 [ptr2] = r0, 4 /* fill 2 (aligned) bytes */ +(p_yy) add cnt = -4, cnt /* [11, 10 (o less) left] */ ;; } { .mmi (p_y) cmp.le.unc p_yy, p0 = 8, cnt - add ptr3 = -1, ptr3 // last store - tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ? + add ptr3 = -1, ptr3 /* last store */ + tbit.nz p_scr, p0 = cnt, 1 /* will there be a st2 at the end ? */ } { .mmi -(p_y) st2 [ptr1] = r0, 4 // fill 2 (aligned) bytes -(p_y) st2 [ptr2] = r0, 4 // fill 2 (aligned) bytes -(p_y) add cnt = -4, cnt // [7, 6 (or less) left] +(p_y) st2 [ptr1] = r0, 4 /* fill 2 (aligned) bytes */ +(p_y) st2 [ptr2] = r0, 4 /* fill 2 (aligned) bytes */ +(p_y) add cnt = -4, cnt /* [7, 6 (or less) left] */ ;; } { .mmi -(p_yy) st2 [ptr1] = r0, 4 // fill 2 (aligned) bytes -(p_yy) st2 [ptr2] = r0, 4 // fill 2 (aligned) bytes - // [3, 2 (or less) left] - tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ? +(p_yy) st2 [ptr1] = r0, 4 /* fill 2 (aligned) bytes */ +(p_yy) st2 [ptr2] = r0, 4 /* fill 2 (aligned) bytes */ + /* [3, 2 (or less) left] */ + tbit.nz p_y, p0 = cnt, 0 /* will there be a st1 at the end ? 
*/ } { .mmi (p_yy) add cnt = -4, cnt ;; } { .mmb -(p_scr) st2 [ptr1] = r0 // fill 2 (aligned) bytes -(p_y) st1 [ptr3] = r0 // fill last byte (using ptr3) +(p_scr) st2 [ptr1] = r0 /* fill 2 (aligned) bytes */ +(p_y) st1 [ptr3] = r0 /* fill last byte (using ptr3) */ br.ret.sptk.many rp ;; } END(bzero) diff --git a/libc/string/ia64/memccpy.S b/libc/string/ia64/memccpy.S index 1afba3637..5c4d7e3c2 100644 --- a/libc/string/ia64/memccpy.S +++ b/libc/string/ia64/memccpy.S @@ -14,16 +14,15 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: a pointer to the next byte after char in dest or NULL Inputs: in0: dest in1: src - in2: char + in2: char in3: byte count This implementation assumes little endian mode (UM.be = 0). @@ -31,7 +30,7 @@ This implementation assumes that it is safe to do read ahead in the src block, without getting beyond its limit. */ -#include "sysdep.h" +#include <sysdep.h> #undef ret #define OP_T_THRES 16 @@ -69,75 +68,75 @@ ENTRY(memccpy) .rotr r[MEMLAT + 7], tmp1[4], tmp2[4], val[4], tmp3[2], pos0[2] .rotp p[MEMLAT + 6 + 1] - mov ret0 = r0 // return NULL if no match + mov ret0 = r0 /* return NULL if no match */ .save pr, saved_pr - mov saved_pr = pr // save the predicate registers - mov dest = in0 // dest + mov saved_pr = pr /* save the predicate registers */ + mov dest = in0 /* dest */ .save ar.lc, saved_lc - mov saved_lc = ar.lc // save the loop counter - mov saved_ec = ar.ec // save the loop counter + mov saved_lc = ar.lc /* save the loop counter */ + mov saved_ec = ar.ec /* save the loop counter */ .body - mov src = in1 // src - extr.u char = in2, 0, 8 // char - mov len = in3 // len - sub tmp = r0, in0 // tmp = -dest - cmp.ne p7, p0 = r0, r0 // clear p7 + mov src = in1 /* src */ + extr.u char = in2, 0, 8 /* char */ + mov len = in3 /* len */ + sub tmp = r0, in0 /* tmp = -dest */ + cmp.ne p7, p0 = r0, r0 /* clear p7 */ ;; - and loopcnt = 7, tmp // loopcnt = -dest % 8 - cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES - mov ar.ec = 0 // ec not guaranteed zero on entry -(p6) br.cond.spnt .cpyfew // copy byte by byte + and loopcnt = 7, tmp /* loopcnt = -dest % 8 */ + cmp.ge p6, p0 = OP_T_THRES, len /* is len <= OP_T_THRES */ + mov ar.ec = 0 /* ec not guaranteed zero on entry */ +(p6) br.cond.spnt .cpyfew /* copy byte by byte */ ;; cmp.eq p6, p0 = loopcnt, r0 mux1 charx8 = char, @brcst (p6) br.cond.sptk .dest_aligned - sub len = len, loopcnt // len -= -dest % 8 - adds loopcnt = -1, loopcnt // --loopcnt + sub len = len, loopcnt /* len -= -dest % 8 */ + adds loopcnt = -1, loopcnt /* --loopcnt */ ;; mov ar.lc = loopcnt -.l1: // copy -dest % 8 bytes - ld1 value = [src], 1 // value = *src++ +.l1: /* copy -dest % 8 bytes */ + ld1 value = [src], 1 /* value = *src++ */ ;; - st1 [dest] = value, 1 // *dest++ = value + st1 [dest] = value, 1 /* *dest++ = value */ cmp.eq p6, p0 = value, char (p6) br.cond.spnt .foundit br.cloop.dptk .l1 .dest_aligned: - and sh1 = 7, src // sh1 = src % 8 - and tmp = -8, len // tmp = len & -OPSIZ - and asrc = -8, src // asrc = src & -OPSIZ -- align src - shr.u loopcnt = len, 3 // loopcnt = len / 8 - and len = 7, len ;; // len = len % 8 - shl sh1 = sh1, 3 // sh1 = 8 * (src % 8) - adds loopcnt = -1, loopcnt // --loopcnt - mov pr.rot = 1 << 16 
;; // set rotating predicates - sub sh2 = 64, sh1 // sh2 = 64 - sh1 - mov ar.lc = loopcnt // set LC - cmp.eq p6, p0 = sh1, r0 // is the src aligned? + and sh1 = 7, src /* sh1 = src % 8 */ + and tmp = -8, len /* tmp = len & -OPSIZ */ + and asrc = -8, src /* asrc = src & -OPSIZ -- align src */ + shr.u loopcnt = len, 3 /* loopcnt = len / 8 */ + and len = 7, len ;; /* len = len % 8 */ + shl sh1 = sh1, 3 /* sh1 = 8 * (src % 8) */ + adds loopcnt = -1, loopcnt /* --loopcnt */ + mov pr.rot = 1 << 16 ;; /* set rotating predicates */ + sub sh2 = 64, sh1 /* sh2 = 64 - sh1 */ + mov ar.lc = loopcnt /* set LC */ + cmp.eq p6, p0 = sh1, r0 /* is the src aligned? */ (p6) br.cond.sptk .src_aligned ;; - add src = src, tmp // src += len & -OPSIZ - mov ar.ec = MEMLAT + 6 + 1 // six more passes needed - ld8 r[1] = [asrc], 8 // r[1] = w0 - cmp.ne p6, p0 = r0, r0 ;; // clear p6 + add src = src, tmp /* src += len & -OPSIZ */ + mov ar.ec = MEMLAT + 6 + 1 /* six more passes needed */ + ld8 r[1] = [asrc], 8 /* r[1] = w0 */ + cmp.ne p6, p0 = r0, r0 ;; /* clear p6 */ ALIGN(32) .l2: -(p[0]) ld8.s r[0] = [asrc], 8 // r[0] = w1 -(p[MEMLAT]) shr.u tmp1[0] = r[1 + MEMLAT], sh1 // tmp1 = w0 >> sh1 -(p[MEMLAT]) shl tmp2[0] = r[0 + MEMLAT], sh2 // tmp2 = w1 << sh2 +(p[0]) ld8.s r[0] = [asrc], 8 /* r[0] = w1 */ +(p[MEMLAT]) shr.u tmp1[0] = r[1 + MEMLAT], sh1 /* tmp1 = w0 >> sh1 */ +(p[MEMLAT]) shl tmp2[0] = r[0 + MEMLAT], sh2 /* tmp2 = w1 << sh2 */ (p[MEMLAT+4]) xor tmp3[0] = val[1], charx8 (p[MEMLAT+5]) czx1.r pos0[0] = tmp3[1] -(p[MEMLAT+6]) chk.s r[6 + MEMLAT], .recovery1 // our data isn't - // valid - rollback! +(p[MEMLAT+6]) chk.s r[6 + MEMLAT], .recovery1 /* our data isn't */ + /* valid - rollback! */ (p[MEMLAT+6]) cmp.ne p6, p0 = 8, pos0[1] (p6) br.cond.spnt .gotit -(p[MEMLAT+6]) st8 [dest] = val[3], 8 // store val to dest -(p[MEMLAT+3]) or val[0] = tmp1[3], tmp2[3] // val = tmp1 | tmp2 +(p[MEMLAT+6]) st8 [dest] = val[3], 8 /* store val to dest */ +(p[MEMLAT+3]) or val[0] = tmp1[3], tmp2[3] /* val = tmp1 | tmp2 */ br.ctop.sptk .l2 br.cond.sptk .cpyfew .src_aligned: - cmp.ne p6, p0 = r0, r0 // clear p6 - mov ar.ec = MEMLAT + 2 + 1 ;; // set EC + cmp.ne p6, p0 = r0, r0 /* clear p6 */ + mov ar.ec = MEMLAT + 2 + 1 ;; /* set EC */ .l3: (p[0]) ld8.s r[0] = [src], 8 (p[MEMLAT]) xor tmp3[0] = r[MEMLAT], charx8 @@ -149,8 +148,8 @@ ENTRY(memccpy) (p[MEMLAT+2]) st8 [dest] = r[MEMLAT+2], 8 br.ctop.dptk .l3 .cpyfew: - cmp.eq p6, p0 = len, r0 // is len == 0 ? - adds len = -1, len // --len; + cmp.eq p6, p0 = len, r0 /* is len == 0 ? 
*/ + adds len = -1, len /* --len; */ (p6) br.cond.spnt .restore_and_exit ;; mov ar.lc = len .l4: @@ -163,14 +162,14 @@ ENTRY(memccpy) .foundit: (p6) mov ret0 = dest .restore_and_exit: - mov pr = saved_pr, -1 // restore the predicate registers - mov ar.lc = saved_lc // restore the loop counter - mov ar.ec = saved_ec ;; // restore the epilog counter + mov pr = saved_pr, -1 /* restore the predicate registers */ + mov ar.lc = saved_lc /* restore the loop counter */ + mov ar.ec = saved_ec ;; /* restore the epilog counter */ br.ret.sptk.many b0 .gotit: .pred.rel "mutex" p6, p7 -(p6) mov value = val[3] // if coming from l2 -(p7) mov value = r[MEMLAT+2] // if coming from l3 +(p6) mov value = val[3] /* if coming from l2 */ +(p7) mov value = r[MEMLAT+2] /* if coming from l3 */ mov ar.lc = pos0[1] ;; .l5: extr.u tmp = value, 0, 8 ;; diff --git a/libc/string/ia64/memchr.S b/libc/string/ia64/memchr.S index 2bf078fe6..fcd9f9305 100644 --- a/libc/string/ia64/memchr.S +++ b/libc/string/ia64/memchr.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: the address of the first occurence of chr in str or NULL @@ -40,7 +39,7 @@ All the loops in this function could have had the internal branch removed if br.ctop and br.cloop could be predicated :-(. */ -#include "sysdep.h" +#include <sysdep.h> #undef ret #define saved_pr r15 @@ -62,18 +61,18 @@ ENTRY(__memchr) .rotr value[MEMLAT+1], addr[MEMLAT+3], aux[2], poschr[2] .rotp p[MEMLAT+3] .save ar.lc, saved_lc - mov saved_lc = ar.lc // save the loop counter + mov saved_lc = ar.lc /* save the loop counter */ .save pr, saved_pr - mov saved_pr = pr // save the predicates + mov saved_pr = pr /* save the predicates */ .body mov ret0 = str - and tmp = 7, str // tmp = str % 8 - cmp.ne p7, p0 = r0, r0 // clear p7 - extr.u chr = in1, 0, 8 // chr = (unsigned char) in1 + and tmp = 7, str /* tmp = str % 8 */ + cmp.ne p7, p0 = r0, r0 /* clear p7 */ + extr.u chr = in1, 0, 8 /* chr = (unsigned char) in1 */ mov len = in2 - cmp.gtu p6, p0 = 16, in2 // use a simple loop for short -(p6) br.cond.spnt .srchfew ;; // searches - sub loopcnt = 8, tmp // loopcnt = 8 - tmp + cmp.gtu p6, p0 = 16, in2 /* use a simple loop for short */ +(p6) br.cond.spnt .srchfew ;; /* searches */ + sub loopcnt = 8, tmp /* loopcnt = 8 - tmp */ cmp.eq p6, p0 = tmp, r0 (p6) br.cond.sptk .str_aligned;; sub len = len, loopcnt @@ -86,12 +85,12 @@ ENTRY(__memchr) (p6) br.cond.spnt .foundit br.cloop.sptk .l1 ;; .str_aligned: - cmp.ne p6, p0 = r0, r0 // clear p6 - shr.u loopcnt = len, 3 // loopcnt = len / 8 - and len = 7, len ;; // remaining len = len & 7 + cmp.ne p6, p0 = r0, r0 /* clear p6 */ + shr.u loopcnt = len, 3 /* loopcnt = len / 8 */ + and len = 7, len ;; /* remaining len = len & 7 */ adds loopcnt = -1, loopcnt mov ar.ec = MEMLAT + 3 - mux1 chrx8 = chr, @brcst ;; // get a word full of chr + mux1 chrx8 = chr, @brcst ;; /* get a word full of chr */ mov ar.lc = loopcnt mov pr.rot = 1 << 16 ;; .l2: @@ -114,20 +113,18 @@ ENTRY(__memchr) (p6) br.cond.dpnt .foundit br.cloop.sptk .l3 ;; .notfound: - cmp.ne p6, p0 = r0, r0 // clear p6 (p7 was already 0 when we got here) - mov ret0 = r0 ;; // return NULL + cmp.ne p6, p0 = r0, r0 /* clear p6 (p7 was already 0 when we got here) */ + mov ret0 = 
r0 ;; /* return NULL */ .foundit: .pred.rel "mutex" p6, p7 -(p6) adds ret0 = -1, ret0 // if we got here from l1 or l3 -(p7) add ret0 = addr[MEMLAT+2], poschr[1] // if we got here from l2 +(p6) adds ret0 = -1, ret0 /* if we got here from l1 or l3 */ +(p7) add ret0 = addr[MEMLAT+2], poschr[1] /* if we got here from l2 */ mov pr = saved_pr, -1 mov ar.lc = saved_lc br.ret.sptk.many b0 END(__memchr) -weak_alias (__memchr, memchr) -#if !__BOUNDED_POINTERS__ -weak_alias (__memchr, __ubp_memchr) -#endif -libc_hidden_def (memchr) +weak_alias(__memchr, memchr) +weak_alias(__memchr, __ubp_memchr) +libc_hidden_def(memchr) diff --git a/libc/string/ia64/memcmp.S b/libc/string/ia64/memcmp.S index 8b0c096ce..0cf54e7db 100644 --- a/libc/string/ia64/memcmp.S +++ b/libc/string/ia64/memcmp.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: the result of the comparison @@ -28,16 +27,16 @@ In this form, it assumes little endian mode. For big endian mode, the the two shifts in .l2 must be inverted: - shl tmp1[0] = r[1 + MEMLAT], sh1 // tmp1 = w0 << sh1 + shl tmp1[0] = r[1 + MEMLAT], sh1 // tmp1 = w0 << sh1 shr.u tmp2[0] = r[0 + MEMLAT], sh2 // tmp2 = w1 >> sh2 and all the mux1 instructions should be replaced by plain mov's. */ -#include "sysdep.h" +#include <sysdep.h> #undef ret -#define OP_T_THRES 16 -#define OPSIZ 8 +#define OP_T_THRES 16 +#define OPSIZ 8 #define MEMLAT 2 #define start r15 @@ -56,85 +55,85 @@ ENTRY(memcmp) .prologue - alloc r2 = ar.pfs, 3, 37, 0, 40 + alloc r2 = ar.pfs, 3, 37, 0, 40 .rotr r[MEMLAT + 2], q[MEMLAT + 5], tmp1[4], tmp2[4], val[2] .rotp p[MEMLAT + 4 + 1] - mov ret0 = r0 // by default return value = 0 + mov ret0 = r0 /* by default return value = 0 */ .save pr, saved_pr - mov saved_pr = pr // save the predicate registers + mov saved_pr = pr /* save the predicate registers */ .save ar.lc, saved_lc - mov saved_lc = ar.lc // save the loop counter + mov saved_lc = ar.lc /* save the loop counter */ .body - mov dest = in0 // dest - mov src = in1 // src - mov len = in2 // len - sub tmp = r0, in0 // tmp = -dest + mov dest = in0 /* dest */ + mov src = in1 /* src */ + mov len = in2 /* len */ + sub tmp = r0, in0 /* tmp = -dest */ ;; - and loopcnt = 7, tmp // loopcnt = -dest % 8 - cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES -(p6) br.cond.spnt .cmpfew // compare byte by byte + and loopcnt = 7, tmp /* loopcnt = -dest % 8 */ + cmp.ge p6, p0 = OP_T_THRES, len /* is len <= OP_T_THRES */ +(p6) br.cond.spnt .cmpfew /* compare byte by byte */ ;; cmp.eq p6, p0 = loopcnt, r0 (p6) br.cond.sptk .dest_aligned - sub len = len, loopcnt // len -= -dest % 8 - adds loopcnt = -1, loopcnt // --loopcnt + sub len = len, loopcnt /* len -= -dest % 8 */ + adds loopcnt = -1, loopcnt /* --loopcnt */ ;; mov ar.lc = loopcnt -.l1: // copy -dest % 8 bytes - ld1 value1 = [src], 1 // value = *src++ +.l1: /* copy -dest % 8 bytes */ + ld1 value1 = [src], 1 /* value = *src++ */ ld1 value2 = [dest], 1 ;; cmp.ne p6, p0 = value1, value2 (p6) br.cond.spnt .done br.cloop.dptk .l1 .dest_aligned: - and sh1 = 7, src // sh1 = src % 8 - and tmp = -8, len // tmp = len & -OPSIZ - and asrc = -8, src // asrc = src & -OPSIZ -- align src - shr.u loopcnt = len, 3 // loopcnt = len / 8 - and len = 7, len ;; 
// len = len % 8 - shl sh1 = sh1, 3 // sh1 = 8 * (src % 8) - adds loopcnt = -1, loopcnt // --loopcnt - mov pr.rot = 1 << 16 ;; // set rotating predicates - sub sh2 = 64, sh1 // sh2 = 64 - sh1 - mov ar.lc = loopcnt // set LC - cmp.eq p6, p0 = sh1, r0 // is the src aligned? + and sh1 = 7, src /* sh1 = src % 8 */ + and tmp = -8, len /* tmp = len & -OPSIZ */ + and asrc = -8, src /* asrc = src & -OPSIZ -- align src */ + shr.u loopcnt = len, 3 /* loopcnt = len / 8 */ + and len = 7, len ;; /* len = len % 8 */ + shl sh1 = sh1, 3 /* sh1 = 8 * (src % 8) */ + adds loopcnt = -1, loopcnt /* --loopcnt */ + mov pr.rot = 1 << 16 ;; /* set rotating predicates */ + sub sh2 = 64, sh1 /* sh2 = 64 - sh1 */ + mov ar.lc = loopcnt /* set LC */ + cmp.eq p6, p0 = sh1, r0 /* is the src aligned? */ (p6) br.cond.sptk .src_aligned - add src = src, tmp // src += len & -OPSIZ - mov ar.ec = MEMLAT + 4 + 1 // four more passes needed - ld8 r[1] = [asrc], 8 ;; // r[1] = w0 + add src = src, tmp /* src += len & -OPSIZ */ + mov ar.ec = MEMLAT + 4 + 1 /* four more passes needed */ + ld8 r[1] = [asrc], 8 ;; /* r[1] = w0 */ .align 32 -// We enter this loop with p6 cleared by the above comparison +/* We enter this loop with p6 cleared by the above comparison */ .l2: -(p[0]) ld8 r[0] = [asrc], 8 // r[0] = w1 +(p[0]) ld8 r[0] = [asrc], 8 /* r[0] = w1 */ (p[0]) ld8 q[0] = [dest], 8 -(p[MEMLAT]) shr.u tmp1[0] = r[1 + MEMLAT], sh1 // tmp1 = w0 >> sh1 -(p[MEMLAT]) shl tmp2[0] = r[0 + MEMLAT], sh2 // tmp2 = w1 << sh2 +(p[MEMLAT]) shr.u tmp1[0] = r[1 + MEMLAT], sh1 /* tmp1 = w0 >> sh1 */ +(p[MEMLAT]) shl tmp2[0] = r[0 + MEMLAT], sh2 /* tmp2 = w1 << sh2 */ (p[MEMLAT+4]) cmp.ne p6, p0 = q[MEMLAT + 4], val[1] -(p[MEMLAT+3]) or val[0] = tmp1[3], tmp2[3] // val = tmp1 | tmp2 +(p[MEMLAT+3]) or val[0] = tmp1[3], tmp2[3] /* val = tmp1 | tmp2 */ (p6) br.cond.spnt .l2exit br.ctop.sptk .l2 br.cond.sptk .cmpfew .l3exit: mux1 value1 = r[MEMLAT], @rev mux1 value2 = q[MEMLAT], @rev - cmp.ne p6, p0 = r0, r0 ;; // clear p6 + cmp.ne p6, p0 = r0, r0 ;; /* clear p6 */ .l2exit: (p6) mux1 value1 = val[1], @rev (p6) mux1 value2 = q[MEMLAT + 4], @rev ;; cmp.ltu p6, p7 = value2, value1 ;; (p6) mov ret0 = -1 (p7) mov ret0 = 1 - mov pr = saved_pr, -1 // restore the predicate registers - mov ar.lc = saved_lc // restore the loop counter + mov pr = saved_pr, -1 /* restore the predicate registers */ + mov ar.lc = saved_lc /* restore the loop counter */ br.ret.sptk.many b0 .src_aligned: - cmp.ne p6, p0 = r0, r0 // clear p6 - mov ar.ec = MEMLAT + 1 ;; // set EC + cmp.ne p6, p0 = r0, r0 /* clear p6 */ + mov ar.ec = MEMLAT + 1 ;; /* set EC */ .l3: (p[0]) ld8 r[0] = [src], 8 (p[0]) ld8 q[0] = [dest], 8 @@ -142,8 +141,8 @@ ENTRY(memcmp) (p6) br.cond.spnt .l3exit br.ctop.dptk .l3 ;; .cmpfew: - cmp.eq p6, p0 = len, r0 // is len == 0 ? - adds len = -1, len // --len; + cmp.eq p6, p0 = len, r0 /* is len == 0 ? 
*/ + adds len = -1, len /* --len; */ (p6) br.cond.spnt .restore_and_exit ;; mov ar.lc = len .l4: @@ -154,10 +153,10 @@ ENTRY(memcmp) (p6) br.cond.spnt .done br.cloop.dptk .l4 ;; .done: -(p6) sub ret0 = value2, value1 // don't execute it if falling thru +(p6) sub ret0 = value2, value1 /* don't execute it if falling thru */ .restore_and_exit: - mov pr = saved_pr, -1 // restore the predicate registers - mov ar.lc = saved_lc // restore the loop counter + mov pr = saved_pr, -1 /* restore the predicate registers */ + mov ar.lc = saved_lc /* restore the loop counter */ br.ret.sptk.many b0 END(memcmp) libc_hidden_def (memcmp) diff --git a/libc/string/ia64/memcpy.S b/libc/string/ia64/memcpy.S index 810eb0c0e..5f2e79414 100644 --- a/libc/string/ia64/memcpy.S +++ b/libc/string/ia64/memcpy.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: dest @@ -37,13 +36,13 @@ #define USE_LFETCH #define USE_FLP -#include "sysdep.h" +#include <sysdep.h> #undef ret #define LFETCH_DIST 500 -#define ALIGN_UNROLL_no 4 // no. of elements -#define ALIGN_UNROLL_sh 2 // (shift amount) +#define ALIGN_UNROLL_no 4 /* no. of elements */ +#define ALIGN_UNROLL_sh 2 /* (shift amount) */ #define MEMLAT 8 #define Nrot ((4*(MEMLAT+2) + 7) & ~7) @@ -168,76 +167,76 @@ ENTRY(memcpy) .rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1] .rotp p[MEMLAT+2] .rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1] - mov ret0 = in0 // return tmp2 = dest + mov ret0 = in0 /* return tmp2 = dest */ .save pr, saved_pr - movi0 saved_pr = pr // save the predicate registers + movi0 saved_pr = pr /* save the predicate registers */ } { .mmi - and tmp4 = 7, in0 // check if destination is aligned - mov dest = in0 // dest - mov src = in1 // src + and tmp4 = 7, in0 /* check if destination is aligned */ + mov dest = in0 /* dest */ + mov src = in1 /* src */ ;; } { .mii - cmp.eq p_scr, p0 = in2, r0 // if (len == 0) + cmp.eq p_scr, p0 = in2, r0 /* if (len == 0) */ .save ar.lc, saved_lc - movi0 saved_lc = ar.lc // save the loop counter + movi0 saved_lc = ar.lc /* save the loop counter */ .body - cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRESH + cmp.ge p_few, p0 = OP_T_THRES, in2 /* is len <= OP_T_THRESH */ } { .mbb - mov len = in2 // len -(p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest -(p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte + mov len = in2 /* len */ +(p_scr) br.cond.dpnt.few .restore_and_exit /* Branch no. 1: return dest */ +(p_few) br.cond.dpnt.many .copy_bytes /* Branch no. 2: copy byte by byte */ ;; } { .mmi #if defined(USE_LFETCH) - lfetch.nt1 [dest] // - lfetch.nt1 [src] // + lfetch.nt1 [dest] /* */ + lfetch.nt1 [src] /* */ #endif - shr.u elemcnt = len, 3 // elemcnt = len / 8 + shr.u elemcnt = len, 3 /* elemcnt = len / 8 */ } { .mib - cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned? - sub loopcnt = 7, tmp4 // + cmp.eq p_scr, p0 = tmp4, r0 /* is destination aligned? 
*/ + sub loopcnt = 7, tmp4 /* */ (p_scr) br.cond.dptk.many .dest_aligned ;; } { .mmi - ld1 tmp2 = [src], 1 // - sub len = len, loopcnt, 1 // reduce len - movi0 ar.lc = loopcnt // + ld1 tmp2 = [src], 1 /* */ + sub len = len, loopcnt, 1 /* reduce len */ + movi0 ar.lc = loopcnt /* */ } { .mib - cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point + cmp.ne p_scr, p0 = 0, loopcnt /* avoid loading beyond end-point */ ;; } -.l0: // ---------------------------- // L0: Align src on 8-byte boundary +.l0: /* ---------------------------- L0: Align src on 8-byte boundary */ { .mmi - st1 [dest] = tmp2, 1 // -(p_scr) ld1 tmp2 = [src], 1 // + st1 [dest] = tmp2, 1 /* */ +(p_scr) ld1 tmp2 = [src], 1 /* */ } { .mib - cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point + cmp.lt p_scr, p0 = 1, loopcnt /* avoid load beyond end-point */ add loopcnt = -1, loopcnt - br.cloop.dptk.few .l0 // + br.cloop.dptk.few .l0 /* */ ;; } .dest_aligned: { .mmi - and tmp4 = 7, src // ready for alignment check - shr.u elemcnt = len, 3 // elemcnt = len / 8 + and tmp4 = 7, src /* ready for alignment check */ + shr.u elemcnt = len, 3 /* elemcnt = len / 8 */ ;; } { .mib - cmp.ne p_scr, p0 = tmp4, r0 // is source also aligned - tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src -} { .mib // is not 16B aligned - add ptr2 = LFETCH_DIST, dest // prefetch address + cmp.ne p_scr, p0 = tmp4, r0 /* is source also aligned */ + tbit.nz p_xtr, p_nxtr = src, 3 /* prepare a separate move if src */ +} { .mib /* is not 16B aligned */ + add ptr2 = LFETCH_DIST, dest /* prefetch address */ add ptr1 = LFETCH_DIST, src (p_scr) br.cond.dptk.many .src_not_aligned ;; } -// The optimal case, when dest, and src are aligned +/* The optimal case, when dest, and src are aligned */ .both_aligned: { .mmi .pred.rel "mutex",p_xtr,p_nxtr -(p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify -(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify - movi0 pr.rot = 1 << 16 // set rotating predicates +(p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt /* Need N + 1 to qualify */ +(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt /* Need only N to qualify */ + movi0 pr.rot = 1 << 16 /* set rotating predicates */ } { .mib (p_scr) br.cond.dpnt.many .copy_full_words ;; } @@ -245,21 +244,21 @@ ENTRY(memcpy) { .mmi (p_xtr) load tempreg = [src], 8 (p_xtr) add elemcnt = -1, elemcnt - movi0 ar.ec = MEMLAT + 1 // set the epilog counter + movi0 ar.ec = MEMLAT + 1 /* set the epilog counter */ ;; } { .mmi -(p_xtr) add len = -8, len // - add asrc = 16, src // one bank apart (for USE_INT) - shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling +(p_xtr) add len = -8, len /* */ + add asrc = 16, src /* one bank apart (for USE_INT) */ + shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh /* cater for unrolling */ ;;} { .mmi add loopcnt = -1, loopcnt -(p_xtr) store [dest] = tempreg, 8 // copy the "extra" word +(p_xtr) store [dest] = tempreg, 8 /* copy the "extra" word */ nop.i 0 ;; } { .mib add adest = 16, dest - movi0 ar.lc = loopcnt // set the loop counter + movi0 ar.lc = loopcnt /* set the loop counter */ ;; } #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO @@ -268,7 +267,7 @@ ENTRY(memcpy) .align 32 #endif #if defined(USE_FLP) -.l1: // ------------------------------- // L1: Everything a multiple of 8 +.l1: /* ------------------------------- L1: Everything a multiple of 8 */ { .mmi #if defined(USE_LFETCH) (p[0]) lfetch.nt1 [ptr2],32 @@ -290,7 +289,7 @@ ENTRY(memcpy) br.ctop.dptk.many .l1 ;; } #elif 
defined(USE_INT) -.l1: // ------------------------------- // L1: Everything a multiple of 8 +.l1: /* ------------------------------- L1: Everything a multiple of 8 */ { .mmi (p[0]) load the_r[0] = [src], 8 (p[0]) load the_q[0] = [asrc], 8 @@ -317,58 +316,58 @@ ENTRY(memcpy) .copy_full_words: { .mib - cmp.gt p_scr, p0 = 8, len // - shr.u elemcnt = len, 3 // + cmp.gt p_scr, p0 = 8, len /* */ + shr.u elemcnt = len, 3 /* */ (p_scr) br.cond.dpnt.many .copy_bytes ;; } { .mii load tempreg = [src], 8 - add loopcnt = -1, elemcnt // + add loopcnt = -1, elemcnt /* */ ;; } { .mii - cmp.ne p_scr, p0 = 0, loopcnt // - mov ar.lc = loopcnt // + cmp.ne p_scr, p0 = 0, loopcnt /* */ + mov ar.lc = loopcnt /* */ ;; } -.l2: // ------------------------------- // L2: Max 4 words copied separately +.l2: /* ------------------------------- L2: Max 4 words copied separately */ { .mmi store [dest] = tempreg, 8 -(p_scr) load tempreg = [src], 8 // +(p_scr) load tempreg = [src], 8 /* */ add len = -8, len } { .mib - cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point + cmp.lt p_scr, p0 = 1, loopcnt /* avoid load beyond end-point */ add loopcnt = -1, loopcnt br.cloop.dptk.few .l2 ;; } .copy_bytes: { .mib - cmp.eq p_scr, p0 = len, r0 // is len == 0 ? - add loopcnt = -1, len // len--; + cmp.eq p_scr, p0 = len, r0 /* is len == 0 ? */ + add loopcnt = -1, len /* len--; */ (p_scr) br.cond.spnt .restore_and_exit ;; } { .mii ld1 tmp2 = [src], 1 movi0 ar.lc = loopcnt - cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point + cmp.ne p_scr, p0 = 0, loopcnt /* avoid load beyond end-point */ ;; } -.l3: // ------------------------------- // L3: Final byte move +.l3: /* ------------------------------- L3: Final byte move */ { .mmi st1 [dest] = tmp2, 1 (p_scr) ld1 tmp2 = [src], 1 } { .mib - cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point + cmp.lt p_scr, p0 = 1, loopcnt /* avoid load beyond end-point */ add loopcnt = -1, loopcnt br.cloop.dptk.few .l3 ;; } .restore_and_exit: { .mmi - movi0 pr = saved_pr, -1 // restore the predicate registers + movi0 pr = saved_pr, -1 /* restore the predicate registers */ ;; } { .mib - movi0 ar.lc = saved_lc // restore the loop counter + movi0 ar.lc = saved_lc /* restore the loop counter */ br.ret.sptk.many b0 ;; } @@ -376,41 +375,41 @@ ENTRY(memcpy) .src_not_aligned: { .mmi cmp.gt p_scr, p0 = 16, len - and sh1 = 7, src // sh1 = src % 8 - shr.u loopcnt = len, 4 // element-cnt = len / 16 + and sh1 = 7, src /* sh1 = src % 8 */ + shr.u loopcnt = len, 4 /* element-cnt = len / 16 */ } { .mib add tmp4 = @ltoff(.table), gp add tmp3 = @ltoff(.loop56), gp -(p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few +(p_scr) br.cond.dpnt.many .copy_bytes /* do byte by byte if too few */ ;; } { .mmi - and asrc = -8, src // asrc = (-8) -- align src for loop - add loopcnt = -1, loopcnt // loopcnt-- - shl sh1 = sh1, 3 // sh1 = 8 * (src % 8) + and asrc = -8, src /* asrc = (-8) -- align src for loop */ + add loopcnt = -1, loopcnt /* loopcnt-- */ + shl sh1 = sh1, 3 /* sh1 = 8 * (src % 8) */ } { .mmi - ld8 ptable = [tmp4] // ptable = &table - ld8 ploop56 = [tmp3] // ploop56 = &loop56 - and tmp2 = -16, len // tmp2 = len & -OPSIZ + ld8 ptable = [tmp4] /* ptable = &table */ + ld8 ploop56 = [tmp3] /* ploop56 = &loop56 */ + and tmp2 = -16, len /* tmp2 = len & -OPSIZ */ ;; } { .mmi - add tmp3 = ptable, sh1 // tmp3 = &table + sh1 - add src = src, tmp2 // src += len & (-16) - movi0 ar.lc = loopcnt // set LC + add tmp3 = ptable, sh1 /* tmp3 = &table + sh1 */ + add src = src, tmp2 /* src += len & 
(-16) */ + movi0 ar.lc = loopcnt /* set LC */ ;; } { .mmi - ld8 tmp4 = [tmp3] // tmp4 = loop offset - sub len = len, tmp2 // len -= len & (-16) - movi0 ar.ec = MEMLAT + 2 // one more pass needed + ld8 tmp4 = [tmp3] /* tmp4 = loop offset */ + sub len = len, tmp2 /* len -= len & (-16) */ + movi0 ar.ec = MEMLAT + 2 /* one more pass needed */ ;; } { .mmi - ld8 s[1] = [asrc], 8 // preload - sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset - movi0 pr.rot = 1 << 16 // set rotating predicates + ld8 s[1] = [asrc], 8 /* preload */ + sub loopaddr = ploop56,tmp4 /* loopadd = &loop56 - loop offset */ + movi0 pr.rot = 1 << 16 /* set rotating predicates */ ;; } { .mib nop.m 0 movi0 b6 = loopaddr - br b6 // jump to the appropriate loop + br b6 /* jump to the appropriate loop */ ;; } LOOP(8) @@ -426,7 +425,7 @@ libc_hidden_def (memcpy) .rodata .align 8 .table: - data8 0 // dummy entry + data8 0 /* dummy entry */ data8 .loop56 - .loop8 data8 .loop56 - .loop16 data8 .loop56 - .loop24 diff --git a/libc/string/ia64/memmove.S b/libc/string/ia64/memmove.S index 00342d8e0..7d830f912 100644 --- a/libc/string/ia64/memmove.S +++ b/libc/string/ia64/memmove.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: dest @@ -33,7 +32,7 @@ sh1 must be computed using an extra instruction: sub sh1 = 64, sh1 or the UM.be bit should be cleared at the beginning and set at the end. */ -#include "sysdep.h" +#include <sysdep.h> #undef ret #define OP_T_THRES 16 @@ -81,48 +80,48 @@ ENTRY(memmove) alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot .rotr r[MEMLAT + 2], q[MEMLAT + 1] .rotp p[MEMLAT + 2] - mov ret0 = in0 // return value = dest + mov ret0 = in0 /* return value = dest */ .save pr, saved_pr - mov saved_pr = pr // save the predicate registers + mov saved_pr = pr /* save the predicate registers */ .save ar.lc, saved_lc - mov saved_lc = ar.lc // save the loop counter + mov saved_lc = ar.lc /* save the loop counter */ .body - or tmp3 = in0, in1 ;; // tmp3 = dest | src - or tmp3 = tmp3, in2 // tmp3 = dest | src | len - mov dest = in0 // dest - mov src = in1 // src - mov len = in2 // len - sub tmp2 = r0, in0 // tmp2 = -dest - cmp.eq p6, p0 = in2, r0 // if (len == 0) -(p6) br.cond.spnt .restore_and_exit;;// return dest; - and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7 - cmp.le p6, p0 = dest, src // if dest <= src it's always safe -(p6) br.cond.spnt .forward // to copy forward + or tmp3 = in0, in1 ;; /* tmp3 = dest | src */ + or tmp3 = tmp3, in2 /* tmp3 = dest | src | len */ + mov dest = in0 /* dest */ + mov src = in1 /* src */ + mov len = in2 /* len */ + sub tmp2 = r0, in0 /* tmp2 = -dest */ + cmp.eq p6, p0 = in2, r0 /* if (len == 0) */ +(p6) br.cond.spnt .restore_and_exit;;/* return dest; */ + and tmp4 = 7, tmp3 /* tmp4 = (dest | src | len) & 7 */ + cmp.le p6, p0 = dest, src /* if dest <= src it's always safe */ +(p6) br.cond.spnt .forward /* to copy forward */ add tmp3 = src, len;; - cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len -(p6) br.cond.spnt .backward // we have to copy backward + cmp.lt p6, p0 = dest, tmp3 /* if dest > src && dest < src + len */ +(p6) br.cond.spnt .backward /* we have to copy backward */ .forward: - shr.u loopcnt = len, 4 ;; // loopcnt = len / 16 - 
cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0) -(p6) br.cond.sptk .next // goto next; + shr.u loopcnt = len, 4 ;; /* loopcnt = len / 16 */ + cmp.ne p6, p0 = tmp4, r0 /* if ((dest | src | len) & 7 != 0) */ +(p6) br.cond.sptk .next /* goto next; */ -// The optimal case, when dest, src and len are all multiples of 8 +/* The optimal case, when dest, src and len are all multiples of 8 */ and tmp3 = 0xf, len - mov pr.rot = 1 << 16 // set rotating predicates - mov ar.ec = MEMLAT + 1 ;; // set the epilog counter - cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word? - adds loopcnt = -1, loopcnt;; // --loopcnt + mov pr.rot = 1 << 16 /* set rotating predicates */ + mov ar.ec = MEMLAT + 1 ;; /* set the epilog counter */ + cmp.ne p6, p0 = tmp3, r0 /* do we have to copy an extra word? */ + adds loopcnt = -1, loopcnt;; /* --loopcnt */ (p6) ld8 value = [src], 8;; -(p6) st8 [dest] = value, 8 // copy the "odd" word - mov ar.lc = loopcnt // set the loop counter +(p6) st8 [dest] = value, 8 /* copy the "odd" word */ + mov ar.lc = loopcnt /* set the loop counter */ cmp.eq p6, p0 = 8, len -(p6) br.cond.spnt .restore_and_exit;;// the one-word special case - adds adest = 8, dest // set adest one word ahead of dest - adds asrc = 8, src ;; // set asrc one word ahead of src - nop.b 0 // get the "golden" alignment for - nop.b 0 // the next loop +(p6) br.cond.spnt .restore_and_exit;;/* the one-word special case */ + adds adest = 8, dest /* set adest one word ahead of dest */ + adds asrc = 8, src ;; /* set asrc one word ahead of src */ + nop.b 0 /* get the "golden" alignment for */ + nop.b 0 /* the next loop */ .l0: (p[0]) ld8 r[0] = [src], 16 (p[0]) ld8 q[0] = [asrc], 16 @@ -130,50 +129,50 @@ ENTRY(memmove) (p[MEMLAT]) st8 [adest] = q[MEMLAT], 16 br.ctop.dptk .l0 ;; - mov pr = saved_pr, -1 // restore the predicate registers - mov ar.lc = saved_lc // restore the loop counter + mov pr = saved_pr, -1 /* restore the predicate registers */ + mov ar.lc = saved_lc /* restore the loop counter */ br.ret.sptk.many b0 .next: - cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES - and loopcnt = 7, tmp2 // loopcnt = -dest % 8 -(p6) br.cond.spnt .cpyfew // copy byte by byte + cmp.ge p6, p0 = OP_T_THRES, len /* is len <= OP_T_THRES */ + and loopcnt = 7, tmp2 /* loopcnt = -dest % 8 */ +(p6) br.cond.spnt .cpyfew /* copy byte by byte */ ;; cmp.eq p6, p0 = loopcnt, r0 (p6) br.cond.sptk .dest_aligned - sub len = len, loopcnt // len -= -dest % 8 - adds loopcnt = -1, loopcnt // --loopcnt + sub len = len, loopcnt /* len -= -dest % 8 */ + adds loopcnt = -1, loopcnt /* --loopcnt */ ;; mov ar.lc = loopcnt -.l1: // copy -dest % 8 bytes - ld1 value = [src], 1 // value = *src++ +.l1: /* copy -dest % 8 bytes */ + ld1 value = [src], 1 /* value = *src++ */ ;; - st1 [dest] = value, 1 // *dest++ = value + st1 [dest] = value, 1 /* *dest++ = value */ br.cloop.dptk .l1 .dest_aligned: - and sh1 = 7, src // sh1 = src % 8 - and tmp2 = -8, len // tmp2 = len & -OPSIZ - and asrc = -8, src // asrc = src & -OPSIZ -- align src - shr.u loopcnt = len, 3 // loopcnt = len / 8 - and len = 7, len;; // len = len % 8 - adds loopcnt = -1, loopcnt // --loopcnt + and sh1 = 7, src /* sh1 = src % 8 */ + and tmp2 = -8, len /* tmp2 = len & -OPSIZ */ + and asrc = -8, src /* asrc = src & -OPSIZ -- align src */ + shr.u loopcnt = len, 3 /* loopcnt = len / 8 */ + and len = 7, len;; /* len = len % 8 */ + adds loopcnt = -1, loopcnt /* --loopcnt */ addl tmp4 = @ltoff(.table), gp addl tmp3 = @ltoff(.loop56), gp - mov ar.ec = MEMLAT + 1 // set EC - mov pr.rot = 
1 << 16;; // set rotating predicates - mov ar.lc = loopcnt // set LC - cmp.eq p6, p0 = sh1, r0 // is the src aligned? + mov ar.ec = MEMLAT + 1 /* set EC */ + mov pr.rot = 1 << 16;; /* set rotating predicates */ + mov ar.lc = loopcnt /* set LC */ + cmp.eq p6, p0 = sh1, r0 /* is the src aligned? */ (p6) br.cond.sptk .src_aligned - add src = src, tmp2 // src += len & -OPSIZ - shl sh1 = sh1, 3 // sh1 = 8 * (src % 8) - ld8 ploop56 = [tmp3] // ploop56 = &loop56 - ld8 ptable = [tmp4];; // ptable = &table - add tmp3 = ptable, sh1;; // tmp3 = &table + sh1 - mov ar.ec = MEMLAT + 1 + 1 // one more pass needed - ld8 tmp4 = [tmp3];; // tmp4 = loop offset - sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset - ld8 r[1] = [asrc], 8;; // w0 + add src = src, tmp2 /* src += len & -OPSIZ */ + shl sh1 = sh1, 3 /* sh1 = 8 * (src % 8) */ + ld8 ploop56 = [tmp3] /* ploop56 = &loop56 */ + ld8 ptable = [tmp4];; /* ptable = &table */ + add tmp3 = ptable, sh1;; /* tmp3 = &table + sh1 */ + mov ar.ec = MEMLAT + 1 + 1 /* one more pass needed */ + ld8 tmp4 = [tmp3];; /* tmp4 = loop offset */ + sub loopaddr = ploop56,tmp4 /* loopadd = &loop56 - loop offset */ + ld8 r[1] = [asrc], 8;; /* w0 */ mov b6 = loopaddr;; - br b6 // jump to the appropriate loop + br b6 /* jump to the appropriate loop */ LOOP(8) LOOP(16) @@ -189,8 +188,8 @@ ENTRY(memmove) (p[MEMLAT]) st8 [dest] = r[MEMLAT], 8 br.ctop.dptk .l3 .cpyfew: - cmp.eq p6, p0 = len, r0 // is len == 0 ? - adds len = -1, len // --len; + cmp.eq p6, p0 = len, r0 /* is len == 0 ? */ + adds len = -1, len /* --len; */ (p6) br.cond.spnt .restore_and_exit ;; mov ar.lc = len .l4: @@ -199,36 +198,36 @@ ENTRY(memmove) st1 [dest] = value, 1 br.cloop.dptk .l4 ;; .restore_and_exit: - mov pr = saved_pr, -1 // restore the predicate registers - mov ar.lc = saved_lc // restore the loop counter + mov pr = saved_pr, -1 /* restore the predicate registers */ + mov ar.lc = saved_lc /* restore the loop counter */ br.ret.sptk.many b0 -// In the case of a backward copy, optimise only the case when everything -// is a multiple of 8, otherwise copy byte by byte. The backward copy is -// used only when the blocks are overlapping and dest > src. - +/* In the case of a backward copy, optimise only the case when everything + is a multiple of 8, otherwise copy byte by byte. The backward copy is + used only when the blocks are overlapping and dest > src. 
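
For readers following the overlap handling in this memmove hunk: the branches above pick a forward copy when dest <= src and fall back to a backward copy only when dest lands inside [src, src + len). A minimal C sketch of that same decision, with an invented helper name that is not part of the patch:

#include <stddef.h>

/* Sketch of the direction choice made by the assembly: a forward copy is
   safe unless dest overlaps the tail of src. */
static void *overlap_safe_copy(void *s1, const void *s2, size_t n)
{
	char *d = (char *) s1;
	const char *s = (const char *) s2;

	if (d <= s || d >= s + n) {
		while (n--)		/* forward, byte by byte */
			*d++ = *s++;
	} else {
		d += n;			/* backward, starting past the end */
		s += n;
		while (n--)
			*--d = *--s;
	}
	return s1;
}

The assembly additionally special-cases the all-multiple-of-8 situation with 8-byte loads and stores; the sketch only shows the direction test.
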
+*/ .backward: - shr.u loopcnt = len, 3 // loopcnt = len / 8 - add src = src, len // src points one byte past the end - add dest = dest, len ;; // dest points one byte past the end - mov ar.ec = MEMLAT + 1 // set the epilog counter - mov pr.rot = 1 << 16 // set rotating predicates - adds loopcnt = -1, loopcnt // --loopcnt - cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0) -(p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward - adds src = -8, src // src points to the last word - adds dest = -8, dest // dest points to the last word - mov ar.lc = loopcnt;; // set the loop counter + shr.u loopcnt = len, 3 /* loopcnt = len / 8 */ + add src = src, len /* src points one byte past the end */ + add dest = dest, len ;; /* dest points one byte past the end */ + mov ar.ec = MEMLAT + 1 /* set the epilog counter */ + mov pr.rot = 1 << 16 /* set rotating predicates */ + adds loopcnt = -1, loopcnt /* --loopcnt */ + cmp.ne p6, p0 = tmp4, r0 /* if ((dest | src | len) & 7 != 0) */ +(p6) br.cond.sptk .bytecopy ;; /* copy byte by byte backward */ + adds src = -8, src /* src points to the last word */ + adds dest = -8, dest /* dest points to the last word */ + mov ar.lc = loopcnt;; /* set the loop counter */ .l5: (p[0]) ld8 r[0] = [src], -8 (p[MEMLAT]) st8 [dest] = r[MEMLAT], -8 br.ctop.dptk .l5 br.cond.sptk .restore_and_exit .bytecopy: - adds src = -1, src // src points to the last byte - adds dest = -1, dest // dest points to the last byte - adds loopcnt = -1, len;; // loopcnt = len - 1 - mov ar.lc = loopcnt;; // set the loop counter + adds src = -1, src /* src points to the last byte */ + adds dest = -1, dest /* dest points to the last byte */ + adds loopcnt = -1, len;; /* loopcnt = len - 1 */ + mov ar.lc = loopcnt;; /* set the loop counter */ .l6: (p[0]) ld1 r[0] = [src], -1 (p[MEMLAT]) st1 [dest] = r[MEMLAT], -1 @@ -239,7 +238,7 @@ END(memmove) .rodata .align 8 .table: - data8 0 // dummy entry + data8 0 /* dummy entry */ data8 .loop56 - .loop8 data8 .loop56 - .loop16 data8 .loop56 - .loop24 diff --git a/libc/string/ia64/memset.S b/libc/string/ia64/memset.S index ed27f3f31..7bd15c88a 100644 --- a/libc/string/ia64/memset.S +++ b/libc/string/ia64/memset.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: dest @@ -33,7 +32,7 @@ Since a stf.spill f0 can store 16B in one go, we use this instruction to get peak speed when value = 0. 
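
A note on the memset below: mux1 value = value, @brcst replicates the fill byte into every byte of a 64-bit register so the bulk of the buffer can be written with whole-word stores (and with stf.spill when the value is zero). A portable sketch of that broadcast step; the helper names are illustrative only:

#include <stdint.h>
#include <stddef.h>

/* Replicate an 8-bit value across a 64-bit word, as mux1 @brcst does. */
static uint64_t broadcast_byte(unsigned char c)
{
	return (uint64_t) c * 0x0101010101010101ULL;
}

/* Fill an 8-byte-aligned region one word at a time with the broadcast value. */
static void fill_words(uint64_t *p, unsigned char c, size_t nwords)
{
	uint64_t v = broadcast_byte(c);

	while (nwords--)
		*p++ = v;
}

The assembly still has to handle the unaligned head and the sub-word tail separately, which is what the st8/st4/st2/st1 ladder around the main loops is for.
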
*/ -#include "sysdep.h" +#include <sysdep.h> #undef ret #define dest in0 @@ -46,15 +45,15 @@ #define ptr1 r28 #define ptr2 r27 #define ptr3 r26 -#define ptr9 r24 +#define ptr9 r24 #define loopcnt r23 #define linecnt r22 #define bytecnt r21 #define fvalue f6 -// This routine uses only scratch predicate registers (p6 - p15) -#define p_scr p6 // default register for same-cycle branches +/* This routine uses only scratch predicate registers (p6 - p15) */ +#define p_scr p6 /* default register for same-cycle branches */ #define p_nz p7 #define p_zr p8 #define p_unalgn p9 @@ -68,7 +67,7 @@ #define MIN1 15 #define MIN1P1HALF 8 #define LINE_SIZE 128 -#define LSIZE_SH 7 // shift amount +#define LSIZE_SH 7 /* shift amount */ #define PREF_AHEAD 8 #define USE_FLP @@ -90,97 +89,97 @@ ENTRY(memset) movi0 save_lc = ar.lc } { .mmi .body - mov ret0 = dest // return value - cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero + mov ret0 = dest /* return value */ + cmp.ne p_nz, p_zr = value, r0 /* use stf.spill if value is zero */ cmp.eq p_scr, p0 = cnt, r0 ;; } { .mmi - and ptr2 = -(MIN1+1), dest // aligned address - and tmp = MIN1, dest // prepare to check for alignment - tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U) + and ptr2 = -(MIN1+1), dest /* aligned address */ + and tmp = MIN1, dest /* prepare to check for alignment */ + tbit.nz p_y, p_n = dest, 0 /* Do we have an odd address? (M_B_U) */ } { .mib mov ptr1 = dest - mux1 value = value, @brcst // create 8 identical bytes in word -(p_scr) br.ret.dpnt.many rp // return immediately if count = 0 + mux1 value = value, @brcst /* create 8 identical bytes in word */ +(p_scr) br.ret.dpnt.many rp /* return immediately if count = 0 */ ;; } { .mib cmp.ne p_unalgn, p0 = tmp, r0 -} { .mib // NB: # of bytes to move is 1 higher - sub bytecnt = (MIN1+1), tmp // than loopcnt - cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task? -(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U) +} { .mib /* NB: # of bytes to move is 1 higher */ + sub bytecnt = (MIN1+1), tmp /* than loopcnt */ + cmp.gt p_scr, p0 = 16, cnt /* is it a minimalistic task? */ +(p_scr) br.cond.dptk.many .move_bytes_unaligned /* go move just a few (M_B_U) */ ;; } { .mmi -(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment -(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment -(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ? +(p_unalgn) add ptr1 = (MIN1+1), ptr2 /* after alignment */ +(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 /* after alignment */ +(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 /* should we do a st8 ? */ ;; } { .mib (p_y) add cnt = -8, cnt -(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ? +(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 /* should we do a st4 ? */ } { .mib (p_y) st8 [ptr2] = value, -4 (p_n) add ptr2 = 4, ptr2 ;; } { .mib (p_yy) add cnt = -4, cnt -(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ? +(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 /* should we do a st2 ? */ } { .mib (p_yy) st4 [ptr2] = value, -2 (p_nn) add ptr2 = 2, ptr2 ;; } { .mmi - mov tmp = LINE_SIZE+1 // for compare + mov tmp = LINE_SIZE+1 /* for compare */ (p_y) add cnt = -2, cnt -(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ? +(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 /* should we do a st1 ? 
*/ } { .mmi - setf.sig fvalue=value // transfer value to FLP side + setf.sig fvalue=value /* transfer value to FLP side */ (p_y) st2 [ptr2] = value, -1 (p_n) add ptr2 = 1, ptr2 ;; } { .mmi (p_yy) st1 [ptr2] = value - cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task? + cmp.gt p_scr, p0 = tmp, cnt /* is it a minimalistic task? */ } { .mbb (p_yy) add cnt = -1, cnt -(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few +(p_scr) br.cond.dpnt.many .fraction_of_line /* go move just a few */ ;; } { .mib nop.m 0 shr.u linecnt = cnt, LSIZE_SH -(p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill +(p_zr) br.cond.dptk.many .l1b /* Jump to use stf.spill */ ;; } #ifndef GAS_ALIGN_BREAKS_UNWIND_INFO - .align 32 // -------- // L1A: store ahead into cache lines; fill later + .align 32 /* -------- L1A: store ahead into cache lines; fill later */ #endif { .mmi - and tmp = -(LINE_SIZE), cnt // compute end of range - mov ptr9 = ptr1 // used for prefetching - and cnt = (LINE_SIZE-1), cnt // remainder + and tmp = -(LINE_SIZE), cnt /* compute end of range */ + mov ptr9 = ptr1 /* used for prefetching */ + and cnt = (LINE_SIZE-1), cnt /* remainder */ } { .mmi - mov loopcnt = PREF_AHEAD-1 // default prefetch loop - cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value + mov loopcnt = PREF_AHEAD-1 /* default prefetch loop */ + cmp.gt p_scr, p0 = PREF_AHEAD, linecnt /* check against actual value */ ;; } { .mmi -(p_scr) add loopcnt = -1, linecnt // start of stores - add ptr2 = 8, ptr1 // (beyond prefetch stores) - add ptr1 = tmp, ptr1 // first address beyond total -;; } // range +(p_scr) add loopcnt = -1, linecnt /* start of stores */ + add ptr2 = 8, ptr1 /* (beyond prefetch stores) */ + add ptr1 = tmp, ptr1 /* first address beyond total */ +;; } /* range */ { .mmi - add tmp = -1, linecnt // next loop count + add tmp = -1, linecnt /* next loop count */ movi0 ar.lc = loopcnt ;; } .pref_l1a: { .mib - store [ptr9] = myval, 128 // Do stores one cache line apart + store [ptr9] = myval, 128 /* Do stores one cache line apart */ nop.i 0 br.cloop.dptk.few .pref_l1a ;; } { .mmi - add ptr0 = 16, ptr2 // Two stores in parallel + add ptr0 = 16, ptr2 /* Two stores in parallel */ movi0 ar.lc = tmp ;; } .l1ax: @@ -211,7 +210,7 @@ ENTRY(memset) { .mmi store [ptr2] = myval, 8 store [ptr0] = myval, 32 - cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? + cmp.lt p_scr, p0 = ptr9, ptr1 /* do we need more prefetching? */ ;; } { .mmb store [ptr2] = myval, 24 @@ -219,9 +218,9 @@ ENTRY(memset) br.cloop.dptk.few .l1ax ;; } { .mbb - cmp.le p_scr, p0 = 8, cnt // just a few bytes left ? -(p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2 - br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3 + cmp.le p_scr, p0 = 8, cnt /* just a few bytes left ? */ +(p_scr) br.cond.dpnt.many .fraction_of_line /* Branch no. 2 */ + br.cond.dpnt.many .move_bytes_from_alignment /* Branch no. 
3 */ ;; } #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO @@ -229,32 +228,32 @@ ENTRY(memset) #else .align 32 #endif -.l1b: // ------------------ // L1B: store ahead into cache lines; fill later +.l1b: /* ------------------ L1B: store ahead into cache lines; fill later */ { .mmi - and tmp = -(LINE_SIZE), cnt // compute end of range - mov ptr9 = ptr1 // used for prefetching - and cnt = (LINE_SIZE-1), cnt // remainder + and tmp = -(LINE_SIZE), cnt /* compute end of range */ + mov ptr9 = ptr1 /* used for prefetching */ + and cnt = (LINE_SIZE-1), cnt /* remainder */ } { .mmi - mov loopcnt = PREF_AHEAD-1 // default prefetch loop - cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value + mov loopcnt = PREF_AHEAD-1 /* default prefetch loop */ + cmp.gt p_scr, p0 = PREF_AHEAD, linecnt /* check against actual value */ ;; } { .mmi (p_scr) add loopcnt = -1, linecnt - add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores) - add ptr1 = tmp, ptr1 // first address beyond total range + add ptr2 = 16, ptr1 /* start of stores (beyond prefetch stores) */ + add ptr1 = tmp, ptr1 /* first address beyond total range */ ;; } { .mmi - add tmp = -1, linecnt // next loop count + add tmp = -1, linecnt /* next loop count */ movi0 ar.lc = loopcnt ;; } .pref_l1b: { .mib - stf.spill [ptr9] = f0, 128 // Do stores one cache line apart + stf.spill [ptr9] = f0, 128 /* Do stores one cache line apart */ nop.i 0 br.cloop.dptk.few .pref_l1b ;; } { .mmi - add ptr0 = 16, ptr2 // Two stores in parallel + add ptr0 = 16, ptr2 /* Two stores in parallel */ movi0 ar.lc = tmp ;; } .l1bx: @@ -269,7 +268,7 @@ ENTRY(memset) { .mmi stf.spill [ptr2] = f0, 32 stf.spill [ptr0] = f0, 64 - cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? + cmp.lt p_scr, p0 = ptr9, ptr1 /* do we need more prefetching? */ ;; } { .mmb stf.spill [ptr2] = f0, 32 @@ -277,14 +276,14 @@ ENTRY(memset) br.cloop.dptk.few .l1bx ;; } { .mib - cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? + cmp.gt p_scr, p0 = 8, cnt /* just a few bytes left ? */ (p_scr) br.cond.dpnt.many .move_bytes_from_alignment ;; } .fraction_of_line: { .mib add ptr2 = 16, ptr1 - shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32 + shr.u loopcnt = cnt, 5 /* loopcnt = cnt / 32 */ ;; } { .mib cmp.eq p_scr, p0 = loopcnt, r0 @@ -292,13 +291,13 @@ ENTRY(memset) (p_scr) br.cond.dpnt.many store_words ;; } { .mib - and cnt = 0x1f, cnt // compute the remaining cnt + and cnt = 0x1f, cnt /* compute the remaining cnt */ movi0 ar.lc = loopcnt ;; } #ifndef GAS_ALIGN_BREAKS_UNWIND_INFO .align 32 #endif -.l2: // ---------------------------- // L2A: store 32B in 2 cycles +.l2: /* ---------------------------- L2A: store 32B in 2 cycles */ { .mmb store [ptr1] = myval, 8 store [ptr2] = myval, 8 @@ -309,34 +308,34 @@ ENTRY(memset) ;; } store_words: { .mib - cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? -(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch + cmp.gt p_scr, p0 = 8, cnt /* just a few bytes left ? 
*/ +(p_scr) br.cond.dpnt.many .move_bytes_from_alignment /* Branch */ ;; } { .mmi - store [ptr1] = myval, 8 // store - cmp.le p_y, p_n = 16, cnt // - add cnt = -8, cnt // subtract + store [ptr1] = myval, 8 /* store */ + cmp.le p_y, p_n = 16, cnt /* */ + add cnt = -8, cnt /* subtract */ ;; } { .mmi -(p_y) store [ptr1] = myval, 8 // store -(p_y) cmp.le.unc p_yy, p_nn = 16, cnt // -(p_y) add cnt = -8, cnt // subtract +(p_y) store [ptr1] = myval, 8 /* store */ +(p_y) cmp.le.unc p_yy, p_nn = 16, cnt /* */ +(p_y) add cnt = -8, cnt /* subtract */ ;; } -{ .mmi // store -(p_yy) store [ptr1] = myval, 8 // -(p_yy) add cnt = -8, cnt // subtract +{ .mmi /* store */ +(p_yy) store [ptr1] = myval, 8 /* */ +(p_yy) add cnt = -8, cnt /* subtract */ ;; } .move_bytes_from_alignment: { .mib cmp.eq p_scr, p0 = cnt, r0 - tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ? + tbit.nz.unc p_y, p0 = cnt, 2 /* should we terminate with a st4 ? */ (p_scr) br.cond.dpnt.few .restore_and_exit ;; } { .mib (p_y) st4 [ptr1] = value, 4 - tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ? + tbit.nz.unc p_yy, p0 = cnt, 1 /* should we terminate with a st2 ? */ ;; } { .mib (p_yy) st2 [ptr1] = value, 2 @@ -362,38 +361,38 @@ store_words: (p_n) add ptr2 = 2, ptr1 } { .mmi (p_y) add ptr2 = 3, ptr1 -(p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte -(p_y) add cnt = -1, cnt // [15, 14 (or less) left] +(p_y) st1 [ptr1] = value, 1 /* fill 1 (odd-aligned) byte */ +(p_y) add cnt = -1, cnt /* [15, 14 (or less) left] */ ;; } { .mmi (p_yy) cmp.le.unc p_y, p0 = 8, cnt - add ptr3 = ptr1, cnt // prepare last store + add ptr3 = ptr1, cnt /* prepare last store */ movi0 ar.lc = save_lc } { .mmi -(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes -(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes -(p_yy) add cnt = -4, cnt // [11, 10 (o less) left] +(p_yy) st2 [ptr1] = value, 4 /* fill 2 (aligned) bytes */ +(p_yy) st2 [ptr2] = value, 4 /* fill 2 (aligned) bytes */ +(p_yy) add cnt = -4, cnt /* [11, 10 (o less) left] */ ;; } { .mmi (p_y) cmp.le.unc p_yy, p0 = 8, cnt - add ptr3 = -1, ptr3 // last store - tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ? + add ptr3 = -1, ptr3 /* last store */ + tbit.nz p_scr, p0 = cnt, 1 /* will there be a st2 at the end ? */ } { .mmi -(p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes -(p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes -(p_y) add cnt = -4, cnt // [7, 6 (or less) left] +(p_y) st2 [ptr1] = value, 4 /* fill 2 (aligned) bytes */ +(p_y) st2 [ptr2] = value, 4 /* fill 2 (aligned) bytes */ +(p_y) add cnt = -4, cnt /* [7, 6 (or less) left] */ ;; } { .mmi -(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes -(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes - // [3, 2 (or less) left] - tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ? +(p_yy) st2 [ptr1] = value, 4 /* fill 2 (aligned) bytes */ +(p_yy) st2 [ptr2] = value, 4 /* fill 2 (aligned) bytes */ + /* [3, 2 (or less) left] */ + tbit.nz p_y, p0 = cnt, 0 /* will there be a st1 at the end ? 
*/ } { .mmi (p_yy) add cnt = -4, cnt ;; } { .mmb -(p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes -(p_y) st1 [ptr3] = value // fill last byte (using ptr3) +(p_scr) st2 [ptr1] = value /* fill 2 (aligned) bytes */ +(p_y) st1 [ptr3] = value /* fill last byte (using ptr3) */ br.ret.sptk.many rp ;; } END(memset) diff --git a/libc/string/ia64/softpipe.h b/libc/string/ia64/softpipe.h index d71af735e..a9a9dc679 100644 --- a/libc/string/ia64/softpipe.h +++ b/libc/string/ia64/softpipe.h @@ -12,9 +12,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* The latency of a memory load assumed by the assembly implementation of the mem and str functions. Since we don't have any clue about diff --git a/libc/string/ia64/strchr.S b/libc/string/ia64/strchr.S index 401a07941..034fd3096 100644 --- a/libc/string/ia64/strchr.S +++ b/libc/string/ia64/strchr.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: the address of the first occurence of chr in str or NULL @@ -30,7 +29,7 @@ This implementation assumes little endian mode. For big endian mode, the instruction czx1.r should be replaced by czx1.l. */ -#include "sysdep.h" +#include <sysdep.h> #undef ret #define saved_lc r18 @@ -49,15 +48,15 @@ ENTRY(strchr) .prologue alloc r2 = ar.pfs, 2, 0, 0, 0 .save ar.lc, saved_lc - mov saved_lc = ar.lc // save the loop counter + mov saved_lc = ar.lc /* save the loop counter */ .body mov ret0 = str - and tmp = 7, str // tmp = str % 8 + and tmp = 7, str /* tmp = str % 8 */ mux1 chrx8 = chr, @brcst - extr.u chr = chr, 0, 8 // retain only the last byte - cmp.ne p8, p0 = r0, r0 // clear p8 + extr.u chr = chr, 0, 8 /* retain only the last byte */ + cmp.ne p8, p0 = r0, r0 /* clear p8 */ ;; - sub loopcnt = 8, tmp // loopcnt = 8 - tmp + sub loopcnt = 8, tmp /* loopcnt = 8 - tmp */ cmp.eq p6, p0 = tmp, r0 (p6) br.cond.sptk .str_aligned;; adds loopcnt = -1, loopcnt;; @@ -75,10 +74,10 @@ ENTRY(strchr) nop.b 0 nop.b 0 .l2: - ld8.s val2 = [ret0], 8 // don't bomb out here + ld8.s val2 = [ret0], 8 /* don't bomb out here */ czx1.r pos0 = val1 - xor tmp = val1, chrx8 // if val1 contains chr, tmp will - ;; // contain a zero in its position + xor tmp = val1, chrx8 /* if val1 contains chr, tmp will */ + ;; /* contain a zero in its position */ czx1.r poschr = tmp cmp.ne p6, p0 = 8, pos0 ;; @@ -90,21 +89,21 @@ ENTRY(strchr) mov val1 = val2 br.cond.dptk .l2 .foundit: -(p6) cmp.lt p8, p0 = pos0, poschr // we found chr and null in the word -(p8) br.cond.spnt .notfound // null was found before chr +(p6) cmp.lt p8, p0 = pos0, poschr /* we found chr and null in the word */ +(p8) br.cond.spnt .notfound /* null was found before chr */ add ret0 = ret0, poschr ;; - adds ret0 = -15, ret0 ;; // should be -16, but we decrement -.restore_and_exit: // ret0 in the next instruction - adds ret0 = -1, ret0 // ret0 was pointing 1 char too far - mov ar.lc = saved_lc // restore the loop counter + adds ret0 = -15, ret0 ;; /* 
should be -16, but we decrement */ +.restore_and_exit: /* ret0 in the next instruction */ + adds ret0 = -1, ret0 /* ret0 was pointing 1 char too far */ + mov ar.lc = saved_lc /* restore the loop counter */ br.ret.sptk.many b0 .notfound: - mov ret0 = r0 // return NULL if null was found + mov ret0 = r0 /* return NULL if null was found */ mov ar.lc = saved_lc br.ret.sptk.many b0 .recovery: adds ret0 = -8, ret0;; - ld8 val2 = [ret0], 8 // bomb out here + ld8 val2 = [ret0], 8 /* bomb out here */ br.cond.sptk .back END(strchr) libc_hidden_def (strchr) diff --git a/libc/string/ia64/strcmp.S b/libc/string/ia64/strcmp.S index d3b41e642..c45ab4801 100644 --- a/libc/string/ia64/strcmp.S +++ b/libc/string/ia64/strcmp.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: the result of the comparison @@ -27,7 +26,7 @@ Unlike memcmp(), this function is optimized for mismatches within the first few characters. */ -#include "sysdep.h" +#include <sysdep.h> #undef ret #define s1 in0 @@ -42,7 +41,7 @@ ENTRY(strcmp) .loop: ld1 val1 = [s1], 1 ld1 val2 = [s2], 1 - cmp.eq p6, p0 = r0, r0 // set p6 + cmp.eq p6, p0 = r0, r0 /* set p6 */ ;; cmp.ne.and p6, p0 = val1, r0 cmp.ne.and p6, p0 = val2, r0 diff --git a/libc/string/ia64/strcpy.S b/libc/string/ia64/strcpy.S index e4a9915ca..c9b3bc143 100644 --- a/libc/string/ia64/strcpy.S +++ b/libc/string/ia64/strcpy.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: dest @@ -27,11 +26,11 @@ In this form, it assumes little endian mode. 
For big endian mode, the the two shifts in .l2 must be inverted: - shl value = r[1], sh1 // value = w0 << sh1 - shr.u tmp = r[0], sh2 // tmp = w1 >> sh2 + shl value = r[1], sh1 // value = w0 << sh1 + shr.u tmp = r[0], sh2 // tmp = w1 >> sh2 */ -#include "sysdep.h" +#include <sysdep.h> #undef ret #define saved_lc r15 @@ -53,62 +52,62 @@ ENTRY(strcpy) .prologue - alloc r2 = ar.pfs, 2, 0, 30, 32 + alloc r2 = ar.pfs, 2, 0, 30, 32 #define MEMLAT 2 .rotr r[MEMLAT + 2] .rotp p[MEMLAT + 1] - mov ret0 = in0 // return value = dest + mov ret0 = in0 /* return value = dest */ .save pr, saved_pr - mov saved_pr = pr // save the predicate registers + mov saved_pr = pr /* save the predicate registers */ .save ar.lc, saved_lc - mov saved_lc = ar.lc // save the loop counter + mov saved_lc = ar.lc /* save the loop counter */ .body - sub tmp = r0, in0 ;; // tmp = -dest - mov dest = in0 // dest - mov src = in1 // src - and loopcnt = 7, tmp ;; // loopcnt = -dest % 8 + sub tmp = r0, in0 ;; /* tmp = -dest */ + mov dest = in0 /* dest */ + mov src = in1 /* src */ + and loopcnt = 7, tmp ;; /* loopcnt = -dest % 8 */ cmp.eq p6, p0 = loopcnt, r0 - adds loopcnt = -1, loopcnt // --loopcnt + adds loopcnt = -1, loopcnt /* --loopcnt */ (p6) br.cond.sptk .dest_aligned ;; mov ar.lc = loopcnt -.l1: // copy -dest % 8 bytes - ld1 c = [src], 1 // c = *src++ +.l1: /* copy -dest % 8 bytes */ + ld1 c = [src], 1 /* c = *src++ */ ;; - st1 [dest] = c, 1 // *dest++ = c + st1 [dest] = c, 1 /* *dest++ = c */ cmp.eq p6, p0 = c, r0 (p6) br.cond.dpnt .restore_and_exit br.cloop.dptk .l1 ;; .dest_aligned: - and sh1 = 7, src // sh1 = src % 8 - mov ar.lc = -1 // "infinite" loop - and asrc = -8, src ;; // asrc = src & -OPSIZ -- align src + and sh1 = 7, src /* sh1 = src % 8 */ + mov ar.lc = -1 /* "infinite" loop */ + and asrc = -8, src ;; /* asrc = src & -OPSIZ -- align src */ sub thresh = 8, sh1 - mov pr.rot = 1 << 16 // set rotating predicates - cmp.ne p7, p0 = r0, r0 // clear p7 - shl sh1 = sh1, 3 ;; // sh1 = 8 * (src % 8) - sub sh2 = 64, sh1 // sh2 = 64 - sh1 - cmp.eq p6, p0 = sh1, r0 // is the src aligned? + mov pr.rot = 1 << 16 /* set rotating predicates */ + cmp.ne p7, p0 = r0, r0 /* clear p7 */ + shl sh1 = sh1, 3 ;; /* sh1 = 8 * (src % 8) */ + sub sh2 = 64, sh1 /* sh2 = 64 - sh1 */ + cmp.eq p6, p0 = sh1, r0 /* is the src aligned? */ (p6) br.cond.sptk .src_aligned ;; ld8 r[1] = [asrc],8 ;; .align 32 .l2: ld8.s r[0] = [asrc], 8 - shr.u value = r[1], sh1 ;; // value = w0 >> sh1 - czx1.r pos = value ;; // do we have an "early" zero - cmp.lt p7, p0 = pos, thresh // in w0 >> sh1? + shr.u value = r[1], sh1 ;; /* value = w0 >> sh1 */ + czx1.r pos = value ;; /* do we have an "early" zero */ + cmp.lt p7, p0 = pos, thresh /* in w0 >> sh1? 
*/ (p7) br.cond.dpnt .found0 - chk.s r[0], .recovery2 // it is safe to do that only -.back2: // after the previous test - shl tmp = r[0], sh2 // tmp = w1 << sh2 + chk.s r[0], .recovery2 /* it is safe to do that only */ +.back2: /* after the previous test */ + shl tmp = r[0], sh2 /* tmp = w1 << sh2 */ ;; - or value = value, tmp ;; // value |= tmp + or value = value, tmp ;; /* value |= tmp */ czx1.r pos = value ;; cmp.ne p7, p0 = 8, pos (p7) br.cond.dpnt .found0 - st8 [dest] = value, 8 // store val to dest + st8 [dest] = value, 8 /* store val to dest */ br.ctop.dptk .l2 ;; .src_aligned: .l3: @@ -124,14 +123,14 @@ ENTRY(strcpy) .found0: mov ar.lc = pos .l4: - extr.u c = value, 0, 8 // c = value & 0xff + extr.u c = value, 0, 8 /* c = value & 0xff */ shr.u value = value, 8 ;; st1 [dest] = c, 1 br.cloop.dptk .l4 ;; .restore_and_exit: - mov ar.lc = saved_lc // restore the loop counter - mov pr = saved_pr, -1 // restore the predicate registers + mov ar.lc = saved_lc /* restore the loop counter */ + mov pr = saved_pr, -1 /* restore the predicate registers */ br.ret.sptk.many b0 .recovery2: add tmp = -8, asrc ;; diff --git a/libc/string/ia64/strlen.S b/libc/string/ia64/strlen.S index 9b27a2d1b..83244cd76 100644 --- a/libc/string/ia64/strlen.S +++ b/libc/string/ia64/strlen.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: the length of the input string @@ -33,7 +32,7 @@ This implementation assumes little endian mode. For big endian mode, the instruction czx1.r should be replaced by czx1.l. 
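
The strlen below leans on czx1.r, which yields the index of the first zero byte in a 64-bit word, or 8 when there is none, letting the loop examine eight characters per iteration. A rough C equivalent of that test, assuming little-endian byte order; __builtin_ctzll is a GCC-style builtin used only for illustration:

#include <stdint.h>

/* Sketch of what czx1.r computes: index of the first zero byte, or 8. */
static unsigned first_zero_byte(uint64_t w)
{
	uint64_t m = (w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL;

	if (m == 0)
		return 8;				/* no zero byte in this word */
	return (unsigned) (__builtin_ctzll(m) >> 3);	/* lowest 0x80 bit -> byte index */
}
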
*/ -#include "sysdep.h" +#include <sysdep.h> #undef ret #define saved_lc r18 @@ -50,13 +49,13 @@ ENTRY(strlen) .prologue alloc r2 = ar.pfs, 1, 0, 0, 0 .save ar.lc, saved_lc - mov saved_lc = ar.lc // save the loop counter + mov saved_lc = ar.lc /* save the loop counter */ .body mov str = in0 - mov len = r0 // len = 0 - and tmp = 7, in0 // tmp = str % 8 + mov len = r0 /* len = 0 */ + and tmp = 7, in0 /* tmp = str % 8 */ ;; - sub loopcnt = 8, tmp // loopcnt = 8 - tmp + sub loopcnt = 8, tmp /* loopcnt = 8 - tmp */ cmp.eq p6, p0 = tmp, r0 (p6) br.cond.sptk .str_aligned;; adds loopcnt = -1, loopcnt;; @@ -69,11 +68,11 @@ ENTRY(strlen) adds len = 1, len br.cloop.dptk .l1 .str_aligned: - mov origadd = str // origadd = orig + mov origadd = str /* origadd = orig */ ld8 val1 = [str], 8;; nop.b 0 nop.b 0 -.l2: ld8.s val2 = [str], 8 // don't bomb out here +.l2: ld8.s val2 = [str], 8 /* don't bomb out here */ czx1.r pos0 = val1 ;; cmp.ne p6, p0 = 8, pos0 @@ -83,16 +82,16 @@ ENTRY(strlen) mov val1 = val2 br.cond.dptk .l2 .foundit: - sub tmp = str, origadd // tmp = crt address - orig + sub tmp = str, origadd /* tmp = crt address - orig */ add len = len, pos0;; add len = len, tmp;; adds len = -16, len .restore_and_exit: - mov ar.lc = saved_lc // restore the loop counter + mov ar.lc = saved_lc /* restore the loop counter */ br.ret.sptk.many b0 .recovery: adds str = -8, str;; - ld8 val2 = [str], 8 // bomb out here + ld8 val2 = [str], 8 /* bomb out here */ br.cond.sptk .back END(strlen) libc_hidden_def (strlen) diff --git a/libc/string/ia64/strncmp.S b/libc/string/ia64/strncmp.S index 8e0373c7f..d58a2007e 100644 --- a/libc/string/ia64/strncmp.S +++ b/libc/string/ia64/strncmp.S @@ -14,21 +14,20 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: the result of the comparison Inputs: in0: s1 in1: s2 - in2: n + in2: n Unlike memcmp(), this function is optimized for mismatches within the first few characters. */ -#include "sysdep.h" +#include <sysdep.h> #undef ret #define s1 in0 @@ -42,13 +41,13 @@ ENTRY(strncmp) alloc r2 = ar.pfs, 3, 0, 0, 0 mov ret0 = r0 - cmp.eq p6, p0 = r0, r0 // set p6 - cmp.eq p7, p0 = n, r0 // return immediately if n == 0 + cmp.eq p6, p0 = r0, r0 /* set p6 */ + cmp.eq p7, p0 = n, r0 /* return immediately if n == 0 */ (p7) br.cond.spnt .restore_and_exit ;; .loop: ld1 val1 = [s1], 1 ld1 val2 = [s2], 1 - adds n = -1, n // n-- + adds n = -1, n /* n-- */ ;; cmp.ne.and p6, p0 = val1, r0 cmp.ne.and p6, p0 = val2, r0 @@ -58,5 +57,5 @@ ENTRY(strncmp) sub ret0 = val1, val2 .restore_and_exit: br.ret.sptk.many b0 -END(strncmp) +END(strncmp) libc_hidden_weak (strncmp) diff --git a/libc/string/ia64/strncpy.S b/libc/string/ia64/strncpy.S index 4f1129350..b72e8a70c 100644 --- a/libc/string/ia64/strncpy.S +++ b/libc/string/ia64/strncpy.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Return: dest @@ -29,7 +28,7 @@ In this form, it assumes little endian mode. 
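
Before the optimized body of strncpy below, it may help to recall the contract it implements: copy at most n bytes from src, and once the terminator is reached, pad the rest of dest with zero bytes. A minimal reference version for comparison, not part of the patch:

#include <stddef.h>

static char *simple_strncpy(char *dest, const char *src, size_t n)
{
	size_t i = 0;

	while (i < n && src[i] != '\0') {	/* copy up to n bytes */
		dest[i] = src[i];
		++i;
	}
	while (i < n) {				/* zero-fill the remainder */
		dest[i] = '\0';
		++i;
	}
	return dest;
}
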
*/ -#include "sysdep.h" +#include <sysdep.h> #undef ret #define saved_lc r15 @@ -58,64 +57,64 @@ ENTRY(strncpy) .rotr r[MEMLAT + 2] .rotp p[MEMLAT + 1] - mov ret0 = in0 // return value = dest + mov ret0 = in0 /* return value = dest */ .save pr, saved_pr - mov saved_pr = pr // save the predicate registers + mov saved_pr = pr /* save the predicate registers */ .save ar.lc, saved_lc - mov saved_lc = ar.lc // save the loop counter - mov ar.ec = 0 // ec is not guaranteed to - // be zero upon function entry + mov saved_lc = ar.lc /* save the loop counter */ + mov ar.ec = 0 /* ec is not guaranteed to */ + /* be zero upon function entry */ .body cmp.geu p6, p5 = 24, in2 (p6) br.cond.spnt .short_len - sub tmp = r0, in0 ;; // tmp = -dest - mov len = in2 // len - mov dest = in0 // dest - mov src = in1 // src - and tmp = 7, tmp ;; // loopcnt = -dest % 8 + sub tmp = r0, in0 ;; /* tmp = -dest */ + mov len = in2 /* len */ + mov dest = in0 /* dest */ + mov src = in1 /* src */ + and tmp = 7, tmp ;; /* loopcnt = -dest % 8 */ cmp.eq p6, p7 = tmp, r0 - adds loopcnt = -1, tmp // --loopcnt + adds loopcnt = -1, tmp /* --loopcnt */ (p6) br.cond.sptk .dest_aligned ;; - sub len = len, tmp // len -= -dest % 8 + sub len = len, tmp /* len -= -dest % 8 */ mov ar.lc = loopcnt -.l1: // copy -dest % 8 bytes -(p5) ld1 c = [src], 1 // c = *src++ +.l1: /* copy -dest % 8 bytes */ +(p5) ld1 c = [src], 1 /* c = *src++ */ ;; - st1 [dest] = c, 1 // *dest++ = c + st1 [dest] = c, 1 /* *dest++ = c */ cmp.ne p5, p7 = c, r0 br.cloop.dptk .l1 ;; (p7) br.cond.dpnt .found0_align -.dest_aligned: // p7 should be cleared here - shr.u c = len, 3 // c = len / 8 - and sh1 = 7, src // sh1 = src % 8 - and asrc = -8, src ;; // asrc = src & -OPSIZ -- align src - adds c = (MEMLAT-1), c // c = (len / 8) + MEMLAT - 1 +.dest_aligned: /* p7 should be cleared here */ + shr.u c = len, 3 /* c = len / 8 */ + and sh1 = 7, src /* sh1 = src % 8 */ + and asrc = -8, src ;; /* asrc = src & -OPSIZ -- align src */ + adds c = (MEMLAT-1), c /* c = (len / 8) + MEMLAT - 1 */ sub thresh = 8, sh1 - mov pr.rot = 1 << 16 // set rotating predicates - shl sh1 = sh1, 3 ;; // sh1 = 8 * (src % 8) - mov ar.lc = c // "infinite" loop - sub sh2 = 64, sh1 // sh2 = 64 - sh1 - cmp.eq p6, p0 = sh1, r0 // is the src aligned? + mov pr.rot = 1 << 16 /* set rotating predicates */ + shl sh1 = sh1, 3 ;; /* sh1 = 8 * (src % 8) */ + mov ar.lc = c /* "infinite" loop */ + sub sh2 = 64, sh1 /* sh2 = 64 - sh1 */ + cmp.eq p6, p0 = sh1, r0 /* is the src aligned? */ (p6) br.cond.sptk .src_aligned - adds c = -(MEMLAT-1), c ;; // c = (len / 8) + adds c = -(MEMLAT-1), c ;; /* c = (len / 8) */ ld8 r[1] = [asrc],8 mov ar.lc = c ;; .align 32 .l2: -(p6) st8 [dest] = value, 8 // store val to dest +(p6) st8 [dest] = value, 8 /* store val to dest */ ld8.s r[0] = [asrc], 8 - shr.u value = r[1], sh1 ;; // value = w0 >> sh1 - czx1.r pos = value ;; // do we have an "early" zero - cmp.lt p7, p0 = pos, thresh // in w0 >> sh1? - adds len = -8, len // len -= 8 + shr.u value = r[1], sh1 ;; /* value = w0 >> sh1 */ + czx1.r pos = value ;; /* do we have an "early" zero */ + cmp.lt p7, p0 = pos, thresh /* in w0 >> sh1? 
*/ + adds len = -8, len /* len -= 8 */ (p7) br.cond.dpnt .nonalign_found0 - chk.s r[0], .recovery2 // it is safe to do that only -.back2: // after the previous test - shl tmp = r[0], sh2 // tmp = w1 << sh2 + chk.s r[0], .recovery2 /* it is safe to do that only */ +.back2: /* after the previous test */ + shl tmp = r[0], sh2 /* tmp = w1 << sh2 */ ;; - or value = value, tmp ;; // value |= tmp + or value = value, tmp ;; /* value |= tmp */ czx1.r pos = value ;; cmp.ne p7, p6 = 8, pos (p7) br.cond.dpnt .nonalign_found0 @@ -137,7 +136,7 @@ ENTRY(strncpy) (p[MEMLAT]) mov value = r[MEMLAT] (p[MEMLAT]) czx1.r pos = r[MEMLAT] ;; (p[MEMLAT]) cmp.ne p7, p0 = 8, pos -(p[MEMLAT]) adds len = -8, len // len -= 8 +(p[MEMLAT]) adds len = -8, len /* len -= 8 */ (p7) br.cond.dpnt .found0 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 8 br.ctop.dptk .l3 ;; @@ -152,7 +151,7 @@ ENTRY(strncpy) (p5) br.cond.dptk .restore_and_exit ;; mov ar.lc = len .l4: -(p6) extr.u c = value, 0, 8 // c = value & 0xff +(p6) extr.u c = value, 0, 8 /* c = value & 0xff */ (p6) shr.u value = value, 8 ;; st1 [dest] = c, 1 cmp.ne p6, p0 = c, r0 @@ -165,7 +164,7 @@ ENTRY(strncpy) mov value = 0 ;; .found0: shl tmp = pos, 3 - shr.u loopcnt = len, 4 // loopcnt = len / 16 + shr.u loopcnt = len, 4 /* loopcnt = len / 16 */ mov c = -1 ;; cmp.eq p6, p0 = loopcnt, r0 adds loopcnt = -1, loopcnt @@ -192,24 +191,24 @@ ENTRY(strncpy) st1 [dest] = r0, 1 br.cloop.dptk .l7 ;; .restore_and_exit: - mov ar.lc = saved_lc // restore the loop counter - mov pr = saved_pr, -1 // restore the predicate registers + mov ar.lc = saved_lc /* restore the loop counter */ + mov pr = saved_pr, -1 /* restore the predicate registers */ br.ret.sptk.many b0 .short_len: cmp.eq p5, p0 = in2, r0 adds loopcnt = -1, in2 (p5) br.cond.spnt .restore_and_exit ;; - mov ar.lc = loopcnt // p6 should be set when we get here + mov ar.lc = loopcnt /* p6 should be set when we get here */ .l8: -(p6) ld1 c = [in1], 1 // c = *src++ +(p6) ld1 c = [in1], 1 /* c = *src++ */ ;; - st1 [in0] = c, 1 // *dest++ = c + st1 [in0] = c, 1 /* *dest++ = c */ (p6) cmp.ne p6, p0 = c, r0 br.cloop.dptk .l8 ;; - mov ar.lc = saved_lc // restore the loop counter - mov pr = saved_pr, -1 // restore the predicate registers + mov ar.lc = saved_lc /* restore the loop counter */ + mov pr = saved_pr, -1 /* restore the predicate registers */ br.ret.sptk.many b0 .recovery2: add c = 8, len diff --git a/libc/string/ia64/sysdep.h b/libc/string/ia64/sysdep.h deleted file mode 100644 index d10020ac1..000000000 --- a/libc/string/ia64/sysdep.h +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (C) 1999, 2000, 2002, 2003, 2004 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Written by Jes Sorensen, <Jes.Sorensen@cern.ch>, April 1999. - Based on code originally written by David Mosberger-Tang - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. 
*/ - -#ifndef _LINUX_IA64_SYSDEP_H -#define _LINUX_IA64_SYSDEP_H 1 - -#include <features.h> -#include <asm/unistd.h> - -#ifdef __ASSEMBLER__ - -/* Macros to help writing .prologue directives in assembly code. */ -#define ASM_UNW_PRLG_RP 0x8 -#define ASM_UNW_PRLG_PFS 0x4 -#define ASM_UNW_PRLG_PSP 0x2 -#define ASM_UNW_PRLG_PR 0x1 -#define ASM_UNW_PRLG_GRSAVE(ninputs) (32+(ninputs)) - -#ifdef __STDC__ -#define C_LABEL(name) name : -#else -#define C_LABEL(name) name/**/: -#endif - -#define CALL_MCOUNT - -#define ENTRY(name) \ - .text; \ - .align 32; \ - .proc C_SYMBOL_NAME(name); \ - .global C_SYMBOL_NAME(name); \ - C_LABEL(name) \ - CALL_MCOUNT - -#define LEAF(name) \ - .text; \ - .align 32; \ - .proc C_SYMBOL_NAME(name); \ - .global name; \ - C_LABEL(name) - -/* Mark the end of function SYM. */ -#undef END -#define END(sym) .endp C_SYMBOL_NAME(sym) - -/* For Linux we can use the system call table in the header file - /usr/include/asm/unistd.h - of the kernel. But these symbols do not follow the SYS_* syntax - so we have to redefine the `SYS_ify' macro here. */ -#undef SYS_ify -#ifdef __STDC__ -# define SYS_ify(syscall_name) __NR_##syscall_name -#else -# define SYS_ify(syscall_name) __NR_/**/syscall_name -#endif - -/* Linux uses a negative return value to indicate syscall errors, unlike - most Unices, which use the condition codes' carry flag. - - Since version 2.1 the return value of a system call might be negative - even if the call succeeded. E.g., the `lseek' system call might return - a large offset. Therefore we must not anymore test for < 0, but test - for a real error by making sure the value in %d0 is a real error - number. Linus said he will make sure the no syscall returns a value - in -1 .. -4095 as a valid result so we can savely test with -4095. */ - -/* We don't want the label for the error handler to be visible in the symbol - table when we define it here. */ -#define SYSCALL_ERROR_LABEL __syscall_error - -#undef PSEUDO -#define PSEUDO(name, syscall_name, args) \ - ENTRY(name) \ - DO_CALL (SYS_ify(syscall_name)); \ - cmp.eq p6,p0=-1,r10; \ -(p6) br.cond.spnt.few __syscall_error; - -#define DO_CALL_VIA_BREAK(num) \ - mov r15=num; \ - break __BREAK_SYSCALL - -#ifdef IA64_USE_NEW_STUB -# ifdef SHARED -# define DO_CALL(num) \ - .prologue; \ - adds r2 = SYSINFO_OFFSET, r13;; \ - ld8 r2 = [r2]; \ - .save ar.pfs, r11; \ - mov r11 = ar.pfs;; \ - .body; \ - mov r15 = num; \ - mov b7 = r2; \ - br.call.sptk.many b6 = b7;; \ - .restore sp; \ - mov ar.pfs = r11; \ - .prologue; \ - .body -# else /* !SHARED */ -# define DO_CALL(num) \ - .prologue; \ - mov r15 = num; \ - movl r2 = _dl_sysinfo;; \ - ld8 r2 = [r2]; \ - .save ar.pfs, r11; \ - mov r11 = ar.pfs;; \ - .body; \ - mov b7 = r2; \ - br.call.sptk.many b6 = b7;; \ - .restore sp; \ - mov ar.pfs = r11; \ - .prologue; \ - .body -# endif -#else -# define DO_CALL(num) DO_CALL_VIA_BREAK(num) -#endif - -#undef PSEUDO_END -#define PSEUDO_END(name) .endp C_SYMBOL_NAME(name); - -#undef PSEUDO_NOERRNO -#define PSEUDO_NOERRNO(name, syscall_name, args) \ - ENTRY(name) \ - DO_CALL (SYS_ify(syscall_name)); - -#undef PSEUDO_END_NOERRNO -#define PSEUDO_END_NOERRNO(name) .endp C_SYMBOL_NAME(name); - -#undef PSEUDO_ERRVAL -#define PSEUDO_ERRVAL(name, syscall_name, args) \ - ENTRY(name) \ - DO_CALL (SYS_ify(syscall_name)); \ - cmp.eq p6,p0=-1,r10; \ -(p6) mov r10=r8; - - -#undef PSEUDO_END_ERRVAL -#define PSEUDO_END_ERRVAL(name) .endp C_SYMBOL_NAME(name); - -#undef END -#define END(name) \ - .size C_SYMBOL_NAME(name), . 
- C_SYMBOL_NAME(name) ; \ - .endp C_SYMBOL_NAME(name) - -#define ret br.ret.sptk.few b0 -#define ret_NOERRNO ret -#define ret_ERRVAL ret - -#endif /* not __ASSEMBLER__ */ - -#endif /* linux/ia64/sysdep.h */ diff --git a/libc/string/memchr.c b/libc/string/memchr.c index 413999722..99e13a2fc 100644 --- a/libc/string/memchr.c +++ b/libc/string/memchr.c @@ -10,31 +10,23 @@ #ifdef WANT_WIDE # define Wmemchr wmemchr #else +# undef memchr # define Wmemchr memchr #endif -libc_hidden_proto(Wmemchr) - Wvoid *Wmemchr(const Wvoid *s, Wint c, size_t n) { register const Wuchar *r = (const Wuchar *) s; -#ifdef __BCC__ - /* bcc can optimize the counter if it thinks it is a pointer... */ - register const char *np = (const char *) n; -#else -# define np n -#endif - while (np) { + while (n) { if (*r == ((Wuchar)c)) { return (Wvoid *) r; /* silence the warning */ } ++r; - --np; + --n; } return NULL; } -#undef np libc_hidden_def(Wmemchr) diff --git a/libc/string/memcmp.c b/libc/string/memcmp.c index 762fc23c1..6cb37f417 100644 --- a/libc/string/memcmp.c +++ b/libc/string/memcmp.c @@ -10,7 +10,6 @@ #ifdef WANT_WIDE # define Wmemcmp wmemcmp #else -/* Experimentally off - libc_hidden_proto(memcmp) */ # define Wmemcmp memcmp #endif diff --git a/libc/string/memcpy.c b/libc/string/memcpy.c index dc2986778..42436e0b6 100644 --- a/libc/string/memcpy.c +++ b/libc/string/memcpy.c @@ -10,26 +10,19 @@ #ifdef WANT_WIDE # define Wmemcpy wmemcpy #else +# undef memcpy # define Wmemcpy memcpy #endif -libc_hidden_proto(Wmemcpy) - Wvoid *Wmemcpy(Wvoid * __restrict s1, const Wvoid * __restrict s2, size_t n) { register Wchar *r1 = s1; register const Wchar *r2 = s2; -#ifdef __BCC__ - while (n--) { - *r1++ = *r2++; - } -#else while (n) { *r1++ = *r2++; --n; } -#endif return s1; } diff --git a/libc/string/memmem.c b/libc/string/memmem.c index 9dcd4c4c0..1b3a0bab6 100644 --- a/libc/string/memmem.c +++ b/libc/string/memmem.c @@ -8,7 +8,6 @@ #include "_string.h" #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(memmem) */ void *memmem(const void *haystack, size_t haystacklen, const void *needle, size_t needlelen) { @@ -38,5 +37,4 @@ void *memmem(const void *haystack, size_t haystacklen, return NULL; } -libc_hidden_def(memmem) #endif diff --git a/libc/string/memmove.c b/libc/string/memmove.c index 0bea9b497..b768b6ea9 100644 --- a/libc/string/memmove.c +++ b/libc/string/memmove.c @@ -10,30 +10,11 @@ #ifdef WANT_WIDE # define Wmemmove wmemmove #else -/* Experimentally off - libc_hidden_proto(memmove) */ # define Wmemmove memmove #endif Wvoid *Wmemmove(Wvoid *s1, const Wvoid *s2, size_t n) { -#ifdef __BCC__ - register Wchar *s = (Wchar *) s1; - register const Wchar *p = (const Wchar *) s2; - - if (p >= s) { - while (n--) { - *s++ = *p++; - } - } else { - s += n; - p += n; - while (n--) { - *--s = *--p; - } - } - - return s1; -#else register Wchar *s = (Wchar *) s1; register const Wchar *p = (const Wchar *) s2; @@ -50,9 +31,8 @@ Wvoid *Wmemmove(Wvoid *s1, const Wvoid *s2, size_t n) } return s1; -#endif } #ifndef WANT_WIDE -libc_hidden_def(Wmemmove) +libc_hidden_def(memmove) #endif diff --git a/libc/string/mempcpy.c b/libc/string/mempcpy.c index 91896434b..d1d752b50 100644 --- a/libc/string/mempcpy.c +++ b/libc/string/mempcpy.c @@ -12,26 +12,19 @@ #ifdef WANT_WIDE # define Wmempcpy wmempcpy #else +# undef mempcpy # define Wmempcpy mempcpy #endif -libc_hidden_proto(Wmempcpy) - Wvoid *Wmempcpy(Wvoid * __restrict s1, const Wvoid * __restrict s2, size_t n) { register Wchar *r1 = s1; register const Wchar *r2 = s2; -#ifdef __BCC__ - 
while (n--) { - *r1++ = *r2++; - } -#else while (n) { *r1++ = *r2++; --n; } -#endif return r1; } diff --git a/libc/string/memrchr.c b/libc/string/memrchr.c index 48ec50a4e..60211f804 100644 --- a/libc/string/memrchr.c +++ b/libc/string/memrchr.c @@ -8,31 +8,21 @@ #include "_string.h" #ifdef __USE_GNU - -/* Experimentally off - libc_hidden_proto(memrchr) */ - void *memrchr(const void *s, int c, size_t n) { register const unsigned char *r; -#ifdef __BCC__ - /* bcc can optimize the counter if it thinks it is a pointer... */ - register const char *np = (const char *) n; -#else -#define np n -#endif - r = ((unsigned char *)s) + ((size_t) np); + r = ((unsigned char *)s) + ((size_t) n); - while (np) { + while (n) { if (*--r == ((unsigned char)c)) { return (void *) r; /* silence the warning */ } - --np; + --n; } return NULL; } -#undef np libc_hidden_def(memrchr) #endif diff --git a/libc/string/memset.c b/libc/string/memset.c index 6dd20d668..2a7c19dee 100644 --- a/libc/string/memset.c +++ b/libc/string/memset.c @@ -10,28 +10,21 @@ #ifdef WANT_WIDE # define Wmemset wmemset #else -/* Experimentally off - libc_hidden_proto(memset) */ +# undef memset # define Wmemset memset #endif Wvoid *Wmemset(Wvoid *s, Wint c, size_t n) { register Wuchar *p = (Wuchar *) s; -#ifdef __BCC__ - /* bcc can optimize the counter if it thinks it is a pointer... */ - register const char *np = (const char *) n; -#else -# define np n -#endif - while (np) { + while (n) { *p++ = (Wuchar) c; - --np; + --n; } return s; } -#undef np #ifndef WANT_WIDE libc_hidden_def(memset) diff --git a/libc/string/metag/Makefile b/libc/string/metag/Makefile new file mode 100644 index 000000000..523cf6842 --- /dev/null +++ b/libc/string/metag/Makefile @@ -0,0 +1,13 @@ +# Makefile for uClibc +# +# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org> +# +# Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. +# + +top_srcdir:=../../../ +top_builddir:=../../../ +all: objs +include $(top_builddir)Rules.mak +include ../Makefile.in +include $(top_srcdir)Makerules diff --git a/libc/string/metag/memchr.S b/libc/string/metag/memchr.S new file mode 100644 index 000000000..8b48d863c --- /dev/null +++ b/libc/string/metag/memchr.S @@ -0,0 +1,156 @@ +! Copyright (C) 2013 Imagination Technologies Ltd. +! +! Licensed under LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + + .text + .global _memchr + .type _memchr,function +! D0Ar6 src +! D0Ar2 c +! D1Ar3 n +_memchr: + CMP D1Ar3, #0 + BEQ $Lexit_fail + !! convert c to unsigned char + AND D0Ar2,D0Ar2,#0xff + MOV D0Ar6, D1Ar1 + MOV D1Ar5, D0Ar6 + !! test alignment + AND D1Ar5, D1Ar5, #7 + CMP D1Ar5, #0 + BNZ $Lunaligned_loop + !! length must be greater than or equal to 8 for aligned loop + CMP D1Ar3, #8 + BGE $Laligned_setup +$Lunaligned_loop: + !! get 1 char from s + GETB D0Re0, [D0Ar6++] + !! increase alignment counter + ADD D1Ar5, D1Ar5, #1 + !! decrement n + SUB D1Ar3, D1Ar3, #1 + !! exit if we have a match + CMP D0Re0, D0Ar2 + BZ $Lexit_success1 + !! exit if we have hit the end of the string + CMP D1Ar3, #0 + BZ $Lexit_fail + !! fall through if the buffer is aligned now + CMP D1Ar5, #8 + BNE $Lunaligned_loop + !! fall through if there is more than 8 bytes left + CMP D1Ar3, #8 + BLT $Lunaligned_loop +$Laligned_setup: + !! fill the c into 4 bytes + MOV D0Ar4, D0Ar2 + LSL D0Ar4, D0Ar4, #8 + ADD D0Ar4, D0Ar4, D0Ar2 + LSL D0Ar4, D0Ar4, #8 + ADD D0Ar4, D0Ar4, D0Ar2 + LSL D0Ar4, D0Ar4, #8 + ADD D0Ar4, D0Ar4, D0Ar2 + !! 
divide n by 8 + MOV D1Ar5, D1Ar3 + LSR D1Ar5, D1Ar5, #3 +$Laligned_loop: + !! get 8 chars from s + GETL D0Re0, D1Re0, [D0Ar6++] + !! decrement loop counter + SUB D1Ar5, D1Ar5, #1 + !! test first 4 chars + XOR D0Re0, D0Re0, D0Ar4 + !! test second 4 chars + MOV D0Ar2, D1Re0 + XOR D1Re0, D0Ar2, D0Ar4 + !! check for matches in the first 4 chars + MOV D0Ar2, D0Re0 + ADDT D0Re0, D0Re0, #HI(0xfefefeff) + ADD D0Re0, D0Re0, #LO(0xfefefeff) + XOR D0Ar2, D0Ar2, #-1 + AND D0Re0, D0Re0, D0Ar2 + ANDMT D0Re0, D0Re0, #HI(0x80808080) + ANDMB D0Re0, D0Re0, #LO(0x80808080) + CMP D0Re0, #0 + BNZ $Lmatch_word1 + !! check for matches in the second 4 chars + MOV D1Ar1, D1Re0 + ADDT D1Re0, D1Re0, #HI(0xfefefeff) + ADD D1Re0, D1Re0, #LO(0xfefefeff) + XOR D1Ar1, D1Ar1, #-1 + AND D1Re0, D1Re0, D1Ar1 + ANDMT D1Re0, D1Re0, #HI(0x80808080) + ANDMB D1Re0, D1Re0, #LO(0x80808080) + CMP D1Re0, #0 + BNZ $Lmatch_word2 + !! check if we have reached the end of the buffer + CMP D1Ar5, #0 + BNE $Laligned_loop + !! exit if there are no chars left to check + AND D1Ar3, D1Ar3, #7 + CMP D1Ar3, #0 + BZ $Lexit_fail + !! recover c + AND D0Ar2, D0Ar4, #0xff +$Lbyte_loop: + !! get 1 char from s + GETB D0Re0, [D0Ar6++] + !! decrement n + SUB D1Ar3, D1Ar3, #1 + !! exit if we have a match + CMP D0Re0, D0Ar2 + BZ $Lexit_success1 + !! fall through if we have run out of chars + CMP D1Ar3, #0 + BNE $Lbyte_loop + +$Lexit_fail: + MOV D0Re0, #0 + B $Lend + +$Lmatch_word1: + !! move the match word into D1Re0 + MOV D1Re0, D0Re0 + !! roll back the buffer pointer by 4 chars + SUB D0Ar6, D0Ar6, #4 +$Lmatch_word2: + !! roll back the buffer pointer by 4 chars + SUB D0Ar6, D0Ar6, #4 + !! exit if lowest byte is 0 + MOV D1Ar1, D1Re0 + AND D1Ar1, D1Ar1, #0xff + CMP D1Ar1, #0 + BNE $Lexit_success2 + !! advance buffer pointer to the next char + ADD D0Ar6, D0Ar6, #1 + !! shift in the next lowest byte + LSR D1Re0, D1Re0, #8 + !! exit if lowest byte is 0 + MOV D1Ar1, D1Re0 + AND D1Ar1, D1Ar1, #0xff + CMP D1Ar1, #0 + BNE $Lexit_success2 + !! advance buffer pointer to the next char + ADD D0Ar6, D0Ar6, #1 + !! shift in the next lowest byte + LSR D1Re0, D1Re0, #8 + !! exit if lowest byte is 0 + MOV D1Ar1, D1Re0 + AND D1Ar1, D1Ar1, #0xff + CMP D1Ar1, #0 + BNE $Lexit_success2 + !! the match must be in the last byte, exit + ADD D0Ar6, D0Ar6, #1 + B $Lexit_success2 + +$Lexit_success1: + SUB D0Ar6, D0Ar6, #1 +$Lexit_success2: + !! return the buffer pointer + MOV D0Re0, D0Ar6 +$Lend: + MOV PC, D1RtP + + .size _memchr,.-_memchr + +libc_hidden_def(memchr) diff --git a/libc/string/metag/memcpy.S b/libc/string/metag/memcpy.S new file mode 100644 index 000000000..f96c9d131 --- /dev/null +++ b/libc/string/metag/memcpy.S @@ -0,0 +1,189 @@ +! Copyright (C) 2013 Imagination Technologies Ltd. + +! Licensed under LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + + .text + .global _memcpy + .type _memcpy,function +! D1Ar1 dst +! D0Ar2 src +! D1Ar3 cnt +! D0Re0 dst +_memcpy: + CMP D1Ar3, #16 + MOV A1.2, D0Ar2 ! source pointer + MOV A0.2, D1Ar1 ! destination pointer + MOV A0.3, D1Ar1 ! for return value +! If there are less than 16 bytes to copy use the byte copy loop + BGE $Llong_copy + +$Lbyte_copy: +! Simply copy a byte at a time + SUBS TXRPT, D1Ar3, #1 + BLT $Lend +$Lloop_byte: + GETB D1Re0, [A1.2++] + SETB [A0.2++], D1Re0 + BR $Lloop_byte + +$Lend: +! Finally set return value and return + MOV D0Re0, A0.3 + MOV PC, D1RtP + +$Llong_copy: + ANDS D1Ar5, D1Ar1, #7 ! test destination alignment + BZ $Laligned_dst + +! The destination address is not 8 byte aligned. 
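The aligned loop of the metag memchr above (and the strchr, strcmp and strcpy routines added later in this patch) finds a NUL or matching byte inside a whole register with the usual carry-propagation trick: XOR the word with the search byte replicated into every lane, then test (w + 0xfefefeff) & ~w & 0x80808080, which is non-zero exactly when some byte of w is zero. A minimal C sketch of that test, assuming 32-bit words and the same byte-at-a-time handling of unaligned heads and tails as the assembly; the function names are illustrative only:

#include <stddef.h>
#include <stdint.h>

/* Non-zero iff some byte of x is 0x00: subtracting 1 from every byte
 * borrows through a zero byte, and the mask keeps only real borrows. */
static uint32_t has_zero_byte(uint32_t x)
{
	return (x - 0x01010101u) & ~x & 0x80808080u;
}

void *memchr_sketch(const void *s, int c, size_t n)
{
	const unsigned char *p = s;
	uint32_t pat = 0x01010101u * (unsigned char)c;	/* c in every byte */

	while (((uintptr_t)p & 3) && n) {		/* align to a word */
		if (*p == (unsigned char)c)
			return (void *)p;
		++p; --n;
	}
	while (n >= 4) {				/* one word per iteration */
		uint32_t w = *(const uint32_t *)p ^ pat; /* matching bytes become 0x00 */
		if (has_zero_byte(w))
			break;				/* match is within this word */
		p += 4; n -= 4;
	}
	while (n) {					/* locate it (or finish) bytewise */
		if (*p == (unsigned char)c)
			return (void *)p;
		++p; --n;
	}
	return NULL;
}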
We will copy bytes from +! the source to the destination until the remaining data has an 8 byte +! destination address alignment (i.e we should never copy more than 7 +! bytes here). +$Lalign_dst: + GETB D0Re0, [A1.2++] + ADD D1Ar5, D1Ar5, #1 ! dest is aligned when D1Ar5 reaches #8 + SUB D1Ar3, D1Ar3, #1 ! decrement count of remaining bytes + SETB [A0.2++], D0Re0 + CMP D1Ar5, #8 + BNE $Lalign_dst + +! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte +! blocks, then jump to the unaligned copy loop or fall through to the aligned +! copy loop as appropriate. +$Laligned_dst: + MOV D0Ar4, A1.2 + LSR D1Ar5, D1Ar3, #3 ! D1Ar5 = number of 8 byte blocks + ANDS D0Ar4, D0Ar4, #7 ! test source alignment + BNZ $Lunaligned_copy ! if unaligned, use unaligned copy loop + +! Both source and destination are 8 byte aligned - the easy case. +$Laligned_copy: + LSRS D1Ar5, D1Ar3, #5 ! D1Ar5 = number of 32 byte blocks + BZ $Lbyte_copy + SUB TXRPT, D1Ar5, #1 + +$Laligned_32: + GETL D0Re0, D1Re0, [A1.2++] + GETL D0Ar6, D1Ar5, [A1.2++] + SETL [A0.2++], D0Re0, D1Re0 + SETL [A0.2++], D0Ar6, D1Ar5 + GETL D0Re0, D1Re0, [A1.2++] + GETL D0Ar6, D1Ar5, [A1.2++] + SETL [A0.2++], D0Re0, D1Re0 + SETL [A0.2++], D0Ar6, D1Ar5 + BR $Laligned_32 + +! If there are any remaining bytes use the byte copy loop, otherwise we are done + ANDS D1Ar3, D1Ar3, #0x1f + BNZ $Lbyte_copy + B $Lend + +! The destination is 8 byte aligned but the source is not, and there are 8 +! or more bytes to be copied. +$Lunaligned_copy: +! Adjust the source pointer (A1.2) to the 8 byte boundary before its +! current value + MOV D0Ar4, A1.2 + MOV D0Ar6, A1.2 + ANDMB D0Ar4, D0Ar4, #0xfff8 + MOV A1.2, D0Ar4 +! Save the number of bytes of mis-alignment in D0Ar4 for use later + SUBS D0Ar6, D0Ar6, D0Ar4 + MOV D0Ar4, D0Ar6 +! if there is no mis-alignment after all, use the aligned copy loop + BZ $Laligned_copy + +! prefetch 8 bytes + GETL D0Re0, D1Re0, [A1.2] + + SUB TXRPT, D1Ar5, #1 + +! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly +! 4 bytes, and more than 4 bytes. + CMP D0Ar6, #4 + BLT $Lunaligned_1_2_3 ! use 1-3 byte mis-alignment loop + BZ $Lunaligned_4 ! use 4 byte mis-alignment loop + +! The mis-alignment is more than 4 bytes +$Lunaligned_5_6_7: + SUB D0Ar6, D0Ar6, #4 +! Calculate the bit offsets required for the shift operations necesssary +! to align the data. +! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset) + MULW D0Ar6, D0Ar6, #8 + MOV D1Ar5, #32 + SUB D1Ar5, D1Ar5, D0Ar6 +! Move data 4 bytes before we enter the main loop + MOV D0Re0, D1Re0 + +$Lloop_5_6_7: + GETL D0Ar2, D1Ar1, [++A1.2] +! form 64-bit data in D0Re0, D1Re0 + LSR D0Re0, D0Re0, D0Ar6 + MOV D1Re0, D0Ar2 + LSL D1Re0, D1Re0, D1Ar5 + ADD D0Re0, D0Re0, D1Re0 + + LSR D0Ar2, D0Ar2, D0Ar6 + LSL D1Re0, D1Ar1, D1Ar5 + ADD D1Re0, D1Re0, D0Ar2 + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D1Ar1 + BR $Lloop_5_6_7 + + B $Lunaligned_end + +$Lunaligned_1_2_3: +! Calculate the bit offsets required for the shift operations necesssary +! to align the data. +! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset) + MULW D0Ar6, D0Ar6, #8 + MOV D1Ar5, #32 + SUB D1Ar5, D1Ar5, D0Ar6 + +$Lloop_1_2_3: +! 
form 64-bit data in D0Re0,D1Re0 + LSR D0Re0, D0Re0, D0Ar6 + LSL D1Ar1, D1Re0, D1Ar5 + ADD D0Re0, D0Re0, D1Ar1 + MOV D0Ar2, D1Re0 + LSR D0FrT, D0Ar2, D0Ar6 + GETL D0Ar2, D1Ar1, [++A1.2] + + MOV D1Re0, D0Ar2 + LSL D1Re0, D1Re0, D1Ar5 + ADD D1Re0, D1Re0, D0FrT + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0Ar2 + MOV D1Re0, D1Ar1 + BR $Lloop_1_2_3 + + B $Lunaligned_end + +! The 4 byte mis-alignment case - this does not require any shifting, just a +! shuffling of registers. +$Lunaligned_4: + MOV D0Re0, D1Re0 +$Lloop_4: + GETL D0Ar2, D1Ar1, [++A1.2] + MOV D1Re0, D0Ar2 + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D1Ar1 + BR $Lloop_4 + +$Lunaligned_end: +! If there are no remaining bytes to copy, we are done. + ANDS D1Ar3, D1Ar3, #7 + BZ $Lend +! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte +! address of the remaining bytes, and fall through to the byte copy loop. + MOV D0Ar6, A1.2 + ADD D1Ar5, D0Ar4, D0Ar6 + MOV A1.2, D1Ar5 + B $Lbyte_copy + + .size _memcpy,.-_memcpy + +libc_hidden_def(memcpy) diff --git a/libc/string/metag/memmove.S b/libc/string/metag/memmove.S new file mode 100644 index 000000000..3416fd558 --- /dev/null +++ b/libc/string/metag/memmove.S @@ -0,0 +1,350 @@ +! Copyright (C) 2013 Imagination Technologies Ltd. + +! Licensed under LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + + + .text + .global _memmove + .type _memmove,function +! D1Ar1 dst +! D0Ar2 src +! D1Ar3 cnt +! D0Re0 dst +_memmove: + CMP D1Ar3, #0 + MOV D0Re0, D1Ar1 + BZ $LEND2 + MSETL [A0StP], D0.5, D0.6, D0.7 + MOV D1Ar5, D0Ar2 + CMP D1Ar1, D1Ar5 + BLT $Lforwards_copy + SUB D0Ar4, D1Ar1, D1Ar3 + ADD D0Ar4, D0Ar4, #1 + CMP D0Ar2, D0Ar4 + BLT $Lforwards_copy + ! should copy backwards + MOV D1Re0, D0Ar2 + ! adjust pointer to the end of mem + ADD D0Ar2, D1Re0, D1Ar3 + ADD D1Ar1, D1Ar1, D1Ar3 + + MOV A1.2, D0Ar2 + MOV A0.2, D1Ar1 + CMP D1Ar3, #8 + BLT $Lbbyte_loop + + MOV D0Ar4, D0Ar2 + MOV D1Ar5, D1Ar1 + + ! test 8 byte alignment + ANDS D1Ar5, D1Ar5, #7 + BNE $Lbdest_unaligned + + ANDS D0Ar4, D0Ar4, #7 + BNE $Lbsrc_unaligned + + LSR D1Ar5, D1Ar3, #3 + +$Lbaligned_loop: + GETL D0Re0, D1Re0, [--A1.2] + SETL [--A0.2], D0Re0, D1Re0 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbaligned_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit +$Lbbyte_loop: + GETB D1Re0, [--A1.2] + SETB [--A0.2], D1Re0 + SUBS D1Ar3, D1Ar3, #1 + BNE $Lbbyte_loop +$Lbbyte_loop_exit: + MOV D0Re0, A0.2 +$LEND: + SUB A0.2, A0StP, #24 + MGETL D0.5, D0.6, D0.7, [A0.2] + SUB A0StP, A0StP, #24 +$LEND2: + MOV PC, D1RtP + +$Lbdest_unaligned: + GETB D0Re0, [--A1.2] + SETB [--A0.2], D0Re0 + SUBS D1Ar5, D1Ar5, #1 + SUB D1Ar3, D1Ar3, #1 + BNE $Lbdest_unaligned + CMP D1Ar3, #8 + BLT $Lbbyte_loop +$Lbsrc_unaligned: + LSR D1Ar5, D1Ar3, #3 + ! adjust A1.2 + MOV D0Ar4, A1.2 + ! save original address + MOV D0Ar6, A1.2 + + ADD D0Ar4, D0Ar4, #7 + ANDMB D0Ar4, D0Ar4, #0xfff8 + ! new address is the 8-byte aligned one above the original + MOV A1.2, D0Ar4 + + ! A0.2 dst 64-bit is aligned + ! measure the gap size + SUB D0Ar6, D0Ar4, D0Ar6 + MOVS D0Ar4, D0Ar6 + ! keep this information for the later adjustment + ! both aligned + BZ $Lbaligned_loop + + ! prefetch + GETL D0Re0, D1Re0, [--A1.2] + + CMP D0Ar6, #4 + BLT $Lbunaligned_1_2_3 + ! 32-bit aligned + BZ $Lbaligned_4 + + SUB D0Ar6, D0Ar6, #4 + ! D1.6 stores the gap size in bits + MULW D1.6, D0Ar6, #8 + MOV D0.6, #32 + ! D0.6 stores the complement of the gap size + SUB D0.6, D0.6, D1.6 + +$Lbunaligned_5_6_7: + GETL D0.7, D1.7, [--A1.2] + ! form 64-bit data in D0Re0, D1Re0 + MOV D1Re0, D0Re0 + ! 
D1Re0 << gap-size + LSL D1Re0, D1Re0, D1.6 + MOV D0Re0, D1.7 + ! D0Re0 >> complement + LSR D0Re0, D0Re0, D0.6 + MOV D1.5, D0Re0 + ! combine the both + ADD D1Re0, D1Re0, D1.5 + + MOV D1.5, D1.7 + LSL D1.5, D1.5, D1.6 + MOV D0Re0, D0.7 + LSR D0Re0, D0Re0, D0.6 + MOV D0.5, D1.5 + ADD D0Re0, D0Re0, D0.5 + + SETL [--A0.2], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbunaligned_5_6_7 + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit + ! Adjust A1.2 + ! A1.2 <- A1.2 +8 - gapsize + ADD A1.2, A1.2, #8 + SUB A1.2, A1.2, D0Ar4 + B $Lbbyte_loop + +$Lbunaligned_1_2_3: + MULW D1.6, D0Ar6, #8 + MOV D0.6, #32 + SUB D0.6, D0.6, D1.6 + +$Lbunaligned_1_2_3_loop: + GETL D0.7, D1.7, [--A1.2] + ! form 64-bit data in D0Re0, D1Re0 + LSL D1Re0, D1Re0, D1.6 + ! save D0Re0 for later use + MOV D0.5, D0Re0 + LSR D0Re0, D0Re0, D0.6 + MOV D1.5, D0Re0 + ADD D1Re0, D1Re0, D1.5 + + ! orignal data in D0Re0 + MOV D1.5, D0.5 + LSL D1.5, D1.5, D1.6 + MOV D0Re0, D1.7 + LSR D0Re0, D0Re0, D0.6 + MOV D0.5, D1.5 + ADD D0Re0, D0Re0, D0.5 + + SETL [--A0.2], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbunaligned_1_2_3_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, #8 + SUB A1.2, A1.2, D0Ar4 + B $Lbbyte_loop + +$Lbaligned_4: + GETL D0.7, D1.7, [--A1.2] + MOV D1Re0, D0Re0 + MOV D0Re0, D1.7 + SETL [--A0.2], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lbaligned_4 + ANDS D1Ar3, D1Ar3, #7 + BZ $Lbbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, #8 + SUB A1.2, A1.2, D0Ar4 + B $Lbbyte_loop + +$Lforwards_copy: + MOV A1.2, D0Ar2 + MOV A0.2, D1Ar1 + CMP D1Ar3, #8 + BLT $Lfbyte_loop + + MOV D0Ar4, D0Ar2 + MOV D1Ar5, D1Ar1 + + ANDS D1Ar5, D1Ar5, #7 + BNE $Lfdest_unaligned + + ANDS D0Ar4, D0Ar4, #7 + BNE $Lfsrc_unaligned + + LSR D1Ar5, D1Ar3, #3 + +$Lfaligned_loop: + GETL D0Re0, D1Re0, [A1.2++] + SUBS D1Ar5, D1Ar5, #1 + SETL [A0.2++], D0Re0, D1Re0 + BNE $Lfaligned_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit +$Lfbyte_loop: + GETB D1Re0, [A1.2++] + SETB [A0.2++], D1Re0 + SUBS D1Ar3, D1Ar3, #1 + BNE $Lfbyte_loop +$Lfbyte_loop_exit: + MOV D0Re0, D1Ar1 + B $LEND + +$Lfdest_unaligned: + GETB D0Re0, [A1.2++] + ADD D1Ar5, D1Ar5, #1 + SUB D1Ar3, D1Ar3, #1 + SETB [A0.2++], D0Re0 + CMP D1Ar5, #8 + BNE $Lfdest_unaligned + CMP D1Ar3, #8 + BLT $Lfbyte_loop +$Lfsrc_unaligned: + ! adjust A1.2 + LSR D1Ar5, D1Ar3, #3 + + MOV D0Ar4, A1.2 + MOV D0Ar6, A1.2 + ANDMB D0Ar4, D0Ar4, #0xfff8 + MOV A1.2, D0Ar4 + + ! A0.2 dst 64-bit is aligned + SUB D0Ar6, D0Ar6, D0Ar4 + ! keep the information for the later adjustment + MOVS D0Ar4, D0Ar6 + + ! both aligned + BZ $Lfaligned_loop + + ! prefetch + GETL D0Re0, D1Re0, [A1.2] + + CMP D0Ar6, #4 + BLT $Lfunaligned_1_2_3 + BZ $Lfaligned_4 + + SUB D0Ar6, D0Ar6, #4 + MULW D0.6, D0Ar6, #8 + MOV D1.6, #32 + SUB D1.6, D1.6, D0.6 + +$Lfunaligned_5_6_7: + GETL D0.7, D1.7, [++A1.2] + ! form 64-bit data in D0Re0, D1Re0 + MOV D0Re0, D1Re0 + LSR D0Re0, D0Re0, D0.6 + MOV D1Re0, D0.7 + LSL D1Re0, D1Re0, D1.6 + MOV D0.5, D1Re0 + ADD D0Re0, D0Re0, D0.5 + + MOV D0.5, D0.7 + LSR D0.5, D0.5, D0.6 + MOV D1Re0, D1.7 + LSL D1Re0, D1Re0, D1.6 + MOV D1.5, D0.5 + ADD D1Re0, D1Re0, D1.5 + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lfunaligned_5_6_7 + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit + ! 
Adjust A1.2 + ADD A1.2, A1.2, D0Ar4 + B $Lfbyte_loop + +$Lfunaligned_1_2_3: + MULW D0.6, D0Ar6, #8 + MOV D1.6, #32 + SUB D1.6, D1.6, D0.6 + +$Lfunaligned_1_2_3_loop: + GETL D0.7, D1.7, [++A1.2] + ! form 64-bit data in D0Re0, D1Re0 + LSR D0Re0, D0Re0, D0.6 + MOV D1.5, D1Re0 + LSL D1Re0, D1Re0, D1.6 + MOV D0.5, D1Re0 + ADD D0Re0, D0Re0, D0.5 + + MOV D0.5, D1.5 + LSR D0.5, D0.5, D0.6 + MOV D1Re0, D0.7 + LSL D1Re0, D1Re0, D1.6 + MOV D1.5, D0.5 + ADD D1Re0, D1Re0, D1.5 + + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lfunaligned_1_2_3_loop + + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, D0Ar4 + B $Lfbyte_loop + +$Lfaligned_4: + GETL D0.7, D1.7, [++A1.2] + MOV D0Re0, D1Re0 + MOV D1Re0, D0.7 + SETL [A0.2++], D0Re0, D1Re0 + MOV D0Re0, D0.7 + MOV D1Re0, D1.7 + SUBS D1Ar5, D1Ar5, #1 + BNE $Lfaligned_4 + ANDS D1Ar3, D1Ar3, #7 + BZ $Lfbyte_loop_exit + ! Adjust A1.2 + ADD A1.2, A1.2, D0Ar4 + B $Lfbyte_loop + + .size _memmove,.-_memmove + +libc_hidden_def(memmove) diff --git a/libc/string/metag/memset.S b/libc/string/metag/memset.S new file mode 100644 index 000000000..8d4e9a158 --- /dev/null +++ b/libc/string/metag/memset.S @@ -0,0 +1,90 @@ +! Copyright (C) 2013 Imagination Technologies Ltd. + +! Licensed under LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + + + .text + .global _memset + .type _memset,function +! D1Ar1 dst +! D0Ar2 c +! D1Ar3 cnt +! D0Re0 dst +_memset: + AND D0Ar2,D0Ar2,#0xFF ! Ensure a byte input value + MULW D0Ar2,D0Ar2,#0x0101 ! Duplicate byte value into 0-15 + ANDS D0Ar4,D1Ar1,#7 ! Extract bottom LSBs of dst + LSL D0Re0,D0Ar2,#16 ! Duplicate byte value into 16-31 + ADD A0.2,D0Ar2,D0Re0 ! Duplicate byte value into 4 (A0.2) + MOV D0Re0,D1Ar1 ! Return dst + BZ $LLongStub ! if start address is aligned + ! start address is not aligned on an 8 byte boundary, so we + ! need the number of bytes up to the next 8 byte address + ! boundary, or the length of the string if less than 8, in D1Ar5 + MOV D0Ar2,#8 ! Need 8 - N in D1Ar5 ... + SUB D1Ar5,D0Ar2,D0Ar4 ! ... subtract N + CMP D1Ar3,D1Ar5 + MOVMI D1Ar5,D1Ar3 + B $LByteStub ! dst is mis-aligned, do $LByteStub + +! +! Preamble to LongLoop which generates 4*8 bytes per interation (5 cycles) +! +$LLongStub: + LSRS D0Ar2,D1Ar3,#5 + AND D1Ar3,D1Ar3,#0x1F + MOV A1.2,A0.2 + BEQ $LLongishStub + SUB TXRPT,D0Ar2,#1 + CMP D1Ar3,#0 +$LLongLoop: + SETL [D1Ar1++],A0.2,A1.2 + SETL [D1Ar1++],A0.2,A1.2 + SETL [D1Ar1++],A0.2,A1.2 + SETL [D1Ar1++],A0.2,A1.2 + BR $LLongLoop + BZ $Lexit +! +! Preamble to LongishLoop which generates 1*8 bytes per interation (2 cycles) +! +$LLongishStub: + LSRS D0Ar2,D1Ar3,#3 + AND D1Ar3,D1Ar3,#0x7 + MOV D1Ar5,D1Ar3 + BEQ $LByteStub + SUB TXRPT,D0Ar2,#1 + CMP D1Ar3,#0 +$LLongishLoop: + SETL [D1Ar1++],A0.2,A1.2 + BR $LLongishLoop + BZ $Lexit +! +! This does a byte structured burst of up to 7 bytes +! +! D1Ar1 should point to the location required +! D1Ar3 should be the remaining total byte count +! D1Ar5 should be burst size (<= D1Ar3) +! +$LByteStub: + SUBS D1Ar3,D1Ar3,D1Ar5 ! Reduce count + ADD D1Ar1,D1Ar1,D1Ar5 ! Advance pointer to end of area + MULW D1Ar5,D1Ar5,#4 ! Scale to (1*4), (2*4), (3*4) + SUB D1Ar5,D1Ar5,#(8*4) ! Rebase to -(7*4), -(6*4), -(5*4), ... + MOV A1.2,D1Ar5 + SUB PC,CPC1,A1.2 ! Jump into table below + SETB [D1Ar1+#(-7)],A0.2 + SETB [D1Ar1+#(-6)],A0.2 + SETB [D1Ar1+#(-5)],A0.2 + SETB [D1Ar1+#(-4)],A0.2 + SETB [D1Ar1+#(-3)],A0.2 + SETB [D1Ar1+#(-2)],A0.2 + SETB [D1Ar1+#(-1)],A0.2 +! +! 
Return if all data has been output, otherwise do $LLongStub +! + BNZ $LLongStub +$Lexit: + MOV PC,D1RtP + .size _memset,.-_memset + +libc_hidden_def(memset) diff --git a/libc/string/metag/strchr.S b/libc/string/metag/strchr.S new file mode 100644 index 000000000..6b0f2ea43 --- /dev/null +++ b/libc/string/metag/strchr.S @@ -0,0 +1,167 @@ +! Copyright (C) 2013 Imagination Technologies Ltd. + +! Licensed under LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + + +#include <features.h> + + .text + .global _strchr + .type _strchr, function +! D1Ar1 src +! D0Ar2 c +_strchr: + AND D0Ar2,D0Ar2,#0xff ! Drop all but 8 bits of c + MOV D1Ar5, D1Ar1 ! Copy src to D1Ar5 + AND D1Ar5, D1Ar5, #7 ! Check 64 bit alignment + CMP D1Ar5, #0 + BZ $Laligned64bit ! Jump to 64 bit aligned strchr +$Lalign64bit: + GETB D0Re0, [D1Ar1++] ! Get the next character + ADD D1Ar5, D1Ar5, #1 ! Increment alignment counter + CMP D0Re0, D0Ar2 ! Is the char c + BZ $Lcharatprevious ! If so exit returning position + CMP D0Re0, #0 ! End of string? + BZ $Lnotfound ! If so exit + CMP D1Ar5, #8 ! Are we aligned 64bit yet? + BNZ $Lalign64bit ! If not keep aligning +$Laligned64bit: ! src is 64bit aligned + MOV D0Ar4, D0Ar2 ! put c into D0Ar4 + LSL D0Ar4, D0Ar4, #8 ! Shift it up + ADD D0Ar4, D0Ar4, D0Ar2 ! another c + LSL D0Ar4, D0Ar4, #8 ! shift + ADD D0Ar4, D0Ar4, D0Ar2 ! another c + LSL D0Ar4, D0Ar4, #8 ! shift + ADD D0Ar4, D0Ar4, D0Ar2 ! 4 copies of c +$Lcheck8bytes: + GETL D0Re0, D1Re0, [D1Ar1++] ! grab 16 bytes + MOV A0.3, D0Re0 ! save for later use + ! first word + ! check for \0 + MOV D0Ar2, D0Re0 ! D0Ar2 is a scratch now + ADDT D0Re0, D0Re0, #HI(0xfefefeff) ! Do 4 1-byte compares + ADD D0Re0, D0Re0, #LO(0xfefefeff) + XOR D0Ar2, D0Ar2, #-1 + AND D0Re0, D0Re0, D0Ar2 + ANDMT D0Re0, D0Re0, #HI(0x80808080) + ANDMB D0Re0, D0Re0, #LO(0x80808080) + CMP D0Re0, #0 + BNZ $Lnullinword1 ! found \0 (or c if c==\0) + + ! Check for c + MOV D0Re0, A0.3 ! restore the first word + XOR D0Re0, D0Re0, D0Ar4 + MOV D0Ar2, D0Re0 ! DO 4 1-byte compares + ADDT D0Re0, D0Re0, #HI(0xfefefeff) + ADD D0Re0, D0Re0, #LO(0xfefefeff) + XOR D0Ar2, D0Ar2, #-1 + AND D0Re0, D0Re0, D0Ar2 + ANDMT D0Re0, D0Re0, #HI(0x80808080) + ANDMB D0Re0, D0Re0, #LO(0x80808080) + CMP D0Re0, #0 + BNZ $Lcharinword1 ! found c + + ! second word + ! check for \0 + MOV A0.3, D1Re0 ! save for later use + MOV D1Ar3, D1Re0 + ADDT D1Re0, D1Re0, #HI(0xfefefeff) ! Do 4 1-byte compares + ADD D1Re0, D1Re0, #LO(0xfefefeff) + XOR D1Ar3, D1Ar3, #-1 + AND D1Re0, D1Re0, D1Ar3 + ANDMT D1Re0, D1Re0, #HI(0x80808080) + ANDMB D1Re0, D1Re0, #LO(0x80808080) + CMP D1Re0, #0 + BNZ $Lnullinword2 ! Found \0 (or c if c==\0) + + MOV D0.4, A0.3 ! restore the second word + XOR D1Re0, D0.4, D0Ar4 ! test c + + MOV D1Ar3, D1Re0 + ADDT D1Re0, D1Re0, #HI(0xfefefeff) ! Do 4 1-byte compares + ADD D1Re0, D1Re0, #LO(0xfefefeff) + XOR D1Ar3, D1Ar3, #-1 + AND D1Re0, D1Re0, D1Ar3 + ANDMT D1Re0, D1Re0, #HI(0x80808080) + ANDMB D1Re0, D1Re0, #LO(0x80808080) + CMP D1Re0, #0 + BNZ $Lcharinword2 ! found c + + B $Lcheck8bytes ! Keep checking + +$Lnullinword1: ! found \0 somewhere, check for c too + SUB D1Ar1, D1Ar1, #4 +$Lnullinword2: + SUB D1Ar1, D1Ar1, #4 + AND D0Ar2, D0Ar4, #0xff ! restore c + MOV D0Re0, A0.3 ! restore the word + MOV D0.4, D0Re0 ! for shifting later + AND D0Re0, D0Re0, #0xff ! take first byte of word + CMP D0Re0, D0Ar2 + BZ $Lcharatcurrent ! found c + CMP D0Re0, #0! + BZ $Lnotfound ! found \0 + + ADD D1Ar1, D1Ar1, #1 + LSR D0.4, D0.4, #8 + MOV D0Re0, D0.4 + AND D0Re0, D0Re0, #0xff ! 
take second byte of word + CMP D0Re0, D0Ar2 + BZ $Lcharatcurrent ! found c + CMP D0Re0, #0 + BZ $Lnotfound ! found \0 + + ADD D1Ar1, D1Ar1, #1 + LSR D0.4, D0.4, #8 + MOV D0Re0, D0.4 + AND D0Re0, D0Re0, #0xff ! take third byte of word + CMP D0Re0, D0Ar2 + BZ $Lcharatcurrent ! found c + CMP D0Re0, #0 + BZ $Lnotfound ! found \0 + + ADD D1Ar1, D1Ar1, #1 ! move to 4th byte + CMP D0Ar2, #0 ! If c was \0 + BZ $Lcharatcurrent ! c has been found! + +$Lnotfound: + MOV D0Re0, #0 ! End of string c not found + B $Lend + +$Lcharinword1: ! found c in first word + MOV D1Re0, D0Re0 + SUB D1Ar1, D1Ar1, #4 +$Lcharinword2: ! found c in second word + SUB D1Ar1, D1Ar1, #4 + + AND D0Re0, D1Re0, #0xff ! First byte + CMP D0Re0, #0 ! Test c (zero indicates c due + ! to the 4 1-byte compare code) + BNE $Lcharatcurrent + ADD D1Ar1, D1Ar1, #1 + + LSR D1Re0, D1Re0, #8 + AND D0Re0, D1Re0, #0xff ! Second byte + CMP D0Re0, #0 ! Test c (indicated by zero) + BNE $Lcharatcurrent + ADD D1Ar1, D1Ar1, #1 + + LSR D1Re0, D1Re0, #8 + AND D0Re0, D1Re0, #0xff ! Third byte + CMP D0Re0, #0 ! Test c (indicated by zero) + BNE $Lcharatcurrent + ADD D1Ar1, D1Ar1, #1 ! Must be the fourth byte + B $Lcharatcurrent + +$Lcharatprevious: + SUB D1Ar1, D1Ar1, #1 ! Fix-up pointer +$Lcharatcurrent: + MOV D0Re0, D1Ar1 ! Return the string pointer +$Lend: + MOV PC, D1RtP + .size _strchr,.-_strchr + +libc_hidden_def(strchr) +#ifdef __UCLIBC_SUSV3_LEGACY__ +strong_alias(strchr,index) +#endif diff --git a/libc/string/metag/strcmp.S b/libc/string/metag/strcmp.S new file mode 100644 index 000000000..3278ffaa5 --- /dev/null +++ b/libc/string/metag/strcmp.S @@ -0,0 +1,65 @@ +! Copyright (C) 2013 Imagination Technologies Ltd. + +! Licensed under LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + + +#include <features.h> + + .text + .global _strcmp + .type _strcmp,function +!D1Ar1 s1 +!D0Ar2 s2 +_strcmp: + TST D1Ar1,#3 + TSTZ D0Ar2,#3 + MOVT D1Re0,#0x0101 + ADD D1Re0,D1Re0,#0x0101 + BNZ $Lstrcmp_slow + GETD D1Ar3,[D1Ar1+#4++] ! Load 32-bits from s1 + GETD D1Ar5,[D0Ar2+#4++] ! Load 32-bits from s2 + LSL D0FrT,D1Re0,#7 ! D0FrT = 0x80808080 +$Lstrcmp4_loop: + SUB D0Re0,D1Ar3,D1Re0 ! D1Re0 = 0x01010101 + MOV D0Ar6,D1Ar3 + SUBS D0Ar4,D1Ar3,D1Ar5 ! Calculate difference + XOR D0Ar6,D0Ar6,#-1 + GETD D1Ar3,[D1Ar1+#4++] ! Load 32-bits from s1 + AND D0Re0,D0Re0,D0Ar6 + ANDSZ D0Ar6,D0Re0,D0FrT ! D0FrT = 0x80808080 + GETD D1Ar5,[D0Ar2+#4++] ! Load 32-bits from s2 + BZ $Lstrcmp4_loop + AND D0Ar6, D0Re0, D0FrT ! D0FrT = 0x80808080 +! +! Either they are different or they both contain a NULL + junk +! +$Lstrcmp4_end: + LSLS D0Re0,D0Ar4,#24 ! Was Byte[0] the same? + LSLSZ D0Ar2,D0Ar6,#24 ! Yes: AND they where not zero? + LSLSZ D0Re0,D0Ar4,#16 ! Yes: Was Byte[1] the same? + LSLSZ D0Ar2,D0Ar6,#16 ! Yes: AND they where not zero? + LSLSZ D0Re0,D0Ar4,#8 ! Tes: Was Byte[2] the same? + LSLSZ D0Ar2,D0Ar6,#8 ! Yes: AND they where not zero? + MOVZ D0Re0,D0Ar4 ! Yes: Must by Byte[3] thats the result + ASR D0Re0,D0Re0,#24 ! Sign extend result to integer + MOV PC,D1RtP +! +! Misaligned case, byte at a time +! +$Lstrcmp_slow: + GETB D1Ar3,[D1Ar1++] ! Load char from s1 + GETB D1Ar5,[D0Ar2++] ! Load char from s2 + CMP D1Ar3,#1 ! Null -> C and NZ, rest -> NC (\1->Z) + CMPNC D1Ar3,D1Ar5 ! NOT Null: Same -> Z, else -> NZ + BZ $Lstrcmp_slow ! NOT Null and Same: Loop + SUB D0Re0,D1Ar3,D1Ar5 ! 
Generate result + MOV PC,D1RtP + + .size _strcmp,.-_strcmp + + +libc_hidden_def(strcmp) +#ifndef __UCLIBC_HAS_LOCALE__ +strong_alias(strcmp,strcoll) +libc_hidden_def(strcoll) +#endif diff --git a/libc/string/metag/strcpy.S b/libc/string/metag/strcpy.S new file mode 100644 index 000000000..529ac9279 --- /dev/null +++ b/libc/string/metag/strcpy.S @@ -0,0 +1,94 @@ +! Copyright (C) 2013 Imagination Technologies Ltd. + +! Licensed under LGPL v2.1 or later, see the file COPYING.LIB in this tarball. + + + .text + .global _strcpy + .type _strcpy,function +! D1Ar1 dst +! D0Ar2 src + +_strcpy: + MOV A1.2, D1Ar1 + + ! test 4 byte alignment of src + ANDS D0Ar4, D0Ar2, #3 + BNZ $Lbyteloop + + ! test 4 byte alignment of dest + ANDS D1Ar5, D1Ar1, #3 + BNZ $Lbyteloop + + ! load mask values for aligned loops + MOVT D1Ar3, #HI(0xfefefeff) + ADD D1Ar3, D1Ar3, #LO(0xfefefeff) + MOVT D0FrT, #HI(0x80808080) + ADD D0FrT, D0FrT, #LO(0x80808080) + + ! test 8 byte alignment of src + ANDS D0Ar4, D0Ar2, #7 + BNZ $Lwordloop + + ! test 8 byte alignment of dest + ANDS D1Ar5, D1Ar1, #7 + BNZ $Lwordloop + +$L8byteloop: + GETL D1Ar5, D0Ar6, [D0Ar2++] + MOV D1Re0, D1Ar5 + MOV D0Re0, D1Ar5 + ADD D1Re0, D1Re0, D1Ar3 + XOR D0Re0, D0Re0, #-1 + AND D1Re0, D1Re0, D0Re0 + ANDS D1Re0, D1Re0, D0FrT + BNZ $Lnullfound ! NULL in first word + + MOV D1Re0, D0Ar6 + MOV D0Re0, D0Ar6 + ADD D1Re0, D1Re0, D1Ar3 + XOR D0Re0, D0Re0, #-1 + AND D1Re0, D1Re0, D0Re0 + ANDS D1Re0, D1Re0, D0FrT + BNZ $Lnullfound2 ! NULL in the second word + + SETL [A1.2++], D1Ar5, D0Ar6 + B $L8byteloop + +$Lwordloop: + GETD D0Ar6, [D0Ar2++] + MOV D1Re0, D0Ar6 + MOV D0Re0, D0Ar6 + ADD D1Re0, D1Re0, D1Ar3 + XOR D0Re0, D0Re0, #-1 + AND D1Re0, D1Re0, D0Re0 + ANDS D1Re0, D1Re0, D0FrT + MOV D1Ar5, D0Ar6 + BNZ $Lnullfound + SETD [A1.2++], D0Ar6 + B $Lwordloop + +$Lnullfound2: + SETD [A1.2++], D1Ar5 + MOV D1Ar5, D0Ar6 + +$Lnullfound: + SETB [A1.2++], D1Ar5 + ANDS D0Ar6, D1Ar5, #0xff + LSR D1Ar5, D1Ar5, #8 + BNZ $Lnullfound + B $Lend + +$Lbyteloop: + GETB D0Ar6, [D0Ar2++] + SETB [A1.2++], D0Ar6 + CMP D0Ar6, #0 + BNZ $Lbyteloop + +$Lend: + MOV D0Re0, D1Ar1 + MOV PC, D1RtP + + .size _strcpy,.-_strcpy + +libc_hidden_def(strcpy) diff --git a/libc/string/sh64/Makefile b/libc/string/microblaze/Makefile index 0a95346fd..5bdfef2f7 100644 --- a/libc/string/sh64/Makefile +++ b/libc/string/microblaze/Makefile @@ -5,8 +5,8 @@ # Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. # -top_srcdir:=../../../ -top_builddir:=../../../ +top_srcdir := ../../../ +top_builddir := ../../../ all: objs include $(top_builddir)Rules.mak include ../Makefile.in diff --git a/libc/string/microblaze/memcpy.S b/libc/string/microblaze/memcpy.S new file mode 100644 index 000000000..5219e9919 --- /dev/null +++ b/libc/string/microblaze/memcpy.S @@ -0,0 +1,334 @@ +/* + * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu> + * Copyright (C) 2008-2009 PetaLogix + * Copyright (C) 2008 Jim Law - Iris LP All rights reserved. + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file COPYING in the main directory of this + * archive for more details. + * + * Written by Jim Law <jlaw@irispower.com> + * + * intended to replace: + * memcpy in memcpy.c and + * memmove in memmove.c + * ... 
in arch/microblaze/lib + * + * + * assly_fastcopy.S + * + * Attempt at quicker memcpy and memmove for MicroBlaze + * Input : Operand1 in Reg r5 - destination address + * Operand2 in Reg r6 - source address + * Operand3 in Reg r7 - number of bytes to transfer + * Output: Result in Reg r3 - starting destinaition address + * + * + * Explanation: + * Perform (possibly unaligned) copy of a block of memory + * between mem locations with size of xfer spec'd in bytes + */ + + .text + .globl memcpy + .type memcpy, @function + .ent memcpy + +#ifdef __MICROBLAZEEL__ +# define BSLLI bsrli +# define BSRLI bslli +#else +# define BSLLI bslli +# define BSRLI bsrli +#endif + +memcpy: +fast_memcpy_ascending: + /* move d to return register as value of function */ + addi r3, r5, 0 + + addi r4, r0, 4 /* n = 4 */ + cmpu r4, r4, r7 /* n = c - n (unsigned) */ + blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ + + /* transfer first 0~3 bytes to get aligned dest address */ + andi r4, r5, 3 /* n = d & 3 */ + /* if zero, destination already aligned */ + beqi r4, a_dalign_done + /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */ + rsubi r4, r4, 4 + rsub r7, r4, r7 /* c = c - n adjust c */ + +a_xfer_first_loop: + /* if no bytes left to transfer, transfer the bulk */ + beqi r4, a_dalign_done + lbui r11, r6, 0 /* h = *s */ + sbi r11, r5, 0 /* *d = h */ + addi r6, r6, 1 /* s++ */ + addi r5, r5, 1 /* d++ */ + brid a_xfer_first_loop /* loop */ + addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ + +a_dalign_done: + addi r4, r0, 32 /* n = 32 */ + cmpu r4, r4, r7 /* n = c - n (unsigned) */ + /* if n < 0, less than one block to transfer */ + blti r4, a_block_done + +a_block_xfer: + andi r9, r6, 3 /* t1 = s & 3 */ + /* if temp == 0, everything is word-aligned */ + beqi r9, a_word_xfer + +a_block_unaligned: + andi r4, r7, 0xffffffe0 /* n = c & ~31 */ + rsub r7, r4, r7 /* c = c - n */ + andi r8, r6, 0xfffffffc /* as = s & ~3 */ + add r6, r6, r4 /* s = s + n */ + lwi r11, r8, 0 /* h = *(as + 0) */ + + addi r9, r9, -1 + beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */ + addi r9, r9, -1 + beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */ + +a_block_u3: + BSLLI r11, r11, 24 /* h = h << 24 */ +a_bu3_loop: + lwi r12, r8, 4 /* v = *(as + 4) */ + BSRLI r9, r12, 8 /* t1 = v >> 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 0 /* *(d + 0) = t1 */ + BSLLI r11, r12, 24 /* h = v << 24 */ + lwi r12, r8, 8 /* v = *(as + 8) */ + BSRLI r9, r12, 8 /* t1 = v >> 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 4 /* *(d + 4) = t1 */ + BSLLI r11, r12, 24 /* h = v << 24 */ + lwi r12, r8, 12 /* v = *(as + 12) */ + BSRLI r9, r12, 8 /* t1 = v >> 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 8 /* *(d + 8) = t1 */ + BSLLI r11, r12, 24 /* h = v << 24 */ + lwi r12, r8, 16 /* v = *(as + 16) */ + BSRLI r9, r12, 8 /* t1 = v >> 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 12 /* *(d + 12) = t1 */ + BSLLI r11, r12, 24 /* h = v << 24 */ + lwi r12, r8, 20 /* v = *(as + 20) */ + BSRLI r9, r12, 8 /* t1 = v >> 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 16 /* *(d + 16) = t1 */ + BSLLI r11, r12, 24 /* h = v << 24 */ + lwi r12, r8, 24 /* v = *(as + 24) */ + BSRLI r9, r12, 8 /* t1 = v >> 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 20 /* *(d + 20) = t1 */ + BSLLI r11, r12, 24 /* h = v << 24 */ + lwi r12, r8, 28 /* v = *(as + 28) */ + BSRLI r9, r12, 8 /* t1 = v >> 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 24 /* *(d + 24) = t1 */ + BSLLI r11, r12, 24 /* h = v << 24 */ + lwi r12, r8, 32 
/* v = *(as + 32) */ + BSRLI r9, r12, 8 /* t1 = v >> 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 28 /* *(d + 28) = t1 */ + BSLLI r11, r12, 24 /* h = v << 24 */ + addi r8, r8, 32 /* as = as + 32 */ + addi r4, r4, -32 /* n = n - 32 */ + bneid r4, a_bu3_loop /* while (n) loop */ + addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ + bri a_block_done + +a_block_u1: + BSLLI r11, r11, 8 /* h = h << 8 */ +a_bu1_loop: + lwi r12, r8, 4 /* v = *(as + 4) */ + BSRLI r9, r12, 24 /* t1 = v >> 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 0 /* *(d + 0) = t1 */ + BSLLI r11, r12, 8 /* h = v << 8 */ + lwi r12, r8, 8 /* v = *(as + 8) */ + BSRLI r9, r12, 24 /* t1 = v >> 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 4 /* *(d + 4) = t1 */ + BSLLI r11, r12, 8 /* h = v << 8 */ + lwi r12, r8, 12 /* v = *(as + 12) */ + BSRLI r9, r12, 24 /* t1 = v >> 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 8 /* *(d + 8) = t1 */ + BSLLI r11, r12, 8 /* h = v << 8 */ + lwi r12, r8, 16 /* v = *(as + 16) */ + BSRLI r9, r12, 24 /* t1 = v >> 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 12 /* *(d + 12) = t1 */ + BSLLI r11, r12, 8 /* h = v << 8 */ + lwi r12, r8, 20 /* v = *(as + 20) */ + BSRLI r9, r12, 24 /* t1 = v >> 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 16 /* *(d + 16) = t1 */ + BSLLI r11, r12, 8 /* h = v << 8 */ + lwi r12, r8, 24 /* v = *(as + 24) */ + BSRLI r9, r12, 24 /* t1 = v >> 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 20 /* *(d + 20) = t1 */ + BSLLI r11, r12, 8 /* h = v << 8 */ + lwi r12, r8, 28 /* v = *(as + 28) */ + BSRLI r9, r12, 24 /* t1 = v >> 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 24 /* *(d + 24) = t1 */ + BSLLI r11, r12, 8 /* h = v << 8 */ + lwi r12, r8, 32 /* v = *(as + 32) */ + BSRLI r9, r12, 24 /* t1 = v >> 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 28 /* *(d + 28) = t1 */ + BSLLI r11, r12, 8 /* h = v << 8 */ + addi r8, r8, 32 /* as = as + 32 */ + addi r4, r4, -32 /* n = n - 32 */ + bneid r4, a_bu1_loop /* while (n) loop */ + addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ + bri a_block_done + +a_block_u2: + BSLLI r11, r11, 16 /* h = h << 16 */ +a_bu2_loop: + lwi r12, r8, 4 /* v = *(as + 4) */ + BSRLI r9, r12, 16 /* t1 = v >> 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 0 /* *(d + 0) = t1 */ + BSLLI r11, r12, 16 /* h = v << 16 */ + lwi r12, r8, 8 /* v = *(as + 8) */ + BSRLI r9, r12, 16 /* t1 = v >> 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 4 /* *(d + 4) = t1 */ + BSLLI r11, r12, 16 /* h = v << 16 */ + lwi r12, r8, 12 /* v = *(as + 12) */ + BSRLI r9, r12, 16 /* t1 = v >> 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 8 /* *(d + 8) = t1 */ + BSLLI r11, r12, 16 /* h = v << 16 */ + lwi r12, r8, 16 /* v = *(as + 16) */ + BSRLI r9, r12, 16 /* t1 = v >> 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 12 /* *(d + 12) = t1 */ + BSLLI r11, r12, 16 /* h = v << 16 */ + lwi r12, r8, 20 /* v = *(as + 20) */ + BSRLI r9, r12, 16 /* t1 = v >> 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 16 /* *(d + 16) = t1 */ + BSLLI r11, r12, 16 /* h = v << 16 */ + lwi r12, r8, 24 /* v = *(as + 24) */ + BSRLI r9, r12, 16 /* t1 = v >> 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 20 /* *(d + 20) = t1 */ + BSLLI r11, r12, 16 /* h = v << 16 */ + lwi r12, r8, 28 /* v = *(as + 28) */ + BSRLI r9, r12, 16 /* t1 = v >> 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 24 /* *(d + 24) = t1 */ + BSLLI r11, r12, 16 /* h = v << 16 */ + lwi r12, r8, 32 /* v = *(as + 32) */ + BSRLI r9, r12, 16 /* t1 
= v >> 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 28 /* *(d + 28) = t1 */ + BSLLI r11, r12, 16 /* h = v << 16 */ + addi r8, r8, 32 /* as = as + 32 */ + addi r4, r4, -32 /* n = n - 32 */ + bneid r4, a_bu2_loop /* while (n) loop */ + addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ + +a_block_done: + addi r4, r0, 4 /* n = 4 */ + cmpu r4, r4, r7 /* n = c - n (unsigned) */ + blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ + +a_word_xfer: + andi r4, r7, 0xfffffffc /* n = c & ~3 */ + addi r10, r0, 0 /* offset = 0 */ + + andi r9, r6, 3 /* t1 = s & 3 */ + /* if temp != 0, unaligned transfers needed */ + bnei r9, a_word_unaligned + +a_word_aligned: + lw r9, r6, r10 /* t1 = *(s+offset) */ + sw r9, r5, r10 /* *(d+offset) = t1 */ + addi r4, r4,-4 /* n-- */ + bneid r4, a_word_aligned /* loop */ + addi r10, r10, 4 /* offset++ (IN DELAY SLOT) */ + + bri a_word_done + +a_word_unaligned: + andi r8, r6, 0xfffffffc /* as = s & ~3 */ + lwi r11, r8, 0 /* h = *(as + 0) */ + addi r8, r8, 4 /* as = as + 4 */ + + addi r9, r9, -1 + beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */ + addi r9, r9, -1 + beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */ + +a_word_u3: + BSLLI r11, r11, 24 /* h = h << 24 */ +a_wu3_loop: + lw r12, r8, r10 /* v = *(as + offset) */ + BSRLI r9, r12, 8 /* t1 = v >> 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + sw r9, r5, r10 /* *(d + offset) = t1 */ + BSLLI r11, r12, 24 /* h = v << 24 */ + addi r4, r4,-4 /* n = n - 4 */ + bneid r4, a_wu3_loop /* while (n) loop */ + addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */ + + bri a_word_done + +a_word_u1: + BSLLI r11, r11, 8 /* h = h << 8 */ +a_wu1_loop: + lw r12, r8, r10 /* v = *(as + offset) */ + BSRLI r9, r12, 24 /* t1 = v >> 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + sw r9, r5, r10 /* *(d + offset) = t1 */ + BSLLI r11, r12, 8 /* h = v << 8 */ + addi r4, r4,-4 /* n = n - 4 */ + bneid r4, a_wu1_loop /* while (n) loop */ + addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */ + + bri a_word_done + +a_word_u2: + BSLLI r11, r11, 16 /* h = h << 16 */ +a_wu2_loop: + lw r12, r8, r10 /* v = *(as + offset) */ + BSRLI r9, r12, 16 /* t1 = v >> 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + sw r9, r5, r10 /* *(d + offset) = t1 */ + BSLLI r11, r12, 16 /* h = v << 16 */ + addi r4, r4,-4 /* n = n - 4 */ + bneid r4, a_wu2_loop /* while (n) loop */ + addi r10, r10, 4 /* offset = ofset + 4 (IN DELAY SLOT) */ + +a_word_done: + add r5, r5, r10 /* d = d + offset */ + add r6, r6, r10 /* s = s + offset */ + rsub r7, r10, r7 /* c = c - offset */ + +a_xfer_end: +a_xfer_end_loop: + beqi r7, a_done /* while (c) */ + lbui r9, r6, 0 /* t1 = *s */ + addi r6, r6, 1 /* s++ */ + sbi r9, r5, 0 /* *d = t1 */ + addi r7, r7, -1 /* c-- */ + brid a_xfer_end_loop /* loop */ + addi r5, r5, 1 /* d++ (IN DELAY SLOT) */ + +a_done: + rtsd r15, 8 + nop + +.size memcpy, . - memcpy +.end memcpy +libc_hidden_def(memcpy) diff --git a/libc/string/microblaze/memmove.S b/libc/string/microblaze/memmove.S new file mode 100644 index 000000000..6bac01620 --- /dev/null +++ b/libc/string/microblaze/memmove.S @@ -0,0 +1,356 @@ +/* + * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu> + * Copyright (C) 2008-2009 PetaLogix + * Copyright (C) 2008 Jim Law - Iris LP All rights reserved. + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file COPYING in the main directory of this + * archive for more details. 
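The a_block_u1/u2/u3 and a_word_u* loops in the MicroBlaze memcpy above handle a word-aligned destination whose source sits 1-3 bytes into its word: they read only aligned source words and build each output word from two neighbours with a shift/OR pair. A rough C rendering of the big-endian case (the BSLLI/BSRLI macros swap the shift directions for the little-endian build); the helper name and 32-bit word size are illustrative, and, like the assembly, it reads the trailing aligned source word in full:

#include <stddef.h>
#include <stdint.h>

/* dst is 32-bit aligned, src is offset by 1..3 bytes inside its word. */
static void copy_words_misaligned_src(uint32_t *dst, const unsigned char *src,
				      size_t nwords)
{
	size_t off = (uintptr_t)src & 3;		/* 1, 2 or 3 */
	const uint32_t *as = (const uint32_t *)(src - off);
	unsigned lsh = 8 * off;				/* bits kept from the head word */
	unsigned rsh = 32 - lsh;

	uint32_t h = *as++;				/* prime with the first aligned word */
	while (nwords--) {
		uint32_t v = *as++;
		*dst++ = (h << lsh) | (v >> rsh);	/* stitch two aligned words together */
		h = v;
	}
}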
+ * + * Written by Jim Law <jlaw@irispower.com> + * + * intended to replace: + * memcpy in memcpy.c and + * memmove in memmove.c + * ... in arch/microblaze/lib + * + * + * assly_fastcopy.S + * + * Attempt at quicker memcpy and memmove for MicroBlaze + * Input : Operand1 in Reg r5 - destination address + * Operand2 in Reg r6 - source address + * Operand3 in Reg r7 - number of bytes to transfer + * Output: Result in Reg r3 - starting destinaition address + * + * + * Explanation: + * Perform (possibly unaligned) copy of a block of memory + * between mem locations with size of xfer spec'd in bytes + */ + + .globl memmove + .type memmove, @function + .ent memmove + +#ifdef __MICROBLAZEEL__ +# define BSLLI bsrli +# define BSRLI bslli +#else +# define BSLLI bslli +# define BSRLI bsrli +#endif + +memmove: + cmpu r4, r5, r6 /* n = s - d */ + bgei r4, HIDDEN_JUMPTARGET(memcpy) + +fast_memcpy_descending: + /* move d to return register as value of function */ + addi r3, r5, 0 + + add r5, r5, r7 /* d = d + c */ + add r6, r6, r7 /* s = s + c */ + + addi r4, r0, 4 /* n = 4 */ + cmpu r4, r4, r7 /* n = c - n (unsigned) */ + blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ + + /* transfer first 0~3 bytes to get aligned dest address */ + andi r4, r5, 3 /* n = d & 3 */ + /* if zero, destination already aligned */ + beqi r4,d_dalign_done + rsub r7, r4, r7 /* c = c - n adjust c */ + +d_xfer_first_loop: + /* if no bytes left to transfer, transfer the bulk */ + beqi r4,d_dalign_done + addi r6, r6, -1 /* s-- */ + addi r5, r5, -1 /* d-- */ + lbui r11, r6, 0 /* h = *s */ + sbi r11, r5, 0 /* *d = h */ + brid d_xfer_first_loop /* loop */ + addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ + +d_dalign_done: + addi r4, r0, 32 /* n = 32 */ + cmpu r4, r4, r7 /* n = c - n (unsigned) */ + /* if n < 0, less than one block to transfer */ + blti r4, d_block_done + +d_block_xfer: + andi r4, r7, 0xffffffe0 /* n = c & ~31 */ + rsub r7, r4, r7 /* c = c - n */ + + andi r9, r6, 3 /* t1 = s & 3 */ + /* if temp != 0, unaligned transfers needed */ + bnei r9, d_block_unaligned + +d_block_aligned: + addi r6, r6, -32 /* s = s - 32 */ + addi r5, r5, -32 /* d = d - 32 */ + lwi r9, r6, 28 /* t1 = *(s + 28) */ + lwi r10, r6, 24 /* t2 = *(s + 24) */ + lwi r11, r6, 20 /* t3 = *(s + 20) */ + lwi r12, r6, 16 /* t4 = *(s + 16) */ + swi r9, r5, 28 /* *(d + 28) = t1 */ + swi r10, r5, 24 /* *(d + 24) = t2 */ + swi r11, r5, 20 /* *(d + 20) = t3 */ + swi r12, r5, 16 /* *(d + 16) = t4 */ + lwi r9, r6, 12 /* t1 = *(s + 12) */ + lwi r10, r6, 8 /* t2 = *(s + 8) */ + lwi r11, r6, 4 /* t3 = *(s + 4) */ + lwi r12, r6, 0 /* t4 = *(s + 0) */ + swi r9, r5, 12 /* *(d + 12) = t1 */ + swi r10, r5, 8 /* *(d + 8) = t2 */ + swi r11, r5, 4 /* *(d + 4) = t3 */ + addi r4, r4, -32 /* n = n - 32 */ + bneid r4, d_block_aligned /* while (n) loop */ + swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */ + bri d_block_done + +d_block_unaligned: + andi r8, r6, 0xfffffffc /* as = s & ~3 */ + rsub r6, r4, r6 /* s = s - n */ + lwi r11, r8, 0 /* h = *(as + 0) */ + + addi r9, r9, -1 + beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */ + addi r9, r9, -1 + beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */ + +d_block_u3: + BSRLI r11, r11, 8 /* h = h >> 8 */ +d_bu3_loop: + addi r8, r8, -32 /* as = as - 32 */ + addi r5, r5, -32 /* d = d - 32 */ + lwi r12, r8, 28 /* v = *(as + 28) */ + BSLLI r9, r12, 24 /* t1 = v << 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 28 /* *(d + 28) = t1 */ + BSRLI r11, r12, 8 /* h = v >> 8 */ + lwi r12, r8, 24 /* v = *(as + 24) */ + BSLLI r9, 
r12, 24 /* t1 = v << 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 24 /* *(d + 24) = t1 */ + BSRLI r11, r12, 8 /* h = v >> 8 */ + lwi r12, r8, 20 /* v = *(as + 20) */ + BSLLI r9, r12, 24 /* t1 = v << 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 20 /* *(d + 20) = t1 */ + BSRLI r11, r12, 8 /* h = v >> 8 */ + lwi r12, r8, 16 /* v = *(as + 16) */ + BSLLI r9, r12, 24 /* t1 = v << 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 16 /* *(d + 16) = t1 */ + BSRLI r11, r12, 8 /* h = v >> 8 */ + lwi r12, r8, 12 /* v = *(as + 12) */ + BSLLI r9, r12, 24 /* t1 = v << 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 12 /* *(d + 112) = t1 */ + BSRLI r11, r12, 8 /* h = v >> 8 */ + lwi r12, r8, 8 /* v = *(as + 8) */ + BSLLI r9, r12, 24 /* t1 = v << 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 8 /* *(d + 8) = t1 */ + BSRLI r11, r12, 8 /* h = v >> 8 */ + lwi r12, r8, 4 /* v = *(as + 4) */ + BSLLI r9, r12, 24 /* t1 = v << 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 4 /* *(d + 4) = t1 */ + BSRLI r11, r12, 8 /* h = v >> 8 */ + lwi r12, r8, 0 /* v = *(as + 0) */ + BSLLI r9, r12, 24 /* t1 = v << 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 0 /* *(d + 0) = t1 */ + addi r4, r4, -32 /* n = n - 32 */ + bneid r4, d_bu3_loop /* while (n) loop */ + BSRLI r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ + bri d_block_done + +d_block_u1: + BSRLI r11, r11, 24 /* h = h >> 24 */ +d_bu1_loop: + addi r8, r8, -32 /* as = as - 32 */ + addi r5, r5, -32 /* d = d - 32 */ + lwi r12, r8, 28 /* v = *(as + 28) */ + BSLLI r9, r12, 8 /* t1 = v << 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 28 /* *(d + 28) = t1 */ + BSRLI r11, r12, 24 /* h = v >> 24 */ + lwi r12, r8, 24 /* v = *(as + 24) */ + BSLLI r9, r12, 8 /* t1 = v << 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 24 /* *(d + 24) = t1 */ + BSRLI r11, r12, 24 /* h = v >> 24 */ + lwi r12, r8, 20 /* v = *(as + 20) */ + BSLLI r9, r12, 8 /* t1 = v << 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 20 /* *(d + 20) = t1 */ + BSRLI r11, r12, 24 /* h = v >> 24 */ + lwi r12, r8, 16 /* v = *(as + 16) */ + BSLLI r9, r12, 8 /* t1 = v << 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 16 /* *(d + 16) = t1 */ + BSRLI r11, r12, 24 /* h = v >> 24 */ + lwi r12, r8, 12 /* v = *(as + 12) */ + BSLLI r9, r12, 8 /* t1 = v << 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 12 /* *(d + 112) = t1 */ + BSRLI r11, r12, 24 /* h = v >> 24 */ + lwi r12, r8, 8 /* v = *(as + 8) */ + BSLLI r9, r12, 8 /* t1 = v << 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 8 /* *(d + 8) = t1 */ + BSRLI r11, r12, 24 /* h = v >> 24 */ + lwi r12, r8, 4 /* v = *(as + 4) */ + BSLLI r9, r12, 8 /* t1 = v << 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 4 /* *(d + 4) = t1 */ + BSRLI r11, r12, 24 /* h = v >> 24 */ + lwi r12, r8, 0 /* v = *(as + 0) */ + BSLLI r9, r12, 8 /* t1 = v << 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 0 /* *(d + 0) = t1 */ + addi r4, r4, -32 /* n = n - 32 */ + bneid r4, d_bu1_loop /* while (n) loop */ + BSRLI r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ + bri d_block_done + +d_block_u2: + BSRLI r11, r11, 16 /* h = h >> 16 */ +d_bu2_loop: + addi r8, r8, -32 /* as = as - 32 */ + addi r5, r5, -32 /* d = d - 32 */ + lwi r12, r8, 28 /* v = *(as + 28) */ + BSLLI r9, r12, 16 /* t1 = v << 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 28 /* *(d + 28) = t1 */ + BSRLI r11, r12, 16 /* h = v >> 16 */ + lwi r12, r8, 24 /* v = *(as + 24) */ + BSLLI r9, r12, 16 /* t1 = v << 16 */ + or r9, r11, r9 /* t1 = h 
| t1 */ + swi r9, r5, 24 /* *(d + 24) = t1 */ + BSRLI r11, r12, 16 /* h = v >> 16 */ + lwi r12, r8, 20 /* v = *(as + 20) */ + BSLLI r9, r12, 16 /* t1 = v << 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 20 /* *(d + 20) = t1 */ + BSRLI r11, r12, 16 /* h = v >> 16 */ + lwi r12, r8, 16 /* v = *(as + 16) */ + BSLLI r9, r12, 16 /* t1 = v << 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 16 /* *(d + 16) = t1 */ + BSRLI r11, r12, 16 /* h = v >> 16 */ + lwi r12, r8, 12 /* v = *(as + 12) */ + BSLLI r9, r12, 16 /* t1 = v << 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 12 /* *(d + 112) = t1 */ + BSRLI r11, r12, 16 /* h = v >> 16 */ + lwi r12, r8, 8 /* v = *(as + 8) */ + BSLLI r9, r12, 16 /* t1 = v << 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 8 /* *(d + 8) = t1 */ + BSRLI r11, r12, 16 /* h = v >> 16 */ + lwi r12, r8, 4 /* v = *(as + 4) */ + BSLLI r9, r12, 16 /* t1 = v << 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 4 /* *(d + 4) = t1 */ + BSRLI r11, r12, 16 /* h = v >> 16 */ + lwi r12, r8, 0 /* v = *(as + 0) */ + BSLLI r9, r12, 16 /* t1 = v << 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + swi r9, r5, 0 /* *(d + 0) = t1 */ + addi r4, r4, -32 /* n = n - 32 */ + bneid r4, d_bu2_loop /* while (n) loop */ + BSRLI r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ + +d_block_done: + addi r4, r0, 4 /* n = 4 */ + cmpu r4, r4, r7 /* n = c - n (unsigned) */ + blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ + +d_word_xfer: + andi r4, r7, 0xfffffffc /* n = c & ~3 */ + rsub r5, r4, r5 /* d = d - n */ + rsub r6, r4, r6 /* s = s - n */ + rsub r7, r4, r7 /* c = c - n */ + + andi r9, r6, 3 /* t1 = s & 3 */ + /* if temp != 0, unaligned transfers needed */ + bnei r9, d_word_unaligned + +d_word_aligned: + addi r4, r4,-4 /* n-- */ + lw r9, r6, r4 /* t1 = *(s+n) */ + bneid r4, d_word_aligned /* loop */ + sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */ + + bri d_word_done + +d_word_unaligned: + andi r8, r6, 0xfffffffc /* as = s & ~3 */ + lw r11, r8, r4 /* h = *(as + n) */ + + addi r9, r9, -1 + beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */ + addi r9, r9, -1 + beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */ + +d_word_u3: + BSRLI r11, r11, 8 /* h = h >> 8 */ +d_wu3_loop: + addi r4, r4,-4 /* n = n - 4 */ + lw r12, r8, r4 /* v = *(as + n) */ + BSLLI r9, r12, 24 /* t1 = v << 24 */ + or r9, r11, r9 /* t1 = h | t1 */ + sw r9, r5, r4 /* *(d + n) = t1 */ + bneid r4, d_wu3_loop /* while (n) loop */ + BSRLI r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ + + bri d_word_done + +d_word_u1: + BSRLI r11, r11, 24 /* h = h >> 24 */ +d_wu1_loop: + addi r4, r4,-4 /* n = n - 4 */ + lw r12, r8, r4 /* v = *(as + n) */ + BSLLI r9, r12, 8 /* t1 = v << 8 */ + or r9, r11, r9 /* t1 = h | t1 */ + sw r9, r5, r4 /* *(d + n) = t1 */ + bneid r4, d_wu1_loop /* while (n) loop */ + BSRLI r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ + + bri d_word_done + +d_word_u2: + BSRLI r11, r11, 16 /* h = h >> 16 */ +d_wu2_loop: + addi r4, r4,-4 /* n = n - 4 */ + lw r12, r8, r4 /* v = *(as + n) */ + BSLLI r9, r12, 16 /* t1 = v << 16 */ + or r9, r11, r9 /* t1 = h | t1 */ + sw r9, r5, r4 /* *(d + n) = t1 */ + bneid r4, d_wu2_loop /* while (n) loop */ + BSRLI r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ + +d_word_done: + +d_xfer_end: +d_xfer_end_loop: + beqi r7, a_done /* while (c) */ + addi r6, r6, -1 /* s-- */ + lbui r9, r6, 0 /* t1 = *s */ + addi r5, r5, -1 /* d-- */ + sbi r9, r5, 0 /* *d = t1 */ + brid d_xfer_end_loop /* loop */ + addi r7, r7, -1 /* c-- (IN DELAY SLOT) */ + +a_done: +d_done: + rtsd r15, 
8 + nop + +.size memmove, . - memmove +.end memmove +libc_hidden_def(memmove) diff --git a/libc/string/mips/memcpy.S b/libc/string/mips/memcpy.S index 9b05ee6da..59f9f0a3a 100644 --- a/libc/string/mips/memcpy.S +++ b/libc/string/mips/memcpy.S @@ -1,6 +1,5 @@ -/* Copyright (C) 2002, 2003 Free Software Foundation, Inc. +/* Copyright (C) 2012-2015 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Hartvig Ekner <hartvige@mips.com>, 2002. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -13,245 +12,861 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ -#include <features.h> -/*#include <sysdep.h>*/ -#include <endian.h> -#include "sysdep.h" +#ifdef ANDROID_CHANGES +# include "machine/asm.h" +# include "machine/regdef.h" +# define USE_MEMMOVE_FOR_OVERLAP +# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE +#elif _LIBC +# include <sysdep.h> +# include <sys/regdef.h> +# include <sys/asm.h> +# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE +#elif defined _COMPILING_NEWLIB +# include "machine/asm.h" +# include "machine/regdef.h" +# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE +#else +# include <sys/regdef.h> +# include <sys/asm.h> +#endif + +#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \ + (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64) +# ifndef DISABLE_PREFETCH +# define USE_PREFETCH +# endif +#endif + +#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)) +# ifndef DISABLE_DOUBLE +# define USE_DOUBLE +# endif +#endif + +/* Some asm.h files do not have the L macro definition. */ +#ifndef L +# if _MIPS_SIM == _ABIO32 +# define L(label) $L ## label +# else +# define L(label) .L ## label +# endif +#endif + +/* Some asm.h files do not have the PTR_ADDIU macro definition. */ +#ifndef PTR_ADDIU +# ifdef USE_DOUBLE +# define PTR_ADDIU daddiu +# else +# define PTR_ADDIU addiu +# endif +#endif + +/* Some asm.h files do not have the PTR_SRA macro definition. */ +#ifndef PTR_SRA +# ifdef USE_DOUBLE +# define PTR_SRA dsra +# else +# define PTR_SRA sra +# endif +#endif + +/* New R6 instructions that may not be in asm.h. */ +#ifndef PTR_LSA +# if _MIPS_SIM == _ABI64 +# define PTR_LSA dlsa +# else +# define PTR_LSA lsa +# endif +#endif + +/* + * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load + * prefetches appears to offer a slight preformance advantage. + * + * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE + * or PREFETCH_STORE_STREAMED offers a large performance advantage + * but PREPAREFORSTORE has some special restrictions to consider. + * + * Prefetch with the 'prepare for store' hint does not copy a memory + * location into the cache, it just allocates a cache line and zeros + * it out. This means that if you do not write to the entire cache + * line before writing it out to memory some data will get zero'ed out + * when the cache line is written back to memory and data will be lost. 
+ * + * Also if you are using this memcpy to copy overlapping buffers it may + * not behave correctly when using the 'prepare for store' hint. If you + * use the 'prepare for store' prefetch on a memory area that is in the + * memcpy source (as well as the memcpy destination), then you will get + * some data zero'ed out before you have a chance to read it and data will + * be lost. + * + * If you are going to use this memcpy routine with the 'prepare for store' + * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid + * the problem of running memcpy on overlapping buffers. + * + * There are ifdef'ed sections of this memcpy to make sure that it does not + * do prefetches on cache lines that are not going to be completely written. + * This code is only needed and only used when PREFETCH_STORE_HINT is set to + * PREFETCH_HINT_PREPAREFORSTORE. This code assumes that cache lines are + * 32 bytes and if the cache line is larger it will not work correctly. + */ + +#ifdef USE_PREFETCH +# define PREFETCH_HINT_LOAD 0 +# define PREFETCH_HINT_STORE 1 +# define PREFETCH_HINT_LOAD_STREAMED 4 +# define PREFETCH_HINT_STORE_STREAMED 5 +# define PREFETCH_HINT_LOAD_RETAINED 6 +# define PREFETCH_HINT_STORE_RETAINED 7 +# define PREFETCH_HINT_WRITEBACK_INVAL 25 +# define PREFETCH_HINT_PREPAREFORSTORE 30 + +/* + * If we have not picked out what hints to use at this point use the + * standard load and store prefetch hints. + */ +# ifndef PREFETCH_STORE_HINT +# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE +# endif +# ifndef PREFETCH_LOAD_HINT +# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD +# endif + +/* + * We double everything when USE_DOUBLE is true so we do 2 prefetches to + * get 64 bytes in that case. The assumption is that each individual + * prefetch brings in 32 bytes. + */ + +# ifdef USE_DOUBLE +# define PREFETCH_CHUNK 64 +# define PREFETCH_FOR_LOAD(chunk, reg) \ + pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \ + pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg) +# define PREFETCH_FOR_STORE(chunk, reg) \ + pref PREFETCH_STORE_HINT, (chunk)*64(reg); \ + pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg) +# else +# define PREFETCH_CHUNK 32 +# define PREFETCH_FOR_LOAD(chunk, reg) \ + pref PREFETCH_LOAD_HINT, (chunk)*32(reg) +# define PREFETCH_FOR_STORE(chunk, reg) \ + pref PREFETCH_STORE_HINT, (chunk)*32(reg) +# endif +/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less + * than PREFETCH_CHUNK, the assumed size of each prefetch. If the real size + * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE + * hint is used, the code will not work correctly. If PREPAREFORSTORE is not + * used then MAX_PREFETCH_SIZE does not matter. */ +# define MAX_PREFETCH_SIZE 128 +/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater + * than 5 on a STORE prefetch and that a single prefetch can never be larger + * than MAX_PREFETCH_SIZE. We add the extra 32 when USE_DOUBLE is set because + * we actually do two prefetches in that case, one 32 bytes after the other. */ +# ifdef USE_DOUBLE +# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE +# else +# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE +# endif +# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \ + && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE) +/* We cannot handle this because the initial prefetches may fetch bytes that + * are before the buffer being copied. 
We start copies with an offset + * of 4 so avoid this situation when using PREPAREFORSTORE. */ +#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small." +# endif +#else /* USE_PREFETCH not defined */ +# define PREFETCH_FOR_LOAD(offset, reg) +# define PREFETCH_FOR_STORE(offset, reg) +#endif + +#if __mips_isa_rev > 5 +# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) +# undef PREFETCH_STORE_HINT +# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED +# endif +# define R6_CODE +#endif -/* void *memcpy(void *s1, const void *s2, size_t n); */ +/* Allow the routine to be named something else if desired. */ +#ifndef MEMCPY_NAME +# define MEMCPY_NAME memcpy +#endif + +/* We use these 32/64 bit registers as temporaries to do the copying. */ +#define REG0 t0 +#define REG1 t1 +#define REG2 t2 +#define REG3 t3 +#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABIO32) || (_MIPS_SIM == _ABIO64)) +# define REG4 t4 +# define REG5 t5 +# define REG6 t6 +# define REG7 t7 +#else +# define REG4 ta0 +# define REG5 ta1 +# define REG6 ta2 +# define REG7 ta3 +#endif -#ifdef __mips64 +/* We load/store 64 bits at a time when USE_DOUBLE is true. + * The C_ prefix stands for CHUNK and is used to avoid macro name + * conflicts with system header files. */ -#include <sys/asm.h> +#ifdef USE_DOUBLE +# define C_ST sd +# define C_LD ld +# ifdef __MIPSEB +# define C_LDHI ldl /* high part is left in big-endian */ +# define C_STHI sdl /* high part is left in big-endian */ +# define C_LDLO ldr /* low part is right in big-endian */ +# define C_STLO sdr /* low part is right in big-endian */ +# else +# define C_LDHI ldr /* high part is right in little-endian */ +# define C_STHI sdr /* high part is right in little-endian */ +# define C_LDLO ldl /* low part is left in little-endian */ +# define C_STLO sdl /* low part is left in little-endian */ +# endif +# define C_ALIGN dalign /* r6 align instruction */ +#else +# define C_ST sw +# define C_LD lw +# ifdef __MIPSEB +# define C_LDHI lwl /* high part is left in big-endian */ +# define C_STHI swl /* high part is left in big-endian */ +# define C_LDLO lwr /* low part is right in big-endian */ +# define C_STLO swr /* low part is right in big-endian */ +# else +# define C_LDHI lwr /* high part is right in little-endian */ +# define C_STHI swr /* high part is right in little-endian */ +# define C_LDLO lwl /* low part is left in little-endian */ +# define C_STLO swl /* low part is left in little-endian */ +# endif +# define C_ALIGN align /* r6 align instruction */ +#endif -#if __BYTE_ORDER == __BIG_ENDIAN -# define LDHI ldl /* high part is left in big-endian */ -# define SDHI sdl /* high part is left in big-endian */ -# define LDLO ldr /* low part is right in big-endian */ -# define SDLO sdr /* low part is right in big-endian */ +/* Bookkeeping values for 32 vs. 64 bit mode. */ +#ifdef USE_DOUBLE +# define NSIZE 8 +# define NSIZEMASK 0x3f +# define NSIZEDMASK 0x7f #else -# define LDHI ldr /* high part is right in little-endian */ -# define SDHI sdr /* high part is right in little-endian */ -# define LDLO ldl /* low part is left in little-endian */ -# define SDLO sdl /* low part is left in little-endian */ +# define NSIZE 4 +# define NSIZEMASK 0x1f +# define NSIZEDMASK 0x3f #endif +#define UNIT(unit) ((unit)*NSIZE) +#define UNITM1(unit) (((unit)*NSIZE)-1) -ENTRY (memcpy) +#ifdef ANDROID_CHANGES +LEAF(MEMCPY_NAME, 0) +#else +LEAF(MEMCPY_NAME) +#endif + .set nomips16 .set noreorder +/* + * Below we handle the case where memcpy is called with overlapping src and dst. 
+ * Although memcpy is not required to handle this case, some parts of Android + * like Skia rely on such usage. We call memmove to handle such cases. + */ +#ifdef USE_MEMMOVE_FOR_OVERLAP + PTR_SUBU t0,a0,a1 + PTR_SRA t2,t0,31 + xor t1,t0,t2 + PTR_SUBU t0,t1,t2 + sltu t2,t0,a2 + beq t2,zero,L(memcpy) + la t9,memmove + jr t9 + nop +L(memcpy): +#endif +/* + * If the size is less than 2*NSIZE (8 or 16), go to L(lastb). Regardless of + * size, copy dst pointer to v0 for the return value. + */ + slti t2,a2,(2 * NSIZE) + bne t2,zero,L(lasts) +#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH) + move v0,zero +#else + move v0,a0 +#endif - slti t0, a2, 16 # Less than 16? - bne t0, zero, L(last16) - move v0, a0 # Setup exit value before too late - - xor t0, a1, a0 # Find a0/a1 displacement - andi t0, 0x7 - bne t0, zero, L(shift) # Go handle the unaligned case - PTR_SUBU t1, zero, a1 - andi t1, 0x7 # a0/a1 are aligned, but are we - beq t1, zero, L(chk8w) # starting in the middle of a word? - PTR_SUBU a2, t1 - LDHI t0, 0(a1) # Yes we are... take care of that - PTR_ADDU a1, t1 - SDHI t0, 0(a0) - PTR_ADDU a0, t1 - -L(chk8w): - andi t0, a2, 0x3f # 64 or more bytes left? - beq t0, a2, L(chk1w) - PTR_SUBU a3, a2, t0 # Yes - PTR_ADDU a3, a1 # a3 = end address of loop - move a2, t0 # a2 = what will be left after loop -L(lop8w): - ld t0, 0(a1) # Loop taking 8 words at a time - ld t1, 8(a1) - ld t2, 16(a1) - ld t3, 24(a1) - ld ta0, 32(a1) - ld ta1, 40(a1) - ld ta2, 48(a1) - ld ta3, 56(a1) - PTR_ADDIU a0, 64 - PTR_ADDIU a1, 64 - sd t0, -64(a0) - sd t1, -56(a0) - sd t2, -48(a0) - sd t3, -40(a0) - sd ta0, -32(a0) - sd ta1, -24(a0) - sd ta2, -16(a0) - bne a1, a3, L(lop8w) - sd ta3, -8(a0) +#ifndef R6_CODE -L(chk1w): - andi t0, a2, 0x7 # 8 or more bytes left? - beq t0, a2, L(last16) - PTR_SUBU a3, a2, t0 # Yes, handle them one dword at a time - PTR_ADDU a3, a1 # a3 again end address - move a2, t0 -L(lop1w): - ld t0, 0(a1) - PTR_ADDIU a0, 8 - PTR_ADDIU a1, 8 - bne a1, a3, L(lop1w) - sd t0, -8(a0) - -L(last16): - blez a2, L(lst16e) # Handle last 16 bytes, one at a time - PTR_ADDU a3, a2, a1 -L(lst16l): - lb t0, 0(a1) - PTR_ADDIU a0, 1 - PTR_ADDIU a1, 1 - bne a1, a3, L(lst16l) - sb t0, -1(a0) -L(lst16e): - jr ra # Bye, bye - nop +/* + * If src and dst have different alignments, go to L(unaligned), if they + * have the same alignment (but are not actually aligned) do a partial + * load/store to make them aligned. If they are both already aligned + * we can start copying at L(aligned). + */ + xor t8,a1,a0 + andi t8,t8,(NSIZE-1) /* t8 is a0/a1 word-displacement */ + bne t8,zero,L(unaligned) + PTR_SUBU a3, zero, a0 -L(shift): - PTR_SUBU a3, zero, a0 # Src and Dest unaligned - andi a3, 0x7 # (unoptimized case...) 
- beq a3, zero, L(shft1) - PTR_SUBU a2, a3 # a2 = bytes left - LDHI t0, 0(a1) # Take care of first odd part - LDLO t0, 7(a1) - PTR_ADDU a1, a3 - SDHI t0, 0(a0) - PTR_ADDU a0, a3 -L(shft1): - andi t0, a2, 0x7 - PTR_SUBU a3, a2, t0 - PTR_ADDU a3, a1 -L(shfth): - LDHI t1, 0(a1) # Limp through, dword by dword - LDLO t1, 7(a1) - PTR_ADDIU a0, 8 - PTR_ADDIU a1, 8 - bne a1, a3, L(shfth) - sd t1, -8(a0) - b L(last16) # Handle anything which may be left - move a2, t0 + andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */ + beq a3,zero,L(aligned) /* if a3=0, it is already aligned */ + PTR_SUBU a2,a2,a3 /* a2 is the remining bytes count */ - .set reorder -END (memcpy) + C_LDHI t8,0(a1) + PTR_ADDU a1,a1,a3 + C_STHI t8,0(a0) + PTR_ADDU a0,a0,a3 + +#else /* R6_CODE */ + +/* + * Align the destination and hope that the source gets aligned too. If it + * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6 + * align instruction. + */ + andi t8,a0,7 + lapc t9,L(atable) + PTR_LSA t9,t8,t9,2 + jrc t9 +L(atable): + bc L(lb0) + bc L(lb7) + bc L(lb6) + bc L(lb5) + bc L(lb4) + bc L(lb3) + bc L(lb2) + bc L(lb1) +L(lb7): + lb a3, 6(a1) + sb a3, 6(a0) +L(lb6): + lb a3, 5(a1) + sb a3, 5(a0) +L(lb5): + lb a3, 4(a1) + sb a3, 4(a0) +L(lb4): + lb a3, 3(a1) + sb a3, 3(a0) +L(lb3): + lb a3, 2(a1) + sb a3, 2(a0) +L(lb2): + lb a3, 1(a1) + sb a3, 1(a0) +L(lb1): + lb a3, 0(a1) + sb a3, 0(a0) + + li t9,8 + subu t8,t9,t8 + PTR_SUBU a2,a2,t8 + PTR_ADDU a0,a0,t8 + PTR_ADDU a1,a1,t8 +L(lb0): -#else /* !__mips64 */ + andi t8,a1,(NSIZE-1) + lapc t9,L(jtable) + PTR_LSA t9,t8,t9,2 + jrc t9 +L(jtable): + bc L(aligned) + bc L(r6_unaligned1) + bc L(r6_unaligned2) + bc L(r6_unaligned3) +# ifdef USE_DOUBLE + bc L(r6_unaligned4) + bc L(r6_unaligned5) + bc L(r6_unaligned6) + bc L(r6_unaligned7) +# endif +#endif /* R6_CODE */ -#if __BYTE_ORDER == __BIG_ENDIAN -# define LWHI lwl /* high part is left in big-endian */ -# define SWHI swl /* high part is left in big-endian */ -# define LWLO lwr /* low part is right in big-endian */ -# define SWLO swr /* low part is right in big-endian */ +L(aligned): + +/* + * Now dst/src are both aligned to (word or double word) aligned addresses + * Set a2 to count how many bytes we have to copy after all the 64/128 byte + * chunks are copied and a3 to the dst pointer after all the 64/128 byte + * chunks have been copied. We will loop, incrementing a0 and a1 until a0 + * equals a3. + */ + + andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */ + beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */ + PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */ + PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */ + +/* When in the loop we may prefetch with the 'prepare to store' hint, + * in this case the a0+x should not be past the "t0-32" address. This + * means: for x=128 the last "safe" a0 address is "t0-160". Alternatively, + * for x=64 the last "safe" a0 address is "t0-96" In the current version we + * will use "prefetch hint,128(a0)", so "t0-160" is the limit. 
+ */ +#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) + PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */ + PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */ +#endif + PREFETCH_FOR_LOAD (0, a1) + PREFETCH_FOR_LOAD (1, a1) + PREFETCH_FOR_LOAD (2, a1) + PREFETCH_FOR_LOAD (3, a1) +#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) + PREFETCH_FOR_STORE (1, a0) + PREFETCH_FOR_STORE (2, a0) + PREFETCH_FOR_STORE (3, a0) +#endif +#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH) +# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE + sltu v1,t9,a0 + bgtz v1,L(skip_set) + nop + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4) +L(skip_set): +# else + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1) +# endif +#endif +#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \ + && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3) +# ifdef USE_DOUBLE + PTR_ADDIU v0,v0,32 +# endif +#endif +L(loop16w): + C_LD t0,UNIT(0)(a1) +#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) + sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */ + bgtz v1,L(skip_pref) +#endif + C_LD t1,UNIT(1)(a1) +#ifdef R6_CODE + PREFETCH_FOR_STORE (2, a0) #else -# define LWHI lwr /* high part is right in little-endian */ -# define SWHI swr /* high part is right in little-endian */ -# define LWLO lwl /* low part is left in little-endian */ -# define SWLO swl /* low part is left in little-endian */ + PREFETCH_FOR_STORE (4, a0) + PREFETCH_FOR_STORE (5, a0) +#endif +#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5) +# ifdef USE_DOUBLE + PTR_ADDIU v0,v0,32 +# endif #endif +L(skip_pref): + C_LD REG2,UNIT(2)(a1) + C_LD REG3,UNIT(3)(a1) + C_LD REG4,UNIT(4)(a1) + C_LD REG5,UNIT(5)(a1) + C_LD REG6,UNIT(6)(a1) + C_LD REG7,UNIT(7)(a1) +#ifdef R6_CODE + PREFETCH_FOR_LOAD (3, a1) +#else + PREFETCH_FOR_LOAD (4, a1) +#endif + C_ST t0,UNIT(0)(a0) + C_ST t1,UNIT(1)(a0) + C_ST REG2,UNIT(2)(a0) + C_ST REG3,UNIT(3)(a0) + C_ST REG4,UNIT(4)(a0) + C_ST REG5,UNIT(5)(a0) + C_ST REG6,UNIT(6)(a0) + C_ST REG7,UNIT(7)(a0) -ENTRY (memcpy) - .set noreorder + C_LD t0,UNIT(8)(a1) + C_LD t1,UNIT(9)(a1) + C_LD REG2,UNIT(10)(a1) + C_LD REG3,UNIT(11)(a1) + C_LD REG4,UNIT(12)(a1) + C_LD REG5,UNIT(13)(a1) + C_LD REG6,UNIT(14)(a1) + C_LD REG7,UNIT(15)(a1) +#ifndef R6_CODE + PREFETCH_FOR_LOAD (5, a1) +#endif + C_ST t0,UNIT(8)(a0) + C_ST t1,UNIT(9)(a0) + C_ST REG2,UNIT(10)(a0) + C_ST REG3,UNIT(11)(a0) + C_ST REG4,UNIT(12)(a0) + C_ST REG5,UNIT(13)(a0) + C_ST REG6,UNIT(14)(a0) + C_ST REG7,UNIT(15)(a0) + PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ + bne a0,a3,L(loop16w) + PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */ + move a2,t8 + +/* Here we have src and dest word-aligned but less than 64-bytes or + * 128 bytes to go. Check for a 32(64) byte chunk and copy if if there + * is one. Otherwise jump down to L(chk1w) to handle the tail end of + * the copy. + */ + +L(chkw): + PREFETCH_FOR_LOAD (0, a1) + andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. 
*/ + /* The t8 is the reminder count past 32-bytes */ + beq a2,t8,L(chk1w) /* When a2=t8, no 32-byte chunk */ + nop + C_LD t0,UNIT(0)(a1) + C_LD t1,UNIT(1)(a1) + C_LD REG2,UNIT(2)(a1) + C_LD REG3,UNIT(3)(a1) + C_LD REG4,UNIT(4)(a1) + C_LD REG5,UNIT(5)(a1) + C_LD REG6,UNIT(6)(a1) + C_LD REG7,UNIT(7)(a1) + PTR_ADDIU a1,a1,UNIT(8) + C_ST t0,UNIT(0)(a0) + C_ST t1,UNIT(1)(a0) + C_ST REG2,UNIT(2)(a0) + C_ST REG3,UNIT(3)(a0) + C_ST REG4,UNIT(4)(a0) + C_ST REG5,UNIT(5)(a0) + C_ST REG6,UNIT(6)(a0) + C_ST REG7,UNIT(7)(a0) + PTR_ADDIU a0,a0,UNIT(8) + +/* + * Here we have less than 32(64) bytes to copy. Set up for a loop to + * copy one word (or double word) at a time. Set a2 to count how many + * bytes we have to copy after all the word (or double word) chunks are + * copied and a3 to the dst pointer after all the (d)word chunks have + * been copied. We will loop, incrementing a0 and a1 until a0 equals a3. + */ +L(chk1w): + andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */ + beq a2,t8,L(lastw) + PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */ + PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */ - slti t0, a2, 8 # Less than 8? - bne t0, zero, L(last8) - move v0, a0 # Setup exit value before too late - - xor t0, a1, a0 # Find a0/a1 displacement - andi t0, 0x3 - bne t0, zero, L(shift) # Go handle the unaligned case - subu t1, zero, a1 - andi t1, 0x3 # a0/a1 are aligned, but are we - beq t1, zero, L(chk8w) # starting in the middle of a word? - subu a2, t1 - LWHI t0, 0(a1) # Yes we are... take care of that - addu a1, t1 - SWHI t0, 0(a0) - addu a0, t1 - -L(chk8w): - andi t0, a2, 0x1f # 32 or more bytes left? - beq t0, a2, L(chk1w) - subu a3, a2, t0 # Yes - addu a3, a1 # a3 = end address of loop - move a2, t0 # a2 = what will be left after loop -L(lop8w): - lw t0, 0(a1) # Loop taking 8 words at a time - lw t1, 4(a1) - lw t2, 8(a1) - lw t3, 12(a1) - lw t4, 16(a1) - lw t5, 20(a1) - lw t6, 24(a1) - lw t7, 28(a1) - addiu a0, 32 - addiu a1, 32 - sw t0, -32(a0) - sw t1, -28(a0) - sw t2, -24(a0) - sw t3, -20(a0) - sw t4, -16(a0) - sw t5, -12(a0) - sw t6, -8(a0) - bne a1, a3, L(lop8w) - sw t7, -4(a0) - -L(chk1w): - andi t0, a2, 0x3 # 4 or more bytes left? - beq t0, a2, L(last8) - subu a3, a2, t0 # Yes, handle them one word at a time - addu a3, a1 # a3 again end address - move a2, t0 -L(lop1w): - lw t0, 0(a1) - addiu a0, 4 - addiu a1, 4 - bne a1, a3, L(lop1w) - sw t0, -4(a0) - -L(last8): - blez a2, L(lst8e) # Handle last 8 bytes, one at a time - addu a3, a2, a1 -L(lst8l): - lb t0, 0(a1) - addiu a0, 1 - addiu a1, 1 - bne a1, a3, L(lst8l) - sb t0, -1(a0) -L(lst8e): - jr ra # Bye, bye +/* copying in words (4-byte or 8-byte chunks) */ +L(wordCopy_loop): + C_LD REG3,UNIT(0)(a1) + PTR_ADDIU a0,a0,UNIT(1) + PTR_ADDIU a1,a1,UNIT(1) + bne a0,a3,L(wordCopy_loop) + C_ST REG3,UNIT(-1)(a0) + +/* If we have been copying double words, see if we can copy a single word + before doing byte copies. We can have, at most, one word to copy. */ + +L(lastw): +#ifdef USE_DOUBLE + andi t8,a2,3 /* a2 is the remainder past 4 byte chunks. */ + beq t8,a2,L(lastb) + move a2,t8 + lw REG3,0(a1) + sw REG3,0(a0) + PTR_ADDIU a0,a0,4 + PTR_ADDIU a1,a1,4 +#endif + +/* Copy the last 8 (or 16) bytes */ +L(lastb): + blez a2,L(leave) + PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ +L(lastbloop): + lb v1,0(a1) + PTR_ADDIU a0,a0,1 + PTR_ADDIU a1,a1,1 + bne a0,a3,L(lastbloop) + sb v1,-1(a0) +L(leave): + j ra nop -L(shift): - subu a3, zero, a0 # Src and Dest unaligned - andi a3, 0x3 # (unoptimized case...) 
- beq a3, zero, L(shft1) - subu a2, a3 # a2 = bytes left - LWHI t0, 0(a1) # Take care of first odd part - LWLO t0, 3(a1) - addu a1, a3 - SWHI t0, 0(a0) - addu a0, a3 -L(shft1): - andi t0, a2, 0x3 - subu a3, a2, t0 - addu a3, a1 -L(shfth): - LWHI t1, 0(a1) # Limp through, word by word - LWLO t1, 3(a1) - addiu a0, 4 - addiu a1, 4 - bne a1, a3, L(shfth) - sw t1, -4(a0) - b L(last8) # Handle anything which may be left - move a2, t0 +/* We jump here with a memcpy of less than 8 or 16 bytes, depending on + whether or not USE_DOUBLE is defined. Instead of just doing byte + copies, check the alignment and size and use lw/sw if possible. + Otherwise, do byte copies. */ - .set reorder -END (memcpy) +L(lasts): + andi t8,a2,3 + beq t8,a2,L(lastb) + + andi t9,a0,3 + bne t9,zero,L(lastb) + andi t9,a1,3 + bne t9,zero,L(lastb) + + PTR_SUBU a3,a2,t8 + PTR_ADDU a3,a0,a3 + +L(wcopy_loop): + lw REG3,0(a1) + PTR_ADDIU a0,a0,4 + PTR_ADDIU a1,a1,4 + bne a0,a3,L(wcopy_loop) + sw REG3,-4(a0) -#endif /* !__mips64 */ + b L(lastb) + move a2,t8 -libc_hidden_def(memcpy) +#ifndef R6_CODE +/* + * UNALIGNED case, got here with a3 = "negu a0" + * This code is nearly identical to the aligned code above + * but only the destination (not the source) gets aligned + * so we need to do partial loads of the source followed + * by normal stores to the destination (once we have aligned + * the destination). + */ + +L(unaligned): + andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */ + beqz a3,L(ua_chk16w) /* if a3=0, it is already aligned */ + PTR_SUBU a2,a2,a3 /* a2 is the remining bytes count */ + + C_LDHI v1,UNIT(0)(a1) + C_LDLO v1,UNITM1(1)(a1) + PTR_ADDU a1,a1,a3 + C_STHI v1,UNIT(0)(a0) + PTR_ADDU a0,a0,a3 + +/* + * Now the destination (but not the source) is aligned + * Set a2 to count how many bytes we have to copy after all the 64/128 byte + * chunks are copied and a3 to the dst pointer after all the 64/128 byte + * chunks have been copied. We will loop, incrementing a0 and a1 until a0 + * equals a3. + */ + +L(ua_chk16w): + andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? 
*/ + beq a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */ + PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */ + PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */ + +# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) + PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */ + PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */ +# endif + PREFETCH_FOR_LOAD (0, a1) + PREFETCH_FOR_LOAD (1, a1) + PREFETCH_FOR_LOAD (2, a1) +# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) + PREFETCH_FOR_STORE (1, a0) + PREFETCH_FOR_STORE (2, a0) + PREFETCH_FOR_STORE (3, a0) +# endif +# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH) +# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) + sltu v1,t9,a0 + bgtz v1,L(ua_skip_set) + nop + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4) +L(ua_skip_set): +# else + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1) +# endif +# endif +L(ua_loop16w): + PREFETCH_FOR_LOAD (3, a1) + C_LDHI t0,UNIT(0)(a1) + C_LDHI t1,UNIT(1)(a1) + C_LDHI REG2,UNIT(2)(a1) +# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) + sltu v1,t9,a0 + bgtz v1,L(ua_skip_pref) +# endif + C_LDHI REG3,UNIT(3)(a1) + PREFETCH_FOR_STORE (4, a0) + PREFETCH_FOR_STORE (5, a0) +L(ua_skip_pref): + C_LDHI REG4,UNIT(4)(a1) + C_LDHI REG5,UNIT(5)(a1) + C_LDHI REG6,UNIT(6)(a1) + C_LDHI REG7,UNIT(7)(a1) + C_LDLO t0,UNITM1(1)(a1) + C_LDLO t1,UNITM1(2)(a1) + C_LDLO REG2,UNITM1(3)(a1) + C_LDLO REG3,UNITM1(4)(a1) + C_LDLO REG4,UNITM1(5)(a1) + C_LDLO REG5,UNITM1(6)(a1) + C_LDLO REG6,UNITM1(7)(a1) + C_LDLO REG7,UNITM1(8)(a1) + PREFETCH_FOR_LOAD (4, a1) + C_ST t0,UNIT(0)(a0) + C_ST t1,UNIT(1)(a0) + C_ST REG2,UNIT(2)(a0) + C_ST REG3,UNIT(3)(a0) + C_ST REG4,UNIT(4)(a0) + C_ST REG5,UNIT(5)(a0) + C_ST REG6,UNIT(6)(a0) + C_ST REG7,UNIT(7)(a0) + C_LDHI t0,UNIT(8)(a1) + C_LDHI t1,UNIT(9)(a1) + C_LDHI REG2,UNIT(10)(a1) + C_LDHI REG3,UNIT(11)(a1) + C_LDHI REG4,UNIT(12)(a1) + C_LDHI REG5,UNIT(13)(a1) + C_LDHI REG6,UNIT(14)(a1) + C_LDHI REG7,UNIT(15)(a1) + C_LDLO t0,UNITM1(9)(a1) + C_LDLO t1,UNITM1(10)(a1) + C_LDLO REG2,UNITM1(11)(a1) + C_LDLO REG3,UNITM1(12)(a1) + C_LDLO REG4,UNITM1(13)(a1) + C_LDLO REG5,UNITM1(14)(a1) + C_LDLO REG6,UNITM1(15)(a1) + C_LDLO REG7,UNITM1(16)(a1) + PREFETCH_FOR_LOAD (5, a1) + C_ST t0,UNIT(8)(a0) + C_ST t1,UNIT(9)(a0) + C_ST REG2,UNIT(10)(a0) + C_ST REG3,UNIT(11)(a0) + C_ST REG4,UNIT(12)(a0) + C_ST REG5,UNIT(13)(a0) + C_ST REG6,UNIT(14)(a0) + C_ST REG7,UNIT(15)(a0) + PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ + bne a0,a3,L(ua_loop16w) + PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */ + move a2,t8 + +/* Here we have src and dest word-aligned but less than 64-bytes or + * 128 bytes to go. Check for a 32(64) byte chunk and copy if if there + * is one. Otherwise jump down to L(ua_chk1w) to handle the tail end of + * the copy. */ + +L(ua_chkw): + PREFETCH_FOR_LOAD (0, a1) + andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. 
*/ + /* t8 is the reminder count past 32-bytes */ + beq a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */ + nop + C_LDHI t0,UNIT(0)(a1) + C_LDHI t1,UNIT(1)(a1) + C_LDHI REG2,UNIT(2)(a1) + C_LDHI REG3,UNIT(3)(a1) + C_LDHI REG4,UNIT(4)(a1) + C_LDHI REG5,UNIT(5)(a1) + C_LDHI REG6,UNIT(6)(a1) + C_LDHI REG7,UNIT(7)(a1) + C_LDLO t0,UNITM1(1)(a1) + C_LDLO t1,UNITM1(2)(a1) + C_LDLO REG2,UNITM1(3)(a1) + C_LDLO REG3,UNITM1(4)(a1) + C_LDLO REG4,UNITM1(5)(a1) + C_LDLO REG5,UNITM1(6)(a1) + C_LDLO REG6,UNITM1(7)(a1) + C_LDLO REG7,UNITM1(8)(a1) + PTR_ADDIU a1,a1,UNIT(8) + C_ST t0,UNIT(0)(a0) + C_ST t1,UNIT(1)(a0) + C_ST REG2,UNIT(2)(a0) + C_ST REG3,UNIT(3)(a0) + C_ST REG4,UNIT(4)(a0) + C_ST REG5,UNIT(5)(a0) + C_ST REG6,UNIT(6)(a0) + C_ST REG7,UNIT(7)(a0) + PTR_ADDIU a0,a0,UNIT(8) +/* + * Here we have less than 32(64) bytes to copy. Set up for a loop to + * copy one word (or double word) at a time. + */ +L(ua_chk1w): + andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */ + beq a2,t8,L(ua_smallCopy) + PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */ + PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */ + +/* copying in words (4-byte or 8-byte chunks) */ +L(ua_wordCopy_loop): + C_LDHI v1,UNIT(0)(a1) + C_LDLO v1,UNITM1(1)(a1) + PTR_ADDIU a0,a0,UNIT(1) + PTR_ADDIU a1,a1,UNIT(1) + bne a0,a3,L(ua_wordCopy_loop) + C_ST v1,UNIT(-1)(a0) + +/* Copy the last 8 (or 16) bytes */ +L(ua_smallCopy): + beqz a2,L(leave) + PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ +L(ua_smallCopy_loop): + lb v1,0(a1) + PTR_ADDIU a0,a0,1 + PTR_ADDIU a1,a1,1 + bne a0,a3,L(ua_smallCopy_loop) + sb v1,-1(a0) + + j ra + nop + +#else /* R6_CODE */ + +# ifdef __MIPSEB +# define SWAP_REGS(X,Y) X, Y +# define ALIGN_OFFSET(N) (N) +# else +# define SWAP_REGS(X,Y) Y, X +# define ALIGN_OFFSET(N) (NSIZE-N) +# endif +# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \ + andi REG7, a2, (NSIZE-1);/* REG7 is # of bytes to by bytes. */ \ + beq REG7, a2, L(lastb); /* Check for bytes to copy by word */ \ + PTR_SUBU a3, a2, REG7; /* a3 is number of bytes to be copied in */ \ + /* (d)word chunks. */ \ + move a2, REG7; /* a2 is # of bytes to copy byte by byte */ \ + /* after word loop is finished. */ \ + PTR_ADDU REG6, a0, a3; /* REG6 is the dst address after loop. */ \ + PTR_SUBU REG2, a1, t8; /* REG2 is the aligned src address. */ \ + PTR_ADDU a1, a1, a3; /* a1 is addr of source after word loop. */ \ + C_LD t0, UNIT(0)(REG2); /* Load first part of source. */ \ +L(r6_ua_wordcopy##BYTEOFFSET): \ + C_LD t1, UNIT(1)(REG2); /* Load second part of source. */ \ + C_ALIGN REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET); \ + PTR_ADDIU a0, a0, UNIT(1); /* Increment destination pointer. */ \ + PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \ + move t0, t1; /* Move second part of source to first. */ \ + bne a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET); \ + C_ST REG3, UNIT(-1)(a0); \ + j L(lastb); \ + nop + + /* We are generating R6 code, the destination is 4 byte aligned and + the source is not 4 byte aligned. t8 is 1, 2, or 3 depending on the + alignment of the source. 
*/ + +L(r6_unaligned1): + R6_UNALIGNED_WORD_COPY(1) +L(r6_unaligned2): + R6_UNALIGNED_WORD_COPY(2) +L(r6_unaligned3): + R6_UNALIGNED_WORD_COPY(3) +# ifdef USE_DOUBLE +L(r6_unaligned4): + R6_UNALIGNED_WORD_COPY(4) +L(r6_unaligned5): + R6_UNALIGNED_WORD_COPY(5) +L(r6_unaligned6): + R6_UNALIGNED_WORD_COPY(6) +L(r6_unaligned7): + R6_UNALIGNED_WORD_COPY(7) +# endif +#endif /* R6_CODE */ + + .set at + .set reorder +END(MEMCPY_NAME) +#ifndef ANDROID_CHANGES +# ifdef _LIBC +# ifdef __UCLIBC__ +libc_hidden_def(MEMCPY_NAME) +# else +libc_hidden_builtin_def (MEMCPY_NAME) +# endif +# endif +#endif diff --git a/libc/string/mips/memset.S b/libc/string/mips/memset.S index ff0554ff9..43034cebb 100644 --- a/libc/string/mips/memset.S +++ b/libc/string/mips/memset.S @@ -1,6 +1,5 @@ -/* Copyright (C) 2002, 2003 Free Software Foundation, Inc. +/* Copyright (C) 2013-2015 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Hartvig Ekner <hartvige@mips.com>, 2002. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -13,147 +12,420 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ -#include <features.h> -/*#include <sysdep.h>*/ -#include <endian.h> -#include "sysdep.h" +#ifdef ANDROID_CHANGES +# include "machine/asm.h" +# include "machine/regdef.h" +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE +#elif _LIBC +# include <sysdep.h> +# include <sys/regdef.h> +# include <sys/asm.h> +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE +#elif defined _COMPILING_NEWLIB +# include "machine/asm.h" +# include "machine/regdef.h" +# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE +#else +# include <sys/regdef.h> +# include <sys/asm.h> +#endif + +/* Check to see if the MIPS architecture we are compiling for supports + prefetching. */ + +#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64) +# ifndef DISABLE_PREFETCH +# define USE_PREFETCH +# endif +#endif + +#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)) +# ifndef DISABLE_DOUBLE +# define USE_DOUBLE +# endif +#endif + +#ifndef USE_DOUBLE +# ifndef DISABLE_DOUBLE_ALIGN +# define DOUBLE_ALIGN +# endif +#endif + + +/* Some asm.h files do not have the L macro definition. */ +#ifndef L +# if _MIPS_SIM == _ABIO32 +# define L(label) $L ## label +# else +# define L(label) .L ## label +# endif +#endif + +/* Some asm.h files do not have the PTR_ADDIU macro definition. */ +#ifndef PTR_ADDIU +# ifdef USE_DOUBLE +# define PTR_ADDIU daddiu +# else +# define PTR_ADDIU addiu +# endif +#endif -/* void *memset(void *s, int c, size_t n). */ +/* New R6 instructions that may not be in asm.h. */ +#ifndef PTR_LSA +# if _MIPS_SIM == _ABI64 +# define PTR_LSA dlsa +# else +# define PTR_LSA lsa +# endif +#endif + +/* Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE + or PREFETCH_STORE_STREAMED offers a large performance advantage + but PREPAREFORSTORE has some special restrictions to consider. + + Prefetch with the 'prepare for store' hint does not copy a memory + location into the cache, it just allocates a cache line and zeros + it out. 
This means that if you do not write to the entire cache + line before writing it out to memory some data will get zero'ed out + when the cache line is written back to memory and data will be lost. + + There are ifdef'ed sections of this memcpy to make sure that it does not + do prefetches on cache lines that are not going to be completely written. + This code is only needed and only used when PREFETCH_STORE_HINT is set to + PREFETCH_HINT_PREPAREFORSTORE. This code assumes that cache lines are + less than MAX_PREFETCH_SIZE bytes and if the cache line is larger it will + not work correctly. */ + +#ifdef USE_PREFETCH +# define PREFETCH_HINT_STORE 1 +# define PREFETCH_HINT_STORE_STREAMED 5 +# define PREFETCH_HINT_STORE_RETAINED 7 +# define PREFETCH_HINT_PREPAREFORSTORE 30 + +/* If we have not picked out what hints to use at this point use the + standard load and store prefetch hints. */ +# ifndef PREFETCH_STORE_HINT +# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE +# endif + +/* We double everything when USE_DOUBLE is true so we do 2 prefetches to + get 64 bytes in that case. The assumption is that each individual + prefetch brings in 32 bytes. */ +# ifdef USE_DOUBLE +# define PREFETCH_CHUNK 64 +# define PREFETCH_FOR_STORE(chunk, reg) \ + pref PREFETCH_STORE_HINT, (chunk)*64(reg); \ + pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg) +# else +# define PREFETCH_CHUNK 32 +# define PREFETCH_FOR_STORE(chunk, reg) \ + pref PREFETCH_STORE_HINT, (chunk)*32(reg) +# endif -#ifdef __mips64 +/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less + than PREFETCH_CHUNK, the assumed size of each prefetch. If the real size + of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE + hint is used, the code will not work correctly. If PREPAREFORSTORE is not + used than MAX_PREFETCH_SIZE does not matter. */ +# define MAX_PREFETCH_SIZE 128 +/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater + than 5 on a STORE prefetch and that a single prefetch can never be larger + than MAX_PREFETCH_SIZE. We add the extra 32 when USE_DOUBLE is set because + we actually do two prefetches in that case, one 32 bytes after the other. */ +# ifdef USE_DOUBLE +# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE +# else +# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE +# endif -#include <sys/asm.h> +# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \ + && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE) +/* We cannot handle this because the initial prefetches may fetch bytes that + are before the buffer being copied. We start copies with an offset + of 4 so avoid this situation when using PREPAREFORSTORE. */ +# error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small." +# endif +#else /* USE_PREFETCH not defined */ +# define PREFETCH_FOR_STORE(offset, reg) +#endif + +#if __mips_isa_rev > 5 +# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) +# undef PREFETCH_STORE_HINT +# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED +# endif +# define R6_CODE +#endif -#if __BYTE_ORDER == __BIG_ENDIAN -# define SDHI sdl /* high part is left in big-endian */ +/* Allow the routine to be named something else if desired. */ +#ifndef MEMSET_NAME +# define MEMSET_NAME memset +#endif + +/* We load/store 64 bits at a time when USE_DOUBLE is true. + The C_ prefix stands for CHUNK and is used to avoid macro name + conflicts with system header files. 
*/ + +#ifdef USE_DOUBLE +# define C_ST sd +# ifdef __MIPSEB +# define C_STHI sdl /* high part is left in big-endian */ +# else +# define C_STHI sdr /* high part is right in little-endian */ +# endif #else -# define SDHI sdr /* high part is right in little-endian */ +# define C_ST sw +# ifdef __MIPSEB +# define C_STHI swl /* high part is left in big-endian */ +# else +# define C_STHI swr /* high part is right in little-endian */ +# endif #endif -ENTRY (memset) - .set noreorder +/* Bookkeeping values for 32 vs. 64 bit mode. */ +#ifdef USE_DOUBLE +# define NSIZE 8 +# define NSIZEMASK 0x3f +# define NSIZEDMASK 0x7f +#else +# define NSIZE 4 +# define NSIZEMASK 0x1f +# define NSIZEDMASK 0x3f +#endif +#define UNIT(unit) ((unit)*NSIZE) +#define UNITM1(unit) (((unit)*NSIZE)-1) - slti ta1, a2, 16 # Less than 16? - bne ta1, zero, L(last16) - move v0, a0 # Setup exit value before too late - - beq a1, zero, L(ueven) # If zero pattern, no need to extend - andi a1, 0xff # Avoid problems with bogus arguments - dsll ta0, a1, 8 - or a1, ta0 - dsll ta0, a1, 16 - or a1, ta0 # a1 is now pattern in full word - dsll ta0, a1, 32 - or a1, ta0 # a1 is now pattern in double word - -L(ueven): - PTR_SUBU ta0, zero, a0 # Unaligned address? - andi ta0, 0x7 - beq ta0, zero, L(chkw) - PTR_SUBU a2, ta0 - SDHI a1, 0(a0) # Yes, handle first unaligned part - PTR_ADDU a0, ta0 # Now both a0 and a2 are updated +#ifdef ANDROID_CHANGES +LEAF(MEMSET_NAME,0) +#else +LEAF(MEMSET_NAME) +#endif -L(chkw): - andi ta0, a2, 0xf # Enough left for one loop iteration? - beq ta0, a2, L(chkl) - PTR_SUBU a3, a2, ta0 - PTR_ADDU a3, a0 # a3 is last loop address +1 - move a2, ta0 # a2 is now # of bytes left after loop -L(loopw): - PTR_ADDIU a0, 16 # Handle 2 dwords pr. iteration - sd a1, -16(a0) - bne a0, a3, L(loopw) - sd a1, -8(a0) - -L(chkl): - andi ta0, a2, 0x8 # Check if there is at least a double - beq ta0, zero, L(last16) # word remaining after the loop - PTR_SUBU a2, ta0 - sd a1, 0(a0) # Yes... - PTR_ADDIU a0, 8 - -L(last16): - blez a2, L(exit) # Handle last 16 bytes (if cnt>0) - PTR_ADDU a3, a2, a0 # a3 is last address +1 -L(lst16l): - PTR_ADDIU a0, 1 - bne a0, a3, L(lst16l) - sb a1, -1(a0) -L(exit): - j ra # Bye, bye + .set nomips16 + .set noreorder +/* If the size is less than 2*NSIZE (8 or 16), go to L(lastb). Regardless of + size, copy dst pointer to v0 for the return value. */ + slti t2,a2,(2 * NSIZE) + bne t2,zero,L(lastb) + move v0,a0 + +/* If memset value is not zero, we copy it to all the bytes in a 32 or 64 + bit word. */ + beq a1,zero,L(set0) /* If memset value is zero no smear */ + PTR_SUBU a3,zero,a0 nop - .set reorder -END (memset) + /* smear byte into 32 or 64 bit word */ +#if ((__mips == 64) || (__mips == 32)) && (__mips_isa_rev >= 2) +# ifdef USE_DOUBLE + dins a1, a1, 8, 8 /* Replicate fill byte into half-word. */ + dins a1, a1, 16, 16 /* Replicate fill byte into word. */ + dins a1, a1, 32, 32 /* Replicate fill byte into dbl word. */ +# else + ins a1, a1, 8, 8 /* Replicate fill byte into half-word. */ + ins a1, a1, 16, 16 /* Replicate fill byte into word. */ +# endif +#else +# ifdef USE_DOUBLE + and a1,0xff + dsll t2,a1,8 + or a1,t2 + dsll t2,a1,16 + or a1,t2 + dsll t2,a1,32 + or a1,t2 +# else + and a1,0xff + sll t2,a1,8 + or a1,t2 + sll t2,a1,16 + or a1,t2 +# endif +#endif + +/* If the destination address is not aligned do a partial store to get it + aligned. If it is already aligned just jump to L(aligned). */ +L(set0): +#ifndef R6_CODE + andi t2,a3,(NSIZE-1) /* word-unaligned address? 
*/ + beq t2,zero,L(aligned) /* t2 is the unalignment count */ + PTR_SUBU a2,a2,t2 + C_STHI a1,0(a0) + PTR_ADDU a0,a0,t2 +#else /* R6_CODE */ + andi t2,a0,(NSIZE-1) + lapc t9,L(atable) + PTR_LSA t9,t2,t9,2 + jrc t9 +L(atable): + bc L(aligned) +# ifdef USE_DOUBLE + bc L(lb7) + bc L(lb6) + bc L(lb5) + bc L(lb4) +# endif + bc L(lb3) + bc L(lb2) + bc L(lb1) +L(lb7): + sb a1,6(a0) +L(lb6): + sb a1,5(a0) +L(lb5): + sb a1,4(a0) +L(lb4): + sb a1,3(a0) +L(lb3): + sb a1,2(a0) +L(lb2): + sb a1,1(a0) +L(lb1): + sb a1,0(a0) + + li t9,NSIZE + subu t2,t9,t2 + PTR_SUBU a2,a2,t2 + PTR_ADDU a0,a0,t2 +#endif /* R6_CODE */ + +L(aligned): +/* If USE_DOUBLE is not set we may still want to align the data on a 16 + byte boundry instead of an 8 byte boundry to maximize the opportunity + of proAptiv chips to do memory bonding (combining two sequential 4 + byte stores into one 8 byte store). We know there are at least 4 bytes + left to store or we would have jumped to L(lastb) earlier in the code. */ +#ifdef DOUBLE_ALIGN + andi t2,a3,4 + beq t2,zero,L(double_aligned) + PTR_SUBU a2,a2,t2 + sw a1,0(a0) + PTR_ADDU a0,a0,t2 +L(double_aligned): +#endif -#else /* !__mips64 */ +/* Now the destination is aligned to (word or double word) aligned address + Set a2 to count how many bytes we have to copy after all the 64/128 byte + chunks are copied and a3 to the dest pointer after all the 64/128 byte + chunks have been copied. We will loop, incrementing a0 until it equals + a3. */ + andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */ + beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */ + PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */ + PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */ -#if __BYTE_ORDER == __BIG_ENDIAN -# define SWHI swl /* high part is left in big-endian */ +/* When in the loop we may prefetch with the 'prepare to store' hint, + in this case the a0+x should not be past the "t0-32" address. This + means: for x=128 the last "safe" a0 address is "t0-160". Alternatively, + for x=64 the last "safe" a0 address is "t0-96" In the current version we + will use "prefetch hint,128(a0)", so "t0-160" is the limit. */ +#if defined(USE_PREFETCH) \ + && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) + PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */ + PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */ +#endif +#if defined(USE_PREFETCH) \ + && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) + PREFETCH_FOR_STORE (1, a0) + PREFETCH_FOR_STORE (2, a0) + PREFETCH_FOR_STORE (3, a0) +#endif + +L(loop16w): +#if defined(USE_PREFETCH) \ + && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) + sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */ + bgtz v1,L(skip_pref) + nop +#endif +#ifdef R6_CODE + PREFETCH_FOR_STORE (2, a0) #else -# define SWHI swr /* high part is right in little-endian */ + PREFETCH_FOR_STORE (4, a0) + PREFETCH_FOR_STORE (5, a0) #endif +L(skip_pref): + C_ST a1,UNIT(0)(a0) + C_ST a1,UNIT(1)(a0) + C_ST a1,UNIT(2)(a0) + C_ST a1,UNIT(3)(a0) + C_ST a1,UNIT(4)(a0) + C_ST a1,UNIT(5)(a0) + C_ST a1,UNIT(6)(a0) + C_ST a1,UNIT(7)(a0) + C_ST a1,UNIT(8)(a0) + C_ST a1,UNIT(9)(a0) + C_ST a1,UNIT(10)(a0) + C_ST a1,UNIT(11)(a0) + C_ST a1,UNIT(12)(a0) + C_ST a1,UNIT(13)(a0) + C_ST a1,UNIT(14)(a0) + C_ST a1,UNIT(15)(a0) + PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ + bne a0,a3,L(loop16w) + nop + move a2,t8 -ENTRY (memset) - .set noreorder +/* Here we have dest word-aligned but less than 64-bytes or 128 bytes to go. 
+ Check for a 32(64) byte chunk and copy if if there is one. Otherwise + jump down to L(chk1w) to handle the tail end of the copy. */ +L(chkw): + andi t8,a2,NSIZEMASK /* is there a 32-byte/64-byte chunk. */ + /* the t8 is the reminder count past 32-bytes */ + beq a2,t8,L(chk1w)/* when a2==t8, no 32-byte chunk */ + nop + C_ST a1,UNIT(0)(a0) + C_ST a1,UNIT(1)(a0) + C_ST a1,UNIT(2)(a0) + C_ST a1,UNIT(3)(a0) + C_ST a1,UNIT(4)(a0) + C_ST a1,UNIT(5)(a0) + C_ST a1,UNIT(6)(a0) + C_ST a1,UNIT(7)(a0) + PTR_ADDIU a0,a0,UNIT(8) + +/* Here we have less than 32(64) bytes to set. Set up for a loop to + copy one word (or double word) at a time. Set a2 to count how many + bytes we have to copy after all the word (or double word) chunks are + copied and a3 to the dest pointer after all the (d)word chunks have + been copied. We will loop, incrementing a0 until a0 equals a3. */ +L(chk1w): + andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */ + beq a2,t8,L(lastb) + PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */ + PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */ - slti t1, a2, 8 # Less than 8? - bne t1, zero, L(last8) - move v0, a0 # Setup exit value before too late - - beq a1, zero, L(ueven) # If zero pattern, no need to extend - andi a1, 0xff # Avoid problems with bogus arguments - sll t0, a1, 8 - or a1, t0 - sll t0, a1, 16 - or a1, t0 # a1 is now pattern in full word - -L(ueven): - subu t0, zero, a0 # Unaligned address? - andi t0, 0x3 - beq t0, zero, L(chkw) - subu a2, t0 - SWHI a1, 0(a0) # Yes, handle first unaligned part - addu a0, t0 # Now both a0 and a2 are updated - -L(chkw): - andi t0, a2, 0x7 # Enough left for one loop iteration? - beq t0, a2, L(chkl) - subu a3, a2, t0 - addu a3, a0 # a3 is last loop address +1 - move a2, t0 # a2 is now # of bytes left after loop -L(loopw): - addiu a0, 8 # Handle 2 words pr. iteration - sw a1, -8(a0) - bne a0, a3, L(loopw) - sw a1, -4(a0) - -L(chkl): - andi t0, a2, 0x4 # Check if there is at least a full - beq t0, zero, L(last8) # word remaining after the loop - subu a2, t0 - sw a1, 0(a0) # Yes... - addiu a0, 4 - -L(last8): - blez a2, L(exit) # Handle last 8 bytes (if cnt>0) - addu a3, a2, a0 # a3 is last address +1 -L(lst8l): - addiu a0, 1 - bne a0, a3, L(lst8l) - sb a1, -1(a0) -L(exit): - j ra # Bye, bye +/* copying in words (4-byte or 8 byte chunks) */ +L(wordCopy_loop): + PTR_ADDIU a0,a0,UNIT(1) + bne a0,a3,L(wordCopy_loop) + C_ST a1,UNIT(-1)(a0) + +/* Copy the last 8 (or 16) bytes */ +L(lastb): + blez a2,L(leave) + PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ +L(lastbloop): + PTR_ADDIU a0,a0,1 + bne a0,a3,L(lastbloop) + sb a1,-1(a0) +L(leave): + j ra nop + .set at .set reorder -END (memset) - -#endif /* !__mips64 */ +END(MEMSET_NAME) +#ifndef ANDROID_CHANGES +# ifdef _LIBC +# ifdef __UCLIBC__ +libc_hidden_def(MEMSET_NAME) +# else +libc_hidden_builtin_def (MEMSET_NAME) +# endif +# endif +#endif -libc_hidden_def(memset) diff --git a/libc/string/mips/sysdep.h b/libc/string/mips/sysdep.h deleted file mode 100644 index 5dad8342e..000000000 --- a/libc/string/mips/sysdep.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Adapted from glibc's sysdeps/unix/mips/sysdep.h */ - -/* Copyright (C) 1992, 1995, 1997, 1999, 2000, 2002, 2003 - Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Brendan Kehoe (brendan@zen.org). 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#ifdef __ASSEMBLER__ - -#include <sgidefs.h> -#include <sys/regdef.h> - -#define ENTRY(name) \ - .globl name; \ - .align 2; \ - .ent name,0; \ - name/* use a comment rather than ## to workaround bug in gcc-3.4.x */: - -#undef END -#define END(function) \ - .end function; \ - .size function,.-function - -#if _MIPS_SIM == _MIPS_SIM_ABI32 || _MIPS_SIM == _MIPS_SIM_ABIO64 -# define L(label) $L ## label -#else -# define L(label) .L ## label -#endif - -#endif diff --git a/libc/string/powerpc/memcpy.c b/libc/string/powerpc/memcpy.c index f3d800739..22794ec33 100644 --- a/libc/string/powerpc/memcpy.c +++ b/libc/string/powerpc/memcpy.c @@ -21,16 +21,15 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(memcpy) */ -void *memcpy(void *to, const void *from, size_t n) -/* PPC can do pre increment and load/store, but not post increment and load/store. - Therefore use *++ptr instead of *ptr++. */ +/* PPC can do pre increment and load/store, but not post increment and + load/store. Therefore use *++ptr instead of *ptr++. */ +void *memcpy(void *to, const void *from, size_t len) { unsigned long rem, chunks, tmp1, tmp2; unsigned char *tmp_to; unsigned char *tmp_from = (unsigned char *)from; - chunks = n / 8; + chunks = len / 8; tmp_from -= 4; tmp_to = to - 4; if (!chunks) @@ -49,30 +48,33 @@ void *memcpy(void *to, const void *from, size_t n) *(unsigned long *)tmp_to = tmp2; } while (--chunks); lessthan8: - n = n % 8; - if (n >= 4) { - *(unsigned long *)(tmp_to+4) = *(unsigned long *)(tmp_from+4); + len = len % 8; + if (len >= 4) { tmp_from += 4; tmp_to += 4; - n = n-4; + *(unsigned long *)(tmp_to) = *(unsigned long *)(tmp_from); + len -= 4; } - if (!n ) return to; + if (!len) + return to; tmp_from += 3; tmp_to += 3; do { *++tmp_to = *++tmp_from; - } while (--n); + } while (--len); return to; align: + /* ???: Do we really need to generate the carry flag here? 
If not, then: + rem -= 4; */ rem = 4 - rem; - n = n - rem; + len -= rem; do { *(tmp_to+4) = *(tmp_from+4); ++tmp_from; ++tmp_to; } while (--rem); - chunks = n / 8; + chunks = len / 8; if (chunks) goto copy_chunks; goto lessthan8; diff --git a/libc/string/powerpc/memmove.c b/libc/string/powerpc/memmove.c index 8badae37d..6bd79915d 100644 --- a/libc/string/powerpc/memmove.c +++ b/libc/string/powerpc/memmove.c @@ -21,9 +21,7 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(memcpy) */ -/* Experimentally off - libc_hidden_proto(memmove) */ void *memmove(void *to, const void *from, size_t n) { unsigned long rem, chunks, tmp1, tmp2; diff --git a/libc/string/powerpc/memset.c b/libc/string/powerpc/memset.c index 1cbfd04fc..a900b92cb 100644 --- a/libc/string/powerpc/memset.c +++ b/libc/string/powerpc/memset.c @@ -21,7 +21,6 @@ #include <string.h> -/* Experimentally off - libc_hidden_proto(memset) */ static __inline__ int expand_byte_word(int c){ /* this does: diff --git a/libc/string/psignal.c b/libc/string/psignal.c index 1ca8725db..3e1f68b94 100644 --- a/libc/string/psignal.c +++ b/libc/string/psignal.c @@ -10,8 +10,6 @@ #include <string.h> #include <signal.h> -libc_hidden_proto(fprintf) -/* Experimentally off - libc_hidden_proto(strsignal) */ /* TODO: make this threadsafe with a reentrant version of strsignal? */ diff --git a/libc/string/rawmemchr.c b/libc/string/rawmemchr.c index 3cddefa10..f0cb7ee47 100644 --- a/libc/string/rawmemchr.c +++ b/libc/string/rawmemchr.c @@ -8,7 +8,6 @@ #include "_string.h" #ifdef __USE_GNU -/* Experimentally off - libc_hidden_proto(rawmemchr) */ void *rawmemchr(const void *s, int c) { register const unsigned char *r = s; diff --git a/libc/string/sh/memchr.S b/libc/string/sh/memchr.S new file mode 100644 index 000000000..6b7142f69 --- /dev/null +++ b/libc/string/sh/memchr.S @@ -0,0 +1,30 @@ +/* $Id: memchr.S,v 1.1 2000/04/14 16:49:01 mjd Exp $ + * + * "memchr" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +/* + * void *memchr(const void *s, int c, size_t n); + */ + +#include <sysdep.h> + +ENTRY(memchr) + tst r6,r6 + bt/s 2f + exts.b r5,r5 +1: mov.b @r4,r1 + cmp/eq r1,r5 + bt/s 3f + dt r6 + bf/s 1b + add #1,r4 +2: mov #0,r4 +3: rts + mov r4,r0 +END(memchr) +libc_hidden_def (memchr) diff --git a/libc/string/sh/sh4/memcpy.S b/libc/string/sh/sh4/memcpy.S index 2d918293e..6a229a06c 100644 --- a/libc/string/sh/sh4/memcpy.S +++ b/libc/string/sh/sh4/memcpy.S @@ -6,6 +6,9 @@ * Modified from memcpy.S and micro-optimised for SH4 * Stuart Menefy (stuart.menefy@st.com) * + * Copyright (c) 2009 STMicroelectronics Ltd + * Optimised using prefetching and 64bit data transfer via FPU + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> */ /* @@ -15,8 +18,32 @@ * If there is an overlap, then the results are undefined. */ +#include <sysdep.h> #include <endian.h> +#if defined (__LITTLE_ENDIAN__) && defined (__SH_FPU_ANY__) +#define MEMCPY_USES_FPU +/* Use paired single precision load or store mode for 64-bit tranfering. + * FPSCR.SZ=1,FPSCR.SZ=0 is well defined on both SH4-200 and SH4-300. + * Currenlty it has been only implemented and tested for little endian mode. */ +.macro FPU_SET_PAIRED_PREC + sts fpscr, r7 + mov #0x10, r0 ! PR=0 SZ=1 + shll16 r0 + lds r0, fpscr +.endm +.macro RESTORE_FPSCR + lds r7, fpscr +.endm +.macro DALLOC + ! Cache allocate + store on dst-32. + add #-32, r1 + movca.l r0, @r1 + add #32, r1 +.endm + +#endif + ! ! 
GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. ! @@ -127,10 +154,10 @@ mov.l r3,@-r0 ! 30 LS #else -3: mov r1,r3 ! OPQR +3: mov r7,r3 ! OPQR shlr8 r3 ! xOPQ - mov.l @(r0,r5),r1 ! KLMN - mov r1,r6 + mov.l @(r0,r5),r7 ! KLMN + mov r7,r6 shll16 r6 shll8 r6 ! Nxxx or r6,r3 ! NOPQ @@ -157,12 +184,7 @@ 9: rts nop -/* void * memcpy(void *dst, const void *src, size_t len) */ -.text -.align 4 -.type memcpy,@function -.globl memcpy; -memcpy: +ENTRY(memcpy) ! Calculate the invariants which will be used in the remainder ! of the code: @@ -189,9 +211,7 @@ memcpy: mov r4, r0 ! 5 MT (0 cycle latency) add r6, r0 ! 49 EX - mov #16, r1 ! 6 EX bt/s .Lcase00 ! 111 BR (aligned) - sub r4, r5 ! 75 EX ! Arguments are not nicely long word aligned or zero len. @@ -207,6 +227,7 @@ memcpy: ! However the penalty for getting it 'wrong' is much higher for long word ! aligned data (and this is more common), so use a value of 16. + mov #16, r1 ! 6 EX cmp/gt r6,r1 ! 56 MT add #-1,r5 ! 50 EX @@ -447,6 +468,183 @@ memcpy: mov.l r7, @-r0 ! 30 LS +#ifdef MEMCPY_USES_FPU + ! Copy the cache line aligned blocks by using the FPU registers. + ! If src and dst are well aligned adopt 64-bit data transfer. + ! We also need r0 as a temporary (for movca), so 'undo' the invariant: + ! r5: src (was r0+r5) + ! r1: dest (was r0) +1: + add r0, r5 + mov r0, r1 + + mov r1, r3 ! MT + sub r2, r3 ! EX (r3 - r2 -> r3) + mov #-5, r0 + shld r0, r3 ! number of the cache lines + + mov #8, r0 + cmp/ge r0, r3 ! Check if there are many cache lines to copy. + bf 45f ! Copy cache line aligned blocks without pref. + mov r5, r0 + add #-0x7c, r0 + tst #7, r0 ! src is 8byte aligned + bf 45f + + ! Many cache lines have to be copied and the buffers are well aligned. + ! Aggressive prefetching and FPU in single paired precision. + mov r0, r5 + mov r5, r6 + add #-0x80, r6 ! prefetch head + + ! store FPU (in single precision mode, do not check R15 align). + fmov fr12, @-r15 + fmov fr13, @-r15 + fmov fr14, @-r15 + fmov fr15, @-r15 + + FPU_SET_PAIRED_PREC + + mov #4, r0 +67: + add #-0x20, r6 + pref @r6 + add #-0x20, r6 + pref @r6 + + fmov @r5+, dr0 + fmov @r5+, dr2 + fmov @r5+, dr4 + fmov @r5+, dr6 + fmov @r5+, dr8 + fmov @r5+, dr10 + fmov @r5+, dr12 + fmov @r5+, dr14 + fmov @r5+, xd0 + fmov @r5+, xd2 + fmov @r5+, xd4 + fmov @r5+, xd6 + fmov @r5+, xd8 + fmov @r5+, xd10 + fmov @r5+, xd12 + fmov @r5+, xd14 + + DALLOC + fmov xd14, @-r1 + fmov xd12, @-r1 + fmov xd10, @-r1 + fmov xd8, @-r1 + DALLOC + fmov xd6, @-r1 + fmov xd4, @-r1 + fmov xd2, @-r1 + fmov xd0, @-r1 + DALLOC + fmov dr14, @-r1 + fmov dr12, @-r1 + fmov dr10, @-r1 + fmov dr8, @-r1 + DALLOC + fmov dr6, @-r1 + add #-0x80, r5 + fmov dr4, @-r1 + add #-0x80, r5 + fmov dr2, @-r1 + add #-0x20, r6 + fmov dr0, @-r1 + add #-4, r3 + pref @r6 + add #-0x20, r6 + cmp/ge r0, r3 + bt/s 67b + pref @r6 + + RESTORE_FPSCR + + ! Restore FPU callee save registers + fmov @r15+, fr15 + fmov @r15+, fr14 + fmov @r15+, fr13 + fmov @r15+, fr12 + + ! Other cache lines could be copied: so use the FPU in single paired + ! precision without prefetching. No check for alignment is necessary. + + mov #1, r0 + cmp/ge r0, r3 + bt/s 3f + add #0x60, r5 + + bra 5f + nop + + ! No prefetch and FPU in single precision. 
+45: + add #-0x1c, r5 + mov r5, r0 + tst #7, r0 + bt 3f + +2: fmov.s @r5+, fr0 + fmov.s @r5+, fr1 + fmov.s @r5+, fr2 + fmov.s @r5+, fr3 + fmov.s @r5+, fr4 + fmov.s @r5+, fr5 + fmov.s @r5+, fr6 + fmov.s @r5+, fr7 + + DALLOC + + fmov.s fr7, @-r1 + fmov.s fr6, @-r1 + fmov.s fr5, @-r1 + fmov.s fr4, @-r1 + fmov.s fr3, @-r1 + fmov.s fr2, @-r1 + fmov.s fr1, @-r1 + fmov.s fr0, @-r1 + + cmp/eq r2,r1 + + bf/s 2b + add #-0x40, r5 + + bra 5f + nop + + ! No prefetch and FPU in single paired precision. + +3: FPU_SET_PAIRED_PREC + +4: fmov @r5+, dr0 + fmov @r5+, dr2 + fmov @r5+, dr4 + fmov @r5+, dr6 + + DALLOC + + fmov dr6, @-r1 + fmov dr4, @-r1 + fmov dr2, @-r1 + fmov dr0, @-r1 + cmp/eq r2,r1 + + bf/s 4b + add #-0x40, r5 + + RESTORE_FPSCR + +5: mov r1, r0 + + cmp/eq r4, r0 ! 54 MT + bf/s 1f ! 109 BR + sub r1, r5 ! 75 EX + + rts + nop +1: +#else ! Copy the cache line aligned blocks ! ! In use: r0, r2, r4, r5 @@ -512,6 +710,7 @@ memcpy: rts 1: mov.l @r15+, r8 ! 15 LS +#endif sub r4, r1 ! 75 EX (len remaining) ! number of trailing bytes is non-zero @@ -733,30 +932,30 @@ memcpy: mov.l @(0x04,r5), r11 ! 18 LS (latency=2) xtrct r9, r8 ! 48 EX - mov.w @(0x02,r5), r12 ! 18 LS (latency=2) + mov.l @(0x00,r5), r12 ! 18 LS (latency=2) xtrct r10, r9 ! 48 EX movca.l r0,@r1 ! 40 LS (latency=3-7) add #-0x1c, r1 ! 50 EX - mov.l r3, @(0x1c,r1) ! 33 LS + mov.l r3, @(0x18,r1) ! 33 LS xtrct r11, r10 ! 48 EX - mov.l r6, @(0x18,r1) ! 33 LS + mov.l r6, @(0x14,r1) ! 33 LS xtrct r12, r11 ! 48 EX - mov.l r7, @(0x14,r1) ! 33 LS + mov.l r7, @(0x10,r1) ! 33 LS - mov.l r8, @(0x10,r1) ! 33 LS - add #-0x3e, r5 ! 50 EX + mov.l r8, @(0x0c,r1) ! 33 LS + add #-0x1e, r5 ! 50 EX - mov.l r9, @(0x0c,r1) ! 33 LS + mov.l r9, @(0x08,r1) ! 33 LS cmp/eq r2,r1 ! 54 MT - mov.l r10, @(0x08,r1) ! 33 LS + mov.l r10, @(0x04,r1) ! 33 LS bf/s 2b ! 109 BR - mov.l r11, @(0x04,r1) ! 33 LS + mov.l r11, @(0x00,r1) ! 33 LS #endif mov.l @r15+, r12 @@ -803,6 +1002,5 @@ memcpy: rts mov.b r1,@-r0 -.size memcpy,.-memcpy; - +END(memcpy) libc_hidden_def (memcpy) diff --git a/libc/string/sh/sh4/memmove.c b/libc/string/sh/sh4/memmove.c new file mode 100644 index 000000000..8059bd4cc --- /dev/null +++ b/libc/string/sh/sh4/memmove.c @@ -0,0 +1,121 @@ +/* memmove implementation for SH4 + * + * Copyright (C) 2009 STMicroelectronics Ltd. + * + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +#ifndef __SH_FPU_ANY__ +#include "../../generic/memmove.c" +#else + +#include <string.h> + +#define FPSCR_SR (1 << 20) +#define STORE_FPSCR(x) __asm__ __volatile__("sts fpscr, %0" : "=r"(x)) +#define LOAD_FPSCR(x) __asm__ __volatile__("lds %0, fpscr" : : "r"(x)) + +static void fpu_optimised_copy_fwd(void *dest, const void *src, size_t len) +{ + char *d = (char *)dest; + char *s = (char *)src; + + if (len >= 64) { + unsigned long fpscr; + int *s1; + int *d1; + + /* Align the dest to 4 byte boundary. 
*/ + while ((unsigned)d & 0x7) { + *d++ = *s++; + len--; + } + + s1 = (int *)s; + d1 = (int *)d; + + /* check if s is well aligned to use FPU */ + if (!((unsigned)s1 & 0x7)) { + + /* Align the dest to cache-line boundary */ + while ((unsigned)d1 & 0x1c) { + *d1++ = *s1++; + len -= 4; + } + + /* Use paired single precision load or store mode for + * 64-bit tranfering.*/ + STORE_FPSCR(fpscr); + LOAD_FPSCR(FPSCR_SR); + + while (len >= 32) { + __asm__ __volatile__ ("fmov @%0+,dr0":"+r" (s1)); + __asm__ __volatile__ ("fmov @%0+,dr2":"+r" (s1)); + __asm__ __volatile__ ("fmov @%0+,dr4":"+r" (s1)); + __asm__ __volatile__ ("fmov @%0+,dr6":"+r" (s1)); + __asm__ + __volatile__ ("fmov dr0,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + __volatile__ ("fmov dr2,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + __volatile__ ("fmov dr4,@%0"::"r" + (d1):"memory"); + d1 += 2; + __asm__ + __volatile__ ("fmov dr6,@%0"::"r" + (d1):"memory"); + d1 += 2; + len -= 32; + } + LOAD_FPSCR(fpscr); + } + s = (char *)s1; + d = (char *)d1; + /*TODO: other subcases could be covered here?!?*/ + } + /* Go to per-byte copy */ + while (len > 0) { + *d++ = *s++; + len--; + } + return; +} + +void *memmove(void *dest, const void *src, size_t len) +{ + unsigned long int d = (long int)dest; + unsigned long int s = (long int)src; + unsigned long int res; + + if (d >= s) + res = d - s; + else + res = s - d; + /* + * 1) dest and src are not overlap ==> memcpy (BWD/FDW) + * 2) dest and src are 100% overlap ==> memcpy (BWD/FDW) + * 3) left-to-right overlap ==> Copy from the beginning to the end + * 4) right-to-left overlap ==> Copy from the end to the beginning + */ + + if (res == 0) /* 100% overlap */ + memcpy(dest, src, len); /* No overlap */ + else if (res >= len) + memcpy(dest, src, len); + else { + if (d > s) /* right-to-left overlap */ + memcpy(dest, src, len); /* memcpy is BWD */ + else /* cannot use SH4 memcpy for this case */ + fpu_optimised_copy_fwd(dest, src, len); + } + return (dest); +} + +libc_hidden_def(memmove) +#endif /*__SH_FPU_ANY__ */ diff --git a/libc/string/sh/sh4/memset.S b/libc/string/sh/sh4/memset.S new file mode 100644 index 000000000..eb83355ce --- /dev/null +++ b/libc/string/sh/sh4/memset.S @@ -0,0 +1,152 @@ +/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $ + * + * "memset" implementation of SuperH + * + * Copyright (C) 1999 Niibe Yutaka + * + * Copyright (c) 2009 STMicroelectronics Ltd + * Optimised using 64bit data transfer (via FPU) and the movca.l inst. + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +/* + * void *memset(void *s, int c, size_t n); + */ + +#include <sysdep.h> + +#if defined (__LITTLE_ENDIAN__) && defined (__SH_FPU_ANY__) +#define MEMSET_USES_FPU +/* Use paired single precision load or store mode for 64-bit tranfering. + * FPSCR.SZ=1,FPSCR.SZ=0 is well defined on both SH4-200 and SH4-300. + * Currenlty it has been only implemented and tested for little endian mode. */ +.macro FPU_SET_PAIRED_PREC + sts fpscr, r3 + mov #0x10, r1 ! PR=0 SZ=1 + shll16 r1 + lds r1, fpscr +.endm +.macro RESTORE_FPSCR + lds r3, fpscr +.endm +#endif + +ENTRY(memset) + mov #12,r0 + add r6,r4 + cmp/gt r6,r0 + bt/s 40f ! if it's too small, set a byte at once + mov r4,r0 + and #3,r0 + cmp/eq #0,r0 + bt/s 2f ! It's aligned + sub r0,r6 +1: + dt r0 + bf/s 1b + mov.b r5,@-r4 +2: ! make VVVV + extu.b r5,r5 + swap.b r5,r0 ! V0 + or r0,r5 ! VV + swap.w r5,r0 ! VV00 + or r0,r5 ! VVVV + + ! 
Check if enough bytes need to be copied to be worth the big loop + mov #0x40, r0 ! (MT) + cmp/gt r6,r0 ! (MT) 64 > len => slow loop + + bt/s 22f + mov r6,r0 + + ! align the dst to the cache block size if necessary + mov r4, r3 + mov #~(0x1f), r1 + + and r3, r1 + cmp/eq r3, r1 + + bt/s 11f ! dst is already aligned + sub r1, r3 ! r3-r1 -> r3 + shlr2 r3 ! number of loops + +10: mov.l r5,@-r4 + dt r3 + bf/s 10b + add #-4, r6 + +11: ! dst is 32byte aligned + mov r6,r2 + mov #-5,r0 + shld r0,r2 ! number of loops + + add #-32, r4 + mov r5, r0 + +#ifdef MEMSET_USES_FPU + lds r5, fpul ! (CO) + fsts fpul, fr0 ! Dr0 will be 'VVVVVVVV' + fsts fpul, fr1 + + FPU_SET_PAIRED_PREC +12: + movca.l r0, @r4 + mov.l r5, @(4, r4) + add #32, r4 + fmov dr0, @-r4 + fmov dr0, @-r4 + add #-0x20, r6 + fmov dr0, @-r4 + dt r2 + bf/s 12b + add #-40, r4 + + RESTORE_FPSCR +#else +12: + movca.l r0,@r4 + mov.l r5,@(4, r4) + mov.l r5,@(8, r4) + mov.l r5,@(12,r4) + mov.l r5,@(16,r4) + mov.l r5,@(20,r4) + add #-0x20, r6 + mov.l r5,@(24,r4) + dt r2 + mov.l r5,@(28,r4) + bf/s 12b + add #-32, r4 + +#endif + add #32, r4 + mov #8, r0 + cmp/ge r0, r6 + bf 40f + + mov r6,r0 +22: + shlr2 r0 + shlr r0 ! r0 = r6 >> 3 +3: + dt r0 + mov.l r5,@-r4 ! set 8-byte at once + bf/s 3b + mov.l r5,@-r4 + ! + mov #7,r0 + and r0,r6 + + ! fill bytes (length may be zero) +40: tst r6,r6 + bt 5f +4: + dt r6 + bf/s 4b + mov.b r5,@-r4 +5: + rts + mov r4,r0 +END(memset) +libc_hidden_def (memset) diff --git a/libc/string/sh/sh4/strcpy.S b/libc/string/sh/sh4/strcpy.S new file mode 100644 index 000000000..0f8278017 --- /dev/null +++ b/libc/string/sh/sh4/strcpy.S @@ -0,0 +1,28 @@ +/* strcpy implementation for SUPERH + * + * Copyright (C) 2009 STMicroelectronics Ltd. + * + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +/* + char *strcpy(char *dest, const char *src); + */ + +#include <sysdep.h> + +ENTRY(strcpy) + mov r4,r2 +1: + mov.b @r5+,r1 + tst r1,r1 + mov.b r1,@r2 + bf/s 1b + add #1,r2 + + rts + mov r4,r0 +END(strcpy) +libc_hidden_def (strcpy) diff --git a/libc/string/sh/sh4/strncpy.S b/libc/string/sh/sh4/strncpy.S new file mode 100644 index 000000000..8a16f39d4 --- /dev/null +++ b/libc/string/sh/sh4/strncpy.S @@ -0,0 +1,43 @@ +/* strncpy implementation for SUPERH + * + * Copyright (C) 2009 STMicroelectronics Ltd. + * + * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com> + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. + */ + +/* + char *strncpy(char *dest, const char *src, size_t n); + */ + +#include <sysdep.h> + +ENTRY(strncpy) + mov #0,r0 + bra 2f + mov r4,r2 +1: + mov.b r1,@(r0,r2) + add #1,r0 +2: + cmp/hs r6,r0 + bt 5f + mov.b @(r0,r5),r1 + tst r1,r1 + bf/s 1b + cmp/hs r6,r0 + bra 4f + nop +3: + mov.b r1,@(r0,r2) + add #1,r0 + cmp/hs r6,r0 +4: + bf/s 3b + mov #0,r1 +5: + rts + mov r2,r0 +END(strncpy) +libc_hidden_def(strncpy) diff --git a/libc/string/sh/strlen.S b/libc/string/sh/strlen.S new file mode 100644 index 000000000..1ccecc17b --- /dev/null +++ b/libc/string/sh/strlen.S @@ -0,0 +1,75 @@ +/* $Id: strlen.S,v 1.2 2001/06/29 14:07:15 gniibe Exp $ + * + * "strlen" implementation of SuperH + * + * Copyright (C) 1999 Kaz Kojima + * + * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. 
+ */ + +/* size_t strlen (const char *s) */ + +#include <sysdep.h> +#include <endian.h> + +ENTRY(strlen) + mov r4,r0 + and #3,r0 + tst r0,r0 + bt/s 1f + mov #0,r2 + + add #-1,r0 + shll2 r0 + shll r0 + braf r0 + nop + + mov.b @r4+,r1 + tst r1,r1 + bt 8f + add #1,r2 + + mov.b @r4+,r1 + tst r1,r1 + bt 8f + add #1,r2 + + mov.b @r4+,r1 + tst r1,r1 + bt 8f + add #1,r2 + +1: + mov #0,r3 +2: + mov.l @r4+,r1 + cmp/str r3,r1 + bf/s 2b + add #4,r2 + + add #-4,r2 +#ifndef __LITTLE_ENDIAN__ + swap.b r1,r1 + swap.w r1,r1 + swap.b r1,r1 +#endif + extu.b r1,r0 + tst r0,r0 + bt/s 8f + shlr8 r1 + add #1,r2 + extu.b r1,r0 + tst r0,r0 + bt/s 8f + shlr8 r1 + add #1,r2 + extu.b r1,r0 + tst r0,r0 + bt 8f + add #1,r2 +8: + rts + mov r2,r0 +END(strlen) +libc_hidden_def (strlen) diff --git a/libc/string/sh64/memcpy.S b/libc/string/sh64/memcpy.S deleted file mode 100644 index 3c0ea0c0d..000000000 --- a/libc/string/sh64/memcpy.S +++ /dev/null @@ -1,205 +0,0 @@ -/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ -/* Modified by SuperH, Inc. September 2003 */ -! -! Fast SH memcpy -! -! by Toshiyasu Morita (tm@netcom.com) -! hacked by J"orn Rernnecke (joern.rennecke@superh.com) ("o for o-umlaut) -! SH5 code Copyright 2002 SuperH Ltd. -! -! Entry: ARG0: destination pointer -! ARG1: source pointer -! ARG2: byte count -! -! Exit: RESULT: destination pointer -! any other registers in the range r0-r7: trashed -! -! Notes: Usually one wants to do small reads and write a longword, but -! unfortunately it is difficult in some cases to concatanate bytes -! into a longword on the SH, so this does a longword read and small -! writes. -! -! This implementation makes two assumptions about how it is called: -! -! 1.: If the byte count is nonzero, the address of the last byte to be -! copied is unsigned greater than the address of the first byte to -! be copied. This could be easily swapped for a signed comparison, -! but the algorithm used needs some comparison. -! -! 2.: When there are two or three bytes in the last word of an 11-or-more -! bytes memory chunk to b copied, the rest of the word can be read -! without side effects. -! This could be easily changed by increasing the minumum size of -! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2, -! however, this would cost a few extra cyles on average. -! For SHmedia, the assumption is that any quadword can be read in its -! enirety if at least one byte is included in the copy. -! - -#include <features.h> - - .section .text..SHmedia32,"ax" - .globl memcpy - .type memcpy, @function - .align 5 - -memcpy: - -#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1 -#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1 -#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1 -#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1 - - ld.b r3,0,r63 - pta/l Large,tr0 - movi 25,r0 - bgeu/u r4,r0,tr0 - nsb r4,r0 - shlli r0,5,r0 - movi (L1-L0+63*32 + 1) & 0xffff,r1 - sub r1, r0, r0 -L0: ptrel r0,tr0 - add r2,r4,r5 - ptabs r18,tr1 - add r3,r4,r6 - blink tr0,r63 - -/* Rearranged to make cut2 safe */ - .balign 8 -L4_7: /* 4..7 byte memcpy cntd. */ - stlo.l r2, 0, r0 - or r6, r7, r6 - sthi.l r5, -1, r6 - stlo.l r5, -4, r6 - blink tr1,r63 - - .balign 8 -L1: /* 0 byte memcpy */ - nop - blink tr1,r63 - nop - nop - nop - nop - -L2_3: /* 2 or 3 byte memcpy cntd. */ - st.b r5,-1,r6 - blink tr1,r63 - - /* 1 byte memcpy */ - ld.b r3,0,r0 - st.b r2,0,r0 - blink tr1,r63 - -L8_15: /* 8..15 byte memcpy cntd. 
*/ - stlo.q r2, 0, r0 - or r6, r7, r6 - sthi.q r5, -1, r6 - stlo.q r5, -8, r6 - blink tr1,r63 - - /* 2 or 3 byte memcpy */ - ld.b r3,0,r0 - ld.b r2,0,r63 - ld.b r3,1,r1 - st.b r2,0,r0 - pta/l L2_3,tr0 - ld.b r6,-1,r6 - st.b r2,1,r1 - blink tr0, r63 - - /* 4 .. 7 byte memcpy */ - LDUAL (r3, 0, r0, r1) - pta L4_7, tr0 - ldlo.l r6, -4, r7 - or r0, r1, r0 - sthi.l r2, 3, r0 - ldhi.l r6, -1, r6 - blink tr0, r63 - - /* 8 .. 15 byte memcpy */ - LDUAQ (r3, 0, r0, r1) - pta L8_15, tr0 - ldlo.q r6, -8, r7 - or r0, r1, r0 - sthi.q r2, 7, r0 - ldhi.q r6, -1, r6 - blink tr0, r63 - - /* 16 .. 24 byte memcpy */ - LDUAQ (r3, 0, r0, r1) - LDUAQ (r3, 8, r8, r9) - or r0, r1, r0 - sthi.q r2, 7, r0 - or r8, r9, r8 - sthi.q r2, 15, r8 - ldlo.q r6, -8, r7 - ldhi.q r6, -1, r6 - stlo.q r2, 8, r8 - stlo.q r2, 0, r0 - or r6, r7, r6 - sthi.q r5, -1, r6 - stlo.q r5, -8, r6 - blink tr1,r63 - -Large: - ld.b r2, 0, r63 - pta/l Loop_ua, tr1 - ori r3, -8, r7 - sub r2, r7, r22 - sub r3, r2, r6 - add r2, r4, r5 - ldlo.q r3, 0, r0 - addi r5, -16, r5 - movi 64+8, r27 // could subtract r7 from that. - stlo.q r2, 0, r0 - sthi.q r2, 7, r0 - ldx.q r22, r6, r0 - bgtu/l r27, r4, tr1 - - addi r5, -48, r27 - pta/l Loop_line, tr0 - addi r6, 64, r36 - addi r6, -24, r19 - addi r6, -16, r20 - addi r6, -8, r21 - -Loop_line: - ldx.q r22, r36, r63 - alloco r22, 32 - addi r22, 32, r22 - ldx.q r22, r19, r23 - sthi.q r22, -25, r0 - ldx.q r22, r20, r24 - ldx.q r22, r21, r25 - stlo.q r22, -32, r0 - ldx.q r22, r6, r0 - sthi.q r22, -17, r23 - sthi.q r22, -9, r24 - sthi.q r22, -1, r25 - stlo.q r22, -24, r23 - stlo.q r22, -16, r24 - stlo.q r22, -8, r25 - bgeu r27, r22, tr0 - -Loop_ua: - addi r22, 8, r22 - sthi.q r22, -1, r0 - stlo.q r22, -8, r0 - ldx.q r22, r6, r0 - bgtu/l r5, r22, tr1 - - add r3, r4, r7 - ldlo.q r7, -8, r1 - sthi.q r22, 7, r0 - ldhi.q r7, -1, r7 - ptabs r18,tr1 - stlo.q r22, 0, r0 - or r1, r7, r1 - sthi.q r5, 15, r1 - stlo.q r5, 8, r1 - blink tr1, r63 - - .size memcpy,.-memcpy - -libc_hidden_def(memcpy) diff --git a/libc/string/sh64/memset.S b/libc/string/sh64/memset.S deleted file mode 100644 index f588323f0..000000000 --- a/libc/string/sh64/memset.S +++ /dev/null @@ -1,96 +0,0 @@ -/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ -/* Modified by SuperH, Inc. September 2003 */ -! -! Fast SH memset -! -! by Toshiyasu Morita (tm@netcom.com) -! -! SH5 code by J"orn Rennecke (joern.rennecke@superh.com) -! Copyright 2002 SuperH Ltd. -! 
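Note: the LDUAQ/LDUAL macros in the SHmedia memcpy being removed above synthesize an unaligned 64-/32-bit load from a pair of aligned accesses (ldlo.q/ldhi.q) whose results are then combined. A rough, portable C sketch of the same idea for little-endian byte order (illustrative only; the function name is invented, and the real ldlo/ldhi instructions deposit bytes directly into the right lanes instead of shifting):

	#include <stdint.h>

	/* Fetch 64 bits from a possibly misaligned address using two aligned
	 * loads.  As in the SHmedia code's stated assumption, the containing
	 * quadwords are assumed readable in their entirety; strict-aliasing
	 * caveats are ignored for brevity. */
	static uint64_t load64_unaligned(const unsigned char *p)
	{
		uintptr_t a = (uintptr_t)p;
		unsigned int shift = (unsigned int)(a & 7) * 8;
		const uint64_t *q = (const uint64_t *)(a & ~(uintptr_t)7);

		if (shift == 0)
			return q[0];            /* already aligned: one load suffices */
		return (q[0] >> shift) | (q[1] << (64 - shift));
	}
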
- -#include <features.h> -#include <endian.h> - -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define SHHI shlld -#define SHLO shlrd -#else -#define SHHI shlrd -#define SHLO shlld -#endif - - .section .text..SHmedia32,"ax" - .globl memset - .type memset, @function - - .align 5 - -memset: - pta/l multiquad, tr0 - andi r2, 7, r22 - ptabs r18, tr2 - mshflo.b r3,r3,r3 - add r4, r22, r23 - mperm.w r3, r63, r3 // Fill pattern now in every byte of r3 - - movi 8, r9 - bgtu/u r23, r9, tr0 // multiquad - - beqi/u r4, 0, tr2 // Return with size 0 - ensures no mem accesses - ldlo.q r2, 0, r7 - shlli r4, 2, r4 - movi -1, r8 - SHHI r8, r4, r8 - SHHI r8, r4, r8 - mcmv r7, r8, r3 - stlo.q r2, 0, r3 - blink tr2, r63 - -multiquad: - pta/l lastquad, tr0 - stlo.q r2, 0, r3 - shlri r23, 3, r24 - add r2, r4, r5 - beqi/u r24, 1, tr0 // lastquad - pta/l loop, tr1 - sub r2, r22, r25 - andi r5, -8, r20 // calculate end address and - addi r20, -7*8, r8 // loop end address; This might overflow, so we need - // to use a different test before we start the loop - bge/u r24, r9, tr1 // loop - st.q r25, 8, r3 - st.q r20, -8, r3 - shlri r24, 1, r24 - beqi/u r24, 1, tr0 // lastquad - st.q r25, 16, r3 - st.q r20, -16, r3 - beqi/u r24, 2, tr0 // lastquad - st.q r25, 24, r3 - st.q r20, -24, r3 -lastquad: - sthi.q r5, -1, r3 - blink tr2,r63 - -loop: -!!! alloco r25, 32 // QQQ comment out for short-term fix to SHUK #3895. - // QQQ commenting out is locically correct, but sub-optimal - // QQQ Sean McGoogan - 4th April 2003. - st.q r25, 8, r3 - st.q r25, 16, r3 - st.q r25, 24, r3 - st.q r25, 32, r3 - addi r25, 32, r25 - bgeu/l r8, r25, tr1 // loop - - st.q r20, -40, r3 - st.q r20, -32, r3 - st.q r20, -24, r3 - st.q r20, -16, r3 - st.q r20, -8, r3 - sthi.q r5, -1, r3 - blink tr2,r63 - - .size memset,.-memset - -libc_hidden_def(memset) diff --git a/libc/string/sh64/strcpy.S b/libc/string/sh64/strcpy.S deleted file mode 100644 index da79d5143..000000000 --- a/libc/string/sh64/strcpy.S +++ /dev/null @@ -1,102 +0,0 @@ -/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */ -/* Modified by SuperH, Inc. September 2003 */ -! Entry: arg0: destination -! arg1: source -! Exit: result: destination -! -! SH5 code Copyright 2002 SuperH Ltd. - -#include <features.h> -#include <endian.h> - -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define SHHI shlld -#define SHLO shlrd -#else -#define SHHI shlrd -#define SHLO shlld -#endif - - .section .text..SHmedia32,"ax" - .globl strcpy - .type strcpy, @function - .align 5 - -strcpy: - - pta/l shortstring,tr1 - ldlo.q r3,0,r4 - ptabs r18,tr4 - shlli r3,3,r7 - addi r2, 8, r0 - mcmpeq.b r4,r63,r6 - SHHI r6,r7,r6 - bnei/u r6,0,tr1 // shortstring - pta/l no_lddst, tr2 - ori r3,-8,r23 - sub r2, r23, r0 - sub r3, r2, r21 - addi r21, 8, r20 - ldx.q r0, r21, r5 - pta/l loop, tr0 - ori r2,-8,r22 - mcmpeq.b r5, r63, r6 - bgt/u r22, r23, tr2 // no_lddst - - // r22 < r23 : Need to do a load from the destination. - // r22 == r23 : Doesn't actually need to load from destination, - // but still can be handled here. - ldlo.q r2, 0, r9 - movi -1, r8 - SHLO r8, r7, r8 - mcmv r4, r8, r9 - stlo.q r2, 0, r9 - beqi/l r6, 0, tr0 // loop - - add r5, r63, r4 - addi r0, 8, r0 - blink tr1, r63 // shortstring -no_lddst: - // r22 > r23: note that for r22 == r23 the sthi.q would clobber - // bytes before the destination region. 
- stlo.q r2, 0, r4 - SHHI r4, r7, r4 - sthi.q r0, -1, r4 - beqi/l r6, 0, tr0 // loop - - add r5, r63, r4 - addi r0, 8, r0 -shortstring: -#if __BYTE_ORDER != __LITTLE_ENDIAN - pta/l shortstring2,tr1 - byterev r4,r4 -#endif -shortstring2: - st.b r0,-8,r4 - andi r4,0xff,r5 - shlri r4,8,r4 - addi r0,1,r0 - bnei/l r5,0,tr1 - blink tr4,r63 // return - - .balign 8 -loop: - stlo.q r0, 0, r5 - ldx.q r0, r20, r4 - addi r0, 16, r0 - sthi.q r0, -9, r5 - mcmpeq.b r4, r63, r6 - bnei/u r6, 0, tr1 // shortstring - ldx.q r0, r21, r5 - stlo.q r0, -8, r4 - sthi.q r0, -1, r4 - mcmpeq.b r5, r63, r6 - beqi/l r6, 0, tr0 // loop - - add r5, r63, r4 - addi r0, 8, r0 - blink tr1, r63 // shortstring - - .size strcpy,.-strcpy - -libc_hidden_def(strcpy) diff --git a/libc/string/sh64/strlen.S b/libc/string/sh64/strlen.S deleted file mode 100644 index 18f4164ff..000000000 --- a/libc/string/sh64/strlen.S +++ /dev/null @@ -1,63 +0,0 @@ -/* vi: set sw=8 ts=8: */ -/* - * libc/string/sh64/strlen.S - * - * Simplistic strlen() implementation for SHmedia. - * - * Copyright (C) 2003 Paul Mundt <lethal@linux-sh.org> - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. The name of the above contributors may not be - * used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <features.h> - - .section .text..SHmedia32,"ax" - .globl strlen - .type strlen,@function - - .balign 16 -strlen: - ptabs r18, tr4 - - /* - * Note: We could easily deal with the NULL case here with a simple - * sanity check, though it seems that the behavior we want is to fault - * in the event that r2 == NULL, so we don't bother. - */ -/* beqi r2, 0, tr4 */ ! Sanity check - - movi -1, r0 - pta/l loop, tr0 -loop: - ld.b r2, 0, r1 - addi r2, 1, r2 - addi r0, 1, r0 - bnei/l r1, 0, tr0 - - or r0, r63, r2 - blink tr4, r63 - - .size strlen,.-strlen - -libc_hidden_def(strlen) diff --git a/libc/string/sparc/sparc32/memchr.S b/libc/string/sparc/sparc32/memchr.S index 4d57a553b..1949db2e5 100644 --- a/libc/string/sparc/sparc32/memchr.S +++ b/libc/string/sparc/sparc32/memchr.S @@ -24,9 +24,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ .text .align 4 @@ -139,6 +138,4 @@ ENTRY(memchr) END(memchr) libc_hidden_def(memchr) -#if !__BOUNDED_POINTERS__ weak_alias(memchr,__ubp_memchr) -#endif diff --git a/libc/string/sparc/sparc32/memcpy.S b/libc/string/sparc/sparc32/memcpy.S index 25a48844d..2fb87bb17 100644 --- a/libc/string/sparc/sparc32/memcpy.S +++ b/libc/string/sparc/sparc32/memcpy.S @@ -17,9 +17,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <features.h> diff --git a/libc/string/sparc/sparc32/memset.S b/libc/string/sparc/sparc32/memset.S index 6c6424cf8..6d02fc1a8 100644 --- a/libc/string/sparc/sparc32/memset.S +++ b/libc/string/sparc/sparc32/memset.S @@ -16,9 +16,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include <features.h> diff --git a/libc/string/sparc/sparc32/stpcpy.S b/libc/string/sparc/sparc32/stpcpy.S index daf116eb1..2984ea156 100644 --- a/libc/string/sparc/sparc32/stpcpy.S +++ b/libc/string/sparc/sparc32/stpcpy.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Normally, this uses ((xword - 0x01010101) & 0x80808080) test to find out if any byte in xword could be zero. This is fast, but diff --git a/libc/string/sparc/sparc32/strcat.S b/libc/string/sparc/sparc32/strcat.S index eda029a16..e968a18a3 100644 --- a/libc/string/sparc/sparc32/strcat.S +++ b/libc/string/sparc/sparc32/strcat.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Normally, this uses ((xword - 0x01010101) & 0x80808080) test to find out if any byte in xword could be zero. This is fast, but diff --git a/libc/string/sparc/sparc32/strchr.S b/libc/string/sparc/sparc32/strchr.S index 16710d4e8..fabc3e7e5 100644 --- a/libc/string/sparc/sparc32/strchr.S +++ b/libc/string/sparc/sparc32/strchr.S @@ -16,9 +16,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Normally, this uses ((xword - 0x01010101) & 0x80808080) test to find out if any byte in xword could be zero. 
This is fast, but diff --git a/libc/string/sparc/sparc32/strcmp.S b/libc/string/sparc/sparc32/strcmp.S index d43883de6..07284cd18 100644 --- a/libc/string/sparc/sparc32/strcmp.S +++ b/libc/string/sparc/sparc32/strcmp.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Normally, this uses ((xword - 0x01010101) & 0x80808080) test to find out if any byte in xword could be zero. This is fast, but diff --git a/libc/string/sparc/sparc32/strcpy.S b/libc/string/sparc/sparc32/strcpy.S index 4d7742ebc..3287546f3 100644 --- a/libc/string/sparc/sparc32/strcpy.S +++ b/libc/string/sparc/sparc32/strcpy.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Normally, this uses ((xword - 0x01010101) & 0x80808080) test to find out if any byte in xword could be zero. This is fast, but diff --git a/libc/string/sparc/sparc32/strlen.S b/libc/string/sparc/sparc32/strlen.S index 4edfe7e78..66c790cb6 100644 --- a/libc/string/sparc/sparc32/strlen.S +++ b/libc/string/sparc/sparc32/strlen.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ /* Normally, this uses ((xword - 0x01010101) & 0x80808080) test to find out if any byte in xword could be zero. This is fast, but diff --git a/libc/string/sparc/sparc64/memchr.S b/libc/string/sparc/sparc64/memchr.S deleted file mode 100644 index 6096cc218..000000000 --- a/libc/string/sparc/sparc64/memchr.S +++ /dev/null @@ -1,261 +0,0 @@ -/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less - than N. - For SPARC v9. - Copyright (C) 1998, 1999, 2000, 2003 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz> and - Jakub Jelinek <jj@ultra.linux.cz>. - This version is developed using the same algorithm as the fast C - version which carries the following introduction: - Based on strlen implementation by Torbjorn Granlund (tege@sics.se), - with help from Dan Sahlin (dan@sics.se) and - commentary by Jim Blandy (jimb@ai.mit.edu); - adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu), - and implemented by Roland McGrath (roland@ai.mit.edu). - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <asm/asi.h> -#ifndef XCC -#define XCC xcc -#define USE_BPR - .register %g2, #scratch - .register %g3, #scratch -#endif - - /* Normally, this uses - ((xword - 0x0101010101010101) & 0x8080808080808080) test - to find out if any byte in xword could be zero. This is fast, but - also gives false alarm for any byte in range 0x81-0xff. It does - not matter for correctness, as if this test tells us there could - be some zero byte, we check it byte by byte, but if bytes with - high bits set are common in the strings, then this will give poor - performance. You can #define EIGHTBIT_NOT_RARE and the algorithm - will use one tick slower, but more precise test - ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080), - which does not give any false alarms (but if some bits are set, - one cannot assume from it which bytes are zero and which are not). - It is yet to be measured, what is the correct default for glibc - in these days for an average user. - */ - - .text - .align 32 -ENTRY(memchr) - and %o1, 0xff, %o1 /* IEU0 Group */ -#ifdef USE_BPR - brz,pn %o2, 12f /* CTI+IEU1 */ -#else - tst %o2 /* IEU1 */ - be,pn %XCC, 12f /* CTI */ -#endif - sll %o1, 8, %g3 /* IEU0 Group */ - add %o0, %o2, %o2 /* IEU1 */ - - sethi %hi(0x01010101), %g1 /* IEU0 Group */ - or %g3, %o1, %g3 /* IEU1 */ - ldub [%o0], %o3 /* Load */ - sllx %g3, 16, %g5 /* IEU0 Group */ - - or %g1, %lo(0x01010101), %g1 /* IEU1 */ - sllx %g1, 32, %g2 /* IEU0 Group */ - or %g3, %g5, %g3 /* IEU1 */ - sllx %g3, 32, %g5 /* IEU0 Group */ - - cmp %o3, %o1 /* IEU1 */ - be,pn %xcc, 13f /* CTI */ - or %g1, %g2, %g1 /* IEU0 Group */ - andcc %o0, 7, %g0 /* IEU1 */ - - bne,a,pn %icc, 21f /* CTI */ - add %o0, 1, %o0 /* IEU0 Group */ - ldx [%o0], %o3 /* Load Group */ - sllx %g1, 7, %g2 /* IEU0 */ - - or %g3, %g5, %g3 /* IEU1 */ -1: add %o0, 8, %o0 /* IEU0 Group */ - xor %o3, %g3, %o4 /* IEU1 */ - /* %g1 = 0101010101010101 * - * %g2 = 8080088080808080 * - * %g3 = c c c c c c c c * - * %o3 = value * - * %o4 = value XOR c */ -2: cmp %o0, %o2 /* IEU1 Group */ - - bg,pn %XCC, 11f /* CTI */ - ldxa [%o0] ASI_PNF, %o3 /* Load */ - sub %o4, %g1, %o5 /* IEU0 Group */ - add %o0, 8, %o0 /* IEU1 */ -#ifdef EIGHTBIT_NOT_RARE - andn %o5, %o4, %o5 /* IEU0 Group */ -#endif - - andcc %o5, %g2, %g0 /* IEU1 Group */ - be,a,pt %xcc, 2b /* CTI */ - xor %o3, %g3, %o4 /* IEU0 */ - srlx %o4, 56, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 3f /* CTI */ - srlx %o4, 48, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 4f /* CTI */ - srlx %o4, 40, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI */ - - srlx %o4, 32, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 6f /* CTI */ - srlx %o4, 24, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 7f /* CTI */ - srlx %o4, 16, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 8f /* CTI */ - srlx %o4, 8, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 9f /* CTI */ - - andcc %o4, 0xff, %g0 /* IEU1 Group */ - bne,pt %icc, 2b /* CTI */ - xor %o3, %g3, %o4 /* IEU0 */ - retl /* CTI+IEU1 Group */ - - add %o0, -9, %o0 /* IEU0 */ - - .align 16 -3: retl /* CTI+IEU1 Group */ - add %o0, -16, %o0 
/* IEU0 */ -4: retl /* CTI+IEU1 Group */ - add %o0, -15, %o0 /* IEU0 */ - -5: retl /* CTI+IEU1 Group */ - add %o0, -14, %o0 /* IEU0 */ -6: retl /* CTI+IEU1 Group */ - add %o0, -13, %o0 /* IEU0 */ - -7: retl /* CTI+IEU1 Group */ - add %o0, -12, %o0 /* IEU0 */ -8: retl /* CTI+IEU1 Group */ - add %o0, -11, %o0 /* IEU0 */ - -9: retl /* CTI+IEU1 Group */ - add %o0, -10, %o0 /* IEU0 */ -11: sub %o4, %g1, %o5 /* IEU0 Group */ - sub %o0, 8, %o0 /* IEU1 */ - - andcc %o5, %g2, %g0 /* IEU1 Group */ - be,pt %xcc, 12f /* CTI */ - sub %o2, %o0, %o2 /* IEU0 */ - tst %o2 /* IEU1 Group */ - - be,pn %XCC, 12f /* CTI */ - srlx %o4, 56, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 13f /* CTI */ - - cmp %o2, 1 /* IEU0 */ - be,pn %XCC, 12f /* CTI Group */ - srlx %o4, 48, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 14f /* CTI */ - cmp %o2, 2 /* IEU1 Group */ - be,pn %XCC, 12f /* CTI */ - srlx %o4, 40, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 15f /* CTI */ - cmp %o2, 3 /* IEU1 Group */ - be,pn %XCC, 12f /* CTI */ - - srlx %o4, 32, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 16f /* CTI */ - cmp %o2, 4 /* IEU1 Group */ - - be,pn %XCC, 12f /* CTI */ - srlx %o4, 24, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 17f /* CTI */ - - cmp %o2, 5 /* IEU1 Group */ - be,pn %XCC, 12f /* CTI */ - srlx %o4, 16, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 18f /* CTI */ - cmp %o2, 6 /* IEU1 Group */ - be,pn %XCC, 12f /* CTI */ - srlx %o4, 8, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 19f /* CTI */ - nop /* IEU0 */ -12: retl /* CTI+IEU1 Group */ - - clr %o0 /* IEU0 */ - nop /* Stub */ -13: retl /* CTI+IEU1 Group */ - nop /* IEU0 */ - -14: retl /* CTI+IEU1 Group */ - add %o0, 1, %o0 /* IEU0 */ -15: retl /* CTI+IEU1 Group */ - add %o0, 2, %o0 /* IEU0 */ - -16: retl /* CTI+IEU1 Group */ - add %o0, 3, %o0 /* IEU0 */ -17: retl /* CTI+IEU1 Group */ - add %o0, 4, %o0 /* IEU0 */ - -18: retl /* CTI+IEU1 Group */ - add %o0, 5, %o0 /* IEU0 */ -19: retl /* CTI+IEU1 Group */ - add %o0, 6, %o0 /* IEU0 */ - -21: cmp %o0, %o2 /* IEU1 */ - be,pn %XCC, 12b /* CTI */ - sllx %g1, 7, %g2 /* IEU0 Group */ - ldub [%o0], %o3 /* Load */ - - or %g3, %g5, %g3 /* IEU1 */ -22: andcc %o0, 7, %g0 /* IEU1 Group */ - be,a,pn %icc, 1b /* CTI */ - ldx [%o0], %o3 /* Load */ - - cmp %o3, %o1 /* IEU1 Group */ - be,pn %xcc, 23f /* CTI */ - add %o0, 1, %o0 /* IEU0 */ - cmp %o0, %o2 /* IEU1 Group */ - - bne,a,pt %XCC, 22b /* CTI */ - ldub [%o0], %o3 /* Load */ - retl /* CTI+IEU1 Group */ - clr %o0 /* IEU0 */ - -23: retl /* CTI+IEU1 Group */ - add %o0, -1, %o0 /* IEU0 */ -END(memchr) - -libc_hidden_def(memchr) -#if !__BOUNDED_POINTERS__ -weak_alias(memchr,__ubp_memchr) -#endif diff --git a/libc/string/sparc/sparc64/memcpy.S b/libc/string/sparc/sparc64/memcpy.S deleted file mode 100644 index db63d1da2..000000000 --- a/libc/string/sparc/sparc64/memcpy.S +++ /dev/null @@ -1,923 +0,0 @@ -/* Copy SIZE bytes from SRC to DEST. - For UltraSPARC. - Copyright (C) 1996, 97, 98, 99, 2003 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by David S. Miller (davem@caip.rutgers.edu) and - Jakub Jelinek (jakub@redhat.com). 
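Note: the "Normally, this uses ((xword - 0x01010101) & 0x80808080) test" comment that recurs in the sparc32 string sources above, and in 64-bit form in the sparc64 files being removed here, refers to the classic word-at-a-time zero-byte test. A short C illustration of both the fast form and the exact EIGHTBIT_NOT_RARE form for 64-bit words, purely explanatory and not part of the patch:

	#include <stdint.h>

	#define ONES  0x0101010101010101ULL
	#define HIGHS 0x8080808080808080ULL

	/* Fast test: may give a false alarm when a byte is in 0x81..0xff,
	 * so callers re-check byte by byte after a hit. */
	static int may_have_zero_byte(uint64_t x)
	{
		return ((x - ONES) & HIGHS) != 0;
	}

	/* Exact test (the EIGHTBIT_NOT_RARE variant): no false alarms. */
	static int has_zero_byte(uint64_t x)
	{
		return ((x - ONES) & ~x & HIGHS) != 0;
	}
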
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <features.h> -#include <asm/asi.h> -#ifndef XCC -#define USE_BPR - .register %g2, #scratch - .register %g3, #scratch - .register %g6, #scratch -#define XCC xcc -#endif -#define FPRS_FEF 4 - -#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9) \ - faligndata %f1, %f2, %f48; \ - faligndata %f2, %f3, %f50; \ - faligndata %f3, %f4, %f52; \ - faligndata %f4, %f5, %f54; \ - faligndata %f5, %f6, %f56; \ - faligndata %f6, %f7, %f58; \ - faligndata %f7, %f8, %f60; \ - faligndata %f8, %f9, %f62; - -#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt) \ - ldda [%src] %asi, %fdest; \ - add %src, 0x40, %src; \ - add %dest, 0x40, %dest; \ - subcc %len, 0x40, %len; \ - be,pn %xcc, jmptgt; \ - stda %fsrc, [%dest - 0x40] %asi; - -#define LOOP_CHUNK1(src, dest, len, branch_dest) \ - MAIN_LOOP_CHUNK(src, dest, f0, f48, len, branch_dest) -#define LOOP_CHUNK2(src, dest, len, branch_dest) \ - MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest) -#define LOOP_CHUNK3(src, dest, len, branch_dest) \ - MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest) - -#define STORE_SYNC(dest, fsrc) \ - stda %fsrc, [%dest] %asi; \ - add %dest, 0x40, %dest; - -#define STORE_JUMP(dest, fsrc, target) \ - stda %fsrc, [%dest] %asi; \ - add %dest, 0x40, %dest; \ - ba,pt %xcc, target; - -#define VISLOOP_PAD nop; nop; nop; nop; \ - nop; nop; nop; nop; \ - nop; nop; nop; nop; \ - nop; nop; nop; - -#define FINISH_VISCHUNK(dest, f0, f1, left) \ - subcc %left, 8, %left; \ - bl,pn %xcc, 205f; \ - faligndata %f0, %f1, %f48; \ - std %f48, [%dest]; \ - add %dest, 8, %dest; - -#define UNEVEN_VISCHUNK(dest, f0, f1, left) \ - subcc %left, 8, %left; \ - bl,pn %xcc, 205f; \ - fsrc1 %f0, %f1; \ - ba,a,pt %xcc, 204f; - - /* Macros for non-VIS memcpy code. 
*/ -#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3) \ - ldx [%src + offset + 0x00], %t0; \ - ldx [%src + offset + 0x08], %t1; \ - ldx [%src + offset + 0x10], %t2; \ - ldx [%src + offset + 0x18], %t3; \ - stw %t0, [%dst + offset + 0x04]; \ - srlx %t0, 32, %t0; \ - stw %t0, [%dst + offset + 0x00]; \ - stw %t1, [%dst + offset + 0x0c]; \ - srlx %t1, 32, %t1; \ - stw %t1, [%dst + offset + 0x08]; \ - stw %t2, [%dst + offset + 0x14]; \ - srlx %t2, 32, %t2; \ - stw %t2, [%dst + offset + 0x10]; \ - stw %t3, [%dst + offset + 0x1c]; \ - srlx %t3, 32, %t3; \ - stw %t3, [%dst + offset + 0x18]; - -#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \ - ldx [%src + offset + 0x00], %t0; \ - ldx [%src + offset + 0x08], %t1; \ - ldx [%src + offset + 0x10], %t2; \ - ldx [%src + offset + 0x18], %t3; \ - stx %t0, [%dst + offset + 0x00]; \ - stx %t1, [%dst + offset + 0x08]; \ - stx %t2, [%dst + offset + 0x10]; \ - stx %t3, [%dst + offset + 0x18]; \ - ldx [%src + offset + 0x20], %t0; \ - ldx [%src + offset + 0x28], %t1; \ - ldx [%src + offset + 0x30], %t2; \ - ldx [%src + offset + 0x38], %t3; \ - stx %t0, [%dst + offset + 0x20]; \ - stx %t1, [%dst + offset + 0x28]; \ - stx %t2, [%dst + offset + 0x30]; \ - stx %t3, [%dst + offset + 0x38]; - -#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \ - ldx [%src - offset - 0x10], %t0; \ - ldx [%src - offset - 0x08], %t1; \ - stw %t0, [%dst - offset - 0x0c]; \ - srlx %t0, 32, %t2; \ - stw %t2, [%dst - offset - 0x10]; \ - stw %t1, [%dst - offset - 0x04]; \ - srlx %t1, 32, %t3; \ - stw %t3, [%dst - offset - 0x08]; - -#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1) \ - ldx [%src - offset - 0x10], %t0; \ - ldx [%src - offset - 0x08], %t1; \ - stx %t0, [%dst - offset - 0x10]; \ - stx %t1, [%dst - offset - 0x08]; - - /* Macros for non-VIS memmove code. 
*/ -#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3) \ - ldx [%src - offset - 0x20], %t0; \ - ldx [%src - offset - 0x18], %t1; \ - ldx [%src - offset - 0x10], %t2; \ - ldx [%src - offset - 0x08], %t3; \ - stw %t0, [%dst - offset - 0x1c]; \ - srlx %t0, 32, %t0; \ - stw %t0, [%dst - offset - 0x20]; \ - stw %t1, [%dst - offset - 0x14]; \ - srlx %t1, 32, %t1; \ - stw %t1, [%dst - offset - 0x18]; \ - stw %t2, [%dst - offset - 0x0c]; \ - srlx %t2, 32, %t2; \ - stw %t2, [%dst - offset - 0x10]; \ - stw %t3, [%dst - offset - 0x04]; \ - srlx %t3, 32, %t3; \ - stw %t3, [%dst - offset - 0x08]; - -#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \ - ldx [%src - offset - 0x20], %t0; \ - ldx [%src - offset - 0x18], %t1; \ - ldx [%src - offset - 0x10], %t2; \ - ldx [%src - offset - 0x08], %t3; \ - stx %t0, [%dst - offset - 0x20]; \ - stx %t1, [%dst - offset - 0x18]; \ - stx %t2, [%dst - offset - 0x10]; \ - stx %t3, [%dst - offset - 0x08]; \ - ldx [%src - offset - 0x40], %t0; \ - ldx [%src - offset - 0x38], %t1; \ - ldx [%src - offset - 0x30], %t2; \ - ldx [%src - offset - 0x28], %t3; \ - stx %t0, [%dst - offset - 0x40]; \ - stx %t1, [%dst - offset - 0x38]; \ - stx %t2, [%dst - offset - 0x30]; \ - stx %t3, [%dst - offset - 0x28]; - -#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \ - ldx [%src + offset + 0x00], %t0; \ - ldx [%src + offset + 0x08], %t1; \ - stw %t0, [%dst + offset + 0x04]; \ - srlx %t0, 32, %t2; \ - stw %t2, [%dst + offset + 0x00]; \ - stw %t1, [%dst + offset + 0x0c]; \ - srlx %t1, 32, %t3; \ - stw %t3, [%dst + offset + 0x08]; - -#define RMOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1) \ - ldx [%src + offset + 0x00], %t0; \ - ldx [%src + offset + 0x08], %t1; \ - stx %t0, [%dst + offset + 0x00]; \ - stx %t1, [%dst + offset + 0x08]; - - .text - .align 32 - -#ifdef __UCLIBC_SUSV3_LEGACY__ -ENTRY(bcopy) - sub %o1, %o0, %o4 /* IEU0 Group */ - mov %o0, %g3 /* IEU1 */ - cmp %o4, %o2 /* IEU1 Group */ - mov %o1, %o0 /* IEU0 */ - bgeu,pt %XCC, 210f /* CTI */ - mov %g3, %o1 /* IEU0 Group */ -#ifndef USE_BPR - srl %o2, 0, %o2 /* IEU1 */ -#endif - brnz,pn %o2, 220f /* CTI Group */ - add %o0, %o2, %o0 /* IEU0 */ - retl - nop -END(bcopy) -#endif - - .align 32 -200: be,pt %xcc, 201f /* CTI */ - andcc %o0, 0x38, %g5 /* IEU1 Group */ - mov 8, %g1 /* IEU0 */ - sub %g1, %g2, %g2 /* IEU0 Group */ - andcc %o0, 1, %g0 /* IEU1 */ - be,pt %icc, 2f /* CTI */ - sub %o2, %g2, %o2 /* IEU0 Group */ -1: ldub [%o1], %o5 /* Load Group */ - add %o1, 1, %o1 /* IEU0 */ - add %o0, 1, %o0 /* IEU1 */ - subcc %g2, 1, %g2 /* IEU1 Group */ - be,pn %xcc, 3f /* CTI */ - stb %o5, [%o0 - 1] /* Store */ -2: ldub [%o1], %o5 /* Load Group */ - add %o0, 2, %o0 /* IEU0 */ - ldub [%o1 + 1], %g3 /* Load Group */ - subcc %g2, 2, %g2 /* IEU1 Group */ - stb %o5, [%o0 - 2] /* Store */ - add %o1, 2, %o1 /* IEU0 */ - bne,pt %xcc, 2b /* CTI Group */ - stb %g3, [%o0 - 1] /* Store */ -3: andcc %o0, 0x38, %g5 /* IEU1 Group */ -201: be,pt %icc, 202f /* CTI */ - mov 64, %g1 /* IEU0 */ - fmovd %f0, %f2 /* FPU */ - sub %g1, %g5, %g5 /* IEU0 Group */ - alignaddr %o1, %g0, %g1 /* GRU Group */ - ldd [%g1], %f4 /* Load Group */ - sub %o2, %g5, %o2 /* IEU0 */ -1: ldd [%g1 + 0x8], %f6 /* Load Group */ - add %g1, 0x8, %g1 /* IEU0 Group */ - subcc %g5, 8, %g5 /* IEU1 */ - faligndata %f4, %f6, %f0 /* GRU Group */ - std %f0, [%o0] /* Store */ - add %o1, 8, %o1 /* IEU0 Group */ - be,pn %xcc, 202f /* CTI */ - add %o0, 8, %o0 /* IEU1 */ - ldd [%g1 + 0x8], %f4 /* Load Group */ - add %g1, 8, %g1 /* IEU0 */ - subcc %g5, 8, %g5 /* IEU1 */ - 
faligndata %f6, %f4, %f0 /* GRU Group */ - std %f0, [%o0] /* Store */ - add %o1, 8, %o1 /* IEU0 */ - bne,pt %xcc, 1b /* CTI Group */ - add %o0, 8, %o0 /* IEU0 */ -202: membar #LoadStore | #StoreStore | #StoreLoad /* LSU Group */ - wr %g0, ASI_BLK_P, %asi /* LSU Group */ - subcc %o2, 0x40, %g6 /* IEU1 Group */ - mov %o1, %g1 /* IEU0 */ - andncc %g6, (0x40 - 1), %g6 /* IEU1 Group */ - srl %g1, 3, %g2 /* IEU0 */ - sub %o2, %g6, %g3 /* IEU0 Group */ - andn %o1, (0x40 - 1), %o1 /* IEU1 */ - and %g2, 7, %g2 /* IEU0 Group */ - andncc %g3, 0x7, %g3 /* IEU1 */ - fmovd %f0, %f2 /* FPU */ - sub %g3, 0x10, %g3 /* IEU0 Group */ - sub %o2, %g6, %o2 /* IEU1 */ - alignaddr %g1, %g0, %g0 /* GRU Group */ - add %g1, %g6, %g1 /* IEU0 Group */ - subcc %o2, %g3, %o2 /* IEU1 */ - ldda [%o1 + 0x00] %asi, %f0 /* LSU Group */ - add %g1, %g3, %g1 /* IEU0 */ - ldda [%o1 + 0x40] %asi, %f16 /* LSU Group */ - sub %g6, 0x80, %g6 /* IEU0 */ - ldda [%o1 + 0x80] %asi, %f32 /* LSU Group */ - /* Clk1 Group 8-( */ - /* Clk2 Group 8-( */ - /* Clk3 Group 8-( */ - /* Clk4 Group 8-( */ -203: rd %pc, %g5 /* PDU Group 8-( */ - addcc %g5, %lo(300f - 203b), %g5 /* IEU1 Group */ - sll %g2, 9, %g2 /* IEU0 */ - jmpl %g5 + %g2, %g0 /* CTI Group brk forced*/ - addcc %o1, 0xc0, %o1 /* IEU1 Group */ - - .align 512 /* OK, here comes the fun part... */ -300: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) LOOP_CHUNK1(o1, o0, g6, 301f) - FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) LOOP_CHUNK2(o1, o0, g6, 302f) - FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) LOOP_CHUNK3(o1, o0, g6, 303f) - b,pt %xcc, 300b+4; faligndata %f0, %f2, %f48 -301: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) STORE_JUMP(o0, f48, 400f) membar #Sync -302: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) STORE_JUMP(o0, f48, 416f) membar #Sync -303: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) STORE_JUMP(o0, f48, 432f) membar #Sync - VISLOOP_PAD -310: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) LOOP_CHUNK1(o1, o0, g6, 311f) - FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) LOOP_CHUNK2(o1, o0, g6, 312f) - FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) LOOP_CHUNK3(o1, o0, g6, 313f) - b,pt %xcc, 310b+4; faligndata %f2, %f4, %f48 -311: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) STORE_JUMP(o0, f48, 402f) membar #Sync -312: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) STORE_JUMP(o0, f48, 418f) membar #Sync -313: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) STORE_JUMP(o0, f48, 434f) membar #Sync - VISLOOP_PAD -320: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) LOOP_CHUNK1(o1, o0, g6, 321f) - FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) LOOP_CHUNK2(o1, o0, g6, 322f) - FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) LOOP_CHUNK3(o1, o0, g6, 323f) - b,pt %xcc, 320b+4; faligndata %f4, %f6, %f48 -321: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) STORE_JUMP(o0, f48, 404f) membar #Sync -322: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f4, f6, f8, 
f10,f12,f14,f16,f18,f20) STORE_JUMP(o0, f48, 420f) membar #Sync -323: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) STORE_JUMP(o0, f48, 436f) membar #Sync - VISLOOP_PAD -330: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) LOOP_CHUNK1(o1, o0, g6, 331f) - FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) LOOP_CHUNK2(o1, o0, g6, 332f) - FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) LOOP_CHUNK3(o1, o0, g6, 333f) - b,pt %xcc, 330b+4; faligndata %f6, %f8, %f48 -331: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) STORE_JUMP(o0, f48, 406f) membar #Sync -332: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) STORE_JUMP(o0, f48, 422f) membar #Sync -333: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) STORE_JUMP(o0, f48, 438f) membar #Sync - VISLOOP_PAD -340: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) LOOP_CHUNK1(o1, o0, g6, 341f) - FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) LOOP_CHUNK2(o1, o0, g6, 342f) - FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) LOOP_CHUNK3(o1, o0, g6, 343f) - b,pt %xcc, 340b+4; faligndata %f8, %f10, %f48 -341: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) STORE_JUMP(o0, f48, 408f) membar #Sync -342: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) STORE_JUMP(o0, f48, 424f) membar #Sync -343: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) STORE_JUMP(o0, f48, 440f) membar #Sync - VISLOOP_PAD -350: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) LOOP_CHUNK1(o1, o0, g6, 351f) - FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) LOOP_CHUNK2(o1, o0, g6, 352f) - FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) LOOP_CHUNK3(o1, o0, g6, 353f) - b,pt %xcc, 350b+4; faligndata %f10, %f12, %f48 -351: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) STORE_JUMP(o0, f48, 410f) membar #Sync -352: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) STORE_JUMP(o0, f48, 426f) membar #Sync -353: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) STORE_JUMP(o0, f48, 442f) membar #Sync - VISLOOP_PAD -360: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) LOOP_CHUNK1(o1, o0, g6, 361f) - FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) LOOP_CHUNK2(o1, o0, g6, 362f) - FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) LOOP_CHUNK3(o1, o0, g6, 363f) - b,pt %xcc, 360b+4; faligndata %f12, %f14, %f48 -361: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) STORE_JUMP(o0, f48, 412f) membar #Sync -362: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) STORE_JUMP(o0, f48, 428f) membar #Sync -363: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) STORE_JUMP(o0, f48, 444f) membar #Sync - VISLOOP_PAD -370: 
FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) LOOP_CHUNK1(o1, o0, g6, 371f) - FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) LOOP_CHUNK2(o1, o0, g6, 372f) - FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) LOOP_CHUNK3(o1, o0, g6, 373f) - b,pt %xcc, 370b+4; faligndata %f14, %f16, %f48 -371: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) STORE_JUMP(o0, f48, 414f) membar #Sync -372: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) STORE_JUMP(o0, f48, 430f) membar #Sync -373: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) STORE_SYNC(o0, f48) membar #Sync - FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) STORE_JUMP(o0, f48, 446f) membar #Sync - VISLOOP_PAD -400: FINISH_VISCHUNK(o0, f0, f2, g3) -402: FINISH_VISCHUNK(o0, f2, f4, g3) -404: FINISH_VISCHUNK(o0, f4, f6, g3) -406: FINISH_VISCHUNK(o0, f6, f8, g3) -408: FINISH_VISCHUNK(o0, f8, f10, g3) -410: FINISH_VISCHUNK(o0, f10, f12, g3) -412: FINISH_VISCHUNK(o0, f12, f14, g3) -414: UNEVEN_VISCHUNK(o0, f14, f0, g3) -416: FINISH_VISCHUNK(o0, f16, f18, g3) -418: FINISH_VISCHUNK(o0, f18, f20, g3) -420: FINISH_VISCHUNK(o0, f20, f22, g3) -422: FINISH_VISCHUNK(o0, f22, f24, g3) -424: FINISH_VISCHUNK(o0, f24, f26, g3) -426: FINISH_VISCHUNK(o0, f26, f28, g3) -428: FINISH_VISCHUNK(o0, f28, f30, g3) -430: UNEVEN_VISCHUNK(o0, f30, f0, g3) -432: FINISH_VISCHUNK(o0, f32, f34, g3) -434: FINISH_VISCHUNK(o0, f34, f36, g3) -436: FINISH_VISCHUNK(o0, f36, f38, g3) -438: FINISH_VISCHUNK(o0, f38, f40, g3) -440: FINISH_VISCHUNK(o0, f40, f42, g3) -442: FINISH_VISCHUNK(o0, f42, f44, g3) -444: FINISH_VISCHUNK(o0, f44, f46, g3) -446: UNEVEN_VISCHUNK(o0, f46, f0, g3) -204: ldd [%o1], %f2 /* Load Group */ - add %o1, 8, %o1 /* IEU0 */ - subcc %g3, 8, %g3 /* IEU1 */ - faligndata %f0, %f2, %f8 /* GRU Group */ - std %f8, [%o0] /* Store */ - bl,pn %xcc, 205f /* CTI */ - add %o0, 8, %o0 /* IEU0 Group */ - ldd [%o1], %f0 /* Load Group */ - add %o1, 8, %o1 /* IEU0 */ - subcc %g3, 8, %g3 /* IEU1 */ - faligndata %f2, %f0, %f8 /* GRU Group */ - std %f8, [%o0] /* Store */ - bge,pt %xcc, 204b /* CTI */ - add %o0, 8, %o0 /* IEU0 Group */ -205: brz,pt %o2, 207f /* CTI Group */ - mov %g1, %o1 /* IEU0 */ -206: ldub [%o1], %g5 /* LOAD */ - add %o1, 1, %o1 /* IEU0 */ - add %o0, 1, %o0 /* IEU1 */ - subcc %o2, 1, %o2 /* IEU1 */ - bne,pt %xcc, 206b /* CTI */ - stb %g5, [%o0 - 1] /* Store Group */ -207: membar #StoreLoad | #StoreStore /* LSU Group */ - wr %g0, FPRS_FEF, %fprs - retl - mov %g4, %o0 - -208: andcc %o2, 1, %g0 /* IEU1 Group */ - be,pt %icc, 2f+4 /* CTI */ -1: ldub [%o1], %g5 /* LOAD Group */ - add %o1, 1, %o1 /* IEU0 */ - add %o0, 1, %o0 /* IEU1 */ - subcc %o2, 1, %o2 /* IEU1 Group */ - be,pn %xcc, 209f /* CTI */ - stb %g5, [%o0 - 1] /* Store */ -2: ldub [%o1], %g5 /* LOAD Group */ - add %o0, 2, %o0 /* IEU0 */ - ldub [%o1 + 1], %o5 /* LOAD Group */ - add %o1, 2, %o1 /* IEU0 */ - subcc %o2, 2, %o2 /* IEU1 Group */ - stb %g5, [%o0 - 2] /* Store */ - bne,pt %xcc, 2b /* CTI */ - stb %o5, [%o0 - 1] /* Store */ -209: retl - mov %g4, %o0 - -#ifdef USE_BPR - - /* void *__align_cpy_4(void *dest, void *src, size_t n) - * SPARC v9 SYSV ABI - * Like memcpy, but results are undefined if (!n || ((dest | src | n) & 3)) - */ - - .align 32 -ENTRY(__align_cpy_4) - mov %o0, %g4 /* IEU0 Group */ - cmp %o2, 15 /* IEU1 */ - bleu,pn %xcc, 208b /* CTI */ - cmp %o2, (64 * 6) /* IEU1 Group */ - bgeu,pn %xcc, 200b /* CTI */ - andcc %o0, 7, %g2 /* IEU1 
Group */ - ba,pt %xcc, 216f /* CTI */ - andcc %o1, 4, %g0 /* IEU1 Group */ -END(__align_cpy_4) - - /* void *__align_cpy_8(void *dest, void *src, size_t n) - * SPARC v9 SYSV ABI - * Like memcpy, but results are undefined if (!n || ((dest | src | n) & 7)) - */ - - .align 32 -ENTRY(__align_cpy_8) - mov %o0, %g4 /* IEU0 Group */ - cmp %o2, 15 /* IEU1 */ - bleu,pn %xcc, 208b /* CTI */ - cmp %o2, (64 * 6) /* IEU1 Group */ - bgeu,pn %xcc, 201b /* CTI */ - andcc %o0, 0x38, %g5 /* IEU1 Group */ - andcc %o2, -128, %g6 /* IEU1 Group */ - bne,a,pt %xcc, 82f + 4 /* CTI */ - ldx [%o1], %g1 /* Load */ - ba,pt %xcc, 41f /* CTI Group */ - andcc %o2, 0x70, %g6 /* IEU1 */ -END(__align_cpy_8) - - /* void *__align_cpy_16(void *dest, void *src, size_t n) - * SPARC v9 SYSV ABI - * Like memcpy, but results are undefined if (!n || ((dest | src | n) & 15)) - */ - - .align 32 -ENTRY(__align_cpy_16) - mov %o0, %g4 /* IEU0 Group */ - cmp %o2, (64 * 6) /* IEU1 */ - bgeu,pn %xcc, 201b /* CTI */ - andcc %o0, 0x38, %g5 /* IEU1 Group */ - andcc %o2, -128, %g6 /* IEU1 Group */ - bne,a,pt %xcc, 82f + 4 /* CTI */ - ldx [%o1], %g1 /* Load */ - ba,pt %xcc, 41f /* CTI Group */ - andcc %o2, 0x70, %g6 /* IEU1 */ -END(__align_cpy_16) - -#endif - - .align 32 -ENTRY(memcpy) -210: -#ifndef USE_BPR - srl %o2, 0, %o2 /* IEU1 Group */ -#endif - brz,pn %o2, 209b /* CTI Group */ - mov %o0, %g4 /* IEU0 */ -218: cmp %o2, 15 /* IEU1 Group */ - bleu,pn %xcc, 208b /* CTI */ - cmp %o2, (64 * 6) /* IEU1 Group */ - bgeu,pn %xcc, 200b /* CTI */ - andcc %o0, 7, %g2 /* IEU1 Group */ - sub %o0, %o1, %g5 /* IEU0 */ - andcc %g5, 3, %o5 /* IEU1 Group */ - bne,pn %xcc, 212f /* CTI */ - andcc %o1, 3, %g0 /* IEU1 Group */ - be,a,pt %xcc, 216f /* CTI */ - andcc %o1, 4, %g0 /* IEU1 Group */ - andcc %o1, 1, %g0 /* IEU1 Group */ - be,pn %xcc, 4f /* CTI */ - andcc %o1, 2, %g0 /* IEU1 Group */ - ldub [%o1], %g2 /* Load Group */ - add %o1, 1, %o1 /* IEU0 */ - add %o0, 1, %o0 /* IEU1 */ - sub %o2, 1, %o2 /* IEU0 Group */ - bne,pn %xcc, 5f /* CTI Group */ - stb %g2, [%o0 - 1] /* Store */ -4: lduh [%o1], %g2 /* Load Group */ - add %o1, 2, %o1 /* IEU0 */ - add %o0, 2, %o0 /* IEU1 */ - sub %o2, 2, %o2 /* IEU0 */ - sth %g2, [%o0 - 2] /* Store Group + bubble */ -5: andcc %o1, 4, %g0 /* IEU1 */ -216: be,a,pn %xcc, 2f /* CTI */ - andcc %o2, -128, %g6 /* IEU1 Group */ - lduw [%o1], %g5 /* Load Group */ - add %o1, 4, %o1 /* IEU0 */ - add %o0, 4, %o0 /* IEU1 */ - sub %o2, 4, %o2 /* IEU0 Group */ - stw %g5, [%o0 - 4] /* Store */ - andcc %o2, -128, %g6 /* IEU1 Group */ -2: be,pn %xcc, 215f /* CTI */ - andcc %o0, 4, %g0 /* IEU1 Group */ - be,pn %xcc, 82f + 4 /* CTI Group */ -5: MOVE_BIGCHUNK(o1, o0, 0x00, g1, g3, g5, o5) - MOVE_BIGCHUNK(o1, o0, 0x20, g1, g3, g5, o5) - MOVE_BIGCHUNK(o1, o0, 0x40, g1, g3, g5, o5) - MOVE_BIGCHUNK(o1, o0, 0x60, g1, g3, g5, o5) -35: subcc %g6, 128, %g6 /* IEU1 Group */ - add %o1, 128, %o1 /* IEU0 */ - bne,pt %xcc, 5b /* CTI */ - add %o0, 128, %o0 /* IEU0 Group */ -215: andcc %o2, 0x70, %g6 /* IEU1 Group */ -41: be,pn %xcc, 80f /* CTI */ - andcc %o2, 8, %g0 /* IEU1 Group */ - /* Clk1 8-( */ - /* Clk2 8-( */ - /* Clk3 8-( */ - /* Clk4 8-( */ -79: rd %pc, %o5 /* PDU Group */ - sll %g6, 1, %g5 /* IEU0 Group */ - add %o1, %g6, %o1 /* IEU1 */ - sub %o5, %g5, %o5 /* IEU0 Group */ - jmpl %o5 + %lo(80f - 79b), %g0 /* CTI Group brk forced*/ - add %o0, %g6, %o0 /* IEU0 Group */ -36: MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g5, o5) - MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g5, o5) - MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g5, o5) - MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g5, 
o5) - MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g5, o5) - MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g5, o5) - MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g5, o5) -80: be,pt %xcc, 81f /* CTI */ - andcc %o2, 4, %g0 /* IEU1 */ - ldx [%o1], %g2 /* Load Group */ - add %o0, 8, %o0 /* IEU0 */ - stw %g2, [%o0 - 0x4] /* Store Group */ - add %o1, 8, %o1 /* IEU1 */ - srlx %g2, 32, %g2 /* IEU0 Group */ - stw %g2, [%o0 - 0x8] /* Store */ -81: be,pt %xcc, 1f /* CTI */ - andcc %o2, 2, %g0 /* IEU1 Group */ - lduw [%o1], %g2 /* Load Group */ - add %o1, 4, %o1 /* IEU0 */ - stw %g2, [%o0] /* Store Group */ - add %o0, 4, %o0 /* IEU0 */ -1: be,pt %xcc, 1f /* CTI */ - andcc %o2, 1, %g0 /* IEU1 Group */ - lduh [%o1], %g2 /* Load Group */ - add %o1, 2, %o1 /* IEU0 */ - sth %g2, [%o0] /* Store Group */ - add %o0, 2, %o0 /* IEU0 */ -1: be,pt %xcc, 211f /* CTI */ - nop /* IEU1 */ - ldub [%o1], %g2 /* Load Group */ - stb %g2, [%o0] /* Store Group + bubble */ -211: retl - mov %g4, %o0 - -82: MOVE_BIGALIGNCHUNK(o1, o0, 0x00, g1, g3, g5, o5) - MOVE_BIGALIGNCHUNK(o1, o0, 0x40, g1, g3, g5, o5) -37: subcc %g6, 128, %g6 /* IEU1 Group */ - add %o1, 128, %o1 /* IEU0 */ - bne,pt %xcc, 82b /* CTI */ - add %o0, 128, %o0 /* IEU0 Group */ - andcc %o2, 0x70, %g6 /* IEU1 */ - be,pn %xcc, 84f /* CTI */ - andcc %o2, 8, %g0 /* IEU1 Group */ - /* Clk1 8-( */ - /* Clk2 8-( */ - /* Clk3 8-( */ - /* Clk4 8-( */ -83: rd %pc, %o5 /* PDU Group */ - add %o1, %g6, %o1 /* IEU0 Group */ - sub %o5, %g6, %o5 /* IEU1 */ - jmpl %o5 + %lo(84f - 83b), %g0 /* CTI Group brk forced*/ - add %o0, %g6, %o0 /* IEU0 Group */ -38: MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3) - MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3) - MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3) - MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3) - MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3) - MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3) - MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3) -84: be,pt %xcc, 85f /* CTI Group */ - andcc %o2, 4, %g0 /* IEU1 */ - ldx [%o1], %g2 /* Load Group */ - add %o0, 8, %o0 /* IEU0 */ - add %o1, 8, %o1 /* IEU0 Group */ - stx %g2, [%o0 - 0x8] /* Store */ -85: be,pt %xcc, 1f /* CTI */ - andcc %o2, 2, %g0 /* IEU1 Group */ - lduw [%o1], %g2 /* Load Group */ - add %o0, 4, %o0 /* IEU0 */ - add %o1, 4, %o1 /* IEU0 Group */ - stw %g2, [%o0 - 0x4] /* Store */ -1: be,pt %xcc, 1f /* CTI */ - andcc %o2, 1, %g0 /* IEU1 Group */ - lduh [%o1], %g2 /* Load Group */ - add %o0, 2, %o0 /* IEU0 */ - add %o1, 2, %o1 /* IEU0 Group */ - sth %g2, [%o0 - 0x2] /* Store */ -1: be,pt %xcc, 1f /* CTI */ - nop /* IEU0 Group */ - ldub [%o1], %g2 /* Load Group */ - stb %g2, [%o0] /* Store Group + bubble */ -1: retl - mov %g4, %o0 - -212: brz,pt %g2, 2f /* CTI Group */ - mov 8, %g1 /* IEU0 */ - sub %g1, %g2, %g2 /* IEU0 Group */ - sub %o2, %g2, %o2 /* IEU0 Group */ -1: ldub [%o1], %g5 /* Load Group */ - add %o1, 1, %o1 /* IEU0 */ - add %o0, 1, %o0 /* IEU1 */ - subcc %g2, 1, %g2 /* IEU1 Group */ - bne,pt %xcc, 1b /* CTI */ - stb %g5, [%o0 - 1] /* Store */ -2: andn %o2, 7, %g5 /* IEU0 Group */ - and %o2, 7, %o2 /* IEU1 */ - fmovd %f0, %f2 /* FPU */ - alignaddr %o1, %g0, %g1 /* GRU Group */ - ldd [%g1], %f4 /* Load Group */ -1: ldd [%g1 + 0x8], %f6 /* Load Group */ - add %g1, 0x8, %g1 /* IEU0 Group */ - subcc %g5, 8, %g5 /* IEU1 */ - faligndata %f4, %f6, %f0 /* GRU Group */ - std %f0, [%o0] /* Store */ - add %o1, 8, %o1 /* IEU0 Group */ - be,pn %xcc, 213f /* CTI */ - add %o0, 8, %o0 /* IEU1 */ - ldd [%g1 + 0x8], %f4 /* Load Group */ - add %g1, 8, %g1 /* IEU0 */ - subcc %g5, 8, %g5 /* IEU1 */ - faligndata %f6, %f4, %f0 /* GRU Group */ - 
std %f0, [%o0] /* Store */ - add %o1, 8, %o1 /* IEU0 */ - bne,pn %xcc, 1b /* CTI Group */ - add %o0, 8, %o0 /* IEU0 */ -213: brz,pn %o2, 214f /* CTI Group */ - nop /* IEU0 */ - ldub [%o1], %g5 /* LOAD */ - add %o1, 1, %o1 /* IEU0 */ - add %o0, 1, %o0 /* IEU1 */ - subcc %o2, 1, %o2 /* IEU1 */ - bne,pt %xcc, 206b /* CTI */ - stb %g5, [%o0 - 1] /* Store Group */ -214: wr %g0, FPRS_FEF, %fprs - retl - mov %g4, %o0 -END(memcpy) -libc_hidden_def(memcpy) - - .align 32 -228: andcc %o2, 1, %g0 /* IEU1 Group */ - be,pt %icc, 2f+4 /* CTI */ -1: ldub [%o1 - 1], %o5 /* LOAD Group */ - sub %o1, 1, %o1 /* IEU0 */ - sub %o0, 1, %o0 /* IEU1 */ - subcc %o2, 1, %o2 /* IEU1 Group */ - be,pn %xcc, 229f /* CTI */ - stb %o5, [%o0] /* Store */ -2: ldub [%o1 - 1], %o5 /* LOAD Group */ - sub %o0, 2, %o0 /* IEU0 */ - ldub [%o1 - 2], %g5 /* LOAD Group */ - sub %o1, 2, %o1 /* IEU0 */ - subcc %o2, 2, %o2 /* IEU1 Group */ - stb %o5, [%o0 + 1] /* Store */ - bne,pt %xcc, 2b /* CTI */ - stb %g5, [%o0] /* Store */ -229: retl - mov %g4, %o0 -219: retl - nop - - .align 32 -ENTRY(memmove) -#ifndef USE_BPR - srl %o2, 0, %o2 /* IEU1 Group */ -#endif - brz,pn %o2, 219b /* CTI Group */ - sub %o0, %o1, %o4 /* IEU0 */ - cmp %o4, %o2 /* IEU1 Group */ - bgeu,pt %XCC, 218b /* CTI */ - mov %o0, %g4 /* IEU0 */ - add %o0, %o2, %o0 /* IEU0 Group */ -220: add %o1, %o2, %o1 /* IEU1 */ - cmp %o2, 15 /* IEU1 Group */ - bleu,pn %xcc, 228b /* CTI */ - andcc %o0, 7, %g2 /* IEU1 Group */ - sub %o0, %o1, %g5 /* IEU0 */ - andcc %g5, 3, %o5 /* IEU1 Group */ - bne,pn %xcc, 232f /* CTI */ - andcc %o1, 3, %g0 /* IEU1 Group */ - be,a,pt %xcc, 236f /* CTI */ - andcc %o1, 4, %g0 /* IEU1 Group */ - andcc %o1, 1, %g0 /* IEU1 Group */ - be,pn %xcc, 4f /* CTI */ - andcc %o1, 2, %g0 /* IEU1 Group */ - ldub [%o1 - 1], %g2 /* Load Group */ - sub %o1, 1, %o1 /* IEU0 */ - sub %o0, 1, %o0 /* IEU1 */ - sub %o2, 1, %o2 /* IEU0 Group */ - be,pn %xcc, 5f /* CTI Group */ - stb %g2, [%o0] /* Store */ -4: lduh [%o1 - 2], %g2 /* Load Group */ - sub %o1, 2, %o1 /* IEU0 */ - sub %o0, 2, %o0 /* IEU1 */ - sub %o2, 2, %o2 /* IEU0 */ - sth %g2, [%o0] /* Store Group + bubble */ -5: andcc %o1, 4, %g0 /* IEU1 */ -236: be,a,pn %xcc, 2f /* CTI */ - andcc %o2, -128, %g6 /* IEU1 Group */ - lduw [%o1 - 4], %g5 /* Load Group */ - sub %o1, 4, %o1 /* IEU0 */ - sub %o0, 4, %o0 /* IEU1 */ - sub %o2, 4, %o2 /* IEU0 Group */ - stw %g5, [%o0] /* Store */ - andcc %o2, -128, %g6 /* IEU1 Group */ -2: be,pn %xcc, 235f /* CTI */ - andcc %o0, 4, %g0 /* IEU1 Group */ - be,pn %xcc, 282f + 4 /* CTI Group */ -5: RMOVE_BIGCHUNK(o1, o0, 0x00, g1, g3, g5, o5) - RMOVE_BIGCHUNK(o1, o0, 0x20, g1, g3, g5, o5) - RMOVE_BIGCHUNK(o1, o0, 0x40, g1, g3, g5, o5) - RMOVE_BIGCHUNK(o1, o0, 0x60, g1, g3, g5, o5) - subcc %g6, 128, %g6 /* IEU1 Group */ - sub %o1, 128, %o1 /* IEU0 */ - bne,pt %xcc, 5b /* CTI */ - sub %o0, 128, %o0 /* IEU0 Group */ -235: andcc %o2, 0x70, %g6 /* IEU1 Group */ -41: be,pn %xcc, 280f /* CTI */ - andcc %o2, 8, %g0 /* IEU1 Group */ - /* Clk1 8-( */ - /* Clk2 8-( */ - /* Clk3 8-( */ - /* Clk4 8-( */ -279: rd %pc, %o5 /* PDU Group */ - sll %g6, 1, %g5 /* IEU0 Group */ - sub %o1, %g6, %o1 /* IEU1 */ - sub %o5, %g5, %o5 /* IEU0 Group */ - jmpl %o5 + %lo(280f - 279b), %g0 /* CTI Group brk forced*/ - sub %o0, %g6, %o0 /* IEU0 Group */ - RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x10, g2, 
g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g5, o5) -280: be,pt %xcc, 281f /* CTI */ - andcc %o2, 4, %g0 /* IEU1 */ - ldx [%o1 - 8], %g2 /* Load Group */ - sub %o0, 8, %o0 /* IEU0 */ - stw %g2, [%o0 + 4] /* Store Group */ - sub %o1, 8, %o1 /* IEU1 */ - srlx %g2, 32, %g2 /* IEU0 Group */ - stw %g2, [%o0] /* Store */ -281: be,pt %xcc, 1f /* CTI */ - andcc %o2, 2, %g0 /* IEU1 Group */ - lduw [%o1 - 4], %g2 /* Load Group */ - sub %o1, 4, %o1 /* IEU0 */ - stw %g2, [%o0 - 4] /* Store Group */ - sub %o0, 4, %o0 /* IEU0 */ -1: be,pt %xcc, 1f /* CTI */ - andcc %o2, 1, %g0 /* IEU1 Group */ - lduh [%o1 - 2], %g2 /* Load Group */ - sub %o1, 2, %o1 /* IEU0 */ - sth %g2, [%o0 - 2] /* Store Group */ - sub %o0, 2, %o0 /* IEU0 */ -1: be,pt %xcc, 211f /* CTI */ - nop /* IEU1 */ - ldub [%o1 - 1], %g2 /* Load Group */ - stb %g2, [%o0 - 1] /* Store Group + bubble */ -211: retl - mov %g4, %o0 - -282: RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, g1, g3, g5, o5) - RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, g1, g3, g5, o5) - subcc %g6, 128, %g6 /* IEU1 Group */ - sub %o1, 128, %o1 /* IEU0 */ - bne,pt %xcc, 282b /* CTI */ - sub %o0, 128, %o0 /* IEU0 Group */ - andcc %o2, 0x70, %g6 /* IEU1 */ - be,pn %xcc, 284f /* CTI */ - andcc %o2, 8, %g0 /* IEU1 Group */ - /* Clk1 8-( */ - /* Clk2 8-( */ - /* Clk3 8-( */ - /* Clk4 8-( */ -283: rd %pc, %o5 /* PDU Group */ - sub %o1, %g6, %o1 /* IEU0 Group */ - sub %o5, %g6, %o5 /* IEU1 */ - jmpl %o5 + %lo(284f - 283b), %g0 /* CTI Group brk forced*/ - sub %o0, %g6, %o0 /* IEU0 Group */ - RMOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3) -284: be,pt %xcc, 285f /* CTI Group */ - andcc %o2, 4, %g0 /* IEU1 */ - ldx [%o1 - 8], %g2 /* Load Group */ - sub %o0, 8, %o0 /* IEU0 */ - sub %o1, 8, %o1 /* IEU0 Group */ - stx %g2, [%o0] /* Store */ -285: be,pt %xcc, 1f /* CTI */ - andcc %o2, 2, %g0 /* IEU1 Group */ - lduw [%o1 - 4], %g2 /* Load Group */ - sub %o0, 4, %o0 /* IEU0 */ - sub %o1, 4, %o1 /* IEU0 Group */ - stw %g2, [%o0] /* Store */ -1: be,pt %xcc, 1f /* CTI */ - andcc %o2, 1, %g0 /* IEU1 Group */ - lduh [%o1 - 2], %g2 /* Load Group */ - sub %o0, 2, %o0 /* IEU0 */ - sub %o1, 2, %o1 /* IEU0 Group */ - sth %g2, [%o0] /* Store */ -1: be,pt %xcc, 1f /* CTI */ - nop /* IEU0 Group */ - ldub [%o1 - 1], %g2 /* Load Group */ - stb %g2, [%o0 - 1] /* Store Group + bubble */ -1: retl - mov %g4, %o0 - -232: brz,pt %g2, 2f /* CTI Group */ - sub %o2, %g2, %o2 /* IEU0 Group */ -1: ldub [%o1 - 1], %g5 /* Load Group */ - sub %o1, 1, %o1 /* IEU0 */ - sub %o0, 1, %o0 /* IEU1 */ - subcc %g2, 1, %g2 /* IEU1 Group */ - bne,pt %xcc, 1b /* CTI */ - stb %g5, [%o0] /* Store */ -2: andn %o2, 7, %g5 /* IEU0 Group */ - and %o2, 7, %o2 /* IEU1 */ - fmovd %f0, %f2 /* FPU */ - alignaddr %o1, %g0, %g1 /* GRU Group */ - ldd [%g1], %f4 /* Load Group */ -1: ldd [%g1 - 8], %f6 /* Load Group */ - sub %g1, 8, %g1 /* IEU0 Group */ - subcc %g5, 8, %g5 /* IEU1 */ - faligndata %f6, %f4, %f0 /* GRU Group */ - std %f0, [%o0 - 8] /* Store */ - sub %o1, 8, %o1 /* IEU0 Group */ - be,pn %xcc, 233f /* CTI */ - sub %o0, 8, %o0 /* IEU1 */ - ldd [%g1 - 8], %f4 /* Load Group */ - sub %g1, 8, %g1 /* IEU0 */ - subcc %g5, 8, %g5 /* IEU1 */ - faligndata %f4, %f6, %f0 /* GRU Group */ - std %f0, [%o0 - 8] /* Store */ - sub %o1, 8, %o1 /* IEU0 */ - bne,pn %xcc, 1b /* CTI Group */ - sub %o0, 8, %o0 
/* IEU0 */ -233: brz,pn %o2, 234f /* CTI Group */ - nop /* IEU0 */ -237: ldub [%o1 - 1], %g5 /* LOAD */ - sub %o1, 1, %o1 /* IEU0 */ - sub %o0, 1, %o0 /* IEU1 */ - subcc %o2, 1, %o2 /* IEU1 */ - bne,pt %xcc, 237b /* CTI */ - stb %g5, [%o0] /* Store Group */ -234: wr %g0, FPRS_FEF, %fprs - retl - mov %g4, %o0 -END(memmove) -libc_hidden_def(memmove) - -#ifdef USE_BPR -weak_alias(memcpy,__align_cpy_1) -weak_alias(memcpy,__align_cpy_2) -#endif diff --git a/libc/string/sparc/sparc64/memset.S b/libc/string/sparc/sparc64/memset.S deleted file mode 100644 index 50e404bcc..000000000 --- a/libc/string/sparc/sparc64/memset.S +++ /dev/null @@ -1,317 +0,0 @@ -/* Set a block of memory to some byte value. - For UltraSPARC. - Copyright (C) 1996, 97, 98, 99, 2003 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by David S. Miller (davem@caip.rutgers.edu) and - Jakub Jelinek (jj@ultra.linux.cz). - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <features.h> -#include <asm/asi.h> -#ifndef XCC -#define XCC xcc -#define USE_BPR -#endif -#define FPRS_FEF 4 - -#define SET_BLOCKS(base, offset, source) \ - stx source, [base - offset - 0x18]; \ - stx source, [base - offset - 0x10]; \ - stx source, [base - offset - 0x08]; \ - stx source, [base - offset - 0x00]; - - /* Well, memset is a lot easier to get right than bcopy... 
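As the memset body below shows, the fill byte is first replicated across all eight byte lanes of a 64-bit register (the sllx/or pairs at label 4) before the routine switches to word, doubleword and block stores. A minimal C sketch of that replication step, purely illustrative and not part of uClibc:

#include <stdint.h>

/* Splat one byte into every byte lane of a 64-bit word, mirroring
 * the sllx/or sequence near the top of the assembly memset. */
static uint64_t splat_byte(unsigned char c)
{
    uint64_t v = c;
    v |= v << 8;    /* ..cc       -> ..cccc         */
    v |= v << 16;   /* ..cccc     -> ..cccccccc     */
    v |= v << 32;   /* ..cccccccc -> cccccccccccccccc */
    return v;
}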
*/ - .text - .align 32 -ENTRY(memset) - andcc %o1, 0xff, %o1 - mov %o0, %o5 - be,a,pt %icc, 50f -#ifndef USE_BPR - srl %o2, 0, %o1 -#else - mov %o2, %o1 -#endif - cmp %o2, 7 -#ifndef USE_BPR - srl %o2, 0, %o2 -#endif - bleu,pn %XCC, 17f - andcc %o0, 3, %g5 - be,pt %xcc, 4f - and %o1, 0xff, %o1 - cmp %g5, 3 - be,pn %xcc, 2f - stb %o1, [%o0 + 0x00] - cmp %g5, 2 - be,pt %xcc, 2f - stb %o1, [%o0 + 0x01] - stb %o1, [%o0 + 0x02] -2: sub %g5, 4, %g5 - sub %o0, %g5, %o0 - add %o2, %g5, %o2 -4: sllx %o1, 8, %g1 - andcc %o0, 4, %g0 - or %o1, %g1, %o1 - sllx %o1, 16, %g1 - or %o1, %g1, %o1 - be,pt %xcc, 2f - sllx %o1, 32, %g1 - stw %o1, [%o0] - sub %o2, 4, %o2 - add %o0, 4, %o0 -2: cmp %o2, 128 - or %o1, %g1, %o1 - blu,pn %xcc, 9f - andcc %o0, 0x38, %g5 - be,pn %icc, 6f - mov 64, %o4 - andcc %o0, 8, %g0 - be,pn %icc, 1f - sub %o4, %g5, %o4 - stx %o1, [%o0] - add %o0, 8, %o0 -1: andcc %o4, 16, %g0 - be,pn %icc, 1f - sub %o2, %o4, %o2 - stx %o1, [%o0] - stx %o1, [%o0 + 8] - add %o0, 16, %o0 -1: andcc %o4, 32, %g0 - be,pn %icc, 7f - andncc %o2, 0x3f, %o3 - stw %o1, [%o0] - stw %o1, [%o0 + 4] - stw %o1, [%o0 + 8] - stw %o1, [%o0 + 12] - stw %o1, [%o0 + 16] - stw %o1, [%o0 + 20] - stw %o1, [%o0 + 24] - stw %o1, [%o0 + 28] - add %o0, 32, %o0 -7: be,pn %xcc, 9f - nop - ldd [%o0 - 8], %f0 -18: wr %g0, ASI_BLK_P, %asi - membar #StoreStore | #LoadStore - andcc %o3, 0xc0, %g5 - and %o2, 0x3f, %o2 - fmovd %f0, %f2 - fmovd %f0, %f4 - andn %o3, 0xff, %o3 - fmovd %f0, %f6 - cmp %g5, 64 - fmovd %f0, %f8 - fmovd %f0, %f10 - fmovd %f0, %f12 - brz,pn %g5, 10f - fmovd %f0, %f14 - be,pn %icc, 2f - stda %f0, [%o0 + 0x00] %asi - cmp %g5, 128 - be,pn %icc, 2f - stda %f0, [%o0 + 0x40] %asi - stda %f0, [%o0 + 0x80] %asi -2: brz,pn %o3, 12f - add %o0, %g5, %o0 -10: stda %f0, [%o0 + 0x00] %asi - stda %f0, [%o0 + 0x40] %asi - stda %f0, [%o0 + 0x80] %asi - stda %f0, [%o0 + 0xc0] %asi -11: subcc %o3, 256, %o3 - bne,pt %xcc, 10b - add %o0, 256, %o0 -12: wr %g0, FPRS_FEF, %fprs - membar #StoreLoad | #StoreStore -9: andcc %o2, 0x78, %g5 - be,pn %xcc, 13f - andcc %o2, 7, %o2 -14: rd %pc, %o4 - srl %g5, 1, %o3 - sub %o4, %o3, %o4 - jmpl %o4 + (13f - 14b), %g0 - add %o0, %g5, %o0 -12: SET_BLOCKS (%o0, 0x68, %o1) - SET_BLOCKS (%o0, 0x48, %o1) - SET_BLOCKS (%o0, 0x28, %o1) - SET_BLOCKS (%o0, 0x08, %o1) -13: be,pn %xcc, 8f - andcc %o2, 4, %g0 - be,pn %xcc, 1f - andcc %o2, 2, %g0 - stw %o1, [%o0] - add %o0, 4, %o0 -1: be,pn %xcc, 1f - andcc %o2, 1, %g0 - sth %o1, [%o0] - add %o0, 2, %o0 -1: bne,a,pn %xcc, 8f - stb %o1, [%o0] -8: retl - mov %o5, %o0 -17: brz,pn %o2, 0f -8: add %o0, 1, %o0 - subcc %o2, 1, %o2 - bne,pt %xcc, 8b - stb %o1, [%o0 - 1] -0: retl - mov %o5, %o0 - -6: stx %o1, [%o0] - andncc %o2, 0x3f, %o3 - be,pn %xcc, 9b - nop - ba,pt %xcc, 18b - ldd [%o0], %f0 -END(memset) -libc_hidden_def(memset) - -#define ZERO_BLOCKS(base, offset, source) \ - stx source, [base - offset - 0x38]; \ - stx source, [base - offset - 0x30]; \ - stx source, [base - offset - 0x28]; \ - stx source, [base - offset - 0x20]; \ - stx source, [base - offset - 0x18]; \ - stx source, [base - offset - 0x10]; \ - stx source, [base - offset - 0x08]; \ - stx source, [base - offset - 0x00]; - - .text - .align 32 -#ifdef __UCLIBC_SUSV3_LEGACY__ -ENTRY(bzero) -#ifndef USE_BPR - srl %o1, 0, %o1 -#endif - mov %o0, %o5 -#endif -50: cmp %o1, 7 - bleu,pn %xcc, 17f - andcc %o0, 3, %o2 - be,a,pt %xcc, 4f - andcc %o0, 4, %g0 - cmp %o2, 3 - be,pn %xcc, 2f - stb %g0, [%o0 + 0x00] - cmp %o2, 2 - be,pt %xcc, 2f - stb %g0, [%o0 + 0x01] - stb %g0, [%o0 + 0x02] -2: sub %o2, 4, %o2 - 
sub %o0, %o2, %o0 - add %o1, %o2, %o1 - andcc %o0, 4, %g0 -4: be,pt %xcc, 2f - cmp %o1, 128 - stw %g0, [%o0] - sub %o1, 4, %o1 - add %o0, 4, %o0 -2: blu,pn %xcc, 9f - andcc %o0, 0x38, %o2 - be,pn %icc, 6f - mov 64, %o4 - andcc %o0, 8, %g0 - be,pn %icc, 1f - sub %o4, %o2, %o4 - stx %g0, [%o0] - add %o0, 8, %o0 -1: andcc %o4, 16, %g0 - be,pn %icc, 1f - sub %o1, %o4, %o1 - stx %g0, [%o0] - stx %g0, [%o0 + 8] - add %o0, 16, %o0 -1: andcc %o4, 32, %g0 - be,pn %icc, 7f - andncc %o1, 0x3f, %o3 - stx %g0, [%o0] - stx %g0, [%o0 + 8] - stx %g0, [%o0 + 16] - stx %g0, [%o0 + 24] - add %o0, 32, %o0 -6: andncc %o1, 0x3f, %o3 -7: be,pn %xcc, 9f - wr %g0, ASI_BLK_P, %asi - membar #StoreLoad | #StoreStore | #LoadStore - fzero %f0 - andcc %o3, 0xc0, %o2 - and %o1, 0x3f, %o1 - fzero %f2 - andn %o3, 0xff, %o3 - faddd %f0, %f2, %f4 - fmuld %f0, %f2, %f6 - cmp %o2, 64 - faddd %f0, %f2, %f8 - fmuld %f0, %f2, %f10 - faddd %f0, %f2, %f12 - brz,pn %o2, 10f - fmuld %f0, %f2, %f14 - be,pn %icc, 2f - stda %f0, [%o0 + 0x00] %asi - cmp %o2, 128 - be,pn %icc, 2f - stda %f0, [%o0 + 0x40] %asi - stda %f0, [%o0 + 0x80] %asi -2: brz,pn %o3, 12f - add %o0, %o2, %o0 -10: stda %f0, [%o0 + 0x00] %asi - stda %f0, [%o0 + 0x40] %asi - stda %f0, [%o0 + 0x80] %asi - stda %f0, [%o0 + 0xc0] %asi -11: subcc %o3, 256, %o3 - bne,pt %xcc, 10b - add %o0, 256, %o0 -12: wr %g0, FPRS_FEF, %fprs - membar #StoreLoad | #StoreStore -9: andcc %o1, 0xf8, %o2 - be,pn %xcc, 13f - andcc %o1, 7, %o1 -14: rd %pc, %o4 - srl %o2, 1, %o3 - sub %o4, %o3, %o4 - jmpl %o4 + (13f - 14b), %g0 - add %o0, %o2, %o0 -12: ZERO_BLOCKS (%o0, 0xc8, %g0) - ZERO_BLOCKS (%o0, 0x88, %g0) - ZERO_BLOCKS (%o0, 0x48, %g0) - ZERO_BLOCKS (%o0, 0x08, %g0) -13: be,pn %xcc, 8f - andcc %o1, 4, %g0 - be,pn %xcc, 1f - andcc %o1, 2, %g0 - stw %g0, [%o0] - add %o0, 4, %o0 -1: be,pn %xcc, 1f - andcc %o1, 1, %g0 - sth %g0, [%o0] - add %o0, 2, %o0 -1: bne,a,pn %xcc, 8f - stb %g0, [%o0] -8: retl - mov %o5, %o0 -17: be,pn %xcc, 13b - orcc %o1, 0, %g0 - be,pn %xcc, 0f -8: add %o0, 1, %o0 - subcc %o1, 1, %o1 - bne,pt %xcc, 8b - stb %g0, [%o0 - 1] -0: retl - mov %o5, %o0 -#ifdef __UCLIBC_SUSV3_LEGACY__ -END(bzero) -#endif diff --git a/libc/string/sparc/sparc64/sparcv9b/memcpy.S b/libc/string/sparc/sparc64/sparcv9b/memcpy.S deleted file mode 100644 index 64f6a92e0..000000000 --- a/libc/string/sparc/sparc64/sparcv9b/memcpy.S +++ /dev/null @@ -1,612 +0,0 @@ -/* Copy SIZE bytes from SRC to DEST. - For UltraSPARC-III. - Copyright (C) 2001, 2003 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by David S. Miller (davem@redhat.com) - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. 
*/ - -#include <features.h> - -#define ASI_BLK_P 0xf0 -#define FPRS_FEF 0x04 -#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs -#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs - -#ifndef XCC -#define USE_BPR -#define XCC xcc -#endif - - .register %g2,#scratch - .register %g3,#scratch - .register %g6,#scratch - - .text - .align 32 - -#ifdef __UCLIBC_SUSV3_LEGACY__ -ENTRY(bcopy) - sub %o1, %o0, %o4 - mov %o0, %g4 - cmp %o4, %o2 - mov %o1, %o0 - bgeu,pt %XCC, 100f - mov %g4, %o1 -#ifndef USE_BPR - srl %o2, 0, %o2 -#endif - brnz,pn %o2, 220f - add %o0, %o2, %o0 - retl - nop -END(bcopy) -#endif - - /* Special/non-trivial issues of this code: - * - * 1) %o5 is preserved from VISEntryHalf to VISExitHalf - * 2) Only low 32 FPU registers are used so that only the - * lower half of the FPU register set is dirtied by this - * code. This is especially important in the kernel. - * 3) This code never prefetches cachelines past the end - * of the source buffer. - * - * The cheetah's flexible spine, oversized liver, enlarged heart, - * slender muscular body, and claws make it the swiftest hunter - * in Africa and the fastest animal on land. Can reach speeds - * of up to 2.4GB per second. - */ - .align 32 -ENTRY(memcpy) - -100: /* %o0=dst, %o1=src, %o2=len */ - mov %o0, %g5 - cmp %o2, 0 - be,pn %XCC, out -218: or %o0, %o1, %o3 - cmp %o2, 16 - bleu,a,pn %XCC, small_copy - or %o3, %o2, %o3 - - cmp %o2, 256 - blu,pt %XCC, medium_copy - andcc %o3, 0x7, %g0 - - ba,pt %xcc, enter - andcc %o0, 0x3f, %g2 - - /* Here len >= 256 and condition codes reflect execution - * of "andcc %o0, 0x7, %g2", done by caller. - */ - .align 64 -enter: - /* Is 'dst' already aligned on an 64-byte boundary? */ - be,pt %XCC, 2f - - /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number - * of bytes to copy to make 'dst' 64-byte aligned. We pre- - * subtract this from 'len'. - */ - sub %g2, 0x40, %g2 - sub %g0, %g2, %g2 - sub %o2, %g2, %o2 - - /* Copy %g2 bytes from src to dst, one byte at a time. 
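The enter path above computes abs((dst & 0x3f) - 0x40) -- the number of bytes needed to make dst 64-byte aligned -- pre-subtracts it from len, and then copies exactly that many head bytes one at a time. A small C sketch of the same computation (names are illustrative, not taken from the source):

#include <stddef.h>
#include <stdint.h>

/* Number of head bytes to copy so that dst lands on a 64-byte
 * boundary; 0 if it is already aligned, matching the asm's early
 * branch around the fix-up. */
static size_t head_bytes_to_align64(const void *dst)
{
    uintptr_t mis = (uintptr_t)dst & 0x3f;
    return mis ? 0x40 - mis : 0;
}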
*/ -1: ldub [%o1 + 0x00], %o3 - add %o1, 0x1, %o1 - add %o0, 0x1, %o0 - subcc %g2, 0x1, %g2 - - bg,pt %XCC, 1b - stb %o3, [%o0 + -1] - -2: VISEntryHalf - and %o1, 0x7, %g1 - ba,pt %xcc, begin - alignaddr %o1, %g0, %o1 - - .align 64 -begin: - prefetch [%o1 + 0x000], #one_read - prefetch [%o1 + 0x040], #one_read - andn %o2, (0x40 - 1), %o4 - prefetch [%o1 + 0x080], #one_read - prefetch [%o1 + 0x0c0], #one_read - ldd [%o1 + 0x000], %f0 - prefetch [%o1 + 0x100], #one_read - ldd [%o1 + 0x008], %f2 - prefetch [%o1 + 0x140], #one_read - ldd [%o1 + 0x010], %f4 - prefetch [%o1 + 0x180], #one_read - faligndata %f0, %f2, %f16 - ldd [%o1 + 0x018], %f6 - faligndata %f2, %f4, %f18 - ldd [%o1 + 0x020], %f8 - faligndata %f4, %f6, %f20 - ldd [%o1 + 0x028], %f10 - faligndata %f6, %f8, %f22 - - ldd [%o1 + 0x030], %f12 - faligndata %f8, %f10, %f24 - ldd [%o1 + 0x038], %f14 - faligndata %f10, %f12, %f26 - ldd [%o1 + 0x040], %f0 - - sub %o4, 0x80, %o4 - add %o1, 0x40, %o1 - ba,pt %xcc, loop - srl %o4, 6, %o3 - - .align 64 -loop: - ldd [%o1 + 0x008], %f2 - faligndata %f12, %f14, %f28 - ldd [%o1 + 0x010], %f4 - faligndata %f14, %f0, %f30 - stda %f16, [%o0] ASI_BLK_P - ldd [%o1 + 0x018], %f6 - faligndata %f0, %f2, %f16 - - ldd [%o1 + 0x020], %f8 - faligndata %f2, %f4, %f18 - ldd [%o1 + 0x028], %f10 - faligndata %f4, %f6, %f20 - ldd [%o1 + 0x030], %f12 - faligndata %f6, %f8, %f22 - ldd [%o1 + 0x038], %f14 - faligndata %f8, %f10, %f24 - - ldd [%o1 + 0x040], %f0 - prefetch [%o1 + 0x180], #one_read - faligndata %f10, %f12, %f26 - subcc %o3, 0x01, %o3 - add %o1, 0x40, %o1 - bg,pt %XCC, loop - add %o0, 0x40, %o0 - - /* Finally we copy the last full 64-byte block. */ -loopfini: - ldd [%o1 + 0x008], %f2 - faligndata %f12, %f14, %f28 - ldd [%o1 + 0x010], %f4 - faligndata %f14, %f0, %f30 - stda %f16, [%o0] ASI_BLK_P - ldd [%o1 + 0x018], %f6 - faligndata %f0, %f2, %f16 - ldd [%o1 + 0x020], %f8 - faligndata %f2, %f4, %f18 - ldd [%o1 + 0x028], %f10 - faligndata %f4, %f6, %f20 - ldd [%o1 + 0x030], %f12 - faligndata %f6, %f8, %f22 - ldd [%o1 + 0x038], %f14 - faligndata %f8, %f10, %f24 - cmp %g1, 0 - be,pt %XCC, 1f - add %o0, 0x40, %o0 - ldd [%o1 + 0x040], %f0 -1: faligndata %f10, %f12, %f26 - faligndata %f12, %f14, %f28 - faligndata %f14, %f0, %f30 - stda %f16, [%o0] ASI_BLK_P - add %o0, 0x40, %o0 - add %o1, 0x40, %o1 - membar #Sync - - /* Now we copy the (len modulo 64) bytes at the end. - * Note how we borrow the %f0 loaded above. - * - * Also notice how this code is careful not to perform a - * load past the end of the src buffer. - */ -loopend: - and %o2, 0x3f, %o2 - andcc %o2, 0x38, %g2 - be,pn %XCC, endcruft - subcc %g2, 0x8, %g2 - be,pn %XCC, endcruft - cmp %g1, 0 - - be,a,pt %XCC, 1f - ldd [%o1 + 0x00], %f0 - -1: ldd [%o1 + 0x08], %f2 - add %o1, 0x8, %o1 - sub %o2, 0x8, %o2 - subcc %g2, 0x8, %g2 - faligndata %f0, %f2, %f8 - std %f8, [%o0 + 0x00] - be,pn %XCC, endcruft - add %o0, 0x8, %o0 - ldd [%o1 + 0x08], %f0 - add %o1, 0x8, %o1 - sub %o2, 0x8, %o2 - subcc %g2, 0x8, %g2 - faligndata %f2, %f0, %f8 - std %f8, [%o0 + 0x00] - bne,pn %XCC, 1b - add %o0, 0x8, %o0 - - /* If anything is left, we copy it one byte at a time. - * Note that %g1 is (src & 0x3) saved above before the - * alignaddr was performed. 
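Once the block loop is done, the endcruft path below finishes the residue by testing the individual size bits of what is left and moving 8, 4, 2 and finally 1 byte at a time. Roughly, in C (a sketch of the control flow only, not of the actual register usage):

#include <stddef.h>
#include <string.h>

/* Copy a residue of fewer than 16 bytes by testing each power-of-two
 * bit of the remaining length exactly once, as the endcruft labels do. */
static void copy_tail(unsigned char *dst, const unsigned char *src, size_t len)
{
    if (len & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
    if (len & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
    if (len & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
    if (len & 1) { *dst = *src; }
}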
- */ -endcruft: - cmp %o2, 0 - add %o1, %g1, %o1 - VISExitHalf - be,pn %XCC, out - sub %o0, %o1, %o3 - - andcc %g1, 0x7, %g0 - bne,pn %icc, small_copy_unaligned - andcc %o2, 0x8, %g0 - be,pt %icc, 1f - nop - ldx [%o1], %o5 - stx %o5, [%o1 + %o3] - add %o1, 0x8, %o1 - -1: andcc %o2, 0x4, %g0 - be,pt %icc, 1f - nop - lduw [%o1], %o5 - stw %o5, [%o1 + %o3] - add %o1, 0x4, %o1 - -1: andcc %o2, 0x2, %g0 - be,pt %icc, 1f - nop - lduh [%o1], %o5 - sth %o5, [%o1 + %o3] - add %o1, 0x2, %o1 - -1: andcc %o2, 0x1, %g0 - be,pt %icc, out - nop - ldub [%o1], %o5 - ba,pt %xcc, out - stb %o5, [%o1 + %o3] - -medium_copy: /* 16 < len <= 64 */ - bne,pn %XCC, small_copy_unaligned - sub %o0, %o1, %o3 - -medium_copy_aligned: - andn %o2, 0x7, %o4 - and %o2, 0x7, %o2 -1: subcc %o4, 0x8, %o4 - ldx [%o1], %o5 - stx %o5, [%o1 + %o3] - bgu,pt %XCC, 1b - add %o1, 0x8, %o1 - andcc %o2, 0x4, %g0 - be,pt %XCC, 1f - nop - sub %o2, 0x4, %o2 - lduw [%o1], %o5 - stw %o5, [%o1 + %o3] - add %o1, 0x4, %o1 -1: cmp %o2, 0 - be,pt %XCC, out - nop - ba,pt %xcc, small_copy_unaligned - nop - -small_copy: /* 0 < len <= 16 */ - andcc %o3, 0x3, %g0 - bne,pn %XCC, small_copy_unaligned - sub %o0, %o1, %o3 - -small_copy_aligned: - subcc %o2, 4, %o2 - lduw [%o1], %g1 - stw %g1, [%o1 + %o3] - bgu,pt %XCC, small_copy_aligned - add %o1, 4, %o1 - -out: retl - mov %g5, %o0 - - .align 32 -small_copy_unaligned: - subcc %o2, 1, %o2 - ldub [%o1], %g1 - stb %g1, [%o1 + %o3] - bgu,pt %XCC, small_copy_unaligned - add %o1, 1, %o1 - retl - mov %g5, %o0 - -END(memcpy) -libc_hidden_def(memcpy) - -#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3) \ - ldx [%src - offset - 0x20], %t0; \ - ldx [%src - offset - 0x18], %t1; \ - ldx [%src - offset - 0x10], %t2; \ - ldx [%src - offset - 0x08], %t3; \ - stw %t0, [%dst - offset - 0x1c]; \ - srlx %t0, 32, %t0; \ - stw %t0, [%dst - offset - 0x20]; \ - stw %t1, [%dst - offset - 0x14]; \ - srlx %t1, 32, %t1; \ - stw %t1, [%dst - offset - 0x18]; \ - stw %t2, [%dst - offset - 0x0c]; \ - srlx %t2, 32, %t2; \ - stw %t2, [%dst - offset - 0x10]; \ - stw %t3, [%dst - offset - 0x04]; \ - srlx %t3, 32, %t3; \ - stw %t3, [%dst - offset - 0x08]; - -#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \ - ldx [%src - offset - 0x20], %t0; \ - ldx [%src - offset - 0x18], %t1; \ - ldx [%src - offset - 0x10], %t2; \ - ldx [%src - offset - 0x08], %t3; \ - stx %t0, [%dst - offset - 0x20]; \ - stx %t1, [%dst - offset - 0x18]; \ - stx %t2, [%dst - offset - 0x10]; \ - stx %t3, [%dst - offset - 0x08]; \ - ldx [%src - offset - 0x40], %t0; \ - ldx [%src - offset - 0x38], %t1; \ - ldx [%src - offset - 0x30], %t2; \ - ldx [%src - offset - 0x28], %t3; \ - stx %t0, [%dst - offset - 0x40]; \ - stx %t1, [%dst - offset - 0x38]; \ - stx %t2, [%dst - offset - 0x30]; \ - stx %t3, [%dst - offset - 0x28]; - -#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \ - ldx [%src + offset + 0x00], %t0; \ - ldx [%src + offset + 0x08], %t1; \ - stw %t0, [%dst + offset + 0x04]; \ - srlx %t0, 32, %t2; \ - stw %t2, [%dst + offset + 0x00]; \ - stw %t1, [%dst + offset + 0x0c]; \ - srlx %t1, 32, %t3; \ - stw %t3, [%dst + offset + 0x08]; - -#define RMOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1) \ - ldx [%src + offset + 0x00], %t0; \ - ldx [%src + offset + 0x08], %t1; \ - stx %t0, [%dst + offset + 0x00]; \ - stx %t1, [%dst + offset + 0x08]; - - .align 32 -228: andcc %o2, 1, %g0 /* IEU1 Group */ - be,pt %icc, 2f+4 /* CTI */ -1: ldub [%o1 - 1], %o5 /* LOAD Group */ - sub %o1, 1, %o1 /* IEU0 */ - sub %o0, 1, %o0 /* IEU1 */ - subcc %o2, 1, %o2 /* IEU1 
Group */ - be,pn %xcc, 229f /* CTI */ - stb %o5, [%o0] /* Store */ -2: ldub [%o1 - 1], %o5 /* LOAD Group */ - sub %o0, 2, %o0 /* IEU0 */ - ldub [%o1 - 2], %g5 /* LOAD Group */ - sub %o1, 2, %o1 /* IEU0 */ - subcc %o2, 2, %o2 /* IEU1 Group */ - stb %o5, [%o0 + 1] /* Store */ - bne,pt %xcc, 2b /* CTI */ - stb %g5, [%o0] /* Store */ -229: retl - mov %g4, %o0 - - .align 32 -ENTRY(memmove) - mov %o0, %g5 -#ifndef USE_BPR - srl %o2, 0, %o2 /* IEU1 Group */ -#endif - brz,pn %o2, out /* CTI Group */ - sub %o0, %o1, %o4 /* IEU0 */ - cmp %o4, %o2 /* IEU1 Group */ - bgeu,pt %XCC, 218b /* CTI */ - mov %o0, %g4 /* IEU0 */ - add %o0, %o2, %o0 /* IEU0 Group */ -220: add %o1, %o2, %o1 /* IEU1 */ - cmp %o2, 15 /* IEU1 Group */ - bleu,pn %xcc, 228b /* CTI */ - andcc %o0, 7, %g2 /* IEU1 Group */ - sub %o0, %o1, %g5 /* IEU0 */ - andcc %g5, 3, %o5 /* IEU1 Group */ - bne,pn %xcc, 232f /* CTI */ - andcc %o1, 3, %g0 /* IEU1 Group */ - be,a,pt %xcc, 236f /* CTI */ - andcc %o1, 4, %g0 /* IEU1 Group */ - andcc %o1, 1, %g0 /* IEU1 Group */ - be,pn %xcc, 4f /* CTI */ - andcc %o1, 2, %g0 /* IEU1 Group */ - ldub [%o1 - 1], %g2 /* Load Group */ - sub %o1, 1, %o1 /* IEU0 */ - sub %o0, 1, %o0 /* IEU1 */ - sub %o2, 1, %o2 /* IEU0 Group */ - be,pn %xcc, 5f /* CTI Group */ - stb %g2, [%o0] /* Store */ -4: lduh [%o1 - 2], %g2 /* Load Group */ - sub %o1, 2, %o1 /* IEU0 */ - sub %o0, 2, %o0 /* IEU1 */ - sub %o2, 2, %o2 /* IEU0 */ - sth %g2, [%o0] /* Store Group + bubble */ -5: andcc %o1, 4, %g0 /* IEU1 */ -236: be,a,pn %xcc, 2f /* CTI */ - andcc %o2, -128, %g6 /* IEU1 Group */ - lduw [%o1 - 4], %g5 /* Load Group */ - sub %o1, 4, %o1 /* IEU0 */ - sub %o0, 4, %o0 /* IEU1 */ - sub %o2, 4, %o2 /* IEU0 Group */ - stw %g5, [%o0] /* Store */ - andcc %o2, -128, %g6 /* IEU1 Group */ -2: be,pn %xcc, 235f /* CTI */ - andcc %o0, 4, %g0 /* IEU1 Group */ - be,pn %xcc, 282f + 4 /* CTI Group */ -5: RMOVE_BIGCHUNK(o1, o0, 0x00, g1, g3, g5, o5) - RMOVE_BIGCHUNK(o1, o0, 0x20, g1, g3, g5, o5) - RMOVE_BIGCHUNK(o1, o0, 0x40, g1, g3, g5, o5) - RMOVE_BIGCHUNK(o1, o0, 0x60, g1, g3, g5, o5) - subcc %g6, 128, %g6 /* IEU1 Group */ - sub %o1, 128, %o1 /* IEU0 */ - bne,pt %xcc, 5b /* CTI */ - sub %o0, 128, %o0 /* IEU0 Group */ -235: andcc %o2, 0x70, %g6 /* IEU1 Group */ -41: be,pn %xcc, 280f /* CTI */ - andcc %o2, 8, %g0 /* IEU1 Group */ - /* Clk1 8-( */ - /* Clk2 8-( */ - /* Clk3 8-( */ - /* Clk4 8-( */ -279: rd %pc, %o5 /* PDU Group */ - sll %g6, 1, %g5 /* IEU0 Group */ - sub %o1, %g6, %o1 /* IEU1 */ - sub %o5, %g5, %o5 /* IEU0 Group */ - jmpl %o5 + %lo(280f - 279b), %g0 /* CTI Group brk forced*/ - sub %o0, %g6, %o0 /* IEU0 Group */ - RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g5, o5) - RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g5, o5) -280: be,pt %xcc, 281f /* CTI */ - andcc %o2, 4, %g0 /* IEU1 */ - ldx [%o1 - 8], %g2 /* Load Group */ - sub %o0, 8, %o0 /* IEU0 */ - stw %g2, [%o0 + 4] /* Store Group */ - sub %o1, 8, %o1 /* IEU1 */ - srlx %g2, 32, %g2 /* IEU0 Group */ - stw %g2, [%o0] /* Store */ -281: be,pt %xcc, 1f /* CTI */ - andcc %o2, 2, %g0 /* IEU1 Group */ - lduw [%o1 - 4], %g2 /* Load Group */ - sub %o1, 4, %o1 /* IEU0 */ - stw %g2, [%o0 - 4] /* Store Group */ - sub %o0, 4, %o0 /* IEU0 */ -1: be,pt %xcc, 1f /* CTI */ - andcc %o2, 1, %g0 /* IEU1 Group */ - lduh [%o1 - 2], %g2 /* Load Group */ - sub %o1, 2, %o1 /* IEU0 */ - 
sth %g2, [%o0 - 2] /* Store Group */ - sub %o0, 2, %o0 /* IEU0 */ -1: be,pt %xcc, 211f /* CTI */ - nop /* IEU1 */ - ldub [%o1 - 1], %g2 /* Load Group */ - stb %g2, [%o0 - 1] /* Store Group + bubble */ -211: retl - mov %g4, %o0 - -282: RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, g1, g3, g5, o5) - RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, g1, g3, g5, o5) - subcc %g6, 128, %g6 /* IEU1 Group */ - sub %o1, 128, %o1 /* IEU0 */ - bne,pt %xcc, 282b /* CTI */ - sub %o0, 128, %o0 /* IEU0 Group */ - andcc %o2, 0x70, %g6 /* IEU1 */ - be,pn %xcc, 284f /* CTI */ - andcc %o2, 8, %g0 /* IEU1 Group */ - /* Clk1 8-( */ - /* Clk2 8-( */ - /* Clk3 8-( */ - /* Clk4 8-( */ -283: rd %pc, %o5 /* PDU Group */ - sub %o1, %g6, %o1 /* IEU0 Group */ - sub %o5, %g6, %o5 /* IEU1 */ - jmpl %o5 + %lo(284f - 283b), %g0 /* CTI Group brk forced*/ - sub %o0, %g6, %o0 /* IEU0 Group */ - RMOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3) - RMOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3) -284: be,pt %xcc, 285f /* CTI Group */ - andcc %o2, 4, %g0 /* IEU1 */ - ldx [%o1 - 8], %g2 /* Load Group */ - sub %o0, 8, %o0 /* IEU0 */ - sub %o1, 8, %o1 /* IEU0 Group */ - stx %g2, [%o0] /* Store */ -285: be,pt %xcc, 1f /* CTI */ - andcc %o2, 2, %g0 /* IEU1 Group */ - lduw [%o1 - 4], %g2 /* Load Group */ - sub %o0, 4, %o0 /* IEU0 */ - sub %o1, 4, %o1 /* IEU0 Group */ - stw %g2, [%o0] /* Store */ -1: be,pt %xcc, 1f /* CTI */ - andcc %o2, 1, %g0 /* IEU1 Group */ - lduh [%o1 - 2], %g2 /* Load Group */ - sub %o0, 2, %o0 /* IEU0 */ - sub %o1, 2, %o1 /* IEU0 Group */ - sth %g2, [%o0] /* Store */ -1: be,pt %xcc, 1f /* CTI */ - nop /* IEU0 Group */ - ldub [%o1 - 1], %g2 /* Load Group */ - stb %g2, [%o0 - 1] /* Store Group + bubble */ -1: retl - mov %g4, %o0 - -232: brz,pt %g2, 2f /* CTI Group */ - sub %o2, %g2, %o2 /* IEU0 Group */ -1: ldub [%o1 - 1], %g5 /* Load Group */ - sub %o1, 1, %o1 /* IEU0 */ - sub %o0, 1, %o0 /* IEU1 */ - subcc %g2, 1, %g2 /* IEU1 Group */ - bne,pt %xcc, 1b /* CTI */ - stb %g5, [%o0] /* Store */ -2: andn %o2, 7, %g5 /* IEU0 Group */ - and %o2, 7, %o2 /* IEU1 */ - fmovd %f0, %f2 /* FPU */ - alignaddr %o1, %g0, %g1 /* GRU Group */ - ldd [%g1], %f4 /* Load Group */ -1: ldd [%g1 - 8], %f6 /* Load Group */ - sub %g1, 8, %g1 /* IEU0 Group */ - subcc %g5, 8, %g5 /* IEU1 */ - faligndata %f6, %f4, %f0 /* GRU Group */ - std %f0, [%o0 - 8] /* Store */ - sub %o1, 8, %o1 /* IEU0 Group */ - be,pn %xcc, 233f /* CTI */ - sub %o0, 8, %o0 /* IEU1 */ - ldd [%g1 - 8], %f4 /* Load Group */ - sub %g1, 8, %g1 /* IEU0 */ - subcc %g5, 8, %g5 /* IEU1 */ - faligndata %f4, %f6, %f0 /* GRU Group */ - std %f0, [%o0 - 8] /* Store */ - sub %o1, 8, %o1 /* IEU0 */ - bne,pn %xcc, 1b /* CTI Group */ - sub %o0, 8, %o0 /* IEU0 */ -233: brz,pn %o2, 234f /* CTI Group */ - nop /* IEU0 */ -237: ldub [%o1 - 1], %g5 /* LOAD */ - sub %o1, 1, %o1 /* IEU0 */ - sub %o0, 1, %o0 /* IEU1 */ - subcc %o2, 1, %o2 /* IEU1 */ - bne,pt %xcc, 237b /* CTI */ - stb %g5, [%o0] /* Store Group */ -234: wr %g0, FPRS_FEF, %fprs - retl - mov %g4, %o0 -END(memmove) -libc_hidden_def(memmove) - -#ifdef USE_BPR -weak_alias(memcpy,__align_cpy_1) -weak_alias(memcpy,__align_cpy_2) -weak_alias(memcpy,__align_cpy_4) -weak_alias(memcpy,__align_cpy_8) -weak_alias(memcpy,__align_cpy_16) -#endif diff --git a/libc/string/sparc/sparc64/stpcpy.S b/libc/string/sparc/sparc64/stpcpy.S deleted file mode 100644 index 
8c26c6bec..000000000 --- a/libc/string/sparc/sparc64/stpcpy.S +++ /dev/null @@ -1,271 +0,0 @@ -/* Copy SRC to DEST returning the address of the terminating '\0' in DEST. - For SPARC v9. - Copyright (C) 1998, 1999, 2002, 2003, 2004 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz> and - Jakub Jelinek <jj@ultra.linux.cz>. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <asm/asi.h> -#ifndef XCC - .register %g2, #scratch - .register %g3, #scratch - .register %g6, #scratch -#endif - - /* Normally, this uses - ((xword - 0x0101010101010101) & 0x8080808080808080) test - to find out if any byte in xword could be zero. This is fast, but - also gives false alarm for any byte in range 0x81-0xff. It does - not matter for correctness, as if this test tells us there could - be some zero byte, we check it byte by byte, but if bytes with - high bits set are common in the strings, then this will give poor - performance. You can #define EIGHTBIT_NOT_RARE and the algorithm - will use one tick slower, but more precise test - ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080), - which does not give any false alarms (but if some bits are set, - one cannot assume from it which bytes are zero and which are not). - It is yet to be measured, what is the correct default for glibc - in these days for an average user. 
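The test described above is the classic zero-byte probe. A minimal C sketch of both forms mentioned in the comment -- the fast one, which may fire for bytes in the 0x81-0xff range, and the EIGHTBIT_NOT_RARE variant, which never reports a false positive -- purely illustrative and not part of the library:

#include <stdint.h>

#define ONES  0x0101010101010101ULL
#define HIGHS 0x8080808080808080ULL

/* Fast test: a nonzero result means some byte of x MAY be zero
 * (false positives are possible for bytes in 0x81..0xff). */
static uint64_t may_have_zero_byte(uint64_t x)
{
    return (x - ONES) & HIGHS;
}

/* Precise variant (EIGHTBIT_NOT_RARE): nonzero iff some byte of x
 * really is zero, at the cost of one extra operation. */
static uint64_t has_zero_byte(uint64_t x)
{
    return (x - ONES) & ~x & HIGHS;
}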
- */ - - .text - .align 32 -ENTRY(stpcpy) - sethi %hi(0x01010101), %g1 /* IEU0 Group */ - or %g1, %lo(0x01010101), %g1 /* IEU0 Group */ - andcc %o0, 7, %g0 /* IEU1 */ - sllx %g1, 32, %g2 /* IEU0 Group */ - - bne,pn %icc, 12f /* CTI */ - andcc %o1, 7, %g3 /* IEU1 */ - or %g1, %g2, %g1 /* IEU0 Group */ - bne,pn %icc, 14f /* CTI */ - - sllx %g1, 7, %g2 /* IEU0 Group */ -1: ldx [%o1], %o3 /* Load */ - add %o1, 8, %o1 /* IEU1 */ -2: mov %o3, %g3 /* IEU0 Group */ - - sub %o3, %g1, %o2 /* IEU1 */ -3: ldxa [%o1] ASI_PNF, %o3 /* Load */ -#ifdef EIGHTBIT_NOT_RARE - andn %o2, %g3, %o2 /* IEU0 Group */ -#endif - add %o0, 8, %o0 /* IEU0 Group */ - andcc %o2, %g2, %g0 /* IEU1 */ - - add %o1, 8, %o1 /* IEU0 Group */ - be,a,pt %xcc, 2b /* CTI */ - stx %g3, [%o0 - 8] /* Store */ - srlx %g3, 56, %g5 /* IEU0 Group */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 11f /* CTI */ - srlx %g3, 48, %g4 /* IEU0 */ - andcc %g4, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 10f /* CTI */ - srlx %g3, 40, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 9f /* CTI */ - - srlx %g3, 32, %g4 /* IEU0 */ - andcc %g4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 8f /* CTI */ - srlx %g3, 24, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 7f /* CTI */ - srlx %g3, 16, %g4 /* IEU0 */ - andcc %g4, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 6f /* CTI */ - srlx %g3, 8, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI */ - - sub %o3, %g1, %o2 /* IEU0 */ - stx %g3, [%o0 - 8] /* Store Group */ - andcc %g3, 0xff, %g0 /* IEU1 */ - bne,pt %icc, 3b /* CTI */ - - mov %o3, %g3 /* IEU0 Group */ -4: retl /* CTI+IEU1 Group */ - sub %o0, 1, %o0 /* IEU0 */ - - .align 16 -6: ba,pt %xcc, 23f /* CTI Group */ - sub %o0, 3, %g6 /* IEU0 */ -5: sub %o0, 2, %g6 /* IEU0 Group */ - stb %g5, [%o0 - 2] /* Store */ - - srlx %g3, 16, %g4 /* IEU0 Group */ -23: sth %g4, [%o0 - 4] /* Store */ - srlx %g3, 32, %g4 /* IEU0 Group */ - stw %g4, [%o0 - 8] /* Store */ - - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ -8: ba,pt %xcc, 24f /* CTI Group */ - sub %o0, 5, %g6 /* IEU0 */ - -7: sub %o0, 4, %g6 /* IEU0 Group */ - stb %g5, [%o0 - 4] /* Store */ - srlx %g3, 32, %g4 /* IEU0 Group */ -24: stw %g4, [%o0 - 8] /* Store */ - - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ -10: ba,pt %xcc, 25f /* CTI Group */ - sub %o0, 7, %g6 /* IEU0 */ - -9: sub %o0, 6, %g6 /* IEU0 Group */ - stb %g5, [%o0 - 6] /* Store */ - srlx %g3, 48, %g4 /* IEU0 */ -25: sth %g4, [%o0 - 8] /* Store Group */ - - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ -11: stb %g5, [%o0 - 8] /* Store Group */ - retl /* CTI+IEU1 Group */ - - sub %o0, 8, %o0 /* IEU0 */ - - .align 16 -12: or %g1, %g2, %g1 /* IEU0 Group */ - ldub [%o1], %o3 /* Load */ - sllx %g1, 7, %g2 /* IEU0 Group */ - stb %o3, [%o0] /* Store Group */ - -13: add %o0, 1, %o0 /* IEU0 */ - add %o1, 1, %o1 /* IEU1 */ - andcc %o3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4b /* CTI */ - - lduba [%o1] ASI_PNF, %o3 /* Load */ - andcc %o0, 7, %g0 /* IEU1 Group */ - bne,a,pt %icc, 13b /* CTI */ - stb %o3, [%o0] /* Store */ - - andcc %o1, 7, %g3 /* IEU1 Group */ - be,a,pt %icc, 1b /* CTI */ - ldx [%o1], %o3 /* Load */ -14: orcc %g0, 64, %g4 /* IEU1 Group */ - - sllx %g3, 3, %g5 /* IEU0 */ - sub %o1, %g3, %o1 /* IEU0 Group */ - sub %g4, %g5, %g4 /* IEU1 */ - /* %g1 = 0101010101010101 * - * %g2 = 8080808080808080 * - * %g3 = source alignment * - * %g5 = number of bits to shift left * - * %g4 = number of bits to shift right */ - ldxa [%o1] ASI_PNF, %o5 /* Load Group 
*/ - - addcc %o1, 8, %o1 /* IEU1 */ -15: sllx %o5, %g5, %o3 /* IEU0 Group */ - ldxa [%o1] ASI_PNF, %o5 /* Load */ - srlx %o5, %g4, %o4 /* IEU0 Group */ - - add %o0, 8, %o0 /* IEU1 */ - or %o3, %o4, %o3 /* IEU0 Group */ - add %o1, 8, %o1 /* IEU1 */ - sub %o3, %g1, %o4 /* IEU0 Group */ - -#ifdef EIGHTBIT_NOT_RARE - andn %o4, %o3, %o4 /* IEU0 Group */ -#endif - andcc %o4, %g2, %g0 /* IEU1 Group */ - be,a,pt %xcc, 15b /* CTI */ - stx %o3, [%o0 - 8] /* Store */ - srlx %o3, 56, %o4 /* IEU0 Group */ - - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 22f /* CTI */ - srlx %o3, 48, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 21f /* CTI */ - srlx %o3, 40, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 20f /* CTI */ - - srlx %o3, 32, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 19f /* CTI */ - srlx %o3, 24, %o4 /* IEU0 */ - - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 18f /* CTI */ - srlx %o3, 16, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 17f /* CTI */ - srlx %o3, 8, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 16f /* CTI */ - - andcc %o3, 0xff, %g0 /* IEU1 Group */ - bne,pn %icc, 15b /* CTI */ - stx %o3, [%o0 - 8] /* Store */ - retl /* CTI+IEU1 Group */ - - sub %o0, 1, %o0 /* IEU0 */ - - .align 16 -17: ba,pt %xcc, 26f /* CTI Group */ - subcc %o0, 3, %g6 /* IEU1 */ -18: ba,pt %xcc, 27f /* CTI Group */ - subcc %o0, 4, %g6 /* IEU1 */ - -19: ba,pt %xcc, 28f /* CTI Group */ - subcc %o0, 5, %g6 /* IEU1 */ -16: subcc %o0, 2, %g6 /* IEU1 Group */ - srlx %o3, 8, %o4 /* IEU0 */ - - stb %o4, [%o0 - 2] /* Store */ -26: srlx %o3, 16, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 3] /* Store */ -27: srlx %o3, 24, %o4 /* IEU0 Group */ - - stb %o4, [%o0 - 4] /* Store */ -28: srlx %o3, 32, %o4 /* IEU0 Group */ - stw %o4, [%o0 - 8] /* Store */ - retl /* CTI+IEU1 Group */ - - mov %g6, %o0 /* IEU0 */ - - .align 16 -21: ba,pt %xcc, 29f /* CTI Group */ - subcc %o0, 7, %g6 /* IEU1 */ -22: ba,pt %xcc, 30f /* CTI Group */ - subcc %o0, 8, %g6 /* IEU1 */ - -20: subcc %o0, 6, %g6 /* IEU1 Group */ - srlx %o3, 40, %o4 /* IEU0 */ - stb %o4, [%o0 - 6] /* Store */ -29: srlx %o3, 48, %o4 /* IEU0 Group */ - - stb %o4, [%o0 - 7] /* Store */ -30: srlx %o3, 56, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 8] /* Store */ - retl /* CTI+IEU1 Group */ - - mov %g6, %o0 /* IEU0 */ -END(stpcpy) -libc_hidden_def(stpcpy) diff --git a/libc/string/sparc/sparc64/strcat.S b/libc/string/sparc/sparc64/strcat.S deleted file mode 100644 index fcc4ba59c..000000000 --- a/libc/string/sparc/sparc64/strcat.S +++ /dev/null @@ -1,339 +0,0 @@ -/* strcat (dest, src) -- Append SRC on the end of DEST. - For SPARC v9. - Copyright (C) 1998, 1999, 2003 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Jakub Jelinek <jj@ultra.linux.cz> and - Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz>. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <asm/asi.h> -#ifndef XCC -#define XCC xcc -#define USE_BPR - .register %g2, #scratch - .register %g3, #scratch - .register %g6, #scratch -#endif - - /* Normally, this uses - ((xword - 0x0101010101010101) & 0x8080808080808080) test - to find out if any byte in xword could be zero. This is fast, but - also gives false alarm for any byte in range 0x81-0xff. It does - not matter for correctness, as if this test tells us there could - be some zero byte, we check it byte by byte, but if bytes with - high bits set are common in the strings, then this will give poor - performance. You can #define EIGHTBIT_NOT_RARE and the algorithm - will use one tick slower, but more precise test - ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080), - which does not give any false alarms (but if some bits are set, - one cannot assume from it which bytes are zero and which are not). - It is yet to be measured, what is the correct default for glibc - in these days for an average user. - */ - - .text - .align 32 -ENTRY(strcat) - sethi %hi(0x01010101), %g1 /* IEU0 Group */ - ldub [%o0], %o3 /* Load */ - or %g1, %lo(0x01010101), %g1 /* IEU0 Group */ - mov %o0, %g6 /* IEU1 */ - - sllx %g1, 32, %g2 /* IEU0 Group */ - andcc %o0, 7, %g0 /* IEU1 */ - or %g1, %g2, %g1 /* IEU0 Group */ - bne,pn %icc, 32f /* CTI */ - - sllx %g1, 7, %g2 /* IEU0 Group */ - brz,pn %o3, 30f /* CTI+IEU1 */ - ldx [%o0], %o3 /* Load */ -48: add %o0, 8, %o0 /* IEU0 Group */ - -49: sub %o3, %g1, %o2 /* IEU0 Group */ -#ifdef EIGHTBIT_NOT_RARE - andn %o2, %o3, %g5 /* IEU0 Group */ - ldxa [%o0] ASI_PNF, %o3 /* Load */ - andcc %g5, %g2, %g0 /* IEU1 Group */ -#else - ldxa [%o0] ASI_PNF, %o3 /* Load */ - andcc %o2, %g2, %g0 /* IEU1 Group */ -#endif - be,pt %xcc, 49b /* CTI */ - - add %o0, 8, %o0 /* IEU0 */ - addcc %o2, %g1, %g3 /* IEU1 Group */ - srlx %o2, 32, %o2 /* IEU0 */ -50: andcc %o2, %g2, %g0 /* IEU1 Group */ - - be,pn %xcc, 51f /* CTI */ - srlx %g3, 56, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 29f /* CTI */ - - srlx %g3, 48, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 28f /* CTI */ - srlx %g3, 40, %o2 /* IEU0 */ - - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 27f /* CTI */ - srlx %g3, 32, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 26f /* CTI */ -51: srlx %g3, 24, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 25f /* CTI */ - - srlx %g3, 16, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 24f /* CTI */ - srlx %g3, 8, %o2 /* IEU0 */ - - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 23f /* CTI */ - sub %o3, %g1, %o2 /* IEU0 */ - andcc %g3, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 52f /* CTI */ - ldxa [%o0] ASI_PNF, %o3 /* Load */ - andcc %o2, %g2, %g0 /* IEU1 Group */ - be,pt %xcc, 49b /* CTI */ - - add %o0, 8, %o0 /* IEU0 */ - addcc %o2, %g1, %g3 /* IEU1 Group */ - ba,pt %xcc, 50b /* CTI */ - srlx %o2, 32, %o2 /* IEU0 */ - - .align 16 -52: ba,pt %xcc, 12f /* CTI Group */ - add %o0, -9, %o0 /* IEU0 */ -23: ba,pt %xcc, 12f /* CTI Group */ - add %o0, -10, %o0 /* IEU0 */ - -24: ba,pt %xcc, 12f /* CTI Group */ - add %o0, -11, %o0 /* IEU0 */ -25: ba,pt %xcc, 12f /* CTI Group */ - add %o0, -12, %o0 /* IEU0 */ - -26: ba,pt %xcc, 12f /* CTI Group */ - add %o0, -13, %o0 /* IEU0 */ 
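When the probe fires, the code still does not know which byte is the terminator, so it shifts the doubleword down by 56, 48, 40, ... bits and tests one byte at a time; on big-endian SPARC the most significant byte is the one at the lowest address, which is why the scan starts at bit 56. The surrounding adjustment labels (52, 23-29) then back the destination pointer up by the matching 9 to 16 bytes. A rough C equivalent of the scan, illustrative only and assuming big-endian byte order as on SPARC:

#include <stdint.h>

/* Index, counted from the lowest address, of the first zero byte in a
 * big-endian doubleword, or 8 if no byte is zero. */
static int first_zero_byte_be(uint64_t x)
{
    int i;
    for (i = 0; i < 8; i++) {
        if (((x >> (56 - 8 * i)) & 0xff) == 0)
            return i;
    }
    return 8;
}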
-27: ba,pt %xcc, 12f /* CTI Group */ - add %o0, -14, %o0 /* IEU0 */ - -28: ba,pt %xcc, 12f /* CTI Group */ - add %o0, -15, %o0 /* IEU0 */ -29: add %o0, -16, %o0 /* IEU0 Group */ -30: andcc %o1, 7, %g3 /* IEU1 */ - -31: bne,pn %icc, 14f /* CTI */ - orcc %g0, 64, %g4 /* IEU1 Group */ -1: ldx [%o1], %o3 /* Load */ - add %o1, 8, %o1 /* IEU1 */ - -2: mov %o3, %g3 /* IEU0 Group */ -3: sub %o3, %g1, %o2 /* IEU1 */ - ldxa [%o1] ASI_PNF, %o3 /* Load */ -#ifdef EIGHTBIT_NOT_RARE - andn %o2, %g3, %o2 /* IEU0 Group */ -#endif - add %o0, 8, %o0 /* IEU0 Group */ - - andcc %o2, %g2, %g0 /* IEU1 */ - add %o1, 8, %o1 /* IEU0 Group */ - be,a,pt %xcc, 2b /* CTI */ - stx %g3, [%o0 - 8] /* Store */ - - srlx %g3, 56, %g5 /* IEU0 Group */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 11f /* CTI */ - srlx %g3, 48, %g4 /* IEU0 */ - - andcc %g4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 10f /* CTI */ - srlx %g3, 40, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 9f /* CTI */ - srlx %g3, 32, %g4 /* IEU0 */ - andcc %g4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 8f /* CTI */ - - srlx %g3, 24, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 7f /* CTI */ - srlx %g3, 16, %g4 /* IEU0 */ - - andcc %g4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 6f /* CTI */ - srlx %g3, 8, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 5f /* CTI */ - sub %o3, %g1, %o2 /* IEU0 */ - stx %g3, [%o0 - 8] /* Store Group */ - andcc %g3, 0xff, %g0 /* IEU1 */ - - bne,pt %icc, 3b /* CTI */ - mov %o3, %g3 /* IEU0 Group */ -4: retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ - - .align 16 -5: stb %g5, [%o0 - 2] /* Store Group */ - srlx %g3, 16, %g4 /* IEU0 */ -6: sth %g4, [%o0 - 4] /* Store Group */ - srlx %g3, 32, %g4 /* IEU0 */ - - stw %g4, [%o0 - 8] /* Store Group */ - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ -7: stb %g5, [%o0 - 4] /* Store Group */ - - srlx %g3, 32, %g4 /* IEU0 */ -8: stw %g4, [%o0 - 8] /* Store Group */ - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ - -9: stb %g5, [%o0 - 6] /* Store Group */ - srlx %g3, 48, %g4 /* IEU0 */ -10: sth %g4, [%o0 - 8] /* Store Group */ - retl /* CTI+IEU1 Group */ - - mov %g6, %o0 /* IEU0 */ -11: stb %g5, [%o0 - 8] /* Store Group */ - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ - - .align 16 -32: andcc %o0, 7, %g0 /* IEU1 Group */ - be,a,pn %icc, 48b /* CTI */ - ldx [%o0], %o3 /* Load */ - add %o0, 1, %o0 /* IEU0 Group */ - - brnz,a,pt %o3, 32b /* CTI+IEU1 */ - lduba [%o0] ASI_PNF, %o3 /* Load */ - add %o0, -1, %o0 /* IEU0 Group */ - andcc %o0, 7, %g0 /* IEU1 Group */ - - be,a,pn %icc, 31b /* CTI */ - andcc %o1, 7, %g3 /* IEU1 Group */ -12: ldub [%o1], %o3 /* Load */ - stb %o3, [%o0] /* Store Group */ - -13: add %o0, 1, %o0 /* IEU0 */ - add %o1, 1, %o1 /* IEU1 */ - andcc %o3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4b /* CTI */ - - lduba [%o1] ASI_PNF, %o3 /* Load */ - andcc %o0, 7, %g0 /* IEU1 Group */ - bne,a,pt %icc, 13b /* CTI */ - stb %o3, [%o0] /* Store */ - - andcc %o1, 7, %g3 /* IEU1 Group */ - be,a,pt %icc, 1b /* CTI */ - ldx [%o1], %o3 /* Load */ - orcc %g0, 64, %g4 /* IEU1 Group */ - -14: sllx %g3, 3, %g5 /* IEU0 */ - sub %o1, %g3, %o1 /* IEU0 Group */ - sub %g4, %g5, %g4 /* IEU1 */ - /* %g1 = 0101010101010101 * - * %g2 = 8080808080808080 * - * %g3 = source alignment * - * %g5 = number of bits to shift left * - * %g4 = number of bits to shift right */ - ldxa [%o1] ASI_PNF, %o5 /* Load Group */ - - addcc %o1, 8, %o1 /* IEU1 */ -15: sllx %o5, %g5, %o3 /* IEU0 Group */ - ldxa [%o1] ASI_PNF, %o5 
/* Load */ - srlx %o5, %g4, %o4 /* IEU0 Group */ - - add %o0, 8, %o0 /* IEU1 */ - or %o3, %o4, %o3 /* IEU0 Group */ - add %o1, 8, %o1 /* IEU1 */ - sub %o3, %g1, %o4 /* IEU0 Group */ - -#ifdef EIGHTBIT_NOT_RARE - andn %o4, %o3, %o4 /* IEU0 Group */ -#endif - andcc %o4, %g2, %g0 /* IEU1 Group */ - be,a,pt %xcc, 15b /* CTI */ - stx %o3, [%o0 - 8] /* Store */ - srlx %o3, 56, %o4 /* IEU0 Group */ - - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 22f /* CTI */ - srlx %o3, 48, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 21f /* CTI */ - srlx %o3, 40, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 20f /* CTI */ - - srlx %o3, 32, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 19f /* CTI */ - srlx %o3, 24, %o4 /* IEU0 */ - - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 18f /* CTI */ - srlx %o3, 16, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 17f /* CTI */ - srlx %o3, 8, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 16f /* CTI */ - - andcc %o3, 0xff, %g0 /* IEU1 Group */ - bne,pn %icc, 15b /* CTI */ - stx %o3, [%o0 - 8] /* Store */ - retl /* CTI+IEU1 Group */ - - mov %g6, %o0 /* IEU0 */ - - .align 16 -16: srlx %o3, 8, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 2] /* Store */ -17: srlx %o3, 16, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 3] /* Store */ - -18: srlx %o3, 24, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 4] /* Store */ -19: srlx %o3, 32, %o4 /* IEU0 Group */ - stw %o4, [%o0 - 8] /* Store */ - - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ - nop - nop - -20: srlx %o3, 40, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 6] /* Store */ -21: srlx %o3, 48, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 7] /* Store */ - -22: srlx %o3, 56, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 8] /* Store */ - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ -END(strcat) -libc_hidden_def(strcat) diff --git a/libc/string/sparc/sparc64/strchr.S b/libc/string/sparc/sparc64/strchr.S deleted file mode 100644 index da26d1f9c..000000000 --- a/libc/string/sparc/sparc64/strchr.S +++ /dev/null @@ -1,486 +0,0 @@ -/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. - For SPARC v9. - Copyright (C) 1998, 1999, 2003 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz> and - Jakub Jelinek <jj@ultra.linux.cz>. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <features.h> -#include <asm/asi.h> -#ifndef XCC -#define XCC xcc -#define USE_BPR - .register %g2, #scratch - .register %g3, #scratch - .register %g6, #scratch -#endif - - /* Normally, this uses - ((xword - 0x0101010101010101) & 0x8080808080808080) test - to find out if any byte in xword could be zero. 
This is fast, but - also gives false alarm for any byte in range 0x81-0xff. It does - not matter for correctness, as if this test tells us there could - be some zero byte, we check it byte by byte, but if bytes with - high bits set are common in the strings, then this will give poor - performance. You can #define EIGHTBIT_NOT_RARE and the algorithm - will use one tick slower, but more precise test - ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080), - which does not give any false alarms (but if some bits are set, - one cannot assume from it which bytes are zero and which are not). - It is yet to be measured, what is the correct default for glibc - in these days for an average user. - */ - - .text - .align 32 -ENTRY(strchr) - andcc %o1, 0xff, %o1 /* IEU1 Group */ - be,pn %icc, 17f /* CTI */ - sllx %o1, 8, %g3 /* IEU0 Group */ - sethi %hi(0x01010101), %g1 /* IEU1 */ - - or %g3, %o1, %g3 /* IEU0 Group */ - ldub [%o0], %o3 /* Load */ - sllx %g3, 16, %g5 /* IEU0 Group */ - or %g1, %lo(0x01010101), %g1 /* IEU1 */ - - sllx %g1, 32, %g2 /* IEU0 Group */ - brz,pn %o3, 5f /* CTI+IEU1 */ - orcc %g3, %g5, %g3 /* IEU1 Group */ - sllx %g3, 32, %g5 /* IEU0 */ - - cmp %o3, %o1 /* IEU1 Group */ - be,pn %xcc, 14f /* CTI */ - or %g1, %g2, %g1 /* IEU0 */ - andcc %o0, 7, %g0 /* IEU1 Group */ - - bne,a,pn %icc, 15f /* CTI */ - add %o0, 1, %o0 /* IEU0 */ - ldx [%o0], %o3 /* Load Group */ -1: sllx %g1, 7, %g2 /* IEU0 */ - - or %g3, %g5, %g3 /* IEU1 */ - add %o0, 8, %o0 /* IEU0 Group */ - xor %o3, %g3, %o4 /* IEU1 */ - /* %g1 = 0101010101010101 * - * %g2 = 8080088080808080 * - * %g3 = c c c c c c c c * - * %o3 = value * - * %o4 = value XOR c */ -2: sub %o3, %g1, %o2 /* IEU0 Group */ - - sub %o4, %g1, %o5 /* IEU1 */ -#ifdef EIGHTBIT_NOT_RARE - andn %o2, %o3, %g6 /* IEU0 Group */ - andn %o5, %o4, %o5 /* IEU1 */ - ldxa [%o0] ASI_PNF, %o3 /* Load */ - or %o5, %g6, %o5 /* IEU0 Group */ -#else - ldxa [%o0] ASI_PNF, %o3 /* Load */ - or %o5, %o2, %o5 /* IEU0 Group */ -#endif - add %o0, 8, %o0 /* IEU1 */ - - andcc %o5, %g2, %g0 /* IEU1 Group */ - be,a,pt %xcc, 2b /* CTI */ - xor %o3, %g3, %o4 /* IEU0 */ - srlx %o5, 32, %g5 /* IEU0 Group */ - - add %o2, %g1, %o2 /* IEU1 */ -3: andcc %g5, %g2, %g0 /* IEU1 Group */ - be,pn %xcc, 4f /* CTI */ - srlx %o2, 56, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI */ - srlx %o4, 56, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 6f /* CTI */ - srlx %o2, 48, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI */ - - srlx %o4, 48, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 7f /* CTI */ - srlx %o2, 40, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI */ - srlx %o4, 40, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 8f /* CTI */ - srlx %o2, 32, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI */ - - srlx %o4, 32, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 9f /* CTI */ -4: srlx %o2, 24, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI */ - srlx %o4, 24, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 10f /* CTI */ - srlx %o2, 16, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI */ - - srlx %o4, 16, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 11f /* CTI */ - srlx %o2, 8, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI 
*/ - srlx %o4, 8, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 12f /* CTI */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI */ - sub %o3, %g1, %o2 /* IEU0 */ - - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 13f /* CTI */ - xor %o3, %g3, %o4 /* IEU0 */ - ldxa [%o0] ASI_PNF, %o3 /* Load Group */ - - sub %o4, %g1, %o5 /* IEU0 */ - or %o5, %o2, %o5 /* IEU1 */ - add %o0, 8, %o0 /* IEU0 Group */ - andcc %o5, %g2, %g0 /* IEU1 */ - - be,a,pt %xcc, 2b /* CTI */ - xor %o3, %g3, %o4 /* IEU0 Group */ - srlx %o5, 32, %g5 /* IEU0 Group */ - ba,pt %xcc, 3b /* CTI */ - - add %o2, %g1, %o2 /* IEU1 */ - - .align 16 -5: retl /* CTI+IEU1 Group */ - clr %o0 /* IEU0 */ -6: retl /* CTI+IEU1 Group */ - add %o0, -16, %o0 /* IEU0 */ - -7: retl /* CTI+IEU1 Group */ - add %o0, -15, %o0 /* IEU0 */ -8: retl /* CTI+IEU1 Group */ - add %o0, -14, %o0 /* IEU0 */ - -9: retl /* CTI+IEU1 Group */ - add %o0, -13, %o0 /* IEU0 */ -10: retl /* CTI+IEU1 Group */ - add %o0, -12, %o0 /* IEU0 */ - -11: retl /* CTI+IEU1 Group */ - add %o0, -11, %o0 /* IEU0 */ -12: retl /* CTI+IEU1 Group */ - add %o0, -10, %o0 /* IEU0 */ - -13: retl /* CTI+IEU1 Group */ - add %o0, -9, %o0 /* IEU0 */ -14: retl /* CTI+IEU1 Group */ - nop /* IEU0 */ - - .align 16 -15: ldub [%o0], %o3 /* Load Group */ -16: andcc %o0, 7, %g0 /* IEU1 */ - be,a,pn %icc, 1b /* CTI */ - ldx [%o0], %o3 /* Load Group */ - - andcc %o3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5b /* CTI */ - add %o0, 1, %o0 /* IEU0 */ - cmp %o3, %o1 /* IEU1 Group */ - - bne,a,pn %icc, 16b /* CTI */ - ldub [%o0], %o3 /* Load */ - retl /* CTI+IEU1 Group */ - add %o0, -1, %o0 /* IEU0 */ - - /* strchr (str, 0) */ - .align 32 - nop - .align 16 -17: sethi %hi(0x01010101), %g1 /* IEU0 Group */ - ldub [%o0], %o3 /* Load */ - or %g1, %lo(0x01010101), %g1 /* IEU0 Group */ - sllx %g1, 32, %g2 /* IEU0 Group */ - - andcc %o0, 7, %g0 /* IEU1 */ - or %g1, %g2, %g1 /* IEU0 Group */ - bne,pn %icc, 32f /* CTI */ - sllx %g1, 7, %g2 /* IEU0 Group */ - - brz,pn %o3, 30f /* CTI+IEU1 */ - ldx [%o0], %o3 /* Load */ -18: add %o0, 8, %o0 /* IEU0 Group */ -19: sub %o3, %g1, %o2 /* IEU0 Group */ - -#ifdef EIGHTBIT_NOT_RARE - andn %o2, %o3, %g6 /* IEU0 Group */ - ldxa [%o0] ASI_PNF, %o3 /* Load */ - andcc %g6, %g2, %g0 /* IEU1 Group */ -#else - ldxa [%o0] ASI_PNF, %o3 /* Load */ - andcc %o2, %g2, %g0 /* IEU1 Group */ -#endif - be,pt %xcc, 19b /* CTI */ - add %o0, 8, %o0 /* IEU0 */ - - addcc %o2, %g1, %g3 /* IEU1 Group */ - srlx %o2, 32, %o2 /* IEU0 */ -20: andcc %o2, %g2, %g0 /* IEU1 Group */ - be,pn %xcc, 21f /* CTI */ - - srlx %g3, 56, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 29f /* CTI */ - srlx %g3, 48, %o2 /* IEU0 */ - - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 28f /* CTI */ - srlx %g3, 40, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 27f /* CTI */ - srlx %g3, 32, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 26f /* CTI */ - -21: srlx %g3, 24, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 25f /* CTI */ - srlx %g3, 16, %o2 /* IEU0 */ - - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 24f /* CTI */ - srlx %g3, 8, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 23f /* CTI */ - sub %o3, %g1, %o2 /* IEU0 */ - andcc %g3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 22f /* CTI */ - - ldxa [%o0] ASI_PNF, %o3 /* Load */ - andcc %o2, %g2, %g0 /* IEU1 Group */ - be,pt %xcc, 19b /* CTI */ - add %o0, 8, %o0 /* IEU0 */ - - addcc %o2, %g1, %g3 /* IEU1 
Group */ - ba,pt %xcc, 20b /* CTI */ - srlx %o2, 32, %o2 /* IEU0 */ - - .align 16 -22: retl /* CTI+IEU1 Group */ - add %o0, -9, %o0 /* IEU0 */ -23: retl /* CTI+IEU1 Group */ - add %o0, -10, %o0 /* IEU0 */ - -24: retl /* CTI+IEU1 Group */ - add %o0, -11, %o0 /* IEU0 */ -25: retl /* CTI+IEU1 Group */ - add %o0, -12, %o0 /* IEU0 */ - -26: retl /* CTI+IEU1 Group */ - add %o0, -13, %o0 /* IEU0 */ -27: retl /* CTI+IEU1 Group */ - add %o0, -14, %o0 /* IEU0 */ - -28: retl /* CTI+IEU1 Group */ - add %o0, -15, %o0 /* IEU0 */ -29: retl /* CTI+IEU1 Group */ - add %o0, -16, %o0 /* IEU0 */ - -30: retl /* CTI+IEU1 Group */ - nop /* IEU0 */ - - .align 16 -32: andcc %o0, 7, %g0 /* IEU1 Group */ - be,a,pn %icc, 18b /* CTI */ - ldx [%o0], %o3 /* Load */ - add %o0, 1, %o0 /* IEU0 Group */ - - brnz,a,pt %o3, 32b /* CTI+IEU1 */ - lduba [%o0] ASI_PNF, %o3 /* Load */ - retl /* CTI+IEU1 Group */ - add %o0, -1, %o0 /* IEU0 */ -END(strchr) -libc_hidden_def(strchr) -#ifdef __UCLIBC_SUSV3_LEGACY__ -strong_alias(strchr,index) -#endif - - .align 32 -ENTRY(strrchr) - andcc %o1, 0xff, %o1 /* IEU1 Group */ - be,pn %icc, 17b /* CTI */ - clr %g4 /* IEU0 */ - andcc %o0, 7, %g0 /* IEU1 Group */ - - bne,pn %icc, 13f /* CTI */ - sllx %o1, 8, %g3 /* IEU0 */ - ldx [%o0], %o3 /* Load Group */ -1: sethi %hi(0x01010101), %g1 /* IEU0 */ - - or %g3, %o1, %g3 /* IEU1 */ - sllx %g3, 16, %g5 /* IEU0 Group */ - or %g1, %lo(0x01010101), %g1 /* IEU1 */ - sllx %g1, 32, %g2 /* IEU0 Group */ - - or %g3, %g5, %g3 /* IEU1 */ - sllx %g3, 32, %g5 /* IEU0 Group */ - or %g1, %g2, %g1 /* IEU1 */ - sllx %g1, 7, %g2 /* IEU0 Group */ - - or %g3, %g5, %g3 /* IEU1 */ - add %o0, 8, %o0 /* IEU0 Group */ - xor %o3, %g3, %o4 /* IEU1 */ - /* %g1 = 0101010101010101 * - * %g2 = 8080088080808080 * - * %g3 = c c c c c c c c * - * %o3 = value * - * %o4 = value XOR c */ -2: sub %o3, %g1, %o2 /* IEU0 Group */ - -3: sub %o4, %g1, %o5 /* IEU1 */ -#ifdef EIGHTBIT_NOT_RARE - andn %o2, %o3, %g6 /* IEU0 Group */ - andn %o5, %o4, %o5 /* IEU1 */ - ldxa [%o0] ASI_PNF, %o3 /* Load */ - - or %o5, %g6, %o5 /* IEU0 Group */ -#else - ldxa [%o0] ASI_PNF, %o3 /* Load */ - - or %o5, %o2, %o5 /* IEU0 Group */ -#endif - add %o0, 8, %o0 /* IEU1 */ - andcc %o5, %g2, %g0 /* IEU1 Group */ - be,a,pt %xcc, 2b /* CTI */ - - xor %o3, %g3, %o4 /* IEU0 */ - srlx %o5, 32, %g5 /* IEU0 Group */ - add %o2, %g1, %o2 /* IEU1 */ - andcc %g5, %g2, %g0 /* IEU1 Group */ - - be,pn %xcc, 7f /* CTI */ - srlx %o2, 56, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 12f /* CTI */ - - srlx %o4, 56, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - srlx %o2, 48, %g5 /* IEU0 */ - be,a,pn %icc, 4f /* CTI */ - - add %o0, -16, %g4 /* IEU0 Group */ -4: andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 12f /* CTI */ - srlx %o4, 48, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - srlx %o2, 40, %g5 /* IEU0 */ - be,a,pn %icc, 5f /* CTI */ - add %o0, -15, %g4 /* IEU0 Group */ - -5: andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 12f /* CTI */ - srlx %o4, 40, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - srlx %o2, 32, %g5 /* IEU0 */ - be,a,pn %icc, 6f /* CTI */ - add %o0, -14, %g4 /* IEU0 Group */ -6: andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 12f /* CTI */ - srlx %o4, 32, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,a,pn %icc, 7f /* CTI */ - - add %o0, -13, %g4 /* IEU0 */ -7: srlx %o2, 24, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 12f /* CTI */ - - srlx %o4, 24, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group 
*/ - srlx %o2, 16, %g5 /* IEU0 */ - be,a,pn %icc, 8f /* CTI */ - - add %o0, -12, %g4 /* IEU0 Group */ -8: andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 12f /* CTI */ - srlx %o4, 16, %g5 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - srlx %o2, 8, %g5 /* IEU0 */ - be,a,pn %icc, 9f /* CTI */ - add %o0, -11, %g4 /* IEU0 Group */ - -9: andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 12f /* CTI */ - srlx %o4, 8, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,a,pn %icc, 10f /* CTI */ - add %o0, -10, %g4 /* IEU0 */ -10: andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 12f /* CTI */ - - sub %o3, %g1, %o2 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,a,pn %icc, 11f /* CTI */ - add %o0, -9, %g4 /* IEU0 */ - -11: ba,pt %xcc, 3b /* CTI Group */ - xor %o3, %g3, %o4 /* IEU0 Group */ -12: retl /* CTI+IEU1 Group */ - mov %g4, %o0 /* IEU0 */ - - .align 16 -13: ldub [%o0], %o3 /* Load Group */ - add %o0, 1, %o0 /* IEU0 */ -14: andcc %o3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 12b /* CTI */ - - cmp %o3, %o1 /* IEU1 Group */ - ldub [%o0], %o3 /* Load */ - be,a,pn %icc, 15f /* CTI */ - add %o0, -1, %g4 /* IEU0 Group */ - -15: andcc %o0, 7, %g0 /* IEU1 Group */ - bne,a,pt %icc, 14b /* CTI */ - add %o0, 1, %o0 /* IEU0 */ - ba,pt %xcc, 1b /* CTI Group */ - - ldx [%o0], %o3 /* Load */ -END(strrchr) -libc_hidden_def(strrchr) -#ifdef __UCLIBC_SUSV3_LEGACY__ -strong_alias(strrchr,rindex) -#endif diff --git a/libc/string/sparc/sparc64/strcmp.S b/libc/string/sparc/sparc64/strcmp.S deleted file mode 100644 index df9e69179..000000000 --- a/libc/string/sparc/sparc64/strcmp.S +++ /dev/null @@ -1,279 +0,0 @@ -/* Compare two strings for differences. - For SPARC v9. - Copyright (C) 1997, 1999, 2003 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz> and - Jakub Jelinek <jj@ultra.linux.cz>. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <asm/asi.h> -#ifndef XCC - .register %g2, #scratch - .register %g3, #scratch - .register %g6, #scratch -#endif - - /* Normally, this uses - ((xword - 0x0101010101010101) & 0x8080808080808080) test - to find out if any byte in xword could be zero. This is fast, but - also gives false alarm for any byte in range 0x81-0xff. It does - not matter for correctness, as if this test tells us there could - be some zero byte, we check it byte by byte, but if bytes with - high bits set are common in the strings, then this will give poor - performance. You can #define EIGHTBIT_NOT_RARE and the algorithm - will use one tick slower, but more precise test - ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080), - which does not give any false alarms (but if some bits are set, - one cannot assume from it which bytes are zero and which are not). 
- It is yet to be measured, what is the correct default for glibc - in these days for an average user. - */ - - .text - .align 32 -ENTRY(strcmp) - sethi %hi(0x01010101), %g1 /* IEU0 Group */ - andcc %o0, 7, %g0 /* IEU1 */ - bne,pn %icc, 7f /* CTI */ - or %g1, %lo(0x01010101), %g1 /* IEU0 Group */ - - andcc %o1, 7, %g3 /* IEU1 */ - bne,pn %icc, 9f /* CTI */ - sllx %g1, 32, %g2 /* IEU0 Group */ - ldx [%o0], %o2 /* Load */ - - or %g1, %g2, %g1 /* IEU0 Group */ -1: ldx [%o1], %o3 /* Load */ - sub %o1, %o0, %o1 /* IEU1 */ - sllx %g1, 7, %g2 /* IEU0 Group */ - -2: add %o0, 8, %o0 /* IEU1 */ - sub %o2, %g1, %g3 /* IEU0 Group */ - subcc %o2, %o3, %g0 /* IEU1 */ - bne,pn %xcc, 13f /* CTI */ - -#ifdef EIGHTBIT_NOT_RARE - andn %g3, %o2, %g4 /* IEU0 Group */ - ldxa [%o0] ASI_PNF, %o2 /* Load */ - andcc %g4, %g2, %g0 /* IEU1 Group */ -#else - ldxa [%o0] ASI_PNF, %o2 /* Load Group */ - andcc %g3, %g2, %g0 /* IEU1 */ -#endif - be,a,pt %xcc, 2b /* CTI */ - ldxa [%o1 + %o0] ASI_PNF, %o3 /* Load Group */ - - addcc %g3, %g1, %o4 /* IEU1 */ - srlx %g3, 32, %g3 /* IEU0 */ - andcc %g3, %g2, %g0 /* IEU1 Group */ - be,pt %xcc, 3f /* CTI */ - - srlx %o4, 56, %o5 /* IEU0 */ - andcc %o5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4f /* CTI */ - srlx %o4, 48, %o5 /* IEU0 */ - - andcc %o5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4f /* CTI */ - srlx %o4, 40, %o5 /* IEU0 */ - andcc %o5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 4f /* CTI */ - srlx %o4, 32, %o5 /* IEU0 */ - andcc %o5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4f /* CTI */ - -3: srlx %o4, 24, %o5 /* IEU0 */ - andcc %o5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4f /* CTI */ - srlx %o4, 16, %o5 /* IEU0 */ - - andcc %o5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4f /* CTI */ - srlx %o4, 8, %o5 /* IEU0 */ - andcc %o5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 4f /* CTI */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - bne,a,pn %icc, 2b /* CTI */ - ldxa [%o1 + %o0] ASI_PNF, %o3 /* Load */ - -4: retl /* CTI+IEU1 Group */ - clr %o0 /* IEU0 */ - - .align 32 -13: mov 0xff, %g6 /* IEU0 Group */ -#ifdef EIGHTBIT_NOT_RARE - andcc %g4, %g2, %g0 /* IEU1 */ -#else - andcc %g3, %g2, %g0 /* IEU1 */ -#endif - be,pt %xcc, 25f /* CTI */ - addcc %g3, %g1, %o4 /* IEU1 Group */ - - srlx %g3, 32, %g3 /* IEU0 */ - andcc %g3, %g2, %g0 /* IEU1 Group */ - be,pt %xcc, 23f /* CTI */ - sllx %g6, 56, %o5 /* IEU0 */ - - andcc %o4, %o5, %g0 /* IEU1 Group */ - be,pn %xcc, 24f /* CTI */ - sllx %g6, 48, %o5 /* IEU0 */ - andcc %o4, %o5, %g0 /* IEU1 Group */ - - be,pn %xcc, 24f /* CTI */ - sllx %g6, 40, %o5 /* IEU0 */ - andcc %o4, %o5, %g0 /* IEU1 Group */ - be,pn %xcc, 24f /* CTI */ - - sllx %g6, 32, %o5 /* IEU0 */ - andcc %o4, %o5, %g0 /* IEU1 Group */ - be,pn %xcc, 24f /* CTI */ -23: sllx %g6, 24, %o5 /* IEU0 */ - - andcc %o4, %o5, %g0 /* IEU1 Group */ - be,pn %icc, 24f /* CTI */ - sllx %g6, 16, %o5 /* IEU0 */ - andcc %o4, %o5, %g0 /* IEU1 Group */ - - be,pn %icc, 24f /* CTI */ - sllx %g6, 8, %o5 /* IEU0 */ - andcc %o4, %o5, %g0 /* IEU1 Group */ - be,pn %icc, 24f /* CTI */ - - mov %g6, %o5 /* IEU0 */ -25: cmp %o4, %o3 /* IEU1 Group */ -5: mov -1, %o0 /* IEU0 */ - retl /* CTI+IEU1 Group */ - - movgu %xcc, 1, %o0 /* Single Group */ - - .align 16 -24: sub %o5, 1, %g6 /* IEU0 Group */ - clr %o0 /* IEU1 */ - or %o5, %g6, %o5 /* IEU0 Group */ - andn %o4, %o5, %o4 /* IEU0 Group */ - - andn %o3, %o5, %o3 /* IEU1 */ - cmp %o4, %o3 /* IEU1 Group */ - movgu %xcc, 1, %o0 /* Single Group */ - retl /* CTI+IEU1 Group */ - - movlu %xcc, -1, %o0 /* Single Group */ -6: retl /* CTI+IEU1 Group */ - mov %o4, %o0 /* 
IEU0 */ - - .align 16 -7: ldub [%o0], %o2 /* Load */ - add %o0, 1, %o0 /* IEU1 */ - ldub [%o1], %o3 /* Load Group */ - sllx %g1, 32, %g2 /* IEU0 */ - -8: add %o1, 1, %o1 /* IEU1 */ - subcc %o2, %o3, %o4 /* IEU1 Group */ - bne,pn %xcc, 6b /* CTI */ - lduba [%o0] ASI_PNF, %o2 /* Load */ - - brz,pn %o3, 4b /* CTI+IEU1 Group */ - lduba [%o1] ASI_PNF, %o3 /* Load */ - andcc %o0, 7, %g0 /* IEU1 Group */ - bne,a,pn %icc, 8b /* CTI */ - - add %o0, 1, %o0 /* IEU0 */ - or %g1, %g2, %g1 /* IEU0 Group */ - andcc %o1, 7, %g3 /* IEU1 */ - be,a,pn %icc, 1b /* CTI */ - - ldxa [%o0] ASI_PNF, %o2 /* Load Group */ -9: sllx %g3, 3, %g5 /* IEU0 */ - mov 64, %o5 /* IEU1 */ - sub %o1, %g3, %o1 /* IEU0 Group */ - - sub %o5, %g5, %o5 /* IEU1 */ - ldxa [%o1] ASI_PNF, %g6 /* Load Group */ - or %g1, %g2, %g1 /* IEU0 */ - sub %o1, %o0, %o1 /* IEU1 */ - - sllx %g1, 7, %g2 /* IEU0 Group */ - add %o1, 8, %o1 /* IEU1 */ - /* %g1 = 0101010101010101 - * %g2 = 8080808080800880 - * %g5 = number of bits to shift left - * %o5 = number of bits to shift right */ -10: sllx %g6, %g5, %o3 /* IEU0 Group */ - ldxa [%o1 + %o0] ASI_PNF, %g6 /* Load */ - -11: srlx %g6, %o5, %o4 /* IEU0 Group */ - ldxa [%o0] ASI_PNF, %o2 /* Load */ - or %o3, %o4, %o3 /* IEU1 */ - add %o0, 8, %o0 /* IEU0 Group */ - - subcc %o2, %o3, %g0 /* IEU1 */ -#ifdef EIGHTBIT_NOT_RARE - sub %o2, %g1, %g3 /* IEU0 Group */ - bne,pn %xcc, 13b /* CTI */ - andn %g3, %o2, %g4 /* IEU0 Group */ - - andcc %g4, %g2, %g0 /* IEU1 Group */ - be,pt %xcc, 10b /* CTI */ - srlx %g4, 32, %g4 /* IEU0 */ - andcc %g4, %g2, %g0 /* IEU1 Group */ -#else - bne,pn %xcc, 13b /* CTI */ - sub %o2, %g1, %g3 /* IEU0 Group */ - andcc %g3, %g2, %g0 /* IEU1 Group */ - - be,pt %xcc, 10b /* CTI */ - srlx %g3, 32, %g3 /* IEU0 */ - andcc %g3, %g2, %g0 /* IEU1 Group */ -#endif - be,pt %xcc, 12f /* CTI */ - - srlx %o2, 56, %g3 /* IEU0 */ - andcc %g3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4b /* CTI */ - srlx %o2, 48, %g3 /* IEU0 */ - - andcc %g3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4b /* CTI */ - srlx %o2, 40, %g3 /* IEU0 */ - andcc %g3, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 4b /* CTI */ - srlx %o2, 32, %g3 /* IEU0 */ - andcc %g3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4b /* CTI */ - -12: srlx %o2, 24, %g3 /* IEU0 */ - andcc %g3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4b /* CTI */ - srlx %o2, 16, %g3 /* IEU0 */ - - andcc %g3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4b /* CTI */ - srlx %o2, 8, %g3 /* IEU0 */ - andcc %g3, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 4b /* CTI */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4b /* CTI */ - sllx %g6, %g5, %o3 /* IEU0 */ - - ba,pt %xcc, 11b /* CTI Group */ - ldxa [%o1 + %o0] ASI_PNF, %g6 /* Load */ -END(strcmp) -libc_hidden_def(strcmp) diff --git a/libc/string/sparc/sparc64/strcpy.S b/libc/string/sparc/sparc64/strcpy.S deleted file mode 100644 index 1317d5489..000000000 --- a/libc/string/sparc/sparc64/strcpy.S +++ /dev/null @@ -1,245 +0,0 @@ -/* Copy SRC to DEST returning DEST. - For SPARC v9. - Copyright (C) 1998, 1999, 2003 Free Software Foundation, Inc. - This file is part of the GNU C Library. - Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz> and - Jakub Jelinek <jj@ultra.linux.cz>. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <asm/asi.h> -#ifndef XCC - .register %g2, #scratch - .register %g3, #scratch - .register %g6, #scratch -#endif - - /* Normally, this uses - ((xword - 0x0101010101010101) & 0x8080808080808080) test - to find out if any byte in xword could be zero. This is fast, but - also gives false alarm for any byte in range 0x81-0xff. It does - not matter for correctness, as if this test tells us there could - be some zero byte, we check it byte by byte, but if bytes with - high bits set are common in the strings, then this will give poor - performance. You can #define EIGHTBIT_NOT_RARE and the algorithm - will use one tick slower, but more precise test - ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080), - which does not give any false alarms (but if some bits are set, - one cannot assume from it which bytes are zero and which are not). - It is yet to be measured, what is the correct default for glibc - in these days for an average user. - */ - - .text - .align 32 -ENTRY(strcpy) - sethi %hi(0x01010101), %g1 /* IEU0 Group */ - mov %o0, %g6 /* IEU1 */ - or %g1, %lo(0x01010101), %g1 /* IEU0 Group */ - andcc %o0, 7, %g0 /* IEU1 */ - - sllx %g1, 32, %g2 /* IEU0 Group */ - bne,pn %icc, 12f /* CTI */ - andcc %o1, 7, %g3 /* IEU1 */ - or %g1, %g2, %g1 /* IEU0 Group */ - - bne,pn %icc, 14f /* CTI */ - sllx %g1, 7, %g2 /* IEU0 Group */ -1: ldx [%o1], %o3 /* Load */ - add %o1, 8, %o1 /* IEU1 */ - -2: mov %o3, %g3 /* IEU0 Group */ -3: sub %o3, %g1, %o2 /* IEU1 */ - ldxa [%o1] ASI_PNF, %o3 /* Load */ -#ifdef EIGHTBIT_NOT_RARE - andn %o2, %g3, %o2 /* IEU0 Group */ -#endif - add %o0, 8, %o0 /* IEU0 Group */ - - andcc %o2, %g2, %g0 /* IEU1 */ - add %o1, 8, %o1 /* IEU0 Group */ - be,a,pt %xcc, 2b /* CTI */ - stx %g3, [%o0 - 8] /* Store */ - - srlx %g3, 56, %g5 /* IEU0 Group */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 11f /* CTI */ - srlx %g3, 48, %g4 /* IEU0 */ - - andcc %g4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 10f /* CTI */ - srlx %g3, 40, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 9f /* CTI */ - srlx %g3, 32, %g4 /* IEU0 */ - andcc %g4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 8f /* CTI */ - - srlx %g3, 24, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 7f /* CTI */ - srlx %g3, 16, %g4 /* IEU0 */ - - andcc %g4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 6f /* CTI */ - srlx %g3, 8, %g5 /* IEU0 */ - andcc %g5, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 5f /* CTI */ - sub %o3, %g1, %o2 /* IEU0 */ - stx %g3, [%o0 - 8] /* Store Group */ - andcc %g3, 0xff, %g0 /* IEU1 */ - - bne,pt %icc, 3b /* CTI */ - mov %o3, %g3 /* IEU0 Group */ -4: retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ - - .align 16 -5: stb %g5, [%o0 - 2] /* Store Group */ - srlx %g3, 16, %g4 /* IEU0 */ -6: sth %g4, [%o0 - 4] /* Store Group */ - srlx %g3, 32, %g4 /* IEU0 */ - - stw %g4, [%o0 - 8] /* Store Group */ - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ -7: stb %g5, [%o0 - 4] /* Store Group */ - - srlx %g3, 32, %g4 /* IEU0 */ -8: stw %g4, [%o0 - 8] /* Store Group */ - retl /* 
CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ - -9: stb %g5, [%o0 - 6] /* Store Group */ - srlx %g3, 48, %g4 /* IEU0 */ -10: sth %g4, [%o0 - 8] /* Store Group */ - retl /* CTI+IEU1 Group */ - - mov %g6, %o0 /* IEU0 */ -11: stb %g5, [%o0 - 8] /* Store Group */ - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ - -12: or %g1, %g2, %g1 /* IEU0 Group */ - ldub [%o1], %o3 /* Load */ - sllx %g1, 7, %g2 /* IEU0 Group */ - stb %o3, [%o0] /* Store Group */ - -13: add %o0, 1, %o0 /* IEU0 */ - add %o1, 1, %o1 /* IEU1 */ - andcc %o3, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 4b /* CTI */ - - lduba [%o1] ASI_PNF, %o3 /* Load */ - andcc %o0, 7, %g0 /* IEU1 Group */ - bne,a,pt %icc, 13b /* CTI */ - stb %o3, [%o0] /* Store */ - - andcc %o1, 7, %g3 /* IEU1 Group */ - be,a,pt %icc, 1b /* CTI */ - ldx [%o1], %o3 /* Load */ -14: orcc %g0, 64, %g4 /* IEU1 Group */ - - sllx %g3, 3, %g5 /* IEU0 */ - sub %o1, %g3, %o1 /* IEU0 Group */ - sub %g4, %g5, %g4 /* IEU1 */ - /* %g1 = 0101010101010101 * - * %g2 = 8080808080808080 * - * %g3 = source alignment * - * %g5 = number of bits to shift left * - * %g4 = number of bits to shift right */ - ldxa [%o1] ASI_PNF, %o5 /* Load Group */ - - addcc %o1, 8, %o1 /* IEU1 */ -15: sllx %o5, %g5, %o3 /* IEU0 Group */ - ldxa [%o1] ASI_PNF, %o5 /* Load */ - srlx %o5, %g4, %o4 /* IEU0 Group */ - - add %o0, 8, %o0 /* IEU1 */ - or %o3, %o4, %o3 /* IEU0 Group */ - add %o1, 8, %o1 /* IEU1 */ - sub %o3, %g1, %o4 /* IEU0 Group */ - -#ifdef EIGHTBIT_NOT_RARE - andn %o4, %o3, %o4 /* IEU0 Group */ -#endif - andcc %o4, %g2, %g0 /* IEU1 Group */ - be,a,pt %xcc, 15b /* CTI */ - stx %o3, [%o0 - 8] /* Store */ - srlx %o3, 56, %o4 /* IEU0 Group */ - - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 22f /* CTI */ - srlx %o3, 48, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 21f /* CTI */ - srlx %o3, 40, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 20f /* CTI */ - - srlx %o3, 32, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 19f /* CTI */ - srlx %o3, 24, %o4 /* IEU0 */ - - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 18f /* CTI */ - srlx %o3, 16, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 17f /* CTI */ - srlx %o3, 8, %o4 /* IEU0 */ - andcc %o4, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 16f /* CTI */ - - andcc %o3, 0xff, %g0 /* IEU1 Group */ - bne,pn %icc, 15b /* CTI */ - stx %o3, [%o0 - 8] /* Store */ - retl /* CTI+IEU1 Group */ - - mov %g6, %o0 /* IEU0 */ - - .align 16 -16: srlx %o3, 8, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 2] /* Store */ -17: srlx %o3, 16, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 3] /* Store */ - -18: srlx %o3, 24, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 4] /* Store */ -19: srlx %o3, 32, %o4 /* IEU0 Group */ - stw %o4, [%o0 - 8] /* Store */ - - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ - nop - nop - -20: srlx %o3, 40, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 6] /* Store */ -21: srlx %o3, 48, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 7] /* Store */ - -22: srlx %o3, 56, %o4 /* IEU0 Group */ - stb %o4, [%o0 - 8] /* Store */ - retl /* CTI+IEU1 Group */ - mov %g6, %o0 /* IEU0 */ -END(strcpy) - -libc_hidden_def(strcpy) diff --git a/libc/string/sparc/sparc64/strlen.S b/libc/string/sparc/sparc64/strlen.S deleted file mode 100644 index 1fe854961..000000000 --- a/libc/string/sparc/sparc64/strlen.S +++ /dev/null @@ -1,173 +0,0 @@ -/* Determine the length of a string. For SPARC v9. - Copyright (C) 1998, 1999, 2003 Free Software Foundation, Inc. 
- This file is part of the GNU C Library. - Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz> and - Jakub Jelinek <jj@ultra.linux.cz>. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#include <asm/asi.h> - - /* Normally, this uses - ((xword - 0x0101010101010101) & 0x8080808080808080) test - to find out if any byte in xword could be zero. This is fast, but - also gives false alarm for any byte in range 0x81-0xff. It does - not matter for correctness, as if this test tells us there could - be some zero byte, we check it byte by byte, but if bytes with - high bits set are common in the strings, then this will give poor - performance. You can #define EIGHTBIT_NOT_RARE and the algorithm - will use one tick slower, but more precise test - ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080), - which does not give any false alarms (but if some bits are set, - one cannot assume from it which bytes are zero and which are not). - It is yet to be measured, what is the correct default for glibc - in these days for an average user. - */ - - .text - .align 32 -ENTRY(strlen) - sethi %hi(0x01010101), %g1 /* IEU0 Group */ - ldub [%o0], %o3 /* Load */ - or %g1, %lo(0x01010101), %g1 /* IEU0 Group */ - mov %o0, %o1 /* IEU1 */ - - sllx %g1, 32, %g4 /* IEU0 Group */ - andcc %o0, 7, %g0 /* IEU1 */ - or %g1, %g4, %g1 /* IEU0 Group */ - brz,pn %o3, 13f /* CTI+IEU1 */ - - sllx %g1, 7, %g4 /* IEU0 Group */ - bne,a,pn %icc, 15f /* CTI */ - add %o0, 1, %o0 /* IEU1 */ - /* %g1 = 0x0101010101010101 * - * %g4 = 0x8080808080808080 * - * %o0 = string pointer * - * %o1 = start of string */ -1: ldx [%o0], %o3 /* Load Group */ - - add %o0, 8, %o0 /* IEU1 */ -2: sub %o3, %g1, %o2 /* IEU0 Group */ -#ifdef EIGHTBIT_NOT_RARE - andn %o2, %o3, %o5 /* IEU0 Group */ - ldxa [%o0] ASI_PNF, %o3 /* Load */ - andcc %o5, %g4, %g0 /* IEU1 Group */ -#else - ldxa [%o0] ASI_PNF, %o3 /* Load */ - andcc %o2, %g4, %g0 /* IEU1 Group */ -#endif - - be,pt %xcc, 2b /* CTI */ - add %o0, 8, %o0 /* IEU0 */ - addcc %o2, %g1, %g5 /* IEU1 Group */ -#ifdef EIGHTBIT_NOT_RARE - srlx %o5, 32, %o5 /* IEU0 */ - -3: andcc %o5, %g4, %g0 /* IEU1 Group */ -#else - srlx %o2, 32, %o2 /* IEU0 */ - -3: andcc %o2, %g4, %g0 /* IEU1 Group */ -#endif - be,pn %xcc, 4f /* CTI */ - srlx %g5, 56, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 12f /* CTI */ - srlx %g5, 48, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 11f /* CTI */ - - srlx %g5, 40, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 10f /* CTI */ - srlx %g5, 32, %o2 /* IEU0 */ - - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 9f /* CTI */ -4: srlx %g5, 24, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - - be,pn %icc, 8f /* CTI */ - srlx %g5, 16, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 7f /* CTI */ - - 
srlx %g5, 8, %o2 /* IEU0 */ - andcc %o2, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 6f /* CTI */ - sub %o3, %g1, %o2 /* IEU0 */ - - andcc %g5, 0xff, %g0 /* IEU1 Group */ - be,pn %icc, 5f /* CTI */ - ldxa [%o0] ASI_PNF, %o3 /* Load */ - andcc %o2, %g4, %g0 /* IEU1 Group */ - - be,pt %xcc, 2b /* CTI */ - add %o0, 8, %o0 /* IEU0 */ - addcc %o2, %g1, %g5 /* IEU1 Group */ - ba,pt %xcc, 3b /* CTI */ - - srlx %o2, 32, %o2 /* IEU0 */ -5: add %o0, -9, %o0 /* IEU0 Group */ - retl /* CTI+IEU1 Group */ - sub %o0, %o1, %o0 /* IEU0 */ - -6: add %o0, -10, %o0 /* IEU0 Group */ - retl /* CTI+IEU1 Group */ - sub %o0, %o1, %o0 /* IEU0 */ -7: add %o0, -11, %o0 /* IEU0 Group */ - - retl /* CTI+IEU1 Group */ - sub %o0, %o1, %o0 /* IEU0 */ -8: add %o0, -12, %o0 /* IEU0 Group */ - retl /* CTI+IEU1 Group */ - - sub %o0, %o1, %o0 /* IEU0 */ -9: add %o0, -13, %o0 /* IEU0 Group */ - retl /* CTI+IEU1 Group */ - sub %o0, %o1, %o0 /* IEU0 */ - -10: add %o0, -14, %o0 /* IEU0 Group */ - retl /* CTI+IEU1 Group */ - sub %o0, %o1, %o0 /* IEU0 */ -11: add %o0, -15, %o0 /* IEU0 Group */ - - retl /* CTI+IEU1 Group */ - sub %o0, %o1, %o0 /* IEU0 */ -12: add %o0, -16, %o0 /* IEU0 Group */ - retl /* CTI+IEU1 Group */ - - sub %o0, %o1, %o0 /* IEU0 */ -13: retl /* CTI+IEU1 Group */ - mov 0, %o0 /* IEU0 */ - nop - -15: ldub [%o0], %o3 /* Load Group */ -16: andcc %o0, 7, %g0 /* IEU1 */ - be,pn %icc, 1b /* CTI */ - nop /* IEU0 Group */ - - add %o0, 1, %o0 /* IEU1 */ - andcc %o3, 0xff, %g0 /* IEU1 Group */ - bne,a,pt %icc, 16b /* CTI */ - lduba [%o0] ASI_PNF, %o3 /* Load */ - - add %o0, -1, %o0 /* IEU0 Group */ - retl /* CTI+IEU1 Group */ - sub %o0, %o1, %o0 /* IEU0 */ -END(strlen) -libc_hidden_def(strlen) diff --git a/libc/string/stpcpy.c b/libc/string/stpcpy.c index 8a487584e..2fd2c0648 100644 --- a/libc/string/stpcpy.c +++ b/libc/string/stpcpy.c @@ -10,19 +10,13 @@ #ifdef WANT_WIDE # define Wstpcpy wcpcpy #else -/* Experimentally off - libc_hidden_proto(stpcpy) */ +# undef stpcpy # define Wstpcpy stpcpy #endif Wchar *Wstpcpy(register Wchar * __restrict s1, const Wchar * __restrict s2) { -#ifdef __BCC__ - do { - *s1 = *s2++; - } while (*s1++ != 0); -#else while ( (*s1++ = *s2++) != 0 ); -#endif return s1 - 1; } diff --git a/libc/string/stpncpy.c b/libc/string/stpncpy.c index dac8471fd..50d83a131 100644 --- a/libc/string/stpncpy.c +++ b/libc/string/stpncpy.c @@ -10,7 +10,6 @@ #ifdef WANT_WIDE # define Wstpncpy wcpncpy #else -/* Experimentally off - libc_hidden_proto(stpncpy) */ # define Wstpncpy stpncpy #endif @@ -21,22 +20,10 @@ Wchar *Wstpncpy(register Wchar * __restrict s1, Wchar *s = s1; const Wchar *p = s2; -#ifdef __BCC__ - while (n--) { - if ((*s = *s2) != 0) s2++; /* Need to fill tail with 0s. */ - ++s; - } - return s1 + (s2 - p); -#else while (n) { if ((*s = *s2) != 0) s2++; /* Need to fill tail with 0s. 
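stpcpy and stpncpy above differ from strcpy/strncpy only in their return value: a pointer to the copied string's terminating NUL (or, for stpncpy, to dst + n when the source did not fit) instead of the start of the destination. That is what makes chained copies cheap, since nothing already written is re-scanned. A small usage sketch, assuming the buffer is large enough for the strings used here:

#include <string.h>
#include <stdio.h>

int main(void)
{
    char path[64];          /* assumed big enough for this example */
    char *p = path;

    /* Each call resumes where the previous one ended, so the growing
       string is never re-scanned the way repeated strcat would. */
    p = stpcpy(p, "/usr");
    p = stpcpy(p, "/lib");
    p = stpcpy(p, "/uclibc");

    printf("%s (%zu bytes used)\n", path, (size_t)(p - path) + 1);
    return 0;
}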
*/ ++s; --n; } return s1 + (s2 - p); -#endif } - -#ifndef WANT_WIDE -libc_hidden_def(stpncpy) -#endif diff --git a/libc/string/strcasecmp.c b/libc/string/strcasecmp.c index f9852236b..f894e426e 100644 --- a/libc/string/strcasecmp.c +++ b/libc/string/strcasecmp.c @@ -12,28 +12,15 @@ #ifdef WANT_WIDE # define strcasecmp wcscasecmp # define strcasecmp_l wcscasecmp_l -libc_hidden_proto(wcscasecmp) -# if defined(__USE_GNU) && defined(__UCLIBC_HAS_XLOCALE__) -libc_hidden_proto(wcscasecmp_l) -# endif # ifdef __UCLIBC_DO_XLOCALE -libc_hidden_proto(towlower_l) # define TOLOWER(C) towlower_l((C), locale_arg) # else -libc_hidden_proto(towlower) # define TOLOWER(C) towlower((C)) # endif #else -/* Experimentally off - libc_hidden_proto(strcasecmp) */ -/* Experimentally off - libc_hidden_proto(strcasecmp_l) */ # ifdef __UCLIBC_DO_XLOCALE -libc_hidden_proto(tolower_l) # define TOLOWER(C) tolower_l((C), locale_arg) # else -#if !defined __UCLIBC_HAS_XLOCALE__ && defined __UCLIBC_HAS_CTYPE_TABLES__ -libc_hidden_proto(__ctype_tolower) -#endif -libc_hidden_proto(tolower) # define TOLOWER(C) tolower((C)) # endif #endif @@ -44,11 +31,12 @@ int strcasecmp(register const Wchar *s1, register const Wchar *s2) { return strcasecmp_l(s1, s2, __UCLIBC_CURLOCALE); } +#ifndef WANT_WIDE libc_hidden_def(strcasecmp) +#endif #else /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */ -/* Experimentally off - libc_hidden_proto(__XL_NPP(strcasecmp)) */ int __XL_NPP(strcasecmp)(register const Wchar *s1, register const Wchar *s2 __LOCALE_PARAM ) { @@ -73,6 +61,8 @@ int __XL_NPP(strcasecmp)(register const Wchar *s1, register const Wchar *s2 return r; #endif } +#if !defined WANT_WIDE || (defined WANT_WIDE && defined __UCLIBC_DO_XLOCALE) libc_hidden_def(__XL_NPP(strcasecmp)) +#endif #endif /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */ diff --git a/libc/string/strcasestr.c b/libc/string/strcasestr.c index 2671b4b98..3334086bf 100644 --- a/libc/string/strcasestr.c +++ b/libc/string/strcasestr.c @@ -8,13 +8,6 @@ #include "_string.h" #include <ctype.h> -#ifdef __UCLIBC_HAS_XLOCALE__ -libc_hidden_proto(__ctype_tolower_loc) -#elif defined __UCLIBC_HAS_CTYPE_TABLES__ -libc_hidden_proto(__ctype_tolower) -#endif -libc_hidden_proto(tolower) - char *strcasestr(const char *s1, const char *s2) { register const char *s = s1; diff --git a/libc/string/strcat.c b/libc/string/strcat.c index 40a9be111..63619bcc8 100644 --- a/libc/string/strcat.c +++ b/libc/string/strcat.c @@ -13,8 +13,6 @@ # define Wstrcat strcat #endif -libc_hidden_proto(Wstrcat) - Wchar *Wstrcat(Wchar * __restrict s1, register const Wchar * __restrict s2) { register Wchar *s = s1; diff --git a/libc/string/strchr.c b/libc/string/strchr.c index 329545e9f..7ea477362 100644 --- a/libc/string/strchr.c +++ b/libc/string/strchr.c @@ -13,8 +13,6 @@ # define Wstrchr strchr #endif -libc_hidden_proto(Wstrchr) - Wchar *Wstrchr(register const Wchar *s, Wint c) { do { @@ -25,8 +23,9 @@ Wchar *Wstrchr(register const Wchar *s, Wint c) return NULL; } -libc_hidden_def(Wstrchr) - -#if !defined WANT_WIDE && defined __UCLIBC_SUSV3_LEGACY__ +#ifndef WANT_WIDE +libc_hidden_def(strchr) +# ifdef __UCLIBC_SUSV3_LEGACY__ weak_alias(strchr,index) +# endif #endif diff --git a/libc/string/strchrnul.c b/libc/string/strchrnul.c index 6fe7f6c3d..9c10e1fc8 100644 --- a/libc/string/strchrnul.c +++ b/libc/string/strchrnul.c @@ -15,13 +15,13 @@ # define Wstrchrnul strchrnul #endif -libc_hidden_proto(Wstrchrnul) - Wchar *Wstrchrnul(register const Wchar *s, Wint c) { --s; while 
(*++s && (*s != ((Wchar)c))); return (Wchar *) s; } -libc_hidden_def(Wstrchrnul) +# ifndef WANT_WIDE +libc_hidden_def(strchrnul) +# endif #endif diff --git a/libc/string/strcmp.c b/libc/string/strcmp.c index 5477adf3a..abae61812 100644 --- a/libc/string/strcmp.c +++ b/libc/string/strcmp.c @@ -15,8 +15,6 @@ # define Wstrcoll strcoll #endif -libc_hidden_proto(Wstrcmp) - int Wstrcmp(register const Wchar *s1, register const Wchar *s2) { #ifdef WANT_WIDE @@ -40,7 +38,6 @@ int Wstrcmp(register const Wchar *s1, register const Wchar *s2) libc_hidden_def(Wstrcmp) #ifndef __UCLIBC_HAS_LOCALE__ -libc_hidden_proto(Wstrcoll) strong_alias(Wstrcmp,Wstrcoll) libc_hidden_def(Wstrcoll) #endif diff --git a/libc/string/strcpy.c b/libc/string/strcpy.c index cda4094ac..549360c22 100644 --- a/libc/string/strcpy.c +++ b/libc/string/strcpy.c @@ -13,20 +13,15 @@ # define Wstrcpy strcpy #endif -libc_hidden_proto(Wstrcpy) - Wchar *Wstrcpy(Wchar * __restrict s1, const Wchar * __restrict s2) { register Wchar *s = s1; -#ifdef __BCC__ - do { - *s = *s2++; - } while (*s++ != 0); -#else while ( (*s++ = *s2++) != 0 ); -#endif return s1; } -libc_hidden_def(Wstrcpy) + +#ifndef WANT_WIDE +libc_hidden_def(strcpy) +#endif diff --git a/libc/string/strcspn.c b/libc/string/strcspn.c index 1ec460a15..0466af99b 100644 --- a/libc/string/strcspn.c +++ b/libc/string/strcspn.c @@ -10,7 +10,6 @@ #ifdef WANT_WIDE # define Wstrcspn wcscspn #else -/* Experimentally off - libc_hidden_proto(strcspn) */ # define Wstrcspn strcspn #endif diff --git a/libc/string/strdup.c b/libc/string/strdup.c index 61fc186c8..049a23f63 100644 --- a/libc/string/strdup.c +++ b/libc/string/strdup.c @@ -9,16 +9,12 @@ #include <stdlib.h> #ifdef WANT_WIDE -libc_hidden_proto(wcslen) # define Wstrdup wcsdup # define Wstrlen wcslen #else -/* Experimentally off - libc_hidden_proto(strdup) */ -/* Experimentally off - libc_hidden_proto(strlen) */ # define Wstrdup strdup # define Wstrlen strlen #endif -/* Experimentally off - libc_hidden_proto(memcpy) */ Wchar *Wstrdup(register const Wchar *s1) { diff --git a/libc/string/strerror.c b/libc/string/strerror.c index 355c7bdda..7250da07d 100644 --- a/libc/string/strerror.c +++ b/libc/string/strerror.c @@ -9,8 +9,6 @@ #include <string.h> #include "_syserrmsg.h" -/* Experimentally off - libc_hidden_proto(strerror) */ -libc_hidden_proto(__xpg_strerror_r) char *strerror(int errnum) { diff --git a/libc/string/strlcpy.c b/libc/string/strlcpy.c index cdad4dc5d..83787049a 100644 --- a/libc/string/strlcpy.c +++ b/libc/string/strlcpy.c @@ -11,21 +11,14 @@ # define Wstrlcpy __wcslcpy # define Wstrxfrm wcsxfrm #else -/* Experimentally off - libc_hidden_proto(strlcpy) */ # define Wstrlcpy strlcpy # define Wstrxfrm strxfrm #endif - /* OpenBSD function: * Copy at most n-1 chars from src to dst and nul-terminate dst. * Returns strlen(src), so truncation occurred if the return value is >= n. 
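Since strlcpy always NUL-terminates within n bytes and returns strlen(src), as the comment above states, a caller can detect truncation with a single comparison against the buffer size. A minimal usage sketch (strlcpy is the BSD extension this file provides; the destination size here is arbitrary):

#include <string.h>
#include <stdio.h>

int main(void)
{
    char dst[8];
    const char *src = "a string much longer than eight bytes";

    size_t needed = strlcpy(dst, src, sizeof(dst));
    if (needed >= sizeof(dst))
        fprintf(stderr, "truncated: wanted %zu bytes, kept \"%s\"\n",
                needed + 1, dst);
    return 0;
}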
*/ -#ifdef WANT_WIDE -size_t Wstrlcpy(register Wchar *__restrict dst, - register const Wchar *__restrict src, - size_t n) attribute_hidden; -#endif size_t Wstrlcpy(register Wchar *__restrict dst, register const Wchar *__restrict src, size_t n) @@ -51,13 +44,8 @@ size_t Wstrlcpy(register Wchar *__restrict dst, } #ifndef WANT_WIDE libc_hidden_def(strlcpy) -#ifndef __UCLIBC_HAS_LOCALE__ -/* Experimentally off - libc_hidden_proto(strxfrm) */ -strong_alias(strlcpy,strxfrm) -libc_hidden_def(strxfrm) #endif -#else + #ifndef __UCLIBC_HAS_LOCALE__ -strong_alias(__wcslcpy,wcsxfrm) -#endif +strong_alias(Wstrlcpy,Wstrxfrm) #endif diff --git a/libc/string/strlen.c b/libc/string/strlen.c index 2edb6e4e8..021a8cabc 100644 --- a/libc/string/strlen.c +++ b/libc/string/strlen.c @@ -13,8 +13,6 @@ # define Wstrlen strlen #endif -libc_hidden_proto(Wstrlen) - size_t Wstrlen(const Wchar *s) { register const Wchar *p; diff --git a/libc/string/strncasecmp.c b/libc/string/strncasecmp.c index ed052fa21..2eac47dd4 100644 --- a/libc/string/strncasecmp.c +++ b/libc/string/strncasecmp.c @@ -12,28 +12,15 @@ #ifdef WANT_WIDE # define strncasecmp wcsncasecmp # define strncasecmp_l wcsncasecmp_l -libc_hidden_proto(wcsncasecmp) -# if defined(__USE_GNU) && defined(__UCLIBC_HAS_XLOCALE__) -libc_hidden_proto(wcsncasecmp_l) -# endif # ifdef __UCLIBC_DO_XLOCALE -libc_hidden_proto(towlower_l) # define TOLOWER(C) towlower_l((C), locale_arg) # else -libc_hidden_proto(towlower) # define TOLOWER(C) towlower((C)) # endif #else -/* Experimentally off - libc_hidden_proto(strncasecmp) */ -/* Experimentally off - libc_hidden_proto(strncasecmp_l) */ # ifdef __UCLIBC_DO_XLOCALE -libc_hidden_proto(tolower_l) # define TOLOWER(C) tolower_l((C), locale_arg) # else -#if !defined __UCLIBC_HAS_XLOCALE__ && defined __UCLIBC_HAS_CTYPE_TABLES__ -libc_hidden_proto(__ctype_tolower) -#endif -libc_hidden_proto(tolower) # define TOLOWER(C) tolower((C)) # endif #endif @@ -44,11 +31,12 @@ int strncasecmp(register const Wchar *s1, register const Wchar *s2, size_t n) { return strncasecmp_l(s1, s2, n, __UCLIBC_CURLOCALE); } +#ifndef WANT_WIDE libc_hidden_def(strncasecmp) +#endif #else /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */ -/* Experimentally off - libc_hidden_proto(__XL_NPP(strncasecmp)) */ int __XL_NPP(strncasecmp)(register const Wchar *s1, register const Wchar *s2, size_t n __LOCALE_PARAM ) { @@ -76,6 +64,8 @@ int __XL_NPP(strncasecmp)(register const Wchar *s1, register const Wchar *s2, return r; #endif } +#if !defined WANT_WIDE || (defined WANT_WIDE && defined __UCLIBC_DO_XLOCALE) libc_hidden_def(__XL_NPP(strncasecmp)) +#endif #endif /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */ diff --git a/libc/string/strncat.c b/libc/string/strncat.c index 0180d1328..0fa9b4ae1 100644 --- a/libc/string/strncat.c +++ b/libc/string/strncat.c @@ -10,7 +10,6 @@ #ifdef WANT_WIDE # define Wstrncat wcsncat #else -/* Experimentally off - libc_hidden_proto(strncat) */ # define Wstrncat strncat #endif @@ -21,14 +20,10 @@ Wchar *Wstrncat(Wchar * __restrict s1, register const Wchar * __restrict s2, while (*s++); --s; -#ifdef __BCC__ - while (n-- && ((*s = *s2++) != 0)) ++s; -#else while (n && ((*s = *s2++) != 0)) { --n; ++s; } -#endif *s = 0; return s1; diff --git a/libc/string/strncmp.c b/libc/string/strncmp.c index 59e4a2c22..2da61771c 100644 --- a/libc/string/strncmp.c +++ b/libc/string/strncmp.c @@ -10,7 +10,6 @@ #ifdef WANT_WIDE # define Wstrncmp wcsncmp #else -/* Experimentally off - libc_hidden_proto(strncmp) */ # define 
Wstrncmp strncmp #endif diff --git a/libc/string/strncpy.c b/libc/string/strncpy.c index d93561294..4a44e1f02 100644 --- a/libc/string/strncpy.c +++ b/libc/string/strncpy.c @@ -10,7 +10,6 @@ #ifdef WANT_WIDE # define Wstrncpy wcsncpy #else -/* Experimentally off - libc_hidden_proto(strncpy) */ # define Wstrncpy strncpy #endif @@ -19,18 +18,11 @@ Wchar *Wstrncpy(Wchar * __restrict s1, register const Wchar * __restrict s2, { register Wchar *s = s1; -#ifdef __BCC__ - while (n--) { - if ((*s = *s2) != 0) s2++; /* Need to fill tail with 0s. */ - ++s; - } -#else while (n) { if ((*s = *s2) != 0) s2++; /* Need to fill tail with 0s. */ ++s; --n; } -#endif return s1; } diff --git a/libc/string/strndup.c b/libc/string/strndup.c index 96a36d404..8e608669c 100644 --- a/libc/string/strndup.c +++ b/libc/string/strndup.c @@ -8,9 +8,6 @@ #include "_string.h" #include <stdlib.h> -/* Experimentally off - libc_hidden_proto(strndup) */ -/* Experimentally off - libc_hidden_proto(strnlen) */ -/* Experimentally off - libc_hidden_proto(memcpy) */ char *strndup(register const char *s1, size_t n) { diff --git a/libc/string/strnlen.c b/libc/string/strnlen.c index 8fbc25c11..08de0887d 100644 --- a/libc/string/strnlen.c +++ b/libc/string/strnlen.c @@ -15,26 +15,17 @@ # define Wstrnlen strnlen #endif -libc_hidden_proto(Wstrnlen) - size_t Wstrnlen(const Wchar *s, size_t max) { register const Wchar *p = s; -#ifdef __BCC__ - /* bcc can optimize the counter if it thinks it is a pointer... */ - register const char *maxp = (const char *) max; -#else -# define maxp max -#endif - while (maxp && *p) { + while (max && *p) { ++p; - --maxp; + --max; } return p - s; } -#undef maxp libc_hidden_def(Wstrnlen) #endif diff --git a/libc/string/strpbrk.c b/libc/string/strpbrk.c index abeb84380..ddfc75172 100644 --- a/libc/string/strpbrk.c +++ b/libc/string/strpbrk.c @@ -13,8 +13,6 @@ # define Wstrpbrk strpbrk #endif -libc_hidden_proto(Wstrpbrk) - Wchar *Wstrpbrk(const Wchar *s1, const Wchar *s2) { register const Wchar *s; diff --git a/libc/string/strrchr.c b/libc/string/strrchr.c index 253c4166d..db12bbc7c 100644 --- a/libc/string/strrchr.c +++ b/libc/string/strrchr.c @@ -10,7 +10,6 @@ #ifdef WANT_WIDE # define Wstrrchr wcsrchr #else -/* Experimentally off - libc_hidden_proto(strrchr) */ # define Wstrrchr strrchr #endif diff --git a/libc/string/strsep.c b/libc/string/strsep.c index 373b00a71..ce17dcf89 100644 --- a/libc/string/strsep.c +++ b/libc/string/strsep.c @@ -9,10 +9,7 @@ #ifdef __USE_BSD -/* Experimentally off - libc_hidden_proto(strpbrk) */ -/* Experimentally off - libc_hidden_proto(strcspn) */ -/* Experimentally off - libc_hidden_proto(strsep) */ char *strsep(char ** __restrict s1, const char * __restrict s2) { register char *s = *s1; diff --git a/libc/string/strsignal.c b/libc/string/strsignal.c index ee083d649..0fbbf8504 100644 --- a/libc/string/strsignal.c +++ b/libc/string/strsignal.c @@ -18,16 +18,13 @@ #include <bits/uClibc_uintmaxtostr.h> #include <signal.h> -/* Experimentally off - libc_hidden_proto(strsignal) */ -/* Experimentally off - libc_hidden_proto(memcpy) */ - #define _SYS_NSIG 32 #ifdef __UCLIBC_HAS_SIGNUM_MESSAGES__ # define _SYS_SIGMSG_MAXLEN 25 -#else /* __UCLIBC_HAS_SIGNUM_MESSAGES__ */ +#else # define _SYS_SIGMSG_MAXLEN 0 -#endif /* __UCLIBC_HAS_SIGNUM_MESSAGES__ */ +#endif #if _SYS_SIGMSG_MAXLEN < __UIM_BUFLEN_INT + 15 # define _STRSIGNAL_BUFSIZE (__UIM_BUFLEN_INT + 15) @@ -85,16 +82,16 @@ static const unsigned char sstridx[] = { char *strsignal(int signum) { - register char *s; - int i; - static char 
buf[_STRSIGNAL_BUFSIZE]; - static const char unknown[] = { + register char *s; + int i; + static char buf[_STRSIGNAL_BUFSIZE]; + static const char unknown[] = { 'U', 'n', 'k', 'n', 'o', 'w', 'n', ' ', 's', 'i', 'g', 'n', 'a', 'l', ' ' - }; + }; #if defined(__alpha__) || defined(__mips__) || defined(__hppa__) || defined(__sparc__) /* Need to translate signum to string index. */ - for (i = 0 ; i < sizeof(sstridx)/sizeof(sstridx[0]) ; i++) { + for (i = 0; i < sizeof(sstridx)/sizeof(sstridx[0]); i++) { if (sstridx[i] == signum) { goto GOT_SSTRIDX; } @@ -106,12 +103,12 @@ char *strsignal(int signum) i = signum; #endif - if (((unsigned int) signum) < _SYS_NSIG) { + if (((unsigned int) signum) < _SYS_NSIG) { /* Trade time for space. This function should rarely be called * so rather than keeping an array of pointers for the different * messages, just run through the buffer until we find the * correct string. */ - for (s = (char *) _string_syssigmsgs ; i ; ++s) { + for (s = (char *) _string_syssigmsgs; i; ++s) { if (!*s) { --i; } @@ -119,10 +116,10 @@ char *strsignal(int signum) if (*s) { /* Make sure we have an actual message. */ goto DONE; } - } + } - s = _int10tostr(buf+sizeof(buf)-1, signum) - sizeof(unknown); - memcpy(s, unknown, sizeof(unknown)); + s = _int10tostr(buf + sizeof(buf)-1, signum) - sizeof(unknown); + memcpy(s, unknown, sizeof(unknown)); DONE: return s; @@ -132,13 +129,12 @@ char *strsignal(int signum) char *strsignal(int signum) { - static char buf[_STRSIGNAL_BUFSIZE]; - static const char unknown[] = { + static char buf[_STRSIGNAL_BUFSIZE]; + static const char unknown[] = { 'U', 'n', 'k', 'n', 'o', 'w', 'n', ' ', 's', 'i', 'g', 'n', 'a', 'l', ' ' - }; + }; - return (char *) memcpy(_int10tostr(buf+sizeof(buf)-1, signum) - - sizeof(unknown), + return memcpy(_int10tostr(buf + sizeof(buf)-1, signum) - sizeof(unknown), unknown, sizeof(unknown)); } diff --git a/libc/string/strspn.c b/libc/string/strspn.c index ca83ef900..942b6f308 100644 --- a/libc/string/strspn.c +++ b/libc/string/strspn.c @@ -13,8 +13,6 @@ # define Wstrspn strspn #endif -libc_hidden_proto(Wstrspn) - size_t Wstrspn(const Wchar *s1, const Wchar *s2) { register const Wchar *s = s1; diff --git a/libc/string/strstr.c b/libc/string/strstr.c index 05712e62b..7e2a64e7d 100644 --- a/libc/string/strstr.c +++ b/libc/string/strstr.c @@ -10,7 +10,6 @@ #ifdef WANT_WIDE # define Wstrstr wcsstr #else -/* Experimentally off - libc_hidden_proto(strstr) */ # define Wstrstr strstr #endif @@ -39,6 +38,6 @@ Wchar *Wstrstr(const Wchar *s1, const Wchar *s2) } #ifndef WANT_WIDE libc_hidden_def(strstr) -#else +#elif defined __UCLIBC_SUSV3_LEGACY__ strong_alias(wcsstr,wcswcs) #endif diff --git a/libc/string/strtok.c b/libc/string/strtok.c index 159dd6b6a..c337d81a7 100644 --- a/libc/string/strtok.c +++ b/libc/string/strtok.c @@ -15,7 +15,6 @@ # define Wstrtok_r strtok_r #endif -/* Experimentally off - libc_hidden_proto(Wstrtok_r) */ Wchar *Wstrtok(Wchar * __restrict s1, const Wchar * __restrict s2) { diff --git a/libc/string/strtok_r.c b/libc/string/strtok_r.c index 2ad7746b1..2026888f8 100644 --- a/libc/string/strtok_r.c +++ b/libc/string/strtok_r.c @@ -8,15 +8,10 @@ #include "_string.h" #ifdef WANT_WIDE -libc_hidden_proto(wcsspn) -libc_hidden_proto(wcspbrk) # define Wstrtok_r wcstok # define Wstrspn wcsspn # define Wstrpbrk wcspbrk #else -/* Experimentally off - libc_hidden_proto(strtok_r) */ -/* Experimentally off - libc_hidden_proto(strspn) */ -/* Experimentally off - libc_hidden_proto(strpbrk) */ # define Wstrtok_r strtok_r # define 
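The strsignal loop above ("Trade time for space") finds message i by walking one flat buffer of NUL-separated strings and counting terminators, instead of keeping a table of pointers. A standalone C sketch of the same lookup; the table contents below are invented for illustration and are not the actual _string_syssigmsgs data:

#include <stdio.h>

/* Messages stored back to back, each ending in '\0'. */
static const char packed_msgs[] =
    "Hangup\0Interrupt\0Quit\0Illegal instruction";

static const char *packed_lookup(const char *table, int i)
{
    const char *s = table;
    while (i) {             /* skip i terminators */
        if (*s == '\0')
            --i;
        ++s;
    }
    return s;               /* start of message i */
}

int main(void)
{
    printf("%s\n", packed_lookup(packed_msgs, 2));   /* prints "Quit" */
    return 0;
}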
Wstrspn strspn # define Wstrpbrk strpbrk diff --git a/libc/string/strverscmp.c b/libc/string/strverscmp.c new file mode 100644 index 000000000..7818a9186 --- /dev/null +++ b/libc/string/strverscmp.c @@ -0,0 +1,106 @@ +/* Compare strings while treating digits characters numerically. + Copyright (C) 1997-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jean-François Bignolles <bignolle@ecoledoc.ibp.fr>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdint.h> +#include <string.h> +#include <ctype.h> + +/* states: S_N: normal, S_I: comparing integral part, S_F: comparing + fractionnal parts, S_Z: idem but with leading Zeroes only */ +#define S_N 0x0 +#define S_I 0x3 +#define S_F 0x6 +#define S_Z 0x9 + +/* result_type: CMP: return diff; LEN: compare using len_diff/diff */ +#define CMP 2 +#define LEN 3 + + +/* Compare S1 and S2 as strings holding indices/version numbers, + returning less than, equal to or greater than zero if S1 is less than, + equal to or greater than S2 (for more info, see the texinfo doc). +*/ + +int strverscmp (const char *s1, const char *s2) +{ + const unsigned char *p1 = (const unsigned char *) s1; + const unsigned char *p2 = (const unsigned char *) s2; + + /* Symbol(s) 0 [1-9] others + Transition (10) 0 (01) d (00) x */ + static const uint8_t next_state[] = + { + /* state x d 0 */ + /* S_N */ S_N, S_I, S_Z, + /* S_I */ S_N, S_I, S_I, + /* S_F */ S_N, S_F, S_F, + /* S_Z */ S_N, S_F, S_Z + }; + + static const int8_t result_type[] = + { + /* state x/x x/d x/0 d/x d/d d/0 0/x 0/d 0/0 */ + + /* S_N */ CMP, CMP, CMP, CMP, LEN, CMP, CMP, CMP, CMP, + /* S_I */ CMP, -1, -1, +1, LEN, LEN, +1, LEN, LEN, + /* S_F */ CMP, CMP, CMP, CMP, CMP, CMP, CMP, CMP, CMP, + /* S_Z */ CMP, +1, +1, -1, CMP, CMP, -1, CMP, CMP + }; + unsigned char c1, c2; + int state, diff; + + if (p1 == p2) + return 0; + + c1 = *p1++; + c2 = *p2++; + /* Hint: '0' is a digit too. */ + state = S_N + ((c1 == '0') + (isdigit (c1) != 0)); + + while ((diff = c1 - c2) == 0) + { + if (c1 == '\0') + return diff; + + state = next_state[state]; + c1 = *p1++; + c2 = *p2++; + state += (c1 == '0') + (isdigit (c1) != 0); + } + + state = result_type[state * 3 + (((c2 == '0') + (isdigit (c2) != 0)))]; + + switch (state) + { + case CMP: + return diff; + + case LEN: + while (isdigit (*p1++)) + if (!isdigit (*p2++)) + return 1; + + return isdigit (*p2) ? 
-1 : diff; + + default: + return state; + } +} +libc_hidden_def(strverscmp) diff --git a/libc/string/sys_errlist.c b/libc/string/sys_errlist.c index 17ed4d62c..682ff0e7e 100644 --- a/libc/string/sys_errlist.c +++ b/libc/string/sys_errlist.c @@ -12,8 +12,6 @@ extern const char _string_syserrmsgs[] attribute_hidden; #ifdef __UCLIBC_HAS_SYS_ERRLIST__ -link_warning(_sys_errlist, "sys_nerr and sys_errlist are obsolete and uClibc support for them (in at least some configurations) will probably be unavailable in the near future.") - const char *const sys_errlist[] = { [0] = _string_syserrmsgs + 0, [EPERM] = _string_syserrmsgs + 8, diff --git a/libc/string/x86_64/bzero.S b/libc/string/x86_64/bzero.S index 4d179ec4e..231d7cb41 100644 --- a/libc/string/x86_64/bzero.S +++ b/libc/string/x86_64/bzero.S @@ -1,5 +1,6 @@ #include <features.h> #ifdef __UCLIBC_SUSV3_LEGACY__ # define memset bzero +# define __memset_chk __bzero_chk # include "memset.S" #endif diff --git a/libc/string/x86_64/memcpy.S b/libc/string/x86_64/memcpy.S index 697b992d0..e164278df 100644 --- a/libc/string/x86_64/memcpy.S +++ b/libc/string/x86_64/memcpy.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include "_glibc_inc.h" @@ -26,7 +25,7 @@ #define MEMPCPY_P (defined memcpy) .text -#if defined PIC && !defined NOT_IN_libc +#if defined __PIC__ && !defined NOT_IN_libc && defined __UCLIBC_HAS_FORTIFY__ ENTRY (__memcpy_chk) cmpq %rdx, %rcx jb HIDDEN_JUMPTARGET (__chk_fail) diff --git a/libc/string/x86_64/mempcpy.S b/libc/string/x86_64/mempcpy.S index 3816d9f72..b0607aa57 100644 --- a/libc/string/x86_64/mempcpy.S +++ b/libc/string/x86_64/mempcpy.S @@ -1,3 +1,4 @@ #define memcpy mempcpy +#define __memcpy_chk __mempcpy_chk #include "memcpy.S" libc_hidden_def(mempcpy) diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S index 46751006b..d6744129d 100644 --- a/libc/string/x86_64/memset.S +++ b/libc/string/x86_64/memset.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include "_glibc_inc.h" @@ -29,7 +28,7 @@ #define LARGE $120000 .text -#if !BZERO_P && defined PIC && !defined NOT_IN_libc +#if defined __PIC__ && !defined NOT_IN_libc && defined __UCLIBC_HAS_FORTIFY__ ENTRY (__memset_chk) cmpq %rdx, %rcx jb HIDDEN_JUMPTARGET (__chk_fail) @@ -142,6 +141,6 @@ END (memset) libc_hidden_def(memset) #endif -#if !BZERO_P && defined PIC && !defined NOT_IN_libc +#if !BZERO_P && defined __PIC__ && !defined NOT_IN_libc && defined __UCLIBC_HAS_FORTIFY__ strong_alias (__memset_chk, __memset_zero_constant_len_parameter) #endif diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S index 23d068fea..55e09e5f1 100644 --- a/libc/string/x86_64/strcat.S +++ b/libc/string/x86_64/strcat.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. 
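The strverscmp added above orders embedded digit runs numerically, which is what the S_N/S_I/S_F/S_Z state machine in its header comment implements; runs of leading zeroes are ordered as if they were a fractional part. A small usage sketch (strverscmp is a GNU extension, so _GNU_SOURCE is needed for its prototype):

#define _GNU_SOURCE
#include <string.h>
#include <stdio.h>

int main(void)
{
    /* Byte-wise strcmp puts "item#100" before "item#99" because '1' < '9';
       strverscmp compares the digit runs as numbers. */
    printf("%d\n", strcmp("item#99", "item#100") > 0);      /* 1 */
    printf("%d\n", strverscmp("item#99", "item#100") < 0);  /* 1 */

    /* Leading zeroes act like a fractional part, so "000" sorts
       before "09", which in turn sorts before "0". */
    printf("%d\n", strverscmp("000", "09") < 0);             /* 1 */
    return 0;
}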
You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include "_glibc_inc.h" diff --git a/libc/string/x86_64/strchr.S b/libc/string/x86_64/strchr.S index 9ef46b7f2..256b97911 100644 --- a/libc/string/x86_64/strchr.S +++ b/libc/string/x86_64/strchr.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include "_glibc_inc.h" diff --git a/libc/string/x86_64/strcmp.S b/libc/string/x86_64/strcmp.S index 437e145bf..05d6f39c1 100644 --- a/libc/string/x86_64/strcmp.S +++ b/libc/string/x86_64/strcmp.S @@ -15,9 +15,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include "_glibc_inc.h" diff --git a/libc/string/x86_64/strcpy.S b/libc/string/x86_64/strcpy.S index 612a30d1a..3ada70fbd 100644 --- a/libc/string/x86_64/strcpy.S +++ b/libc/string/x86_64/strcpy.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include "_glibc_inc.h" diff --git a/libc/string/x86_64/strcspn.S b/libc/string/x86_64/strcspn.S index fd9b09c48..7a06c8867 100644 --- a/libc/string/x86_64/strcspn.S +++ b/libc/string/x86_64/strcspn.S @@ -19,9 +19,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include "_glibc_inc.h" diff --git a/libc/string/x86_64/strlen.S b/libc/string/x86_64/strlen.S index 4213f0ab6..9e84326c2 100644 --- a/libc/string/x86_64/strlen.S +++ b/libc/string/x86_64/strlen.S @@ -14,9 +14,8 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include "_glibc_inc.h" diff --git a/libc/string/x86_64/strspn.S b/libc/string/x86_64/strspn.S index 41cff0490..366377649 100644 --- a/libc/string/x86_64/strspn.S +++ b/libc/string/x86_64/strspn.S @@ -19,9 +19,8 @@ Lesser General Public License for more details. 
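The __memcpy_chk and __memset_chk entry points now guarded by __UCLIBC_HAS_FORTIFY__ in the x86_64 hunks above implement the fortify contract: the fourth argument is the compiler-known size of the destination object, and the call is diverted to __chk_fail when the requested length exceeds it. Roughly, in C (a sketch of the behaviour, not the uClibc source; abort stands in for __chk_fail here):

#include <string.h>
#include <stdlib.h>

/* Sketch of what the two-instruction prologue (cmpq %rdx, %rcx;
   jb __chk_fail) does before falling into the ordinary memcpy. */
void *memcpy_chk_sketch(void *dest, const void *src,
                        size_t len, size_t destlen)
{
    if (destlen < len)
        abort();            /* real code jumps to __chk_fail */
    return memcpy(dest, src, len);
}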
You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ #include "_glibc_inc.h" diff --git a/libc/string/xtensa/memcpy.S b/libc/string/xtensa/memcpy.S index 19f3a6818..244205611 100644 --- a/libc/string/xtensa/memcpy.S +++ b/libc/string/xtensa/memcpy.S @@ -13,11 +13,10 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 51 Franklin Street - Fifth Floor, - Boston, MA 02110-1301, USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ -#include "../../sysdeps/linux/xtensa/sysdep.h" +#include <sysdep.h> #include <bits/xtensa-config.h> .macro src_b r, w0, w1 @@ -83,7 +82,7 @@ __memcpy_aux: loopnez a4, 2f #else beqz a4, 2f - add a7, a3, a4 // a7 = end address for source + add a7, a3, a4 /* a7 = end address for source */ #endif 1: l8ui a6, a3, 0 addi a3, a3, 1 @@ -92,13 +91,13 @@ __memcpy_aux: #if !XCHAL_HAVE_LOOPS blt a3, a7, 1b #endif -2: retw +2: abi_ret /* Destination is unaligned. */ .align 4 -.Ldst1mod2: // dst is only byte aligned +.Ldst1mod2: /* dst is only byte aligned */ /* Do short copies byte-by-byte. */ _bltui a4, 7, .Lbytecopy @@ -113,7 +112,7 @@ __memcpy_aux: /* Return to main algorithm if dst is now aligned. */ _bbci.l a5, 1, .Ldstaligned -.Ldst2mod4: // dst has 16-bit alignment +.Ldst2mod4: /* dst has 16-bit alignment */ /* Do short copies byte-by-byte. */ _bltui a4, 6, .Lbytecopy @@ -134,7 +133,7 @@ __memcpy_aux: ENTRY (memcpy) /* a2 = dst, a3 = src, a4 = len */ - mov a5, a2 // copy dst so that a2 is return value + mov a5, a2 /* copy dst so that a2 is return value */ _bbsi.l a2, 0, .Ldst1mod2 _bbsi.l a2, 1, .Ldst2mod4 .Ldstaligned: @@ -152,7 +151,7 @@ ENTRY (memcpy) #else beqz a7, 2f slli a8, a7, 4 - add a8, a8, a3 // a8 = end of last 16B source chunk + add a8, a8, a3 /* a8 = end of last 16B source chunk */ #endif 1: l32i a6, a3, 0 l32i a7, a3, 4 @@ -182,7 +181,7 @@ ENTRY (memcpy) 3: bbsi.l a4, 2, 4f bbsi.l a4, 1, 5f bbsi.l a4, 0, 6f - retw + abi_ret /* Copy 4 bytes. */ 4: l32i a6, a3, 0 @@ -191,7 +190,7 @@ ENTRY (memcpy) addi a5, a5, 4 bbsi.l a4, 1, 5f bbsi.l a4, 0, 6f - retw + abi_ret /* Copy 2 bytes. */ 5: l16ui a6, a3, 0 @@ -199,14 +198,14 @@ ENTRY (memcpy) s16i a6, a5, 0 addi a5, a5, 2 bbsi.l a4, 0, 6f - retw + abi_ret /* Copy 1 byte. */ 6: l8ui a6, a3, 0 s8i a6, a5, 0 .Ldone: - retw + abi_ret /* Destination is aligned; source is unaligned. */ @@ -218,18 +217,18 @@ ENTRY (memcpy) /* Copy 16 bytes per iteration for word-aligned dst and unaligned src. 
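The loop that follows in the xtensa memcpy (ssa8 to set the shift amount, then the src_b funnel shift) builds each aligned destination word out of two aligned source loads. A rough little-endian C equivalent of a single step, with the helper name invented for illustration and the end-of-buffer and endianness concerns the real code handles left out:

#include <stdint.h>
#include <stdio.h>

/* Reconstruct the 32-bit word at the unaligned address base+off
   (off in 1..3) from the two aligned words surrounding it. */
static uint32_t load_unaligned32(const uint32_t *base, unsigned off)
{
    uint32_t lo = base[0];
    uint32_t hi = base[1];
    return (lo >> (8 * off)) | (hi << (32 - 8 * off));
}

int main(void)
{
    /* Memory bytes (little-endian): 11 22 33 44 55 66 77 88. */
    uint32_t words[2] = { 0x44332211u, 0x88776655u };

    /* The word starting at byte offset 1 is 22 33 44 55. */
    printf("%08x\n", load_unaligned32(words, 1));   /* 55443322 */
    return 0;
}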
*/ - ssa8 a3 // set shift amount from byte offset + ssa8 a3 /* set shift amount from byte offset */ #if UNALIGNED_ADDRESSES_CHECKED - and a11, a3, a8 // save unalignment offset for below - sub a3, a3, a11 // align a3 + and a11, a3, a8 /* save unalignment offset for below */ + sub a3, a3, a11 /* align a3 */ #endif - l32i a6, a3, 0 // load first word + l32i a6, a3, 0 /* load first word */ #if XCHAL_HAVE_LOOPS loopnez a7, 2f #else beqz a7, 2f slli a10, a7, 4 - add a10, a10, a3 // a10 = end of last 16B source chunk + add a10, a10, a3 /* a10 = end of last 16B source chunk */ #endif 1: l32i a7, a3, 4 l32i a8, a3, 8 @@ -273,11 +272,11 @@ ENTRY (memcpy) mov a6, a7 4: #if UNALIGNED_ADDRESSES_CHECKED - add a3, a3, a11 // readjust a3 with correct misalignment + add a3, a3, a11 /* readjust a3 with correct misalignment */ #endif bbsi.l a4, 1, 5f bbsi.l a4, 0, 6f - retw + abi_ret /* Copy 2 bytes. */ 5: l8ui a6, a3, 0 @@ -287,11 +286,11 @@ ENTRY (memcpy) s8i a7, a5, 1 addi a5, a5, 2 bbsi.l a4, 0, 6f - retw + abi_ret /* Copy 1 byte. */ 6: l8ui a6, a3, 0 s8i a6, a5, 0 - retw + abi_ret libc_hidden_def (memcpy) diff --git a/libc/string/xtensa/memset.S b/libc/string/xtensa/memset.S index c0928825d..20bf14c75 100644 --- a/libc/string/xtensa/memset.S +++ b/libc/string/xtensa/memset.S @@ -13,11 +13,10 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 51 Franklin Street - Fifth Floor, - Boston, MA 02110-1301, USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ -#include "../../sysdeps/linux/xtensa/sysdep.h" +#include <sysdep.h> #include <bits/xtensa-config.h> /* Do not use .literal_position in the ENTRY macro. */ @@ -29,7 +28,7 @@ The algorithm is as follows: Create a word with c in all byte positions. - + If the destination is aligned, set 16B chunks with a loop, and then finish up with 8B, 4B, 2B, and 1B stores conditional on the length. @@ -57,21 +56,21 @@ __memset_aux: loopnez a4, 2f #else beqz a4, 2f - add a6, a5, a4 // a6 = ending address + add a6, a5, a4 /* a6 = ending address */ #endif 1: s8i a3, a5, 0 addi a5, a5, 1 #if !XCHAL_HAVE_LOOPS blt a5, a6, 1b #endif -2: retw +2: abi_ret /* Destination is unaligned. */ .align 4 -.Ldst1mod2: // dst is only byte aligned +.Ldst1mod2: /* dst is only byte aligned */ /* Do short sizes byte-by-byte. */ bltui a4, 8, .Lbyteset @@ -84,7 +83,7 @@ __memset_aux: /* Now retest if dst is aligned. */ _bbci.l a5, 1, .Ldstaligned -.Ldst2mod4: // dst has 16-bit alignment +.Ldst2mod4: /* dst has 16-bit alignment */ /* Do short sizes byte-by-byte. */ bltui a4, 8, .Lbyteset @@ -108,7 +107,7 @@ ENTRY (memset) slli a7, a3, 16 or a3, a3, a7 - mov a5, a2 // copy dst so that a2 is return value + mov a5, a2 /* copy dst so that a2 is return value */ /* Check if dst is unaligned. */ _bbsi.l a2, 0, .Ldst1mod2 @@ -124,7 +123,7 @@ ENTRY (memset) #else beqz a7, 2f slli a6, a7, 4 - add a6, a6, a5 // a6 = end of last 16B chunk + add a6, a6, a5 /* a6 = end of last 16B chunk */ #endif /* Set 16 bytes per iteration. */ 1: s32i a3, a5, 0 @@ -160,6 +159,6 @@ ENTRY (memset) /* Set 1 byte. */ s8i a3, a5, 0 -6: retw +6: abi_ret libc_hidden_def (memset) diff --git a/libc/string/xtensa/strcmp.S b/libc/string/xtensa/strcmp.S index 622bb27ed..2dce590db 100644 --- a/libc/string/xtensa/strcmp.S +++ b/libc/string/xtensa/strcmp.S @@ -13,11 +13,10 @@ Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 51 Franklin Street - Fifth Floor, - Boston, MA 02110-1301, USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ -#include "../../sysdeps/linux/xtensa/sysdep.h" +#include <sysdep.h> #include <bits/xtensa-config.h> #include <features.h> @@ -35,45 +34,46 @@ #define MASK4 0x40404040 + .text + .align 4 + .literal_position .literal .Lmask0, MASK0 .literal .Lmask1, MASK1 .literal .Lmask2, MASK2 .literal .Lmask3, MASK3 .literal .Lmask4, MASK4 - - .text ENTRY (strcmp) /* a2 = s1, a3 = s2 */ - l8ui a8, a2, 0 // byte 0 from s1 - l8ui a9, a3, 0 // byte 0 from s2 - movi a10, 3 // mask + l8ui a8, a2, 0 /* byte 0 from s1 */ + l8ui a9, a3, 0 /* byte 0 from s2 */ + movi a10, 3 /* mask */ bne a8, a9, .Lretdiff or a11, a2, a3 bnone a11, a10, .Laligned - xor a11, a2, a3 // compare low two bits of s1 and s2 - bany a11, a10, .Lunaligned // if they have different alignment + xor a11, a2, a3 /* compare low two bits of s1 and s2 */ + bany a11, a10, .Lunaligned /* if they have different alignment */ /* s1/s2 are not word-aligned. */ - addi a2, a2, 1 // advance s1 - beqz a8, .Leq // bytes equal, if zero, strings are equal - addi a3, a3, 1 // advance s2 - bnone a2, a10, .Laligned // if s1/s2 now aligned - l8ui a8, a2, 0 // byte 1 from s1 - l8ui a9, a3, 0 // byte 1 from s2 - addi a2, a2, 1 // advance s1 - bne a8, a9, .Lretdiff // if different, return difference - beqz a8, .Leq // bytes equal, if zero, strings are equal - addi a3, a3, 1 // advance s2 - bnone a2, a10, .Laligned // if s1/s2 now aligned - l8ui a8, a2, 0 // byte 2 from s1 - l8ui a9, a3, 0 // byte 2 from s2 - addi a2, a2, 1 // advance s1 - bne a8, a9, .Lretdiff // if different, return difference - beqz a8, .Leq // bytes equal, if zero, strings are equal - addi a3, a3, 1 // advance s2 + addi a2, a2, 1 /* advance s1 */ + beqz a8, .Leq /* bytes equal, if zero, strings are equal */ + addi a3, a3, 1 /* advance s2 */ + bnone a2, a10, .Laligned /* if s1/s2 now aligned */ + l8ui a8, a2, 0 /* byte 1 from s1 */ + l8ui a9, a3, 0 /* byte 1 from s2 */ + addi a2, a2, 1 /* advance s1 */ + bne a8, a9, .Lretdiff /* if different, return difference */ + beqz a8, .Leq /* bytes equal, if zero, strings are equal */ + addi a3, a3, 1 /* advance s2 */ + bnone a2, a10, .Laligned /* if s1/s2 now aligned */ + l8ui a8, a2, 0 /* byte 2 from s1 */ + l8ui a9, a3, 0 /* byte 2 from s2 */ + addi a2, a2, 1 /* advance s1 */ + bne a8, a9, .Lretdiff /* if different, return difference */ + beqz a8, .Leq /* bytes equal, if zero, strings are equal */ + addi a3, a3, 1 /* advance s2 */ j .Laligned /* s1 and s2 have different alignment. @@ -92,8 +92,8 @@ ENTRY (strcmp) /* (2 mod 4) alignment for loop instruction */ .Lunaligned: #if XCHAL_HAVE_LOOPS - _movi.n a8, 0 // set up for the maximum loop count - loop a8, .Lretdiff // loop forever (almost anyway) + _movi.n a8, 0 /* set up for the maximum loop count */ + loop a8, .Lretdiff /* loop forever (almost anyway) */ #endif .Lnextbyte: l8ui a8, a2, 0 @@ -108,7 +108,7 @@ ENTRY (strcmp) #endif .Lretdiff: sub a2, a8, a9 - retw + abi_ret /* s1 is word-aligned; s2 is word-aligned. @@ -131,32 +131,32 @@ ENTRY (strcmp) #if XCHAL_HAVE_LOOPS .Laligned: .begin no-transform - l32r a4, .Lmask0 // mask for byte 0 + l32r a4, .Lmask0 /* mask for byte 0 */ l32r a7, .Lmask4 /* Loop forever. 
(a4 is more than than the maximum number of iterations) */ loop a4, .Laligned_done /* First unrolled loop body. */ - l32i a8, a2, 0 // get word from s1 - l32i a9, a3, 0 // get word from s2 + l32i a8, a2, 0 /* get word from s1 */ + l32i a9, a3, 0 /* get word from s2 */ slli a5, a8, 1 bne a8, a9, .Lwne2 or a9, a8, a5 bnall a9, a7, .Lprobeq /* Second unrolled loop body. */ - l32i a8, a2, 4 // get word from s1+4 - l32i a9, a3, 4 // get word from s2+4 + l32i a8, a2, 4 /* get word from s1+4 */ + l32i a9, a3, 4 /* get word from s2+4 */ slli a5, a8, 1 bne a8, a9, .Lwne2 or a9, a8, a5 bnall a9, a7, .Lprobeq2 - addi a2, a2, 8 // advance s1 pointer - addi a3, a3, 8 // advance s2 pointer + addi a2, a2, 8 /* advance s1 pointer */ + addi a3, a3, 8 /* advance s2 pointer */ .Laligned_done: - or a1, a1, a1 // nop + or a1, a1, a1 /* nop */ .Lprobeq2: /* Adjust pointers to account for the loop unrolling. */ @@ -166,15 +166,15 @@ ENTRY (strcmp) #else /* !XCHAL_HAVE_LOOPS */ .Laligned: - movi a4, MASK0 // mask for byte 0 + movi a4, MASK0 /* mask for byte 0 */ movi a7, MASK4 j .Lfirstword .Lnextword: - addi a2, a2, 4 // advance s1 pointer - addi a3, a3, 4 // advance s2 pointer + addi a2, a2, 4 /* advance s1 pointer */ + addi a3, a3, 4 /* advance s2 pointer */ .Lfirstword: - l32i a8, a2, 0 // get word from s1 - l32i a9, a3, 0 // get word from s2 + l32i a8, a2, 0 /* get word from s1 */ + l32i a9, a3, 0 /* get word from s2 */ slli a5, a8, 1 bne a8, a9, .Lwne2 or a9, a8, a5 @@ -186,50 +186,50 @@ ENTRY (strcmp) /* Words are probably equal, but check for sure. If not, loop over the rest of string using normal algorithm. */ - bnone a8, a4, .Leq // if byte 0 is zero - l32r a5, .Lmask1 // mask for byte 1 - l32r a6, .Lmask2 // mask for byte 2 - bnone a8, a5, .Leq // if byte 1 is zero - l32r a7, .Lmask3 // mask for byte 3 - bnone a8, a6, .Leq // if byte 2 is zero - bnone a8, a7, .Leq // if byte 3 is zero - addi.n a2, a2, 4 // advance s1 pointer - addi.n a3, a3, 4 // advance s2 pointer + bnone a8, a4, .Leq /* if byte 0 is zero */ + l32r a5, .Lmask1 /* mask for byte 1 */ + l32r a6, .Lmask2 /* mask for byte 2 */ + bnone a8, a5, .Leq /* if byte 1 is zero */ + l32r a7, .Lmask3 /* mask for byte 3 */ + bnone a8, a6, .Leq /* if byte 2 is zero */ + bnone a8, a7, .Leq /* if byte 3 is zero */ + addi.n a2, a2, 4 /* advance s1 pointer */ + addi.n a3, a3, 4 /* advance s2 pointer */ #if XCHAL_HAVE_LOOPS /* align (1 mod 4) */ - loop a4, .Leq // loop forever (a4 is bigger than max iters) + loop a4, .Leq /* loop forever (a4 is bigger than max iters) */ .end no-transform - l32i a8, a2, 0 // get word from s1 - l32i a9, a3, 0 // get word from s2 - addi a2, a2, 4 // advance s1 pointer + l32i a8, a2, 0 /* get word from s1 */ + l32i a9, a3, 0 /* get word from s2 */ + addi a2, a2, 4 /* advance s1 pointer */ bne a8, a9, .Lwne - bnone a8, a4, .Leq // if byte 0 is zero - bnone a8, a5, .Leq // if byte 1 is zero - bnone a8, a6, .Leq // if byte 2 is zero - bnone a8, a7, .Leq // if byte 3 is zero - addi a3, a3, 4 // advance s2 pointer + bnone a8, a4, .Leq /* if byte 0 is zero */ + bnone a8, a5, .Leq /* if byte 1 is zero */ + bnone a8, a6, .Leq /* if byte 2 is zero */ + bnone a8, a7, .Leq /* if byte 3 is zero */ + addi a3, a3, 4 /* advance s2 pointer */ #else /* !XCHAL_HAVE_LOOPS */ j .Lfirstword2 .Lnextword2: - addi a3, a3, 4 // advance s2 pointer + addi a3, a3, 4 /* advance s2 pointer */ .Lfirstword2: - l32i a8, a2, 0 // get word from s1 - l32i a9, a3, 0 // get word from s2 - addi a2, a2, 4 // advance s1 pointer + l32i a8, a2, 0 /* get word from s1 */ 
+ l32i a9, a3, 0 /* get word from s2 */ + addi a2, a2, 4 /* advance s1 pointer */ bne a8, a9, .Lwne - bnone a8, a4, .Leq // if byte 0 is zero - bnone a8, a5, .Leq // if byte 1 is zero - bnone a8, a6, .Leq // if byte 2 is zero - bany a8, a7, .Lnextword2 // if byte 3 is zero + bnone a8, a4, .Leq /* if byte 0 is zero */ + bnone a8, a5, .Leq /* if byte 1 is zero */ + bnone a8, a6, .Leq /* if byte 2 is zero */ + bany a8, a7, .Lnextword2 /* if byte 3 is zero */ #endif /* !XCHAL_HAVE_LOOPS */ /* Words are equal; some byte is zero. */ -.Leq: movi a2, 0 // return equal - retw +.Leq: movi a2, 0 /* return equal */ + abi_ret .Lwne2: /* Words are not equal. On big-endian processors, if none of the bytes are zero, the return value can be determined by a simple @@ -239,22 +239,22 @@ ENTRY (strcmp) bnall a10, a7, .Lsomezero bgeu a8, a9, .Lposreturn movi a2, -1 - retw + abi_ret .Lposreturn: movi a2, 1 - retw -.Lsomezero: // There is probably some zero byte. + abi_ret +.Lsomezero: /* There is probably some zero byte. */ #endif /* __XTENSA_EB__ */ .Lwne: /* Words are not equal. */ - xor a2, a8, a9 // get word with nonzero in byte that differs - bany a2, a4, .Ldiff0 // if byte 0 differs - movi a5, MASK1 // mask for byte 1 - bnone a8, a4, .Leq // if byte 0 is zero - bany a2, a5, .Ldiff1 // if byte 1 differs - movi a6, MASK2 // mask for byte 2 - bnone a8, a5, .Leq // if byte 1 is zero - bany a2, a6, .Ldiff2 // if byte 2 differs - bnone a8, a6, .Leq // if byte 2 is zero + xor a2, a8, a9 /* get word with nonzero in byte that differs */ + bany a2, a4, .Ldiff0 /* if byte 0 differs */ + movi a5, MASK1 /* mask for byte 1 */ + bnone a8, a4, .Leq /* if byte 0 is zero */ + bany a2, a5, .Ldiff1 /* if byte 1 differs */ + movi a6, MASK2 /* mask for byte 2 */ + bnone a8, a5, .Leq /* if byte 1 is zero */ + bany a2, a6, .Ldiff2 /* if byte 2 differs */ + bnone a8, a6, .Leq /* if byte 2 is zero */ #ifdef __XTENSA_EB__ .Ldiff3: .Ldiff2: @@ -263,14 +263,14 @@ ENTRY (strcmp) byte. Just subtract words to get the return value. The high order equal bytes cancel, leaving room for the sign. */ sub a2, a8, a9 - retw + abi_ret .Ldiff0: /* Need to make room for the sign, so can't subtract whole words. */ extui a10, a8, 24, 8 extui a11, a9, 24, 8 sub a2, a10, a11 - retw + abi_ret #else /* !__XTENSA_EB__ */ /* Little-endian is a little more difficult because can't subtract @@ -281,28 +281,28 @@ ENTRY (strcmp) extui a10, a8, 24, 8 extui a11, a9, 24, 8 sub a2, a10, a11 - retw + abi_ret .Ldiff0: /* Byte 0 is different. */ extui a10, a8, 0, 8 extui a11, a9, 0, 8 sub a2, a10, a11 - retw + abi_ret .Ldiff1: /* Byte 0 is equal; byte 1 is different. */ extui a10, a8, 8, 8 extui a11, a9, 8, 8 sub a2, a10, a11 - retw + abi_ret .Ldiff2: /* Bytes 0-1 are equal; byte 2 is different. */ extui a10, a8, 16, 8 extui a11, a9, 16, 8 sub a2, a10, a11 - retw + abi_ret #endif /* !__XTENSA_EB */ diff --git a/libc/string/xtensa/strcpy.S b/libc/string/xtensa/strcpy.S index 108070384..9f42b34e6 100644 --- a/libc/string/xtensa/strcpy.S +++ b/libc/string/xtensa/strcpy.S @@ -13,11 +13,10 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 51 Franklin Street - Fifth Floor, - Boston, MA 02110-1301, USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ -#include "../../sysdeps/linux/xtensa/sysdep.h" +#include <sysdep.h> #include <bits/xtensa-config.h> #ifdef __XTENSA_EB__ @@ -36,7 +35,7 @@ ENTRY (strcpy) /* a2 = dst, a3 = src */ - mov a10, a2 // leave dst in return value register + mov a10, a2 /* leave dst in return value register */ movi a4, MASK0 movi a5, MASK1 movi a6, MASK2 @@ -51,25 +50,25 @@ ENTRY (strcpy) j .Ldstunaligned -.Lsrc1mod2: // src address is odd - l8ui a8, a3, 0 // get byte 0 - addi a3, a3, 1 // advance src pointer - s8i a8, a10, 0 // store byte 0 - beqz a8, 1f // if byte 0 is zero - addi a10, a10, 1 // advance dst pointer - bbci.l a3, 1, .Lsrcaligned // if src is now word-aligned +.Lsrc1mod2: /* src address is odd */ + l8ui a8, a3, 0 /* get byte 0 */ + addi a3, a3, 1 /* advance src pointer */ + s8i a8, a10, 0 /* store byte 0 */ + beqz a8, 1f /* if byte 0 is zero */ + addi a10, a10, 1 /* advance dst pointer */ + bbci.l a3, 1, .Lsrcaligned /* if src is now word-aligned */ -.Lsrc2mod4: // src address is 2 mod 4 - l8ui a8, a3, 0 // get byte 0 +.Lsrc2mod4: /* src address is 2 mod 4 */ + l8ui a8, a3, 0 /* get byte 0 */ /* 1-cycle interlock */ - s8i a8, a10, 0 // store byte 0 - beqz a8, 1f // if byte 0 is zero - l8ui a8, a3, 1 // get byte 0 - addi a3, a3, 2 // advance src pointer - s8i a8, a10, 1 // store byte 0 - addi a10, a10, 2 // advance dst pointer + s8i a8, a10, 0 /* store byte 0 */ + beqz a8, 1f /* if byte 0 is zero */ + l8ui a8, a3, 1 /* get byte 0 */ + addi a3, a3, 2 /* advance src pointer */ + s8i a8, a10, 1 /* store byte 0 */ + addi a10, a10, 2 /* advance dst pointer */ bnez a8, .Lsrcaligned -1: retw +1: abi_ret /* dst is word-aligned; src is word-aligned. */ @@ -78,46 +77,46 @@ ENTRY (strcpy) #if XCHAL_HAVE_LOOPS /* (2 mod 4) alignment for loop instruction */ .Laligned: - _movi.n a8, 0 // set up for the maximum loop count - loop a8, .Lz3 // loop forever (almost anyway) - l32i a8, a3, 0 // get word from src - addi a3, a3, 4 // advance src pointer - bnone a8, a4, .Lz0 // if byte 0 is zero - bnone a8, a5, .Lz1 // if byte 1 is zero - bnone a8, a6, .Lz2 // if byte 2 is zero - s32i a8, a10, 0 // store word to dst - bnone a8, a7, .Lz3 // if byte 3 is zero - addi a10, a10, 4 // advance dst pointer + _movi.n a8, 0 /* set up for the maximum loop count */ + loop a8, .Lz3 /* loop forever (almost anyway) */ + l32i a8, a3, 0 /* get word from src */ + addi a3, a3, 4 /* advance src pointer */ + bnone a8, a4, .Lz0 /* if byte 0 is zero */ + bnone a8, a5, .Lz1 /* if byte 1 is zero */ + bnone a8, a6, .Lz2 /* if byte 2 is zero */ + s32i a8, a10, 0 /* store word to dst */ + bnone a8, a7, .Lz3 /* if byte 3 is zero */ + addi a10, a10, 4 /* advance dst pointer */ #else /* !XCHAL_HAVE_LOOPS */ -1: addi a10, a10, 4 // advance dst pointer +1: addi a10, a10, 4 /* advance dst pointer */ .Laligned: - l32i a8, a3, 0 // get word from src - addi a3, a3, 4 // advance src pointer - bnone a8, a4, .Lz0 // if byte 0 is zero - bnone a8, a5, .Lz1 // if byte 1 is zero - bnone a8, a6, .Lz2 // if byte 2 is zero - s32i a8, a10, 0 // store word to dst - bany a8, a7, 1b // if byte 3 is zero + l32i a8, a3, 0 /* get word from src */ + addi a3, a3, 4 /* advance src pointer */ + bnone a8, a4, .Lz0 /* if byte 0 is zero */ + bnone a8, a5, .Lz1 /* if byte 1 is zero */ + bnone a8, a6, .Lz2 /* if byte 2 is zero */ + s32i a8, a10, 0 /* store word to dst */ + bany a8, a7, 1b /* if byte 3 is zero */ #endif /* !XCHAL_HAVE_LOOPS */ .Lz3: /* Byte 3 is zero. */ - retw + abi_ret .Lz0: /* Byte 0 is zero. 
*/ #ifdef __XTENSA_EB__ movi a8, 0 #endif s8i a8, a10, 0 - retw + abi_ret .Lz1: /* Byte 1 is zero. */ #ifdef __XTENSA_EB__ extui a8, a8, 16, 16 #endif s16i a8, a10, 0 - retw + abi_ret .Lz2: /* Byte 2 is zero. */ #ifdef __XTENSA_EB__ @@ -126,15 +125,15 @@ ENTRY (strcpy) s16i a8, a10, 0 movi a8, 0 s8i a8, a10, 2 - retw + abi_ret .align 4 /* (2 mod 4) alignment for loop instruction */ .Ldstunaligned: #if XCHAL_HAVE_LOOPS - _movi.n a8, 0 // set up for the maximum loop count - loop a8, 2f // loop forever (almost anyway) + _movi.n a8, 0 /* set up for the maximum loop count */ + loop a8, 2f /* loop forever (almost anyway) */ #endif 1: l8ui a8, a3, 0 addi a3, a3, 1 @@ -145,6 +144,6 @@ ENTRY (strcpy) #else bnez a8, 1b #endif -2: retw +2: abi_ret libc_hidden_def (strcpy) diff --git a/libc/string/xtensa/strlen.S b/libc/string/xtensa/strlen.S index dd72c16fa..e1c98c8f0 100644 --- a/libc/string/xtensa/strlen.S +++ b/libc/string/xtensa/strlen.S @@ -13,11 +13,10 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 51 Franklin Street - Fifth Floor, - Boston, MA 02110-1301, USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ -#include "../../sysdeps/linux/xtensa/sysdep.h" +#include <sysdep.h> #include <bits/xtensa-config.h> #ifdef __XTENSA_EB__ @@ -36,7 +35,7 @@ ENTRY (strlen) /* a2 = s */ - addi a3, a2, -4 // because we overincrement at the end + addi a3, a2, -4 /* because we overincrement at the end */ movi a4, MASK0 movi a5, MASK1 movi a6, MASK2 @@ -45,22 +44,22 @@ ENTRY (strlen) bbsi.l a2, 1, .L2mod4 j .Laligned -.L1mod2: // address is odd - l8ui a8, a3, 4 // get byte 0 - addi a3, a3, 1 // advance string pointer - beqz a8, .Lz3 // if byte 0 is zero - bbci.l a3, 1, .Laligned // if string pointer is now word-aligned +.L1mod2: /* address is odd */ + l8ui a8, a3, 4 /* get byte 0 */ + addi a3, a3, 1 /* advance string pointer */ + beqz a8, .Lz3 /* if byte 0 is zero */ + bbci.l a3, 1, .Laligned /* if string pointer is now word-aligned */ -.L2mod4: // address is 2 mod 4 - addi a3, a3, 2 // advance ptr for aligned access - l32i a8, a3, 0 // get word with first two bytes of string - bnone a8, a6, .Lz2 // if byte 2 (of word, not string) is zero - bany a8, a7, .Laligned // if byte 3 (of word, not string) is nonzero +.L2mod4: /* address is 2 mod 4 */ + addi a3, a3, 2 /* advance ptr for aligned access */ + l32i a8, a3, 0 /* get word with first two bytes of string */ + bnone a8, a6, .Lz2 /* if byte 2 (of word, not string) is zero */ + bany a8, a7, .Laligned /* if byte 3 (of word, not string) is nonzero */ /* Byte 3 is zero. */ - addi a3, a3, 3 // point to zero byte - sub a2, a3, a2 // subtract to get length - retw + addi a3, a3, 3 /* point to zero byte */ + sub a2, a3, a2 /* subtract to get length */ + abi_ret /* String is word-aligned. 
*/ @@ -69,36 +68,36 @@ ENTRY (strlen) /* (2 mod 4) alignment for loop instruction */ .Laligned: #if XCHAL_HAVE_LOOPS - _movi.n a8, 0 // set up for the maximum loop count - loop a8, .Lz3 // loop forever (almost anyway) + _movi.n a8, 0 /* set up for the maximum loop count */ + loop a8, .Lz3 /* loop forever (almost anyway) */ #endif -1: l32i a8, a3, 4 // get next word of string - addi a3, a3, 4 // advance string pointer - bnone a8, a4, .Lz0 // if byte 0 is zero - bnone a8, a5, .Lz1 // if byte 1 is zero - bnone a8, a6, .Lz2 // if byte 2 is zero +1: l32i a8, a3, 4 /* get next word of string */ + addi a3, a3, 4 /* advance string pointer */ + bnone a8, a4, .Lz0 /* if byte 0 is zero */ + bnone a8, a5, .Lz1 /* if byte 1 is zero */ + bnone a8, a6, .Lz2 /* if byte 2 is zero */ #if XCHAL_HAVE_LOOPS - bnone a8, a7, .Lz3 // if byte 3 is zero + bnone a8, a7, .Lz3 /* if byte 3 is zero */ #else - bany a8, a7, 1b // repeat if byte 3 is non-zero + bany a8, a7, 1b /* repeat if byte 3 is non-zero */ #endif .Lz3: /* Byte 3 is zero. */ - addi a3, a3, 3 // point to zero byte + addi a3, a3, 3 /* point to zero byte */ /* Fall through.... */ .Lz0: /* Byte 0 is zero. */ - sub a2, a3, a2 // subtract to get length - retw + sub a2, a3, a2 /* subtract to get length */ + abi_ret .Lz1: /* Byte 1 is zero. */ - addi a3, a3, 1 // point to zero byte - sub a2, a3, a2 // subtract to get length - retw + addi a3, a3, 1 /* point to zero byte */ + sub a2, a3, a2 /* subtract to get length */ + abi_ret .Lz2: /* Byte 2 is zero. */ - addi a3, a3, 2 // point to zero byte - sub a2, a3, a2 // subtract to get length - retw + addi a3, a3, 2 /* point to zero byte */ + sub a2, a3, a2 /* subtract to get length */ + abi_ret libc_hidden_def (strlen) diff --git a/libc/string/xtensa/strncpy.S b/libc/string/xtensa/strncpy.S index 7ba2ef77d..aa8db5da1 100644 --- a/libc/string/xtensa/strncpy.S +++ b/libc/string/xtensa/strncpy.S @@ -13,11 +13,10 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 51 Franklin Street - Fifth Floor, - Boston, MA 02110-1301, USA. */ + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ -#include "../../sysdeps/linux/xtensa/sysdep.h" +#include <sysdep.h> #include <bits/xtensa-config.h> #ifdef __XTENSA_EB__ @@ -41,41 +40,41 @@ .literal_position __strncpy_aux: -.Lsrc1mod2: // src address is odd - l8ui a8, a3, 0 // get byte 0 - addi a3, a3, 1 // advance src pointer - s8i a8, a10, 0 // store byte 0 - addi a4, a4, -1 // decrement n - beqz a4, .Lret // if n is zero - addi a10, a10, 1 // advance dst pointer - beqz a8, .Lfill // if byte 0 is zero - bbci.l a3, 1, .Lsrcaligned // if src is now word-aligned - -.Lsrc2mod4: // src address is 2 mod 4 - l8ui a8, a3, 0 // get byte 0 - addi a4, a4, -1 // decrement n - s8i a8, a10, 0 // store byte 0 - beqz a4, .Lret // if n is zero - addi a10, a10, 1 // advance dst pointer - beqz a8, .Lfill // if byte 0 is zero - l8ui a8, a3, 1 // get byte 0 - addi a3, a3, 2 // advance src pointer - s8i a8, a10, 0 // store byte 0 - addi a4, a4, -1 // decrement n - beqz a4, .Lret // if n is zero - addi a10, a10, 1 // advance dst pointer +.Lsrc1mod2: /* src address is odd */ + l8ui a8, a3, 0 /* get byte 0 */ + addi a3, a3, 1 /* advance src pointer */ + s8i a8, a10, 0 /* store byte 0 */ + addi a4, a4, -1 /* decrement n */ + beqz a4, .Lret /* if n is zero */ + addi a10, a10, 1 /* advance dst pointer */ + beqz a8, .Lfill /* if byte 0 is zero */ + bbci.l a3, 1, .Lsrcaligned /* if src is now word-aligned */ + +.Lsrc2mod4: /* src address is 2 mod 4 */ + l8ui a8, a3, 0 /* get byte 0 */ + addi a4, a4, -1 /* decrement n */ + s8i a8, a10, 0 /* store byte 0 */ + beqz a4, .Lret /* if n is zero */ + addi a10, a10, 1 /* advance dst pointer */ + beqz a8, .Lfill /* if byte 0 is zero */ + l8ui a8, a3, 1 /* get byte 0 */ + addi a3, a3, 2 /* advance src pointer */ + s8i a8, a10, 0 /* store byte 0 */ + addi a4, a4, -1 /* decrement n */ + beqz a4, .Lret /* if n is zero */ + addi a10, a10, 1 /* advance dst pointer */ bnez a8, .Lsrcaligned j .Lfill .Lret: - retw + abi_ret ENTRY (strncpy) /* a2 = dst, a3 = src */ - mov a10, a2 // leave dst in return value register - beqz a4, .Lret // if n is zero + mov a10, a2 /* leave dst in return value register */ + beqz a4, .Lret /* if n is zero */ movi a11, MASK0 movi a5, MASK1 @@ -125,28 +124,28 @@ ENTRY (strncpy) .Lfillcleanup: /* Fill leftover (1 to 3) bytes with zero. 
*/ - s8i a9, a10, 0 // store byte 0 - addi a4, a4, -1 // decrement n + s8i a9, a10, 0 /* store byte 0 */ + addi a4, a4, -1 /* decrement n */ addi a10, a10, 1 - bnez a4, .Lfillcleanup - -2: retw - -.Lfill1mod2: // dst address is odd - s8i a9, a10, 0 // store byte 0 - addi a4, a4, -1 // decrement n - beqz a4, 2b // if n is zero - addi a10, a10, 1 // advance dst pointer - bbci.l a10, 1, .Lfillaligned // if dst is now word-aligned - -.Lfill2mod4: // dst address is 2 mod 4 - s8i a9, a10, 0 // store byte 0 - addi a4, a4, -1 // decrement n - beqz a4, 2b // if n is zero - s8i a9, a10, 1 // store byte 1 - addi a4, a4, -1 // decrement n - beqz a4, 2b // if n is zero - addi a10, a10, 2 // advance dst pointer + bnez a4, .Lfillcleanup + +2: abi_ret + +.Lfill1mod2: /* dst address is odd */ + s8i a9, a10, 0 /* store byte 0 */ + addi a4, a4, -1 /* decrement n */ + beqz a4, 2b /* if n is zero */ + addi a10, a10, 1 /* advance dst pointer */ + bbci.l a10, 1, .Lfillaligned /* if dst is now word-aligned */ + +.Lfill2mod4: /* dst address is 2 mod 4 */ + s8i a9, a10, 0 /* store byte 0 */ + addi a4, a4, -1 /* decrement n */ + beqz a4, 2b /* if n is zero */ + s8i a9, a10, 1 /* store byte 1 */ + addi a4, a4, -1 /* decrement n */ + beqz a4, 2b /* if n is zero */ + addi a10, a10, 2 /* advance dst pointer */ j .Lfillaligned @@ -156,32 +155,32 @@ ENTRY (strncpy) /* (2 mod 4) alignment for loop instruction */ .Laligned: #if XCHAL_HAVE_LOOPS - _movi.n a8, 0 // set up for the maximum loop count - loop a8, 1f // loop forever (almost anyway) - blti a4, 5, .Ldstunaligned // n is near limit; do one at a time - l32i a8, a3, 0 // get word from src - addi a3, a3, 4 // advance src pointer - bnone a8, a11, .Lz0 // if byte 0 is zero - bnone a8, a5, .Lz1 // if byte 1 is zero - bnone a8, a6, .Lz2 // if byte 2 is zero - s32i a8, a10, 0 // store word to dst - addi a4, a4, -4 // decrement n - addi a10, a10, 4 // advance dst pointer - bnone a8, a7, .Lfill // if byte 3 is zero -1: + _movi.n a8, 0 /* set up for the maximum loop count */ + loop a8, 1f /* loop forever (almost anyway) */ + blti a4, 5, .Ldstunaligned /* n is near limit; do one at a time */ + l32i a8, a3, 0 /* get word from src */ + addi a3, a3, 4 /* advance src pointer */ + bnone a8, a11, .Lz0 /* if byte 0 is zero */ + bnone a8, a5, .Lz1 /* if byte 1 is zero */ + bnone a8, a6, .Lz2 /* if byte 2 is zero */ + s32i a8, a10, 0 /* store word to dst */ + addi a4, a4, -4 /* decrement n */ + addi a10, a10, 4 /* advance dst pointer */ + bnone a8, a7, .Lfill /* if byte 3 is zero */ +1: #else /* !XCHAL_HAVE_LOOPS */ -1: blti a4, 5, .Ldstunaligned // n is near limit; do one at a time - l32i a8, a3, 0 // get word from src - addi a3, a3, 4 // advance src pointer - bnone a8, a11, .Lz0 // if byte 0 is zero - bnone a8, a5, .Lz1 // if byte 1 is zero - bnone a8, a6, .Lz2 // if byte 2 is zero - s32i a8, a10, 0 // store word to dst - addi a4, a4, -4 // decrement n - addi a10, a10, 4 // advance dst pointer - bany a8, a7, 1b // no zeroes +1: blti a4, 5, .Ldstunaligned /* n is near limit; do one at a time */ + l32i a8, a3, 0 /* get word from src */ + addi a3, a3, 4 /* advance src pointer */ + bnone a8, a11, .Lz0 /* if byte 0 is zero */ + bnone a8, a5, .Lz1 /* if byte 1 is zero */ + bnone a8, a6, .Lz2 /* if byte 2 is zero */ + s32i a8, a10, 0 /* store word to dst */ + addi a4, a4, -4 /* decrement n */ + addi a10, a10, 4 /* advance dst pointer */ + bany a8, a7, 1b /* no zeroes */ #endif /* !XCHAL_HAVE_LOOPS */ j .Lfill @@ -191,8 +190,8 @@ ENTRY (strncpy) movi a8, 0 #endif s8i a8, a10, 0 - addi a4, a4, 
-1 // decrement n - addi a10, a10, 1 // advance dst pointer + addi a4, a4, -1 /* decrement n */ + addi a10, a10, 1 /* advance dst pointer */ j .Lfill .Lz1: /* Byte 1 is zero. */ @@ -200,8 +199,8 @@ ENTRY (strncpy) extui a8, a8, 16, 16 #endif s16i a8, a10, 0 - addi a4, a4, -2 // decrement n - addi a10, a10, 2 // advance dst pointer + addi a4, a4, -2 /* decrement n */ + addi a10, a10, 2 /* advance dst pointer */ j .Lfill .Lz2: /* Byte 2 is zero. */ @@ -211,8 +210,8 @@ ENTRY (strncpy) s16i a8, a10, 0 movi a8, 0 s8i a8, a10, 2 - addi a4, a4, -3 // decrement n - addi a10, a10, 3 // advance dst pointer + addi a4, a4, -3 /* decrement n */ + addi a10, a10, 3 /* advance dst pointer */ j .Lfill .align 4 @@ -220,8 +219,8 @@ ENTRY (strncpy) .Ldstunaligned: #if XCHAL_HAVE_LOOPS - _movi.n a8, 0 // set up for the maximum loop count - loop a8, 2f // loop forever (almost anyway) + _movi.n a8, 0 /* set up for the maximum loop count */ + loop a8, 2f /* loop forever (almost anyway) */ #endif 1: l8ui a8, a3, 0 addi a3, a3, 1 @@ -236,6 +235,6 @@ ENTRY (strncpy) #endif 2: j .Lfill -3: retw +3: abi_ret libc_hidden_def (strncpy)
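A note on the recurring change in the Xtensa files above: besides rewriting the //-style comments as /* */ comments, the patch replaces every hard-coded windowed-ABI return instruction retw with an abi_ret macro and includes <sysdep.h> instead of the relative "../../sysdeps/linux/xtensa/sysdep.h" path, presumably so the same .S sources can be assembled for either Xtensa register ABI. As a minimal sketch only (the macro name abi_ret is taken from the diff, but the definition below is illustrative rather than copied from the uClibc header, and it assumes the compiler predefines __XTENSA_WINDOWED_ABI__ / __XTENSA_CALL0_ABI__ as GCC does for Xtensa targets), such a header could provide:

/* Illustrative sketch, not the actual uClibc <sysdep.h>: pick the
   return instruction that matches the configured ABI so the shared
   .S files need no per-ABI #ifdefs.  */
#if defined __XTENSA_WINDOWED_ABI__
# define abi_ret  retw   /* windowed ABI: return via register-window rotation */
#elif defined __XTENSA_CALL0_ABI__
# define abi_ret  ret    /* call0 ABI: plain subroutine return */
#else
# error "unknown Xtensa ABI"
#endif

With a definition along these lines, each abi_ret in memcpy, memset, strcmp, strcpy, strlen, and strncpy assembles to the return form appropriate for the ABI the library is built against, which is what replacing the literal retw instructions makes possible.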
