summaryrefslogtreecommitdiff
path: root/libc
diff options
context:
space:
mode:
authorMike Frysinger <vapier@gentoo.org>2008-01-05 10:05:27 +0000
committerMike Frysinger <vapier@gentoo.org>2008-01-05 10:05:27 +0000
commit124ec188720b6bdea85ade49e7ea195161b12fce (patch)
tree2bce39bc1e51bd587e010a61419b47d122be3165 /libc
parent9c95d5d28d8d40f7b826c9399f5ce781bbc61567 (diff)
Chris Zankel writes:
The following patches add support for the Xtensa processor architecture to uClibc. They are based on a recent SVN checkout (12/05/2007). The first patch (attached to this post) adds Xtensa support to various shared configuration and make files. The following patches then include the Xtensa specific files and directories. I welcome any feedback and would appreciate it if you could include the patches into the mainline tree. I am certainly committed to maintain the port. Bob Wilson was kind enough to review the patches. Some notes about the architecture: Xtensa is a configurable and extensible processor architecture developed by Tensilica. For more information, please visit: www.linux-xtensa.org.
Diffstat (limited to 'libc')
-rw-r--r--libc/string/xtensa/Makefile13
-rw-r--r--libc/string/xtensa/memcpy.S297
-rw-r--r--libc/string/xtensa/memset.S165
-rw-r--r--libc/string/xtensa/strcmp.S313
-rw-r--r--libc/string/xtensa/strcpy.S150
-rw-r--r--libc/string/xtensa/strlen.S104
-rw-r--r--libc/string/xtensa/strncpy.S241
-rw-r--r--libc/sysdeps/linux/xtensa/Makefile13
-rw-r--r--libc/sysdeps/linux/xtensa/Makefile.arch14
-rw-r--r--libc/sysdeps/linux/xtensa/__longjmp.S126
-rw-r--r--libc/sysdeps/linux/xtensa/__syscall_error.c18
-rw-r--r--libc/sysdeps/linux/xtensa/bits/endian.h10
-rw-r--r--libc/sysdeps/linux/xtensa/bits/fcntl.h196
-rw-r--r--libc/sysdeps/linux/xtensa/bits/ipc.h54
-rw-r--r--libc/sysdeps/linux/xtensa/bits/kernel_stat.h57
-rw-r--r--libc/sysdeps/linux/xtensa/bits/kernel_types.h48
-rw-r--r--libc/sysdeps/linux/xtensa/bits/mathdef.h43
-rw-r--r--libc/sysdeps/linux/xtensa/bits/mman.h104
-rw-r--r--libc/sysdeps/linux/xtensa/bits/msq.h88
-rw-r--r--libc/sysdeps/linux/xtensa/bits/setjmp.h46
-rw-r--r--libc/sysdeps/linux/xtensa/bits/shm.h115
-rw-r--r--libc/sysdeps/linux/xtensa/bits/sigcontextinfo.h33
-rw-r--r--libc/sysdeps/linux/xtensa/bits/stackinfo.h28
-rw-r--r--libc/sysdeps/linux/xtensa/bits/stat.h153
-rw-r--r--libc/sysdeps/linux/xtensa/bits/syscalls.h140
-rw-r--r--libc/sysdeps/linux/xtensa/bits/uClibc_arch_features.h44
-rw-r--r--libc/sysdeps/linux/xtensa/bits/uClibc_page.h31
-rw-r--r--libc/sysdeps/linux/xtensa/bits/wordsize.h19
-rw-r--r--libc/sysdeps/linux/xtensa/bits/xtensa-config.h53
-rw-r--r--libc/sysdeps/linux/xtensa/brk.c43
-rw-r--r--libc/sysdeps/linux/xtensa/bsd-_setjmp.S1
-rw-r--r--libc/sysdeps/linux/xtensa/bsd-setjmp.S1
-rw-r--r--libc/sysdeps/linux/xtensa/clone.S103
-rw-r--r--libc/sysdeps/linux/xtensa/crt1.S119
-rw-r--r--libc/sysdeps/linux/xtensa/crti.S16
-rw-r--r--libc/sysdeps/linux/xtensa/crtn.S8
-rw-r--r--libc/sysdeps/linux/xtensa/fork.c25
-rw-r--r--libc/sysdeps/linux/xtensa/mmap.S57
-rw-r--r--libc/sysdeps/linux/xtensa/posix_fadvise.c29
-rw-r--r--libc/sysdeps/linux/xtensa/posix_fadvise64.c39
-rw-r--r--libc/sysdeps/linux/xtensa/pread_write.c193
-rw-r--r--libc/sysdeps/linux/xtensa/setjmp.S131
-rw-r--r--libc/sysdeps/linux/xtensa/sys/procfs.h121
-rw-r--r--libc/sysdeps/linux/xtensa/sys/ptrace.h156
-rw-r--r--libc/sysdeps/linux/xtensa/sys/ucontext.h49
-rw-r--r--libc/sysdeps/linux/xtensa/syscall.S42
-rw-r--r--libc/sysdeps/linux/xtensa/sysdep.h160
-rw-r--r--libc/sysdeps/linux/xtensa/vfork.S170
-rw-r--r--libc/sysdeps/linux/xtensa/windowspill.S95
49 files changed, 4274 insertions, 0 deletions
diff --git a/libc/string/xtensa/Makefile b/libc/string/xtensa/Makefile
new file mode 100644
index 000000000..0a95346fd
--- /dev/null
+++ b/libc/string/xtensa/Makefile
@@ -0,0 +1,13 @@
+# Makefile for uClibc
+#
+# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org>
+#
+# Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+#
+
+top_srcdir:=../../../
+top_builddir:=../../../
+all: objs
+include $(top_builddir)Rules.mak
+include ../Makefile.in
+include $(top_srcdir)Makerules
diff --git a/libc/string/xtensa/memcpy.S b/libc/string/xtensa/memcpy.S
new file mode 100644
index 000000000..19f3a6818
--- /dev/null
+++ b/libc/string/xtensa/memcpy.S
@@ -0,0 +1,297 @@
+/* Optimized memcpy for Xtensa.
+ Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+ Boston, MA 02110-1301, USA. */
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+
+ .macro src_b r, w0, w1
+#ifdef __XTENSA_EB__
+ src \r, \w0, \w1
+#else
+ src \r, \w1, \w0
+#endif
+ .endm
+
+ .macro ssa8 r
+#ifdef __XTENSA_EB__
+ ssa8b \r
+#else
+ ssa8l \r
+#endif
+ .endm
+
+/* If the Xtensa Unaligned Load Exception option is not used, this
+ code can run a few cycles faster by relying on the low address bits
+ being ignored. However, if the code is then run with an Xtensa ISS
+ client that checks for unaligned accesses, it will produce a lot of
+ warning messages. Set this flag to disable the use of unaligned
+ accesses and keep the ISS happy. */
+
+#define UNALIGNED_ADDRESSES_CHECKED 1
+
+/* Do not use .literal_position in the ENTRY macro. */
+#undef LITERAL_POSITION
+#define LITERAL_POSITION
+
+
+/* void *memcpy (void *dst, const void *src, size_t len)
+
+ The algorithm is as follows:
+
+ If the destination is unaligned, align it by conditionally
+ copying 1- and/or 2-byte pieces.
+
+ If the source is aligned, copy 16 bytes with a loop, and then finish up
+ with 8, 4, 2, and 1-byte copies conditional on the length.
+
+ Else (if source is unaligned), do the same, but use SRC to align the
+ source data.
+
+ This code tries to use fall-through branches for the common
+ case of aligned source and destination and multiple of 4 (or 8) length. */
+
+
+/* Byte by byte copy. */
+
+ .text
+ .align 4
+ .literal_position
+__memcpy_aux:
+
+ /* Skip a byte to get 1 mod 4 alignment for LOOPNEZ
+ (0 mod 4 alignment for LBEG). */
+ .byte 0
+
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+ loopnez a4, 2f
+#else
+ beqz a4, 2f
+ add a7, a3, a4 // a7 = end address for source
+#endif
+1: l8ui a6, a3, 0
+ addi a3, a3, 1
+ s8i a6, a5, 0
+ addi a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+ blt a3, a7, 1b
+#endif
+2: retw
+
+
+/* Destination is unaligned. */
+
+ .align 4
+.Ldst1mod2: // dst is only byte aligned
+
+ /* Do short copies byte-by-byte. */
+ _bltui a4, 7, .Lbytecopy
+
+ /* Copy 1 byte. */
+ l8ui a6, a3, 0
+ addi a3, a3, 1
+ addi a4, a4, -1
+ s8i a6, a5, 0
+ addi a5, a5, 1
+
+ /* Return to main algorithm if dst is now aligned. */
+ _bbci.l a5, 1, .Ldstaligned
+
+.Ldst2mod4: // dst has 16-bit alignment
+
+ /* Do short copies byte-by-byte. */
+ _bltui a4, 6, .Lbytecopy
+
+ /* Copy 2 bytes. */
+ l8ui a6, a3, 0
+ l8ui a7, a3, 1
+ addi a3, a3, 2
+ addi a4, a4, -2
+ s8i a6, a5, 0
+ s8i a7, a5, 1
+ addi a5, a5, 2
+
+ /* dst is now aligned; return to main algorithm. */
+ j .Ldstaligned
+
+
+ENTRY (memcpy)
+ /* a2 = dst, a3 = src, a4 = len */
+
+ mov a5, a2 // copy dst so that a2 is return value
+ _bbsi.l a2, 0, .Ldst1mod2
+ _bbsi.l a2, 1, .Ldst2mod4
+.Ldstaligned:
+
+ /* Get number of loop iterations with 16B per iteration. */
+ srli a7, a4, 4
+
+ /* Check if source is aligned. */
+ movi a8, 3
+ _bany a3, a8, .Lsrcunaligned
+
+ /* Destination and source are word-aligned, use word copy. */
+#if XCHAL_HAVE_LOOPS
+ loopnez a7, 2f
+#else
+ beqz a7, 2f
+ slli a8, a7, 4
+ add a8, a8, a3 // a8 = end of last 16B source chunk
+#endif
+1: l32i a6, a3, 0
+ l32i a7, a3, 4
+ s32i a6, a5, 0
+ l32i a6, a3, 8
+ s32i a7, a5, 4
+ l32i a7, a3, 12
+ s32i a6, a5, 8
+ addi a3, a3, 16
+ s32i a7, a5, 12
+ addi a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+ blt a3, a8, 1b
+#endif
+
+ /* Copy any leftover pieces smaller than 16B. */
+2: bbci.l a4, 3, 3f
+
+ /* Copy 8 bytes. */
+ l32i a6, a3, 0
+ l32i a7, a3, 4
+ addi a3, a3, 8
+ s32i a6, a5, 0
+ s32i a7, a5, 4
+ addi a5, a5, 8
+
+3: bbsi.l a4, 2, 4f
+ bbsi.l a4, 1, 5f
+ bbsi.l a4, 0, 6f
+ retw
+
+ /* Copy 4 bytes. */
+4: l32i a6, a3, 0
+ addi a3, a3, 4
+ s32i a6, a5, 0
+ addi a5, a5, 4
+ bbsi.l a4, 1, 5f
+ bbsi.l a4, 0, 6f
+ retw
+
+ /* Copy 2 bytes. */
+5: l16ui a6, a3, 0
+ addi a3, a3, 2
+ s16i a6, a5, 0
+ addi a5, a5, 2
+ bbsi.l a4, 0, 6f
+ retw
+
+ /* Copy 1 byte. */
+6: l8ui a6, a3, 0
+ s8i a6, a5, 0
+
+.Ldone:
+ retw
+
+
+/* Destination is aligned; source is unaligned. */
+
+ .align 4
+.Lsrcunaligned:
+ /* Avoid loading anything for zero-length copies. */
+ _beqz a4, .Ldone
+
+ /* Copy 16 bytes per iteration for word-aligned dst and
+ unaligned src. */
+ ssa8 a3 // set shift amount from byte offset
+#if UNALIGNED_ADDRESSES_CHECKED
+ and a11, a3, a8 // save unalignment offset for below
+ sub a3, a3, a11 // align a3
+#endif
+ l32i a6, a3, 0 // load first word
+#if XCHAL_HAVE_LOOPS
+ loopnez a7, 2f
+#else
+ beqz a7, 2f
+ slli a10, a7, 4
+ add a10, a10, a3 // a10 = end of last 16B source chunk
+#endif
+1: l32i a7, a3, 4
+ l32i a8, a3, 8
+ src_b a6, a6, a7
+ s32i a6, a5, 0
+ l32i a9, a3, 12
+ src_b a7, a7, a8
+ s32i a7, a5, 4
+ l32i a6, a3, 16
+ src_b a8, a8, a9
+ s32i a8, a5, 8
+ addi a3, a3, 16
+ src_b a9, a9, a6
+ s32i a9, a5, 12
+ addi a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+ blt a3, a10, 1b
+#endif
+
+2: bbci.l a4, 3, 3f
+
+ /* Copy 8 bytes. */
+ l32i a7, a3, 4
+ l32i a8, a3, 8
+ src_b a6, a6, a7
+ s32i a6, a5, 0
+ addi a3, a3, 8
+ src_b a7, a7, a8
+ s32i a7, a5, 4
+ addi a5, a5, 8
+ mov a6, a8
+
+3: bbci.l a4, 2, 4f
+
+ /* Copy 4 bytes. */
+ l32i a7, a3, 4
+ addi a3, a3, 4
+ src_b a6, a6, a7
+ s32i a6, a5, 0
+ addi a5, a5, 4
+ mov a6, a7
+4:
+#if UNALIGNED_ADDRESSES_CHECKED
+ add a3, a3, a11 // readjust a3 with correct misalignment
+#endif
+ bbsi.l a4, 1, 5f
+ bbsi.l a4, 0, 6f
+ retw
+
+ /* Copy 2 bytes. */
+5: l8ui a6, a3, 0
+ l8ui a7, a3, 1
+ addi a3, a3, 2
+ s8i a6, a5, 0
+ s8i a7, a5, 1
+ addi a5, a5, 2
+ bbsi.l a4, 0, 6f
+ retw
+
+ /* Copy 1 byte. */
+6: l8ui a6, a3, 0
+ s8i a6, a5, 0
+ retw
+
+libc_hidden_def (memcpy)
diff --git a/libc/string/xtensa/memset.S b/libc/string/xtensa/memset.S
new file mode 100644
index 000000000..c0928825d
--- /dev/null
+++ b/libc/string/xtensa/memset.S
@@ -0,0 +1,165 @@
+/* Optimized memset for Xtensa.
+ Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+ Boston, MA 02110-1301, USA. */
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+
+/* Do not use .literal_position in the ENTRY macro. */
+#undef LITERAL_POSITION
+#define LITERAL_POSITION
+
+/* void *memset (void *dst, int c, size_t length)
+
+ The algorithm is as follows:
+
+ Create a word with c in all byte positions.
+
+ If the destination is aligned, set 16B chunks with a loop, and then
+ finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
+
+ If the destination is unaligned, align it by conditionally
+ setting 1B and/or 2B and then go to aligned case.
+
+ This code tries to use fall-through branches for the common
+ case of an aligned destination (except for the branches to
+ the alignment labels). */
+
+
+/* Byte-by-byte set. */
+
+ .text
+ .align 4
+ .literal_position
+__memset_aux:
+
+ /* Skip a byte to get 1 mod 4 alignment for LOOPNEZ
+ (0 mod 4 alignment for LBEG). */
+ .byte 0
+
+.Lbyteset:
+#if XCHAL_HAVE_LOOPS
+ loopnez a4, 2f
+#else
+ beqz a4, 2f
+ add a6, a5, a4 // a6 = ending address
+#endif
+1: s8i a3, a5, 0
+ addi a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+ blt a5, a6, 1b
+#endif
+2: retw
+
+
+/* Destination is unaligned. */
+
+ .align 4
+
+.Ldst1mod2: // dst is only byte aligned
+
+ /* Do short sizes byte-by-byte. */
+ bltui a4, 8, .Lbyteset
+
+ /* Set 1 byte. */
+ s8i a3, a5, 0
+ addi a5, a5, 1
+ addi a4, a4, -1
+
+ /* Now retest if dst is aligned. */
+ _bbci.l a5, 1, .Ldstaligned
+
+.Ldst2mod4: // dst has 16-bit alignment
+
+ /* Do short sizes byte-by-byte. */
+ bltui a4, 8, .Lbyteset
+
+ /* Set 2 bytes. */
+ s16i a3, a5, 0
+ addi a5, a5, 2
+ addi a4, a4, -2
+
+ /* dst is now aligned; return to main algorithm */
+ j .Ldstaligned
+
+
+ENTRY (memset)
+ /* a2 = dst, a3 = c, a4 = length */
+
+ /* Duplicate character into all bytes of word. */
+ extui a3, a3, 0, 8
+ slli a7, a3, 8
+ or a3, a3, a7
+ slli a7, a3, 16
+ or a3, a3, a7
+
+ mov a5, a2 // copy dst so that a2 is return value
+
+ /* Check if dst is unaligned. */
+ _bbsi.l a2, 0, .Ldst1mod2
+ _bbsi.l a2, 1, .Ldst2mod4
+.Ldstaligned:
+
+ /* Get number of loop iterations with 16B per iteration. */
+ srli a7, a4, 4
+
+ /* Destination is word-aligned. */
+#if XCHAL_HAVE_LOOPS
+ loopnez a7, 2f
+#else
+ beqz a7, 2f
+ slli a6, a7, 4
+ add a6, a6, a5 // a6 = end of last 16B chunk
+#endif
+ /* Set 16 bytes per iteration. */
+1: s32i a3, a5, 0
+ s32i a3, a5, 4
+ s32i a3, a5, 8
+ s32i a3, a5, 12
+ addi a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+ blt a5, a6, 1b
+#endif
+
+ /* Set any leftover pieces smaller than 16B. */
+2: bbci.l a4, 3, 3f
+
+ /* Set 8 bytes. */
+ s32i a3, a5, 0
+ s32i a3, a5, 4
+ addi a5, a5, 8
+
+3: bbci.l a4, 2, 4f
+
+ /* Set 4 bytes. */
+ s32i a3, a5, 0
+ addi a5, a5, 4
+
+4: bbci.l a4, 1, 5f
+
+ /* Set 2 bytes. */
+ s16i a3, a5, 0
+ addi a5, a5, 2
+
+5: bbci.l a4, 0, 6f
+
+ /* Set 1 byte. */
+ s8i a3, a5, 0
+6: retw
+
+libc_hidden_def (memset)
diff --git a/libc/string/xtensa/strcmp.S b/libc/string/xtensa/strcmp.S
new file mode 100644
index 000000000..90c418d12
--- /dev/null
+++ b/libc/string/xtensa/strcmp.S
@@ -0,0 +1,313 @@
+/* Optimized strcmp for Xtensa.
+ Copyright (C) 2001, 2007 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+ Boston, MA 02110-1301, USA. */
+
+#include "../../sysdeps/linux/xtensa/sysdep.h"
+#include <bits/xtensa-config.h>
+
+#ifdef __XTENSA_EB__
+#define MASK0 0xff000000
+#define MASK1 0x00ff0000
+#define MASK2 0x0000ff00
+#define MASK3 0x000000ff
+#else
+#define MASK0 0x000000ff
+#define MASK1 0x0000ff00
+#define MASK2 0x00ff0000
+#define MASK3 0xff000000
+#endif
+
+#define MASK4 0x40404040
+
+ .literal .Lmask0, MASK0
+ .literal .Lmask1, MASK1
+ .literal .Lmask2, MASK2
+ .literal .Lmask3, MASK3
+ .literal .Lmask4, MASK4
+
+ .text
+ENTRY (strcmp)
+ /* a2 = s1, a3 = s2 */
+
+ l8ui a8, a2, 0 // byte 0 from s1
+ l8ui a9, a3, 0 // byte 0 from s2
+ movi a10, 3 // mask
+ bne a8, a9, .Lretdiff
+
+ or a11, a2, a3
+ bnone a11, a10, .Laligned
+
+ xor a11, a2, a3 // compare low two bits of s1 and s2
+ bany a11, a10, .Lunaligned // if they have different alignment
+
+ /* s1/s2 are not word-aligned. */
+ addi a2, a2, 1 // advance s1
+ beqz a8, .Leq // bytes equal, if zero, strings are equal
+ addi a3, a3, 1 // advance s2
+ bnone a2, a10, .Laligned // if s1/s2 now aligned
+ l8ui a8, a2, 0 // byte 1 from s1
+ l8ui a9, a3, 0 // byte 1 from s2
+ addi a2, a2, 1 // advance s1
+ bne a8, a9, .Lretdiff // if different, return difference
+ beqz a8, .Leq // bytes equal, if zero, strings are equal
+ addi a3, a3, 1 // advance s2
+ bnone a2, a10, .Laligned // if s1/s2 now aligned
+ l8ui a8, a2, 0 // byte 2 from s1
+ l8ui a9, a3, 0 // byte 2 from s2
+ addi a2, a2, 1 // advance s1
+ bne a8, a9, .Lretdiff // if different, return difference
+ beqz a8, .Leq // bytes equal, if zero, strings are equal
+ addi a3, a3, 1 // advance s2
+ j .Laligned
+
+/* s1 and s2 have different alignment.
+
+ If the zero-overhead loop option is available, use an (almost)
+ infinite zero-overhead loop with conditional exits so we only pay
+ for taken branches when exiting the loop.
+
+ Note: It is important for this unaligned case to come before the
+ code for aligned strings, because otherwise some of the branches
+ above cannot reach and have to be transformed to branches around
+ jumps. The unaligned code is smaller and the branches can reach
+ over it. */
+
+ .align 4
+ /* (2 mod 4) alignment for loop instruction */
+.Lunaligned:
+#if XCHAL_HAVE_LOOPS
+ _movi.n a8, 0 // set up for the maximum loop count
+ loop a8, .Lretdiff // loop forever (almost anyway)
+#endif
+.Lnextbyte:
+ l8ui a8, a2, 0
+ l8ui a9, a3, 0
+ addi a2, a2, 1
+ bne a8, a9, .Lretdiff
+ addi a3, a3, 1
+#if XCHAL_HAVE_LOOPS
+ beqz a8, .Lretdiff
+#else
+ bnez a8, .Lnextbyte
+#endif
+.Lretdiff:
+ sub a2, a8, a9
+ retw
+
+/* s1 is word-aligned; s2 is word-aligned.
+
+ If the zero-overhead loop option is available, use an (almost)
+ infinite zero-overhead loop with conditional exits so we only pay
+ for taken branches when exiting the loop. */
+
+/* New algorithm, relying on the fact that all normal ASCII is between
+ 32 and 127.
+
+ Rather than check all bytes for zero:
+ Take one word (4 bytes). Call it w1.
+ Shift w1 left by one into w1'.
+ Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't.
+ Check that all 4 bit 6's (one for each byte) are one:
+ If they are, we are definitely not done.
+ If they are not, we are probably done, but need to check for zero. */
+
+ .align 4
+#if XCHAL_HAVE_LOOPS
+.Laligned:
+ .begin no-transform
+ l32r a4, .Lmask0 // mask for byte 0
+ l32r a7, .Lmask4
+	/* Loop forever.  (a4 is more than the maximum number
+ of iterations) */
+ loop a4, .Laligned_done
+
+ /* First unrolled loop body. */
+ l32i a8, a2, 0 // get word from s1
+ l32i a9, a3, 0 // get word from s2
+ slli a5, a8, 1
+ bne a8, a9, .Lwne2
+ or a9, a8, a5
+ bnall a9, a7, .Lprobeq
+
+ /* Second unrolled loop body. */
+ l32i a8, a2, 4 // get word from s1+4
+ l32i a9, a3, 4 // get word from s2+4
+ slli a5, a8, 1
+ bne a8, a9, .Lwne2
+ or a9, a8, a5
+ bnall a9, a7, .Lprobeq2
+
+ addi a2, a2, 8 // advance s1 pointer
+ addi a3, a3, 8 // advance s2 pointer
+.Laligned_done:
+ or a1, a1, a1 // nop
+
+.Lprobeq2:
+ /* Adjust pointers to account for the loop unrolling. */
+ addi a2, a2, 4
+ addi a3, a3, 4
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+.Laligned:
+ movi a4, MASK0 // mask for byte 0
+ movi a7, MASK4
+ j .Lfirstword
+.Lnextword:
+ addi a2, a2, 4 // advance s1 pointer
+ addi a3, a3, 4 // advance s2 pointer
+.Lfirstword:
+ l32i a8, a2, 0 // get word from s1
+ l32i a9, a3, 0 // get word from s2
+ slli a5, a8, 1
+ bne a8, a9, .Lwne2
+ or a9, a8, a5
+ ball a9, a7, .Lnextword
+#endif /* !XCHAL_HAVE_LOOPS */
+
+ /* align (0 mod 4) */
+.Lprobeq:
+ /* Words are probably equal, but check for sure.
+ If not, loop over the rest of string using normal algorithm. */
+
+ bnone a8, a4, .Leq // if byte 0 is zero
+ l32r a5, .Lmask1 // mask for byte 1
+ l32r a6, .Lmask2 // mask for byte 2
+ bnone a8, a5, .Leq // if byte 1 is zero
+ l32r a7, .Lmask3 // mask for byte 3
+ bnone a8, a6, .Leq // if byte 2 is zero
+ bnone a8, a7, .Leq // if byte 3 is zero
+ addi.n a2, a2, 4 // advance s1 pointer
+ addi.n a3, a3, 4 // advance s2 pointer
+#if XCHAL_HAVE_LOOPS
+
+ /* align (1 mod 4) */
+ loop a4, .Leq // loop forever (a4 is bigger than max iters)
+ .end no-transform
+
+ l32i a8, a2, 0 // get word from s1
+ l32i a9, a3, 0 // get word from s2
+ addi a2, a2, 4 // advance s1 pointer
+ bne a8, a9, .Lwne
+ bnone a8, a4, .Leq // if byte 0 is zero
+ bnone a8, a5, .Leq // if byte 1 is zero
+ bnone a8, a6, .Leq // if byte 2 is zero
+ bnone a8, a7, .Leq // if byte 3 is zero
+ addi a3, a3, 4 // advance s2 pointer
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+ j .Lfirstword2
+.Lnextword2:
+ addi a3, a3, 4 // advance s2 pointer
+.Lfirstword2:
+ l32i a8, a2, 0 // get word from s1
+ l32i a9, a3, 0 // get word from s2
+ addi a2, a2, 4 // advance s1 pointer
+ bne a8, a9, .Lwne
+ bnone a8, a4, .Leq // if byte 0 is zero
+ bnone a8, a5, .Leq // if byte 1 is zero
+ bnone a8, a6, .Leq // if byte 2 is zero
+ bany a8, a7, .Lnextword2 // if byte 3 is zero
+#endif /* !XCHAL_HAVE_LOOPS */
+
+ /* Words are equal; some byte is zero. */
+.Leq: movi a2, 0 // return equal
+ retw
+
+.Lwne2: /* Words are not equal. On big-endian processors, if none of the
+ bytes are zero, the return value can be determined by a simple
+ comparison. */
+#ifdef __XTENSA_EB__
+ or a10, a8, a5
+ bnall a10, a7, .Lsomezero
+ bgeu a8, a9, .Lposreturn
+ movi a2, -1
+ retw
+.Lposreturn:
+ movi a2, 1
+ retw
+.Lsomezero: // There is probably some zero byte.
+#endif /* __XTENSA_EB__ */
+.Lwne: /* Words are not equal. */
+ xor a2, a8, a9 // get word with nonzero in byte that differs
+ bany a2, a4, .Ldiff0 // if byte 0 differs
+ movi a5, MASK1 // mask for byte 1
+ bnone a8, a4, .Leq // if byte 0 is zero
+ bany a2, a5, .Ldiff1 // if byte 1 differs
+ movi a6, MASK2 // mask for byte 2
+ bnone a8, a5, .Leq // if byte 1 is zero
+ bany a2, a6, .Ldiff2 // if byte 2 differs
+ bnone a8, a6, .Leq // if byte 2 is zero
+#ifdef __XTENSA_EB__
+.Ldiff3:
+.Ldiff2:
+.Ldiff1:
+ /* Byte 0 is