diff options
author | Mike Frysinger <vapier@gentoo.org> | 2005-09-21 02:18:29 +0000 |
---|---|---|
committer | Mike Frysinger <vapier@gentoo.org> | 2005-09-21 02:18:29 +0000 |
commit | f5c0ac3d4499a11f4581c1b4ff16cef7d8cf4c0b (patch) | |
tree | 4f7ce150130560ccff718076cf102fb4d114752c /libc | |
parent | 37016e09de57c7145d7dd29cd1166f21f150d2cb (diff) |
merge x86_64 optimized string support
Diffstat (limited to 'libc')
-rw-r--r-- | libc/string/Makefile | 19 | ||||
-rw-r--r-- | libc/string/x86_64/Makefile | 31 | ||||
-rw-r--r-- | libc/string/x86_64/_glibc_inc.h | 33 | ||||
-rw-r--r-- | libc/string/x86_64/bzero.S | 3 | ||||
-rw-r--r-- | libc/string/x86_64/memcpy.S | 95 | ||||
-rw-r--r-- | libc/string/x86_64/memset.S | 138 | ||||
-rw-r--r-- | libc/string/x86_64/stpcpy.S | 6 | ||||
-rw-r--r-- | libc/string/x86_64/strcat.S | 256 | ||||
-rw-r--r-- | libc/string/x86_64/strchr.S | 287 | ||||
-rw-r--r-- | libc/string/x86_64/strcmp.S | 41 | ||||
-rw-r--r-- | libc/string/x86_64/strcpy.S | 153 | ||||
-rw-r--r-- | libc/string/x86_64/strcspn.S | 123 | ||||
-rw-r--r-- | libc/string/x86_64/string.c | 1454 | ||||
-rw-r--r-- | libc/string/x86_64/strlen.S | 135 | ||||
-rw-r--r-- | libc/string/x86_64/strpbrk.S | 2 | ||||
-rw-r--r-- | libc/string/x86_64/strspn.S | 114 |
16 files changed, 2875 insertions, 15 deletions
diff --git a/libc/string/Makefile b/libc/string/Makefile index 600f3b81d..576f915af 100644 --- a/libc/string/Makefile +++ b/libc/string/Makefile @@ -1,20 +1,9 @@ # Makefile for uClibc # -# Copyright (C) 2000-2003 Erik Andersen <andersen@uclibc.org> +# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org> # -# This program is free software; you can redistribute it and/or modify it under -# the terms of the GNU Library General Public License as published by the Free -# Software Foundation; either version 2 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more -# details. -# -# You should have received a copy of the GNU Library General Public License -# along with this program; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# Licensed under the GNU Library General Public License version 2 or later. +# See the COPYING.LIB file in the toplevel for more information. TOPDIR=../../ include $(TOPDIR)Rules.mak @@ -31,7 +20,7 @@ DIRS += $(TARGET_ARCH) endif endif -ALL_SUBDIRS = generic arm frv i386 mips powerpc sh64 sparc +ALL_SUBDIRS = generic arm frv i386 mips powerpc sh64 sparc x86_64 MSRC= wstring.c MOBJ= basename.o bcopy.o bzero.o dirname.o ffs.o memccpy.o memchr.o memcmp.o \ diff --git a/libc/string/x86_64/Makefile b/libc/string/x86_64/Makefile new file mode 100644 index 000000000..2215a6025 --- /dev/null +++ b/libc/string/x86_64/Makefile @@ -0,0 +1,31 @@ +# Makefile for uClibc +# +# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org> +# +# Licensed under the GNU Library General Public License version 2 or later. +# See the COPYING.LIB file in the toplevel for more information. 
+ +TOPDIR=../../../ +include $(TOPDIR)Rules.mak + +CSRCS = $(wildcard *.c) +COBJS = $(patsubst %.c,%.o,$(CSRCS)) + +SSRCS = $(wildcard *.S) +SOBJS = $(patsubst %.S,%.o,$(SSRCS)) + +OBJS = $(COBJS) $(SOBJS) + +OBJ_LIST = ../../obj.string.$(TARGET_ARCH) + +all: $(OBJ_LIST) + +$(OBJ_LIST): $(OBJS) + echo $(patsubst %, string/$(TARGET_ARCH)/%, $(OBJS)) > $(OBJ_LIST) + +$(COBJS): %.o : %.c + $(CC) $(CFLAGS) -c $< -o $@ + $(STRIPTOOL) -x -R .note -R .comment $*.o + +clean: + $(RM) *.[oa] *~ core diff --git a/libc/string/x86_64/_glibc_inc.h b/libc/string/x86_64/_glibc_inc.h new file mode 100644 index 000000000..f14b23c94 --- /dev/null +++ b/libc/string/x86_64/_glibc_inc.h @@ -0,0 +1,33 @@ +/* + * Setup some glibc defines so we can just drop in the + * asm files from glibc without any modification. + */ + +#include <features.h> +#include <bits/wordsize.h> + +#if __WORDSIZE == 32 +# define ENTRY_ALIGN 4 +#else +# define ENTRY_ALIGN 2 +#endif + +#define ENTRY(sym) \ + .global sym; \ + .align ENTRY_ALIGN; \ + .type sym,%function; \ + sym: + +#define BP_SYM(sym) sym + +#define L(sym) LOC(sym) +#define LOC(sym) \ + .L ## sym + +#define END(sym) \ + .size sym,.-sym; + +#undef weak_alias +#define weak_alias(sym, alias) \ + .weak alias; \ + alias = sym; diff --git a/libc/string/x86_64/bzero.S b/libc/string/x86_64/bzero.S new file mode 100644 index 000000000..abd252e7b --- /dev/null +++ b/libc/string/x86_64/bzero.S @@ -0,0 +1,3 @@ +#define memset __bzero +#include "memset.S" +weak_alias (__bzero, bzero) diff --git a/libc/string/x86_64/memcpy.S b/libc/string/x86_64/memcpy.S new file mode 100644 index 000000000..4fa38a640 --- /dev/null +++ b/libc/string/x86_64/memcpy.S @@ -0,0 +1,95 @@ +/* Highly optimized version for x86-64. + Copyright (C) 1997, 2000, 2002, 2003, 2004 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Based on i586 version contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include "_glibc_inc.h" + +/* BEWARE: `#ifdef memcpy' means that memcpy is redefined as `mempcpy', + and the return value is the byte after the last one copied in + the destination. */ +#define MEMPCPY_P (defined memcpy) + + .text +#if defined PIC && !defined NOT_IN_libc +ENTRY (__memcpy_chk) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memcpy_chk) +#endif +ENTRY (BP_SYM (memcpy)) + /* Cutoff for the big loop is a size of 32 bytes since otherwise + the loop will never be entered. */ + cmpq $32, %rdx + movq %rdx, %rcx +#if !MEMPCPY_P + movq %rdi, %r10 /* Save value. */ +#endif + + /* We need this in any case. */ + cld + + jbe 1f + + /* Align destination. */ + movq %rdi, %rax + negq %rax + andq $7, %rax + subq %rax, %rcx + xchgq %rax, %rcx + + rep; movsb + + movq %rax, %rcx + subq $32, %rcx + js 2f + + .p2align 4 +3: + + /* Now correct the loop counter. Please note that in the following + code the flags are not changed anymore. */ + subq $32, %rcx + + movq (%rsi), %rax + movq 8(%rsi), %rdx + movq 16(%rsi), %r8 + movq 24(%rsi), %r9 + movq %rax, (%rdi) + movq %rdx, 8(%rdi) + movq %r8, 16(%rdi) + movq %r9, 24(%rdi) + + leaq 32(%rsi), %rsi + leaq 32(%rdi), %rdi + + jns 3b + + /* Correct extra loop counter modification. 
*/ +2: addq $32, %rcx +1: rep; movsb + +#if MEMPCPY_P + movq %rdi, %rax /* Set return value. */ +#else + movq %r10, %rax /* Set return value. */ + +#endif + ret + +END (BP_SYM (memcpy)) diff --git a/libc/string/x86_64/memset.S b/libc/string/x86_64/memset.S new file mode 100644 index 000000000..d74ec8ccb --- /dev/null +++ b/libc/string/x86_64/memset.S @@ -0,0 +1,138 @@ +/* memset/bzero -- set memory area to CH/0 + Optimized version for x86-64. + Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include "_glibc_inc.h" + +/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero' */ +#define BZERO_P (defined memset) + +/* This is somewhat experimental and could be made dependent on the cache + size. */ +#define LARGE $120000 + + .text +#if !BZERO_P && defined PIC && !defined NOT_IN_libc +ENTRY (__memset_chk) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk) +#endif +ENTRY (memset) +#if BZERO_P + mov %rsi,%rdx /* Adjust parameter. */ + xorl %esi,%esi /* Fill with 0s. */ +#endif + cmp $0x7,%rdx /* Check for small length. */ + mov %rdi,%rcx /* Save ptr as return value. 
*/ + jbe 7f + +#if BZERO_P + mov %rsi,%r8 /* Just copy 0. */ +#else + /* Populate 8 bit data to full 64-bit. */ + movabs $0x0101010101010101,%r8 + movzbl %sil,%eax + imul %rax,%r8 +#endif + test $0x7,%edi /* Check for alignment. */ + je 2f + + .p2align 4 +1: /* Align ptr to 8 byte. */ + mov %sil,(%rcx) + dec %rdx + inc %rcx + test $0x7,%ecx + jne 1b + +2: /* Check for really large regions. */ + mov %rdx,%rax + shr $0x6,%rax + je 4f + cmp LARGE, %rdx + jae 11f + + .p2align 4 +3: /* Copy 64 bytes. */ + mov %r8,(%rcx) + mov %r8,0x8(%rcx) + mov %r8,0x10(%rcx) + mov %r8,0x18(%rcx) + mov %r8,0x20(%rcx) + mov %r8,0x28(%rcx) + mov %r8,0x30(%rcx) + mov %r8,0x38(%rcx) + add $0x40,%rcx + dec %rax + jne 3b + +4: /* Copy final bytes. */ + and $0x3f,%edx + mov %rdx,%rax + shr $0x3,%rax + je 6f + +5: /* First in chunks of 8 bytes. */ + mov %r8,(%rcx) + add $0x8,%rcx + dec %rax + jne 5b +6: + and $0x7,%edx +7: + test %rdx,%rdx + je 9f +8: /* And finally as bytes (up to 7). */ + mov %sil,(%rcx) + inc %rcx + dec %rdx + jne 8b +9: +#if BZERO_P + nop +#else + /* Load result (only if used as memset). */ + mov %rdi,%rax /* start address of destination is result */ +#endif + retq + + .p2align 4 +11: /* Copy 64 bytes without polluting the cache. */ + /* We could use movntdq %xmm0,(%rcx) here to further + speed up for large cases but let's not use XMM registers. 
*/ + movnti %r8,(%rcx) + movnti %r8,0x8(%rcx) + movnti %r8,0x10(%rcx) + movnti %r8,0x18(%rcx) + movnti %r8,0x20(%rcx) + movnti %r8,0x28(%rcx) + movnti %r8,0x30(%rcx) + movnti %r8,0x38(%rcx) + add $0x40,%rcx + dec %rax + jne 11b + jmp 4b + +END (memset) + +#if !BZERO_P && defined PIC && !defined NOT_IN_libc +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) +#endif diff --git a/libc/string/x86_64/stpcpy.S b/libc/string/x86_64/stpcpy.S new file mode 100644 index 000000000..83294e1a8 --- /dev/null +++ b/libc/string/x86_64/stpcpy.S @@ -0,0 +1,6 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy + +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) diff --git a/libc/string/x86_64/strcat.S b/libc/string/x86_64/strcat.S new file mode 100644 index 000000000..1106cb4ed --- /dev/null +++ b/libc/string/x86_64/strcat.S @@ -0,0 +1,256 @@ +/* strcat(dest, src) -- Append SRC on the end of DEST. + Optimized for x86-64. + Copyright (C) 2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include "_glibc_inc.h" + + + .text +ENTRY (BP_SYM (strcat)) + movq %rdi, %rcx /* Dest. register. 
*/ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rax /* Duplicate destination pointer. */ + movq $0xfefefefefefefeff,%r8 /* magic value; mov leaves the andl flags intact for the jz below */ + + /* First step: Find end of destination. */ + jz 4f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: cmpb $0x0,(%rax) /* is byte NUL? */ + je 2f /* yes => start copy */ + incq %rax /* increment pointer */ + decl %ecx + jnz 0b + + + + /* Now the destination pointer is aligned. Scan for the NUL byte. */ + .p2align 4 +4: + /* First unroll. */ + movq (%rax), %rcx /* get quadword (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Second unroll. */ + movq (%rax), %rcx /* get quadword (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Third unroll. */ + movq (%rax), %rcx /* get quadword (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Fourth unroll. */ + movq (%rax), %rcx /* get quadword (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => continue loop */ + + .p2align 4 /* Align, it's a jump target. */ +3: subq $8,%rax /* correct pointer increment. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0x00ff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + testl $0xff000000, %ecx /* is fourth byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + shrq $32, %rcx /* look at other half. */ + + testb %cl, %cl /* is fifth byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is sixth byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0xff0000, %ecx /* is seventh byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + +2: + /* Second step: Copy source to destination. 
*/ + + movq %rsi, %rcx /* duplicate source pointer */ + andl $7,%ecx /* mask alignment bits */ + movq %rax, %rdx /* %rdx = write pointer, starting at the NUL of dest */ + jz 22f /* aligned => start loop */ + + neg %ecx /* align to 8 bytes. */ + addl $8, %ecx + /* Align the source pointer. */ +21: + movb (%rsi), %al /* Fetch a byte */ + testb %al, %al /* Is it NUL? */ + movb %al, (%rdx) /* Store it */ + jz 24f /* If it was NUL, done! */ + incq %rsi + incq %rdx + decl %ecx + jnz 21b + + /* Now the source is aligned. Unfortunately we cannot force + both source and destination to be aligned, so ignore the + alignment of the destination. */ + .p2align 4 +22: + /* 1st unroll. */ + movq (%rsi), %rax /* Read quadword (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => finish bytewise */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => finish bytewise */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 2nd unroll. */ + movq (%rsi), %rax /* Read quadword (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => finish bytewise */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => finish bytewise */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 3rd unroll. 
*/ + movq (%rsi), %rax /* Read quadword (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => finish bytewise */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => finish bytewise */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 4th unroll. */ + movq (%rsi), %rax /* Read quadword (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 23f /* highest byte is NUL => finish bytewise */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 23f /* found NUL => finish bytewise */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + jmp 22b /* Next iteration. */ + + /* Do the last few bytes. %rax contains the value to write. + The loop is unrolled twice. */ + .p2align 4 +23: + movb %al, (%rdx) /* 1st byte. */ + testb %al, %al /* Is it NUL? */ + jz 24f /* yes, finish. */ + incq %rdx /* Increment destination. */ + movb %ah, (%rdx) /* 2nd byte. */ + testb %ah, %ah /* Is it NUL? */ + jz 24f /* yes, finish. */ + incq %rdx /* Increment destination. */ + shrq $16, %rax /* Shift... */ + jmp 23b /* and look at next two bytes in %rax. */ + + +24: + movq %rdi, %rax /* Destination is return value. 
*/ + retq +END (BP_SYM (strcat)) diff --git a/libc/string/x86_64/strchr.S b/libc/string/x86_64/strchr.S new file mode 100644 index 000000000..b84b012b2 --- /dev/null +++ b/libc/string/x86_64/strchr.S @@ -0,0 +1,287 @@ +/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. + For AMD x86-64. + Copyright (C) 2002, 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include "_glibc_inc.h" + + + .text +ENTRY (BP_SYM (strchr)) + + /* Before we start with the main loop we process single bytes + until the source pointer is aligned. This has two reasons: + 1. aligned 64-bit memory access is faster + and (more important) + 2. we process in the main loop 64 bit in one step although + we don't know the end of the string. But accessing at + 8-byte alignment guarantees that we never access illegal + memory if this would not also be done by the trivial + implementation (this is because all processor inherent + boundaries are multiples of 8). */ + + movq %rdi, %rdx + andl $7, %edx /* Mask alignment bits */ + movq %rdi, %rax /* duplicate string pointer. */ + jz 1f /* aligned => start loop */ + neg %edx + addl $8, %edx /* Align to 8 bytes. */ + + /* Search the first bytes directly. 
*/ +0: movb (%rax), %cl /* load byte */ + cmpb %cl,%sil /* compare byte. */ + je 6f /* target found */ + testb %cl,%cl /* is byte NUL? */ + je 7f /* yes => return NULL */ + incq %rax /* increment pointer */ + decl %edx + jnz 0b + + +1: + /* At the moment %rsi contains C. What we need for the + algorithm is C in all bytes of the register. Avoid + operations on 16 bit words because these require a + prefix byte (and one more cycle). */ + /* Populate 8 bit data to full 64-bit. */ + movabs $0x0101010101010101,%r9 + movzbl %sil,%edx + imul %rdx,%r9 + + movq $0xfefefefefefefeff, %r8 /* Save magic. */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of QUADWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24 etc. If one of bits 56-63 is set, there will be a carry + into bit 64 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + .p2align 4 +4: + /* Main Loop is unrolled 4 times. */ + /* First unroll. 
*/ + movq (%rax), %rcx /* get quadword (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is c => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original qword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + + /* Second unroll. */ + movq (%rax), %rcx /* get quadword (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is c => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. 
Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original qword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + /* Third unroll. */ + movq (%rax), %rcx /* get quadword (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is c => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original qword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + /* Fourth unroll. 
*/ + movq (%rax), %rcx /* get quadword (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is c => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original qword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => restart loop */ + + +7: /* Return NULL. */ + xorl %eax, %eax + retq + + + /* We now scan for the byte in which the character was matched. + But we have to take care of the case that a NUL char is + found before this in the qword. Note that we XORed %rcx + with the byte we're looking for, therefore the tests below look + reversed. */ + + + .p2align 4 /* Align, it's a jump target. */ +3: movq %r9,%rdx /* move to %rdx so that we can access bytes */ + subq $8,%rax /* correct pointer increment. */ + testb %cl, %cl /* is first byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is first byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte C? 
*/ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is second byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is third byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is third byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is fourth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is fourth byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is fifth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is fifth byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is sixth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is sixth byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is seventh byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is seventh byte NUL? */ + je 7b /* yes => return NULL */ + + /* It must be in the eighth byte and it cannot be NUL. */ + incq %rax + +6: + nop + retq +END (BP_SYM (strchr)) + +weak_alias (BP_SYM (strchr), BP_SYM (index)) |