diff options
Diffstat (limited to 'libc/string')
-rw-r--r-- | libc/string/arc/memcmp.S | 94 | ||||
-rw-r--r-- | libc/string/arc/memcpy.S | 65 | ||||
-rw-r--r-- | libc/string/arc/memset.S | 61 | ||||
-rw-r--r-- | libc/string/arc/strchr.S | 25 | ||||
-rw-r--r-- | libc/string/arc/strcmp.S | 29 | ||||
-rw-r--r-- | libc/string/arc/strlen.S | 7 | ||||
-rw-r--r-- | libc/string/arm/memset.S | 2 | ||||
-rw-r--r-- | libc/string/explicit_bzero.c | 30 | ||||
-rw-r--r-- | libc/string/generic/memmove.c | 10 | ||||
-rw-r--r-- | libc/string/generic/strchr.c | 23 | ||||
-rw-r--r-- | libc/string/generic/strchrnul.c | 23 | ||||
-rw-r--r-- | libc/string/generic/strlen.c | 21 | ||||
-rw-r--r-- | libc/string/kvx/Makefile | 13 | ||||
-rw-r--r-- | libc/string/kvx/memcpy.S | 221 | ||||
-rw-r--r-- | libc/string/kvx/memset.S | 146 | ||||
-rw-r--r-- | libc/string/strcasestr.c | 2 | ||||
-rw-r--r-- | libc/string/strstr.c | 2 |
17 files changed, 674 insertions, 100 deletions
diff --git a/libc/string/arc/memcmp.S b/libc/string/arc/memcmp.S index a60757e7a..20122a296 100644 --- a/libc/string/arc/memcmp.S +++ b/libc/string/arc/memcmp.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com) * Copyright (C) 2007 ARC International (UK) LTD * * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -17,6 +17,8 @@ #endif ENTRY(memcmp) + +#if defined(__ARC700__) || defined(__ARCHS__) or r12,r0,r1 asl_s r12,r12,30 sub r3,r2,1 @@ -149,6 +151,96 @@ ENTRY(memcmp) .Lnil: j_s.d [blink] mov r0,0 + +#elif (__ARC64_ARCH32__) + ;; Based on Synopsys code from newlib's arc64/memcmp.S + cmp r2, 32 + bls.d @.L_compare_1_bytes + mov r3, r0 ; "r0" will be used as return value + + lsr r12, r2, 4 ; counter for 16-byte chunks + xor r13, r13, r13 ; the mask showing unequal registers + +.L_compare_16_bytes: + ld.ab r4, [r3, +4] + ld.ab r5, [r1, +4] + ld.ab r6, [r3, +4] + ld.ab r7, [r1, +4] + ld.ab r8, [r3, +4] + ld.ab r9, [r1, +4] + ld.ab r10, [r3, +4] + ld.ab r11, [r1, +4] + xor.f 0, r4, r5 + xor.ne r13, r13, 0b0001 + xor.f 0, r6, r7 + xor.ne r13, r13, 0b0010 + xor.f 0, r8, r9 + xor.ne r13, r13, 0b0100 + xor.f 0, r10, r11 + xor.ne r13, r13, 0b1000 + brne r13, 0, @.L_unequal_find + dbnz r12, @.L_compare_16_bytes + + ;; Adjusting the pointers because of the extra loads in the end. NOTE(review): no loads beyond the 16-byte chunk are visible in this loop -- confirm the 4-byte rewind is still required, otherwise the byte-wise tail starts 4 bytes early + sub r1, r1, 4 + sub r3, r3, 4 + bmsk_s r2, r2, 3 ; any remaining bytes to compare + +.L_compare_1_bytes: + cmp r2, 0 + jeq.d [blink] + xor_s r0, r0, r0 + +2: + ldb.ab r4, [r3, +1] + ldb.ab r5, [r1, +1] + sub.f r0, r4, r5 + jne [blink] + dbnz r2, @2b + j_s [blink] + + ;; At this point, we want to find the _first_ comparison that marked the + ;; inequality of "lhs" and "rhs" +.L_unequal_find: + ffs r13, r13 + asl r13, r13, 2 + bi [r13] +.L_unequal_r4r5: + mov r1, r4 + b.d @.L_diff_byte_in_regs + mov r2, r5 + nop +.L_unequal_r6r7: + mov r1, r6 + b.d @.L_diff_byte_in_regs + mov r2, 
r7 + nop +.L_unequal_r8r9: + mov r1, r8 + b.d @.L_diff_byte_in_regs + mov r2, r9 + nop +.L_unequal_r10r11: + mov r1, r10 + mov r2, r11 + + ;; fall-through + ;; If we're here, that means the two operands are not equal. +.L_diff_byte_in_regs: + xor r0, r1, r2 + ffs r0, r0 + and r0, r0, 0x18 + lsr r1, r1, r0 + lsr r2, r2, r0 + bmsk_s r1, r1, 7 + bmsk_s r2, r2, 7 + j_s.d [blink] + sub r0, r1, r2 + +#else +#error "Unsupported ARC CPU type" +#endif + END(memcmp) libc_hidden_def(memcmp) diff --git a/libc/string/arc/memcpy.S b/libc/string/arc/memcpy.S index 69d7220b8..153083765 100644 --- a/libc/string/arc/memcpy.S +++ b/libc/string/arc/memcpy.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com) * Copyright (C) 2007 ARC International (UK) LTD * * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -7,13 +7,9 @@ #include <sysdep.h> -#if !defined(__ARC700__) && !defined(__ARCHS__) -#error "Neither ARC700 nor ARCHS is defined!" -#endif - ENTRY(memcpy) -#ifdef __ARC700__ +#if defined(__ARC700__) /* This memcpy implementation does not support objects of 1GB or larger - the check for alignment does not work then. 
*/ /* We assume that most sources and destinations are aligned, and @@ -73,9 +69,9 @@ ENTRY(memcpy) .Lendbloop: j_s.d [blink] stb r12,[r5,0] -#endif /* __ARC700__ */ -#ifdef __ARCHS__ +#elif defined(__ARCHS__) + #ifdef __LITTLE_ENDIAN__ # define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; << # define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >> @@ -299,7 +295,58 @@ ENTRY(memcpy) stb.ab r6, [r3,1] .Lcopybytewise_3: j [blink] -#endif /* __ARCHS__ */ + +#elif defined(__ARC64_ARCH32__) + ;; Based on Synopsys code from newlib's arc64/memcpy.S + lsr.f r11, r2, 4 ; counter for 16-byte chunks + beq.d @.L_write_15_bytes + mov r3, r0 ; work on a copy of "r0" + +.L_write_16_bytes: +#if defined(__ARC64_LL64__) + ldd.ab r4, [r1, 8] + ldd.ab r6, [r1, 8] + std.ab r4, [r3, 8] + std.ab r6, [r3, 8] + dbnz r11, @.L_write_16_bytes +#else + ld.ab r4, [r1, 4] + ld.ab r5, [r1, 4] + ld.ab r6, [r1, 4] + ld.ab r7, [r1, 4] + st.ab r4, [r3, 4] + st.ab r5, [r3, 4] + st.ab r6, [r3, 4] + dbnz.d r11, @.L_write_16_bytes + st.ab r7, [r3, 4] +#endif + bmsk_s r2, r2, 3 + +.L_write_15_bytes: + bbit0.d r2, 1, @1f + lsr r11, r2, 2 + ldh.ab r4, [r1, 2] + sth.ab r4, [r3, 2] +1: + bbit0.d r2, 0, @1f + xor r11, r11, 3 + ldb.ab r4, [r1, 1] + stb.ab r4, [r3, 1] +1: + asl r11, r11, 1 + bi [r11] + ld.ab r4,[r1, 4] + st.ab r4,[r3, 4] + ld.ab r4,[r1, 4] + st.ab r4,[r3, 4] + ld r4,[r1] + st r4,[r3] + + j_s [blink] + +#else +#error "Unsupported ARC CPU type" +#endif END(memcpy) libc_hidden_def(memcpy) diff --git a/libc/string/arc/memset.S b/libc/string/arc/memset.S index 0b74ddc7f..5aa5d6c65 100644 --- a/libc/string/arc/memset.S +++ b/libc/string/arc/memset.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com) * Copyright (C) 2007 ARC International (UK) LTD * * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. 
@@ -7,13 +7,9 @@ #include <sysdep.h> -#if !defined(__ARC700__) && !defined(__ARCHS__) -#error "Neither ARC700 nor ARCHS is defined!" -#endif - ENTRY(memset) -#ifdef __ARC700__ +#if defined(__ARC700__) #define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */ mov_s r4,r0 @@ -52,9 +48,8 @@ ENTRY(memset) stb.ab r1,[r4,1] .Ltiny_end: j_s [blink] -#endif /* __ARC700__ */ -#ifdef __ARCHS__ +#elif defined(__ARCHS__) #ifdef DONT_USE_PREALLOC #define PREWRITE(A,B) prefetchw [(A),(B)] #else @@ -156,7 +151,55 @@ ENTRY(memset) .Lcopy3bytes: j [blink] -#endif /* __ARCHS__ */ + +#elif defined(__ARC64_ARCH32__) + ;; Based on Synopsys code from newlib's arc64/memset.S + + ;; Assemble the bytes to 32bit words + bmsk_s r1, r1, 7 ; treat it like unsigned char + lsl8 r3, r1 + or_s r1, r1, r3 + lsl16 r3, r1 + or r6, r1, r3 + mov r7,r6 + + lsr.f r5, r2, 4 ; counter for 16-byte chunks + beq.d @.L_write_15_bytes + mov r4, r0 ; work on a copy of "r0" + +.L_write_16_bytes: +#if defined(__ARC64_LL64__) + std.ab r6, [r4, 8] + std.ab r6, [r4, 8] + dbnz r5, @.L_write_16_bytes +#else + st.ab r6, [r4, 4] + st.ab r6, [r4, 4] + st.ab r6, [r4, 4] + dbnz.d r5, @.L_write_16_bytes + st.ab r6, [r4, 4] +#endif + bmsk_s r2, r2, 3 + +.L_write_15_bytes: + bbit0.d r2, 1, @1f + lsr r3, r2, 2 + sth.ab r6, [r4, 2] +1: + bbit0.d r2, 0, @1f + xor r3, r3, 3 + stb.ab r6, [r4, 1] +1: + bi [r3] + st.ab r6,[r4, 4] + st.ab r6,[r4, 4] + st.ab r6,[r4, 4] + + j_s [blink] + +#else +#error "Unsupported ARC CPU type" +#endif END(memset) libc_hidden_def(memset) diff --git a/libc/string/arc/strchr.S b/libc/string/arc/strchr.S index 443993589..df25eb3be 100644 --- a/libc/string/arc/strchr.S +++ b/libc/string/arc/strchr.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com) * Copyright (C) 2007 ARC International (UK) LTD * * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. 
@@ -7,6 +7,7 @@ #include <sysdep.h> #include <features.h> +#include <asm.h> /* ARC700 has a relatively long pipeline and branch prediction, so we want to avoid branches that are hard to predict. On the other hand, the @@ -21,7 +22,7 @@ ENTRY(strchr) mov_s r3,0x01010101 breq.d r2,r0,.Laligned asl r4,r5,16 - sub_s r0,r0,r2 + SUBR_S r0,r0,r2 asl r7,r2,3 ld_s r2,[r0] #ifdef __LITTLE_ENDIAN__ @@ -77,10 +78,10 @@ ENTRY(strchr) sub r3,r7,1 bic r3,r3,r7 norm r2,r3 - sub_s r0,r0,1 - asr_s r2,r2,3 + SUBR_S r0,r0,1 + ASRR_S r2,r2,3 j.d [blink] - sub_s r0,r0,r2 + SUBR_S r0,r0,r2 .balign 4 .Lfound0_ua: @@ -90,13 +91,13 @@ ENTRY(strchr) bic r3,r3,r6 and r2,r3,r4 or_s r12,r12,r2 - sub_s r3,r12,1 + SUBR_S r3,r12,1 bic_s r3,r3,r12 norm r3,r3 - add_s r0,r0,3 - asr_s r12,r3,3 + ADDR_S r0,r0,3 + ASRR_S r12,r3,3 asl.f 0,r2,r3 - sub_s r0,r0,r12 + SUBR_S r0,r0,r12 j_s.d [blink] mov.pl r0,0 #else /* BIG ENDIAN */ @@ -106,10 +107,10 @@ ENTRY(strchr) bic r2,r7,r6 .Lfound_char_b: norm r2,r2 - sub_s r0,r0,4 + SUBR_S r0,r0,4 asr_s r2,r2,3 j.d [blink] - add_s r0,r0,r2 + ADDR_S r0,r0,r2 .Lfound0_ua: mov_s r3,r7 @@ -126,7 +127,7 @@ ENTRY(strchr) add.pl r3,r3,1 asr_s r12,r3,3 asl.f 0,r2,r3 - add_s r0,r0,r12 + ADDR_S r0,r0,r12 j_s.d [blink] mov.mi r0,0 #endif /* ENDIAN */ diff --git a/libc/string/arc/strcmp.S b/libc/string/arc/strcmp.S index ad38d9e00..48d2d7ec1 100644 --- a/libc/string/arc/strcmp.S +++ b/libc/string/arc/strcmp.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com) * Copyright (C) 2007 ARC International (UK) LTD * * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -7,14 +7,11 @@ #include <features.h> #include <sysdep.h> - -#if !defined(__ARC700__) && !defined(__ARCHS__) -#error "Neither ARC700 nor ARCHS is defined!" 
-#endif +#include <asm.h> ENTRY(strcmp) -#ifdef __ARC700__ +#if defined(__ARC700__) || defined(__ARC64_ARCH32__) /* This is optimized primarily for the ARC700. It would be possible to speed up the loops by one cycle / word respective one cycle / byte by forcing double source 1 alignment, unrolling @@ -38,7 +35,7 @@ ENTRY(strcmp) breq r2,r3,.Lwordloop #ifdef __LITTLE_ENDIAN__ xor r0,r2,r3 ; mask for difference - sub_s r1,r0,1 + SUBR_S r1,r0,1 bic_s r0,r0,r1 ; mask for least significant difference bit sub r1,r5,r0 xor r0,r5,r1 ; mask for least significant difference byte @@ -55,7 +52,7 @@ ENTRY(strcmp) .Lfound0: xor r0,r2,r3 ; mask for difference or r0,r0,r4 ; or in zero indicator - sub_s r1,r0,1 + SUBR_S r1,r0,1 bic_s r0,r0,r1 ; mask for least significant difference bit sub r1,r5,r0 xor r0,r5,r1 ; mask for least significant difference byte @@ -99,31 +96,28 @@ ENTRY(strcmp) .Lcmpend: j_s.d [blink] sub r0,r2,r3 -#endif /* __ARC700__ */ -#ifdef __ARCHS__ +#elif defined(__ARCHS__) or r2, r0, r1 bmsk_s r2, r2, 1 brne r2, 0, @.Lcharloop ;;; s1 and s2 are word aligned - ld.ab r2, [r0, 4] mov_s r12, 0x01010101 ror r11, r12 .align 4 .LwordLoop: + ld.ab r2, [r0, 4] + sub r4, r2, r12 ld.ab r3, [r1, 4] ;; Detect NULL char in str1 - sub r4, r2, r12 - ld.ab r5, [r0, 4] bic r4, r4, r2 and r4, r4, r11 brne.d.nt r4, 0, .LfoundNULL ;; Check if the read locations are the same cmp r2, r3 - beq.d .LwordLoop - mov.eq r2, r5 + beq .LwordLoop ;; A match is found, spot it out #ifdef __LITTLE_ENDIAN__ @@ -168,7 +162,10 @@ ENTRY(strcmp) .Lcmpend: j_s.d [blink] sub r0, r2, r3 -#endif /* __ARCHS__ */ + +#else +#error "Unsupported ARC CPU type" +#endif END(strcmp) libc_hidden_def(strcmp) diff --git a/libc/string/arc/strlen.S b/libc/string/arc/strlen.S index 0b9b93815..0d1d3aa4e 100644 --- a/libc/string/arc/strlen.S +++ b/libc/string/arc/strlen.S @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013, 2022 Synopsys, Inc. 
(www.synopsys.com) * Copyright (C) 2007 ARC International (UK) LTD * * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball. @@ -7,6 +7,7 @@ #include <sysdep.h> +#include <asm.h> ENTRY(strlen) or r3,r0,7 @@ -15,7 +16,7 @@ ENTRY(strlen) mov r4,0x01010101 ; uses long immediate #ifdef __LITTLE_ENDIAN__ - asl_s r1,r0,3 + ASLR_S r1,r0,3 btst_s r0,2 asl r7,r4,r1 ror r5,r4 @@ -59,7 +60,7 @@ ENTRY(strlen) sub.ne r3,r3,4 mov.eq r1,r12 #ifdef __LITTLE_ENDIAN__ - sub_s r2,r1,1 + SUBR_S r2,r1,1 bic_s r2,r2,r1 norm r1,r2 sub_s r0,r0,3 diff --git a/libc/string/arm/memset.S b/libc/string/arm/memset.S index 412270f50..29c583f16 100644 --- a/libc/string/arm/memset.S +++ b/libc/string/arm/memset.S @@ -32,6 +32,7 @@ memset: cmp r2, #8 @ at least 8 bytes to do? bcc 2f + and r1, r1, #0xFF lsl r3, r1, #8 orr r1, r3 lsl r3, r1, #16 @@ -68,6 +69,7 @@ memset: mov a4, a1 cmp a3, $8 @ at least 8 bytes to do? blo 2f + and a2, a2, #0xFF orr a2, a2, a2, lsl $8 orr a2, a2, a2, lsl $16 1: diff --git a/libc/string/explicit_bzero.c b/libc/string/explicit_bzero.c new file mode 100644 index 000000000..b09e4c1f4 --- /dev/null +++ b/libc/string/explicit_bzero.c @@ -0,0 +1,30 @@ +/* +Copyright © 2005-2020 Rich Felker, et al. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ +#define _BSD_SOURCE +#include <string.h> + +void explicit_bzero(void *d, size_t n) +{ + d = memset(d, 0, n); + __asm__ __volatile__ ("" : : "r"(d) : "memory"); +} diff --git a/libc/string/generic/memmove.c b/libc/string/generic/memmove.c index 1ac018013..5389cc029 100644 --- a/libc/string/generic/memmove.c +++ b/libc/string/generic/memmove.c @@ -23,8 +23,9 @@ #include "memcopy.h" #include "pagecopy.h" -#ifndef __ARCH_HAS_BWD_MEMCPY__ +#if defined(__ARCH_HAS_BWD_MEMCPY__) || defined(__mips__) /* generic-opt memmove assumes memcpy does forward copying! */ +/* also needed for MIPS as its memcpy does not support overlapping regions */ #include "_memcpy_fwd.c" #endif @@ -224,8 +225,11 @@ void *memmove (void *dest, const void *src, size_t len) Reduces the working set. */ if (dstp - srcp >= len) /* *Unsigned* compare! */ { -#ifdef __ARCH_HAS_BWD_MEMCPY__ - /* Backward memcpy implementation can be used */ + /* Calling memcpy() from memmove() should be skipped in two cases: + * a) if arch's memcpy uses a backward copying (SH4) + * b) if arch's memcpy is not fully safe for overlapping regions (MIPS) + */ +#if !defined(__ARCH_HAS_BWD_MEMCPY__) && !defined(__mips__) memcpy(dest, src, len); #else /* Copy from the beginning to the end. 
*/ diff --git a/libc/string/generic/strchr.c b/libc/string/generic/strchr.c index 321d2b8c3..b34884d67 100644 --- a/libc/string/generic/strchr.c +++ b/libc/string/generic/strchr.c @@ -60,22 +60,19 @@ char *strchr (const char *s, int c_in) The 1-bits make sure that carries propagate to the next 0-bit. The 0-bits provide holes for carries to fall into. */ - switch (sizeof (longword)) - { - case 4: magic_bits = 0x7efefeffL; break; - case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break; - default: - abort (); - } - /* Set up a longword, each of whose bytes is C. */ +#if __WORDSIZE == 32 + magic_bits = 0x7efefeffL; charmask = c | (c << 8); charmask |= charmask << 16; - if (sizeof (longword) > 4) - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ - charmask |= (charmask << 16) << 16; - if (sizeof (longword) > 8) - abort (); +#elif __WORDSIZE == 64 + magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; + charmask = c | (c << 8); + charmask |= charmask << 16; + charmask |= (charmask << 16) << 16; +#else + #error unexpected integer size strchr() +#endif /* Instead of the traditional loop which tests each character, we will test a longword at a time. The tricky part is testing diff --git a/libc/string/generic/strchrnul.c b/libc/string/generic/strchrnul.c index d11d9e00d..d9fadc776 100644 --- a/libc/string/generic/strchrnul.c +++ b/libc/string/generic/strchrnul.c @@ -59,22 +59,19 @@ char *strchrnul (const char *s, int c_in) The 1-bits make sure that carries propagate to the next 0-bit. The 0-bits provide holes for carries to fall into. */ - switch (sizeof (longword)) - { - case 4: magic_bits = 0x7efefeffL; break; - case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break; - default: - abort (); - } - /* Set up a longword, each of whose bytes is C. 
*/ +#if __WORDSIZE == 32 + magic_bits = 0x7efefeffL; charmask = c | (c << 8); charmask |= charmask << 16; - if (sizeof (longword) > 4) - /* Do the shift in two steps to avoid a warning if long has 32 bits. */ - charmask |= (charmask << 16) << 16; - if (sizeof (longword) > 8) - abort (); +#elif __WORDSIZE == 64 + magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; + charmask = c | (c << 8); + charmask |= charmask << 16; + charmask |= (charmask << 16) << 16; +#else + #error unexpected integer size strchr() +#endif /* Instead of the traditional loop which tests each character, we will test a longword at a time. The tricky part is testing diff --git a/libc/string/generic/strlen.c b/libc/string/generic/strlen.c index dc383398b..dcc032ddc 100644 --- a/libc/string/generic/strlen.c +++ b/libc/string/generic/strlen.c @@ -28,7 +28,7 @@ size_t strlen (const char *str) { const char *char_ptr; const unsigned long int *longword_ptr; - unsigned long int longword, magic_bits, himagic, lomagic; + unsigned long int longword, himagic, lomagic; /* Handle the first few characters by reading one character at a time. Do this until CHAR_PTR is aligned on a longword boundary. */ @@ -52,14 +52,12 @@ size_t strlen (const char *str) The 1-bits make sure that carries propagate to the next 0-bit. The 0-bits provide holes for carries to fall into. */ - magic_bits = 0x7efefeffL; himagic = 0x80808080L; lomagic = 0x01010101L; if (sizeof (longword) > 4) { /* 64-bit version of the magic. */ /* Do the shift in two steps to avoid a warning if long has 32 bits. */ - magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; himagic = ((himagic << 16) << 16) | himagic; lomagic = ((lomagic << 16) << 16) | lomagic; } @@ -102,22 +100,7 @@ size_t strlen (const char *str) longword = *longword_ptr++; - if ( -#if 0 - /* Add MAGIC_BITS to LONGWORD. */ - (((longword + magic_bits) - - /* Set those bits that were unchanged by the addition. */ - ^ ~longword) - - /* Look at only the hole bits. 
If any of the hole bits - are unchanged, most likely one of the bytes was a - zero. */ - & ~magic_bits) -#else - ((longword - lomagic) & himagic) -#endif - != 0) + if (((longword - lomagic) & himagic) != 0) { /* Which of the bytes was the zero? If none of them were, it was a misfire; continue the search. */ diff --git a/libc/string/kvx/Makefile b/libc/string/kvx/Makefile new file mode 100644 index 000000000..0a95346fd --- /dev/null +++ b/libc/string/kvx/Makefile @@ -0,0 +1,13 @@ +# Makefile for uClibc +# +# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org> +# +# Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. +# + +top_srcdir:=../../../ +top_builddir:=../../../ +all: objs +include $(top_builddir)Rules.mak +include ../Makefile.in +include $(top_srcdir)Makerules diff --git a/libc/string/kvx/memcpy.S b/libc/string/kvx/memcpy.S new file mode 100644 index 000000000..70e8db910 --- /dev/null +++ b/libc/string/kvx/memcpy.S @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2020 Kalray Inc. + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB + * in this tarball. + */ + +#include <sysdep.h> + +.align 16 +ENTRY(memcpy) + cb.deqz $r2? .Lreturn + compd.geu $r3 = $r2, 256 + copyd $r6 = $r0 + ;; + cb.deqz $r3? .Lremaining_256 + ;; + lq.u $r32r33 = 0[$r1] + addd $r2 = $r2, -256 + ;; + lq.u $r34r35 = 16[$r1] + ;; + lq.u $r36r37 = 32[$r1] + srld $r7 = $r2, 8 + ;; + lq.u $r38r39 = 48[$r1] + ;; + lq.u $r40r41 = 64[$r1] + ;; + lq.u $r42r43 = 80[$r1] + ;; + lq.u $r44r45 = 96[$r1] + ;; + lq.u $r46r47 = 112[$r1] + ;; + lq.u $r48r49 = 128[$r1] + ;; + lq.u $r50r51 = 144[$r1] + ;; + lq.u $r52r53 = 160[$r1] + ;; + lq.u $r54r55 = 176[$r1] + ;; + lq.u $r56r57 = 192[$r1] + ;; + lq.u $r58r59 = 208[$r1] + compd.geu $r3 = $r2, 256 + ;; + lq.u $r60r61 = 224[$r1] + ;; + lq.u $r62r63 = 240[$r1] + addd $r1 = $r1, 256 + ;; + cb.deqz $r7? 
.Lstreaming_loop_end + ;; + loopdo $r7, .Lstreaming_loop_end + ;; + sq 0[$r0] = $r32r33 + addd $r2 = $r2, -256 + ;; + lq.u $r32r33 = 0[$r1] + ;; + sq 16[$r0] = $r34r35 + ;; + lq.u $r34r35 = 16[$r1] + ;; + sq 32[$r0] = $r36r37 + ;; + lq.u $r36r37 = 32[$r1] + ;; + sq 48[$r0] = $r38r39 + ;; + lq.u $r38r39 = 48[$r1] + ;; + sq 64[$r0] = $r40r41 + ;; + lq.u $r40r41 = 64[$r1] + ;; + sq 80[$r0] = $r42r43 + ;; + lq.u $r42r43 = 80[$r1] + ;; + sq 96[$r0] = $r44r45 + ;; + lq.u $r44r45 = 96[$r1] + ;; + sq 112[$r0] = $r46r47 + ;; + lq.u $r46r47 = 112[$r1] + ;; + sq 128[$r0] = $r48r49 + ;; + lq.u $r48r49 = 128[$r1] + ;; + sq 144[$r0] = $r50r51 + ;; + lq.u $r50r51 = 144[$r1] + ;; + sq 160[$r0] = $r52r53 + ;; + lq.u $r52r53 = 160[$r1] + ;; + sq 176[$r0] = $r54r55 + ;; + lq.u $r54r55 = 176[$r1] + ;; + sq 192[$r0] = $r56r57 + ;; + lq.u $r56r57 = 192[$r1] + ;; + sq 208[$r0] = $r58r59 + ;; + lq.u $r58r59 = 208[$r1] + ;; + sq 224[$r0] = $r60r61 + ;; + lq.u $r60r61 = 224[$r1] + ;; + sq 240[$r0] = $r62r63 + addd $r0 = $r0, 256 + ;; + lq.u $r62r63 = 240[$r1] + addd $r1 = $r1, 256 + ;; + .Lstreaming_loop_end: + sq 0[$r0] = $r32r33 + ;; + sq 16[$r0] = $r34r35 + ;; + sq 32[$r0] = $r36r37 + ;; + sq 48[$r0] = $r38r39 + ;; + sq 64[$r0] = $r40r41 + ;; + sq 80[$r0] = $r42r43 + ;; + sq 96[$r0] = $r44r45 + ;; + sq 112[$r0] = $r46r47 + ;; + sq 128[$r0] = $r48r49 + ;; + sq 144[$r0] = $r50r51 + ;; + sq 160[$r0] = $r52r53 + ;; + sq 176[$r0] = $r54r55 + ;; + sq 192[$r0] = $r56r57 + ;; + sq 208[$r0] = $r58r59 + ;; + sq 224[$r0] = $r60r61 + ;; + sq 240[$r0] = $r62r63 + addd $r0 = $r0, 256 + ;; +.Lremaining_256: + andd $r11 = $r2, 16 + srld $r7 = $r2, 5 + ;; + cb.deqz $r7? .Lloop_32_end + ;; + loopdo $r7, .Lloop_32_end + ;; + lo $r32r33r34r35 = 0[$r1] + addd $r1 = $r1, 32 + addd $r2 = $r2, -32 + ;; + so 0[$r0] = $r32r33r34r35 + addd $r0 = $r0, 32 + ;; + .Lloop_32_end: + andd $r10 = $r2, 8 + andd $r9 = $r2, 4 + cb.deqz $r11? .Lloop_remaining_16 + lq.u.dnez $r11? 
$r32r33 = 0[$r1] + ;; + sq 0[$r0] = $r32r33 + addd $r1 = $r1, 16 + addd $r0 = $r0, 16 + ;; +.Lloop_remaining_16: + andd $r8 = $r2, 2 + andd $r7 = $r2, 1 + cb.deqz $r10? .Lloop_remaining_8 + ld.dnez $r10? $r32 = 0[$r1] + ;; + sd 0[$r0] = $r32 + addd $r1 = $r1, 8 + addd $r0 = $r0, 8 + ;; +.Lloop_remaining_8: + cb.deqz $r9? .Lloop_remaining_4 + lwz.dnez $r9? $r32 = 0[$r1] + ;; + sw 0[$r0] = $r32 + addd $r1 = $r1, 4 + addd $r0 = $r0, 4 + ;; +.Lloop_remaining_4: + cb.deqz $r8? .Lloop_remaining_2 + lhz.dnez $r8? $r32 = 0[$r1] + ;; + sh 0[$r0] = $r32 + addd $r1 = $r1, 2 + addd $r0 = $r0, 2 + ;; +.Lloop_remaining_2: + lbz.dnez $r7? $r32 = 0[$r1] + ;; + sb.dnez $r7? 0[$r0] = $r32 + ;; +.Lreturn: + copyd $r0 = $r6 + ret + ;; +END(memcpy) + +libc_hidden_def(memcpy) diff --git a/libc/string/kvx/memset.S b/libc/string/kvx/memset.S new file mode 100644 index 000000000..45023a68f --- /dev/null +++ b/libc/string/kvx/memset.S @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2019 Kalray Inc. + * + * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB + * in this tarball. + */ + +#define REPLICATE_BYTE_MASK 0x0101010101010101 +#define MIN_SIZE_FOR_ALIGN 128 + +/* + * Optimized memset for kvx architecture + * + * In order to optimize memset on kvx, we can use various things: + * - conditional stores, which avoid branch penalties + * - store half/word/double/quad/octuple to store up to 32 bytes at a time + * - hardware loop for steady cases. + * + * First, we start by checking if the size is below a minimum size. If so, we + * skip the alignment part. Indeed, the kvx supports misalignment and the + * penalty for letting it do unaligned accesses is lower than the cost of + * realigning. So for small sizes, we don't even bother to realign. + * In order to create the 64-bit pattern, we use sbmm to replicate the pattern + * byte across all bytes of a register in one instruction. + * Once alignment has been reached, we can do the hardware loop using store + * octuple in order to optimize throughput. 
Care must be taken to align hardware + * loops on at least 8 bytes for performance. + * Once the main loop has been done, we finish the copy by checking length to do + * the necessary calls to store remaining bytes. + */ + +#include <sysdep.h> + +.align 16 +ENTRY(memset) + /* Preserve return value */ + copyd $r3 = $r0 + /* Replicate the first pattern byte on all bytes */ + sbmm8 $r32 = $r1, REPLICATE_BYTE_MASK + /* Check if length < MIN_SIZE_FOR_ALIGN */ + compd.geu $r7 = $r2, MIN_SIZE_FOR_ALIGN + /* Invert address to compute what we need to copy to be aligned on 32 bytes */ + negd $r5 = $r0 + ;; + /* Check if we are aligned on 32 bytes */ + andw $r9 = $r0, 0x1F + /* Compute the length that will be copied to align on 32 bytes boundary */ + andw $r6 = $r5, 0x1F + /* + * If size < MIN_SIZE_FOR_ALIGN bytes, directly go to so, it will be done + * unaligned but that is still better than what we can do with sb + */ + cb.deqz $r7? .Laligned_32 + ;; + /* Remove unaligned part from length */ + sbfd $r2 = $r6, $r2 + /* If we are already aligned on 32 bytes, jump to main "so" loop */ + cb.deqz $r9? .Laligned_32 + /* Check if we need to copy 1 byte */ + andw $r4 = $r5, (1 << 0) + ;; + /* If we are not aligned, store byte */ + sb.dnez $r4? [$r0] = $r32 + /* Check if we need to copy 2 bytes */ + andw $r4 = $r5, (1 << 1) + /* Add potentially copied part for next store offset */ + addd $r0 = $r0, $r4 + ;; + sh.dnez $r4? [$r0] = $r32 + /* Check if we need to copy 4 bytes */ + andw $r4 = $r5, (1 << 2) + addd $r0 = $r0, $r4 + ;; + sw.dnez $r4? [$r0] = $r32 + /* Check if we need to copy 8 bytes */ + andw $r4 = $r5, (1 << 3) + addd $r0 = $r0, $r4 + /* Copy second part of pattern for sq */ + copyd $r33 = $r32 + ;; + sd.dnez $r4? [$r0] = $r32 + /* Check if we need to copy 16 bytes */ + andw $r4 = $r5, (1 << 4) + addd $r0 = $r0, $r4 + ;; + sq.dnez $r4? 
[$r0] = $r32r33 + addd $r0 = $r0, $r4 + ;; +.Laligned_32: + /* Copy second part of pattern for sq */ + copyd $r33 = $r32 + /* Prepare amount of data for 32 bytes store */ + srld $r10 = $r2, 5 + nop + nop + ;; + copyq $r34r35 = $r32, $r33 + /* Remaining bytes for 16 bytes store */ + andw $r8 = $r2, (1 << 4) + make $r11 = 32 + /* Check if there is enough data for 32 bytes store */ + cb.deqz $r10? .Laligned_32_done + ;; + loopdo $r10, .Laligned_32_done + ;; + so 0[$r0] = $r32r33r34r35 + addd $r0 = $r0, $r11 + ;; + .Laligned_32_done: + /* + * Now that we have handled all the aligned bytes using 'so', we can + * handle the remainder of the length using stores of decreasing size. + * We also exploit the fact we are aligned to simply check remaining + * size */ + sq.dnez $r8? [$r0] = $r32r33 + addd $r0 = $r0, $r8 + /* Remaining bytes for 8 bytes store */ + andw $r8 = $r2, (1 << 3) + cb.deqz $r2? .Lmemset_done + ;; + sd.dnez $r8? [$r0] = $r32 + addd $r0 = $r0, $r8 + /* Remaining bytes for 4 bytes store */ + andw $r8 = $r2, (1 << 2) + ;; + sw.dnez $r8? [$r0] = $r32 + addd $r0 = $r0, $r8 + /* Remaining bytes for 2 bytes store */ + andw $r8 = $r2, (1 << 1) + ;; + sh.dnez $r8? [$r0] = $r32 + addd $r0 = $r0, $r8 + ;; + sb.odd $r2? 
[$r0] = $r32 + /* Restore original value */ + copyd $r0 = $r3 + ret + ;; +.Lmemset_done: + /* Restore original value */ + copyd $r0 = $r3 + ret + ;; +END(memset) + +libc_hidden_def(memset) diff --git a/libc/string/strcasestr.c b/libc/string/strcasestr.c index 3334086bf..8f57cc0a3 100644 --- a/libc/string/strcasestr.c +++ b/libc/string/strcasestr.c @@ -16,7 +16,7 @@ char *strcasestr(const char *s1, const char *s2) #if 1 do { if (!*p) { - return (char *) s1;; + return (char *) s1; } if ((*p == *s) || (tolower(*((unsigned char *)p)) == tolower(*((unsigned char *)s))) diff --git a/libc/string/strstr.c b/libc/string/strstr.c index 7e2a64e7d..bf56b9c12 100644 --- a/libc/string/strstr.c +++ b/libc/string/strstr.c @@ -22,7 +22,7 @@ Wchar *Wstrstr(const Wchar *s1, const Wchar *s2) do { if (!*p) { - return (Wchar *) s1;; + return (Wchar *) s1; } if (*p == *s) { ++p; |