From 663b8a0497c40a20668258bd69db13924c569c41 Mon Sep 17 00:00:00 2001
From: Pavel Kozlov
Date: Fri, 7 Oct 2022 13:43:45 +0400
Subject: arc: add optimized string functions for ARCv3

Add the ability to use optimized versions of string functions for ARCv3
32-bit CPUs via the UCLIBC_HAS_STRING_ARCH_OPT option. Add optimized
memcpy/memset/memcmp code for ARCv3 CPUs based on code from newlib, and
adapt the existing optimized strchr/strcmp/strcpy/strlen for ARCv3.

Link to the Synopsys newlib repo with code for ARCv3 on GitHub:
https://github.com/foss-for-synopsys-dwc-arc-processors/newlib

Signed-off-by: Pavel Kozlov
---
 libc/string/arc/memcmp.S     | 94 +++++++++++++++++++++++++++++++++++++++++++-
 libc/string/arc/memcpy.S     | 65 +++++++++++++++++++++++++-----
 libc/string/arc/memset.S     | 61 +++++++++++++++++++++++-----
 libc/string/arc/strchr.S     | 25 ++++++------
 libc/string/arc/strcmp.S     | 21 +++++-----
 libc/string/arc/strlen.S     |  7 ++--
 libc/sysdeps/linux/arc/asm.h | 39 ++++++++++++++++++
 7 files changed, 267 insertions(+), 45 deletions(-)

diff --git a/libc/string/arc/memcmp.S b/libc/string/arc/memcmp.S
index a60757e7a..20122a296 100644
--- a/libc/string/arc/memcmp.S
+++ b/libc/string/arc/memcmp.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
  */
@@ -17,6 +17,8 @@
 #endif
 
 ENTRY(memcmp)
+
+#if defined(__ARC700__) || defined(__ARCHS__)
 	or	r12,r0,r1
 	asl_s	r12,r12,30
 	sub	r3,r2,1
@@ -149,6 +151,96 @@ ENTRY(memcmp)
 .Lnil:
 	j_s.d	[blink]
 	mov	r0,0
+
+#elif defined(__ARC64_ARCH32__)
+	;; Based on Synopsys code from newlib's arc64/memcmp.S
+	cmp	r2, 32
+	bls.d	@.L_compare_1_bytes
+	mov	r3, r0		; "r0" will be used as return value
+
+	lsr	r12, r2, 4	; counter for 16-byte chunks
+	xor	r13, r13, r13	; the mask showing unequal registers
+
+.L_compare_16_bytes:
+	ld.ab	r4, [r3, +4]
+	ld.ab	r5, [r1, +4]
+	ld.ab	r6, [r3, +4]
+	ld.ab	r7, [r1, +4]
+	ld.ab	r8, [r3, +4]
+	ld.ab	r9, [r1, +4]
+	ld.ab	r10, [r3, +4]
+	ld.ab	r11, [r1, +4]
+	xor.f	0, r4, r5
+	xor.ne	r13, r13, 0b0001
+	xor.f	0, r6, r7
+	xor.ne	r13, r13, 0b0010
+	xor.f	0, r8, r9
+	xor.ne	r13, r13, 0b0100
+	xor.f	0, r10, r11
+	xor.ne	r13, r13, 0b1000
+	brne	r13, 0, @.L_unequal_find
+	dbnz	r12, @.L_compare_16_bytes
+
+	;; Adjusting the pointers because of the extra loads at the end
+	sub	r1, r1, 4
+	sub	r3, r3, 4
+	bmsk_s	r2, r2, 3	; any remaining bytes to compare
+
+.L_compare_1_bytes:
+	cmp	r2, 0
+	jeq.d	[blink]
+	xor_s	r0, r0, r0
+
+2:
+	ldb.ab	r4, [r3, +1]
+	ldb.ab	r5, [r1, +1]
+	sub.f	r0, r4, r5
+	jne	[blink]
+	dbnz	r2, @2b
+	j_s	[blink]
+
+	;; At this point, we want to find the _first_ comparison that marked the
+	;; inequality of "lhs" and "rhs"
+.L_unequal_find:
+	ffs	r13, r13
+	asl	r13, r13, 2
+	bi	[r13]
+.L_unequal_r4r5:
+	mov	r1, r4
+	b.d	@.L_diff_byte_in_regs
+	mov	r2, r5
+	nop
+.L_unequal_r6r7:
+	mov	r1, r6
+	b.d	@.L_diff_byte_in_regs
+	mov	r2, r7
+	nop
+.L_unequal_r8r9:
+	mov	r1, r8
+	b.d	@.L_diff_byte_in_regs
+	mov	r2, r9
+	nop
+.L_unequal_r10r11:
+	mov	r1, r10
+	mov	r2, r11
+
+	;; fall-through
+	;; If we're here, that means the two operands are not equal.
+.L_diff_byte_in_regs:
+	xor	r0, r1, r2
+	ffs	r0, r0
+	and	r0, r0, 0x18
+	lsr	r1, r1, r0
+	lsr	r2, r2, r0
+	bmsk_s	r1, r1, 7
+	bmsk_s	r2, r2, 7
+	j_s.d	[blink]
+	sub	r0, r1, r2
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
+
 END(memcmp)
 libc_hidden_def(memcmp)
diff --git a/libc/string/arc/memcpy.S b/libc/string/arc/memcpy.S
index 69d7220b8..153083765 100644
--- a/libc/string/arc/memcpy.S
+++ b/libc/string/arc/memcpy.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
  */
@@ -7,13 +7,9 @@
 
 #include <sysdep.h>
 
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
-
 ENTRY(memcpy)
 
-#ifdef __ARC700__
+#if defined(__ARC700__)
 	/* This memcpy implementation does not support objects of 1GB or larger -
 	   the check for alignment does not work then.  */
 	/* We assume that most sources and destinations are aligned, and
@@ -73,9 +69,9 @@ ENTRY(memcpy)
 .Lendbloop:
 	j_s.d	[blink]
 	stb	r12,[r5,0]
-#endif /* __ARC700__ */
 
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
+
 #ifdef __LITTLE_ENDIAN__
 # define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
 # define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
@@ -299,7 +295,58 @@ ENTRY(memcpy)
 	stb.ab	r6, [r3,1]
 .Lcopybytewise_3:
 	j	[blink]
-#endif /* __ARCHS__ */
+
+#elif defined(__ARC64_ARCH32__)
+	;; Based on Synopsys code from newlib's arc64/memcpy.S
+	lsr.f	r11, r2, 4	; counter for 16-byte chunks
+	beq.d	@.L_write_15_bytes
+	mov	r3, r0		; work on a copy of "r0"
+
+.L_write_16_bytes:
+#if defined(__ARC64_LL64__)
+	ldd.ab	r4, [r1, 8]
+	ldd.ab	r6, [r1, 8]
+	std.ab	r4, [r3, 8]
+	std.ab	r6, [r3, 8]
+	dbnz	r11, @.L_write_16_bytes
+#else
+	ld.ab	r4, [r1, 4]
+	ld.ab	r5, [r1, 4]
+	ld.ab	r6, [r1, 4]
+	ld.ab	r7, [r1, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r5, [r3, 4]
+	st.ab	r6, [r3, 4]
+	dbnz.d	r11, @.L_write_16_bytes
+	st.ab	r7, [r3, 4]
+#endif
+	bmsk_s	r2, r2, 3
+
+.L_write_15_bytes:
+	bbit0.d	r2, 1, @1f
+	lsr	r11, r2, 2
+	ldh.ab	r4, [r1, 2]
+	sth.ab	r4, [r3, 2]
+1:
+	bbit0.d	r2, 0, @1f
+	xor	r11, r11, 3
+	ldb.ab	r4, [r1, 1]
+	stb.ab	r4, [r3, 1]
+1:
+	asl	r11, r11, 1
+	bi	[r11]
+	ld.ab	r4,[r1, 4]
+	st.ab	r4,[r3, 4]
+	ld.ab	r4,[r1, 4]
+	st.ab	r4,[r3, 4]
+	ld	r4,[r1]
+	st	r4,[r3]
+
+	j_s	[blink]
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
 
 END(memcpy)
 libc_hidden_def(memcpy)
diff --git a/libc/string/arc/memset.S b/libc/string/arc/memset.S
index 0b74ddc7f..5aa5d6c65 100644
--- a/libc/string/arc/memset.S
+++ b/libc/string/arc/memset.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
  */
@@ -7,13 +7,9 @@
 
 #include <sysdep.h>
 
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
-
 ENTRY(memset)
 
-#ifdef __ARC700__
+#if defined(__ARC700__)
 
 #define SMALL	7 /* Must be at least 6 to deal with alignment/loop issues.  */
 	mov_s	r4,r0
@@ -52,9 +48,8 @@ ENTRY(memset)
 	stb.ab	r1,[r4,1]
 .Ltiny_end:
 	j_s	[blink]
-#endif /* __ARC700__ */
 
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
 #ifdef DONT_USE_PREALLOC
 #define PREWRITE(A,B)	prefetchw [(A),(B)]
 #else
@@ -156,7 +151,55 @@ ENTRY(memset)
 
 .Lcopy3bytes:
 	j	[blink]
-#endif /* __ARCHS__ */
+
+#elif defined(__ARC64_ARCH32__)
+	;; Based on Synopsys code from newlib's arc64/memset.S
+
+	;; Assemble the bytes to 32-bit words
+	bmsk_s	r1, r1, 7	; treat it like unsigned char
+	lsl8	r3, r1
+	or_s	r1, r1, r3
+	lsl16	r3, r1
+	or	r6, r1, r3
+	mov	r7,r6
+
+	lsr.f	r5, r2, 4	; counter for 16-byte chunks
+	beq.d	@.L_write_15_bytes
+	mov	r4, r0		; work on a copy of "r0"
+
+.L_write_16_bytes:
+#if defined(__ARC64_LL64__)
+	std.ab	r6, [r4, 8]
+	std.ab	r6, [r4, 8]
+	dbnz	r5, @.L_write_16_bytes
+#else
+	st.ab	r6, [r4, 4]
+	st.ab	r6, [r4, 4]
+	st.ab	r6, [r4, 4]
+	dbnz.d	r5, @.L_write_16_bytes
+	st.ab	r6, [r4, 4]
+#endif
+	bmsk_s	r2, r2, 3
+
+.L_write_15_bytes:
+	bbit0.d	r2, 1, @1f
+	lsr	r3, r2, 2
+	sth.ab	r6, [r4, 2]
+1:
+	bbit0.d	r2, 0, @1f
+	xor	r3, r3, 3
+	stb.ab	r6, [r4, 1]
+1:
+	bi	[r3]
+	st.ab	r6,[r4, 4]
+	st.ab	r6,[r4, 4]
+	st.ab	r6,[r4, 4]
+
+	j_s	[blink]
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
 
 END(memset)
 libc_hidden_def(memset)
diff --git a/libc/string/arc/strchr.S b/libc/string/arc/strchr.S
index 443993589..df25eb3be 100644
--- a/libc/string/arc/strchr.S
+++ b/libc/string/arc/strchr.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
  */
@@ -7,6 +7,7 @@
 
 #include <features.h>
 #include <sysdep.h>
+#include <asm.h>
 
 /* ARC700 has a relatively long pipeline and branch prediction, so we want
    to avoid branches that are hard to predict.  On the other hand, the
@@ -21,7 +22,7 @@ ENTRY(strchr)
 	mov_s	r3,0x01010101
 	breq.d	r2,r0,.Laligned
 	asl	r4,r5,16
-	sub_s	r0,r0,r2
+	SUBR_S	r0,r0,r2
 	asl	r7,r2,3
 	ld_s	r2,[r0]
 #ifdef __LITTLE_ENDIAN__
@@ -77,10 +78,10 @@ ENTRY(strchr)
 	sub	r3,r7,1
 	bic	r3,r3,r7
 	norm	r2,r3
-	sub_s	r0,r0,1
-	asr_s	r2,r2,3
+	SUBR_S	r0,r0,1
+	ASRR_S	r2,r2,3
 	j.d	[blink]
-	sub_s	r0,r0,r2
+	SUBR_S	r0,r0,r2
 
 	.balign	4
 .Lfound0_ua:
@@ -90,13 +91,13 @@ ENTRY(strchr)
 	bic	r3,r3,r6
 	and	r2,r3,r4
 	or_s	r12,r12,r2
-	sub_s	r3,r12,1
+	SUBR_S	r3,r12,1
 	bic_s	r3,r3,r12
 	norm	r3,r3
-	add_s	r0,r0,3
-	asr_s	r12,r3,3
+	ADDR_S	r0,r0,3
+	ASRR_S	r12,r3,3
 	asl.f	0,r2,r3
-	sub_s	r0,r0,r12
+	SUBR_S	r0,r0,r12
 	j_s.d	[blink]
 	mov.pl	r0,0
 #else /* BIG ENDIAN */
@@ -106,10 +107,10 @@ ENTRY(strchr)
 	bic	r2,r7,r6
 .Lfound_char_b:
 	norm	r2,r2
-	sub_s	r0,r0,4
+	SUBR_S	r0,r0,4
 	asr_s	r2,r2,3
 	j.d	[blink]
-	add_s	r0,r0,r2
+	ADDR_S	r0,r0,r2
 
 .Lfound0_ua:
 	mov_s	r3,r7
@@ -126,7 +127,7 @@ ENTRY(strchr)
 	add.pl	r3,r3,1
 	asr_s	r12,r3,3
 	asl.f	0,r2,r3
-	add_s	r0,r0,r12
+	ADDR_S	r0,r0,r12
 	j_s.d	[blink]
 	mov.mi	r0,0
 #endif /* ENDIAN */
diff --git a/libc/string/arc/strcmp.S b/libc/string/arc/strcmp.S
index ad38d9e00..3f64ac421 100644
--- a/libc/string/arc/strcmp.S
+++ b/libc/string/arc/strcmp.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
  */
@@ -7,14 +7,11 @@
 
 #include <features.h>
 #include <sysdep.h>
-
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
+#include <asm.h>
 
 ENTRY(strcmp)
 
-#ifdef __ARC700__
+#if defined(__ARC700__) || defined(__ARC64_ARCH32__)
 	/* This is optimized primarily for the ARC700.
 	   It would be possible to speed up the loops by one cycle / word
 	   respective one cycle / byte by forcing double source 1 alignment, unrolling
@@ -38,7 +35,7 @@ ENTRY(strcmp)
 	breq	r2,r3,.Lwordloop
 #ifdef	__LITTLE_ENDIAN__
 	xor	r0,r2,r3	; mask for difference
-	sub_s	r1,r0,1
+	SUBR_S	r1,r0,1
 	bic_s	r0,r0,r1	; mask for least significant difference bit
 	sub	r1,r5,r0
 	xor	r0,r5,r1	; mask for least significant difference byte
@@ -55,7 +52,7 @@ ENTRY(strcmp)
 .Lfound0:
 	xor	r0,r2,r3	; mask for difference
 	or	r0,r0,r4	; or in zero indicator
-	sub_s	r1,r0,1
+	SUBR_S	r1,r0,1
 	bic_s	r0,r0,r1	; mask for least significant difference bit
 	sub	r1,r5,r0
 	xor	r0,r5,r1	; mask for least significant difference byte
@@ -99,9 +96,8 @@ ENTRY(strcmp)
 .Lcmpend:
 	j_s.d	[blink]
 	sub	r0,r2,r3
-#endif /* __ARC700__ */
 
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
 	or	r2, r0, r1
 	bmsk_s	r2, r2, 1
 	brne	r2, 0, @.Lcharloop
@@ -168,7 +164,10 @@ ENTRY(strcmp)
 .Lcmpend:
 	j_s.d	[blink]
 	sub	r0, r2, r3
-#endif /* __ARCHS__ */
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
 
 END(strcmp)
 libc_hidden_def(strcmp)
diff --git a/libc/string/arc/strlen.S b/libc/string/arc/strlen.S
index 0b9b93815..0d1d3aa4e 100644
--- a/libc/string/arc/strlen.S
+++ b/libc/string/arc/strlen.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
  */
@@ -7,6 +7,7 @@
 
 #include <sysdep.h>
 
+#include <asm.h>
 
 ENTRY(strlen)
 	or	r3,r0,7
 
 	mov	r4,0x01010101	; uses long immediate
 #ifdef	__LITTLE_ENDIAN__
-	asl_s	r1,r0,3
+	ASLR_S	r1,r0,3
 	btst_s	r0,2
 	asl	r7,r4,r1
 	ror	r5,r4
@@ -59,7 +60,7 @@ ENTRY(strlen)
 	sub.ne	r3,r3,4
 	mov.eq	r1,r12
 #ifdef	__LITTLE_ENDIAN__
-	sub_s	r2,r1,1
+	SUBR_S	r2,r1,1
 	bic_s	r2,r2,r1
 	norm	r1,r2
 	sub_s	r0,r0,3
diff --git a/libc/sysdeps/linux/arc/asm.h b/libc/sysdeps/linux/arc/asm.h
index f15dff841..f83075ea1 100644
--- a/libc/sysdeps/linux/arc/asm.h
+++ b/libc/sysdeps/linux/arc/asm.h
@@ -7,6 +7,13 @@
 #ifndef _ARC_ASM_H
 #define _ARC_ASM_H
 
+/*
+ * Some 16-bit instructions were excluded from the ARCv3 ISA.
+ * The following macros handle these changes in one place: they let
+ * existing ARCv2 code stay unchanged, using the 16-bit versions of
+ * the instructions for ARCv2 and the 32-bit versions for ARCv3.
+ */
+
 #if defined (__ARC64_ARCH32__)
 
 .macro PUSHR reg
 	push	\reg
 .endm
 
@@ -25,6 +32,22 @@
 	pop	\reg
 .endm
 
+.macro SUBR_S dst,src1,src2
+	sub	\dst, \src1, \src2
+.endm
+
+.macro ADDR_S dst,src1,src2
+	add	\dst, \src1, \src2
+.endm
+
+.macro ASRR_S dst,src1,src2
+	asr	\dst, \src1, \src2
+.endm
+
+.macro ASLR_S dst,src1,src2
+	asl	\dst, \src1, \src2
+.endm
+
 #elif defined (__ARC64_ARCH64__)
 
 # error ARCv3 64-bit is not supported by uClibc-ng
@@ -47,6 +70,22 @@
 	pop_s	\reg
 .endm
 
+.macro SUBR_S dst,src1,src2
+	sub_s	\dst, \src1, \src2
+.endm
+
+.macro ADDR_S dst,src1,src2
+	add_s	\dst, \src1, \src2
+.endm
+
+.macro ASRR_S dst,src1,src2
+	asr_s	\dst, \src1, \src2
+.endm
+
+.macro ASLR_S dst,src1,src2
+	asl_s	\dst, \src1, \src2
+.endm
+
 #endif
 
 #endif /* _ARC_ASM_H */
--
cgit v1.2.3
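
Note: as a quick illustration of how the asm.h compatibility macros above are
meant to be used, here is a minimal sketch. It is not part of the patch; the
file name and function are hypothetical, and ENTRY/END are assumed to come
from uClibc's arc sysdep.h as in the string sources touched by this change.

	/* sub3.S - hypothetical example of the asm.h macro pattern */
	#include <sysdep.h>
	#include <asm.h>

	ENTRY(example_sub3)
		/* SUBR_S picks the encoding for the target ISA: it expands
		   to the 16-bit "sub_s r0,r0,r1" on ARCv2 (ARC700/ARCHS)
		   and to the 32-bit "sub r0,r0,r1" on ARCv3
		   (__ARC64_ARCH32__), where the short three-operand
		   encoding was dropped.  */
		SUBR_S	r0,r0,r1
		j_s	[blink]
	END(example_sub3)

This is the same pattern the patch applies in strchr.S/strcmp.S/strlen.S:
the sources keep a single spelling and the ISA choice is centralized in
asm.h instead of being scattered through every function.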