Diffstat (limited to 'libc/string/kvx')
| -rw-r--r-- | libc/string/kvx/Makefile |  13 |
| -rw-r--r-- | libc/string/kvx/memcpy.S | 221 |
| -rw-r--r-- | libc/string/kvx/memset.S | 146 |
3 files changed, 380 insertions, 0 deletions
diff --git a/libc/string/kvx/Makefile b/libc/string/kvx/Makefile
new file mode 100644
index 000000000..0a95346fd
--- /dev/null
+++ b/libc/string/kvx/Makefile
@@ -0,0 +1,13 @@
+# Makefile for uClibc
+#
+# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org>
+#
+# Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+#
+
+top_srcdir:=../../../
+top_builddir:=../../../
+all: objs
+include $(top_builddir)Rules.mak
+include ../Makefile.in
+include $(top_srcdir)Makerules
diff --git a/libc/string/kvx/memcpy.S b/libc/string/kvx/memcpy.S
new file mode 100644
index 000000000..70e8db910
--- /dev/null
+++ b/libc/string/kvx/memcpy.S
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2020 Kalray Inc.
+ *
+ * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
+ * in this tarball.
+ */
+
+#include <sysdep.h>
+
+.align 16
+ENTRY(memcpy)
+	cb.deqz $r2? .Lreturn
+	compd.geu $r3 = $r2, 256
+	copyd $r6 = $r0
+	;;
+	cb.deqz $r3? .Lremaining_256
+	;;
+	lq.u $r32r33 = 0[$r1]
+	addd $r2 = $r2, -256
+	;;
+	lq.u $r34r35 = 16[$r1]
+	;;
+	lq.u $r36r37 = 32[$r1]
+	srld $r7 = $r2, 8
+	;;
+	lq.u $r38r39 = 48[$r1]
+	;;
+	lq.u $r40r41 = 64[$r1]
+	;;
+	lq.u $r42r43 = 80[$r1]
+	;;
+	lq.u $r44r45 = 96[$r1]
+	;;
+	lq.u $r46r47 = 112[$r1]
+	;;
+	lq.u $r48r49 = 128[$r1]
+	;;
+	lq.u $r50r51 = 144[$r1]
+	;;
+	lq.u $r52r53 = 160[$r1]
+	;;
+	lq.u $r54r55 = 176[$r1]
+	;;
+	lq.u $r56r57 = 192[$r1]
+	;;
+	lq.u $r58r59 = 208[$r1]
+	compd.geu $r3 = $r2, 256
+	;;
+	lq.u $r60r61 = 224[$r1]
+	;;
+	lq.u $r62r63 = 240[$r1]
+	addd $r1 = $r1, 256
+	;;
+	cb.deqz $r7? .Lstreaming_loop_end
+	;;
+	loopdo $r7, .Lstreaming_loop_end
+		;;
+		sq 0[$r0] = $r32r33
+		addd $r2 = $r2, -256
+		;;
+		lq.u $r32r33 = 0[$r1]
+		;;
+		sq 16[$r0] = $r34r35
+		;;
+		lq.u $r34r35 = 16[$r1]
+		;;
+		sq 32[$r0] = $r36r37
+		;;
+		lq.u $r36r37 = 32[$r1]
+		;;
+		sq 48[$r0] = $r38r39
+		;;
+		lq.u $r38r39 = 48[$r1]
+		;;
+		sq 64[$r0] = $r40r41
+		;;
+		lq.u $r40r41 = 64[$r1]
+		;;
+		sq 80[$r0] = $r42r43
+		;;
+		lq.u $r42r43 = 80[$r1]
+		;;
+		sq 96[$r0] = $r44r45
+		;;
+		lq.u $r44r45 = 96[$r1]
+		;;
+		sq 112[$r0] = $r46r47
+		;;
+		lq.u $r46r47 = 112[$r1]
+		;;
+		sq 128[$r0] = $r48r49
+		;;
+		lq.u $r48r49 = 128[$r1]
+		;;
+		sq 144[$r0] = $r50r51
+		;;
+		lq.u $r50r51 = 144[$r1]
+		;;
+		sq 160[$r0] = $r52r53
+		;;
+		lq.u $r52r53 = 160[$r1]
+		;;
+		sq 176[$r0] = $r54r55
+		;;
+		lq.u $r54r55 = 176[$r1]
+		;;
+		sq 192[$r0] = $r56r57
+		;;
+		lq.u $r56r57 = 192[$r1]
+		;;
+		sq 208[$r0] = $r58r59
+		;;
+		lq.u $r58r59 = 208[$r1]
+		;;
+		sq 224[$r0] = $r60r61
+		;;
+		lq.u $r60r61 = 224[$r1]
+		;;
+		sq 240[$r0] = $r62r63
+		addd $r0 = $r0, 256
+		;;
+		lq.u $r62r63 = 240[$r1]
+		addd $r1 = $r1, 256
+		;;
+	.Lstreaming_loop_end:
+	sq 0[$r0] = $r32r33
+	;;
+	sq 16[$r0] = $r34r35
+	;;
+	sq 32[$r0] = $r36r37
+	;;
+	sq 48[$r0] = $r38r39
+	;;
+	sq 64[$r0] = $r40r41
+	;;
+	sq 80[$r0] = $r42r43
+	;;
+	sq 96[$r0] = $r44r45
+	;;
+	sq 112[$r0] = $r46r47
+	;;
+	sq 128[$r0] = $r48r49
+	;;
+	sq 144[$r0] = $r50r51
+	;;
+	sq 160[$r0] = $r52r53
+	;;
+	sq 176[$r0] = $r54r55
+	;;
+	sq 192[$r0] = $r56r57
+	;;
+	sq 208[$r0] = $r58r59
+	;;
+	sq 224[$r0] = $r60r61
+	;;
+	sq 240[$r0] = $r62r63
+	addd $r0 = $r0, 256
+	;;
+.Lremaining_256:
+	andd $r11 = $r2, 16
+	srld $r7 = $r2, 5
+	;;
+	cb.deqz $r7? .Lloop_32_end
+	;;
+	loopdo $r7, .Lloop_32_end
+		;;
+		lo $r32r33r34r35 = 0[$r1]
+		addd $r1 = $r1, 32
+		addd $r2 = $r2, -32
+		;;
+		so 0[$r0] = $r32r33r34r35
+		addd $r0 = $r0, 32
+		;;
+	.Lloop_32_end:
+	andd $r10 = $r2, 8
+	andd $r9 = $r2, 4
+	cb.deqz $r11? .Lloop_remaining_16
+	lq.u.dnez $r11? $r32r33 = 0[$r1]
+	;;
+	sq 0[$r0] = $r32r33
+	addd $r1 = $r1, 16
+	addd $r0 = $r0, 16
+	;;
+.Lloop_remaining_16:
+	andd $r8 = $r2, 2
+	andd $r7 = $r2, 1
+	cb.deqz $r10? .Lloop_remaining_8
+	ld.dnez $r10? $r32 = 0[$r1]
+	;;
+	sd 0[$r0] = $r32
+	addd $r1 = $r1, 8
+	addd $r0 = $r0, 8
+	;;
+.Lloop_remaining_8:
+	cb.deqz $r9? .Lloop_remaining_4
+	lwz.dnez $r9? $r32 = 0[$r1]
+	;;
+	sw 0[$r0] = $r32
+	addd $r1 = $r1, 4
+	addd $r0 = $r0, 4
+	;;
+.Lloop_remaining_4:
+	cb.deqz $r8? .Lloop_remaining_2
+	lhz.dnez $r8? $r32 = 0[$r1]
+	;;
+	sh 0[$r0] = $r32
+	addd $r1 = $r1, 2
+	addd $r0 = $r0, 2
+	;;
+.Lloop_remaining_2:
+	lbz.dnez $r7? $r32 = 0[$r1]
+	;;
+	sb.dnez $r7? 0[$r0] = $r32
+	;;
+.Lreturn:
+	copyd $r0 = $r6
+	ret
+	;;
+END(memcpy)
+
+libc_hidden_def(memcpy)
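
For readers who do not follow KVX bundle syntax, the copy strategy implemented by memcpy.S above can be restated in C. The sketch below is not part of the patch: memcpy_sketch is a hypothetical name, and the memcpy() calls merely stand in for the lq/sq (16-byte) and lo/so (32-byte) wide accesses and for the software-pipelined 256-byte streaming loop.

/* Illustrative only -- not part of the patch. */
#include <stddef.h>
#include <string.h>

void *memcpy_sketch(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Streaming loop: 256 bytes per iteration, done in the assembly
	 * with 16 pipelined lq.u/sq pairs inside a loopdo. */
	while (n >= 256) {
		memcpy(d, s, 256);
		d += 256; s += 256; n -= 256;
	}
	/* 32-byte loop, done with lo/so octuple accesses. */
	while (n >= 32) {
		memcpy(d, s, 32);
		d += 32; s += 32; n -= 32;
	}
	/* Tail: at most one chunk each of 16, 8, 4, 2 and 1 bytes,
	 * handled with conditional (.dnez) loads and stores. */
	for (size_t chunk = 16; chunk >= 1; chunk /= 2) {
		if (n & chunk) {
			memcpy(d, s, chunk);
			d += chunk; s += chunk;
		}
	}
	/* Like the assembly (which saves $r0 in $r6), return the
	 * original destination pointer. */
	return dst;
}
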
diff --git a/libc/string/kvx/memset.S b/libc/string/kvx/memset.S
new file mode 100644
index 000000000..45023a68f
--- /dev/null
+++ b/libc/string/kvx/memset.S
@@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2019 Kalray Inc.
+ *
+ * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
+ * in this tarball.
+ */
+
+#define REPLICATE_BYTE_MASK	0x0101010101010101
+#define MIN_SIZE_FOR_ALIGN	128
+
+/*
+ * Optimized memset for kvx architecture
+ *
+ * In order to optimize memset on kvx, we can use various things:
+ * - conditional store, which avoids branch penalties
+ * - store half/word/double/quad/octuple to store up to 16 bytes at a time
+ * - hardware loop for steady cases.
+ *
+ * First, we start by checking if the size is below a minimum size. If so, we
+ * skip the alignment part. Indeed, the kvx supports misalignment and the
+ * penalty for letting it do unaligned accesses is lower than the cost of
+ * realigning. So for small sizes, we don't even bother to realign.
+ * In order to create the 64-bit pattern, we use sbmm to replicate the pattern
+ * on all bytes of a register in one call.
+ * Once alignment has been reached, we can do the hardware loop using store
+ * octuple in order to optimize throughput. Care must be taken to align hardware
+ * loops on at least 8 bytes for performance.
+ * Once the main loop has been done, we finish the copy by checking length to do
+ * the necessary calls to store remaining bytes.
+ */
+
+#include <sysdep.h>
+
+.align 16
+ENTRY(memset)
+	/* Preserve return value */
+	copyd $r3 = $r0
+	/* Replicate the first pattern byte on all bytes */
+	sbmm8 $r32 = $r1, REPLICATE_BYTE_MASK
+	/* Check if length < MIN_SIZE_FOR_ALIGN */
+	compd.geu $r7 = $r2, MIN_SIZE_FOR_ALIGN
+	/* Invert address to compute what we need to copy to be aligned on 32 bytes */
+	negd $r5 = $r0
+	;;
+	/* Check if we are aligned on 32 bytes */
+	andw $r9 = $r0, 0x1F
+	/* Compute the length that will be copied to align on 32 bytes boundary */
+	andw $r6 = $r5, 0x1F
+	/*
+	 * If size < MIN_SIZE_FOR_ALIGN bytes, directly go to so, it will be done
+	 * unaligned but that is still better than what we can do with sb
+	 */
+	cb.deqz $r7? .Laligned_32
+	;;
+	/* Remove unaligned part from length */
+	sbfd $r2 = $r6, $r2
+	/* If we are already aligned on 32 bytes, jump to main "so" loop */
+	cb.deqz $r9? .Laligned_32
+	/* Check if we need to copy 1 byte */
+	andw $r4 = $r5, (1 << 0)
+	;;
+	/* If we are not aligned, store byte */
+	sb.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 2 bytes */
+	andw $r4 = $r5, (1 << 1)
+	/* Add potentially copied part for next store offset */
+	addd $r0 = $r0, $r4
+	;;
+	sh.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 4 bytes */
+	andw $r4 = $r5, (1 << 2)
+	addd $r0 = $r0, $r4
+	;;
+	sw.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 8 bytes */
+	andw $r4 = $r5, (1 << 3)
+	addd $r0 = $r0, $r4
+	/* Copy second part of pattern for sq */
+	copyd $r33 = $r32
+	;;
+	sd.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 16 bytes */
+	andw $r4 = $r5, (1 << 4)
+	addd $r0 = $r0, $r4
+	;;
+	sq.dnez $r4? [$r0] = $r32r33
+	addd $r0 = $r0, $r4
+	;;
+.Laligned_32:
+	/* Copy second part of pattern for sq */
+	copyd $r33 = $r32
+	/* Prepare amount of data for 32 bytes store */
+	srld $r10 = $r2, 5
+	nop
+	nop
+	;;
+	copyq $r34r35 = $r32, $r33
+	/* Remaining bytes for 16 bytes store */
+	andw $r8 = $r2, (1 << 4)
+	make $r11 = 32
+	/* Check if there are enough data for 32 bytes store */
+	cb.deqz $r10? .Laligned_32_done
+	;;
+	loopdo $r10, .Laligned_32_done
+		;;
+		so 0[$r0] = $r32r33r34r35
+		addd $r0 = $r0, $r11
+		;;
+	.Laligned_32_done:
+	/*
+	 * Now that we have handled all the aligned bytes using 'so', we can
+	 * handle the remainder of the length using stores by decrementing size.
+	 * We also exploit the fact that we are aligned to simply check the
+	 * remaining size */
+	sq.dnez $r8? [$r0] = $r32r33
+	addd $r0 = $r0, $r8
+	/* Remaining bytes for 8 bytes store */
+	andw $r8 = $r2, (1 << 3)
+	cb.deqz $r2? .Lmemset_done
+	;;
+	sd.dnez $r8? [$r0] = $r32
+	addd $r0 = $r0, $r8
+	/* Remaining bytes for 4 bytes store */
+	andw $r8 = $r2, (1 << 2)
+	;;
+	sw.dnez $r8? [$r0] = $r32
+	addd $r0 = $r0, $r8
+	/* Remaining bytes for 2 bytes store */
+	andw $r8 = $r2, (1 << 1)
+	;;
+	sh.dnez $r8? [$r0] = $r32
+	addd $r0 = $r0, $r8
+	;;
+	sb.odd $r2? [$r0] = $r32
+	/* Restore original value */
+	copyd $r0 = $r3
+	ret
+	;;
+.Lmemset_done:
+	/* Restore original value */
+	copyd $r0 = $r3
+	ret
+	;;
+END(memset)
+
+libc_hidden_def(memset)
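
The comment block at the top of memset.S describes the strategy in prose; the C sketch below restates it. It is not part of the patch: memset_sketch and store_pattern are hypothetical names, the 64-bit multiply stands in for the single sbmm8 instruction that replicates the fill byte, and the 128-byte threshold and 32-byte stores mirror MIN_SIZE_FOR_ALIGN and the 'so' hardware loop.

/* Illustrative only -- not part of the patch. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Write len bytes of the replicated pattern at d, 8 bytes at a time. */
static void store_pattern(unsigned char *d, uint64_t pattern, size_t len)
{
	while (len >= 8) {
		memcpy(d, &pattern, 8);
		d += 8; len -= 8;
	}
	if (len)
		memcpy(d, &pattern, len);
}

void *memset_sketch(void *dst, int c, size_t n)
{
	unsigned char *d = dst;
	/* Replicate the fill byte across a 64-bit word; sbmm8 with
	 * REPLICATE_BYTE_MASK does this in one instruction. */
	uint64_t pattern = (unsigned char)c * 0x0101010101010101ULL;

	/* Only realign for fills of at least MIN_SIZE_FOR_ALIGN (128)
	 * bytes; smaller fills rely on hardware misalignment support. */
	if (n >= 128) {
		/* Bytes needed to reach a 32-byte boundary, stored with
		 * the conditional sb/sh/sw/sd/sq chain in the assembly. */
		size_t head = -(uintptr_t)d & 0x1F;
		store_pattern(d, pattern, head);
		d += head;
		n -= head;
	}
	/* Main hardware loop: 32-byte 'so' stores on the aligned pointer. */
	while (n >= 32) {
		store_pattern(d, pattern, 32);
		d += 32; n -= 32;
	}
	/* Remaining 0..31 bytes: conditional 16/8/4/2/1-byte stores. */
	store_pattern(d, pattern, n);
	/* The assembly preserves the destination in $r3 and restores it. */
	return dst;
}
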
