Diffstat (limited to 'libc/string/kvx')
| -rw-r--r-- | libc/string/kvx/Makefile |  13 |
| -rw-r--r-- | libc/string/kvx/memcpy.S | 221 |
| -rw-r--r-- | libc/string/kvx/memset.S | 146 |
3 files changed, 380 insertions, 0 deletions
diff --git a/libc/string/kvx/Makefile b/libc/string/kvx/Makefile
new file mode 100644
index 000000000..0a95346fd
--- /dev/null
+++ b/libc/string/kvx/Makefile
@@ -0,0 +1,13 @@
+# Makefile for uClibc
+#
+# Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org>
+#
+# Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
+#
+
+top_srcdir:=../../../
+top_builddir:=../../../
+all: objs
+include $(top_builddir)Rules.mak
+include ../Makefile.in
+include $(top_srcdir)Makerules
diff --git a/libc/string/kvx/memcpy.S b/libc/string/kvx/memcpy.S
new file mode 100644
index 000000000..70e8db910
--- /dev/null
+++ b/libc/string/kvx/memcpy.S
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2020 Kalray Inc.
+ *
+ * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
+ * in this tarball.
+ */
+
+#include <sysdep.h>
+
+.align 16
+ENTRY(memcpy)
+	cb.deqz $r2? .Lreturn
+	compd.geu $r3 = $r2, 256
+	copyd $r6 = $r0
+	;;
+	cb.deqz $r3? .Lremaining_256
+	;;
+	lq.u $r32r33 = 0[$r1]
+	addd $r2 = $r2, -256
+	;;
+	lq.u $r34r35 = 16[$r1]
+	;;
+	lq.u $r36r37 = 32[$r1]
+	srld $r7 = $r2, 8
+	;;
+	lq.u $r38r39 = 48[$r1]
+	;;
+	lq.u $r40r41 = 64[$r1]
+	;;
+	lq.u $r42r43 = 80[$r1]
+	;;
+	lq.u $r44r45 = 96[$r1]
+	;;
+	lq.u $r46r47 = 112[$r1]
+	;;
+	lq.u $r48r49 = 128[$r1]
+	;;
+	lq.u $r50r51 = 144[$r1]
+	;;
+	lq.u $r52r53 = 160[$r1]
+	;;
+	lq.u $r54r55 = 176[$r1]
+	;;
+	lq.u $r56r57 = 192[$r1]
+	;;
+	lq.u $r58r59 = 208[$r1]
+	compd.geu $r3 = $r2, 256
+	;;
+	lq.u $r60r61 = 224[$r1]
+	;;
+	lq.u $r62r63 = 240[$r1]
+	addd $r1 = $r1, 256
+	;;
+	cb.deqz $r7? .Lstreaming_loop_end
+	;;
+	loopdo $r7, .Lstreaming_loop_end
+		;;
+		sq 0[$r0] = $r32r33
+		addd $r2 = $r2, -256
+		;;
+		lq.u $r32r33 = 0[$r1]
+		;;
+		sq 16[$r0] = $r34r35
+		;;
+		lq.u $r34r35 = 16[$r1]
+		;;
+		sq 32[$r0] = $r36r37
+		;;
+		lq.u $r36r37 = 32[$r1]
+		;;
+		sq 48[$r0] = $r38r39
+		;;
+		lq.u $r38r39 = 48[$r1]
+		;;
+		sq 64[$r0] = $r40r41
+		;;
+		lq.u $r40r41 = 64[$r1]
+		;;
+		sq 80[$r0] = $r42r43
+		;;
+		lq.u $r42r43 = 80[$r1]
+		;;
+		sq 96[$r0] = $r44r45
+		;;
+		lq.u $r44r45 = 96[$r1]
+		;;
+		sq 112[$r0] = $r46r47
+		;;
+		lq.u $r46r47 = 112[$r1]
+		;;
+		sq 128[$r0] = $r48r49
+		;;
+		lq.u $r48r49 = 128[$r1]
+		;;
+		sq 144[$r0] = $r50r51
+		;;
+		lq.u $r50r51 = 144[$r1]
+		;;
+		sq 160[$r0] = $r52r53
+		;;
+		lq.u $r52r53 = 160[$r1]
+		;;
+		sq 176[$r0] = $r54r55
+		;;
+		lq.u $r54r55 = 176[$r1]
+		;;
+		sq 192[$r0] = $r56r57
+		;;
+		lq.u $r56r57 = 192[$r1]
+		;;
+		sq 208[$r0] = $r58r59
+		;;
+		lq.u $r58r59 = 208[$r1]
+		;;
+		sq 224[$r0] = $r60r61
+		;;
+		lq.u $r60r61 = 224[$r1]
+		;;
+		sq 240[$r0] = $r62r63
+		addd $r0 = $r0, 256
+		;;
+		lq.u $r62r63 = 240[$r1]
+		addd $r1 = $r1, 256
+		;;
+	.Lstreaming_loop_end:
+	sq 0[$r0] = $r32r33
+	;;
+	sq 16[$r0] = $r34r35
+	;;
+	sq 32[$r0] = $r36r37
+	;;
+	sq 48[$r0] = $r38r39
+	;;
+	sq 64[$r0] = $r40r41
+	;;
+	sq 80[$r0] = $r42r43
+	;;
+	sq 96[$r0] = $r44r45
+	;;
+	sq 112[$r0] = $r46r47
+	;;
+	sq 128[$r0] = $r48r49
+	;;
+	sq 144[$r0] = $r50r51
+	;;
+	sq 160[$r0] = $r52r53
+	;;
+	sq 176[$r0] = $r54r55
+	;;
+	sq 192[$r0] = $r56r57
+	;;
+	sq 208[$r0] = $r58r59
+	;;
+	sq 224[$r0] = $r60r61
+	;;
+	sq 240[$r0] = $r62r63
+	addd $r0 = $r0, 256
+	;;
+.Lremaining_256:
+	andd $r11 = $r2, 16
+	srld $r7 = $r2, 5
+	;;
+	cb.deqz $r7? .Lloop_32_end
+	;;
+	loopdo $r7, .Lloop_32_end
+		;;
+		lo $r32r33r34r35 = 0[$r1]
+		addd $r1 = $r1, 32
+		addd $r2 = $r2, -32
+		;;
+		so 0[$r0] = $r32r33r34r35
+		addd $r0 = $r0, 32
+		;;
+	.Lloop_32_end:
+	andd $r10 = $r2, 8
+	andd $r9 = $r2, 4
+	cb.deqz $r11? .Lloop_remaining_16
+	lq.u.dnez $r11? $r32r33 = 0[$r1]
+	;;
+	sq 0[$r0] = $r32r33
+	addd $r1 = $r1, 16
+	addd $r0 = $r0, 16
+	;;
+.Lloop_remaining_16:
+	andd $r8 = $r2, 2
+	andd $r7 = $r2, 1
+	cb.deqz $r10? .Lloop_remaining_8
+	ld.dnez $r10? $r32 = 0[$r1]
+	;;
+	sd 0[$r0] = $r32
+	addd $r1 = $r1, 8
+	addd $r0 = $r0, 8
+	;;
+.Lloop_remaining_8:
+	cb.deqz $r9? .Lloop_remaining_4
+	lwz.dnez $r9? $r32 = 0[$r1]
+	;;
+	sw 0[$r0] = $r32
+	addd $r1 = $r1, 4
+	addd $r0 = $r0, 4
+	;;
+.Lloop_remaining_4:
+	cb.deqz $r8? .Lloop_remaining_2
+	lhz.dnez $r8? $r32 = 0[$r1]
+	;;
+	sh 0[$r0] = $r32
+	addd $r1 = $r1, 2
+	addd $r0 = $r0, 2
+	;;
+.Lloop_remaining_2:
+	lbz.dnez $r7? $r32 = 0[$r1]
+	;;
+	sb.dnez $r7? 0[$r0] = $r32
+	;;
+.Lreturn:
+	copyd $r0 = $r6
+	ret
+	;;
+END(memcpy)
+
+libc_hidden_def(memcpy)
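
For readers who do not follow KVX bundle syntax, the copy strategy implemented by memcpy.S above can be restated in C. The sketch below is not part of the patch: memcpy_sketch is a hypothetical name, and the memcpy() calls merely stand in for the lq/sq (16-byte) and lo/so (32-byte) wide accesses and for the software-pipelined 256-byte streaming loop.

/* Illustrative only -- not part of the patch. */
#include <stddef.h>
#include <string.h>

void *memcpy_sketch(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Streaming loop: 256 bytes per iteration, done in the assembly
	 * with 16 pipelined lq.u/sq pairs inside a loopdo. */
	while (n >= 256) {
		memcpy(d, s, 256);
		d += 256; s += 256; n -= 256;
	}
	/* 32-byte loop, done with lo/so octuple accesses. */
	while (n >= 32) {
		memcpy(d, s, 32);
		d += 32; s += 32; n -= 32;
	}
	/* Tail: at most one chunk each of 16, 8, 4, 2 and 1 bytes,
	 * handled with conditional (.dnez) loads and stores. */
	for (size_t chunk = 16; chunk >= 1; chunk /= 2) {
		if (n & chunk) {
			memcpy(d, s, chunk);
			d += chunk; s += chunk;
		}
	}
	/* Like the assembly (which saves $r0 in $r6), return the
	 * original destination pointer. */
	return dst;
}
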
diff --git a/libc/string/kvx/memset.S b/libc/string/kvx/memset.S
new file mode 100644
index 000000000..45023a68f
--- /dev/null
+++ b/libc/string/kvx/memset.S
@@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2019 Kalray Inc.
+ *
+ * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
+ * in this tarball.
+ */
+
+#define REPLICATE_BYTE_MASK	0x0101010101010101
+#define MIN_SIZE_FOR_ALIGN	128
+
+/*
+ * Optimized memset for kvx architecture
+ *
+ * In order to optimize memset on kvx, we can use various things:
+ * - conditional store, which avoids branch penalties
+ * - store half/word/double/quad/octuple to store up to 16 bytes at a time
+ * - hardware loop for steady cases.
+ *
+ * First, we start by checking if the size is below a minimum size. If so, we
+ * skip the alignment part. Indeed, the kvx supports misalignment and the
+ * penalty for letting it do unaligned accesses is lower than the cost of
+ * realigning. So for small sizes, we don't even bother to realign.
+ * In order to create the 64-bit pattern, we use sbmm to replicate the pattern
+ * on all bytes of a register in one call.
+ * Once alignment has been reached, we can do the hardware loop using store
+ * octuple in order to optimize throughput. Care must be taken to align hardware
+ * loops on at least 8 bytes for performance.
+ * Once the main loop has been done, we finish the copy by checking length to do
+ * the necessary calls to store remaining bytes.
+ */
+
+#include <sysdep.h>
+
+.align 16
+ENTRY(memset)
+	/* Preserve return value */
+	copyd $r3 = $r0
+	/* Replicate the first pattern byte on all bytes */
+	sbmm8 $r32 = $r1, REPLICATE_BYTE_MASK
+	/* Check if length < MIN_SIZE_FOR_ALIGN */
+	compd.geu $r7 = $r2, MIN_SIZE_FOR_ALIGN
+	/* Invert address to compute what we need to copy to be aligned on 32 bytes */
+	negd $r5 = $r0
+	;;
+	/* Check if we are aligned on 32 bytes */
+	andw $r9 = $r0, 0x1F
+	/* Compute the length that will be copied to align on 32 bytes boundary */
+	andw $r6 = $r5, 0x1F
+	/*
+	 * If size < MIN_SIZE_FOR_ALIGN bytes, directly go to so, it will be done
+	 * unaligned but that is still better than what we can do with sb
+	 */
+	cb.deqz $r7? .Laligned_32
+	;;
+	/* Remove unaligned part from length */
+	sbfd $r2 = $r6, $r2
+	/* If we are already aligned on 32 bytes, jump to main "so" loop */
+	cb.deqz $r9? .Laligned_32
+	/* Check if we need to copy 1 byte */
+	andw $r4 = $r5, (1 << 0)
+	;;
+	/* If we are not aligned, store byte */
+	sb.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 2 bytes */
+	andw $r4 = $r5, (1 << 1)
+	/* Add potentially copied part for next store offset */
+	addd $r0 = $r0, $r4
+	;;
+	sh.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 4 bytes */
+	andw $r4 = $r5, (1 << 2)
+	addd $r0 = $r0, $r4
+	;;
+	sw.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 8 bytes */
+	andw $r4 = $r5, (1 << 3)
+	addd $r0 = $r0, $r4
+	/* Copy second part of pattern for sq */
+	copyd $r33 = $r32
+	;;
+	sd.dnez $r4? [$r0] = $r32
+	/* Check if we need to copy 16 bytes */
+	andw $r4 = $r5, (1 << 4)
+	addd $r0 = $r0, $r4
+	;;
+	sq.dnez $r4? [$r0] = $r32r33
+	addd $r0 = $r0, $r4
+	;;
+.Laligned_32:
+	/* Copy second part of pattern for sq */
+	copyd $r33 = $r32
+	/* Prepare amount of data for 32 bytes store */
+	srld $r10 = $r2, 5
+	nop
+	nop
+	;;
+	copyq $r34r35 = $r32, $r33
+	/* Remaining bytes for 16 bytes store */
+	andw $r8 = $r2, (1 << 4)
+	make $r11 = 32
+	/* Check if there are enough data for 32 bytes store */
+	cb.deqz $r10? .Laligned_32_done
+	;;
+	loopdo $r10, .Laligned_32_done
+		;;
+		so 0[$r0] = $r32r33r34r35
+		addd $r0 = $r0, $r11
+		;;
+	.Laligned_32_done:
+	/*
+	 * Now that we have handled all the aligned bytes using 'so', we can
+	 * handle the remainder of the length using stores by decrementing size.
+	 * We also exploit the fact that we are aligned to simply check the
+	 * remaining size */
+	sq.dnez $r8? [$r0] = $r32r33
+	addd $r0 = $r0, $r8
+	/* Remaining bytes for 8 bytes store */
+	andw $r8 = $r2, (1 << 3)
+	cb.deqz $r2? .Lmemset_done
+	;;
+	sd.dnez $r8? [$r0] = $r32
+	addd $r0 = $r0, $r8
+	/* Remaining bytes for 4 bytes store */
+	andw $r8 = $r2, (1 << 2)
+	;;
+	sw.dnez $r8? [$r0] = $r32
+	addd $r0 = $r0, $r8
+	/* Remaining bytes for 2 bytes store */
+	andw $r8 = $r2, (1 << 1)
+	;;
+	sh.dnez $r8? [$r0] = $r32
+	addd $r0 = $r0, $r8
+	;;
+	sb.odd $r2? [$r0] = $r32
+	/* Restore original value */
+	copyd $r0 = $r3
+	ret
+	;;
+.Lmemset_done:
+	/* Restore original value */
+	copyd $r0 = $r3
+	ret
+	;;
+END(memset)
+
+libc_hidden_def(memset)
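
The comment block at the top of memset.S describes the strategy in prose; the C sketch below restates it. It is not part of the patch: memset_sketch and store_pattern are hypothetical names, the 64-bit multiply stands in for the single sbmm8 instruction that replicates the fill byte, and the 128-byte threshold and 32-byte stores mirror MIN_SIZE_FOR_ALIGN and the 'so' hardware loop.

/* Illustrative only -- not part of the patch. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Write len bytes of the replicated pattern at d, 8 bytes at a time. */
static void store_pattern(unsigned char *d, uint64_t pattern, size_t len)
{
	while (len >= 8) {
		memcpy(d, &pattern, 8);
		d += 8; len -= 8;
	}
	if (len)
		memcpy(d, &pattern, len);
}

void *memset_sketch(void *dst, int c, size_t n)
{
	unsigned char *d = dst;
	/* Replicate the fill byte across a 64-bit word; sbmm8 with
	 * REPLICATE_BYTE_MASK does this in one instruction. */
	uint64_t pattern = (unsigned char)c * 0x0101010101010101ULL;

	/* Only realign for fills of at least MIN_SIZE_FOR_ALIGN (128)
	 * bytes; smaller fills rely on hardware misalignment support. */
	if (n >= 128) {
		/* Bytes needed to reach a 32-byte boundary, stored with
		 * the conditional sb/sh/sw/sd/sq chain in the assembly. */
		size_t head = -(uintptr_t)d & 0x1F;
		store_pattern(d, pattern, head);
		d += head;
		n -= head;
	}
	/* Main hardware loop: 32-byte 'so' stores on the aligned pointer. */
	while (n >= 32) {
		store_pattern(d, pattern, 32);
		d += 32; n -= 32;
	}
	/* Remaining 0..31 bytes: conditional 16/8/4/2/1-byte stores. */
	store_pattern(d, pattern, n);
	/* The assembly preserves the destination in $r3 and restores it. */
	return dst;
}
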
