Diffstat (limited to 'libc/string/csky/cskyv1/memcpy.S')
-rw-r--r-- | libc/string/csky/cskyv1/memcpy.S | 211
1 files changed, 211 insertions, 0 deletions
diff --git a/libc/string/csky/cskyv1/memcpy.S b/libc/string/csky/cskyv1/memcpy.S
new file mode 100644
index 000000000..dfa7f64a4
--- /dev/null
+++ b/libc/string/csky/cskyv1/memcpy.S
@@ -0,0 +1,211 @@
+.macro GET_FRONT_BITS rx ry
+#ifdef __cskyLE__
+    lsr \rx, \ry
+#else
+    lsl \rx, \ry
+#endif
+.endm
+
+.macro GET_AFTER_BITS rx ry
+#ifdef __cskyLE__
+    lsl \rx, \ry
+#else
+    lsr \rx, \ry
+#endif
+.endm
+
+
+#ifdef WANT_WIDE
+# define Wmemcpy wmemcpy
+#else
+# define Wmemcpy memcpy
+#endif
+
+/* void *memcpy(void *dest, const void *src, size_t n); */
+
+    .text
+    .align 2
+    .global Wmemcpy
+    .type Wmemcpy, @function
+Wmemcpy:
+    mov r7, r2
+    cmplti r4, 4                /* if len is less than 4 bytes */
+    jbt .L_copy_by_byte
+
+    mov r6, r2
+    andi r6, 3
+    cmpnei r6, 0
+    jbt .L_dest_not_aligned     /* if dest is not 4-byte aligned */
+.L0:
+    mov r6, r3
+    andi r6, 3
+    cmpnei r6, 0
+    jbt .L_dest_aligned_but_src_not_aligned /* if dest is aligned but src is not */
+
+    cmplti r4, 16               /* dest and src are both aligned */
+    jbt .L_aligned_and_len_less_16bytes /* if len is less than 16 bytes */
+
+    subi sp, 8
+    stw r8, (sp, 0)
+    stw r9, (sp, 4)
+.L_aligned_and_len_larger_16bytes: /* src and dest are both aligned, and len >= 16 bytes */
+    ldw r1, (r3, 0)
+    ldw r5, (r3, 4)
+    ldw r8, (r3, 8)
+    ldw r9, (r3, 12)
+    stw r1, (r7, 0)
+    stw r5, (r7, 4)
+    stw r8, (r7, 8)
+    stw r9, (r7, 12)
+    subi r4, 16
+    addi r3, 16
+    addi r7, 16
+    cmplti r4, 16
+    jbf .L_aligned_and_len_larger_16bytes
+    ldw r8, (sp, 0)
+    ldw r9, (sp, 4)
+    addi sp, 8
+
+.L_aligned_and_len_less_16bytes:
+    cmplti r4, 4
+    jbt .L_copy_by_byte
+    ldw r1, (r3, 0)
+    stw r1, (r7, 0)
+    subi r4, 4
+    addi r3, 4
+    addi r7, 4
+    jbr .L_aligned_and_len_less_16bytes
+
+.L_copy_by_byte:                /* len is less than 4 bytes */
+    cmpnei r4, 0
+    jbf .L_return
+    ldb r1, (r3, 0)
+    stb r1, (r7, 0)
+    subi r4, 1
+    addi r3, 1
+    addi r7, 1
+    jbr .L_copy_by_byte
+
+.L_return:
+    rts
+
+/* If dest is not aligned, we copy some bytes to make dest aligned.
+   Then we check whether src is aligned. */
+
+.L_dest_not_aligned:
+    mov r5, r3                  /* consider the overlapped case */
+    rsub r5, r5, r7
+    abs r5, r5
+    cmplt r5, r4
+    jbt .L_copy_by_byte
+
+.L1:
+    ldb r1, (r3, 0)             /* copy bytes until dest is aligned */
+    stb r1, (r7, 0)
+    addi r6, 1
+    subi r4, 1
+    addi r3, 1
+    addi r7, 1
+    cmpnei r6, 4
+    jbt .L1
+    cmplti r4, 4
+    jbt .L_copy_by_byte
+    jbf .L0                     /* check whether src is aligned */
+
+.L_dest_aligned_but_src_not_aligned:
+    mov r5, r3                  /* consider the overlapped case */
+    rsub r5, r5, r7
+    abs r5, r5
+    cmplt r5, r4
+    jbt .L_copy_by_byte
+
+    bclri r3, 0
+    bclri r3, 1
+    ldw r1, (r3, 0)
+    addi r3, 4
+
+    subi sp, 16
+    stw r11, (sp, 0)
+    stw r12, (sp, 4)
+    stw r13, (sp, 8)
+    movi r5, 8
+    mult r5, r6                 /* r6 holds the misalignment in bytes; r5 = misalignment in bits */
+    mov r12, r5
+    rsubi r5, 31
+    addi r5, 1                  /* r13 = 32 - r12 */
+    mov r13, r5
+
+    cmplti r4, 16
+    jbt .L_not_aligned_and_len_less_16bytes
+
+    stw r8, (sp, 12)
+    subi sp, 8
+    stw r9, (sp, 0)
+    stw r10, (sp, 4)
+.L_not_aligned_and_len_larger_16bytes:
+    ldw r5, (r3, 0)
+    ldw r11, (r3, 4)
+    ldw r8, (r3, 8)
+    ldw r9, (r3, 12)
+
+    GET_FRONT_BITS r1 r12       /* little or big endian? */
+    mov r10, r5
+    GET_AFTER_BITS r5 r13
+    or r5, r1
+
+    GET_FRONT_BITS r10 r12
+    mov r1, r11
+    GET_AFTER_BITS r11 r13
+    or r11, r10
+
+    GET_FRONT_BITS r1 r12
+    mov r10, r8
+    GET_AFTER_BITS r8 r13
+    or r8, r1
+
+    GET_FRONT_BITS r10 r12
+    mov r1, r9
+    GET_AFTER_BITS r9 r13
+    or r9, r10
+
+    stw r5, (r7, 0)
+    stw r11, (r7, 4)
+    stw r8, (r7, 8)
+    stw r9, (r7, 12)
+    subi r4, 16
+    addi r3, 16
+    addi r7, 16
+    cmplti r4, 16
+    jbf .L_not_aligned_and_len_larger_16bytes
+    ldw r9, (sp, 0)
+    ldw r10, (sp, 4)
+    addi sp, 8
+    ldw r8, (sp, 12)
+
+.L_not_aligned_and_len_less_16bytes:
+    cmplti r4, 4
+    jbf .L2
+    rsubi r6, 4                 /* r6 = 4 - misalignment in bytes */
+    subu r3, r6                 /* rewind src to the first uncopied byte */
+    ldw r11, (sp, 0)
+    ldw r12, (sp, 4)
+    ldw r13, (sp, 8)
+    addi sp, 16
+    jbr .L_copy_by_byte
+.L2:
+    ldw r5, (r3, 0)
+    GET_FRONT_BITS r1 r12
+    mov r11, r1
+    mov r1, r5
+    GET_AFTER_BITS r5 r13
+    or r5, r11
+    stw r5, (r7, 0)
+    subi r4, 4
+    addi r3, 4
+    addi r7, 4
+    jbr .L_not_aligned_and_len_less_16bytes
+
+.size Wmemcpy, .-Wmemcpy
+
+libc_hidden_def(Wmemcpy)
+.weak Wmemcpy
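
For readers who want to follow the control flow without reading C-SKY assembly, here is a rough C rendition of the aligned fast path (.L_aligned_and_len_larger_16bytes through .L_copy_by_byte). It is only a sketch of what the assembly above does, not part of uClibc-ng; the function name memcpy_aligned_sketch is invented for illustration, and both pointers are assumed 4-byte aligned as in the asm path that reaches this code.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the aligned fast path: 16 bytes per iteration, then whole
 * words, then trailing bytes.  Illustrative only. */
static void *memcpy_aligned_sketch(void *dest, const void *src, size_t n)
{
    uint32_t *d = dest;                 /* assumed 4-byte aligned */
    const uint32_t *s = src;            /* assumed 4-byte aligned */

    while (n >= 16) {                   /* .L_aligned_and_len_larger_16bytes:
                                           four word loads, four word stores */
        d[0] = s[0]; d[1] = s[1];
        d[2] = s[2]; d[3] = s[3];
        d += 4; s += 4; n -= 16;
    }
    while (n >= 4) {                    /* .L_aligned_and_len_less_16bytes */
        *d++ = *s++;
        n -= 4;
    }
    {
        unsigned char *db = (unsigned char *)d;
        const unsigned char *sb = (const unsigned char *)s;
        while (n--)                     /* .L_copy_by_byte */
            *db++ = *sb++;
    }
    return dest;
}

The subi sp / stw r8, r9 pair around the unrolled loop exists only because the assembly needs two extra callee-saved registers to hold four words in flight; in the C sketch the temporaries are implicit.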
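The more interesting part is the misaligned-source path. The code stores the misalignment in bits in r12 (8 * (src & 3)) and its complement 32 - r12 in r13, then builds each destination word from two adjacent aligned source words: GET_FRONT_BITS shifts the earlier word down, GET_AFTER_BITS shifts the later word up (the shift directions swap on big endian), and the halves are OR-ed together. A little-endian C sketch of that inner loop, with an invented helper name and misalign restricted to 1..3 exactly as in the assembly:

#include <stddef.h>
#include <stdint.h>

/* Sketch of the misaligned-source merge (__cskyLE__ branch): round src down
 * to a word boundary, keep one aligned word of lookahead, and assemble each
 * output word with a shift pair and an OR.  Illustrative only. */
static void copy_from_misaligned(uint32_t *dst, const unsigned char *src,
                                 size_t nwords)
{
    unsigned misalign = (uintptr_t)src & 3;   /* 1, 2 or 3, as in the asm */
    const uint32_t *ws = (const uint32_t *)(src - misalign); /* aligned base */
    unsigned front = 8 * misalign;            /* r12: bits consumed from prev */
    unsigned after = 32 - front;              /* r13: bits taken from next */

    uint32_t prev = *ws++;                    /* lookahead (the ldw before the loop) */
    while (nwords--) {
        uint32_t next = *ws++;
        /* little-endian GET_FRONT_BITS / GET_AFTER_BITS merge */
        *dst++ = (prev >> front) | (next << after);
        prev = next;
    }
}

Like the assembly, which clears the low src bits with bclri and loads one word ahead, this reads whole aligned words, so it may touch a few bytes just outside the requested range within the first and last aligned words. That is safe for aligned word loads on this target, but it is not portable C for arbitrary buffers.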