summaryrefslogtreecommitdiff
path: root/libc/string/csky/cskyv2/abiv2_memcpy.S
blob: 4bbb7a37d26976926be075a705830d01d4f7556a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
/*
 * Copyright (C) 2017 Hangzhou C-SKY Microsystems co.,ltd.
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
 * in this tarball.
 */

/*
 * GET_FRONT_BITS word nbits
 *
 * Shift register \word in place by the bit count held in register \nbits:
 * logical right when __cskyLE__ is defined, logical left otherwise.
 * Wmemcpy applies it to the previously loaded source word so that the
 * bytes still needed from that word end up at the lowest-address end of
 * the word being assembled.
 */
.macro      GET_FRONT_BITS word nbits
#ifndef     __cskyLE__
    lsl     \word, \nbits
#else
    lsr     \word, \nbits
#endif
.endm

/*
 * GET_AFTER_BITS word nbits
 *
 * Counterpart of GET_FRONT_BITS: shift register \word in place by the bit
 * count held in register \nbits, logical left when __cskyLE__ is defined,
 * logical right otherwise.  Wmemcpy applies it to the newly loaded source
 * word so that its leading bytes land at the highest-address end of the
 * word being assembled.
 */
.macro      GET_AFTER_BITS word nbits
#ifndef     __cskyLE__
    lsr     \word, \nbits
#else
    lsl     \word, \nbits
#endif
.endm


/*
 * Pick the symbol name the routine is emitted under: memcpy by default,
 * wmemcpy when WANT_WIDE is defined.
 * NOTE(review): the routine counts its length argument in bytes; confirm
 * that this is what a WANT_WIDE build is expected to provide.
 */
#ifndef WANT_WIDE
# define Wmemcpy memcpy
#else
# define Wmemcpy wmemcpy
#endif

/*
 * void *memcpy(void *dest, const void *src, size_t n);
 *
 * Register roles, as used below:
 *   r0       dest on entry; never written, so it still holds dest at rts
 *   r1       running source pointer
 *   r2       number of bytes still to copy
 *   r3       running destination pointer (copy of r0)
 *   r12      low two bits of the pointer currently being examined
 *   r13      scratch (pointer distance, shift-count computation)
 *   r18-r23  data in flight
 *   r24      8 * (src & 3), shift count passed to GET_FRONT_BITS
 *   r25      32 - r24,      shift count passed to GET_AFTER_BITS
 *
 * Strategy: short copies go byte by byte.  Otherwise dest is first brought
 * to a 4-byte boundary with byte copies; then, if src is aligned as well,
 * data moves in 16-byte and 4-byte steps, and if src is not aligned each
 * destination word is assembled from two neighbouring aligned source words.
 */

    .text
    .align  2
    .global Wmemcpy
    .type   Wmemcpy, @function
Wmemcpy:
    mov     r3, r0                                   /* work on a copy so r0 keeps dest */
    cmplti  r2, 4                                    /* fewer than 4 bytes: byte loop only */
    jbt     .L_copy_by_byte

    mov     r12, r0
    andi    r12, 3
    bnez    r12, .L_dest_not_aligned                 /* dest is not 4-byte aligned */
.L0:                                                 /* here dest (r3) is 4-byte aligned */
    mov     r12, r1
    andi    r12, 3
    bnez    r12, .L_dest_aligned_but_src_not_aligned /* dest aligned, src not */

    cmplti  r2, 16                                   /* both pointers aligned */
    jbt     .L_aligned_and_len_less_16bytes          /* fewer than 16 bytes left */

.L_aligned_and_len_larger_16bytes:                   /* both aligned, len >= 16: 4 words per pass */
    ldw     r18, (r1, 0)
    ldw     r19, (r1, 4)
    ldw     r20, (r1, 8)
    ldw     r21, (r1, 12)
    stw     r18, (r3, 0)
    stw     r19, (r3, 4)
    stw     r20, (r3, 8)
    stw     r21, (r3, 12)
    subi    r2, 16
    addi    r1, 16
    addi    r3, 16
    cmplti  r2, 16
    jbf     .L_aligned_and_len_larger_16bytes        /* repeat while len >= 16 */

.L_aligned_and_len_less_16bytes:                     /* both aligned, len < 16: 1 word per pass */
    cmplti  r2, 4
    jbt     .L_copy_by_byte                          /* fewer than 4 bytes left */
    ldw     r18, (r1, 0)
    stw     r18, (r3, 0)
    subi    r2, 4
    addi    r1, 4
    addi    r3, 4
    jbr     .L_aligned_and_len_less_16bytes

.L_copy_by_byte:                                     /* copy the remaining r2 bytes one at a time */
    cmpnei  r2, 0
    jbf     .L_return                                /* nothing left */
    ldb     r18, (r1, 0)
    stb     r18, (r3, 0)
    subi    r2, 1
    addi    r1, 1
    addi    r3, 1
    jbr     .L_copy_by_byte

.L_return:
    rts                                              /* r0 still holds the original dest */

/* dest is not 4-byte aligned: copy single bytes until it is, then go back
   to .L0 to examine the alignment of src. */

.L_dest_not_aligned:
    rsub    r13, r1, r3                              /* overlap guard: distance between r1 and r3 */
    abs     r13, r13
    cmplt   r13, r2                                  /* closer together than len? */
    jbt     .L_copy_by_byte                          /* yes: stay with the byte loop */

.L1:
    ldb     r18, (r1, 0)                             /* one byte towards an aligned dest */
    stb     r18, (r3, 0)
    addi    r12, 1                                   /* r12 started as dest & 3 */
    subi    r2, 1
    addi    r1, 1
    addi    r3, 1
    cmpnei  r12, 4                                   /* r12 == 4 means dest is aligned now */
    jbt     .L1
    cmplti  r2, 4
    jbt     .L_copy_by_byte                          /* fewer than 4 bytes left */
    jbr     .L0                                      /* otherwise examine src alignment */

/* dest is aligned, src is not.  r12 = src & 3 on entry. */

.L_dest_aligned_but_src_not_aligned:
    rsub    r13, r1, r3                              /* overlap guard, same test as above */
    abs     r13, r13
    cmplt   r13, r2
    jbt     .L_copy_by_byte

    bclri   r1, 0                                    /* round r1 down to a word boundary */
    bclri   r1, 1
    ldw     r18, (r1, 0)                             /* r18 = aligned word containing the first source byte */
    addi    r1, 4                                    /* r1 -> next aligned source word */

    movi    r13, 8
    mult    r13, r12                                 /* r13 = 8 * (src & 3): misalignment in bits */
    mov     r24, r13                                 /* r24 = shift count for GET_FRONT_BITS */
    rsubi   r13, 32                                  /* r13 = 32 - r13 */
    mov     r25, r13                                 /* r25 = shift count for GET_AFTER_BITS */

    cmplti  r2, 16
    jbt     .L_not_aligned_and_len_less_16bytes

/* Loop invariant: r18 holds the unshifted aligned source word that precedes
   the word r1 points at.  Each output word is the shifted previous word
   OR-ed with the oppositely shifted next word; r18 and r19 alternate as the
   holder of the previous word. */

.L_not_aligned_and_len_larger_16bytes:
    ldw     r20, (r1, 0)
    ldw     r21, (r1, 4)
    ldw     r22, (r1, 8)
    ldw     r23, (r1, 12)

    GET_FRONT_BITS r18 r24                           /* shift direction depends on endianness */
    mov     r19, r20                                 /* keep an unshifted copy for the next step */
    GET_AFTER_BITS r20 r25
    or      r20, r18                                 /* output word 0 */

    GET_FRONT_BITS r19 r24
    mov     r18, r21
    GET_AFTER_BITS r21 r25
    or      r21, r19                                 /* output word 1 */

    GET_FRONT_BITS r18 r24
    mov     r19, r22
    GET_AFTER_BITS r22 r25
    or      r22, r18                                 /* output word 2 */

    GET_FRONT_BITS r19 r24
    mov     r18, r23                                 /* unshifted last word carries into the next pass */
    GET_AFTER_BITS r23 r25
    or      r23, r19                                 /* output word 3 */

    stw     r20, (r3, 0)
    stw     r21, (r3, 4)
    stw     r22, (r3, 8)
    stw     r23, (r3, 12)
    subi    r2, 16
    addi    r1, 16
    addi    r3, 16
    cmplti  r2, 16
    jbf     .L_not_aligned_and_len_larger_16bytes    /* repeat while len >= 16 */

.L_not_aligned_and_len_less_16bytes:
    cmplti  r2, 4
    jbf     .L2                                      /* at least one more word to assemble */
    rsubi   r12, 4                                   /* r12 = 4 - (src & 3) */
    subu    r1, r12                                  /* move r1 back to the first uncopied source byte */
    jbr     .L_copy_by_byte
.L2:
    ldw     r21, (r1, 0)                             /* next aligned source word */
    GET_FRONT_BITS r18 r24
    mov     r19, r18                                 /* r19 = shifted previous word */
    mov     r18, r21                                 /* unshifted copy becomes the new previous word */
    GET_AFTER_BITS r21 r25
    or      r21, r19
    stw     r21, (r3, 0)
    subi    r2, 4
    addi    r1, 4
    addi    r3, 4
    jbr     .L_not_aligned_and_len_less_16bytes

.size   Wmemcpy, .-Wmemcpy

libc_hidden_def(Wmemcpy)
.weak Wmemcpy