summaryrefslogtreecommitdiff
path: root/libc/string/csky/cskyv1/memcpy.S
blob: dfa7f64a475e4e7a92a681b1835f4bf49fc6b9f1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
/* GET_FRONT_BITS rx ry
 * Shift register rx in place by the count held in register ry.
 * Builds with __cskyLE__ defined get a logical right shift (lsr);
 * all other builds get a logical left shift (lsl).
 */
.macro      GET_FRONT_BITS rx ry
#ifndef     __cskyLE__
    lsl     \rx, \ry
#else
    lsr     \rx, \ry
#endif
.endm

/* GET_AFTER_BITS rx ry
 * Shift register rx in place by the count held in register ry.
 * Builds with __cskyLE__ defined get a logical left shift (lsl);
 * all other builds get a logical right shift (lsr).
 */
.macro      GET_AFTER_BITS rx ry
#ifndef     __cskyLE__
    lsr     \rx, \ry
#else
    lsl     \rx, \ry
#endif
.endm


/* Pick the symbol name the routine below is assembled under:
   memcpy by default, wmemcpy when WANT_WIDE is defined. */
#ifndef WANT_WIDE
# define Wmemcpy memcpy
#else
# define Wmemcpy wmemcpy
#endif

/* void *memcpy(void *dest, const void *src, size_t n);
 *
 * Register roles, as used by the code below:
 *   r2       dest on entry; never written here, so it leaves unchanged
 *   r3       src cursor
 *   r4       number of bytes still to copy
 *   r7       dest cursor (working copy of r2)
 *   r6       low two bits of the pointer being tested (misalignment, 0-3)
 *   r1, r5   scratch
 *   r8-r13   extra scratch; each one is spilled to the stack before it is
 *            used and reloaded before the path that used it is left
 *
 * Paths:
 *   - fewer than 4 bytes: byte loop only;
 *   - dest not word aligned: copy single bytes until it is, then re-test src;
 *   - dest and src both word aligned: 16-byte blocks, then words, then bytes;
 *   - dest aligned, src not: load aligned src words and splice neighbouring
 *     words together with shifts, 16 bytes and then 4 bytes at a time, and
 *     finish with the byte loop.
 *   Both unaligned paths first compare |dest - src| with the remaining length
 *   and fall back to the plain forward byte loop when the distance is smaller.
 *
 * NOTE(review): cmplti/cmplt look like signed compares; confirm the intended
 * behaviour for lengths of 0x80000000 and above.
 */

	.text
	.align 2
	.global Wmemcpy
	.type   Wmemcpy, @function
Wmemcpy:
    /* r7 becomes the dest cursor so that r2 itself is left untouched. */
    mov     r7, r2
    cmplti  r4, 4                                   /* Fewer than 4 bytes: byte loop only */
    jbt     .L_copy_by_byte

    /* r6 = dest & 3 */
    mov     r6, r2
    andi    r6, 3
    cmpnei  r6, 0
    jbt     .L_dest_not_aligned                     /* If dest is not 4 bytes aligned */
.L0:
    /* The dest cursor r7 is word aligned whenever this point is reached.
       r6 = src & 3 */
    mov     r6, r3
    andi    r6, 3
    cmpnei  r6, 0
    jbt     .L_dest_aligned_but_src_not_aligned     /* If dest is aligned, but src is not aligned */

    cmplti  r4, 16                                  /* dest and src are both aligned */
    jbt     .L_aligned_and_len_less_16bytes         /* If len is less than 16 bytes */

    /* Spill r8 and r9: the 16-byte loop uses them as extra data registers. */
    subi    sp, 8
    stw     r8, (sp, 0)
    stw     r9, (sp, 4)
.L_aligned_and_len_larger_16bytes:                  /* src and dest are both aligned, and len >= 16 bytes */
    /* Load four words from src, then store them to dest. */
    ldw     r1, (r3, 0)
    ldw     r5, (r3, 4)
    ldw     r8, (r3, 8)
    ldw     r9, (r3, 12)
    stw     r1, (r7, 0)
    stw     r5, (r7, 4)
    stw     r8, (r7, 8)
    stw     r9, (r7, 12)
    subi    r4, 16
    addi    r3, 16
    addi    r7, 16
    cmplti  r4, 16
    jbf     .L_aligned_and_len_larger_16bytes
    /* Fewer than 16 bytes remain: reload r8/r9 and release the spill area. */
    ldw     r8, (sp, 0)
    ldw     r9, (sp, 4)
    addi    sp, 8

.L_aligned_and_len_less_16bytes:
    /* Word-at-a-time loop; leaves for the byte loop once fewer than
       4 bytes remain. */
    cmplti  r4, 4
    jbt     .L_copy_by_byte
    ldw     r1, (r3, 0)
    stw     r1, (r7, 0)
    subi    r4, 4
    addi    r3, 4
    addi    r7, 4
    jbr     .L_aligned_and_len_less_16bytes

.L_copy_by_byte:                                    /* forward byte loop: short lengths, tails, and the overlap fallback */
    cmpnei  r4, 0
    jbf     .L_return
    ldb     r1, (r3, 0)
    stb     r1, (r7, 0)
    subi    r4, 1
    addi    r3, 1
    addi    r7, 1
    jbr     .L_copy_by_byte

.L_return:
    rts

/* If dest is not aligned, we copy some bytes to make dest aligned.
   Then we check whether src is aligned. */

.L_dest_not_aligned:
    /* r5 = distance between the dest cursor (r7) and the src cursor (r3),
       made non-negative by abs.  If it is smaller than the length, copy
       everything with the forward byte loop instead. */
    mov     r5, r3                                  /* consider the overlapped case */
    rsub    r5, r5, r7
    abs     r5, r5
    cmplt   r5, r4
    jbt     .L_copy_by_byte

.L1:
    /* r6 starts at dest & 3 (non-zero here) and counts up to 4, so this
       loop copies 4 - (dest & 3) bytes. */
    ldb     r1, (r3, 0)                             /* copy single bytes until dest is word aligned */
    stb     r1, (r7, 0)
    addi    r6, 1
    subi    r4, 1
    addi    r3, 1
    addi    r7, 1
    cmpnei  r6, 4
    jbt     .L1
    cmplti  r4, 4
    jbt     .L_copy_by_byte
    jbf     .L0                                     /* jbt above was not taken, so this branches: go test src alignment */

.L_dest_aligned_but_src_not_aligned:
    /* Same distance check as in .L_dest_not_aligned: fall back to the
       forward byte loop when |r7 - r3| is smaller than the length. */
    mov     r5, r3                                  /* consider the overlapped case */
    rsub    r5, r5, r7
    abs     r5, r5
    cmplt   r5, r4
    jbt     .L_copy_by_byte

    /* Clear the two low bits of r3 (round down to a word boundary), load
       that word into r1 as the "earlier" word, and step r3 to the next
       aligned word. */
    bclri   r3, 0
    bclri   r3, 1
    ldw     r1, (r3, 0)
    addi    r3, 4

    /* 16-byte frame: r11, r12, r13 are spilled now; the slot at (sp, 12)
       receives r8 only if the 16-byte loop is entered. */
    subi    sp, 16
    stw     r11, (sp,0)
    stw     r12, (sp,4)
    stw     r13, (sp,8)
    /* r12 = 8 * r6 and r13 = 32 - r12: the two shift counts, in bits. */
    movi    r5, 8
    mult    r5, r6                                  /* r6 holds the src misalignment in bytes; r5 = 8 * r6 */
    mov     r12, r5
    rsubi   r5, 31
    addi    r5, 1
    mov     r13, r5

    cmplti  r4, 16
    jbt     .L_not_aligned_and_len_less_16bytes

    /* Spill r8 into the frame, then push a further 8 bytes for r9/r10. */
    stw     r8, (sp, 12)
    subi    sp, 8
    stw     r9, (sp, 0)
    stw     r10, (sp, 4)
.L_not_aligned_and_len_larger_16bytes:
    /* On entry r1 holds the aligned src word that precedes (r3, 0). */
    ldw     r5, (r3, 0)
    ldw     r11, (r3, 4)
    ldw     r8, (r3, 8)
    ldw     r9, (r3, 12)

    /* Each group below builds one output word: GET_FRONT_BITS shifts the
       earlier word by r12, GET_AFTER_BITS shifts the following word by r13,
       and "or" merges the two.  Before it is shifted, the following word is
       copied to r10 or r1 so it can act as the earlier word of the next
       group; after the last group r1 holds the word loaded from (r3, 12),
       ready for the next pass. */
    GET_FRONT_BITS r1 r12                          /* shift direction depends on endianness (see the macros) */
    mov     r10, r5
    GET_AFTER_BITS r5 r13
    or      r5, r1

    GET_FRONT_BITS r10 r12
    mov     r1, r11
    GET_AFTER_BITS r11 r13
    or      r11, r10

    GET_FRONT_BITS r1 r12
    mov     r10, r8
    GET_AFTER_BITS r8 r13
    or      r8, r1

    GET_FRONT_BITS r10 r12
    mov     r1, r9
    GET_AFTER_BITS r9 r13
    or      r9, r10

    stw     r5, (r7, 0)
    stw     r11, (r7, 4)
    stw     r8, (r7, 8)
    stw     r9, (r7, 12)
    subi    r4, 16
    addi    r3, 16
    addi    r7, 16
    cmplti  r4, 16
    jbf     .L_not_aligned_and_len_larger_16bytes
    /* Reload r9/r10, drop their 8 bytes, then reload r8 from the frame. */
    ldw     r9, (sp, 0)
    ldw     r10, (sp, 4)
    addi    sp, 8
    ldw     r8, (sp,12)

.L_not_aligned_and_len_less_16bytes:
    /* At least 4 bytes left: splice one more word at .L2.  Otherwise undo
       the word rounding of r3, release the frame and finish byte by byte. */
    cmplti  r4, 4
    jbf     .L2
    rsubi   r6, 4                                   /* r6 = 4 - src misalignment in bytes */
    subu    r3, r6                                 /* rewind r3 to the first src byte not yet copied */
    ldw     r11, (sp, 0)
    ldw     r12, (sp, 4)
    ldw     r13, (sp, 8)
    addi    sp, 16
    jbr     .L_copy_by_byte
.L2:
    /* One spliced word: r11 = earlier word shifted by r12, r5 = newly
       loaded word shifted by r13; r1 keeps the unshifted new word for the
       next pass. */
    ldw     r5, (r3, 0)
    GET_FRONT_BITS r1 r12
    mov     r11, r1
    mov     r1, r5
    GET_AFTER_BITS r5 r13
    or      r5, r11
    stw     r5, (r7, 0)
    subi    r4, 4
    addi    r3, 4
    addi    r7, 4
    jbr     .L_not_aligned_and_len_less_16bytes

.size   Wmemcpy, .-Wmemcpy

/* libc_hidden_def is not defined in this file; presumably it comes from the
   libc build's preprocessor setup (TODO confirm). */
libc_hidden_def(Wmemcpy)
.weak Wmemcpy