summaryrefslogtreecommitdiff
path: root/libc/string/avr32/memcpy.S
blob: f95aabd13e35586766853ac4e1eadee81f21c58b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/*
 * Copyright (C) 2004-2007 Atmel Corporation
 *
 * This file is subject to the terms and conditions of the GNU Lesser General
 * Public License.  See the file "COPYING.LIB" in the main directory of this
 * archive for more details.
 */

/* Don't use r12 as dst since we must return it unmodified */
#define dst r9
#define src r11
#define len r10

       .text
       .global memcpy
       .type   memcpy, @function
memcpy:
       pref    src[0]
       mov     dst, r12

       /* If we have less than 32 bytes, don't do anything fancy */
       cp.w    len, 32
       brge    .Lmore_than_31

       sub     len, 1
       retlt   r12
1:     ld.ub   r8, src++
       st.b    dst++, r8
       sub     len, 1
       brge    1b
       retal   r12

.Lmore_than_31:
       pushm   r0-r7, lr

       /* Check alignment */
       mov     r8, src
       andl    r8, 31, COH
       brne    .Lunaligned_src
       mov     r8, dst
       andl    r8, 3, COH
       brne    .Lunaligned_dst

.Laligned_copy:
       sub     len, 32
       brlt    .Lless_than_32

1:     /* Copy 32 bytes at a time */
       ldm     src, r0-r7
       sub     src, -32
       stm     dst, r0-r7
       sub     dst, -32
       sub     len, 32
       brge    1b

.Lless_than_32:
       /* Copy 16 more bytes if possible */
       sub     len, -16
       brlt    .Lless_than_16
       ldm     src, r0-r3
       sub     src, -16
       sub     len, 16
       stm     dst, r0-r3
       sub     dst, -16

.Lless_than_16:
       /* Do the remaining as byte copies */
       neg     len
       add     pc, pc, len << 2
       .rept   15
       ld.ub   r0, src++
       st.b    dst++, r0
       .endr

       popm    r0-r7, pc

.Lunaligned_src:
       /* Make src cacheline-aligned. r8 = (src & 31) */
       rsub    r8, r8, 32
       sub     len, r8
1:     ld.ub   r0, src++
       st.b    dst++, r0
       sub     r8, 1
       brne    1b

       /* If dst is word-aligned, we're ready to go */
       pref    src[0]
       mov     r8, 3
       tst     dst, r8
       breq    .Laligned_copy

.Lunaligned_dst:
       /* src is aligned, but dst is not. Expect bad performance */
       sub     len, 4
       brlt    2f
1:     ld.w    r0, src++
       st.w    dst++, r0
       sub     len, 4
       brge    1b

2:     neg     len
       add     pc, pc, len << 2
       .rept   3
       ld.ub   r0, src++
       st.b    dst++, r0
       .endr

       popm    r0-r7, pc
       .size   memcpy, . - memcpy

libc_hidden_def(memcpy)