/* memmove implementation for SH4
*
* Copyright (C) 2009 STMicroelectronics Ltd.
*
* Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
*
* Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
*/
#ifndef __SH_FPU_ANY__
#include "../../generic/memmove.c"
#else
#include <string.h>
#define FPSCR_SR (1 << 20)
#define STORE_FPSCR(x) __asm__ __volatile__("sts fpscr, %0" : "=r"(x))
#define LOAD_FPSCR(x) __asm__ __volatile__("lds %0, fpscr" : : "r"(x))
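/* FPSCR bit 20 is the SH-4 SZ (transfer size) bit: while it is set, each
 * fmov moves a pair of single-precision registers, i.e. 64 bits at a time.
 * STORE_FPSCR/LOAD_FPSCR save and set the FPU status/control register so
 * that the copy loop can temporarily switch into that mode. */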
static void fpu_optimised_copy_fwd(void *dest, const void *src, size_t len)
{
    char *d = (char *)dest;
    char *s = (char *)src;

    if (len >= 64) {
        unsigned long fpscr;
        int *s1;
        int *d1;
        /* Align the dest to an 8 byte boundary. */
        while ((unsigned)d & 0x7) {
            *d++ = *s++;
            len--;
        }
        s1 = (int *)s;
        d1 = (int *)d;
        /* Check whether s is aligned well enough to use the FPU. */
        if (!((unsigned)s1 & 0x7)) {
            /* Align the dest to a cache-line boundary. */
            while ((unsigned)d1 & 0x1c) {
                *d1++ = *s1++;
                len -= 4;
            }
            /* Use paired single-precision load/store mode for
             * 64-bit transfers. */
            STORE_FPSCR(fpscr);
            LOAD_FPSCR(FPSCR_SR);
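            /* Each pass loads four double registers (dr0, dr2, dr4, dr6)
             * from the source and stores them to the destination,
             * i.e. 32 bytes per iteration. */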
            while (len >= 32) {
                __asm__ __volatile__ ("fmov @%0+,dr0" : "+r" (s1));
                __asm__ __volatile__ ("fmov @%0+,dr2" : "+r" (s1));
                __asm__ __volatile__ ("fmov @%0+,dr4" : "+r" (s1));
                __asm__ __volatile__ ("fmov @%0+,dr6" : "+r" (s1));
                __asm__ __volatile__ ("fmov dr0,@%0" : : "r" (d1) : "memory");
                d1 += 2;
                __asm__ __volatile__ ("fmov dr2,@%0" : : "r" (d1) : "memory");
                d1 += 2;
                __asm__ __volatile__ ("fmov dr4,@%0" : : "r" (d1) : "memory");
                d1 += 2;
                __asm__ __volatile__ ("fmov dr6,@%0" : : "r" (d1) : "memory");
                d1 += 2;
                len -= 32;
            }
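            /* Restore the caller's FPSCR, leaving the original transfer
             * size mode in effect again. */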
            LOAD_FPSCR(fpscr);
        }
        s = (char *)s1;
        d = (char *)d1;
        /* TODO: other alignment subcases could be handled here. */
    }
    /* Copy the remaining tail (or the whole buffer when len < 64)
     * one byte at a time. */
    while (len > 0) {
        *d++ = *s++;
        len--;
    }
    return;
}
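
/* memmove() must behave as if the source were first copied to a temporary
 * buffer, i.e. the regions may overlap.  The distance between dest and src
 * decides which copy direction is safe. */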
void *memmove(void *dest, const void *src, size_t len)
{
    unsigned long int d = (long int)dest;
    unsigned long int s = (long int)src;
    unsigned long int res;

    if (d >= s)
        res = d - s;
    else
        res = s - d;
    /*
     * 1) dest and src do not overlap      ==> memcpy (BWD/FWD)
     * 2) dest and src overlap completely  ==> memcpy (BWD/FWD)
     * 3) left-to-right overlap            ==> copy from the beginning to the end
     * 4) right-to-left overlap            ==> copy from the end to the beginning
     */
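    /* Example: dest = buf, src = buf + 4, len = 16 overlaps left-to-right;
     * a forward copy reads every source byte before it is overwritten, so
     * fpu_optimised_copy_fwd() is safe.  With dest = buf + 4, src = buf the
     * copy must run backwards, which is what the SH4 memcpy() does. */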
    if (res == 0) /* 100% overlap */
        memcpy(dest, src, len);
    else if (res >= len) /* no overlap */
        memcpy(dest, src, len);
    else {
        if (d > s) /* right-to-left overlap */
            memcpy(dest, src, len); /* the SH4 memcpy copies backwards */
        else /* left-to-right overlap: the SH4 memcpy cannot be used here */
            fpu_optimised_copy_fwd(dest, src, len);
    }
    return dest;
}
libc_hidden_def(memmove)
#endif /*__SH_FPU_ANY__ */