path: root/libc/string/sh/sh4/memset.S
/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
 *
 * "memset" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 *
 * Copyright (c) 2009  STMicroelectronics Ltd
 *   Optimised using 64bit data transfer via FPU
 *   Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 *
 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
 */

/*
 *            void *memset(void *s, int c, size_t n);
 */
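
/*
 * Overview of the code below: r4 is advanced to s + n and the buffer is
 * filled backwards with pre-decrement stores.  Fills shorter than 12 bytes
 * are done one byte at a time.  Longer fills align the end pointer to 4
 * bytes, replicate the fill byte across a 32-bit word, and, when at least
 * 64 bytes remain, align to a 32-byte cache line and store 32 bytes per
 * loop iteration (paired single-precision FPU stores on little-endian SH4,
 * eight mov.l stores otherwise).  The tail is finished with 8-byte and
 * single-byte loops, and the original pointer s is returned in r0.
 */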

#include <sysdep.h>

#ifdef __LITTLE_ENDIAN__
#define MEMSET_USES_FPU
/* Use paired single-precision load/store mode for 64-bit transfers.
 * FPSCR.SZ=1, FPSCR.PR=0 is well defined on both SH4-200 and SH4-300.
 * Currently this has only been implemented and tested for little-endian mode. */
.macro FPU_SET_PAIRED_PREC
	sts	fpscr, r3	! save the caller's FPSCR in r3
	mov	#0x10, r0	! PR=0 SZ=1
	shll16  r0		! 0x10 << 16: FPSCR.SZ is bit 20, FPSCR.PR is bit 19
	lds	r0, fpscr
.endm
.macro RESTORE_FPSCR
	lds	r3, fpscr	! restore the FPSCR saved by FPU_SET_PAIRED_PREC
.endm
#endif
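/* Big-endian builds leave MEMSET_USES_FPU undefined and fall back to the
 * integer mov.l loop below. */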

ENTRY(memset)
	tst	r6,r6
	bt/s	5f		! if n=0, do nothing
	 add	r6,r4
	mov	#12,r0
	cmp/gt	r6,r0
	bt/s	4f		! if it's too small, set one byte at a time
	 mov	r4,r0
	and	#3,r0
	cmp/eq	#0,r0
	bt/s	2f		! It's aligned
	 sub	r0,r6
1:
	dt	r0
	bf/s	1b
	 mov.b	r5,@-r4
2:				! make VVVV
	extu.b	r5,r5
	swap.b	r5,r0		!   V0
	or	r0,r5		!   VV
	swap.w	r5,r0		! VV00
	or	r0,r5		! VVVV
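	! r5 now holds the fill byte replicated into all four byte lanes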

	! Is the remaining length large enough for the 32-byte block loop?
	mov	#0x40, r0	! (MT)
	cmp/gt	r6,r0		! (MT)  64 > len => slow loop

	bt/s	22f
	 mov	r6,r0

	! align the dst to the cache block size if necessary
	mov	r4, r3
	mov	#~(0x1f), r1	! 32-byte cache-line mask

	and	r3, r1		! r1 = dst rounded down to a 32-byte boundary
	cmp/eq	r3, r1

	bt/s	11f		! dst is already aligned
	 sub	r1, r3		! r3 = dst & 0x1f, bytes above the boundary
	shlr2	r3		! number of 4-byte stores needed

10:	mov.l	r5,@-r4
	dt	r3
	bf/s	10b
	 add	#-4, r6

11:	! dst is now 32-byte aligned
	mov	r6,r2
	mov	#-5,r0
	shld	r0,r2		! r2 = r6 >> 5, the number of 32-byte blocks

#ifdef MEMSET_USES_FPU
	lds	r5, fpul	! (CO)
	fsts	fpul, fr0	! Dr0 will be 'VVVVVVVV'
	fsts	fpul, fr1

	FPU_SET_PAIRED_PREC
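	! With FPSCR.SZ=1, each fmov dr0, @-r4 stores the fr0/fr1 pair
	! (8 bytes), so four stores fill one 32-byte cache line per pass.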
12:
	add	#-0x20, r6	!(MT)
	fmov	dr0, @-r4
	fmov	dr0, @-r4
	fmov	dr0, @-r4
	dt	r2
	bf/s	12b		!(BR)
	 fmov	dr0, @-r4

	RESTORE_FPSCR
#else
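	! Integer fallback: eight 4-byte stores fill 32 bytes per pass.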
12:
	mov.l	r5,@-r4
	mov.l	r5,@-r4
	mov.l	r5,@-r4
	mov.l	r5,@-r4
	mov.l	r5,@-r4
	mov.l	r5,@-r4
	add	#-0x20, r6
	mov.l	r5,@-r4
	dt	r2
	bf/s	12b
	 mov.l	r5,@-r4
#endif
	tst	r6,r6
	bt/s	5f		! nothing left, return
	 mov	#8, r0

	cmp/ge	r0, r6		! at least 8 bytes left?
	bf/s	4f		! no: finish with the byte loop
	 mov	r6,r0
22:
	shlr2	r0
	shlr	r0		! r0 = r6 >> 3
3:
	dt	r0
	mov.l	r5,@-r4		! two longwords per pass: 8 bytes at once
	bf/s	3b
	 mov.l	r5,@-r4
	!
	mov	#7,r0
	and	r0,r6		! r6 = bytes not covered by the 8-byte loop
	tst	r6,r6
	bt	5f
	! fill bytes
4:
	dt	r6
	bf/s	4b
	 mov.b	r5,@-r4
5:
	rts
	 mov	r4,r0		! r4 is back at the start of the buffer; return it
END(memset)
libc_hidden_def (memset)