summaryrefslogtreecommitdiff
path: root/libc/string/arc/memcmp.S
blob: 20122a2967ebefb65fc530b9a6de9df0900488f1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
/*
 * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
 * Copyright (C) 2007 ARC International (UK) LTD
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
 */

/* int memcmp(const void *s1, const void *s2, size_t n)
 *
 * ARC assembly.  Arguments arrive as r0 = s1, r1 = s2, r2 = n and the
 * ISO C result (<0 / 0 / >0) is returned in r0. */

#include <sysdep.h>
#include <features.h>

/* Register roles in the aligned word-compare loop differ by byte order:
 * WORD2 receives the second word of each 8-byte group loaded from s1,
 * while SHIFT carries length-derived information that is later scaled
 * (<<3) to a bit count and used to ignore bytes past the buffer end. */
#ifdef __LITTLE_ENDIAN__
#define WORD2 r2
#define SHIFT r3
#else /* BIG ENDIAN */
#define WORD2 r3
#define SHIFT r2
#endif

ENTRY(memcmp)

#if defined(__ARC700__) || defined(__ARCHS__)
	/* Word-at-a-time compare driven by a zero-overhead loop.  Each
	 * iteration checks 8 bytes as two 32-bit words: the first pair in
	 * r4/r5, the second in WORD2/r12.  Taken only when both pointers
	 * are word-aligned and n is non-zero. */
	or	r12,r0,r1		; merge the low address bits of both pointers
	asl_s	r12,r12,30		; huge (>= 2^30) iff either pointer unaligned
	sub	r3,r2,1			; r3 = n-1
	brls	r2,r12,.Lbytewise	; unaligned input or n == 0 -> byte loop
	ld	r4,[r0,0]		; first word pair
	ld	r5,[r1,0]
	lsr.f	lp_count,r3,3		; (n-1)/8 loop iterations; also leaves
					; C = bit 2 of n-1 for the tail (bhs below)
#ifdef __HS__
	/* In ARCv2 a branch can't be the last instruction in a zero overhead
	 * loop.
	 * So we move the branch to the start of the loop, duplicate it
	 * after the end, and set up r12 so that the branch isn't taken
	 *  initially.
	 */
	mov_s	r12,WORD2		; prime r12 == WORD2 so the first brne falls through
	lpne	.Loop_end		; loop lp_count times (skipped when zero)
	brne	WORD2,r12,.Lodd		; 2nd-word mismatch from the previous iteration
	ld	WORD2,[r0,4]
#else
	lpne	.Loop_end
	ld_s	WORD2,[r0,4]
#endif
	ld_s	r12,[r1,4]
	brne	r4,r5,.Leven		; mismatch in the 1st word of the pair
	ld.a	r4,[r0,8]		; fetch next pair, advancing both pointers by 8
	ld.a	r5,[r1,8]
#ifdef __HS__
.Loop_end:
	brne	WORD2,r12,.Lodd		; duplicated 2nd-word check for the final pair
#else
	brne	WORD2,r12,.Lodd
.Loop_end:
#endif
	/* All full 8-byte groups matched; 1..8 bytes remain — in r4/r5 and,
	 * when more than 4, in one further word loaded below. */
	asl_s	SHIFT,SHIFT,3		; length info scaled to a bit count
	bhs_s	.Last_cmp		; C clear: at most one word left, already in r4/r5
	brne	r4,r5,.Leven
	ld	r4,[r0,4]		; 5..8 bytes left: fetch the final word
	ld	r5,[r1,4]
#ifdef __LITTLE_ENDIAN__
	nop_s
	; one more load latency cycle
.Last_cmp:
	/* Tail word with up to 4 valid bytes.  Plant a sentinel bit at the
	 * start of the last valid byte so that garbage past the end of the
	 * buffers can never be selected as the "first" difference below
	 * (bset presumably honors SHIFT mod 32 — TODO confirm). */
	xor	r0,r4,r5		; nonzero bits mark differing bytes
	bset	r0,r0,SHIFT		; sentinel at the last valid byte
	sub_s	r1,r0,1
	bic_s	r1,r1,r0		; r1 = ones strictly below the lowest set bit
	norm	r1,r1			; -> left shift that tops the first diff byte
	b.d	.Leven_cmp
	and	r1,r1,24		; round to a byte boundary (delay slot)
.Leven:
	/* Full-word mismatch in r4/r5: locate its first differing byte
	 * (lowest-order byte first on little-endian). */
	xor	r0,r4,r5
	sub_s	r1,r0,1
	bic_s	r1,r1,r0
	norm	r1,r1
	; slow track insn
	and	r1,r1,24
.Leven_cmp:
	asl	r2,r4,r1		; move the deciding byte to the top ...
	asl	r12,r5,r1
	lsr_s	r2,r2,1			; ... then >>1 so the subtraction below
	lsr_s	r12,r12,1		; cannot overflow into the sign bit
	j_s.d	[blink]
	sub	r0,r2,r12		; sign of result = order of first diff byte
	.balign	4
.Lodd:
	/* Mismatch in the second word of a pair (WORD2 vs r12); same
	 * locate-first-byte sequence as .Leven. */
	xor	r0,WORD2,r12
	sub_s	r1,r0,1
	bic_s	r1,r1,r0
	norm	r1,r1
	; slow track insn
	and	r1,r1,24
	asl_s	r2,r2,r1
	asl_s	r12,r12,r1
	lsr_s	r2,r2,1
	lsr_s	r12,r12,1
	j_s.d	[blink]
	sub	r0,r2,r12
#else /* BIG ENDIAN */
.Last_cmp:
	/* Discard the bytes past the end of the buffers; on big-endian
	 * they occupy the low bits of the tail word. */
	neg_s	SHIFT,SHIFT
	lsr	r4,r4,SHIFT
	lsr	r5,r5,SHIFT
	; slow track insn
.Leven:
	/* Big-endian word order equals memory byte order, so an unsigned
	 * word compare already yields the memcmp sign. */
	sub.f	r0,r4,r5
	mov.ne	r0,1			; any difference -> positive 1 ...
	j_s.d	[blink]
	bset.cs	r0,r0,31		; ... made negative when r4 < r5 (borrow)
.Lodd:
	/* Reached only when WORD2 != r12, so the result is never zero. */
	cmp_s	WORD2,r12
	mov_s	r0,1
	j_s.d	[blink]
	bset.cs	r0,r0,31
#endif /* ENDIAN */
	.balign	4
.Lbytewise:
	/* Byte loop for unaligned or very short inputs; unrolled to two
	 * bytes per iteration (r4/r5, then r3/r12), mirroring the word
	 * loop above. */
	breq	r2,0,.Lnil		; n == 0 -> equal
	ldb	r4,[r0,0]
	ldb	r5,[r1,0]
	lsr.f	lp_count,r3		; (n-1)/2 iterations; C = bit 0 of n-1
#ifdef __HS__
	mov	r12,r3			; prime r12 == r3 so the first brne falls through
	lpne	.Lbyte_end
	brne	r3,r12,.Lbyte_odd
#else
	lpne	.Lbyte_end
#endif
	ldb_s	r3,[r0,1]
	ldb	r12,[r1,1]
	brne	r4,r5,.Lbyte_even
	ldb.a	r4,[r0,2]		; advance both pointers by 2 as we load
	ldb.a	r5,[r1,2]
#ifdef __HS__
.Lbyte_end:
	brne	r3,r12,.Lbyte_odd
#else
	brne	r3,r12,.Lbyte_odd
.Lbyte_end:
#endif
	bcc	.Lbyte_even		; C clear (n odd): last byte already in r4/r5
	brne	r4,r5,.Lbyte_even
	ldb_s	r3,[r0,1]		; n even: fetch the final byte pair
	ldb_s	r12,[r1,1]
.Lbyte_odd:
	j_s.d	[blink]
	sub	r0,r3,r12		; ldb zero-extends, so plain sub gives the sign
.Lbyte_even:
	j_s.d	[blink]
	sub	r0,r4,r5
.Lnil:
	j_s.d	[blink]
	mov	r0,0			; equal (delay slot)

#elif (__ARC64_ARCH32__)
	;; Based on Synopsys code from newlib's arc64/memcmp.S
	;;
	;; Strategy: for n > 32, compare 16 bytes (four word pairs) per
	;; iteration; a per-pair bit in r13 records which pair mismatched so
	;; the first differing byte can be located afterwards.  The n % 16
	;; tail (and any n <= 32 input) is compared one byte at a time.
	cmp		r2, 32
	bls.d	@.L_compare_1_bytes
	mov		r3, r0	; "r0" will be used as return value

	lsr		r12, r2, 4	; counter for 16-byte chunks
	xor		r13, r13, r13	; the mask showing inequal registers

.L_compare_16_bytes:
	ld.ab	r4, [r3, +4]
	ld.ab	r5, [r1, +4]
	ld.ab	r6, [r3, +4]
	ld.ab	r7, [r1, +4]
	ld.ab	r8, [r3, +4]
	ld.ab	r9, [r1, +4]
	ld.ab	r10, [r3, +4]
	ld.ab	r11, [r1, +4]
	xor.f	0, r4, r5	; flag-only compares; record each pair's result
	xor.ne	r13, r13, 0b0001
	xor.f	0, r6, r7
	xor.ne	r13, r13, 0b0010
	xor.f	0, r8, r9
	xor.ne	r13, r13, 0b0100
	xor.f	0, r10, r11
	xor.ne	r13, r13, 0b1000
	brne	r13, 0, @.L_unequal_find
	dbnz	r12, @.L_compare_16_bytes

	;; The loop advanced both pointers by exactly 16 bytes per
	;; iteration, so they already address the first byte of the n % 16
	;; tail — no readjustment is needed.  (The previous "sub r1/r3, 4"
	;; rewind here made the tail re-check 4 already-equal bytes and
	;; skip the last 4 bytes of the buffers entirely.)
	bmsk_s	  r2, r2, 3	; any remaining bytes to compare

.L_compare_1_bytes:
	cmp		r2, 0
	jeq.d	[blink]		; nothing (left) to compare -> equal
	xor_s	r0, r0, r0	; delay slot: default result 0

2:
	ldb.ab	r4, [r3, +1]
	ldb.ab	r5, [r1, +1]
	sub.f	r0, r4, r5	; ldb zero-extends, so the sub sign is the answer
	jne		[blink]
	dbnz	r2, @2b
	j_s		[blink]

	;; At this point, we want to find the _first_ comparison that marked the
	;; inequality of "lhs" and "rhs"
.L_unequal_find:
	ffs		r13, r13	; index (0..3) of the first mismatching pair
	asl		r13, r13, 2
	bi		[r13]		; dispatch into the 16-byte table entries below
.L_unequal_r4r5:
	mov		r1, r4
	b.d		@.L_diff_byte_in_regs
	mov		r2, r5
	nop
.L_unequal_r6r7:
	mov		r1, r6
	b.d		@.L_diff_byte_in_regs
	mov		r2, r7
	nop
.L_unequal_r8r9:
	mov		r1, r8
	b.d		@.L_diff_byte_in_regs
	mov		r2, r9
	nop
.L_unequal_r10r11:
	mov		r1, r10
	mov		r2, r11

	;; fall-through
	;; If we're here, that means the two operands are not equal.
.L_diff_byte_in_regs:
	xor		r0, r1, r2	; differing bytes show up as nonzero bits
	ffs		r0, r0		; lowest set bit = first difference in memory
					; (assumes little-endian ARC — TODO confirm)
	and		r0, r0, 0x18	; round down to a byte boundary (0/8/16/24)
	lsr		r1, r1, r0
	lsr		r2, r2, r0
	bmsk_s	r1, r1, 7	; isolate the deciding byte from each word
	bmsk_s	r2, r2, 7
	j_s.d	[blink]
	sub		r0, r1, r2	; sign of the byte difference is the result

#else
#error "Unsupported ARC CPU type"
#endif

END(memcmp)
libc_hidden_def(memcmp)

/* SUSv3-legacy bcmp() only needs a zero/non-zero answer, which memcmp
 * already provides, so a plain alias suffices. */
#ifdef __UCLIBC_SUSV3_LEGACY__
strong_alias(memcmp,bcmp)
#endif