/* strcmp.S
 * Copyright (C) 2003-2007 Analog Devices Inc., All Rights Reserved.
 *
 * This file is subject to the terms and conditions of the GNU Library General
 * Public License. See the file "COPYING.LIB" in the main directory of this
 * archive for more details.
 *
 * Non-LGPL License also available as part of VisualDSP++
 * http://www.analog.com/processors/resources/crosscore/visualDspDevSoftware.html
 */

#include <sysdep.h>

/* Fast strcmp() for Blackfin.
 * When both strings are aligned, this processes four characters at
 * a time. Uses a hw loop with "very big" count to loop "forever",
 * until difference or a terminating zero is found.
 * Once the end-case word has been identified, breaks out of the
 * loop to check more carefully (same as the unaligned case).
 */

.text

.align 2

/*
 * _strcmp -- compare two zero-terminated byte strings.
 * Presumed C prototype: int strcmp(const char *s1, const char *s2)
 *
 * In:     r0 = address of first string, r1 = address of second string
 * Out:    r0 = (byte of s1) - (byte of s2), both zero-extended, taken at
 *         the first index where the bytes differ or either byte is zero.
 * Stack:  r4-r7 are pushed on entry and popped before rts.
 * Writes: p0, p1, p2, r1, r2, r3, r6, r7, cc; lc0 via LSETUP.
 *
 * Structure:
 *   1. If both addresses have their low two bits clear, compare one
 *      32-bit word per iteration until the words differ or the word of
 *      the first string contains a zero byte.
 *   2. Rewind both pointers to the start of that word and fall into the
 *      byte loop, which is also the entry point for unaligned input.
 *   3. The byte loop locates the exact index and produces the result.
 *
 * NOTE(review): both loops load the next element before the test on the
 * current one has finished, so the word loop reads one word beyond the
 * word that ended the comparison and the byte loop reads one byte beyond
 * the deciding byte of the first string.  Confirm this is acceptable for
 * strings that end close to the edge of accessible memory.
 *
 * NOTE(review): the comments below assume that in a multi-issue line
 * (a || b || c) the ALU operation sees the register values from before
 * the parallel load, and that the instruction at the LSETUP end label is
 * the last instruction of the loop body.  Confirm against the Blackfin
 * Programming Reference.
 */
.weak _strcmp
ENTRY(_strcmp)
	[--sp] = (R7:4);	// save r4-r7 (r6 and r7 are written below)
	p1 = r0;		// p1 walks the first string
	p2 = r1;		// p2 walks the second string

	p0 = -1;	// (need for loop counter init)
			// used as the "very big" trip count of both hw loops

	  // check if both pointers are word (4-byte) aligned
	r0 = r0 | r1;	// check both pointers at same time
	r0 <<= 30;	// dump all but last 2 bits
	cc = az;	// are they zero?
	if !cc jump .Lunaligned;	// no; use unaligned code.
			// fall-thru for aligned case..

	  // note that r0 is zero from the previous...
	  //           p0 set to -1
	  // r0 == 0 is relied on by the two "+|+ r0" zero tests in the loop

	LSETUP (.Lbeginloop, .Lendloop) lc0=p0;
	  // pick up first words
	  // (the instructions up to .Lbeginloop form the loop preamble)
	r1 = [p1++];
	r2 = [p2++];
	  // make up mask:  0x00FF00FF (low byte of each 16-bit half)
	r7 = 0xFF;
	r7.h = 0xFF;
		// loop : 9 cycles to check 4 characters
	cc = r1 == r2;	// compare of the first word pair; later pairs are
			// compared by the instruction at .Lendloop
.Lbeginloop:
	if !cc jump .Lnotequal4;	// compare failure, exit loop

	  // starting with   44332211
	  // see if char 3 or char 1 is 0
	r3 = r1 & r7;		// form 00330011
	  // add to zero, and (r2 is free, reload)
	  // the add only serves to set AZ from the two 16-bit halves
	r6 = r3 +|+ r0 || r2 = [p2++] || nop;
	cc = az;	// true if either is zero
	r3 = r1 ^ r3;	        // form 44002200 (4321^0301 => 4020)
				// (trick, saves having another mask)
	// add to zero,  and  (r1 is free, reload)
	r6 = r3 +|+ r0 || r1 = [p1++] || nop;
	cc |= az;	// true if either is zero
			// cc now covers all four bytes of the old r1
	if cc jump .Lzero4;	// leave if a zero somewhere
.Lendloop:
	cc = r1 == r2;	// compare the freshly loaded word pair; tested at
			// the top of the next iteration

 // loop exits
.Lnotequal4:		// compare failure on 4-char compare
			// address pointers are one word ahead;
			// faster to use zero4 exit code
			// (+4 here, then -8 below, gives a net rewind of
			//  one word, back to the mismatching word)
	p1 += 4;
	p2 += 4;

.Lzero4:			// one of the bytes in word 1 is zero
			// but the next word has already been fetched; so
			// backup two to look at failing word again
	p1 += -8;
	p2 += -8;



		// here when pointers are unaligned: checks one
		// character at a time.  Also use at the end of
		// the word-check algorithm to figure out what happened
.Lunaligned:
	  //	R0 is non-zero when arriving from the alignment test and
	  //	zero when falling through from the word loop; it is
	  //	cleared here either way.
	  //           p0 set to -1

	r0 = 0 (Z);		// r0 = 0, used by "r3 = r0 - r2" below
	r1 = B[p1++] (Z);	// first byte of each string, zero-extended
	r2 = B[p2++] (Z);
	LSETUP (.Lbeginloop1, .Lendloop1) lc0=p0;

	  // Each iteration examines byte N of both strings and continues
	  // only while s1[N] != 0, s1[N] == s2[N] and s2[N] != 0.
.Lbeginloop1:
	cc = r1;	// first char must be non-zero
	// chars must be the same
	// (r1 is reloaded with the next byte of string 1 in the same line)
	r3 = r2 - r1 (NS) || r1 = B[p1++] (Z) || nop;
	cc &= az;
	r3 = r0 - r2;	// second char must be non-zero
			// (0 - r2 is negative exactly when r2 is 1..255)
	cc &= an;
	if !cc jump .Lexitloop1;
.Lendloop1:
	r2 = B[p2++] (Z);	// next byte of string 2 for the next iteration

.Lexitloop1: // here means we found a zero or a difference.
	   // r2 still holds byte N of string 2, but r1 has already been
	   // reloaded with byte N+1 of string 1 and p1 points at N+2,
	   // so byte N of string 1 is fetched again from p1 - 2.
	r1=B[p1+ -2] (Z);
	r0 = r1 - r2;		// return value: s1[N] - s2[N]
	(r7:4) = [sp++];	// restore r4-r7
	rts;
.size _strcmp,.-_strcmp

/* Symbol bookkeeping for strcmp.  libc_hidden_def and weak_alias are not
 * defined in this file; presumably they are macros supplied through
 * <sysdep.h> -- confirm. */
libc_hidden_def (strcmp)

/* When __UCLIBC_HAS_LOCALE__ is not defined, strcoll is declared through
 * weak_alias as an alias of strcmp and given its own hidden definition. */
#ifndef __UCLIBC_HAS_LOCALE__
weak_alias (strcmp,strcoll)
libc_hidden_def (strcoll)
#endif