diff options
| author | Mike Frysinger <vapier@gentoo.org> | 2005-08-25 23:50:55 +0000 | 
|---|---|---|
| committer | Mike Frysinger <vapier@gentoo.org> | 2005-08-25 23:50:55 +0000 | 
| commit | 46b06e9c1c7d57ae0008b3a282110d8ce4345b5f (patch) | |
| tree | 8bb70d81ee67ecd175ad14778b2513e4496e3b9e /libc/sysdeps/linux | |
| parent | 4147ffbcc6e2c20ab9bc1b43330244abe5a10b63 (diff) | |
import different optimized versions of div funcs based upon target sparc arch
Diffstat (limited to 'libc/sysdeps/linux')
20 files changed, 1763 insertions, 1588 deletions
| diff --git a/libc/sysdeps/linux/sparc/rem.S b/libc/sysdeps/linux/sparc/rem.S index 3da857e99..595e8ea5e 100644 --- a/libc/sysdeps/linux/sparc/rem.S +++ b/libc/sysdeps/linux/sparc/rem.S @@ -1,367 +1,9 @@ -   /* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - *  .rem	name of function to generate - *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1 - *  true		true=true => signed; true=false => unsigned - * - * Algorithm parameters: - *  N		how many bits per iteration we try to get (4) - *  WORDSIZE	total number of bits (32) - * - * Derived constants: - *  TOPBITS	number of bits in the top decade of a number - * - * Important variables: - *  Q		the partial quotient under development (initially 0) - *  R		the remainder so far, initially the dividend - *  ITER	number of main division loop iterations required; - *		equal to ceil(log2(quotient) / N).  Note that this - *		is the log base (2^N) of the quotient. - *  V		the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - *  Current estimate for non-large dividend is - *	ceil(log2(quotient) / N) * (10 + 7N/2) + C - *  A large dividend is one greater than 2^(31-TOPBITS) and takes a - *  different path, as the upper bits of the quotient must be developed - *  one bit at a time. - */ - - - -#include <sys/syscall.h> - - -.global   .rem; -.align 4; -.type  .rem ,@function;  - -.rem:  -	! compute sign of result; if neither is negative, no problem -	orcc	%o1, %o0, %g0	! either negative? -	bge	2f			! no, go do the divide -	mov	%o0, %g3		! sign of remainder matches %o0 -	tst	%o1 -	bge	1f -	tst	%o0 -	! %o1 is definitely negative; %o0 might also be negative -	bge	2f			! if %o0 not negative... -	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg -1:	! 
%o0 is negative, %o1 is nonnegative -	sub	%g0, %o0, %o0	! make %o0 nonnegative -2: - -	! Ready to divide.  Compute size of quotient; scale comparand. -	orcc	%o1, %g0, %o5 -	bne	1f -	mov	%o0, %o3 - -		! Divide by zero trap.  If it returns, return 0 (about as -		! wrong as possible, but that is what SunOS does...). -		ta	0x02 -		retl -		clr	%o0 - -1: -	cmp	%o3, %o5			! if %o1 exceeds %o0, done -	blu	.Lgot_result		! (and algorithm fails otherwise) -	clr	%o2 -	sethi	%hi(1 << (32 - 4 - 1)), %g1 -	cmp	%o3, %g1 -	blu	.Lnot_really_big -	clr	%o4 - -	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, -	! as our usual N-at-a-shot divide step will cause overflow and havoc. -	! The number of bits in the result here is N*ITER+SC, where SC <= N. -	! Compute ITER in an unorthodox manner: know we need to shift V into -	! the top decade: so do not even bother to compare to R. -	1: -		cmp	%o5, %g1 -		bgeu	3f -		mov	1, %g2 -		sll	%o5, 4, %o5 -		b	1b -		add	%o4, 1, %o4 - -	! Now compute %g2. -	2:	addcc	%o5, %o5, %o5 -		bcc	.Lnot_too_big -		add	%g2, 1, %g2 - -		! We get here if the %o1 overflowed while shifting. -		! This means that %o3 has the high-order bit set. -		! Restore %o5 and subtract from %o3. -		sll	%g1, 4, %g1	! high order bit -		srl	%o5, 1, %o5		! rest of %o5 -		add	%o5, %g1, %o5 -		b	.Ldo_single_div -		sub	%g2, 1, %g2 - -	.Lnot_too_big: -	3:	cmp	%o5, %o3 -		blu	2b -		nop -		be	.Ldo_single_div -		nop -	/* NB: these are commented out in the V8-Sparc manual as well */ -	/* (I do not understand this) */ -	! %o5 > %o3: went too far: back up 1 step -	!	srl	%o5, 1, %o5 -	!	dec	%g2 -	! do single-bit divide steps -	! -	! We have to be careful here.  We know that %o3 >= %o5, so we can do the -	! first divide step without thinking.  BUT, the others are conditional, -	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- -	! order bit set in the first step, just falling into the regular -	! division loop will mess up the first time around. -	! 
So we unroll slightly... -	.Ldo_single_div: -		subcc	%g2, 1, %g2 -		bl	.Lend_regular_divide -		nop -		sub	%o3, %o5, %o3 -		mov	1, %o2 -		b	.Lend_single_divloop -		nop -	.Lsingle_divloop: -		sll	%o2, 1, %o2 -		bl	1f -		srl	%o5, 1, %o5 -		! %o3 >= 0 -		sub	%o3, %o5, %o3 -		b	2f -		add	%o2, 1, %o2 -	1:	! %o3 < 0 -		add	%o3, %o5, %o3 -		sub	%o2, 1, %o2 -	2: -	.Lend_single_divloop: -		subcc	%g2, 1, %g2 -		bge	.Lsingle_divloop -		tst	%o3 -		b,a	.Lend_regular_divide - -.Lnot_really_big: -1: -	sll	%o5, 4, %o5 -	cmp	%o5, %o3 -	bleu	1b -	addcc	%o4, 1, %o4 -	be	.Lgot_result -	sub	%o4, 1, %o4 - -	tst	%o3	! set up for initial iteration -.Ldivloop: -	sll	%o2, 4, %o2 -		! depth 1, accumulated bits 0 -	bl	.L1.16 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 2, accumulated bits 1 -	bl	.L2.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 3 -	bl	.L3.19 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 7 -	bl	.L4.23 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (7*2+1), %o2 -	 -.L4.23: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (7*2-1), %o2 -	 -	 -.L3.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 5 -	bl	.L4.21 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (5*2+1), %o2 -	 -.L4.21: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (5*2-1), %o2 -	 -	 -	 -.L2.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 1 -	bl	.L3.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 3 -	bl	.L4.19 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (3*2+1), %o2 -	 -.L4.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (3*2-1), %o2 -	 -	 -.L3.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! 
depth 4, accumulated bits 1 -	bl	.L4.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (1*2+1), %o2 -	 -.L4.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (1*2-1), %o2 -	 -	 -	 -	 -.L1.16: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 2, accumulated bits -1 -	bl	.L2.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -1 -	bl	.L3.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -1 -	bl	.L4.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-1*2+1), %o2 -	 -.L4.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-1*2-1), %o2 -	 -	 -.L3.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -3 -	bl	.L4.13 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-3*2+1), %o2 -	 -.L4.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-3*2-1), %o2 -	 -	 -	 -.L2.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -3 -	bl	.L3.13 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -5 -	bl	.L4.11 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-5*2+1), %o2 -	 -.L4.11: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-5*2-1), %o2 -	 -	 -.L3.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -7 -	bl	.L4.9 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-7*2+1), %o2 -	 -.L4.9: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-7*2-1), %o2 -	 -	 -	 -	 -	9: -.Lend_regular_divide: -	subcc	%o4, 1, %o4 -	bge	.Ldivloop -	tst	%o3 -	bl,a	.Lgot_result -	! non-restoring fixup here (one instruction only!) -	add	%o3, %o1, %o3 - - -.Lgot_result: -	! 
check to see if answer should be < 0 -	tst	%g3 -	bl,a	1f -	sub %g0, %o3, %o3 -1: -	retl -	mov %o3, %o0 - -.size .rem,.-.rem; +#include "_math_inc.h" + +#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__) +# include "sparcv9/rem.S" +#elif defined(__CONFIG_SPARC_V8__) +# include "sparcv8/rem.S" +#else +# include "sparcv7/rem.S" +#endif diff --git a/libc/sysdeps/linux/sparc/sdiv.S b/libc/sysdeps/linux/sparc/sdiv.S index 8fdb7daf8..7e9a9c923 100644 --- a/libc/sysdeps/linux/sparc/sdiv.S +++ b/libc/sysdeps/linux/sparc/sdiv.S @@ -1,366 +1,9 @@ -   /* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - *  .div	name of function to generate - *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1 - *  true		true=true => signed; true=false => unsigned - * - * Algorithm parameters: - *  N		how many bits per iteration we try to get (4) - *  WORDSIZE	total number of bits (32) - * - * Derived constants: - *  TOPBITS	number of bits in the top decade of a number - * - * Important variables: - *  Q		the partial quotient under development (initially 0) - *  R		the remainder so far, initially the dividend - *  ITER	number of main division loop iterations required; - *		equal to ceil(log2(quotient) / N).  Note that this - *		is the log base (2^N) of the quotient. - *  V		the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - *  Current estimate for non-large dividend is - *	ceil(log2(quotient) / N) * (10 + 7N/2) + C - *  A large dividend is one greater than 2^(31-TOPBITS) and takes a - *  different path, as the upper bits of the quotient must be developed - *  one bit at a time. - */ - - - -#include <sys/syscall.h> - -.global   .div; -.align 4; -.type  .div ,@function;  - -.div:  -	! 
compute sign of result; if neither is negative, no problem -	orcc	%o1, %o0, %g0	! either negative? -	bge	2f			! no, go do the divide -	xor	%o1, %o0, %g3	! compute sign in any case -	tst	%o1 -	bge	1f -	tst	%o0 -	! %o1 is definitely negative; %o0 might also be negative -	bge	2f			! if %o0 not negative... -	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg -1:	! %o0 is negative, %o1 is nonnegative -	sub	%g0, %o0, %o0	! make %o0 nonnegative -2: - -	! Ready to divide.  Compute size of quotient; scale comparand. -	orcc	%o1, %g0, %o5 -	bne	1f -	mov	%o0, %o3 - -		! Divide by zero trap.  If it returns, return 0 (about as -		! wrong as possible, but that is what SunOS does...). -		ta	0x02 -		retl -		clr	%o0 - -1: -	cmp	%o3, %o5			! if %o1 exceeds %o0, done -	blu	.Lgot_result		! (and algorithm fails otherwise) -	clr	%o2 -	sethi	%hi(1 << (32 - 4 - 1)), %g1 -	cmp	%o3, %g1 -	blu	.Lnot_really_big -	clr	%o4 - -	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, -	! as our usual N-at-a-shot divide step will cause overflow and havoc. -	! The number of bits in the result here is N*ITER+SC, where SC <= N. -	! Compute ITER in an unorthodox manner: know we need to shift V into -	! the top decade: so do not even bother to compare to R. -	1: -		cmp	%o5, %g1 -		bgeu	3f -		mov	1, %g2 -		sll	%o5, 4, %o5 -		b	1b -		add	%o4, 1, %o4 - -	! Now compute %g2. -	2:	addcc	%o5, %o5, %o5 -		bcc	.Lnot_too_big -		add	%g2, 1, %g2 - -		! We get here if the %o1 overflowed while shifting. -		! This means that %o3 has the high-order bit set. -		! Restore %o5 and subtract from %o3. -		sll	%g1, 4, %g1	! high order bit -		srl	%o5, 1, %o5		! rest of %o5 -		add	%o5, %g1, %o5 -		b	.Ldo_single_div -		sub	%g2, 1, %g2 - -	.Lnot_too_big: -	3:	cmp	%o5, %o3 -		blu	2b -		nop -		be	.Ldo_single_div -		nop -	/* NB: these are commented out in the V8-Sparc manual as well */ -	/* (I do not understand this) */ -	! %o5 > %o3: went too far: back up 1 step -	!	srl	%o5, 1, %o5 -	!	dec	%g2 -	! 
do single-bit divide steps -	! -	! We have to be careful here.  We know that %o3 >= %o5, so we can do the -	! first divide step without thinking.  BUT, the others are conditional, -	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- -	! order bit set in the first step, just falling into the regular -	! division loop will mess up the first time around. -	! So we unroll slightly... -	.Ldo_single_div: -		subcc	%g2, 1, %g2 -		bl	.Lend_regular_divide -		nop -		sub	%o3, %o5, %o3 -		mov	1, %o2 -		b	.Lend_single_divloop -		nop -	.Lsingle_divloop: -		sll	%o2, 1, %o2 -		bl	1f -		srl	%o5, 1, %o5 -		! %o3 >= 0 -		sub	%o3, %o5, %o3 -		b	2f -		add	%o2, 1, %o2 -	1:	! %o3 < 0 -		add	%o3, %o5, %o3 -		sub	%o2, 1, %o2 -	2: -	.Lend_single_divloop: -		subcc	%g2, 1, %g2 -		bge	.Lsingle_divloop -		tst	%o3 -		b,a	.Lend_regular_divide - -.Lnot_really_big: -1: -	sll	%o5, 4, %o5 -	cmp	%o5, %o3 -	bleu	1b -	addcc	%o4, 1, %o4 -	be	.Lgot_result -	sub	%o4, 1, %o4 - -	tst	%o3	! set up for initial iteration -.Ldivloop: -	sll	%o2, 4, %o2 -		! depth 1, accumulated bits 0 -	bl	.L1.16 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 2, accumulated bits 1 -	bl	.L2.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 3 -	bl	.L3.19 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 7 -	bl	.L4.23 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (7*2+1), %o2 -	 -.L4.23: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (7*2-1), %o2 -	 -	 -.L3.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 5 -	bl	.L4.21 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (5*2+1), %o2 -	 -.L4.21: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (5*2-1), %o2 -	 -	 -	 -.L2.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! 
depth 3, accumulated bits 1 -	bl	.L3.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 3 -	bl	.L4.19 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (3*2+1), %o2 -	 -.L4.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (3*2-1), %o2 -	 -	 -.L3.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 1 -	bl	.L4.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (1*2+1), %o2 -	 -.L4.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (1*2-1), %o2 -	 -	 -	 -	 -.L1.16: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 2, accumulated bits -1 -	bl	.L2.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -1 -	bl	.L3.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -1 -	bl	.L4.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-1*2+1), %o2 -	 -.L4.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-1*2-1), %o2 -	 -	 -.L3.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -3 -	bl	.L4.13 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-3*2+1), %o2 -	 -.L4.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-3*2-1), %o2 -	 -	 -	 -.L2.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -3 -	bl	.L3.13 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -5 -	bl	.L4.11 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-5*2+1), %o2 -	 -.L4.11: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-5*2-1), %o2 -	 -	 -.L3.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -7 -	bl	.L4.9 -	srl	%o5,1,%o5 -	! 
remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-7*2+1), %o2 -	 -.L4.9: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-7*2-1), %o2 -	 -	 -	 -	 -	9: -.Lend_regular_divide: -	subcc	%o4, 1, %o4 -	bge	.Ldivloop -	tst	%o3 -	bl,a	.Lgot_result -	! non-restoring fixup here (one instruction only!) -	sub	%o2, 1, %o2 - - -.Lgot_result: -	! check to see if answer should be < 0 -	tst	%g3 -	bl,a	1f -	sub %g0, %o2, %o2 -1: -	retl -	mov %o2, %o0 - -.size .div,.-.div; +#include "_math_inc.h" + +#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__) +# include "sparcv9/sdiv.S" +#elif defined(__CONFIG_SPARC_V8__) +# include "sparcv8/sdiv.S" +#else +# include "sparcv7/sdiv.S" +#endif diff --git a/libc/sysdeps/linux/sparc/sparcv7/rem.S b/libc/sysdeps/linux/sparc/sparcv7/rem.S new file mode 100644 index 000000000..ce06e6361 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv7/rem.S @@ -0,0 +1,360 @@ +   /* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + *  .rem	name of function to generate + *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1 + *  true		true=true => signed; true=false => unsigned + * + * Algorithm parameters: + *  N		how many bits per iteration we try to get (4) + *  WORDSIZE	total number of bits (32) + * + * Derived constants: + *  TOPBITS	number of bits in the top decade of a number + * + * Important variables: + *  Q		the partial quotient under development (initially 0) + *  R		the remainder so far, initially the dividend + *  ITER	number of main division loop iterations required; + *		equal to ceil(log2(quotient) / N).  Note that this + *		is the log base (2^N) of the quotient. 
+ *  V		the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + *  Current estimate for non-large dividend is + *	ceil(log2(quotient) / N) * (10 + 7N/2) + C + *  A large dividend is one greater than 2^(31-TOPBITS) and takes a + *  different path, as the upper bits of the quotient must be developed + *  one bit at a time. + */ + + + +ENTRY(.rem) +	! compute sign of result; if neither is negative, no problem +	orcc	%o1, %o0, %g0	! either negative? +	bge	2f			! no, go do the divide +	mov	%o0, %g3		! sign of remainder matches %o0 +	tst	%o1 +	bge	1f +	tst	%o0 +	! %o1 is definitely negative; %o0 might also be negative +	bge	2f			! if %o0 not negative... +	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg +1:	! %o0 is negative, %o1 is nonnegative +	sub	%g0, %o0, %o0	! make %o0 nonnegative +2: + +	! Ready to divide.  Compute size of quotient; scale comparand. +	orcc	%o1, %g0, %o5 +	bne	1f +	mov	%o0, %o3 + +		! Divide by zero trap.  If it returns, return 0 (about as +		! wrong as possible, but that is what SunOS does...). +		ta	ST_DIV0 +		retl +		clr	%o0 + +1: +	cmp	%o3, %o5			! if %o1 exceeds %o0, done +	blu	LOC(got_result)		! (and algorithm fails otherwise) +	clr	%o2 +	sethi	%hi(1 << (32 - 4 - 1)), %g1 +	cmp	%o3, %g1 +	blu	LOC(not_really_big) +	clr	%o4 + +	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, +	! as our usual N-at-a-shot divide step will cause overflow and havoc. +	! The number of bits in the result here is N*ITER+SC, where SC <= N. +	! Compute ITER in an unorthodox manner: know we need to shift V into +	! the top decade: so do not even bother to compare to R. +	1: +		cmp	%o5, %g1 +		bgeu	3f +		mov	1, %g2 +		sll	%o5, 4, %o5 +		b	1b +		add	%o4, 1, %o4 + +	! Now compute %g2. +	2:	addcc	%o5, %o5, %o5 +		bcc	LOC(not_too_big) +		add	%g2, 1, %g2 + +		! We get here if the %o1 overflowed while shifting. +		! This means that %o3 has the high-order bit set. +		! Restore %o5 and subtract from %o3. +		sll	%g1, 4, %g1	! 
high order bit +		srl	%o5, 1, %o5		! rest of %o5 +		add	%o5, %g1, %o5 +		b	LOC(do_single_div) +		sub	%g2, 1, %g2 + +	LOC(not_too_big): +	3:	cmp	%o5, %o3 +		blu	2b +		nop +		be	LOC(do_single_div) +		nop +	/* NB: these are commented out in the V8-Sparc manual as well */ +	/* (I do not understand this) */ +	! %o5 > %o3: went too far: back up 1 step +	!	srl	%o5, 1, %o5 +	!	dec	%g2 +	! do single-bit divide steps +	! +	! We have to be careful here.  We know that %o3 >= %o5, so we can do the +	! first divide step without thinking.  BUT, the others are conditional, +	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- +	! order bit set in the first step, just falling into the regular +	! division loop will mess up the first time around. +	! So we unroll slightly... +	LOC(do_single_div): +		subcc	%g2, 1, %g2 +		bl	LOC(end_regular_divide) +		nop +		sub	%o3, %o5, %o3 +		mov	1, %o2 +		b	LOC(end_single_divloop) +		nop +	LOC(single_divloop): +		sll	%o2, 1, %o2 +		bl	1f +		srl	%o5, 1, %o5 +		! %o3 >= 0 +		sub	%o3, %o5, %o3 +		b	2f +		add	%o2, 1, %o2 +	1:	! %o3 < 0 +		add	%o3, %o5, %o3 +		sub	%o2, 1, %o2 +	2: +	LOC(end_single_divloop): +		subcc	%g2, 1, %g2 +		bge	LOC(single_divloop) +		tst	%o3 +		b,a	LOC(end_regular_divide) + +LOC(not_really_big): +1: +	sll	%o5, 4, %o5 +	cmp	%o5, %o3 +	bleu	1b +	addcc	%o4, 1, %o4 +	be	LOC(got_result) +	sub	%o4, 1, %o4 + +	tst	%o3	! set up for initial iteration +LOC(divloop): +	sll	%o2, 4, %o2 +		! depth 1, accumulated bits 0 +	bl	LOC(1.16) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 2, accumulated bits 1 +	bl	LOC(2.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 3, accumulated bits 3 +	bl	LOC(3.19) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 7 +	bl	LOC(4.23) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (7*2+1), %o2 +	 +LOC(4.23): +	! 
remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (7*2-1), %o2 +	 +	 +LOC(3.19): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 5 +	bl	LOC(4.21) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (5*2+1), %o2 +	 +LOC(4.21): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (5*2-1), %o2 +	 +	 +	 +LOC(2.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 3, accumulated bits 1 +	bl	LOC(3.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 3 +	bl	LOC(4.19) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (3*2+1), %o2 +	 +LOC(4.19): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (3*2-1), %o2 +	 +	 +LOC(3.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 1 +	bl	LOC(4.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (1*2+1), %o2 +	 +LOC(4.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (1*2-1), %o2 +	 +	 +	 +	 +LOC(1.16): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 2, accumulated bits -1 +	bl	LOC(2.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 3, accumulated bits -1 +	bl	LOC(3.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -1 +	bl	LOC(4.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-1*2+1), %o2 +	 +LOC(4.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-1*2-1), %o2 +	 +	 +LOC(3.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -3 +	bl	LOC(4.13) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-3*2+1), %o2 +	 +LOC(4.13): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-3*2-1), %o2 +	 +	 +	 +LOC(2.15): +	! 
remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 3, accumulated bits -3 +	bl	LOC(3.13) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -5 +	bl	LOC(4.11) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-5*2+1), %o2 +	 +LOC(4.11): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-5*2-1), %o2 +	 +	 +LOC(3.13): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -7 +	bl	LOC(4.9) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-7*2+1), %o2 +	 +LOC(4.9): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-7*2-1), %o2 +	 +	 +	 +	 +	9: +LOC(end_regular_divide): +	subcc	%o4, 1, %o4 +	bge	LOC(divloop) +	tst	%o3 +	bl,a	LOC(got_result) +	! non-restoring fixup here (one instruction only!) +	add	%o3, %o1, %o3 + + +LOC(got_result): +	! check to see if answer should be < 0 +	tst	%g3 +	bl,a	1f +	sub %g0, %o3, %o3 +1: +	retl +	mov %o3, %o0 + +END(.rem) diff --git a/libc/sysdeps/linux/sparc/sparcv7/sdiv.S b/libc/sysdeps/linux/sparc/sparcv7/sdiv.S new file mode 100644 index 000000000..d6f2bc7c7 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv7/sdiv.S @@ -0,0 +1,360 @@ +   /* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. 
+ * + * m4 parameters: + *  .div	name of function to generate + *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1 + *  true		true=true => signed; true=false => unsigned + * + * Algorithm parameters: + *  N		how many bits per iteration we try to get (4) + *  WORDSIZE	total number of bits (32) + * + * Derived constants: + *  TOPBITS	number of bits in the top decade of a number + * + * Important variables: + *  Q		the partial quotient under development (initially 0) + *  R		the remainder so far, initially the dividend + *  ITER	number of main division loop iterations required; + *		equal to ceil(log2(quotient) / N).  Note that this + *		is the log base (2^N) of the quotient. + *  V		the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + *  Current estimate for non-large dividend is + *	ceil(log2(quotient) / N) * (10 + 7N/2) + C + *  A large dividend is one greater than 2^(31-TOPBITS) and takes a + *  different path, as the upper bits of the quotient must be developed + *  one bit at a time. + */ + + + +ENTRY(.div) +	! compute sign of result; if neither is negative, no problem +	orcc	%o1, %o0, %g0	! either negative? +	bge	2f			! no, go do the divide +	xor	%o1, %o0, %g3	! compute sign in any case +	tst	%o1 +	bge	1f +	tst	%o0 +	! %o1 is definitely negative; %o0 might also be negative +	bge	2f			! if %o0 not negative... +	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg +1:	! %o0 is negative, %o1 is nonnegative +	sub	%g0, %o0, %o0	! make %o0 nonnegative +2: + +	! Ready to divide.  Compute size of quotient; scale comparand. +	orcc	%o1, %g0, %o5 +	bne	1f +	mov	%o0, %o3 + +		! Divide by zero trap.  If it returns, return 0 (about as +		! wrong as possible, but that is what SunOS does...). +		ta	ST_DIV0 +		retl +		clr	%o0 + +1: +	cmp	%o3, %o5			! if %o1 exceeds %o0, done +	blu	LOC(got_result)		! (and algorithm fails otherwise) +	clr	%o2 +	sethi	%hi(1 << (32 - 4 - 1)), %g1 +	cmp	%o3, %g1 +	blu	LOC(not_really_big) +	clr	%o4 + +	! 
Here the dividend is >= 2**(31-N) or so.  We must be careful here, +	! as our usual N-at-a-shot divide step will cause overflow and havoc. +	! The number of bits in the result here is N*ITER+SC, where SC <= N. +	! Compute ITER in an unorthodox manner: know we need to shift V into +	! the top decade: so do not even bother to compare to R. +	1: +		cmp	%o5, %g1 +		bgeu	3f +		mov	1, %g2 +		sll	%o5, 4, %o5 +		b	1b +		add	%o4, 1, %o4 + +	! Now compute %g2. +	2:	addcc	%o5, %o5, %o5 +		bcc	LOC(not_too_big) +		add	%g2, 1, %g2 + +		! We get here if the %o1 overflowed while shifting. +		! This means that %o3 has the high-order bit set. +		! Restore %o5 and subtract from %o3. +		sll	%g1, 4, %g1	! high order bit +		srl	%o5, 1, %o5		! rest of %o5 +		add	%o5, %g1, %o5 +		b	LOC(do_single_div) +		sub	%g2, 1, %g2 + +	LOC(not_too_big): +	3:	cmp	%o5, %o3 +		blu	2b +		nop +		be	LOC(do_single_div) +		nop +	/* NB: these are commented out in the V8-Sparc manual as well */ +	/* (I do not understand this) */ +	! %o5 > %o3: went too far: back up 1 step +	!	srl	%o5, 1, %o5 +	!	dec	%g2 +	! do single-bit divide steps +	! +	! We have to be careful here.  We know that %o3 >= %o5, so we can do the +	! first divide step without thinking.  BUT, the others are conditional, +	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- +	! order bit set in the first step, just falling into the regular +	! division loop will mess up the first time around. +	! So we unroll slightly... +	LOC(do_single_div): +		subcc	%g2, 1, %g2 +		bl	LOC(end_regular_divide) +		nop +		sub	%o3, %o5, %o3 +		mov	1, %o2 +		b	LOC(end_single_divloop) +		nop +	LOC(single_divloop): +		sll	%o2, 1, %o2 +		bl	1f +		srl	%o5, 1, %o5 +		! %o3 >= 0 +		sub	%o3, %o5, %o3 +		b	2f +		add	%o2, 1, %o2 +	1:	! 
%o3 < 0 +		add	%o3, %o5, %o3 +		sub	%o2, 1, %o2 +	2: +	LOC(end_single_divloop): +		subcc	%g2, 1, %g2 +		bge	LOC(single_divloop) +		tst	%o3 +		b,a	LOC(end_regular_divide) + +LOC(not_really_big): +1: +	sll	%o5, 4, %o5 +	cmp	%o5, %o3 +	bleu	1b +	addcc	%o4, 1, %o4 +	be	LOC(got_result) +	sub	%o4, 1, %o4 + +	tst	%o3	! set up for initial iteration +LOC(divloop): +	sll	%o2, 4, %o2 +		! depth 1, accumulated bits 0 +	bl	LOC(1.16) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 2, accumulated bits 1 +	bl	LOC(2.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 3, accumulated bits 3 +	bl	LOC(3.19) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 7 +	bl	LOC(4.23) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (7*2+1), %o2 +	 +LOC(4.23): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (7*2-1), %o2 +	 +	 +LOC(3.19): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 5 +	bl	LOC(4.21) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (5*2+1), %o2 +	 +LOC(4.21): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (5*2-1), %o2 +	 +	 +	 +LOC(2.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 3, accumulated bits 1 +	bl	LOC(3.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 3 +	bl	LOC(4.19) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (3*2+1), %o2 +	 +LOC(4.19): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (3*2-1), %o2 +	 +	 +LOC(3.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 1 +	bl	LOC(4.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (1*2+1), %o2 +	 +LOC(4.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (1*2-1), %o2 +	 +	 +	 +	 +LOC(1.16): +	! 
remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 2, accumulated bits -1 +	bl	LOC(2.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 3, accumulated bits -1 +	bl	LOC(3.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -1 +	bl	LOC(4.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-1*2+1), %o2 +	 +LOC(4.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-1*2-1), %o2 +	 +	 +LOC(3.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -3 +	bl	LOC(4.13) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-3*2+1), %o2 +	 +LOC(4.13): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-3*2-1), %o2 +	 +	 +	 +LOC(2.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 3, accumulated bits -3 +	bl	LOC(3.13) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -5 +	bl	LOC(4.11) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-5*2+1), %o2 +	 +LOC(4.11): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-5*2-1), %o2 +	 +	 +LOC(3.13): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -7 +	bl	LOC(4.9) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-7*2+1), %o2 +	 +LOC(4.9): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-7*2-1), %o2 +	 +	 +	 +	 +	9: +LOC(end_regular_divide): +	subcc	%o4, 1, %o4 +	bge	LOC(divloop) +	tst	%o3 +	bl,a	LOC(got_result) +	! non-restoring fixup here (one instruction only!) +	sub	%o2, 1, %o2 + + +LOC(got_result): +	! 
check to see if answer should be < 0 +	tst	%g3 +	bl,a	1f +	sub %g0, %o2, %o2 +1: +	retl +	mov %o2, %o0 + +END(.div) diff --git a/libc/sysdeps/linux/sparc/sparcv7/udiv.S b/libc/sysdeps/linux/sparc/sparcv7/udiv.S new file mode 100644 index 000000000..56c64ad01 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv7/udiv.S @@ -0,0 +1,343 @@ +   /* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + *  .udiv	name of function to generate + *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1 + *  false		false=true => signed; false=false => unsigned + * + * Algorithm parameters: + *  N		how many bits per iteration we try to get (4) + *  WORDSIZE	total number of bits (32) + * + * Derived constants: + *  TOPBITS	number of bits in the top decade of a number + * + * Important variables: + *  Q		the partial quotient under development (initially 0) + *  R		the remainder so far, initially the dividend + *  ITER	number of main division loop iterations required; + *		equal to ceil(log2(quotient) / N).  Note that this + *		is the log base (2^N) of the quotient. + *  V		the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + *  Current estimate for non-large dividend is + *	ceil(log2(quotient) / N) * (10 + 7N/2) + C + *  A large dividend is one greater than 2^(31-TOPBITS) and takes a + *  different path, as the upper bits of the quotient must be developed + *  one bit at a time. + */ + + + +ENTRY(.udiv) + +	! Ready to divide.  Compute size of quotient; scale comparand. +	orcc	%o1, %g0, %o5 +	bne	1f +	mov	%o0, %o3 + +		! Divide by zero trap.  If it returns, return 0 (about as +		! wrong as possible, but that is what SunOS does...). +		ta	ST_DIV0 +		retl +		clr	%o0 + +1: +	cmp	%o3, %o5			! if %o1 exceeds %o0, done +	blu	LOC(got_result)		! 
(and algorithm fails otherwise) +	clr	%o2 +	sethi	%hi(1 << (32 - 4 - 1)), %g1 +	cmp	%o3, %g1 +	blu	LOC(not_really_big) +	clr	%o4 + +	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, +	! as our usual N-at-a-shot divide step will cause overflow and havoc. +	! The number of bits in the result here is N*ITER+SC, where SC <= N. +	! Compute ITER in an unorthodox manner: know we need to shift V into +	! the top decade: so do not even bother to compare to R. +	1: +		cmp	%o5, %g1 +		bgeu	3f +		mov	1, %g2 +		sll	%o5, 4, %o5 +		b	1b +		add	%o4, 1, %o4 + +	! Now compute %g2. +	2:	addcc	%o5, %o5, %o5 +		bcc	LOC(not_too_big) +		add	%g2, 1, %g2 + +		! We get here if the %o1 overflowed while shifting. +		! This means that %o3 has the high-order bit set. +		! Restore %o5 and subtract from %o3. +		sll	%g1, 4, %g1	! high order bit +		srl	%o5, 1, %o5		! rest of %o5 +		add	%o5, %g1, %o5 +		b	LOC(do_single_div) +		sub	%g2, 1, %g2 + +	LOC(not_too_big): +	3:	cmp	%o5, %o3 +		blu	2b +		nop +		be	LOC(do_single_div) +		nop +	/* NB: these are commented out in the V8-Sparc manual as well */ +	/* (I do not understand this) */ +	! %o5 > %o3: went too far: back up 1 step +	!	srl	%o5, 1, %o5 +	!	dec	%g2 +	! do single-bit divide steps +	! +	! We have to be careful here.  We know that %o3 >= %o5, so we can do the +	! first divide step without thinking.  BUT, the others are conditional, +	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- +	! order bit set in the first step, just falling into the regular +	! division loop will mess up the first time around. +	! So we unroll slightly... +	LOC(do_single_div): +		subcc	%g2, 1, %g2 +		bl	LOC(end_regular_divide) +		nop +		sub	%o3, %o5, %o3 +		mov	1, %o2 +		b	LOC(end_single_divloop) +		nop +	LOC(single_divloop): +		sll	%o2, 1, %o2 +		bl	1f +		srl	%o5, 1, %o5 +		! %o3 >= 0 +		sub	%o3, %o5, %o3 +		b	2f +		add	%o2, 1, %o2 +	1:	! 
%o3 < 0 +		add	%o3, %o5, %o3 +		sub	%o2, 1, %o2 +	2: +	LOC(end_single_divloop): +		subcc	%g2, 1, %g2 +		bge	LOC(single_divloop) +		tst	%o3 +		b,a	LOC(end_regular_divide) + +LOC(not_really_big): +1: +	sll	%o5, 4, %o5 +	cmp	%o5, %o3 +	bleu	1b +	addcc	%o4, 1, %o4 +	be	LOC(got_result) +	sub	%o4, 1, %o4 + +	tst	%o3	! set up for initial iteration +LOC(divloop): +	sll	%o2, 4, %o2 +		! depth 1, accumulated bits 0 +	bl	LOC(1.16) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 2, accumulated bits 1 +	bl	LOC(2.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 3, accumulated bits 3 +	bl	LOC(3.19) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 7 +	bl	LOC(4.23) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (7*2+1), %o2 +	 +LOC(4.23): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (7*2-1), %o2 +	 +	 +LOC(3.19): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 5 +	bl	LOC(4.21) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (5*2+1), %o2 +	 +LOC(4.21): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (5*2-1), %o2 +	 +	 +	 +LOC(2.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 3, accumulated bits 1 +	bl	LOC(3.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 3 +	bl	LOC(4.19) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (3*2+1), %o2 +	 +LOC(4.19): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (3*2-1), %o2 +	 +	 +LOC(3.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 1 +	bl	LOC(4.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (1*2+1), %o2 +	 +LOC(4.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (1*2-1), %o2 +	 +	 +	 +	 +LOC(1.16): +	! 
remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 2, accumulated bits -1 +	bl	LOC(2.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 3, accumulated bits -1 +	bl	LOC(3.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -1 +	bl	LOC(4.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-1*2+1), %o2 +	 +LOC(4.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-1*2-1), %o2 +	 +	 +LOC(3.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -3 +	bl	LOC(4.13) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-3*2+1), %o2 +	 +LOC(4.13): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-3*2-1), %o2 +	 +	 +	 +LOC(2.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 3, accumulated bits -3 +	bl	LOC(3.13) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -5 +	bl	LOC(4.11) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-5*2+1), %o2 +	 +LOC(4.11): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-5*2-1), %o2 +	 +	 +LOC(3.13): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -7 +	bl	LOC(4.9) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-7*2+1), %o2 +	 +LOC(4.9): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-7*2-1), %o2 +	 +	 +	 +	 +	9: +LOC(end_regular_divide): +	subcc	%o4, 1, %o4 +	bge	LOC(divloop) +	tst	%o3 +	bl,a	LOC(got_result) +	! non-restoring fixup here (one instruction only!) 
+	sub	%o2, 1, %o2 + + +LOC(got_result): + +	retl +	mov %o2, %o0 + +END(.udiv) diff --git a/libc/sysdeps/linux/sparc/sparcv7/umul.S b/libc/sysdeps/linux/sparc/sparcv7/umul.S new file mode 100644 index 000000000..50b3157db --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv7/umul.S @@ -0,0 +1,153 @@ +/* + * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the + * upper 32 bits of the 64-bit product). + * + * This code optimizes short (less than 13-bit) multiplies.  Short + * multiplies require 25 instruction cycles, and long ones require + * 45 instruction cycles. + * + * On return, overflow has occurred (%o1 is not zero) if and only if + * the Z condition code is clear, allowing, e.g., the following: + * + *	call	.umul + *	nop + *	bnz	overflow	(or tnz) + */ + +ENTRY(.umul) +	or	%o0, %o1, %o4 +	mov	%o0, %y			! multiplier -> Y +	andncc	%o4, 0xfff, %g0		! test bits 12..31 of *both* args +	be	LOC(mul_shortway)	! if zero, can do it the short way +	 andcc	%g0, %g0, %o4		! zero the partial product; clear N & V + +	/* +	 * Long multiply.  32 steps, followed by a final shift step. +	 */ +	mulscc	%o4, %o1, %o4	! 1 +	mulscc	%o4, %o1, %o4	! 2 +	mulscc	%o4, %o1, %o4	! 3 +	mulscc	%o4, %o1, %o4	! 4 +	mulscc	%o4, %o1, %o4	! 5 +	mulscc	%o4, %o1, %o4	! 6 +	mulscc	%o4, %o1, %o4	! 7 +	mulscc	%o4, %o1, %o4	! 8 +	mulscc	%o4, %o1, %o4	! 9 +	mulscc	%o4, %o1, %o4	! 10 +	mulscc	%o4, %o1, %o4	! 11 +	mulscc	%o4, %o1, %o4	! 12 +	mulscc	%o4, %o1, %o4	! 13 +	mulscc	%o4, %o1, %o4	! 14 +	mulscc	%o4, %o1, %o4	! 15 +	mulscc	%o4, %o1, %o4	! 16 +	mulscc	%o4, %o1, %o4	! 17 +	mulscc	%o4, %o1, %o4	! 18 +	mulscc	%o4, %o1, %o4	! 19 +	mulscc	%o4, %o1, %o4	! 20 +	mulscc	%o4, %o1, %o4	! 21 +	mulscc	%o4, %o1, %o4	! 22 +	mulscc	%o4, %o1, %o4	! 23 +	mulscc	%o4, %o1, %o4	! 24 +	mulscc	%o4, %o1, %o4	! 25 +	mulscc	%o4, %o1, %o4	! 26 +	mulscc	%o4, %o1, %o4	! 27 +	mulscc	%o4, %o1, %o4	! 28 +	mulscc	%o4, %o1, %o4	! 29 +	mulscc	%o4, %o1, %o4	! 30 +	mulscc	%o4, %o1, %o4	! 31 +	mulscc	%o4, %o1, %o4	! 
32 +	mulscc	%o4, %g0, %o4	! final shift + +	/* +	 * Normally, with the shift-and-add approach, if both numbers are +	 * positive you get the correct result.  With 32-bit two's-complement +	 * numbers, -x is represented as +	 * +	 *		  x		    32 +	 *	( 2  -  ------ ) mod 2  *  2 +	 *		   32 +	 *		  2 +	 * +	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s, +	 * we can treat this as if the radix point were just to the left +	 * of the sign bit (multiply by 2^32), and get +	 * +	 *	-x  =  (2 - x) mod 2 +	 * +	 * Then, ignoring the `mod 2's for convenience: +	 * +	 *   x *  y	= xy +	 *  -x *  y	= 2y - xy +	 *   x * -y	= 2x - xy +	 *  -x * -y	= 4 - 2x - 2y + xy +	 * +	 * For signed multiplies, we subtract (x << 32) from the partial +	 * product to fix this problem for negative multipliers (see mul.s). +	 * Because of the way the shift into the partial product is calculated +	 * (N xor V), this term is automatically removed for the multiplicand, +	 * so we don't have to adjust. +	 * +	 * But for unsigned multiplies, the high order bit wasn't a sign bit, +	 * and the correction is wrong.  So for unsigned multiplies where the +	 * high order bit is one, we end up with xy - (y << 32).  To fix it +	 * we add y << 32. +	 */ +#if 0 +	tst	%o1 +	bl,a	1f		! if %o1 < 0 (high order bit = 1), +	 add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half) +1:	rd	%y, %o0		! get lower half of product +	retl +	 addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0 +#else +	/* Faster code from tege@sics.se.  */ +	sra	%o1, 31, %o2	! make mask from sign bit +	and	%o0, %o2, %o2	! %o2 = 0 or %o0, depending on sign of %o1 +	rd	%y, %o0		! get lower half of product +	retl +	 addcc	%o4, %o2, %o1	! add compensation and put upper half in place +#endif + +LOC(mul_shortway): +	/* +	 * Short multiply.  12 steps, followed by a final shift step. 
+	 * The resulting bits are off by 12 and (32-12) = 20 bit positions, +	 * but there is no problem with %o0 being negative (unlike above), +	 * and overflow is impossible (the answer is at most 24 bits long). +	 */ +	mulscc	%o4, %o1, %o4	! 1 +	mulscc	%o4, %o1, %o4	! 2 +	mulscc	%o4, %o1, %o4	! 3 +	mulscc	%o4, %o1, %o4	! 4 +	mulscc	%o4, %o1, %o4	! 5 +	mulscc	%o4, %o1, %o4	! 6 +	mulscc	%o4, %o1, %o4	! 7 +	mulscc	%o4, %o1, %o4	! 8 +	mulscc	%o4, %o1, %o4	! 9 +	mulscc	%o4, %o1, %o4	! 10 +	mulscc	%o4, %o1, %o4	! 11 +	mulscc	%o4, %o1, %o4	! 12 +	mulscc	%o4, %g0, %o4	! final shift + +	/* +	 * %o4 has 20 of the bits that should be in the result; %y has +	 * the bottom 12 (as %y's top 12).  That is: +	 * +	 *	  %o4		    %y +	 * +----------------+----------------+ +	 * | -12- |   -20-  | -12- |   -20-  | +	 * +------(---------+------)---------+ +	 *	   -----result----- +	 * +	 * The 12 bits of %o4 left of the `result' area are all zero; +	 * in fact, all top 20 bits of %o4 are zero. +	 */ + +	rd	%y, %o5 +	sll	%o4, 12, %o0	! shift middle bits left 12 +	srl	%o5, 20, %o5	! shift low bits right 20 +	or	%o5, %o0, %o0 +	retl +	 addcc	%g0, %g0, %o1	! %o1 = zero, and set Z + +END(.umul) diff --git a/libc/sysdeps/linux/sparc/sparcv7/urem.S b/libc/sysdeps/linux/sparc/sparcv7/urem.S new file mode 100644 index 000000000..ecf34672a --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv7/urem.S @@ -0,0 +1,343 @@ +   /* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. 
+ * + * m4 parameters: + *  .urem	name of function to generate + *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1 + *  false		false=true => signed; false=false => unsigned + * + * Algorithm parameters: + *  N		how many bits per iteration we try to get (4) + *  WORDSIZE	total number of bits (32) + * + * Derived constants: + *  TOPBITS	number of bits in the top decade of a number + * + * Important variables: + *  Q		the partial quotient under development (initially 0) + *  R		the remainder so far, initially the dividend + *  ITER	number of main division loop iterations required; + *		equal to ceil(log2(quotient) / N).  Note that this + *		is the log base (2^N) of the quotient. + *  V		the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + *  Current estimate for non-large dividend is + *	ceil(log2(quotient) / N) * (10 + 7N/2) + C + *  A large dividend is one greater than 2^(31-TOPBITS) and takes a + *  different path, as the upper bits of the quotient must be developed + *  one bit at a time. + */ + + + +ENTRY(.urem) + +	! Ready to divide.  Compute size of quotient; scale comparand. +	orcc	%o1, %g0, %o5 +	bne	1f +	mov	%o0, %o3 + +		! Divide by zero trap.  If it returns, return 0 (about as +		! wrong as possible, but that is what SunOS does...). +		ta	ST_DIV0 +		retl +		clr	%o0 + +1: +	cmp	%o3, %o5			! if %o1 exceeds %o0, done +	blu	LOC(got_result)		! (and algorithm fails otherwise) +	clr	%o2 +	sethi	%hi(1 << (32 - 4 - 1)), %g1 +	cmp	%o3, %g1 +	blu	LOC(not_really_big) +	clr	%o4 + +	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, +	! as our usual N-at-a-shot divide step will cause overflow and havoc. +	! The number of bits in the result here is N*ITER+SC, where SC <= N. +	! Compute ITER in an unorthodox manner: know we need to shift V into +	! the top decade: so do not even bother to compare to R. +	1: +		cmp	%o5, %g1 +		bgeu	3f +		mov	1, %g2 +		sll	%o5, 4, %o5 +		b	1b +		add	%o4, 1, %o4 + +	! Now compute %g2. 
+	2:	addcc	%o5, %o5, %o5 +		bcc	LOC(not_too_big) +		add	%g2, 1, %g2 + +		! We get here if the %o1 overflowed while shifting. +		! This means that %o3 has the high-order bit set. +		! Restore %o5 and subtract from %o3. +		sll	%g1, 4, %g1	! high order bit +		srl	%o5, 1, %o5		! rest of %o5 +		add	%o5, %g1, %o5 +		b	LOC(do_single_div) +		sub	%g2, 1, %g2 + +	LOC(not_too_big): +	3:	cmp	%o5, %o3 +		blu	2b +		nop +		be	LOC(do_single_div) +		nop +	/* NB: these are commented out in the V8-Sparc manual as well */ +	/* (I do not understand this) */ +	! %o5 > %o3: went too far: back up 1 step +	!	srl	%o5, 1, %o5 +	!	dec	%g2 +	! do single-bit divide steps +	! +	! We have to be careful here.  We know that %o3 >= %o5, so we can do the +	! first divide step without thinking.  BUT, the others are conditional, +	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- +	! order bit set in the first step, just falling into the regular +	! division loop will mess up the first time around. +	! So we unroll slightly... +	LOC(do_single_div): +		subcc	%g2, 1, %g2 +		bl	LOC(end_regular_divide) +		nop +		sub	%o3, %o5, %o3 +		mov	1, %o2 +		b	LOC(end_single_divloop) +		nop +	LOC(single_divloop): +		sll	%o2, 1, %o2 +		bl	1f +		srl	%o5, 1, %o5 +		! %o3 >= 0 +		sub	%o3, %o5, %o3 +		b	2f +		add	%o2, 1, %o2 +	1:	! %o3 < 0 +		add	%o3, %o5, %o3 +		sub	%o2, 1, %o2 +	2: +	LOC(end_single_divloop): +		subcc	%g2, 1, %g2 +		bge	LOC(single_divloop) +		tst	%o3 +		b,a	LOC(end_regular_divide) + +LOC(not_really_big): +1: +	sll	%o5, 4, %o5 +	cmp	%o5, %o3 +	bleu	1b +	addcc	%o4, 1, %o4 +	be	LOC(got_result) +	sub	%o4, 1, %o4 + +	tst	%o3	! set up for initial iteration +LOC(divloop): +	sll	%o2, 4, %o2 +		! depth 1, accumulated bits 0 +	bl	LOC(1.16) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 2, accumulated bits 1 +	bl	LOC(2.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 3, accumulated bits 3 +	bl	LOC(3.19) +	srl	%o5,1,%o5 +	! 
remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 7 +	bl	LOC(4.23) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (7*2+1), %o2 +	 +LOC(4.23): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (7*2-1), %o2 +	 +	 +LOC(3.19): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 5 +	bl	LOC(4.21) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (5*2+1), %o2 +	 +LOC(4.21): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (5*2-1), %o2 +	 +	 +	 +LOC(2.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 3, accumulated bits 1 +	bl	LOC(3.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 3 +	bl	LOC(4.19) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (3*2+1), %o2 +	 +LOC(4.19): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (3*2-1), %o2 +	 +	 +LOC(3.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits 1 +	bl	LOC(4.17) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (1*2+1), %o2 +	 +LOC(4.17): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (1*2-1), %o2 +	 +	 +	 +	 +LOC(1.16): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 2, accumulated bits -1 +	bl	LOC(2.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 3, accumulated bits -1 +	bl	LOC(3.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -1 +	bl	LOC(4.15) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-1*2+1), %o2 +	 +LOC(4.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-1*2-1), %o2 +	 +	 +LOC(3.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -3 +	bl	LOC(4.13) +	srl	%o5,1,%o5 +	! 
remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-3*2+1), %o2 +	 +LOC(4.13): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-3*2-1), %o2 +	 +	 +	 +LOC(2.15): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 3, accumulated bits -3 +	bl	LOC(3.13) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -5 +	bl	LOC(4.11) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-5*2+1), %o2 +	 +LOC(4.11): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-5*2-1), %o2 +	 +	 +LOC(3.13): +	! remainder is negative +	addcc	%o3,%o5,%o3 +			! depth 4, accumulated bits -7 +	bl	LOC(4.9) +	srl	%o5,1,%o5 +	! remainder is positive +	subcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-7*2+1), %o2 +	 +LOC(4.9): +	! remainder is negative +	addcc	%o3,%o5,%o3 +		b	9f +		add	%o2, (-7*2-1), %o2 +	 +	 +	 +	 +	9: +LOC(end_regular_divide): +	subcc	%o4, 1, %o4 +	bge	LOC(divloop) +	tst	%o3 +	bl,a	LOC(got_result) +	! non-restoring fixup here (one instruction only!) +	add	%o3, %o1, %o3 + + +LOC(got_result): + +	retl +	mov %o3, %o0 + +END(.urem) diff --git a/libc/sysdeps/linux/sparc/sparcv8/rem.S b/libc/sysdeps/linux/sparc/sparcv8/rem.S new file mode 100644 index 000000000..c4faebe88 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv8/rem.S @@ -0,0 +1,19 @@ +/* + * Sparc v8 has divide. + */ + +ENTRY(.rem) + +	sra	%o0, 31, %o2 +	wr	%o2, 0, %y +	nop +	nop +	nop +	sdivcc	%o0, %o1, %o2 +	bvs,a	1f +	 xnor	%o2, %g0, %o2 +1:	smul	%o2, %o1, %o2 +	retl +	 sub	%o0, %o2, %o0 + +END(.rem) diff --git a/libc/sysdeps/linux/sparc/sparcv8/sdiv.S b/libc/sysdeps/linux/sparc/sparcv8/sdiv.S new file mode 100644 index 000000000..4ac901a77 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv8/sdiv.S @@ -0,0 +1,18 @@ +/* + * Sparc v8 has divide. 
+ */ + +ENTRY(.div) + +	sra	%o0, 31, %o2 +	wr	%o2, 0, %y +	nop +	nop +	nop +	sdivcc	%o0, %o1, %o0 +	bvs,a	1f +	 xnor	%o0, %g0, %o0 +1:	retl +	 nop + +END(.div) diff --git a/libc/sysdeps/linux/sparc/sparcv8/udiv.S b/libc/sysdeps/linux/sparc/sparcv8/udiv.S new file mode 100644 index 000000000..d5d93bb8f --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv8/udiv.S @@ -0,0 +1,13 @@ +/* + * Sparc v8 has divide. + */ + +ENTRY(.udiv) + +	wr	%g0, 0, %y +	nop +	nop +	retl +	 udiv	%o0, %o1, %o0 + +END(.udiv) diff --git a/libc/sysdeps/linux/sparc/sparcv8/umul.S b/libc/sysdeps/linux/sparc/sparcv8/umul.S new file mode 100644 index 000000000..47b98e9ca --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv8/umul.S @@ -0,0 +1,11 @@ +/* + * Sparc v8 has multiply. + */ + +ENTRY(.umul) + +	umul	%o0, %o1, %o0 +	retl +	 rd	%y, %o1 + +END(.umul) diff --git a/libc/sysdeps/linux/sparc/sparcv8/urem.S b/libc/sysdeps/linux/sparc/sparcv8/urem.S new file mode 100644 index 000000000..0e96246f5 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv8/urem.S @@ -0,0 +1,16 @@ +/* + * Sparc v8 has divide. + */ + +ENTRY(.urem) + +	wr	%g0, 0, %y +	nop +	nop +	nop +	udiv	%o0, %o1, %o2 +	umul	%o2, %o1, %o2 +	retl +	 sub	%o0, %o2, %o0 + +END(.urem) diff --git a/libc/sysdeps/linux/sparc/sparcv9/rem.S b/libc/sysdeps/linux/sparc/sparcv9/rem.S new file mode 100644 index 000000000..1474e32ae --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv9/rem.S @@ -0,0 +1,20 @@ +/* + * Sparc v9 has divide. + * As divx takes 68 cycles and sdivcc only 36, + * we use sdivcc even though it is deprecated. 
+ */ + +	.text +	.align		32 +ENTRY(.rem) + +	sra		%o0, 31, %o2 +	wr		%o2, 0, %y +	sdivcc		%o0, %o1, %o2 +	xnor		%o2, %g0, %o3 +	movvs		%icc, %o3, %o2 +	smul		%o2, %o1, %o2 +	retl +	 sub		%o0, %o2, %o0 + +END(.rem) diff --git a/libc/sysdeps/linux/sparc/sparcv9/sdiv.S b/libc/sysdeps/linux/sparc/sparcv9/sdiv.S new file mode 100644 index 000000000..45535bb68 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv9/sdiv.S @@ -0,0 +1,18 @@ +/* + * Sparc v9 has divide. + * As divx takes 68 cycles and sdivcc only 36, + * we use sdivcc even though it is deprecated. + */ + +	.text +	.align		32 +ENTRY(.div) + +	sra		%o0, 31, %o2 +	wr		%o2, 0, %y +	sdivcc		%o0, %o1, %o0 +	xnor		%o0, %g0, %o2 +	retl +	 movvs		%icc, %o2, %o0 + +END(.div) diff --git a/libc/sysdeps/linux/sparc/sparcv9/udiv.S b/libc/sysdeps/linux/sparc/sparcv9/udiv.S new file mode 100644 index 000000000..303f29bdf --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv9/udiv.S @@ -0,0 +1,15 @@ +/* + * Sparc v9 has divide. + * As divx takes 68 cycles and udiv only 37, + * we use udiv even though it is deprecated. + */ + +	.text +	.align		32 +ENTRY(.udiv) + +	wr		%g0, 0, %y +	retl +	 udiv		%o0, %o1, %o0 + +END(.udiv) diff --git a/libc/sysdeps/linux/sparc/sparcv9/umul.S b/libc/sysdeps/linux/sparc/sparcv9/umul.S new file mode 100644 index 000000000..e65e4b95f --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv9/umul.S @@ -0,0 +1,15 @@ +/* + * Sparc v9 has multiply. + */ + +	.text +	.align		32 +ENTRY(.umul) + +	srl		%o0, 0, %o0 +	srl		%o1, 0, %o1 +	mulx		%o0, %o1, %o0 +	retl +	 srlx		%o0, 32, %o1 + +END(.umul) diff --git a/libc/sysdeps/linux/sparc/sparcv9/urem.S b/libc/sysdeps/linux/sparc/sparcv9/urem.S new file mode 100644 index 000000000..93542698d --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv9/urem.S @@ -0,0 +1,17 @@ +/* + * Sparc v9 has divide. + * As divx takes 68 cycles and udiv only 37, + * we use udiv even though it is deprecated. 
+ */ + +	.text +	.align		32 +ENTRY(.urem) + +	wr		%g0, 0, %y +	udiv		%o0, %o1, %o2 +	umul		%o2, %o1, %o2 +	retl +	 sub		%o0, %o2, %o0 + +END(.urem) diff --git a/libc/sysdeps/linux/sparc/udiv.S b/libc/sysdeps/linux/sparc/udiv.S index 85eeb40fc..a1355a761 100644 --- a/libc/sysdeps/linux/sparc/udiv.S +++ b/libc/sysdeps/linux/sparc/udiv.S @@ -1,348 +1,9 @@ -   /* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - *  .udiv	name of function to generate - *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1 - *  false		false=true => signed; false=false => unsigned - * - * Algorithm parameters: - *  N		how many bits per iteration we try to get (4) - *  WORDSIZE	total number of bits (32) - * - * Derived constants: - *  TOPBITS	number of bits in the top decade of a number - * - * Important variables: - *  Q		the partial quotient under development (initially 0) - *  R		the remainder so far, initially the dividend - *  ITER	number of main division loop iterations required; - *		equal to ceil(log2(quotient) / N).  Note that this - *		is the log base (2^N) of the quotient. - *  V		the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - *  Current estimate for non-large dividend is - *	ceil(log2(quotient) / N) * (10 + 7N/2) + C - *  A large dividend is one greater than 2^(31-TOPBITS) and takes a - *  different path, as the upper bits of the quotient must be developed - *  one bit at a time. - */ - - - -#include <sys/syscall.h> - -.global   .udiv; -.align 4; -.type  .udiv ,@function;  - -.udiv:  -	! Ready to divide.  Compute size of quotient; scale comparand. -	orcc	%o1, %g0, %o5 -	bne	1f -	mov	%o0, %o3 - -		! Divide by zero trap.  If it returns, return 0 (about as -		! wrong as possible, but that is what SunOS does...). 
-		ta	0x02 -		retl -		clr	%o0 - -1: -	cmp	%o3, %o5			! if %o1 exceeds %o0, done -	blu	.Lgot_result  		! (and algorithm fails otherwise) -	clr	%o2 -	sethi	%hi(1 << (32 - 4 - 1)), %g1 -	cmp	%o3, %g1 -	blu	.Lnot_really_big   -	clr	%o4 - -	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, -	! as our usual N-at-a-shot divide step will cause overflow and havoc. -	! The number of bits in the result here is N*ITER+SC, where SC <= N. -	! Compute ITER in an unorthodox manner: know we need to shift V into -	! the top decade: so do not even bother to compare to R. -	1: -		cmp	%o5, %g1 -		bgeu	3f -		mov	1, %g2 -		sll	%o5, 4, %o5 -		b	1b -		add	%o4, 1, %o4 - -	! Now compute %g2. -	2:	addcc	%o5, %o5, %o5 -		bcc	.Lnot_too_big   -		add	%g2, 1, %g2 - -		! We get here if the %o1 overflowed while shifting. -		! This means that %o3 has the high-order bit set. -		! Restore %o5 and subtract from %o3. -		sll	%g1, 4, %g1	! high order bit -		srl	%o5, 1, %o5		! rest of %o5 -		add	%o5, %g1, %o5 -		b	.Ldo_single_div -		sub	%g2, 1, %g2 - -	.Lnot_too_big: -	3:	cmp	%o5, %o3 -		blu	2b -		nop -		be	.Ldo_single_div -		nop -	/* NB: these are commented out in the V8-Sparc manual as well */ -	/* (I do not understand this) */ -	! %o5 > %o3: went too far: back up 1 step -	!	srl	%o5, 1, %o5 -	!	dec	%g2 -	! do single-bit divide steps -	! -	! We have to be careful here.  We know that %o3 >= %o5, so we can do the -	! first divide step without thinking.  BUT, the others are conditional, -	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- -	! order bit set in the first step, just falling into the regular -	! division loop will mess up the first time around. -	! So we unroll slightly... -	.Ldo_single_div: -		subcc	%g2, 1, %g2 -		bl	.Lend_regular_divide -		nop -		sub	%o3, %o5, %o3 -		mov	1, %o2 -		b	.Lend_single_divloop -		nop -	.Lsingle_divloop: -		sll	%o2, 1, %o2 -		bl	1f -		srl	%o5, 1, %o5 -		! %o3 >= 0 -		sub	%o3, %o5, %o3 -		b	2f -		add	%o2, 1, %o2 -	1:	! 
%o3 < 0 -		add	%o3, %o5, %o3 -		sub	%o2, 1, %o2 -	2: -	.Lend_single_divloop: -		subcc	%g2, 1, %g2 -		bge	.Lsingle_divloop -		tst	%o3 -		b,a	.Lend_regular_divide - -.Lnot_really_big: -1: -	sll	%o5, 4, %o5 -	cmp	%o5, %o3 -	bleu	1b -	addcc	%o4, 1, %o4 -	be	.Lgot_result -	sub	%o4, 1, %o4 - -	tst	%o3	! set up for initial iteration -.Ldivloop: -	sll	%o2, 4, %o2 -		! depth 1, accumulated bits 0 -	bl	.L1.16 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 2, accumulated bits 1 -	bl	.L2.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 3 -	bl	.L3.19 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 7 -	bl	.L4.23 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (7*2+1), %o2 -	 -.L4.23: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (7*2-1), %o2 -	 -	 -.L3.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 5 -	bl	.L4.21 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (5*2+1), %o2 -	 -.L4.21: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (5*2-1), %o2 -	 -	 -	 -.L2.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 1 -	bl	.L3.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 3 -	bl	.L4.19 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (3*2+1), %o2 -	 -.L4.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (3*2-1), %o2 -	 -	 -.L3.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 1 -	bl	.L4.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (1*2+1), %o2 -	 -.L4.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (1*2-1), %o2 -	 -	 -	 -	 -.L1.16: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! 
depth 2, accumulated bits -1 -	bl	.L2.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -1 -	bl	.L3.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -1 -	bl	.L4.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-1*2+1), %o2 -	 -.L4.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-1*2-1), %o2 -	 -	 -.L3.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -3 -	bl	.L4.13 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-3*2+1), %o2 -	 -.L4.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-3*2-1), %o2 -	 -	 -	 -.L2.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -3 -	bl	.L3.13 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -5 -	bl	.L4.11 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-5*2+1), %o2 -	 -.L4.11: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-5*2-1), %o2 -	 -	 -.L3.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -7 -	bl	.L4.9 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-7*2+1), %o2 -	 -.L4.9: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-7*2-1), %o2 -	 -	 -	 -	 -	9: -.Lend_regular_divide: -	subcc	%o4, 1, %o4 -	bge	.Ldivloop -	tst	%o3 -	bl,a	.Lgot_result -	! non-restoring fixup here (one instruction only!) 
-	sub	%o2, 1, %o2 - - -.Lgot_result: - -	retl -	mov %o2, %o0 - -.size .udiv,.-.udiv; +#include "_math_inc.h" + +#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__) +# include "sparcv9/udiv.S" +#elif defined(__CONFIG_SPARC_V8__) +# include "sparcv8/udiv.S" +#else +# include "sparcv7/udiv.S" +#endif diff --git a/libc/sysdeps/linux/sparc/umul.S b/libc/sysdeps/linux/sparc/umul.S index 4f38d78eb..e86059743 100644 --- a/libc/sysdeps/linux/sparc/umul.S +++ b/libc/sysdeps/linux/sparc/umul.S @@ -1,160 +1,9 @@ -/* - * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the - * upper 32 bits of the 64-bit product). - * - * This code optimizes short (less than 13-bit) multiplies.  Short - * multiplies require 25 instruction cycles, and long ones require - * 45 instruction cycles. - * - * On return, overflow has occurred (%o1 is not zero) if and only if - * the Z condition code is clear, allowing, e.g., the following: - * - *	call	.umul - *	nop - *	bnz	overflow	(or tnz) - */ +#include "_math_inc.h" -#include <sys/syscall.h> - - -.global   .umul; -.align 4; -.type  .umul ,@function;  - -.umul:  -	or	%o0, %o1, %o4 -	mov	%o0, %y			! multiplier -> Y -	andncc	%o4, 0xfff, %g0		! test bits 12..31 of *both* args -	be	.Lmul_shortway	! if zero, can do it the short way -	 andcc	%g0, %g0, %o4		! zero the partial product; clear N & V - -	/* -	 * Long multiply.  32 steps, followed by a final shift step. -	 */ -	mulscc	%o4, %o1, %o4	! 1 -	mulscc	%o4, %o1, %o4	! 2 -	mulscc	%o4, %o1, %o4	! 3 -	mulscc	%o4, %o1, %o4	! 4 -	mulscc	%o4, %o1, %o4	! 5 -	mulscc	%o4, %o1, %o4	! 6 -	mulscc	%o4, %o1, %o4	! 7 -	mulscc	%o4, %o1, %o4	! 8 -	mulscc	%o4, %o1, %o4	! 9 -	mulscc	%o4, %o1, %o4	! 10 -	mulscc	%o4, %o1, %o4	! 11 -	mulscc	%o4, %o1, %o4	! 12 -	mulscc	%o4, %o1, %o4	! 13 -	mulscc	%o4, %o1, %o4	! 14 -	mulscc	%o4, %o1, %o4	! 15 -	mulscc	%o4, %o1, %o4	! 16 -	mulscc	%o4, %o1, %o4	! 17 -	mulscc	%o4, %o1, %o4	! 18 -	mulscc	%o4, %o1, %o4	! 19 -	mulscc	%o4, %o1, %o4	! 
20 -	mulscc	%o4, %o1, %o4	! 21 -	mulscc	%o4, %o1, %o4	! 22 -	mulscc	%o4, %o1, %o4	! 23 -	mulscc	%o4, %o1, %o4	! 24 -	mulscc	%o4, %o1, %o4	! 25 -	mulscc	%o4, %o1, %o4	! 26 -	mulscc	%o4, %o1, %o4	! 27 -	mulscc	%o4, %o1, %o4	! 28 -	mulscc	%o4, %o1, %o4	! 29 -	mulscc	%o4, %o1, %o4	! 30 -	mulscc	%o4, %o1, %o4	! 31 -	mulscc	%o4, %o1, %o4	! 32 -	mulscc	%o4, %g0, %o4	! final shift - -	/* -	 * Normally, with the shift-and-add approach, if both numbers are -	 * positive you get the correct result.  With 32-bit two's-complement -	 * numbers, -x is represented as -	 * -	 *		  x		    32 -	 *	( 2  -  ------ ) mod 2  *  2 -	 *		   32 -	 *		  2 -	 * -	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s, -	 * we can treat this as if the radix point were just to the left -	 * of the sign bit (multiply by 2^32), and get -	 * -	 *	-x  =  (2 - x) mod 2 -	 * -	 * Then, ignoring the `mod 2's for convenience: -	 * -	 *   x *  y	= xy -	 *  -x *  y	= 2y - xy -	 *   x * -y	= 2x - xy -	 *  -x * -y	= 4 - 2x - 2y + xy -	 * -	 * For signed multiplies, we subtract (x << 32) from the partial -	 * product to fix this problem for negative multipliers (see mul.s). -	 * Because of the way the shift into the partial product is calculated -	 * (N xor V), this term is automatically removed for the multiplicand, -	 * so we don't have to adjust. -	 * -	 * But for unsigned multiplies, the high order bit wasn't a sign bit, -	 * and the correction is wrong.  So for unsigned multiplies where the -	 * high order bit is one, we end up with xy - (y << 32).  To fix it -	 * we add y << 32. -	 */ -#if 0 -	tst	%o1 -	bl,a	1f		! if %o1 < 0 (high order bit = 1), -	 add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half) -1:	rd	%y, %o0		! get lower half of product -	retl -	 addcc	%o4, %g0, %o1	! 
put upper half in place and set Z for %o1==0 +#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__) +# include "sparcv9/umul.S" +#elif defined(__CONFIG_SPARC_V8__) +# include "sparcv8/umul.S"  #else -	/* Faster code from tege@sics.se.  */ -	sra	%o1, 31, %o2	! make mask from sign bit -	and	%o0, %o2, %o2	! %o2 = 0 or %o0, depending on sign of %o1 -	rd	%y, %o0		! get lower half of product -	retl -	 addcc	%o4, %o2, %o1	! add compensation and put upper half in place +# include "sparcv7/umul.S"  #endif - -.Lmul_shortway: -	/* -	 * Short multiply.  12 steps, followed by a final shift step. -	 * The resulting bits are off by 12 and (32-12) = 20 bit positions, -	 * but there is no problem with %o0 being negative (unlike above), -	 * and overflow is impossible (the answer is at most 24 bits long). -	 */ -	mulscc	%o4, %o1, %o4	! 1 -	mulscc	%o4, %o1, %o4	! 2 -	mulscc	%o4, %o1, %o4	! 3 -	mulscc	%o4, %o1, %o4	! 4 -	mulscc	%o4, %o1, %o4	! 5 -	mulscc	%o4, %o1, %o4	! 6 -	mulscc	%o4, %o1, %o4	! 7 -	mulscc	%o4, %o1, %o4	! 8 -	mulscc	%o4, %o1, %o4	! 9 -	mulscc	%o4, %o1, %o4	! 10 -	mulscc	%o4, %o1, %o4	! 11 -	mulscc	%o4, %o1, %o4	! 12 -	mulscc	%o4, %g0, %o4	! final shift - -	/* -	 * %o4 has 20 of the bits that should be in the result; %y has -	 * the bottom 12 (as %y's top 12).  That is: -	 * -	 *	  %o4		    %y -	 * +----------------+----------------+ -	 * | -12- |   -20-  | -12- |   -20-  | -	 * +------(---------+------)---------+ -	 *	   -----result----- -	 * -	 * The 12 bits of %o4 left of the `result' area are all zero; -	 * in fact, all top 20 bits of %o4 are zero. -	 */ - -	rd	%y, %o5 -	sll	%o4, 12, %o0	! shift middle bits left 12 -	srl	%o5, 20, %o5	! shift low bits right 20 -	or	%o5, %o0, %o0 -	retl -	 addcc	%g0, %g0, %o1	! %o1 = zero, and set Z - -.size  .umul , . 
-.umul diff --git a/libc/sysdeps/linux/sparc/urem.S b/libc/sysdeps/linux/sparc/urem.S index 890532141..baf6c4335 100644 --- a/libc/sysdeps/linux/sparc/urem.S +++ b/libc/sysdeps/linux/sparc/urem.S @@ -1,350 +1,9 @@ -   /* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - *  .urem	name of function to generate - *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1 - *  false		false=true => signed; false=false => unsigned - * - * Algorithm parameters: - *  N		how many bits per iteration we try to get (4) - *  WORDSIZE	total number of bits (32) - * - * Derived constants: - *  TOPBITS	number of bits in the top decade of a number - * - * Important variables: - *  Q		the partial quotient under development (initially 0) - *  R		the remainder so far, initially the dividend - *  ITER	number of main division loop iterations required; - *		equal to ceil(log2(quotient) / N).  Note that this - *		is the log base (2^N) of the quotient. - *  V		the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - *  Current estimate for non-large dividend is - *	ceil(log2(quotient) / N) * (10 + 7N/2) + C - *  A large dividend is one greater than 2^(31-TOPBITS) and takes a - *  different path, as the upper bits of the quotient must be developed - *  one bit at a time. - */ - - - -#include <sys/syscall.h> - - -.global   .urem; -.align 4; -.type  .urem ,@function;  - -.urem:  - -	! Ready to divide.  Compute size of quotient; scale comparand. -	orcc	%o1, %g0, %o5 -	bne	1f -	mov	%o0, %o3 - -		! Divide by zero trap.  If it returns, return 0 (about as -		! wrong as possible, but that is what SunOS does...). -		ta	0x02  -		retl -		clr	%o0 - -1: -	cmp	%o3, %o5			! if %o1 exceeds %o0, done -	blu	.Lgot_result		! 
(and algorithm fails otherwise) -	clr	%o2 -	sethi	%hi(1 << (32 - 4 - 1)), %g1 -	cmp	%o3, %g1 -	blu	.Lnot_really_big -	clr	%o4 - -	! Here the dividend is >= 2**(31-N) or so.  We must be careful here, -	! as our usual N-at-a-shot divide step will cause overflow and havoc. -	! The number of bits in the result here is N*ITER+SC, where SC <= N. -	! Compute ITER in an unorthodox manner: know we need to shift V into -	! the top decade: so do not even bother to compare to R. -	1: -		cmp	%o5, %g1 -		bgeu	3f -		mov	1, %g2 -		sll	%o5, 4, %o5 -		b	1b -		add	%o4, 1, %o4 - -	! Now compute %g2. -	2:	addcc	%o5, %o5, %o5 -		bcc	.Lnot_too_big -		add	%g2, 1, %g2 - -		! We get here if the %o1 overflowed while shifting. -		! This means that %o3 has the high-order bit set. -		! Restore %o5 and subtract from %o3. -		sll	%g1, 4, %g1	! high order bit -		srl	%o5, 1, %o5		! rest of %o5 -		add	%o5, %g1, %o5 -		b	.Ldo_single_div -		sub	%g2, 1, %g2 - -	.Lnot_too_big: -	3:	cmp	%o5, %o3 -		blu	2b -		nop -		be	.Ldo_single_div -		nop -	/* NB: these are commented out in the V8-Sparc manual as well */ -	/* (I do not understand this) */ -	! %o5 > %o3: went too far: back up 1 step -	!	srl	%o5, 1, %o5 -	!	dec	%g2 -	! do single-bit divide steps -	! -	! We have to be careful here.  We know that %o3 >= %o5, so we can do the -	! first divide step without thinking.  BUT, the others are conditional, -	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high- -	! order bit set in the first step, just falling into the regular -	! division loop will mess up the first time around. -	! So we unroll slightly... -	.Ldo_single_div: -		subcc	%g2, 1, %g2 -		bl	.Lend_regular_divide -		nop -		sub	%o3, %o5, %o3 -		mov	1, %o2 -		b	.Lend_single_divloop -		nop -	.Lsingle_divloop: -		sll	%o2, 1, %o2 -		bl	1f -		srl	%o5, 1, %o5 -		! %o3 >= 0 -		sub	%o3, %o5, %o3 -		b	2f -		add	%o2, 1, %o2 -	1:	! 
%o3 < 0 -		add	%o3, %o5, %o3 -		sub	%o2, 1, %o2 -	2: -	.Lend_single_divloop: -		subcc	%g2, 1, %g2 -		bge	.Lsingle_divloop -		tst	%o3 -		b,a	.Lend_regular_divide - -.Lnot_really_big: -1: -	sll	%o5, 4, %o5 -	cmp	%o5, %o3 -	bleu	1b -	addcc	%o4, 1, %o4 -	be	.Lgot_result -	sub	%o4, 1, %o4 - -	tst	%o3	! set up for initial iteration -.Ldivloop: -	sll	%o2, 4, %o2 -		! depth 1, accumulated bits 0 -	bl	.L1.16 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 2, accumulated bits 1 -	bl	.L2.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 3 -	bl	.L3.19 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 7 -	bl	.L4.23 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (7*2+1), %o2 -	 -.L4.23: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (7*2-1), %o2 -	 -	 -.L3.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 5 -	bl	.L4.21 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (5*2+1), %o2 -	 -.L4.21: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (5*2-1), %o2 -	 -	 -	 -.L2.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits 1 -	bl	.L3.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 3 -	bl	.L4.19 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (3*2+1), %o2 -	 -.L4.19: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (3*2-1), %o2 -	 -	 -.L3.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits 1 -	bl	.L4.17 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (1*2+1), %o2 -	 -.L4.17: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (1*2-1), %o2 -	 -	 -	 -	 -.L1.16: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! 
depth 2, accumulated bits -1 -	bl	.L2.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -1 -	bl	.L3.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -1 -	bl	.L4.15 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-1*2+1), %o2 -	 -.L4.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-1*2-1), %o2 -	 -	 -.L3.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -3 -	bl	.L4.13 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-3*2+1), %o2 -	 -.L4.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-3*2-1), %o2 -	 -	 -	 -.L2.15: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 3, accumulated bits -3 -	bl	.L3.13 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -5 -	bl	.L4.11 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-5*2+1), %o2 -	 -.L4.11: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-5*2-1), %o2 -	 -	 -.L3.13: -	! remainder is negative -	addcc	%o3,%o5,%o3 -			! depth 4, accumulated bits -7 -	bl	.L4.9 -	srl	%o5,1,%o5 -	! remainder is positive -	subcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-7*2+1), %o2 -	 -.L4.9: -	! remainder is negative -	addcc	%o3,%o5,%o3 -		b	9f -		add	%o2, (-7*2-1), %o2 -	 -	 -	 -	 -	9: -.Lend_regular_divide: -	subcc	%o4, 1, %o4 -	bge	.Ldivloop -	tst	%o3 -	bl,a	.Lgot_result -	! non-restoring fixup here (one instruction only!) -	add	%o3, %o1, %o3 - - -.Lgot_result: - -	retl -	mov %o3, %o0 - -.size  .urem , . -.urem +#include "_math_inc.h" + +#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__) +# include "sparcv9/urem.S" +#elif defined(__CONFIG_SPARC_V8__) +# include "sparcv8/urem.S" +#else +# include "sparcv7/urem.S" +#endif | 
