diff options
author | Mike Frysinger <vapier@gentoo.org> | 2005-08-25 23:50:55 +0000 |
---|---|---|
committer | Mike Frysinger <vapier@gentoo.org> | 2005-08-25 23:50:55 +0000 |
commit | 46b06e9c1c7d57ae0008b3a282110d8ce4345b5f (patch) | |
tree | 8bb70d81ee67ecd175ad14778b2513e4496e3b9e | |
parent | 4147ffbcc6e2c20ab9bc1b43330244abe5a10b63 (diff) |
import different optimized versions of div funcs based upon target sparc arch
20 files changed, 1763 insertions, 1588 deletions
diff --git a/libc/sysdeps/linux/sparc/rem.S b/libc/sysdeps/linux/sparc/rem.S index 3da857e99..595e8ea5e 100644 --- a/libc/sysdeps/linux/sparc/rem.S +++ b/libc/sysdeps/linux/sparc/rem.S @@ -1,367 +1,9 @@ - /* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .rem name of function to generate - * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1 - * true true=true => signed; true=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - -#include <sys/syscall.h> - - -.global .rem; -.align 4; -.type .rem ,@function; - -.rem: - ! compute sign of result; if neither is negative, no problem - orcc %o1, %o0, %g0 ! either negative? - bge 2f ! no, go do the divide - mov %o0, %g3 ! sign of remainder matches %o0 - tst %o1 - bge 1f - tst %o0 - ! %o1 is definitely negative; %o0 might also be negative - bge 2f ! if %o0 not negative... - sub %g0, %o1, %o1 ! in any case, make %o1 nonneg -1: ! %o0 is negative, %o1 is nonnegative - sub %g0, %o0, %o0 ! make %o0 nonnegative -2: - - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta 0x02 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu .Lgot_result ! (and algorithm fails otherwise) - clr %o2 - sethi %hi(1 << (32 - 4 - 1)), %g1 - cmp %o3, %g1 - blu .Lnot_really_big - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g2 - sll %o5, 4, %o5 - b 1b - add %o4, 1, %o4 - - ! Now compute %g2. - 2: addcc %o5, %o5, %o5 - bcc .Lnot_too_big - add %g2, 1, %g2 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - b .Ldo_single_div - sub %g2, 1, %g2 - - .Lnot_too_big: - 3: cmp %o5, %o3 - blu 2b - nop - be .Ldo_single_div - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g2 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - .Ldo_single_div: - subcc %g2, 1, %g2 - bl .Lend_regular_divide - nop - sub %o3, %o5, %o3 - mov 1, %o2 - b .Lend_single_divloop - nop - .Lsingle_divloop: - sll %o2, 1, %o2 - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - .Lend_single_divloop: - subcc %g2, 1, %g2 - bge .Lsingle_divloop - tst %o3 - b,a .Lend_regular_divide - -.Lnot_really_big: -1: - sll %o5, 4, %o5 - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - be .Lgot_result - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -.Ldivloop: - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl .L1.16 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl .L2.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl .L3.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl .L4.23 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -.L4.23: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - -.L3.19: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl .L4.21 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -.L4.21: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - -.L2.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl .L3.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl .L4.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -.L4.19: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - -.L3.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl .L4.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -.L4.17: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - -.L1.16: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl .L2.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl .L3.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl .L4.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -.L4.15: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - -.L3.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl .L4.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -.L4.13: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - -.L2.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl .L3.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl .L4.11 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -.L4.11: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - -.L3.13: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl .L4.9 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -.L4.9: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - - 9: -.Lend_regular_divide: - subcc %o4, 1, %o4 - bge .Ldivloop - tst %o3 - bl,a .Lgot_result - ! non-restoring fixup here (one instruction only!) - add %o3, %o1, %o3 - - -.Lgot_result: - ! check to see if answer should be < 0 - tst %g3 - bl,a 1f - sub %g0, %o3, %o3 -1: - retl - mov %o3, %o0 - -.size .rem,.-.rem; +#include "_math_inc.h" + +#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__) +# include "sparcv9/rem.S" +#elif defined(__CONFIG_SPARC_V8__) +# include "sparcv8/rem.S" +#else +# include "sparcv7/rem.S" +#endif diff --git a/libc/sysdeps/linux/sparc/sdiv.S b/libc/sysdeps/linux/sparc/sdiv.S index 8fdb7daf8..7e9a9c923 100644 --- a/libc/sysdeps/linux/sparc/sdiv.S +++ b/libc/sysdeps/linux/sparc/sdiv.S @@ -1,366 +1,9 @@ - /* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .div name of function to generate - * div div=div => %o0 / %o1; div=rem => %o0 % %o1 - * true true=true => signed; true=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - -#include <sys/syscall.h> - -.global .div; -.align 4; -.type .div ,@function; - -.div: - ! compute sign of result; if neither is negative, no problem - orcc %o1, %o0, %g0 ! either negative? - bge 2f ! no, go do the divide - xor %o1, %o0, %g3 ! compute sign in any case - tst %o1 - bge 1f - tst %o0 - ! %o1 is definitely negative; %o0 might also be negative - bge 2f ! if %o0 not negative... - sub %g0, %o1, %o1 ! in any case, make %o1 nonneg -1: ! %o0 is negative, %o1 is nonnegative - sub %g0, %o0, %o0 ! make %o0 nonnegative -2: - - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta 0x02 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu .Lgot_result ! (and algorithm fails otherwise) - clr %o2 - sethi %hi(1 << (32 - 4 - 1)), %g1 - cmp %o3, %g1 - blu .Lnot_really_big - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g2 - sll %o5, 4, %o5 - b 1b - add %o4, 1, %o4 - - ! Now compute %g2. - 2: addcc %o5, %o5, %o5 - bcc .Lnot_too_big - add %g2, 1, %g2 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - b .Ldo_single_div - sub %g2, 1, %g2 - - .Lnot_too_big: - 3: cmp %o5, %o3 - blu 2b - nop - be .Ldo_single_div - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g2 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - .Ldo_single_div: - subcc %g2, 1, %g2 - bl .Lend_regular_divide - nop - sub %o3, %o5, %o3 - mov 1, %o2 - b .Lend_single_divloop - nop - .Lsingle_divloop: - sll %o2, 1, %o2 - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - .Lend_single_divloop: - subcc %g2, 1, %g2 - bge .Lsingle_divloop - tst %o3 - b,a .Lend_regular_divide - -.Lnot_really_big: -1: - sll %o5, 4, %o5 - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - be .Lgot_result - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -.Ldivloop: - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl .L1.16 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl .L2.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl .L3.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl .L4.23 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -.L4.23: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - -.L3.19: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl .L4.21 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -.L4.21: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - -.L2.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl .L3.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl .L4.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -.L4.19: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - -.L3.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl .L4.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -.L4.17: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - -.L1.16: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl .L2.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl .L3.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl .L4.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -.L4.15: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - -.L3.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl .L4.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -.L4.13: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - -.L2.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl .L3.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl .L4.11 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -.L4.11: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - -.L3.13: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl .L4.9 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -.L4.9: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - - 9: -.Lend_regular_divide: - subcc %o4, 1, %o4 - bge .Ldivloop - tst %o3 - bl,a .Lgot_result - ! non-restoring fixup here (one instruction only!) - sub %o2, 1, %o2 - - -.Lgot_result: - ! check to see if answer should be < 0 - tst %g3 - bl,a 1f - sub %g0, %o2, %o2 -1: - retl - mov %o2, %o0 - -.size .div,.-.div; +#include "_math_inc.h" + +#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__) +# include "sparcv9/sdiv.S" +#elif defined(__CONFIG_SPARC_V8__) +# include "sparcv8/sdiv.S" +#else +# include "sparcv7/sdiv.S" +#endif diff --git a/libc/sysdeps/linux/sparc/sparcv7/rem.S b/libc/sysdeps/linux/sparc/sparcv7/rem.S new file mode 100644 index 000000000..ce06e6361 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv7/rem.S @@ -0,0 +1,360 @@ + /* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + * .rem name of function to generate + * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1 + * true true=true => signed; true=false => unsigned + * + * Algorithm parameters: + * N how many bits per iteration we try to get (4) + * WORDSIZE total number of bits (32) + * + * Derived constants: + * TOPBITS number of bits in the top decade of a number + * + * Important variables: + * Q the partial quotient under development (initially 0) + * R the remainder so far, initially the dividend + * ITER number of main division loop iterations required; + * equal to ceil(log2(quotient) / N). Note that this + * is the log base (2^N) of the quotient. + * V the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + * Current estimate for non-large dividend is + * ceil(log2(quotient) / N) * (10 + 7N/2) + C + * A large dividend is one greater than 2^(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + */ + + + +ENTRY(.rem) + ! compute sign of result; if neither is negative, no problem + orcc %o1, %o0, %g0 ! either negative? + bge 2f ! no, go do the divide + mov %o0, %g3 ! sign of remainder matches %o0 + tst %o1 + bge 1f + tst %o0 + ! %o1 is definitely negative; %o0 might also be negative + bge 2f ! if %o0 not negative... + sub %g0, %o1, %o1 ! in any case, make %o1 nonneg +1: ! %o0 is negative, %o1 is nonnegative + sub %g0, %o0, %o0 ! make %o0 nonnegative +2: + + ! Ready to divide. Compute size of quotient; scale comparand. + orcc %o1, %g0, %o5 + bne 1f + mov %o0, %o3 + + ! Divide by zero trap. If it returns, return 0 (about as + ! wrong as possible, but that is what SunOS does...). + ta ST_DIV0 + retl + clr %o0 + +1: + cmp %o3, %o5 ! if %o1 exceeds %o0, done + blu LOC(got_result) ! (and algorithm fails otherwise) + clr %o2 + sethi %hi(1 << (32 - 4 - 1)), %g1 + cmp %o3, %g1 + blu LOC(not_really_big) + clr %o4 + + ! Here the dividend is >= 2**(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The number of bits in the result here is N*ITER+SC, where SC <= N. + ! Compute ITER in an unorthodox manner: know we need to shift V into + ! the top decade: so do not even bother to compare to R. + 1: + cmp %o5, %g1 + bgeu 3f + mov 1, %g2 + sll %o5, 4, %o5 + b 1b + add %o4, 1, %o4 + + ! Now compute %g2. + 2: addcc %o5, %o5, %o5 + bcc LOC(not_too_big) + add %g2, 1, %g2 + + ! We get here if the %o1 overflowed while shifting. + ! This means that %o3 has the high-order bit set. + ! Restore %o5 and subtract from %o3. + sll %g1, 4, %g1 ! high order bit + srl %o5, 1, %o5 ! rest of %o5 + add %o5, %g1, %o5 + b LOC(do_single_div) + sub %g2, 1, %g2 + + LOC(not_too_big): + 3: cmp %o5, %o3 + blu 2b + nop + be LOC(do_single_div) + nop + /* NB: these are commented out in the V8-Sparc manual as well */ + /* (I do not understand this) */ + ! %o5 > %o3: went too far: back up 1 step + ! srl %o5, 1, %o5 + ! dec %g2 + ! do single-bit divide steps + ! + ! We have to be careful here. We know that %o3 >= %o5, so we can do the + ! first divide step without thinking. BUT, the others are conditional, + ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- + ! order bit set in the first step, just falling into the regular + ! division loop will mess up the first time around. + ! So we unroll slightly... + LOC(do_single_div): + subcc %g2, 1, %g2 + bl LOC(end_regular_divide) + nop + sub %o3, %o5, %o3 + mov 1, %o2 + b LOC(end_single_divloop) + nop + LOC(single_divloop): + sll %o2, 1, %o2 + bl 1f + srl %o5, 1, %o5 + ! %o3 >= 0 + sub %o3, %o5, %o3 + b 2f + add %o2, 1, %o2 + 1: ! %o3 < 0 + add %o3, %o5, %o3 + sub %o2, 1, %o2 + 2: + LOC(end_single_divloop): + subcc %g2, 1, %g2 + bge LOC(single_divloop) + tst %o3 + b,a LOC(end_regular_divide) + +LOC(not_really_big): +1: + sll %o5, 4, %o5 + cmp %o5, %o3 + bleu 1b + addcc %o4, 1, %o4 + be LOC(got_result) + sub %o4, 1, %o4 + + tst %o3 ! set up for initial iteration +LOC(divloop): + sll %o2, 4, %o2 + ! depth 1, accumulated bits 0 + bl LOC(1.16) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 2, accumulated bits 1 + bl LOC(2.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits 3 + bl LOC(3.19) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 7 + bl LOC(4.23) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (7*2+1), %o2 + +LOC(4.23): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (7*2-1), %o2 + + +LOC(3.19): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 5 + bl LOC(4.21) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (5*2+1), %o2 + +LOC(4.21): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (5*2-1), %o2 + + + +LOC(2.17): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits 1 + bl LOC(3.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 3 + bl LOC(4.19) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (3*2+1), %o2 + +LOC(4.19): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (3*2-1), %o2 + + +LOC(3.17): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 1 + bl LOC(4.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (1*2+1), %o2 + +LOC(4.17): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (1*2-1), %o2 + + + + +LOC(1.16): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 2, accumulated bits -1 + bl LOC(2.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits -1 + bl LOC(3.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -1 + bl LOC(4.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2+1), %o2 + +LOC(4.15): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2-1), %o2 + + +LOC(3.15): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -3 + bl LOC(4.13) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2+1), %o2 + +LOC(4.13): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2-1), %o2 + + + +LOC(2.15): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits -3 + bl LOC(3.13) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -5 + bl LOC(4.11) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2+1), %o2 + +LOC(4.11): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2-1), %o2 + + +LOC(3.13): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -7 + bl LOC(4.9) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2+1), %o2 + +LOC(4.9): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2-1), %o2 + + + + + 9: +LOC(end_regular_divide): + subcc %o4, 1, %o4 + bge LOC(divloop) + tst %o3 + bl,a LOC(got_result) + ! non-restoring fixup here (one instruction only!) + add %o3, %o1, %o3 + + +LOC(got_result): + ! check to see if answer should be < 0 + tst %g3 + bl,a 1f + sub %g0, %o3, %o3 +1: + retl + mov %o3, %o0 + +END(.rem) diff --git a/libc/sysdeps/linux/sparc/sparcv7/sdiv.S b/libc/sysdeps/linux/sparc/sparcv7/sdiv.S new file mode 100644 index 000000000..d6f2bc7c7 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv7/sdiv.S @@ -0,0 +1,360 @@ + /* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + * .div name of function to generate + * div div=div => %o0 / %o1; div=rem => %o0 % %o1 + * true true=true => signed; true=false => unsigned + * + * Algorithm parameters: + * N how many bits per iteration we try to get (4) + * WORDSIZE total number of bits (32) + * + * Derived constants: + * TOPBITS number of bits in the top decade of a number + * + * Important variables: + * Q the partial quotient under development (initially 0) + * R the remainder so far, initially the dividend + * ITER number of main division loop iterations required; + * equal to ceil(log2(quotient) / N). Note that this + * is the log base (2^N) of the quotient. + * V the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + * Current estimate for non-large dividend is + * ceil(log2(quotient) / N) * (10 + 7N/2) + C + * A large dividend is one greater than 2^(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + */ + + + +ENTRY(.div) + ! compute sign of result; if neither is negative, no problem + orcc %o1, %o0, %g0 ! either negative? + bge 2f ! no, go do the divide + xor %o1, %o0, %g3 ! compute sign in any case + tst %o1 + bge 1f + tst %o0 + ! %o1 is definitely negative; %o0 might also be negative + bge 2f ! if %o0 not negative... + sub %g0, %o1, %o1 ! in any case, make %o1 nonneg +1: ! %o0 is negative, %o1 is nonnegative + sub %g0, %o0, %o0 ! make %o0 nonnegative +2: + + ! Ready to divide. Compute size of quotient; scale comparand. + orcc %o1, %g0, %o5 + bne 1f + mov %o0, %o3 + + ! Divide by zero trap. If it returns, return 0 (about as + ! wrong as possible, but that is what SunOS does...). + ta ST_DIV0 + retl + clr %o0 + +1: + cmp %o3, %o5 ! if %o1 exceeds %o0, done + blu LOC(got_result) ! (and algorithm fails otherwise) + clr %o2 + sethi %hi(1 << (32 - 4 - 1)), %g1 + cmp %o3, %g1 + blu LOC(not_really_big) + clr %o4 + + ! Here the dividend is >= 2**(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The number of bits in the result here is N*ITER+SC, where SC <= N. + ! Compute ITER in an unorthodox manner: know we need to shift V into + ! the top decade: so do not even bother to compare to R. + 1: + cmp %o5, %g1 + bgeu 3f + mov 1, %g2 + sll %o5, 4, %o5 + b 1b + add %o4, 1, %o4 + + ! Now compute %g2. + 2: addcc %o5, %o5, %o5 + bcc LOC(not_too_big) + add %g2, 1, %g2 + + ! We get here if the %o1 overflowed while shifting. + ! This means that %o3 has the high-order bit set. + ! Restore %o5 and subtract from %o3. + sll %g1, 4, %g1 ! high order bit + srl %o5, 1, %o5 ! rest of %o5 + add %o5, %g1, %o5 + b LOC(do_single_div) + sub %g2, 1, %g2 + + LOC(not_too_big): + 3: cmp %o5, %o3 + blu 2b + nop + be LOC(do_single_div) + nop + /* NB: these are commented out in the V8-Sparc manual as well */ + /* (I do not understand this) */ + ! %o5 > %o3: went too far: back up 1 step + ! srl %o5, 1, %o5 + ! dec %g2 + ! do single-bit divide steps + ! + ! We have to be careful here. We know that %o3 >= %o5, so we can do the + ! first divide step without thinking. BUT, the others are conditional, + ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- + ! order bit set in the first step, just falling into the regular + ! division loop will mess up the first time around. + ! So we unroll slightly... + LOC(do_single_div): + subcc %g2, 1, %g2 + bl LOC(end_regular_divide) + nop + sub %o3, %o5, %o3 + mov 1, %o2 + b LOC(end_single_divloop) + nop + LOC(single_divloop): + sll %o2, 1, %o2 + bl 1f + srl %o5, 1, %o5 + ! %o3 >= 0 + sub %o3, %o5, %o3 + b 2f + add %o2, 1, %o2 + 1: ! %o3 < 0 + add %o3, %o5, %o3 + sub %o2, 1, %o2 + 2: + LOC(end_single_divloop): + subcc %g2, 1, %g2 + bge LOC(single_divloop) + tst %o3 + b,a LOC(end_regular_divide) + +LOC(not_really_big): +1: + sll %o5, 4, %o5 + cmp %o5, %o3 + bleu 1b + addcc %o4, 1, %o4 + be LOC(got_result) + sub %o4, 1, %o4 + + tst %o3 ! set up for initial iteration +LOC(divloop): + sll %o2, 4, %o2 + ! depth 1, accumulated bits 0 + bl LOC(1.16) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 2, accumulated bits 1 + bl LOC(2.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits 3 + bl LOC(3.19) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 7 + bl LOC(4.23) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (7*2+1), %o2 + +LOC(4.23): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (7*2-1), %o2 + + +LOC(3.19): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 5 + bl LOC(4.21) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (5*2+1), %o2 + +LOC(4.21): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (5*2-1), %o2 + + + +LOC(2.17): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits 1 + bl LOC(3.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 3 + bl LOC(4.19) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (3*2+1), %o2 + +LOC(4.19): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (3*2-1), %o2 + + +LOC(3.17): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 1 + bl LOC(4.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (1*2+1), %o2 + +LOC(4.17): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (1*2-1), %o2 + + + + +LOC(1.16): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 2, accumulated bits -1 + bl LOC(2.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits -1 + bl LOC(3.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -1 + bl LOC(4.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2+1), %o2 + +LOC(4.15): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2-1), %o2 + + +LOC(3.15): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -3 + bl LOC(4.13) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2+1), %o2 + +LOC(4.13): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2-1), %o2 + + + +LOC(2.15): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits -3 + bl LOC(3.13) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -5 + bl LOC(4.11) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2+1), %o2 + +LOC(4.11): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2-1), %o2 + + +LOC(3.13): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -7 + bl LOC(4.9) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2+1), %o2 + +LOC(4.9): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2-1), %o2 + + + + + 9: +LOC(end_regular_divide): + subcc %o4, 1, %o4 + bge LOC(divloop) + tst %o3 + bl,a LOC(got_result) + ! non-restoring fixup here (one instruction only!) + sub %o2, 1, %o2 + + +LOC(got_result): + ! check to see if answer should be < 0 + tst %g3 + bl,a 1f + sub %g0, %o2, %o2 +1: + retl + mov %o2, %o0 + +END(.div) diff --git a/libc/sysdeps/linux/sparc/sparcv7/udiv.S b/libc/sysdeps/linux/sparc/sparcv7/udiv.S new file mode 100644 index 000000000..56c64ad01 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv7/udiv.S @@ -0,0 +1,343 @@ + /* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + * .udiv name of function to generate + * div div=div => %o0 / %o1; div=rem => %o0 % %o1 + * false false=true => signed; false=false => unsigned + * + * Algorithm parameters: + * N how many bits per iteration we try to get (4) + * WORDSIZE total number of bits (32) + * + * Derived constants: + * TOPBITS number of bits in the top decade of a number + * + * Important variables: + * Q the partial quotient under development (initially 0) + * R the remainder so far, initially the dividend + * ITER number of main division loop iterations required; + * equal to ceil(log2(quotient) / N). Note that this + * is the log base (2^N) of the quotient. + * V the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + * Current estimate for non-large dividend is + * ceil(log2(quotient) / N) * (10 + 7N/2) + C + * A large dividend is one greater than 2^(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + */ + + + +ENTRY(.udiv) + + ! Ready to divide. Compute size of quotient; scale comparand. + orcc %o1, %g0, %o5 + bne 1f + mov %o0, %o3 + + ! Divide by zero trap. If it returns, return 0 (about as + ! wrong as possible, but that is what SunOS does...). + ta ST_DIV0 + retl + clr %o0 + +1: + cmp %o3, %o5 ! if %o1 exceeds %o0, done + blu LOC(got_result) ! (and algorithm fails otherwise) + clr %o2 + sethi %hi(1 << (32 - 4 - 1)), %g1 + cmp %o3, %g1 + blu LOC(not_really_big) + clr %o4 + + ! Here the dividend is >= 2**(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The number of bits in the result here is N*ITER+SC, where SC <= N. + ! Compute ITER in an unorthodox manner: know we need to shift V into + ! the top decade: so do not even bother to compare to R. + 1: + cmp %o5, %g1 + bgeu 3f + mov 1, %g2 + sll %o5, 4, %o5 + b 1b + add %o4, 1, %o4 + + ! Now compute %g2. + 2: addcc %o5, %o5, %o5 + bcc LOC(not_too_big) + add %g2, 1, %g2 + + ! We get here if the %o1 overflowed while shifting. + ! This means that %o3 has the high-order bit set. + ! Restore %o5 and subtract from %o3. + sll %g1, 4, %g1 ! high order bit + srl %o5, 1, %o5 ! rest of %o5 + add %o5, %g1, %o5 + b LOC(do_single_div) + sub %g2, 1, %g2 + + LOC(not_too_big): + 3: cmp %o5, %o3 + blu 2b + nop + be LOC(do_single_div) + nop + /* NB: these are commented out in the V8-Sparc manual as well */ + /* (I do not understand this) */ + ! %o5 > %o3: went too far: back up 1 step + ! srl %o5, 1, %o5 + ! dec %g2 + ! do single-bit divide steps + ! + ! We have to be careful here. We know that %o3 >= %o5, so we can do the + ! first divide step without thinking. BUT, the others are conditional, + ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- + ! order bit set in the first step, just falling into the regular + ! division loop will mess up the first time around. + ! So we unroll slightly... + LOC(do_single_div): + subcc %g2, 1, %g2 + bl LOC(end_regular_divide) + nop + sub %o3, %o5, %o3 + mov 1, %o2 + b LOC(end_single_divloop) + nop + LOC(single_divloop): + sll %o2, 1, %o2 + bl 1f + srl %o5, 1, %o5 + ! %o3 >= 0 + sub %o3, %o5, %o3 + b 2f + add %o2, 1, %o2 + 1: ! %o3 < 0 + add %o3, %o5, %o3 + sub %o2, 1, %o2 + 2: + LOC(end_single_divloop): + subcc %g2, 1, %g2 + bge LOC(single_divloop) + tst %o3 + b,a LOC(end_regular_divide) + +LOC(not_really_big): +1: + sll %o5, 4, %o5 + cmp %o5, %o3 + bleu 1b + addcc %o4, 1, %o4 + be LOC(got_result) + sub %o4, 1, %o4 + + tst %o3 ! set up for initial iteration +LOC(divloop): + sll %o2, 4, %o2 + ! depth 1, accumulated bits 0 + bl LOC(1.16) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 2, accumulated bits 1 + bl LOC(2.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits 3 + bl LOC(3.19) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 7 + bl LOC(4.23) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (7*2+1), %o2 + +LOC(4.23): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (7*2-1), %o2 + + +LOC(3.19): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 5 + bl LOC(4.21) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (5*2+1), %o2 + +LOC(4.21): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (5*2-1), %o2 + + + +LOC(2.17): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits 1 + bl LOC(3.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 3 + bl LOC(4.19) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (3*2+1), %o2 + +LOC(4.19): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (3*2-1), %o2 + + +LOC(3.17): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 1 + bl LOC(4.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (1*2+1), %o2 + +LOC(4.17): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (1*2-1), %o2 + + + + +LOC(1.16): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 2, accumulated bits -1 + bl LOC(2.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits -1 + bl LOC(3.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -1 + bl LOC(4.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2+1), %o2 + +LOC(4.15): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2-1), %o2 + + +LOC(3.15): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -3 + bl LOC(4.13) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2+1), %o2 + +LOC(4.13): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2-1), %o2 + + + +LOC(2.15): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits -3 + bl LOC(3.13) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -5 + bl LOC(4.11) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2+1), %o2 + +LOC(4.11): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2-1), %o2 + + +LOC(3.13): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -7 + bl LOC(4.9) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2+1), %o2 + +LOC(4.9): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2-1), %o2 + + + + + 9: +LOC(end_regular_divide): + subcc %o4, 1, %o4 + bge LOC(divloop) + tst %o3 + bl,a LOC(got_result) + ! non-restoring fixup here (one instruction only!) + sub %o2, 1, %o2 + + +LOC(got_result): + + retl + mov %o2, %o0 + +END(.udiv) diff --git a/libc/sysdeps/linux/sparc/sparcv7/umul.S b/libc/sysdeps/linux/sparc/sparcv7/umul.S new file mode 100644 index 000000000..50b3157db --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv7/umul.S @@ -0,0 +1,153 @@ +/* + * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the + * upper 32 bits of the 64-bit product). + * + * This code optimizes short (less than 13-bit) multiplies. Short + * multiplies require 25 instruction cycles, and long ones require + * 45 instruction cycles. + * + * On return, overflow has occurred (%o1 is not zero) if and only if + * the Z condition code is clear, allowing, e.g., the following: + * + * call .umul + * nop + * bnz overflow (or tnz) + */ + +ENTRY(.umul) + or %o0, %o1, %o4 + mov %o0, %y ! multiplier -> Y + andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args + be LOC(mul_shortway) ! if zero, can do it the short way + andcc %g0, %g0, %o4 ! zero the partial product; clear N & V + + /* + * Long multiply. 32 steps, followed by a final shift step. + */ + mulscc %o4, %o1, %o4 ! 1 + mulscc %o4, %o1, %o4 ! 2 + mulscc %o4, %o1, %o4 ! 3 + mulscc %o4, %o1, %o4 ! 4 + mulscc %o4, %o1, %o4 ! 5 + mulscc %o4, %o1, %o4 ! 6 + mulscc %o4, %o1, %o4 ! 7 + mulscc %o4, %o1, %o4 ! 8 + mulscc %o4, %o1, %o4 ! 9 + mulscc %o4, %o1, %o4 ! 10 + mulscc %o4, %o1, %o4 ! 11 + mulscc %o4, %o1, %o4 ! 12 + mulscc %o4, %o1, %o4 ! 13 + mulscc %o4, %o1, %o4 ! 14 + mulscc %o4, %o1, %o4 ! 15 + mulscc %o4, %o1, %o4 ! 16 + mulscc %o4, %o1, %o4 ! 17 + mulscc %o4, %o1, %o4 ! 18 + mulscc %o4, %o1, %o4 ! 19 + mulscc %o4, %o1, %o4 ! 20 + mulscc %o4, %o1, %o4 ! 21 + mulscc %o4, %o1, %o4 ! 22 + mulscc %o4, %o1, %o4 ! 23 + mulscc %o4, %o1, %o4 ! 24 + mulscc %o4, %o1, %o4 ! 25 + mulscc %o4, %o1, %o4 ! 26 + mulscc %o4, %o1, %o4 ! 27 + mulscc %o4, %o1, %o4 ! 28 + mulscc %o4, %o1, %o4 ! 29 + mulscc %o4, %o1, %o4 ! 30 + mulscc %o4, %o1, %o4 ! 31 + mulscc %o4, %o1, %o4 ! 32 + mulscc %o4, %g0, %o4 ! final shift + + /* + * Normally, with the shift-and-add approach, if both numbers are + * positive you get the correct result. With 32-bit two's-complement + * numbers, -x is represented as + * + * x 32 + * ( 2 - ------ ) mod 2 * 2 + * 32 + * 2 + * + * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s, + * we can treat this as if the radix point were just to the left + * of the sign bit (multiply by 2^32), and get + * + * -x = (2 - x) mod 2 + * + * Then, ignoring the `mod 2's for convenience: + * + * x * y = xy + * -x * y = 2y - xy + * x * -y = 2x - xy + * -x * -y = 4 - 2x - 2y + xy + * + * For signed multiplies, we subtract (x << 32) from the partial + * product to fix this problem for negative multipliers (see mul.s). + * Because of the way the shift into the partial product is calculated + * (N xor V), this term is automatically removed for the multiplicand, + * so we don't have to adjust. + * + * But for unsigned multiplies, the high order bit wasn't a sign bit, + * and the correction is wrong. So for unsigned multiplies where the + * high order bit is one, we end up with xy - (y << 32). To fix it + * we add y << 32. + */ +#if 0 + tst %o1 + bl,a 1f ! if %o1 < 0 (high order bit = 1), + add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half) +1: rd %y, %o0 ! get lower half of product + retl + addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0 +#else + /* Faster code from tege@sics.se. */ + sra %o1, 31, %o2 ! make mask from sign bit + and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1 + rd %y, %o0 ! get lower half of product + retl + addcc %o4, %o2, %o1 ! add compensation and put upper half in place +#endif + +LOC(mul_shortway): + /* + * Short multiply. 12 steps, followed by a final shift step. + * The resulting bits are off by 12 and (32-12) = 20 bit positions, + * but there is no problem with %o0 being negative (unlike above), + * and overflow is impossible (the answer is at most 24 bits long). + */ + mulscc %o4, %o1, %o4 ! 1 + mulscc %o4, %o1, %o4 ! 2 + mulscc %o4, %o1, %o4 ! 3 + mulscc %o4, %o1, %o4 ! 4 + mulscc %o4, %o1, %o4 ! 5 + mulscc %o4, %o1, %o4 ! 6 + mulscc %o4, %o1, %o4 ! 7 + mulscc %o4, %o1, %o4 ! 8 + mulscc %o4, %o1, %o4 ! 9 + mulscc %o4, %o1, %o4 ! 10 + mulscc %o4, %o1, %o4 ! 11 + mulscc %o4, %o1, %o4 ! 12 + mulscc %o4, %g0, %o4 ! final shift + + /* + * %o4 has 20 of the bits that should be in the result; %y has + * the bottom 12 (as %y's top 12). That is: + * + * %o4 %y + * +----------------+----------------+ + * | -12- | -20- | -12- | -20- | + * +------(---------+------)---------+ + * -----result----- + * + * The 12 bits of %o4 left of the `result' area are all zero; + * in fact, all top 20 bits of %o4 are zero. + */ + + rd %y, %o5 + sll %o4, 12, %o0 ! shift middle bits left 12 + srl %o5, 20, %o5 ! shift low bits right 20 + or %o5, %o0, %o0 + retl + addcc %g0, %g0, %o1 ! %o1 = zero, and set Z + +END(.umul) diff --git a/libc/sysdeps/linux/sparc/sparcv7/urem.S b/libc/sysdeps/linux/sparc/sparcv7/urem.S new file mode 100644 index 000000000..ecf34672a --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv7/urem.S @@ -0,0 +1,343 @@ + /* This file is generated from divrem.m4; DO NOT EDIT! */ +/* + * Division and remainder, from Appendix E of the Sparc Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + * .urem name of function to generate + * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1 + * false false=true => signed; false=false => unsigned + * + * Algorithm parameters: + * N how many bits per iteration we try to get (4) + * WORDSIZE total number of bits (32) + * + * Derived constants: + * TOPBITS number of bits in the top decade of a number + * + * Important variables: + * Q the partial quotient under development (initially 0) + * R the remainder so far, initially the dividend + * ITER number of main division loop iterations required; + * equal to ceil(log2(quotient) / N). Note that this + * is the log base (2^N) of the quotient. + * V the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + * Current estimate for non-large dividend is + * ceil(log2(quotient) / N) * (10 + 7N/2) + C + * A large dividend is one greater than 2^(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + */ + + + +ENTRY(.urem) + + ! Ready to divide. Compute size of quotient; scale comparand. + orcc %o1, %g0, %o5 + bne 1f + mov %o0, %o3 + + ! Divide by zero trap. If it returns, return 0 (about as + ! wrong as possible, but that is what SunOS does...). + ta ST_DIV0 + retl + clr %o0 + +1: + cmp %o3, %o5 ! if %o1 exceeds %o0, done + blu LOC(got_result) ! (and algorithm fails otherwise) + clr %o2 + sethi %hi(1 << (32 - 4 - 1)), %g1 + cmp %o3, %g1 + blu LOC(not_really_big) + clr %o4 + + ! Here the dividend is >= 2**(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The number of bits in the result here is N*ITER+SC, where SC <= N. + ! Compute ITER in an unorthodox manner: know we need to shift V into + ! the top decade: so do not even bother to compare to R. + 1: + cmp %o5, %g1 + bgeu 3f + mov 1, %g2 + sll %o5, 4, %o5 + b 1b + add %o4, 1, %o4 + + ! Now compute %g2. + 2: addcc %o5, %o5, %o5 + bcc LOC(not_too_big) + add %g2, 1, %g2 + + ! We get here if the %o1 overflowed while shifting. + ! This means that %o3 has the high-order bit set. + ! Restore %o5 and subtract from %o3. + sll %g1, 4, %g1 ! high order bit + srl %o5, 1, %o5 ! rest of %o5 + add %o5, %g1, %o5 + b LOC(do_single_div) + sub %g2, 1, %g2 + + LOC(not_too_big): + 3: cmp %o5, %o3 + blu 2b + nop + be LOC(do_single_div) + nop + /* NB: these are commented out in the V8-Sparc manual as well */ + /* (I do not understand this) */ + ! %o5 > %o3: went too far: back up 1 step + ! srl %o5, 1, %o5 + ! dec %g2 + ! do single-bit divide steps + ! + ! We have to be careful here. We know that %o3 >= %o5, so we can do the + ! first divide step without thinking. BUT, the others are conditional, + ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- + ! order bit set in the first step, just falling into the regular + ! division loop will mess up the first time around. + ! So we unroll slightly... + LOC(do_single_div): + subcc %g2, 1, %g2 + bl LOC(end_regular_divide) + nop + sub %o3, %o5, %o3 + mov 1, %o2 + b LOC(end_single_divloop) + nop + LOC(single_divloop): + sll %o2, 1, %o2 + bl 1f + srl %o5, 1, %o5 + ! %o3 >= 0 + sub %o3, %o5, %o3 + b 2f + add %o2, 1, %o2 + 1: ! %o3 < 0 + add %o3, %o5, %o3 + sub %o2, 1, %o2 + 2: + LOC(end_single_divloop): + subcc %g2, 1, %g2 + bge LOC(single_divloop) + tst %o3 + b,a LOC(end_regular_divide) + +LOC(not_really_big): +1: + sll %o5, 4, %o5 + cmp %o5, %o3 + bleu 1b + addcc %o4, 1, %o4 + be LOC(got_result) + sub %o4, 1, %o4 + + tst %o3 ! set up for initial iteration +LOC(divloop): + sll %o2, 4, %o2 + ! depth 1, accumulated bits 0 + bl LOC(1.16) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 2, accumulated bits 1 + bl LOC(2.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits 3 + bl LOC(3.19) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 7 + bl LOC(4.23) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (7*2+1), %o2 + +LOC(4.23): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (7*2-1), %o2 + + +LOC(3.19): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 5 + bl LOC(4.21) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (5*2+1), %o2 + +LOC(4.21): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (5*2-1), %o2 + + + +LOC(2.17): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits 1 + bl LOC(3.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 3 + bl LOC(4.19) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (3*2+1), %o2 + +LOC(4.19): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (3*2-1), %o2 + + +LOC(3.17): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 1 + bl LOC(4.17) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (1*2+1), %o2 + +LOC(4.17): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (1*2-1), %o2 + + + + +LOC(1.16): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 2, accumulated bits -1 + bl LOC(2.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits -1 + bl LOC(3.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -1 + bl LOC(4.15) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2+1), %o2 + +LOC(4.15): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2-1), %o2 + + +LOC(3.15): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -3 + bl LOC(4.13) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2+1), %o2 + +LOC(4.13): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2-1), %o2 + + + +LOC(2.15): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits -3 + bl LOC(3.13) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -5 + bl LOC(4.11) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2+1), %o2 + +LOC(4.11): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2-1), %o2 + + +LOC(3.13): + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -7 + bl LOC(4.9) + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2+1), %o2 + +LOC(4.9): + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2-1), %o2 + + + + + 9: +LOC(end_regular_divide): + subcc %o4, 1, %o4 + bge LOC(divloop) + tst %o3 + bl,a LOC(got_result) + ! non-restoring fixup here (one instruction only!) + add %o3, %o1, %o3 + + +LOC(got_result): + + retl + mov %o3, %o0 + +END(.urem) diff --git a/libc/sysdeps/linux/sparc/sparcv8/rem.S b/libc/sysdeps/linux/sparc/sparcv8/rem.S new file mode 100644 index 000000000..c4faebe88 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv8/rem.S @@ -0,0 +1,19 @@ +/* + * Sparc v8 has divide. + */ + +ENTRY(.rem) + + sra %o0, 31, %o2 + wr %o2, 0, %y + nop + nop + nop + sdivcc %o0, %o1, %o2 + bvs,a 1f + xnor %o2, %g0, %o2 +1: smul %o2, %o1, %o2 + retl + sub %o0, %o2, %o0 + +END(.rem) diff --git a/libc/sysdeps/linux/sparc/sparcv8/sdiv.S b/libc/sysdeps/linux/sparc/sparcv8/sdiv.S new file mode 100644 index 000000000..4ac901a77 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv8/sdiv.S @@ -0,0 +1,18 @@ +/* + * Sparc v8 has divide. + */ + +ENTRY(.div) + + sra %o0, 31, %o2 + wr %o2, 0, %y + nop + nop + nop + sdivcc %o0, %o1, %o0 + bvs,a 1f + xnor %o0, %g0, %o0 +1: retl + nop + +END(.div) diff --git a/libc/sysdeps/linux/sparc/sparcv8/udiv.S b/libc/sysdeps/linux/sparc/sparcv8/udiv.S new file mode 100644 index 000000000..d5d93bb8f --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv8/udiv.S @@ -0,0 +1,13 @@ +/* + * Sparc v8 has divide. + */ + +ENTRY(.udiv) + + wr %g0, 0, %y + nop + nop + retl + udiv %o0, %o1, %o0 + +END(.udiv) diff --git a/libc/sysdeps/linux/sparc/sparcv8/umul.S b/libc/sysdeps/linux/sparc/sparcv8/umul.S new file mode 100644 index 000000000..47b98e9ca --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv8/umul.S @@ -0,0 +1,11 @@ +/* + * Sparc v8 has multiply. + */ + +ENTRY(.umul) + + umul %o0, %o1, %o0 + retl + rd %y, %o1 + +END(.umul) diff --git a/libc/sysdeps/linux/sparc/sparcv8/urem.S b/libc/sysdeps/linux/sparc/sparcv8/urem.S new file mode 100644 index 000000000..0e96246f5 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv8/urem.S @@ -0,0 +1,16 @@ +/* + * Sparc v8 has divide. + */ + +ENTRY(.urem) + + wr %g0, 0, %y + nop + nop + nop + udiv %o0, %o1, %o2 + umul %o2, %o1, %o2 + retl + sub %o0, %o2, %o0 + +END(.urem) diff --git a/libc/sysdeps/linux/sparc/sparcv9/rem.S b/libc/sysdeps/linux/sparc/sparcv9/rem.S new file mode 100644 index 000000000..1474e32ae --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv9/rem.S @@ -0,0 +1,20 @@ +/* + * Sparc v9 has divide. + * As divx takes 68 cycles and sdivcc only 36, + * we use sdivcc eventhough it is deprecated. + */ + + .text + .align 32 +ENTRY(.rem) + + sra %o0, 31, %o2 + wr %o2, 0, %y + sdivcc %o0, %o1, %o2 + xnor %o2, %g0, %o3 + movvs %icc, %o3, %o2 + smul %o2, %o1, %o2 + retl + sub %o0, %o2, %o0 + +END(.rem) diff --git a/libc/sysdeps/linux/sparc/sparcv9/sdiv.S b/libc/sysdeps/linux/sparc/sparcv9/sdiv.S new file mode 100644 index 000000000..45535bb68 --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv9/sdiv.S @@ -0,0 +1,18 @@ +/* + * Sparc v9 has divide. + * As divx takes 68 cycles and sdivcc only 36, + * we use sdivcc eventhough it is deprecated. + */ + + .text + .align 32 +ENTRY(.div) + + sra %o0, 31, %o2 + wr %o2, 0, %y + sdivcc %o0, %o1, %o0 + xnor %o0, %g0, %o2 + retl + movvs %icc, %o2, %o0 + +END(.div) diff --git a/libc/sysdeps/linux/sparc/sparcv9/udiv.S b/libc/sysdeps/linux/sparc/sparcv9/udiv.S new file mode 100644 index 000000000..303f29bdf --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv9/udiv.S @@ -0,0 +1,15 @@ +/* + * Sparc v9 has divide. + * As divx takes 68 cycles and udiv only 37, + * we use udiv eventhough it is deprecated. + */ + + .text + .align 32 +ENTRY(.udiv) + + wr %g0, 0, %y + retl + udiv %o0, %o1, %o0 + +END(.udiv) diff --git a/libc/sysdeps/linux/sparc/sparcv9/umul.S b/libc/sysdeps/linux/sparc/sparcv9/umul.S new file mode 100644 index 000000000..e65e4b95f --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv9/umul.S @@ -0,0 +1,15 @@ +/* + * Sparc v9 has multiply. + */ + + .text + .align 32 +ENTRY(.umul) + + srl %o0, 0, %o0 + srl %o1, 0, %o1 + mulx %o0, %o1, %o0 + retl + srlx %o0, 32, %o1 + +END(.umul) diff --git a/libc/sysdeps/linux/sparc/sparcv9/urem.S b/libc/sysdeps/linux/sparc/sparcv9/urem.S new file mode 100644 index 000000000..93542698d --- /dev/null +++ b/libc/sysdeps/linux/sparc/sparcv9/urem.S @@ -0,0 +1,17 @@ +/* + * Sparc v9 has divide. + * As divx takes 68 cycles and udiv only 37, + * we use udiv eventhough it is deprecated. + */ + + .text + .align 32 +ENTRY(.urem) + + wr %g0, 0, %y + udiv %o0, %o1, %o2 + umul %o2, %o1, %o2 + retl + sub %o0, %o2, %o0 + +END(.urem) diff --git a/libc/sysdeps/linux/sparc/udiv.S b/libc/sysdeps/linux/sparc/udiv.S index 85eeb40fc..a1355a761 100644 --- a/libc/sysdeps/linux/sparc/udiv.S +++ b/libc/sysdeps/linux/sparc/udiv.S @@ -1,348 +1,9 @@ - /* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .udiv name of function to generate - * div div=div => %o0 / %o1; div=rem => %o0 % %o1 - * false false=true => signed; false=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - -#include <sys/syscall.h> - -.global .udiv; -.align 4; -.type .udiv ,@function; - -.udiv: - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta 0x02 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu .Lgot_result ! (and algorithm fails otherwise) - clr %o2 - sethi %hi(1 << (32 - 4 - 1)), %g1 - cmp %o3, %g1 - blu .Lnot_really_big - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g2 - sll %o5, 4, %o5 - b 1b - add %o4, 1, %o4 - - ! Now compute %g2. - 2: addcc %o5, %o5, %o5 - bcc .Lnot_too_big - add %g2, 1, %g2 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - b .Ldo_single_div - sub %g2, 1, %g2 - - .Lnot_too_big: - 3: cmp %o5, %o3 - blu 2b - nop - be .Ldo_single_div - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g2 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - .Ldo_single_div: - subcc %g2, 1, %g2 - bl .Lend_regular_divide - nop - sub %o3, %o5, %o3 - mov 1, %o2 - b .Lend_single_divloop - nop - .Lsingle_divloop: - sll %o2, 1, %o2 - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - .Lend_single_divloop: - subcc %g2, 1, %g2 - bge .Lsingle_divloop - tst %o3 - b,a .Lend_regular_divide - -.Lnot_really_big: -1: - sll %o5, 4, %o5 - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - be .Lgot_result - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -.Ldivloop: - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl .L1.16 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl .L2.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl .L3.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl .L4.23 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -.L4.23: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - -.L3.19: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl .L4.21 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -.L4.21: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - -.L2.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl .L3.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl .L4.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -.L4.19: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - -.L3.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl .L4.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -.L4.17: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - -.L1.16: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl .L2.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl .L3.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl .L4.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -.L4.15: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - -.L3.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl .L4.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -.L4.13: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - -.L2.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl .L3.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl .L4.11 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -.L4.11: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - -.L3.13: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl .L4.9 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -.L4.9: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - - 9: -.Lend_regular_divide: - subcc %o4, 1, %o4 - bge .Ldivloop - tst %o3 - bl,a .Lgot_result - ! non-restoring fixup here (one instruction only!) - sub %o2, 1, %o2 - - -.Lgot_result: - - retl - mov %o2, %o0 - -.size .udiv,.-.udiv; +#include "_math_inc.h" + +#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__) +# include "sparcv9/udiv.S" +#elif defined(__CONFIG_SPARC_V8__) +# include "sparcv8/udiv.S" +#else +# include "sparcv7/udiv.S" +#endif diff --git a/libc/sysdeps/linux/sparc/umul.S b/libc/sysdeps/linux/sparc/umul.S index 4f38d78eb..e86059743 100644 --- a/libc/sysdeps/linux/sparc/umul.S +++ b/libc/sysdeps/linux/sparc/umul.S @@ -1,160 +1,9 @@ -/* - * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the - * upper 32 bits of the 64-bit product). - * - * This code optimizes short (less than 13-bit) multiplies. Short - * multiplies require 25 instruction cycles, and long ones require - * 45 instruction cycles. - * - * On return, overflow has occurred (%o1 is not zero) if and only if - * the Z condition code is clear, allowing, e.g., the following: - * - * call .umul - * nop - * bnz overflow (or tnz) - */ +#include "_math_inc.h" -#include <sys/syscall.h> - - -.global .umul; -.align 4; -.type .umul ,@function; - -.umul: - or %o0, %o1, %o4 - mov %o0, %y ! multiplier -> Y - andncc %o4, 0xfff, %g0 ! test bits 12..31 of *both* args - be .Lmul_shortway ! if zero, can do it the short way - andcc %g0, %g0, %o4 ! zero the partial product; clear N & V - - /* - * Long multiply. 32 steps, followed by a final shift step. - */ - mulscc %o4, %o1, %o4 ! 1 - mulscc %o4, %o1, %o4 ! 2 - mulscc %o4, %o1, %o4 ! 3 - mulscc %o4, %o1, %o4 ! 4 - mulscc %o4, %o1, %o4 ! 5 - mulscc %o4, %o1, %o4 ! 6 - mulscc %o4, %o1, %o4 ! 7 - mulscc %o4, %o1, %o4 ! 8 - mulscc %o4, %o1, %o4 ! 9 - mulscc %o4, %o1, %o4 ! 10 - mulscc %o4, %o1, %o4 ! 11 - mulscc %o4, %o1, %o4 ! 12 - mulscc %o4, %o1, %o4 ! 13 - mulscc %o4, %o1, %o4 ! 14 - mulscc %o4, %o1, %o4 ! 15 - mulscc %o4, %o1, %o4 ! 16 - mulscc %o4, %o1, %o4 ! 17 - mulscc %o4, %o1, %o4 ! 18 - mulscc %o4, %o1, %o4 ! 19 - mulscc %o4, %o1, %o4 ! 20 - mulscc %o4, %o1, %o4 ! 21 - mulscc %o4, %o1, %o4 ! 22 - mulscc %o4, %o1, %o4 ! 23 - mulscc %o4, %o1, %o4 ! 24 - mulscc %o4, %o1, %o4 ! 25 - mulscc %o4, %o1, %o4 ! 26 - mulscc %o4, %o1, %o4 ! 27 - mulscc %o4, %o1, %o4 ! 28 - mulscc %o4, %o1, %o4 ! 29 - mulscc %o4, %o1, %o4 ! 30 - mulscc %o4, %o1, %o4 ! 31 - mulscc %o4, %o1, %o4 ! 32 - mulscc %o4, %g0, %o4 ! final shift - - /* - * Normally, with the shift-and-add approach, if both numbers are - * positive you get the correct result. With 32-bit two's-complement - * numbers, -x is represented as - * - * x 32 - * ( 2 - ------ ) mod 2 * 2 - * 32 - * 2 - * - * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s, - * we can treat this as if the radix point were just to the left - * of the sign bit (multiply by 2^32), and get - * - * -x = (2 - x) mod 2 - * - * Then, ignoring the `mod 2's for convenience: - * - * x * y = xy - * -x * y = 2y - xy - * x * -y = 2x - xy - * -x * -y = 4 - 2x - 2y + xy - * - * For signed multiplies, we subtract (x << 32) from the partial - * product to fix this problem for negative multipliers (see mul.s). - * Because of the way the shift into the partial product is calculated - * (N xor V), this term is automatically removed for the multiplicand, - * so we don't have to adjust. - * - * But for unsigned multiplies, the high order bit wasn't a sign bit, - * and the correction is wrong. So for unsigned multiplies where the - * high order bit is one, we end up with xy - (y << 32). To fix it - * we add y << 32. - */ -#if 0 - tst %o1 - bl,a 1f ! if %o1 < 0 (high order bit = 1), - add %o4, %o0, %o4 ! %o4 += %o0 (add y to upper half) -1: rd %y, %o0 ! get lower half of product - retl - addcc %o4, %g0, %o1 ! put upper half in place and set Z for %o1==0 +#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__) +# include "sparcv9/umul.S" +#elif defined(__CONFIG_SPARC_V8__) +# include "sparcv8/umul.S" #else - /* Faster code from tege@sics.se. */ - sra %o1, 31, %o2 ! make mask from sign bit - and %o0, %o2, %o2 ! %o2 = 0 or %o0, depending on sign of %o1 - rd %y, %o0 ! get lower half of product - retl - addcc %o4, %o2, %o1 ! add compensation and put upper half in place +# include "sparcv7/umul.S" #endif - -.Lmul_shortway: - /* - * Short multiply. 12 steps, followed by a final shift step. - * The resulting bits are off by 12 and (32-12) = 20 bit positions, - * but there is no problem with %o0 being negative (unlike above), - * and overflow is impossible (the answer is at most 24 bits long). - */ - mulscc %o4, %o1, %o4 ! 1 - mulscc %o4, %o1, %o4 ! 2 - mulscc %o4, %o1, %o4 ! 3 - mulscc %o4, %o1, %o4 ! 4 - mulscc %o4, %o1, %o4 ! 5 - mulscc %o4, %o1, %o4 ! 6 - mulscc %o4, %o1, %o4 ! 7 - mulscc %o4, %o1, %o4 ! 8 - mulscc %o4, %o1, %o4 ! 9 - mulscc %o4, %o1, %o4 ! 10 - mulscc %o4, %o1, %o4 ! 11 - mulscc %o4, %o1, %o4 ! 12 - mulscc %o4, %g0, %o4 ! final shift - - /* - * %o4 has 20 of the bits that should be in the result; %y has - * the bottom 12 (as %y's top 12). That is: - * - * %o4 %y - * +----------------+----------------+ - * | -12- | -20- | -12- | -20- | - * +------(---------+------)---------+ - * -----result----- - * - * The 12 bits of %o4 left of the `result' area are all zero; - * in fact, all top 20 bits of %o4 are zero. - */ - - rd %y, %o5 - sll %o4, 12, %o0 ! shift middle bits left 12 - srl %o5, 20, %o5 ! shift low bits right 20 - or %o5, %o0, %o0 - retl - addcc %g0, %g0, %o1 ! %o1 = zero, and set Z - -.size .umul , . -.umul diff --git a/libc/sysdeps/linux/sparc/urem.S b/libc/sysdeps/linux/sparc/urem.S index 890532141..baf6c4335 100644 --- a/libc/sysdeps/linux/sparc/urem.S +++ b/libc/sysdeps/linux/sparc/urem.S @@ -1,350 +1,9 @@ - /* This file is generated from divrem.m4; DO NOT EDIT! */ -/* - * Division and remainder, from Appendix E of the Sparc Version 8 - * Architecture Manual, with fixes from Gordon Irlam. - */ - -/* - * Input: dividend and divisor in %o0 and %o1 respectively. - * - * m4 parameters: - * .urem name of function to generate - * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1 - * false false=true => signed; false=false => unsigned - * - * Algorithm parameters: - * N how many bits per iteration we try to get (4) - * WORDSIZE total number of bits (32) - * - * Derived constants: - * TOPBITS number of bits in the top decade of a number - * - * Important variables: - * Q the partial quotient under development (initially 0) - * R the remainder so far, initially the dividend - * ITER number of main division loop iterations required; - * equal to ceil(log2(quotient) / N). Note that this - * is the log base (2^N) of the quotient. - * V the current comparand, initially divisor*2^(ITER*N-1) - * - * Cost: - * Current estimate for non-large dividend is - * ceil(log2(quotient) / N) * (10 + 7N/2) + C - * A large dividend is one greater than 2^(31-TOPBITS) and takes a - * different path, as the upper bits of the quotient must be developed - * one bit at a time. - */ - - - -#include <sys/syscall.h> - - -.global .urem; -.align 4; -.type .urem ,@function; - -.urem: - - ! Ready to divide. Compute size of quotient; scale comparand. - orcc %o1, %g0, %o5 - bne 1f - mov %o0, %o3 - - ! Divide by zero trap. If it returns, return 0 (about as - ! wrong as possible, but that is what SunOS does...). - ta 0x02 - retl - clr %o0 - -1: - cmp %o3, %o5 ! if %o1 exceeds %o0, done - blu .Lgot_result ! (and algorithm fails otherwise) - clr %o2 - sethi %hi(1 << (32 - 4 - 1)), %g1 - cmp %o3, %g1 - blu .Lnot_really_big - clr %o4 - - ! Here the dividend is >= 2**(31-N) or so. We must be careful here, - ! as our usual N-at-a-shot divide step will cause overflow and havoc. - ! The number of bits in the result here is N*ITER+SC, where SC <= N. - ! Compute ITER in an unorthodox manner: know we need to shift V into - ! the top decade: so do not even bother to compare to R. - 1: - cmp %o5, %g1 - bgeu 3f - mov 1, %g2 - sll %o5, 4, %o5 - b 1b - add %o4, 1, %o4 - - ! Now compute %g2. - 2: addcc %o5, %o5, %o5 - bcc .Lnot_too_big - add %g2, 1, %g2 - - ! We get here if the %o1 overflowed while shifting. - ! This means that %o3 has the high-order bit set. - ! Restore %o5 and subtract from %o3. - sll %g1, 4, %g1 ! high order bit - srl %o5, 1, %o5 ! rest of %o5 - add %o5, %g1, %o5 - b .Ldo_single_div - sub %g2, 1, %g2 - - .Lnot_too_big: - 3: cmp %o5, %o3 - blu 2b - nop - be .Ldo_single_div - nop - /* NB: these are commented out in the V8-Sparc manual as well */ - /* (I do not understand this) */ - ! %o5 > %o3: went too far: back up 1 step - ! srl %o5, 1, %o5 - ! dec %g2 - ! do single-bit divide steps - ! - ! We have to be careful here. We know that %o3 >= %o5, so we can do the - ! first divide step without thinking. BUT, the others are conditional, - ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- - ! order bit set in the first step, just falling into the regular - ! division loop will mess up the first time around. - ! So we unroll slightly... - .Ldo_single_div: - subcc %g2, 1, %g2 - bl .Lend_regular_divide - nop - sub %o3, %o5, %o3 - mov 1, %o2 - b .Lend_single_divloop - nop - .Lsingle_divloop: - sll %o2, 1, %o2 - bl 1f - srl %o5, 1, %o5 - ! %o3 >= 0 - sub %o3, %o5, %o3 - b 2f - add %o2, 1, %o2 - 1: ! %o3 < 0 - add %o3, %o5, %o3 - sub %o2, 1, %o2 - 2: - .Lend_single_divloop: - subcc %g2, 1, %g2 - bge .Lsingle_divloop - tst %o3 - b,a .Lend_regular_divide - -.Lnot_really_big: -1: - sll %o5, 4, %o5 - cmp %o5, %o3 - bleu 1b - addcc %o4, 1, %o4 - be .Lgot_result - sub %o4, 1, %o4 - - tst %o3 ! set up for initial iteration -.Ldivloop: - sll %o2, 4, %o2 - ! depth 1, accumulated bits 0 - bl .L1.16 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 2, accumulated bits 1 - bl .L2.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits 3 - bl .L3.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 7 - bl .L4.23 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (7*2+1), %o2 - -.L4.23: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (7*2-1), %o2 - - -.L3.19: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 5 - bl .L4.21 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (5*2+1), %o2 - -.L4.21: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (5*2-1), %o2 - - - -.L2.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits 1 - bl .L3.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits 3 - bl .L4.19 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (3*2+1), %o2 - -.L4.19: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (3*2-1), %o2 - - -.L3.17: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits 1 - bl .L4.17 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (1*2+1), %o2 - -.L4.17: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (1*2-1), %o2 - - - - -.L1.16: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 2, accumulated bits -1 - bl .L2.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 3, accumulated bits -1 - bl .L3.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -1 - bl .L4.15 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2+1), %o2 - -.L4.15: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-1*2-1), %o2 - - -.L3.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -3 - bl .L4.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2+1), %o2 - -.L4.13: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-3*2-1), %o2 - - - -.L2.15: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 3, accumulated bits -3 - bl .L3.13 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - ! depth 4, accumulated bits -5 - bl .L4.11 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2+1), %o2 - -.L4.11: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-5*2-1), %o2 - - -.L3.13: - ! remainder is negative - addcc %o3,%o5,%o3 - ! depth 4, accumulated bits -7 - bl .L4.9 - srl %o5,1,%o5 - ! remainder is positive - subcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2+1), %o2 - -.L4.9: - ! remainder is negative - addcc %o3,%o5,%o3 - b 9f - add %o2, (-7*2-1), %o2 - - - - - 9: -.Lend_regular_divide: - subcc %o4, 1, %o4 - bge .Ldivloop - tst %o3 - bl,a .Lgot_result - ! non-restoring fixup here (one instruction only!) - add %o3, %o1, %o3 - - -.Lgot_result: - - retl - mov %o3, %o0 - -.size .urem , . -.urem +#include "_math_inc.h" + +#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__) +# include "sparcv9/urem.S" +#elif defined(__CONFIG_SPARC_V8__) +# include "sparcv8/urem.S" +#else +# include "sparcv7/urem.S" +#endif |