From 6ffcc881dc156e1c9c0bc4b153091b4760b584cb Mon Sep 17 00:00:00 2001
From: Austin Foxley <austinf@cetoncorp.com>
Date: Sat, 17 Oct 2009 13:09:30 -0700
Subject: sh specific bits needed for nptl

 * unified atomic.h compare and exchange macros
 * clone.S with RESET_PID support
 * sh specific versions of pread/pwrite with cancellation support
 * check SHARED (__HAVE_SHARED__ in setjmp.S) instead of __PIC__

Signed-off-by: Austin Foxley <austinf@cetoncorp.com>
---
 libc/sysdeps/linux/sh/Makefile.arch   |   4 +-
 libc/sysdeps/linux/sh/bits/atomic.h   | 516 +++++++++++++---------------------
 libc/sysdeps/linux/sh/clone.S         | 135 ++++-----
 libc/sysdeps/linux/sh/longjmp.c       |  56 ++++
 libc/sysdeps/linux/sh/pread_write.c   |  58 +++-
 libc/sysdeps/linux/sh/setjmp.S        |   2 +-
 libc/sysdeps/linux/sh/syscall_error.S |   4 +-
 7 files changed, 378 insertions(+), 397 deletions(-)
 create mode 100644 libc/sysdeps/linux/sh/longjmp.c

diff --git a/libc/sysdeps/linux/sh/Makefile.arch b/libc/sysdeps/linux/sh/Makefile.arch
index 31beda111..3e32e1095 100644
--- a/libc/sysdeps/linux/sh/Makefile.arch
+++ b/libc/sysdeps/linux/sh/Makefile.arch
@@ -7,6 +7,6 @@
 #
 
 CSRC := \
-	mmap.c pipe.c __init_brk.c brk.c sbrk.c pread_write.c cacheflush.c
+	mmap.c pipe.c __init_brk.c brk.c sbrk.c pread_write.c longjmp.c cacheflush.c
 
-SSRC := setjmp.S __longjmp.S vfork.S clone.S ___fpscr_values.S
+SSRC := setjmp.S __longjmp.S ___fpscr_values.S
diff --git a/libc/sysdeps/linux/sh/bits/atomic.h b/libc/sysdeps/linux/sh/bits/atomic.h
index 6bb7255c5..dd6e5f97d 100644
--- a/libc/sysdeps/linux/sh/bits/atomic.h
+++ b/libc/sysdeps/linux/sh/bits/atomic.h
@@ -54,6 +54,10 @@ typedef uintmax_t uatomic_max_t;
     Japan. http://lc.linux.or.jp/lc2002/papers/niibe0919h.pdf (in
     Japanese).
 
+    Niibe Yutaka, "gUSA: User Space Atomicity with Little Kernel
+    Modification", LinuxTag 2003, Rome.
+    http://www.semmel.ch/Linuxtag-DVD/talks/170/paper.html (in English).
+
     B.N. Bershad, D. Redell, and J. Ellis, "Fast Mutual Exclusion for
     Uniprocessors",  Proceedings of the Fifth Architectural Support for
     Programming Languages and Operating Systems (ASPLOS), pp. 223-233,
@@ -65,56 +69,44 @@ typedef uintmax_t uatomic_max_t;
       r1:     saved stack pointer
 */
 
-#define __arch_compare_and_exchange_val_8_acq(mem, newval, oldval) \
-  ({ __typeof (*(mem)) __result; \
-     __asm__ __volatile__ ("\
+/* Avoid having lots of different versions of compare and exchange,
+   by having this one complicated version. Parameters:
+      bwl:     b, w or l for 8, 16 and 32 bit versions.
+      version: val or bool, depending on whether the result is the
+               previous value or a bool indicating whether the transfer
+               did happen (note this needs inverting before being
+               returned in atomic_compare_and_exchange_bool).
+*/
+
+#define __arch_compare_and_exchange_n(mem, newval, oldval, bwl, version) \
+  ({ signed long __result; \
+     __asm __volatile ("\
 	.align 2\n\
 	mova 1f,r0\n\
 	nop\n\
 	mov r15,r1\n\
 	mov #-8,r15\n\
-     0: mov.b @%1,%0\n\
+     0: mov." #bwl " @%1,%0\n\
 	cmp/eq %0,%3\n\
 	bf 1f\n\
-	mov.b %2,@%1\n\
-     1: mov r1,r15"\
-	: "=&r" (__result) : "r" (mem), "r" (newval), "r" (oldval) \
-	: "r0", "r1", "t", "memory"); \
+	mov." #bwl " %2,@%1\n\
+     1: mov r1,r15\n\
+     .ifeqs \"bool\",\"" #version "\"\n\
+        movt %0\n\
+     .endif\n"					\
+	: "=&r" (__result)			\
+	: "r" (mem), "r" (newval), "r" (oldval)	\
+	: "r0", "r1", "t", "memory");		\
      __result; })
 
+#define __arch_compare_and_exchange_val_8_acq(mem, newval, oldval) \
+  __arch_compare_and_exchange_n(mem, newval, (int8_t)(oldval), b, val)
+
 #define __arch_compare_and_exchange_val_16_acq(mem, newval, oldval) \
-  ({ __typeof (*(mem)) __result; \
-     __asm__ __volatile__ ("\
-	.align 2\n\
-	mova 1f,r0\n\
-	nop\n\
-	mov r15,r1\n\
-	mov #-8,r15\n\
-     0: mov.w @%1,%0\n\
-	cmp/eq %0,%3\n\
-	bf 1f\n\
-	mov.w %2,@%1\n\
-     1: mov r1,r15"\
-	: "=&r" (__result) : "r" (mem), "r" (newval), "r" (oldval) \
-	: "r0", "r1", "t", "memory"); \
-     __result; })
+  __arch_compare_and_exchange_n(mem, newval, (int16_t)(oldval), w, val)
 
 #define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval) \
-  ({ __typeof (*(mem)) __result; \
-     __asm__ __volatile__ ("\
-	.align 2\n\
-	mova 1f,r0\n\
-	nop\n\
-	mov r15,r1\n\
-	mov #-8,r15\n\
-     0: mov.l @%1,%0\n\
-	cmp/eq %0,%3\n\
-	bf 1f\n\
-	mov.l %2,@%1\n\
-     1: mov r1,r15"\
-	: "=&r" (__result) : "r" (mem), "r" (newval), "r" (oldval) \
-	: "r0", "r1", "t", "memory"); \
-     __result; })
+  __arch_compare_and_exchange_n(mem, newval, (int32_t)(oldval), l, val)
 
 /* XXX We do not really need 64-bit compare-and-exchange.  At least
    not in the moment.  Using it would mean causing portability
@@ -122,298 +114,180 @@ typedef uintmax_t uatomic_max_t;
    such an operation.  So don't define any code for now.  */
 
 # define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  (abort (), (__typeof (*mem)) 0)
+  (abort (), 0)
+
+/* For "bool" routines, return nonzero if the exchange did NOT occur */
+
+#define __arch_compare_and_exchange_bool_8_acq(mem, newval, oldval) \
+  (! __arch_compare_and_exchange_n(mem, newval, (int8_t)(oldval), b, bool))
+
+#define __arch_compare_and_exchange_bool_16_acq(mem, newval, oldval) \
+  (! __arch_compare_and_exchange_n(mem, newval, (int16_t)(oldval), w, bool))
+
+#define __arch_compare_and_exchange_bool_32_acq(mem, newval, oldval) \
+  (! __arch_compare_and_exchange_n(mem, newval, (int32_t)(oldval), l, bool))
+
+# define __arch_compare_and_exchange_bool_64_acq(mem, newval, oldval) \
+  (abort (), 0)
+
+/* Similar to the above, have one template which can be used in a
+   number of places. This version returns both the old and the new
+   values of the location. Parameters:
+      bwl:     b, w or l for 8, 16 and 32 bit versions.
+      oper:    The instruction to perform on the old value.
+   Note old is not sign extended, so should be an unsigned long.
+*/
+
+#define __arch_operate_old_new_n(mem, value, old, new, bwl, oper)	\
+  (void) ({ __asm __volatile ("\
+	.align 2\n\
+	mova 1f,r0\n\
+	mov r15,r1\n\
+	nop\n\
+	mov #-8,r15\n\
+     0: mov." #bwl " @%2,%0\n\
+	mov %0,%1\n\
+	" #oper " %3,%1\n\
+	mov." #bwl " %1,@%2\n\
+     1: mov r1,r15"			\
+	: "=&r" (old), "=&r"(new)	\
+	: "r" (mem), "r" (value)	\
+	: "r0", "r1", "memory");	\
+    })
+
+#define __arch_exchange_and_add_8_int(mem, value)			\
+  ({ int32_t __value = (value), __new, __old;				\
+    __arch_operate_old_new_n((mem), __value, __old, __new, b, add);	\
+    __old; })
+
+#define __arch_exchange_and_add_16_int(mem, value)			\
+  ({ int32_t __value = (value), __new, __old;				\
+    __arch_operate_old_new_n((mem), __value, __old, __new, w, add);	\
+    __old; })
+
+#define __arch_exchange_and_add_32_int(mem, value)			\
+  ({ int32_t __value = (value), __new, __old;				\
+    __arch_operate_old_new_n((mem), __value, __old, __new, l, add);	\
+    __old; })
+
+#define __arch_exchange_and_add_64_int(mem, value)			\
+  (abort (), 0)
 
 #define atomic_exchange_and_add(mem, value) \
-  ({ __typeof (*(mem)) __result, __tmp, __value = (value); \
-     if (sizeof (*(mem)) == 1) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  mov r15,r1\n\
-	  mov #-6,r15\n\
-       0: mov.b @%2,%0\n\
-	  add %0,%1\n\
-	  mov.b %1,@%2\n\
-       1: mov r1,r15"\
-	: "=&r" (__result), "=&r" (__tmp) : "r" (mem), "1" (__value) \
-	: "r0", "r1", "memory"); \
-     else if (sizeof (*(mem)) == 2) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  mov r15,r1\n\
-	  mov #-6,r15\n\
-       0: mov.w @%2,%0\n\
-	  add %0,%1\n\
-	  mov.w %1,@%2\n\
-       1: mov r1,r15"\
-	: "=&r" (__result), "=&r" (__tmp) : "r" (mem), "1" (__value) \
-	: "r0", "r1", "memory"); \
-     else if (sizeof (*(mem)) == 4) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  mov r15,r1\n\
-	  mov #-6,r15\n\
-       0: mov.l @%2,%0\n\
-	  add %0,%1\n\
-	  mov.l %1,@%2\n\
-       1: mov r1,r15"\
-	: "=&r" (__result), "=&r" (__tmp) : "r" (mem), "1" (__value) \
-	: "r0", "r1", "memory"); \
-     else \
-       { \
-	 __typeof (mem) memp = (mem); \
-	 do \
-	   __result = *memp; \
-	 while (__arch_compare_and_exchange_val_64_acq \
-		 (memp,	__result + __value, __result) == __result); \
-	 (void) __value; \
-       } \
-     __result; })
+  __atomic_val_bysize (__arch_exchange_and_add, int, mem, value)
+
+
+/* Again, another template. We get a slight optimisation when the old value
+   does not need to be returned. Parameters:
+      bwl:     b, w or l for 8, 16 and 32 bit versions.
+      oper:    The instruction to perform on the old value.
+*/
+
+#define __arch_operate_new_n(mem, value, bwl, oper)	 \
+  ({ int32_t __value = (value), __new; \
+     __asm __volatile ("\
+	.align 2\n\
+	mova 1f,r0\n\
+	mov r15,r1\n\
+	mov #-6,r15\n\
+     0: mov." #bwl " @%1,%0\n\
+	" #oper " %2,%0\n\
+	mov." #bwl " %0,@%1\n\
+     1: mov r1,r15"			\
+	: "=&r" (__new)			\
+	: "r" (mem), "r" (__value)	\
+	: "r0", "r1", "memory");	\
+     __new;				\
+  })
+
+#define __arch_add_8_int(mem, value)		\
+  __arch_operate_new_n(mem, value, b, add)
+
+#define __arch_add_16_int(mem, value)		\
+  __arch_operate_new_n(mem, value, w, add)
+
+#define __arch_add_32_int(mem, value)		\
+  __arch_operate_new_n(mem, value, l, add)
+
+#define __arch_add_64_int(mem, value)		\
+  (abort (), 0)
 
 #define atomic_add(mem, value) \
-  (void) ({ __typeof (*(mem)) __tmp, __value = (value); \
-	    if (sizeof (*(mem)) == 1) \
-	      __asm__ __volatile__ ("\
-		.align 2\n\
-		mova 1f,r0\n\
-		mov r15,r1\n\
-		mov #-6,r15\n\
-	     0: mov.b @%1,r2\n\
-		add r2,%0\n\
-		mov.b %0,@%1\n\
-	     1: mov r1,r15"\
-		: "=&r" (__tmp) : "r" (mem), "0" (__value) \
-		: "r0", "r1", "r2", "memory"); \
-	    else if (sizeof (*(mem)) == 2) \
-	      __asm__ __volatile__ ("\
-		.align 2\n\
-		mova 1f,r0\n\
-		mov r15,r1\n\
-		mov #-6,r15\n\
-	     0: mov.w @%1,r2\n\
-		add r2,%0\n\
-		mov.w %0,@%1\n\
-	     1: mov r1,r15"\
-		: "=&r" (__tmp) : "r" (mem), "0" (__value) \
-		: "r0", "r1", "r2", "memory"); \
-	    else if (sizeof (*(mem)) == 4) \
-	      __asm__ __volatile__ ("\
-		.align 2\n\
-		mova 1f,r0\n\
-		mov r15,r1\n\
-		mov #-6,r15\n\
-	     0: mov.l @%1,r2\n\
-		add r2,%0\n\
-		mov.l %0,@%1\n\
-	     1: mov r1,r15"\
-		: "=&r" (__tmp) : "r" (mem), "0" (__value) \
-		: "r0", "r1", "r2", "memory"); \
-	    else \
-	      { \
-		__typeof (*(mem)) oldval; \
-		__typeof (mem) memp = (mem); \
-		do \
-		  oldval = *memp; \
-		while (__arch_compare_and_exchange_val_64_acq \
-			(memp, oldval + __value, oldval) == oldval); \
-		(void) __value; \
-	      } \
-	    })
+  ((void) __atomic_val_bysize (__arch_add, int, mem, value))
+
+
+#define __arch_add_negative_8_int(mem, value)		\
+  (__arch_operate_new_n(mem, value, b, add) < 0)
+
+#define __arch_add_negative_16_int(mem, value)		\
+  (__arch_operate_new_n(mem, value, w, add) < 0)
+
+#define __arch_add_negative_32_int(mem, value)		\
+  (__arch_operate_new_n(mem, value, l, add) < 0)
+
+#define __arch_add_negative_64_int(mem, value)		\
+  (abort (), 0)
 
 #define atomic_add_negative(mem, value) \
-  ({ unsigned char __result; \
-     __typeof (*(mem)) __tmp, __value = (value); \
-     if (sizeof (*(mem)) == 1) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  mov r15,r1\n\
-	  mov #-6,r15\n\
-       0: mov.b @%2,r2\n\
-	  add r2,%1\n\
-	  mov.b %1,@%2\n\
-       1: mov r1,r15\n\
-	  shal %1\n\
-	  movt %0"\
-	: "=r" (__result), "=&r" (__tmp) : "r" (mem), "1" (__value) \
-	: "r0", "r1", "r2", "t", "memory"); \
-     else if (sizeof (*(mem)) == 2) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  mov r15,r1\n\
-	  mov #-6,r15\n\
-       0: mov.w @%2,r2\n\
-	  add r2,%1\n\
-	  mov.w %1,@%2\n\
-       1: mov r1,r15\n\
-	  shal %1\n\
-	  movt %0"\
-	: "=r" (__result), "=&r" (__tmp) : "r" (mem), "1" (__value) \
-	: "r0", "r1", "r2", "t", "memory"); \
-     else if (sizeof (*(mem)) == 4) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  mov r15,r1\n\
-	  mov #-6,r15\n\
-       0: mov.l @%2,r2\n\
-	  add r2,%1\n\
-	  mov.l %1,@%2\n\
-       1: mov r1,r15\n\
-	  shal %1\n\
-	  movt %0"\
-	: "=r" (__result), "=&r" (__tmp) : "r" (mem), "1" (__value) \
-	: "r0", "r1", "r2", "t", "memory"); \
-     else \
-       abort (); \
-     __result; })
+  __atomic_bool_bysize (__arch_add_negative, int, mem, value)
+
+
+#define __arch_add_zero_8_int(mem, value)		\
+  (__arch_operate_new_n(mem, value, b, add) == 0)
+
+#define __arch_add_zero_16_int(mem, value)		\
+  (__arch_operate_new_n(mem, value, w, add) == 0)
+
+#define __arch_add_zero_32_int(mem, value)		\
+  (__arch_operate_new_n(mem, value, l, add) == 0)
+
+#define __arch_add_zero_64_int(mem, value)		\
+  (abort (), 0)
 
 #define atomic_add_zero(mem, value) \
-  ({ unsigned char __result; \
-     __typeof (*(mem)) __tmp, __value = (value); \
-     if (sizeof (*(mem)) == 1) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  mov r15,r1\n\
-	  mov #-6,r15\n\
-       0: mov.b @%2,r2\n\
-	  add r2,%1\n\
-	  mov.b %1,@%2\n\
-       1: mov r1,r15\n\
-	  tst %1,%1\n\
-	  movt %0"\
-	: "=r" (__result), "=&r" (__tmp) : "r" (mem), "1" (__value) \
-	: "r0", "r1", "r2", "t", "memory"); \
-     else if (sizeof (*(mem)) == 2) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  mov r15,r1\n\
-	  mov #-6,r15\n\
-       0: mov.w @%2,r2\n\
-	  add r2,%1\n\
-	  mov.w %1,@%2\n\
-       1: mov r1,r15\n\
-	  tst %1,%1\n\
-	  movt %0"\
-	: "=r" (__result), "=&r" (__tmp) : "r" (mem), "1" (__value) \
-	: "r0", "r1", "r2", "t", "memory"); \
-     else if (sizeof (*(mem)) == 4) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  mov r15,r1\n\
-	  mov #-6,r15\n\
-       0: mov.l @%2,r2\n\
-	  add r2,%1\n\
-	  mov.l %1,@%2\n\
-       1: mov r1,r15\n\
-	  tst %1,%1\n\
-	  movt %0"\
-	: "=r" (__result), "=&r" (__tmp) : "r" (mem), "1" (__value) \
-	: "r0", "r1", "r2", "t", "memory"); \
-     else \
-       abort (); \
-     __result; })
+  __atomic_bool_bysize (__arch_add_zero, int, mem, value)
+
 
 #define atomic_increment_and_test(mem) atomic_add_zero((mem), 1)
 #define atomic_decrement_and_test(mem) atomic_add_zero((mem), -1)
 
-#define atomic_bit_set(mem, bit) \
-  (void) ({ unsigned int __mask = 1 << (bit); \
-	    if (sizeof (*(mem)) == 1) \
-	      __asm__ __volatile__ ("\
-		.align 2\n\
-		mova 1f,r0\n\
-		mov r15,r1\n\
-		mov #-6,r15\n\
-	     0: mov.b @%0,r2\n\
-		or %1,r2\n\
-		mov.b r2,@%0\n\
-	     1: mov r1,r15"\
-		: : "r" (mem), "r" (__mask) \
-		: "r0", "r1", "r2", "memory"); \
-	    else if (sizeof (*(mem)) == 2) \
-	      __asm__ __volatile__ ("\
-		.align 2\n\
-		mova 1f,r0\n\
-		mov r15,r1\n\
-		mov #-6,r15\n\
-	     0: mov.w @%0,r2\n\
-		or %1,r2\n\
-		mov.w r2,@%0\n\
-	     1: mov r1,r15"\
-		: : "r" (mem), "r" (__mask) \
-		: "r0", "r1", "r2", "memory"); \
-	    else if (sizeof (*(mem)) == 4) \
-	      __asm__ __volatile__ ("\
-		.align 2\n\
-		mova 1f,r0\n\
-		mov r15,r1\n\
-		mov #-6,r15\n\
-	     0: mov.l @%0,r2\n\
-		or %1,r2\n\
-		mov.l r2,@%0\n\
-	     1: mov r1,r15"\
-		: : "r" (mem), "r" (__mask) \
-		: "r0", "r1", "r2", "memory"); \
-	    else \
-	      abort (); \
-	    })
-
-#define atomic_bit_test_set(mem, bit) \
-  ({ unsigned int __mask = 1 << (bit); \
-     unsigned int __result = __mask; \
-     if (sizeof (*(mem)) == 1) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  nop\n\
-	  mov r15,r1\n\
-	  mov #-8,r15\n\
-       0: mov.b @%2,r2\n\
-	  or r2,%1\n\
-	  and r2,%0\n\
-	  mov.b %1,@%2\n\
-       1: mov r1,r15"\
-	: "=&r" (__result), "=&r" (__mask) \
-	: "r" (mem), "0" (__result), "1" (__mask) \
-	: "r0", "r1", "r2", "memory"); \
-     else if (sizeof (*(mem)) == 2) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  nop\n\
-	  mov r15,r1\n\
-	  mov #-8,r15\n\
-       0: mov.w @%2,r2\n\
-	  or r2,%1\n\
-	  and r2,%0\n\
-	  mov.w %1,@%2\n\
-       1: mov r1,r15"\
-	: "=&r" (__result), "=&r" (__mask) \
-	: "r" (mem), "0" (__result), "1" (__mask) \
-	: "r0", "r1", "r2", "memory"); \
-     else if (sizeof (*(mem)) == 4) \
-       __asm__ __volatile__ ("\
-	  .align 2\n\
-	  mova 1f,r0\n\
-	  nop\n\
-	  mov r15,r1\n\
-	  mov #-8,r15\n\
-       0: mov.l @%2,r2\n\
-	  or r2,%1\n\
-	  and r2,%0\n\
-	  mov.l %1,@%2\n\
-       1: mov r1,r15"\
-	: "=&r" (__result), "=&r" (__mask) \
-	: "r" (mem), "0" (__result), "1" (__mask) \
-	: "r0", "r1", "r2", "memory"); \
-     else \
-       abort (); \
-     __result; })
+
+#define __arch_bit_set_8_int(mem, value)		\
+  __arch_operate_new_n(mem, 1<<(value), b, or)
+
+#define __arch_bit_set_16_int(mem, value)		\
+  __arch_operate_new_n(mem, 1<<(value), w, or)
+
+#define __arch_bit_set_32_int(mem, value) /* 1U: bit 31 must not overflow */ \
+  __arch_operate_new_n(mem, 1U<<(value), l, or)
+  
+#define __arch_bit_set_64_int(mem, value)		\
+  (abort (), 0)
+
+/* No __arch_add_64_int here: it is already defined with the other
+   __arch_add_*_int macros earlier in this header.  */
+
+#define atomic_bit_set(mem, value) \
+  ((void) __atomic_val_bysize (__arch_bit_set, int, mem, value))
+
+
+#define __arch_bit_test_set_8_int(mem, value)				\
+  ({ int32_t __value = 1<<(value), __new, __old;			\
+    __arch_operate_old_new_n((mem), __value, __old, __new, b, or);	\
+    __old & __value; })
+
+#define __arch_bit_test_set_16_int(mem, value)				\
+  ({ int32_t __value = 1<<(value), __new, __old;			\
+    __arch_operate_old_new_n((mem), __value, __old, __new, w, or);	\
+    __old & __value; })
+
+#define __arch_bit_test_set_32_int(mem, value)				\
+  ({ int32_t __value = 1U<<(value), __new, __old; /* 1U: bit 31 ok */	\
+    __arch_operate_old_new_n((mem), __value, __old, __new, l, or);	\
+    __old & __value; /* nonzero iff the bit was already set */ })
+
+#define __arch_bit_test_set_64_int(mem, value)	\
+  (abort (), 0)
+
+#define atomic_bit_test_set(mem, value) \
+  __atomic_val_bysize (__arch_bit_test_set, int, mem, value)
diff --git a/libc/sysdeps/linux/sh/clone.S b/libc/sysdeps/linux/sh/clone.S
index 3d18b6dd0..bb566276a 100644
--- a/libc/sysdeps/linux/sh/clone.S
+++ b/libc/sysdeps/linux/sh/clone.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+/* Copyright (C) 1999, 2000, 2003, 2004, 2007 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -20,93 +20,94 @@
    and invokes a function in the right context after its all over.  */
 
 #include <features.h>
-#include <sys/syscall.h>
-#define _ERRNO_H
+#include <asm/unistd.h>
+#include <sysdep.h>
+#define _ERRNO_H	1
 #include <bits/errno.h>
-#include <bits/sysnum.h>
-
-
-#ifdef __PIC__
-#define PLTJMP(_x)	_x@PLT
-#else
-#define PLTJMP(_x)	_x
+#ifdef RESET_PID
+#include <tcb-offsets.h>
 #endif
+/* int clone(int (*fn)(void *arg), void *child_stack, int flags, void *arg,
+	     pid_t *ptid, void *tls, pid_t *ctid); */
 
-
-/* int clone(int (*fn)(void *arg), void *child_stack, int flags, void *arg); */
-
-        .text
-
-.text
-.align 4
-.type	clone,@function
-.globl	clone;
-clone:
+	.text
+ENTRY(__clone)
 	/* sanity check arguments.  */
 	tst	r4, r4
-	bt	0f
-	tst	r5, r5
-	bf/s	1f
-	 mov	#+__NR_clone, r3
-0:		
-	bra __syscall_error
-	 mov	#-EINVAL, r4
-
+	bt/s	0f
+	 tst	r5, r5
+	bf	1f
+0:
+	bra	.Lsyscall_error
+	 mov	#-EINVAL,r0
 1:
 	/* insert the args onto the new stack */
 	mov.l	r7, @-r5
 	/* save the function pointer as the 0th element */
 	mov.l	r4, @-r5
-	
+
 	/* do the system call */
 	mov	r6, r4
-	trapa	#(__SH_SYSCALL_TRAP_BASE + 2)
+	mov.l	@r15, r6
+	mov.l	@(8,r15), r7
+	mov.l	@(4,r15), r0
+	mov	#+SYS_ify(clone), r3
+	trapa	#0x15
 	mov     r0, r1
-#ifdef __sh2__
-/* 12 arithmetic shifts for the crappy sh2, because shad doesn't exist!	 */
-	shar	r1
-	shar	r1
-	shar	r1
-	shar	r1
-	shar	r1
-	shar	r1
-	shar	r1
-	shar	r1
-	shar	r1
-	shar	r1
-	shar	r1
-	shar	r1
-#else		
 	mov	#-12, r2
 	shad	r2, r1
-#endif
-	not	r1, r1			/* r1=0 means r0 = -1 to -4095 */
-	tst	r1, r1			/* i.e. error in linux */
-	bf/s	2f
-	 tst	r0, r0
-        bra __syscall_error
-	 mov	r0, r4
-
-2:
-	bt	3f
+	not	r1, r1			// r1=0 means r0 = -1 to -4095
+	tst	r1, r1			// i.e. error in linux
+	bf	.Lclone_end
+.Lsyscall_error:	
+	SYSCALL_ERROR_HANDLER
+.Lclone_end:
+	tst	r0, r0
+	bt	2f
+.Lpseudo_end:
 	rts
 	 nop
+2:
+	/* terminate the stack frame */
+	mov	#0, r14
+#ifdef RESET_PID
+	mov	r4, r0
+	shlr16	r0
+	tst	#1, r0			// CLONE_THREAD = (1 << 16)
+	bf/s	4f
+	 mov	r4, r0
+	/* new pid */
+	shlr8	r0
+	tst	#1, r0			// CLONE_VM = (1 << 8)
+	bf/s	3f
+	 mov	#-1, r0
+	mov	#+SYS_ify(getpid), r3
+	trapa	#0x15
 3:
+	stc	gbr, r1
+	mov.w	.Lpidoff, r2
+	add	r1, r2
+	mov.l	r0, @r2	
+	mov.w	.Ltidoff, r2
+	add	r1, r2
+	mov.l	r0, @r2	
+4:
+#endif
 	/* thread starts */
 	mov.l	@r15, r1
 	jsr	@r1
 	 mov.l	@(4,r15), r4
 
 	/* we are done, passing the return value through r0  */
-	mov.l	.L1, r1
-#ifdef __PIC__
+	mov.l	.L3, r1
+#ifdef SHARED
 	mov.l	r12, @-r15
 	sts.l	pr, @-r15
 	mov	r0, r4
-	mova	.LG, r0  /* .LG from syscall_error.S */
+	mova	.LG, r0
 	mov.l	.LG, r12
 	add	r0, r12
-	mova	.L1, r0
+	mova	.L3, r0
 	add	r0, r1
 	jsr	@r1
 	 nop
@@ -118,8 +119,16 @@ clone:
 	 mov	r0, r4
 #endif
 	.align	2
-.L1:
-	.long	PLTJMP( HIDDEN_JUMPTARGET(_exit))
-.size clone,.-clone;
+.LG:
+	.long	_GLOBAL_OFFSET_TABLE_
+.L3:
+	.long	PLTJMP(C_SYMBOL_NAME(_exit))
+#ifdef RESET_PID
+.Lpidoff:
+	.word	PID - TLS_PRE_TCB_SIZE
+.Ltidoff:
+	.word	TID - TLS_PRE_TCB_SIZE
+#endif
+PSEUDO_END (__clone)
 
-#include "syscall_error.S"
+weak_alias (__clone, clone)
diff --git a/libc/sysdeps/linux/sh/longjmp.c b/libc/sysdeps/linux/sh/longjmp.c
new file mode 100644
index 000000000..dd0616d8a
--- /dev/null
+++ b/libc/sysdeps/linux/sh/longjmp.c
@@ -0,0 +1,56 @@
+/* Copyright (C) 1991, 92, 94, 95, 97, 98, 2000 Free Software Foundation, Inc.
+   Copyright (C) 2001 Hewlett-Packard Australia
+
+ This program is free software; you can redistribute it and/or modify it under
+ the terms of the GNU Library General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more
+ details.
+
+ You should have received a copy of the GNU Library General Public License
+ along with this program; if not, write to the Free Software Foundation, Inc.,
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Derived in part from the Linux-8086 C library, the GNU C Library, and several
+ other sundry sources.  Files within this library are copyright by their
+ respective copyright holders.
+*/
+
+#include <stddef.h>
+#include <setjmp.h>
+#include <signal.h>
+
+libc_hidden_proto(sigprocmask)
+
+extern int __longjmp(char *env, int val);
+libc_hidden_proto(__longjmp)
+
+extern void _longjmp_unwind (jmp_buf env, int val);
+
+
+/* Restore the signal mask saved in ENV (if __mask_was_saved is set)
+   and jump to the position specified in ENV, causing the setjmp
+   call there to return VAL, or 1 if VAL is 0.  */
+void __libc_siglongjmp (sigjmp_buf env, int val)
+{
+  /* First perform any cleanups needed by the frames being unwound,
+     before the signal mask and machine state are restored.  */
+  _longjmp_unwind (env, val);
+
+  if (env[0].__mask_was_saved)
+    /* Restore the saved signal mask.  */
+    (void) sigprocmask (SIG_SETMASK, &env[0].__saved_mask,
+			  (sigset_t *) NULL);
+
+  /* Restore machine state; `val ?: 1' turns a VAL of 0 into 1.  */
+  __longjmp ((char *) env[0].__jmpbuf, val ?: 1);
+}
+
+__asm__(".weak longjmp; longjmp = __libc_siglongjmp");
+__asm__(".weak _longjmp; _longjmp = __libc_siglongjmp");
+__asm__(".weak siglongjmp; siglongjmp = __libc_siglongjmp");
+strong_alias(__libc_siglongjmp, __libc_longjmp)
diff --git a/libc/sysdeps/linux/sh/pread_write.c b/libc/sysdeps/linux/sh/pread_write.c
index 84a28e766..76c750ad6 100644
--- a/libc/sysdeps/linux/sh/pread_write.c
+++ b/libc/sysdeps/linux/sh/pread_write.c
@@ -18,6 +18,13 @@
 #include <stdint.h>
 #include <endian.h>
 
+#ifdef __UCLIBC_HAS_THREADS_NATIVE__
+#include <sysdep-cancel.h>
+#else
+#define SINGLE_THREAD_P 1
+#endif
+
+
 #ifdef __NR_pread64             /* Newer kernels renamed but it's the same.  */
 # ifdef __NR_pread
 #  error "__NR_pread and __NR_pread64 both defined???"
@@ -32,18 +39,35 @@ static __inline__ _syscall6(ssize_t, __syscall_pread, int, fd, void *, buf,
 		size_t, count, int, dummy, off_t, offset_hi, off_t, offset_lo)
 
 ssize_t __libc_pread(int fd, void *buf, size_t count, off_t offset)
-{
-	return(__syscall_pread(fd,buf,count,0,__LONG_LONG_PAIR(offset >> 31,offset)));
+{ 
+	if (SINGLE_THREAD_P)
+		return(__syscall_pread(fd,buf,count,0,__LONG_LONG_PAIR(offset >> 31,offset)));
+
+#ifdef __UCLIBC_HAS_THREADS_NATIVE__
+	int oldtype = LIBC_CANCEL_ASYNC ();
+	ssize_t result = __syscall_pread(fd,buf,count,0,__LONG_LONG_PAIR(offset >> 31,offset));
+	LIBC_CANCEL_RESET (oldtype);
+	return result;
+#endif	
 }
 weak_alias(__libc_pread,pread)
 
 # ifdef __UCLIBC_HAS_LFS__
 extern __typeof(pread64) __libc_pread64;
 ssize_t __libc_pread64(int fd, void *buf, size_t count, off64_t offset)
-{
+{ 
 	uint32_t low = offset & 0xffffffff;
 	uint32_t high = offset >> 32;
-	return(__syscall_pread(fd, buf, count, 0, __LONG_LONG_PAIR (high, low)));
+
+	if (SINGLE_THREAD_P)
+		return __syscall_pread(fd, buf, count, 0, __LONG_LONG_PAIR (high, low));
+
+#ifdef __UCLIBC_HAS_THREADS_NATIVE__
+	int oldtype = LIBC_CANCEL_ASYNC ();
+	ssize_t result = __syscall_pread(fd, buf, count, 0, __LONG_LONG_PAIR (high, low));
+	LIBC_CANCEL_RESET (oldtype);
+	return result;
+#endif	
 }
 weak_alias(__libc_pread64,pread64)
 # endif /* __UCLIBC_HAS_LFS__  */
@@ -65,18 +89,36 @@ static __inline__ _syscall6(ssize_t, __syscall_pwrite, int, fd, const void *, bu
 		size_t, count, int, dummy, off_t, offset_hi, off_t, offset_lo)
 
 ssize_t __libc_pwrite(int fd, const void *buf, size_t count, off_t offset)
-{
-	return(__syscall_pwrite(fd,buf,count,0,__LONG_LONG_PAIR(offset >> 31,offset)));
+{ 
+	if (SINGLE_THREAD_P)
+		return __syscall_pwrite(fd,buf,count,0,__LONG_LONG_PAIR(offset >> 31,offset));
+
+#ifdef __UCLIBC_HAS_THREADS_NATIVE__
+	int oldtype = LIBC_CANCEL_ASYNC ();
+	ssize_t result = __syscall_pwrite(fd,buf,count,0,__LONG_LONG_PAIR(offset >> 31,offset));
+	LIBC_CANCEL_RESET (oldtype);
+	return result;
+#endif
+
 }
 weak_alias(__libc_pwrite,pwrite)
 
 # ifdef __UCLIBC_HAS_LFS__
 extern __typeof(pwrite64) __libc_pwrite64;
 ssize_t __libc_pwrite64(int fd, const void *buf, size_t count, off64_t offset)
-{
+{ 
 	uint32_t low = offset & 0xffffffff;
 	uint32_t high = offset >> 32;
-	return(__syscall_pwrite(fd, buf, count, 0, __LONG_LONG_PAIR (high, low)));
+
+	if (SINGLE_THREAD_P)
+		return __syscall_pwrite(fd, buf, count, 0, __LONG_LONG_PAIR (high, low));
+
+#ifdef __UCLIBC_HAS_THREADS_NATIVE__
+	int oldtype = LIBC_CANCEL_ASYNC ();
+	ssize_t result = __syscall_pwrite(fd, buf, count, 0, __LONG_LONG_PAIR (high, low));
+	LIBC_CANCEL_RESET (oldtype);
+	return result;
+#endif
 }
 weak_alias(__libc_pwrite64,pwrite64)
 # endif /* __UCLIBC_HAS_LFS__  */
diff --git a/libc/sysdeps/linux/sh/setjmp.S b/libc/sysdeps/linux/sh/setjmp.S
index 00475a008..3296c2ba9 100644
--- a/libc/sysdeps/linux/sh/setjmp.S
+++ b/libc/sysdeps/linux/sh/setjmp.S
@@ -77,7 +77,7 @@ __sigsetjmp_intern:
 	mov.l	r9, @-r4
 	mov.l	r8, @-r4
 
-#ifdef __PIC__
+#ifdef __HAVE_SHARED__ 
 	mov.l	.LG, r2
 	mova	.LG, r0
 	add	r0, r2
diff --git a/libc/sysdeps/linux/sh/syscall_error.S b/libc/sysdeps/linux/sh/syscall_error.S
index f55dd535a..d943dcbb0 100644
--- a/libc/sysdeps/linux/sh/syscall_error.S
+++ b/libc/sysdeps/linux/sh/syscall_error.S
@@ -3,7 +3,7 @@ __syscall_error:
 	/* Call errno_location, store '-r4' in errno and return -1 */
 	mov.l	r12, @-r15
 	sts.l	pr, @-r15
-#ifdef __PIC__
+#ifdef SHARED 
 	mova	.LG, r0
 	mov.l	.LG, r12
 	add	r0, r12
@@ -27,7 +27,7 @@ __syscall_error:
 
 	.align	4
 
-#ifdef __PIC__
+#ifdef SHARED
 1:	.long   __errno_location@GOT
 .LG:	.long	_GLOBAL_OFFSET_TABLE_
 #else
-- 
cgit v1.2.3