3 files changed, 181 insertions, 58 deletions
diff --git a/erts/aclocal.m4 b/erts/aclocal.m4
index f6d8f20e4e..017fdbd589 100644
--- a/erts/aclocal.m4
+++ b/erts/aclocal.m4
@@ -2075,63 +2075,159 @@ esac
 
 case "$GCC-$host_cpu" in
   yes-i86pc | yes-i*86 | yes-x86_64 | yes-amd64)
+
+    if test $ac_cv_sizeof_void_p = 4; then
+       dw_cmpxchg="cmpxchg8b"
+    else
+       dw_cmpxchg="cmpxchg16b"
+    fi
+
     gcc_dw_cmpxchg_asm=no
-    AC_MSG_CHECKING([for gcc double word cmpxchg asm support])    
-    AC_TRY_COMPILE([],
+    gcc_pic_dw_cmpxchg_asm=no
+    gcc_cflags_pic=no
+    gcc_cmpxchg8b_pic_no_clobber_ebx=no
+    gcc_cmpxchg8b_pic_no_clobber_ebx_register_shortage=no
+
+    save_CFLAGS="$CFLAGS"
+
+    # Check if it works out of the box using passed CFLAGS
+    # and with -fPIC added to CFLAGS if the passed CFLAGS
+    # don't trigger position independent code
+    pic_cmpxchg=unknown
+    while true; do
+
+        case $pic_cmpxchg in
+	  yes) pic_text="pic ";;
+	  *) pic_text="";;
+	esac
+
+	AC_MSG_CHECKING([for gcc $pic_text$dw_cmpxchg plain asm support])    
+
+	plain_cmpxchg=no
+    	AC_TRY_COMPILE([],
 	[
     char xchgd;
     long new[2], xchg[2], *p;		  
     __asm__ __volatile__(
-#if ETHR_SIZEOF_PTR == 4 && defined(__PIC__) && __PIC__
-	"pushl %%ebx\n\t"
-	"movl %8, %%ebx\n\t"
-#endif
 #if ETHR_SIZEOF_PTR == 4
 	"lock; cmpxchg8b %0\n\t"
 #else
 	"lock; cmpxchg16b %0\n\t"
 #endif
 	"setz %3\n\t"
-#if ETHR_SIZEOF_PTR == 4 && defined(__PIC__) && __PIC__
-	"popl %%ebx\n\t"
-#endif
-	: "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=c"(xchgd)
-	: "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "3"(new[1]),
-#if ETHR_SIZEOF_PTR == 4 && defined(__PIC__) && __PIC__
-	  "r"(new[0])
-#else
-	  "b"(new[0])
-#endif
+	: "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=q"(xchgd)
+	: "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "c"(new[1]), "b"(new[0])
 	: "cc", "memory");
+	],
+	[plain_cmpxchg=yes])
 
+	AC_MSG_RESULT([$plain_cmpxchg])
+
+	if test $pic_cmpxchg = yes; then
+	   gcc_pic_dw_cmpxchg_asm=$plain_cmpxchg
+	   break
+	fi
+
+	gcc_dw_cmpxchg_asm=$plain_cmpxchg
+
+    	# If not already compiling to position independent
+	# code add -fPIC to CFLAGS and do it again. We do
+	# this since we also want to know how to compile
+	# to position independent code since this might
+	# cause problems with the use of the EBX register
+	# as input to the asm on 32-bit x86 and old gcc
+	# compilers (gcc vsn < 5).
+
+    	AC_TRY_COMPILE([],
+	[
+#if !defined(__PIC__) || !__PIC__
+#  error no pic
+#endif
 	],
-	[gcc_dw_cmpxchg_asm=yes])
-    if test $gcc_dw_cmpxchg_asm = no && test $ac_cv_sizeof_void_p = 4; then
+	[pic_cmpxchg=yes
+	 gcc_cflags_pic=yes],
+	[pic_cmpxchg=no])
+
+	if test $pic_cmpxchg = yes; then
+	   gcc_pic_dw_cmpxchg_asm=$gcc_dw_cmpxchg_asm
+	   break
+	fi
+
+	CFLAGS="$save_CFLAGS -fPIC"
+	pic_cmpxchg=yes
+
+    done
+
+    if test $gcc_pic_dw_cmpxchg_asm = no && test $ac_cv_sizeof_void_p = 4; then
+
+      AC_MSG_CHECKING([for gcc pic cmpxchg8b asm support with EBX workaround])
+
+      # Check if we can work around it by managing the ebx
+      # register explicitly in the asm...
+
       AC_TRY_COMPILE([],
+	[
+    char xchgd;
+    long new[2], xchg[2], *p;		  
+    __asm__ __volatile__(
+	"pushl %%ebx\n\t"
+	"movl %8, %%ebx\n\t"
+	"lock; cmpxchg8b %0\n\t"
+	"setz %3\n\t"
+	"popl %%ebx\n\t"
+	: "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=q"(xchgd)
+	: "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "c"(new[1]), "r"(new[0])
+	: "cc", "memory");
+	],
+	[gcc_pic_dw_cmpxchg_asm=yes
+	 gcc_cmpxchg8b_pic_no_clobber_ebx=yes])     
+
+      AC_MSG_RESULT([$gcc_pic_dw_cmpxchg_asm])
+
+      if test $gcc_pic_dw_cmpxchg_asm = no; then
+
+      	AC_MSG_CHECKING([for gcc pic cmpxchg8b asm support with EBX and register shortage workarounds])
+        # If no optimization is enabled we sometimes get a
+	# register shortage. Check if we can work around
+	# this...
+
+      	AC_TRY_COMPILE([],
 	  [
       char xchgd;
       long new[2], xchg[2], *p;
-#if !defined(__PIC__) || !__PIC__
-#  error nope
-#endif
       __asm__ __volatile__(
-    	  "pushl %%ebx\n\t"
-	  "movl (%7), %%ebx\n\t"
-	  "movl 4(%7), %%ecx\n\t"
-	  "lock; cmpxchg8b %0\n\t"
-  	  "setz %3\n\t"
-	  "popl %%ebx\n\t"
-	  : "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=c"(xchgd)
-	  : "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "3"(new)
+	"pushl %%ebx\n\t"
+	"movl (%7), %%ebx\n\t"
+	"movl 4(%7), %%ecx\n\t"
+	"lock; cmpxchg8b %0\n\t"
+	"setz %3\n\t"
+	"popl %%ebx\n\t"
+	: "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=c"(xchgd)
+	: "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "r"(new)
 	: "cc", "memory");
 
 	],
-	[gcc_dw_cmpxchg_asm=yes])
-      if test "$gcc_dw_cmpxchg_asm" = "yes"; then
-        AC_DEFINE(ETHR_CMPXCHG8B_REGISTER_SHORTAGE, 1, [Define if you get a register shortage with cmpxchg8b and position independent code])
+	[gcc_pic_dw_cmpxchg_asm=yes
+	 gcc_cmpxchg8b_pic_no_clobber_ebx=yes
+	 gcc_cmpxchg8b_pic_no_clobber_ebx_register_shortage=yes])
+
+        AC_MSG_RESULT([$gcc_pic_dw_cmpxchg_asm])
       fi
+
+      if test $gcc_cflags_pic = yes; then
+        gcc_dw_cmpxchg_asm=$gcc_pic_dw_cmpxchg_asm
+      fi
+ 
+   fi
+
+    CFLAGS="$save_CFLAGS"
+
+    if test "$gcc_cmpxchg8b_pic_no_clobber_ebx" = "yes"; then
+      AC_DEFINE(ETHR_CMPXCHG8B_PIC_NO_CLOBBER_EBX, 1, [Define if gcc wont let you clobber ebx with cmpxchg8b and position independent code])
+    fi
+    if test "$gcc_cmpxchg8b_pic_no_clobber_ebx_register_shortage" = "yes"; then
+      AC_DEFINE(ETHR_CMPXCHG8B_REGISTER_SHORTAGE, 1, [Define if you get a register shortage with cmpxchg8b and position independent code])
     fi
-    AC_MSG_RESULT([$gcc_dw_cmpxchg_asm])
     if test "$gcc_dw_cmpxchg_asm" = "yes"; then
       AC_DEFINE(ETHR_GCC_HAVE_DW_CMPXCHG_ASM_SUPPORT, 1, [Define if you use a gcc that supports the double word cmpxchg instruction])
     fi;;
diff --git a/erts/include/internal/ethread_header_config.h.in b/erts/include/internal/ethread_header_config.h.in
index 9cabd0591a..f4b08cfced 100644
--- a/erts/include/internal/ethread_header_config.h.in
+++ b/erts/include/internal/ethread_header_config.h.in
@@ -166,6 +166,10 @@
 /* Define if you use a gcc that supports the double word cmpxchg instruction */
 #undef ETHR_GCC_HAVE_DW_CMPXCHG_ASM_SUPPORT
 
+/* Define if gcc wont let you clobber ebx with cmpxchg8b and position
+   independent code */
+#undef ETHR_CMPXCHG8B_PIC_NO_CLOBBER_EBX
+
 /* Define if you get a register shortage with cmpxchg8b and position independent code */
 #undef ETHR_CMPXCHG8B_REGISTER_SHORTAGE
 
diff --git a/erts/include/internal/i386/ethr_dw_atomic.h b/erts/include/internal/i386/ethr_dw_atomic.h
index e8c4119ef0..5444a6345c 100644
--- a/erts/include/internal/i386/ethr_dw_atomic.h
+++ b/erts/include/internal/i386/ethr_dw_atomic.h
@@ -115,13 +115,19 @@ ethr_native_dw_atomic_addr(ethr_native_dw_atomic_t *var)
     return (ethr_sint_t *) ETHR_DW_NATMC_MEM__(var);
 }
 
-#if ETHR_SIZEOF_PTR == 4 && defined(__PIC__) && __PIC__
+#if defined(ETHR_CMPXCHG8B_PIC_NO_CLOBBER_EBX) && defined(__PIC__) && __PIC__
+#if ETHR_SIZEOF_PTR != 4
+#  error unexpected pic issue
+#endif
 /*
  * When position independent code is used in 32-bit mode, the EBX register
- * is used for storage of global offset table address, and we may not
- * use it as input or output in an asm. We need to save and restore the
- * EBX register explicitly (for some reason gcc doesn't provide this
- * service to us).
+ * is used for storage of global offset table address. When compiling with
+ * an old gcc (< vsn 5) we may not use it as input or output in an inline
+ * asm. We then need to save and restore the EBX register explicitly (for
+ * some reason old gcc compilers didn't provide this service to us).
+ * ETHR_CMPXCHG8B_PIC_NO_CLOBBER_EBX will be defined if we need to
+ * explicitly manage EBX ourselves.
+ *
  */
 #  define ETHR_NO_CLOBBER_EBX__ 1
 #else
@@ -151,36 +157,53 @@ ethr_native_dw_atomic_cmpxchg_mb(ethr_native_dw_atomic_t *var,
 
     ETHR_DW_DBG_ALIGNED__(p);
 
+#if ETHR_NO_CLOBBER_EBX__ && ETHR_CMPXCHG8B_REGISTER_SHORTAGE
+    /*
+     * gcc won't let us use ebx as input and we
+     * get a register shortage
+     */
+
     __asm__ __volatile__(
-#if ETHR_NO_CLOBBER_EBX__
 	"pushl %%ebx\n\t"
-#  if ETHR_CMPXCHG8B_REGISTER_SHORTAGE
 	"movl (%7), %%ebx\n\t"
 	"movl 4(%7), %%ecx\n\t"
-#  else
-	"movl %8, %%ebx\n\t"
-#  endif
-#endif
-	"lock; cmpxchg" ETHR_DW_CMPXCHG_SFX__ " %0\n\t"
+	"lock; cmpxchg8b %0\n\t"
 	"setz %3\n\t"
-#if ETHR_NO_CLOBBER_EBX__
 	"popl %%ebx\n\t"
-#endif
 	: "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=c"(xchgd)
-	: "m"(*p), "1"(xchg[1]), "2"(xchg[0]),
-#if ETHR_NO_CLOBBER_EBX__
-#  if ETHR_CMPXCHG8B_REGISTER_SHORTAGE
-	  "3"(new)
-#  else
-	  "3"(new[1]),
-	  "r"(new[0])
-#  endif
+	: "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "r"(new)
+	: "cc", "memory");
+
+#elif ETHR_NO_CLOBBER_EBX__
+    /*
+     * gcc won't let us use ebx as input
+     */
+
+    __asm__ __volatile__(
+	"pushl %%ebx\n\t"
+	"movl %8, %%ebx\n\t"
+	"lock; cmpxchg8b %0\n\t"
+	"setz %3\n\t"
+	"popl %%ebx\n\t"
+	: "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=q"(xchgd)
+	: "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "c"(new[1]), "r"(new[0])
+	: "cc", "memory");
+
 #else
-	  "3"(new[1]),
-	  "b"(new[0])
-#endif
+    /*
+     * gcc lets us place values in the registers where
+     * we want them
+     */
+
+    __asm__ __volatile__(
+	"lock; cmpxchg" ETHR_DW_CMPXCHG_SFX__ " %0\n\t"
+	"setz %3\n\t"
+	: "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=q"(xchgd)
+	: "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "c"(new[1]), "b"(new[0])
 	: "cc", "memory");
 
+#endif
+
     return (int) xchgd;
 }