4 files changed, 1064 insertions, 128 deletions
diff --git a/erts/include/internal/gcc/ethr_atomic.h b/erts/include/internal/gcc/ethr_atomic.h
index f598f8537b..62eed78f76 100644
--- a/erts/include/internal/gcc/ethr_atomic.h
+++ b/erts/include/internal/gcc/ethr_atomic.h
@@ -1,7 +1,7 @@
 /*
  * %CopyrightBegin%
  *
- * Copyright Ericsson AB 2010-2011. All Rights Reserved.
+ * Copyright Ericsson AB 2010-2015. All Rights Reserved.
  *
  * The contents of this file are subject to the Erlang Public License,
  * Version 1.1, (the "License"); you may not use this file except in
@@ -18,78 +18,81 @@
  */
 
 /*
- * Description: Native atomics ethread support using gcc's builtins
+ * Description: Native atomics ethread support using gcc's __atomic
+ *              and __sync builtins
  * Author: Rickard Green
+ *
+ * Note: The C11 memory model implemented by gcc's __atomic
+ *       builtins does not match the ethread API very well.
+ *
+ *       Due to this we cannot use the __ATOMIC_SEQ_CST
+ *       memory model. For more information see the comment
+ *       in the begining of ethr_membar.h in this directory.
  */
 
 #undef ETHR_INCLUDE_ATOMIC_IMPL__
-#if !defined(ETHR_GCC_ATOMIC32_H__) && defined(ETHR_ATOMIC_WANT_32BIT_IMPL__)
-#define ETHR_GCC_ATOMIC32_H__
-#if defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP32)
-#  define ETHR_INCLUDE_ATOMIC_IMPL__ 4
-#endif
+#if !defined(ETHR_GCC_ATOMIC_ATOMIC32_H__)		\
+    && defined(ETHR_ATOMIC_WANT_32BIT_IMPL__)		\
+    && ((ETHR_HAVE___sync_val_compare_and_swap & 4)	\
+	|| (ETHR_HAVE___atomic_compare_exchange_n & 4))
+
+#define ETHR_GCC_ATOMIC_ATOMIC32_H__
+#define ETHR_INCLUDE_ATOMIC_IMPL__ 4
 #undef ETHR_ATOMIC_WANT_32BIT_IMPL__
-#elif !defined(ETHR_GCC_ATOMIC64_H__) && defined(ETHR_ATOMIC_WANT_64BIT_IMPL__)
-#define ETHR_GCC_ATOMIC64_H__
-#if defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP64)
-#  define ETHR_INCLUDE_ATOMIC_IMPL__ 8
-#endif
-#undef ETHR_ATOMIC_WANT_64BIT_IMPL__
-#endif
 
-#ifdef ETHR_INCLUDE_ATOMIC_IMPL__
+#elif !defined(ETHR_GCC_ATOMIC64_H__)			\
+    && defined(ETHR_ATOMIC_WANT_64BIT_IMPL__)		\
+    && ((ETHR_HAVE___sync_val_compare_and_swap & 8)	\
+	|| (ETHR_HAVE___atomic_compare_exchange_n & 8))
 
-#ifndef ETHR_GCC_ATOMIC_COMMON__
-#define ETHR_GCC_ATOMIC_COMMON__
+#define ETHR_GCC_ATOMIC64_H__
+#define ETHR_INCLUDE_ATOMIC_IMPL__ 8
+#undef ETHR_ATOMIC_WANT_64BIT_IMPL__
 
-#define ETHR_READ_AND_SET_WITHOUT_SYNC_OP__ 0
-#if defined(__i386__) || defined(__x86_64__) || defined(__sparc__) \
-    || defined(__powerpc__) || defined(__ppc__) || defined(__mips__)
-#  undef ETHR_READ_AND_SET_WITHOUT_SYNC_OP__
-#  define ETHR_READ_AND_SET_WITHOUT_SYNC_OP__ 1
 #endif
 
-#endif /* ETHR_GCC_ATOMIC_COMMON__ */
+#ifdef ETHR_INCLUDE_ATOMIC_IMPL__
 
 #if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
 #define ETHR_HAVE_NATIVE_ATOMIC32 1
-#define ETHR_NATIVE_ATOMIC32_IMPL "gcc"
 #define ETHR_NATMC_FUNC__(X) ethr_native_atomic32_ ## X
 #define ETHR_ATMC_T__ ethr_native_atomic32_t
 #define ETHR_AINT_T__ ethr_sint32_t
-#if defined(ETHR_HAVE___SYNC_ADD_AND_FETCH32)
-#  define ETHR_HAVE___SYNC_ADD_AND_FETCH
-#endif
-#if defined(ETHR_HAVE___SYNC_FETCH_AND_AND32)
-#  define ETHR_HAVE___SYNC_FETCH_AND_AND
-#endif 
-#if defined(ETHR_HAVE___SYNC_FETCH_AND_OR32)
-#  define ETHR_HAVE___SYNC_FETCH_AND_OR
+#if ((ETHR_HAVE___sync_val_compare_and_swap & 4) \
+     && (ETHR_HAVE___atomic_compare_exchange_n & 4))
+#  define ETHR_NATIVE_ATOMIC32_IMPL "gcc_atomic_and_sync_builtins"
+#elif (ETHR_HAVE___atomic_compare_exchange_n & 4)
+#  define ETHR_NATIVE_ATOMIC32_IMPL "gcc_atomic_builtins"
+#elif (ETHR_HAVE___sync_val_compare_and_swap & 4)
+#  define ETHR_NATIVE_ATOMIC32_IMPL "gcc_sync_builtins"
+#else
+#  error "!?"
 #endif
 #elif ETHR_INCLUDE_ATOMIC_IMPL__ == 8
 #define ETHR_HAVE_NATIVE_ATOMIC64 1
-#define ETHR_NATIVE_ATOMIC64_IMPL "gcc"
 #define ETHR_NATMC_FUNC__(X) ethr_native_atomic64_ ## X
 #define ETHR_ATMC_T__ ethr_native_atomic64_t
 #define ETHR_AINT_T__ ethr_sint64_t
-#if defined(ETHR_HAVE___SYNC_ADD_AND_FETCH64)
-#  define ETHR_HAVE___SYNC_ADD_AND_FETCH
-#endif
-#if defined(ETHR_HAVE___SYNC_FETCH_AND_AND64)
-#  define ETHR_HAVE___SYNC_FETCH_AND_AND
-#endif 
-#if defined(ETHR_HAVE___SYNC_FETCH_AND_OR64)
-#  define ETHR_HAVE___SYNC_FETCH_AND_OR
+#if ((ETHR_HAVE___sync_val_compare_and_swap & 8) \
+     && (ETHR_HAVE___atomic_compare_exchange_n & 8))
+#  define ETHR_NATIVE_ATOMIC64_IMPL "gcc_atomic_and_sync_builtins"
+#elif (ETHR_HAVE___atomic_compare_exchange_n & 8)
+#  define ETHR_NATIVE_ATOMIC64_IMPL "gcc_atomic_builtins"
+#elif (ETHR_HAVE___sync_val_compare_and_swap & 8)
+#  define ETHR_NATIVE_ATOMIC64_IMPL "gcc_sync_builtins"
+#else
+#  error "!?"
 #endif
 #else
 #error "Unsupported integer size"
 #endif
 
+#undef ETHR_NATIVE_ATOMIC_IMPL__
+
 typedef struct {
-    volatile ETHR_AINT_T__ counter;
+    volatile ETHR_AINT_T__ value;
 } ETHR_ATMC_T__;
 
-
 #if defined(ETHR_TRY_INLINE_FUNCS) || defined(ETHR_ATOMIC_IMPL__)
 
 #if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
@@ -98,13 +101,19 @@ typedef struct {
 #  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_ADDR 1
 #endif
 
+
 static ETHR_INLINE ETHR_AINT_T__ *
 ETHR_NATMC_FUNC__(addr)(ETHR_ATMC_T__ *var)
 {
-    return (ETHR_AINT_T__ *) &var->counter;
+    return (ETHR_AINT_T__ *) &var->value;
 }
 
-#if ETHR_READ_AND_SET_WITHOUT_SYNC_OP__
+/*
+ * set()
+ */
+#if (ETHR_HAVE___atomic_store_n & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if (ETHR_GCC_RELAXED_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
 
 #if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
 #  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_SET 1
@@ -115,12 +124,109 @@ ETHR_NATMC_FUNC__(addr)(ETHR_ATMC_T__ *var)
 static ETHR_INLINE void
 ETHR_NATMC_FUNC__(set)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ value)
 {
-    var->counter = value;
+    __atomic_store_n(&var->value, value, __ATOMIC_RELAXED);
 }
 
-#endif /* ETHR_READ_AND_SET_WITHOUT_SYNC_OP__ */
+#endif /* ETHR_GCC_RELAXED_VERSIONS__ */
 
-#if ETHR_READ_AND_SET_WITHOUT_SYNC_OP__
+#if (ETHR_GCC_RELB_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_SET_RELB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_SET_RELB 1
+#endif
+
+static ETHR_INLINE void
+ETHR_NATMC_FUNC__(set_relb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ value)
+{
+    __atomic_store_n(&var->value, value, __ATOMIC_RELEASE);
+}
+
+#endif /* ETHR_GCC_RELB_VERSIONS__ */
+
+#elif (ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if (ETHR_GCC_RELAXED_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_SET 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_SET 1
+#endif
+
+static ETHR_INLINE void
+ETHR_NATMC_FUNC__(set)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ value)
+{
+    var->value = value;
+}
+
+#endif /* ETHR_GCC_RELAXED_VERSIONS__ */
+
+#if (ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE_RELB__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if (ETHR_GCC_RELB_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_SET_RELB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_SET_RELB 1
+#endif
+
+static ETHR_INLINE void
+ETHR_NATMC_FUNC__(set_relb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ value)
+{
+    var->value = value;
+}
+
+#endif /* ETHR_GCC_RELB_VERSIONS__ */
+
+#endif /* ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE_RELB__ */
+
+#endif /* ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE__ */
+
+/*
+ * read()
+ */
+
+#if (ETHR_HAVE___atomic_load_n & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if (ETHR_GCC_RELAXED_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_READ 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_READ 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(read)(ETHR_ATMC_T__ *var)
+{
+    return __atomic_load_n(&var->value, __ATOMIC_RELAXED);
+}
+
+#endif /* ETHR_GCC_RELAXED_VERSIONS__ */
+
+#if ((ETHR_GCC_ACQB_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__) \
+     & ~ETHR___atomic_load_ACQUIRE_barrier_bug)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_READ_ACQB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_READ_ACQB 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(read_acqb)(ETHR_ATMC_T__ *var)
+{
+    return __atomic_load_n(&var->value, __ATOMIC_ACQUIRE);
+}
+
+#endif /* ETHR_GCC_ACQB_VERSIONS__ */
+
+#elif (ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if (ETHR_GCC_RELAXED_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
 
 #if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
 #  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_READ 1
@@ -131,12 +237,90 @@ ETHR_NATMC_FUNC__(set)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ value)
 static ETHR_INLINE ETHR_AINT_T__
 ETHR_NATMC_FUNC__(read)(ETHR_ATMC_T__ *var)
 {
-    return var->counter;
+    return var->value;
+}
+
+#endif /* ETHR_GCC_RELAXED_VERSIONS__ */
+
+#if (ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD_ACQB__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if (ETHR_GCC_ACQB_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_READ_ACQB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_READ_ACQB 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(read_acqb)(ETHR_ATMC_T__ *var)
+{
+    return var->value;
+}
+
+#endif /* ETHR_GCC_ACQB_VERSIONS__ */
+
+#endif /* ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD_ACQB__ */
+
+#endif /* ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD__ */
+
+/*
+ * add_return()
+ */
+#if (ETHR_HAVE___atomic_add_fetch & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if (ETHR_GCC_RELAXED_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_ADD_RETURN 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_ADD_RETURN 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(add_return)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ incr)
+{
+    return __atomic_add_fetch(&var->value, incr, __ATOMIC_RELAXED);
 }
 
-#endif /* ETHR_READ_AND_SET_WITHOUT_SYNC_OP__ */
+#endif /* ETHR_GCC_RELAXED_MOD_VERSIONS__ */
 
-#if defined(ETHR_HAVE___SYNC_ADD_AND_FETCH)
+#if (ETHR_GCC_ACQB_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_ADD_RETURN_ACQB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_ADD_RETURN_ACQB 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(add_return_acqb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ incr)
+{
+    return __atomic_add_fetch(&var->value, incr, __ATOMIC_ACQUIRE);
+}
+
+#endif /* ETHR_GCC_ACQB_MOD_VERSIONS__ */
+
+#if (ETHR_GCC_RELB_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_ADD_RETURN_RELB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_ADD_RETURN_RELB 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(add_return_relb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ incr)
+{
+    return __atomic_add_fetch(&var->value, incr, __ATOMIC_RELEASE);
+}
+
+#endif /* ETHR_GCC_RELB_MOD_VERSIONS__ */
+
+#endif /* ETHR_HAVE___atomic_add_fetch */
+
+#if ((ETHR_HAVE___sync_add_and_fetch & ETHR_INCLUDE_ATOMIC_IMPL__) \
+     & ETHR_GCC_MB_MOD_VERSIONS__)
 
 #if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
 #  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_ADD_RETURN_MB 1
@@ -147,12 +331,68 @@ ETHR_NATMC_FUNC__(read)(ETHR_ATMC_T__ *var)
 static ETHR_INLINE ETHR_AINT_T__
 ETHR_NATMC_FUNC__(add_return_mb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ incr)
 {
-    return __sync_add_and_fetch(&var->counter, incr);
+    return __sync_add_and_fetch(&var->value, incr);
+}
+
+#endif /* ETHR_HAVE___sync_add_and_fetch */
+
+/*
+ * and_retold()
+ */
+#if (ETHR_HAVE___atomic_fetch_and & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if (ETHR_GCC_RELAXED_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_AND_RETOLD 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_AND_RETOLD 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(and_retold)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ mask)
+{
+    return __atomic_fetch_and(&var->value, mask, __ATOMIC_RELAXED);
+}
+
+#endif /* ETHR_GCC_RELAXED_MOD_VERSIONS__ */
+
+#if (ETHR_GCC_ACQB_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_AND_RETOLD_ACQB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_AND_RETOLD_ACQB 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(and_retold_acqb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ mask)
+{
+    return __atomic_fetch_and(&var->value, mask, __ATOMIC_ACQUIRE);
 }
 
+#endif /* ETHR_GCC_ACQB_MOD_VERSIONS__ */
+
+#if (ETHR_GCC_RELB_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_AND_RETOLD_RELB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_AND_RETOLD_RELB 1
 #endif
 
-#if defined(ETHR_HAVE___SYNC_FETCH_AND_AND)
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(and_retold_relb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ mask)
+{
+    return __atomic_fetch_and(&var->value, mask, __ATOMIC_RELEASE);
+}
+
+#endif /* ETHR_GCC_RELB_MOD_VERSIONS__ */
+
+#endif /* ETHR_HAVE___atomic_fetch_and */
+
+#if ((ETHR_HAVE___sync_fetch_and_and & ETHR_INCLUDE_ATOMIC_IMPL__) \
+     & ETHR_GCC_MB_MOD_VERSIONS__)
 
 #if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
 #  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_AND_RETOLD_MB 1
@@ -163,12 +403,68 @@ ETHR_NATMC_FUNC__(add_return_mb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ incr)
 static ETHR_INLINE ETHR_AINT_T__
 ETHR_NATMC_FUNC__(and_retold_mb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ mask)
 {
-    return __sync_fetch_and_and(&var->counter, mask);
+    return __sync_fetch_and_and(&var->value, mask);
+}
+
+#endif /* ETHR_HAVE___sync_fetch_and_and */
+
+/*
+ * or_retold()
+ */
+#if (ETHR_HAVE___atomic_fetch_or & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if (ETHR_GCC_RELAXED_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_OR_RETOLD 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_OR_RETOLD 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(or_retold)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ mask)
+{
+    return __atomic_fetch_or(&var->value, mask, __ATOMIC_RELAXED);
+}
+
+#endif /* ETHR_GCC_RELAXED_MOD_VERSIONS__ */
+
+#if (ETHR_GCC_ACQB_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_OR_RETOLD_ACQB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_OR_RETOLD_ACQB 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(or_retold_acqb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ mask)
+{
+    return __atomic_fetch_or(&var->value, mask, __ATOMIC_ACQUIRE);
 }
 
+#endif /* ETHR_GCC_ACQB_MOD_VERSIONS__ */
+
+#if (ETHR_GCC_RELB_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_OR_RETOLD_RELB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_OR_RETOLD_RELB 1
 #endif
 
-#if defined(ETHR_HAVE___SYNC_FETCH_AND_OR)
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(or_retold_relb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ mask)
+{
+    return __atomic_fetch_or(&var->value, mask, __ATOMIC_RELEASE);
+}
+
+#endif /* ETHR_GCC_RELB_MOD_VERSIONS__ */
+
+#endif /* ETHR_HAVE___atomic_fetch_or */
+
+#if ((ETHR_HAVE___sync_fetch_and_or & ETHR_INCLUDE_ATOMIC_IMPL__) \
+     & ETHR_GCC_MB_MOD_VERSIONS__)
 
 #if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
 #  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_OR_RETOLD_MB 1
@@ -179,11 +475,73 @@ ETHR_NATMC_FUNC__(and_retold_mb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ mask)
 static ETHR_INLINE ETHR_AINT_T__
 ETHR_NATMC_FUNC__(or_retold_mb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ mask)
 {
-    return (ETHR_AINT_T__) __sync_fetch_and_or(&var->counter, mask);
+    return (ETHR_AINT_T__) __sync_fetch_and_or(&var->value, mask);
+}
+
+#endif /* ETHR_HAVE___sync_fetch_and_or */
+
+/*
+ * cmpxchg()
+ */
+#if (ETHR_HAVE___atomic_compare_exchange_n & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if (ETHR_GCC_RELAXED_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_CMPXCHG 1
+#endif
+
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(cmpxchg)(ETHR_ATMC_T__ *var,
+			   ETHR_AINT_T__ new,
+			   ETHR_AINT_T__ exp)
+{
+    ETHR_AINT_T__ xchg = exp;
+    if (__atomic_compare_exchange_n(&var->value,
+				    &xchg,
+				    new,
+				    0, /* No spurious failures, please */
+				    __ATOMIC_RELAXED,
+				    __ATOMIC_RELAXED))
+	return exp;
+    return xchg;
 }
 
+#endif /* ETHR_GCC_RELAXED_MOD_VERSIONS__ */
+
+#if (ETHR_GCC_ACQB_MOD_VERSIONS__ & ETHR_INCLUDE_ATOMIC_IMPL__)
+
+#if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG_ACQB 1
+#else
+#  define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_CMPXCHG_ACQB 1
 #endif
 
+static ETHR_INLINE ETHR_AINT_T__
+ETHR_NATMC_FUNC__(cmpxchg_acqb)(ETHR_ATMC_T__ *var,
+				ETHR_AINT_T__ new,
+				ETHR_AINT_T__ exp)
+{
+    ETHR_AINT_T__ xchg = exp;
+    if (__atomic_compare_exchange_n(&var->value,
+				    &xchg,
+				    new,
+				    0, /* No spurious failures, please */
+				    __ATOMIC_ACQUIRE,
+				    __ATOMIC_ACQUIRE))
+	return exp;
+    return xchg;
+}
+
+#endif /* ETHR_GCC_ACQB_MOD_VERSIONS__ */
+
+#endif /* ETHR_HAVE___atomic_compare_exchange_n */
+
+#if ((ETHR_HAVE___sync_val_compare_and_swap & ETHR_INCLUDE_ATOMIC_IMPL__) \
+     & ETHR_GCC_MB_MOD_VERSIONS__)
+
 #if ETHR_INCLUDE_ATOMIC_IMPL__ == 4
 #  define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG_MB 1
 #else
@@ -195,17 +553,16 @@ ETHR_NATMC_FUNC__(cmpxchg_mb)(ETHR_ATMC_T__ *var,
 			      ETHR_AINT_T__ new,
 			      ETHR_AINT_T__ old)
 {
-    return __sync_val_compare_and_swap(&var->counter, old, new);
+    return __sync_val_compare_and_swap(&var->value, old, new);
 }
 
+#endif /* ETHR_HAVE___sync_val_compare_and_swap */
+
 #endif /* ETHR_TRY_INLINE_FUNCS */
 
 #undef ETHR_NATMC_FUNC__
 #undef ETHR_ATMC_T__
 #undef ETHR_AINT_T__
 #undef ETHR_AINT_SUFFIX__
-#undef ETHR_HAVE___SYNC_ADD_AND_FETCH
-#undef ETHR_HAVE___SYNC_FETCH_AND_AND
-#undef ETHR_HAVE___SYNC_FETCH_AND_OR
 
 #endif
diff --git a/erts/include/internal/gcc/ethr_dw_atomic.h b/erts/include/internal/gcc/ethr_dw_atomic.h
index 6736f9c547..c2c8f85b7b 100644
--- a/erts/include/internal/gcc/ethr_dw_atomic.h
+++ b/erts/include/internal/gcc/ethr_dw_atomic.h
@@ -1,7 +1,7 @@
 /*
  * %CopyrightBegin%
  *
- * Copyright Ericsson AB 2011. All Rights Reserved.
+ * Copyright Ericsson AB 2011-2015. All Rights Reserved.
  *
  * The contents of this file are subject to the Erlang Public License,
  * Version 1.1, (the "License"); you may not use this file except in
@@ -18,35 +18,39 @@
  */
 
 /*
- * Description: Native double word atomics using gcc's builtins
+ * Description: Native double word atomics using gcc's __atomic
+ *              and __sync builtins
  * Author: Rickard Green
+ *
+ * Note: The C11 memory model implemented by gcc's __atomic
+ *       builtins does not match the ethread API very well.
+ *
+ *       Due to this we cannot use the __ATOMIC_SEQ_CST
+ *       memory model. For more information see the comment
+ *       in the begining of ethr_membar.h in this directory.
  */
 
 #undef ETHR_INCLUDE_DW_ATOMIC_IMPL__
-#ifndef ETHR_GCC_DW_ATOMIC_H__
-#  define ETHR_GCC_DW_ATOMIC_H__
-#  if ((ETHR_SIZEOF_PTR == 4 \
-        && defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP64)) \
-       || (ETHR_SIZEOF_PTR == 8 \
-           && defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP128) \
-	   && defined(ETHR_HAVE_INT128_T)))
-#    define ETHR_INCLUDE_DW_ATOMIC_IMPL__
-#  endif
+#if !defined(ETHR_GCC_ATOMIC_DW_ATOMIC_H__)				\
+    && ((ETHR_HAVE___sync_val_compare_and_swap & (2*ETHR_SIZEOF_PTR))	\
+	|| (ETHR_HAVE___atomic_compare_exchange_n & (2*ETHR_SIZEOF_PTR)))
+#  define ETHR_GCC_ATOMIC_DW_ATOMIC_H__
+#  define ETHR_INCLUDE_DW_ATOMIC_IMPL__
 #endif
 
 #ifdef ETHR_INCLUDE_DW_ATOMIC_IMPL__
 #  define ETHR_HAVE_NATIVE_SU_DW_ATOMIC
-#  define ETHR_NATIVE_DW_ATOMIC_IMPL "gcc"
 
-#  if defined(__i386__) || defined(__x86_64__)
-/*
- * If ETHR_RTCHK_USE_NATIVE_DW_ATOMIC_IMPL__ is defined, it will be used
- * at runtime in order to determine if native or fallback implementation
- * should be used.
- */
-#    define ETHR_RTCHK_USE_NATIVE_DW_ATOMIC_IMPL__ \
-       ETHR_X86_RUNTIME_CONF_HAVE_DW_CMPXCHG__
-#  endif
+#if ((ETHR_HAVE___sync_val_compare_and_swap & (2*ETHR_SIZEOF_PTR))	\
+     && (ETHR_HAVE___atomic_compare_exchange_n & (2*ETHR_SIZEOF_PTR)))
+#  define ETHR_NATIVE_DW_ATOMIC_IMPL "gcc_atomic_and_sync_builtins"
+#elif (ETHR_HAVE___atomic_compare_exchange_n & (2*ETHR_SIZEOF_PTR))
+#  define ETHR_NATIVE_DW_ATOMIC_IMPL "gcc_atomic_builtins"
+#elif (ETHR_HAVE___sync_val_compare_and_swap & (2*ETHR_SIZEOF_PTR))
+#  define ETHR_NATIVE_DW_ATOMIC_IMPL "gcc_sync_builtins"
+#else
+#  error "!?"
+#endif
 
 #  if ETHR_SIZEOF_PTR == 4
 #    define ETHR_DW_NATMC_ALIGN_MASK__ 0x7
@@ -89,15 +93,138 @@ typedef union {
 #    define ETHR_DW_DBG_ALIGNED__(PTR)
 #  endif
 
-#define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_ADDR
+
+#define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_ADDR 1
+
 static ETHR_INLINE ethr_sint_t *
 ethr_native_dw_atomic_addr(ethr_native_dw_atomic_t *var)
 {
     return (ethr_sint_t *) ETHR_DW_NATMC_MEM__(var);
 }
 
+#if (ETHR_HAVE___atomic_store_n & (2*ETHR_SIZEOF_PTR))
+
+#if (ETHR_GCC_RELAXED_VERSIONS__ & (2*ETHR_SIZEOF_PTR))
+
+#define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_SET 1
+
+static ETHR_INLINE void
+ethr_native_su_dw_atomic_set(ethr_native_dw_atomic_t *var,
+			     ETHR_NATIVE_SU_DW_SINT_T value)
+{
+    ethr_native_dw_ptr_t p = (ethr_native_dw_ptr_t) ETHR_DW_NATMC_MEM__(var);
+    ETHR_DW_DBG_ALIGNED__(p);
+    __atomic_store_n(p, value, __ATOMIC_RELAXED);
+}
+
+#endif /* ETHR_GCC_RELAXED_VERSIONS__ */
+
+#if (ETHR_GCC_RELB_VERSIONS__ & (2*ETHR_SIZEOF_PTR))
 
-#define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_CMPXCHG_MB
+#define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_SET_RELB 1
+
+static ETHR_INLINE void
+ethr_native_su_dw_atomic_set_relb(ethr_native_dw_atomic_t *var,
+				  ETHR_NATIVE_SU_DW_SINT_T value)
+{
+    ethr_native_dw_ptr_t p = (ethr_native_dw_ptr_t) ETHR_DW_NATMC_MEM__(var);
+    ETHR_DW_DBG_ALIGNED__(p);
+    __atomic_store_n(p, value, __ATOMIC_RELEASE);
+}
+
+#endif /* ETHR_GCC_RELB_VERSIONS__ */
+
+#endif /* ETHR_HAVE___atomic_store_n */
+
+#if (ETHR_HAVE___atomic_load_n & (2*ETHR_SIZEOF_PTR))
+
+#if (ETHR_GCC_RELAXED_VERSIONS__ & (2*ETHR_SIZEOF_PTR))
+
+#define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_READ 1
+
+static ETHR_INLINE ETHR_NATIVE_SU_DW_SINT_T
+ethr_native_su_dw_atomic_read(ethr_native_dw_atomic_t *var)
+{
+    ethr_native_dw_ptr_t p = (ethr_native_dw_ptr_t) ETHR_DW_NATMC_MEM__(var);
+    ETHR_DW_DBG_ALIGNED__(p);
+    return __atomic_load_n(p, __ATOMIC_RELAXED);
+}
+
+#endif /* ETHR_GCC_RELAXED_VERSIONS__ */
+
+#if ((ETHR_GCC_ACQB_VERSIONS__ & (2*ETHR_SIZEOF_PTR))	\
+     & ~ETHR___atomic_load_ACQUIRE_barrier_bug)
+
+#define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_READ_ACQB 1
+
+static ETHR_INLINE ETHR_NATIVE_SU_DW_SINT_T
+ethr_native_su_dw_atomic_read_acqb(ethr_native_dw_atomic_t *var)
+{
+    ethr_native_dw_ptr_t p = (ethr_native_dw_ptr_t) ETHR_DW_NATMC_MEM__(var);
+    ETHR_DW_DBG_ALIGNED__(p);
+    return __atomic_load_n(p, __ATOMIC_ACQUIRE);
+}
+
+#endif /* ETHR_GCC_ACQB_VERSIONS__ */
+
+#endif /* ETHR_HAVE___atomic_load_n */
+
+#if (ETHR_HAVE___atomic_compare_exchange_n & (2*ETHR_SIZEOF_PTR))
+
+#if (ETHR_GCC_RELAXED_MOD_VERSIONS__ & (2*ETHR_SIZEOF_PTR))
+
+#define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_CMPXCHG 1
+
+static ETHR_INLINE ETHR_NATIVE_SU_DW_SINT_T
+ethr_native_su_dw_atomic_cmpxchg(ethr_native_dw_atomic_t *var,
+				 ETHR_NATIVE_SU_DW_SINT_T new,
+				 ETHR_NATIVE_SU_DW_SINT_T exp)
+{
+    ethr_native_dw_ptr_t p = (ethr_native_dw_ptr_t) ETHR_DW_NATMC_MEM__(var);
+    ETHR_NATIVE_SU_DW_SINT_T xchg = exp;
+    ETHR_DW_DBG_ALIGNED__(p);
+    if (__atomic_compare_exchange_n(p,
+				    &xchg,
+				    new,
+				    0,
+				    __ATOMIC_RELAXED,
+				    __ATOMIC_RELAXED))
+	return exp;
+    return xchg;
+}
+
+#endif /* ETHR_GCC_RELAXED_MOD_VERSIONS__ */
+
+#if (ETHR_GCC_ACQB_MOD_VERSIONS__ & (2*ETHR_SIZEOF_PTR))
+
+#define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_CMPXCHG_ACQB 1
+
+static ETHR_INLINE ETHR_NATIVE_SU_DW_SINT_T
+ethr_native_su_dw_atomic_cmpxchg_acqb(ethr_native_dw_atomic_t *var,
+				      ETHR_NATIVE_SU_DW_SINT_T new,
+				      ETHR_NATIVE_SU_DW_SINT_T exp)
+{
+    ethr_native_dw_ptr_t p = (ethr_native_dw_ptr_t) ETHR_DW_NATMC_MEM__(var);
+    ETHR_NATIVE_SU_DW_SINT_T xchg = exp;
+    ETHR_DW_DBG_ALIGNED__(p);
+    if (__atomic_compare_exchange_n(p,
+				    &xchg,
+				    new,
+				    0,
+				    __ATOMIC_ACQUIRE,
+				    __ATOMIC_ACQUIRE))
+	return exp;
+    return xchg;
+}
+
+#endif /* ETHR_GCC_ACQB_MOD_VERSIONS__ */
+
+#endif /* ETHR_HAVE___atomic_compare_exchange_n */
+
+#if ((ETHR_HAVE___sync_val_compare_and_swap & (2*ETHR_SIZEOF_PTR)) \
+     & ETHR_GCC_MB_MOD_VERSIONS__)
+
+#define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_CMPXCHG_MB 1
 
 static ETHR_INLINE ETHR_NATIVE_SU_DW_SINT_T
 ethr_native_su_dw_atomic_cmpxchg_mb(ethr_native_dw_atomic_t *var,
@@ -109,7 +236,8 @@ ethr_native_su_dw_atomic_cmpxchg_mb(ethr_native_dw_atomic_t *var,
     return __sync_val_compare_and_swap(p, old, new);
 }
 
-#endif /* ETHR_TRY_INLINE_FUNCS */
+#endif /* ETHR_HAVE___sync_val_compare_and_swap */
 
-#endif /* ETHR_GCC_DW_ATOMIC_H__ */
+#endif /* ETHR_TRY_INLINE_FUNCS */
 
+#endif /* ETHR_INCLUDE_DW_ATOMIC_IMPL__ */
diff --git a/erts/include/internal/gcc/ethr_membar.h b/erts/include/internal/gcc/ethr_membar.h
index 7d428fc68e..d2d36907f3 100644
--- a/erts/include/internal/gcc/ethr_membar.h
+++ b/erts/include/internal/gcc/ethr_membar.h
@@ -1,7 +1,7 @@
 /*
  * %CopyrightBegin%
  *
- * Copyright Ericsson AB 2011. All Rights Reserved.
+ * Copyright Ericsson AB 2011-2015. All Rights Reserved.
  *
  * The contents of this file are subject to the Erlang Public License,
  * Version 1.1, (the "License"); you may not use this file except in
@@ -18,56 +18,196 @@
  */
 
 /*
- * Description: Memory barriers when using gcc's builtins
+ * Description: Memory barriers when using gcc's __atomic and
+ *              __sync builtins
  * Author: Rickard Green
+ *
+ * Note: The C11 memory model implemented by gcc's __atomic
+ *       builtins does not match the ethread API very well.
+ *
+ *       A function with a barrier postfix in the ethread atomic
+ *       API needs to ensure that all stores and loads are
+ *       ordered around it according to the semantics of the
+ *       barrier specified.
+ *
+ *       The C11 aproch is different. The __atomic builtins
+ *       API takes a memory model parameter. Assuming that all
+ *       memory syncronizations using the involved atomic
+ *       variables are made using this API, the synchronizations
+ *       will adhere to the memory models used. That is, you do
+ *       *not* know how loads and stores will be ordered around
+ *       a specific __atomic operation in the general case. You
+ *       only know the total effect of the combination of
+ *       operations issued will adhere to the model.
+ *
+ *       This limits how we can use the __atomic builtins. What
+ *       we cannot use:
+ *
+ *       1. We cannot rely on __atomic_thread_fence() to issue
+ *          any specific memory barriers at all. This regardless
+ *          of memory model parameter passed. That is, we cannot
+ *          use the __atomic_thread_fence() builtin at all.
+ *
+ *          Why is this? If all __atomic builtins accessing
+ *          memory issue memory barriers, __atomic_thread_fence()
+ *          does not have to issue memory barriers. The
+ *          implementation for the Itanium architecture is an
+ *          example of this. Even using the __ATOMIC_RELAXED
+ *          memory model all __atomic builtins accessing memory
+ *          will issue memory barriers. Due to this no memory
+ *          barriers at all will be issued by
+ *           __atomic_thread_fence() using either one of the
+ *          __ATOMIC_CONSUME, __ATOMIC_ACQUIRE, or
+ *          __ATOMIC_RELEASE memory models.
+ *
+ *       2. We cannot rely on any __atomic builtin with the
+ *          __ATOMIC_SEQ_CST memory model parameters to
+ *          issue any specific memory barriers. That is, we
+ *          cannot use these memory models at all.
+ *
+ *          Why is this? Since all synchronizations is expected
+ *          to be made using the __atomic builtins, memory
+ *          barriers only have to be issued by some of them,
+ *          and you do not know which ones wont issue memory
+ *          barriers.
+ *
+ *          One can easily be fooled into believing that when
+ *          using the __ATOMIC_SEQ_CST memory model on all
+ *          operations, all operations will issue full memory
+ *          barriers. This is however not the case. The
+ *          implementation for the x86_64 architecture is an
+ *          example of this. Since all operations except loads
+ *          issue full memory barriers, no memory barriers at
+ *          all is issued by loads. This could also be
+ *          implemented by issuing a full memory barrier on
+ *          loads, but no barrier at all on stores.
+ *
+ *       What can be used then?
+ *       1. All (legacy) __sync builtins implying full memory
+ *          barriers issued.
+ *       2. All __atomic builtins using the __ATOMIC_RELAXED
+ *          memory model can, of course, be used. This since
+ *          no ordering guarantees at all are made.
+ *       3. All __atomic builtins accessing memory using the
+ *          __ATOMIC_ACQUIRE and __ATOMIC_RELEASE memory
+ *          models. This since an __atomic builtin memory
+ *          access using the __ATOMIC_ACQUIRE must at least
+ *          issue an aquire memory barrier and an __atomic
+ *          builtin memory acess with the __ATOMIC_RELEASE
+ *          memory model must at least issue a release memory
+ *          barrier. Otherwise the two can not be paired.
+ *       4. All __atomic builtins accessing memory using the
+ *          __ATOMIC_CONSUME builtin can be used for the same
+ *          reason __ATOMIC_ACQUIRE can be used. The ethread
+ *          atomic framework implementing the ethread API
+ *          using native implementations does not expect the
+ *          native implementations to produce versions with
+ *          data dependent read barriers, so until the
+ *          framework is changed we haven't got any use for
+ *          for it.
+ *
+ *       For some architectures we have our own memory barrier
+ *       implementations. We prefer to use these since they
+ *       should be as fine grained as possible. For other
+ *       architectures we use the __sync_synchronize() builtin
+ *       which issue a full memory barrier. For these
+ *       architectures we have to assume that all loads and
+ *       stores can be reordered without limitation. That is,
+ *       unnecessary memory barriers will be issued if such
+ *       reordering actually cannot occur.
  */
 
-#ifndef ETHR_GCC_MEMBAR_H__
-#define ETHR_GCC_MEMBAR_H__
+/*
+ * We prefer to use our own memory barrier implementation if
+ * such exist instead of using __sync_synchronize()...
+ */
+#if defined(__i386__) || defined(__x86_64__)
+#  include "../i386/ethr_membar.h"
+#elif defined(__sparc__)
+#  include "../sparc32/ethr_membar.h"
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc64__)
+#  include "../ppc32/ethr_membar.h"
+#elif !defined(ETHR_GCC_ATOMIC_MEMBAR_H__)			\
+    && (ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION			\
+	|| ETHR_HAVE___sync_synchronize				\
+	|| (ETHR_HAVE___sync_val_compare_and_swap & 12))
+#define ETHR_GCC_ATOMIC_MEMBAR_H__
 
 #define ETHR_LoadLoad	(1 << 0)
 #define ETHR_LoadStore	(1 << 1)
 #define ETHR_StoreLoad	(1 << 2)
 #define ETHR_StoreStore	(1 << 3)
 
+#define ETHR_COMPILER_BARRIER __asm__ __volatile__("" : : : "memory")
+
+#if ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION
+
+static __inline__ __attribute__((__always_inline__)) void
+ethr_full_fence__(void)
+{
+    __asm__ __volatile__("dmb sy" : : : "memory");
+}
+
+static __inline__ __attribute__((__always_inline__)) void
+ethr_store_fence__(void)
+{
+    __asm__ __volatile__("dmb st" : : : "memory");
+}
+
+#define ETHR_MEMBAR(B) \
+ ETHR_CHOOSE_EXPR((B) == ETHR_StoreStore, ethr_store_fence__(), ethr_full_fence__())
+
+#elif ETHR_HAVE___sync_synchronize
+
+static __inline__ __attribute__((__always_inline__)) void
+ethr_full_fence__(void)
+{
+    /*
+     * The compiler barriers are here to fix missing clobbers
+     * in __sync_synchronize() when using buggy LLVM
+     * implementation of __sync_synchronize(). They
+     * do not introduce any unnecessary overhead when used
+     * here, so we use them for all systems.
+     */
+    ETHR_COMPILER_BARRIER;
+    __sync_synchronize();
+    ETHR_COMPILER_BARRIER;
+}
+
+#else /* !ETHR_HAVE___sync_synchronize */
+
 /*
- * According to the documentation __sync_synchronize() will
- * issue a full memory barrier. However, __sync_synchronize()
- * is known to erroneously be a noop on at least some
- * platforms with some gcc versions. This has suposedly been
- * fixed in some gcc version, but we don't know from which
- * version. Therefore, we only use it when it has been
- * verified to work. Otherwise we use the workaround
- * below.
+ * Buggy __sync_synchronize(); call __sync_val_compare_and_swap()
+ * instead which imply a full memory barrier (and hope that one
+ * isn't buggy too).
  */
 
-#if defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP32)
+#if (ETHR_HAVE___sync_val_compare_and_swap & 4)
 #  define ETHR_MB_T__ ethr_sint32_t
-#elif defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP64)
+#elif (ETHR_HAVE___sync_val_compare_and_swap & 8)
 #  define ETHR_MB_T__ ethr_sint64_t
-#else
-#  error "No __sync_val_compare_and_swap"
 #endif
-#define ETHR_SYNC_SYNCHRONIZE_WORKAROUND__ \
-do { \
-    volatile ETHR_MB_T__ x___ = 0; \
-    (void) __sync_val_compare_and_swap(&x___, (ETHR_MB_T__) 0, (ETHR_MB_T__) 1); \
-} while (0)
 
-#define ETHR_COMPILER_BARRIER __asm__ __volatile__("" : : : "memory")
+static __inline__ __attribute__((__always_inline__)) void
+ethr_full_fence__(void)
+{
+    volatile ETHR_MB_T__ x = 0;
+    (void) __sync_val_compare_and_swap(&x, (ETHR_MB_T__) 0, (ETHR_MB_T__) 1);
+}
 
-#if defined(__mips__) && ETHR_AT_LEAST_GCC_VSN__(4, 2, 0)
-#  define ETHR_MEMBAR(B) __sync_synchronize()
-#  define ETHR_READ_DEPEND_MEMORY_BARRIER __sync_synchronize()
-#elif ((defined(__powerpc__) || defined(__ppc__)) \
-       && ETHR_AT_LEAST_GCC_VSN__(4, 1, 2))
-#  define ETHR_MEMBAR(B) __sync_synchronize()
-#else /* Use workaround */
-#  define ETHR_MEMBAR(B) \
-     ETHR_SYNC_SYNCHRONIZE_WORKAROUND__
-#  define ETHR_READ_DEPEND_MEMORY_BARRIER \
-     ETHR_SYNC_SYNCHRONIZE_WORKAROUND__
+#endif /* !ETHR_HAVE___sync_synchronize */
+
+#ifndef ETHR_MEMBAR
+#  define ETHR_MEMBAR(B) ethr_full_fence__()
 #endif
 
+/*
+ * Define ETHR_READ_DEPEND_MEMORY_BARRIER for all architechtures
+ * not known to order data dependent loads
+ */
+
+#if !defined(__ia64__) && !defined(__arm__)
+#  define ETHR_READ_DEPEND_MEMORY_BARRIER ETHR_MEMBAR(ETHR_LoadLoad)
+#endif
 
-#endif /* ETHR_GCC_MEMBAR_H__ */
+#endif /* ETHR_GCC_ATOMIC_MEMBAR_H__ */
diff --git a/erts/include/internal/gcc/ethread.h b/erts/include/internal/gcc/ethread.h
index 365a3535cf..be3e1da90e 100644
--- a/erts/include/internal/gcc/ethread.h
+++ b/erts/include/internal/gcc/ethread.h
@@ -1,7 +1,7 @@
 /*
  * %CopyrightBegin%
  *
- * Copyright Ericsson AB 2010-2011. All Rights Reserved.
+ * Copyright Ericsson AB 2010-2015. All Rights Reserved.
  *
  * The contents of this file are subject to the Erlang Public License,
  * Version 1.1, (the "License"); you may not use this file except in
@@ -18,20 +18,292 @@
  */
 
 /*
- * Description: Native atomic ethread support when using gcc
+ * Description: Native atomic ethread support when using gcc's __atomic
+ *              and __sync builtins
  * Author: Rickard Green
  */
 
-#ifndef ETHREAD_GCC_H__
-#define ETHREAD_GCC_H__
-
-#if defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP32) \
-     || defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP64)
+#if !defined(ETHREAD_GCC_NATIVE_H__) && ETHR_GCC_COMPILER
+#define ETHREAD_GCC_NATIVE_H__
 
 #ifndef ETHR_MEMBAR
 #  include "ethr_membar.h"
 #endif
 
+#define ETHR_GCC_VERSIONS_MASK__ 28
+
+#undef ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE__
+#undef ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE_RELB__
+#undef ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD__
+#undef ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD_ACQB__
+#undef ETHR_GCC_RELAXED_VERSIONS__
+#undef ETHR_GCC_RELAXED_MOD_VERSIONS__
+#undef ETHR_GCC_ACQB_VERSIONS__
+#undef ETHR_GCC_ACQB_MOD_VERSIONS__
+#undef ETHR_GCC_RELB_VERSIONS__
+#undef ETHR_GCC_RELB_MOD_VERSIONS__
+#undef ETHR_GCC_MB_MOD_VERSIONS__
+
+/*
+ * True GNU GCCs before version 4.8 do not emit a memory barrier
+ * after the load in the __atomic_load_n(_, __ATOMIC_ACQUIRE)
+ * case (which is needed on most architectures).
+ */
+#undef ETHR___atomic_load_ACQUIRE_barrier_bug
+#if ETHR_GCC_COMPILER != ETHR_GCC_COMPILER_TRUE
+/*
+ * A gcc compatible compiler. We have no information
+ * about the existence of this bug, but we assume
+ * that it is not impossible that it could have
+ * been "inherited". Therefore, until we are certain
+ * that the bug does not exist, we assume that it
+ * does.
+ */
+#  define ETHR___atomic_load_ACQUIRE_barrier_bug ETHR_GCC_VERSIONS_MASK__
+#elif !ETHR_AT_LEAST_GCC_VSN__(4, 8, 0)
+/* True gcc of version < 4.8, i.e., bug exist... */
+#  define ETHR___atomic_load_ACQUIRE_barrier_bug ETHR_GCC_VERSIONS_MASK__
+#else /* True gcc of version >= 4.8 */
+/*
+ * Sizes less than or equal to word size have been fixed,
+ * but double word size has not been fixed.
+ */
+#  if ETHR_SIZEOF_PTR == 8
+#    define ETHR___atomic_load_ACQUIRE_barrier_bug \
+    (~(8|4) & ETHR_GCC_VERSIONS_MASK__)
+#  elif ETHR_SIZEOF_PTR == 4
+#    define ETHR___atomic_load_ACQUIRE_barrier_bug \
+    (~4 & ETHR_GCC_VERSIONS_MASK__)
+#  else
+#    error word size not supported
+#  endif
+#endif
+
+#define ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE__ 0
+#define ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE_RELB__ 0
+#define ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD__ 0
+#define ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD_ACQB__ 0
+#define ETHR_GCC_RELAXED_VERSIONS__ ETHR_GCC_VERSIONS_MASK__
+#define ETHR_GCC_RELAXED_MOD_VERSIONS__ ETHR_GCC_VERSIONS_MASK__
+
+#if ETHR_TRUST_GCC_ATOMIC_BUILTINS_MEMORY_BARRIERS
+#  define ETHR_GCC_ACQB_VERSIONS__ ETHR_GCC_VERSIONS_MASK__
+#  define ETHR_GCC_ACQB_MOD_VERSIONS__ ETHR_GCC_VERSIONS_MASK__
+#  define ETHR_GCC_RELB_VERSIONS__ ETHR_GCC_VERSIONS_MASK__
+#  define ETHR_GCC_RELB_MOD_VERSIONS__ ETHR_GCC_VERSIONS_MASK__
+#else
+/*
+ * This is currently the default (on most platforms) since
+ * we've seen too many memory barrier bugs produced by gcc...
+ */
+#  define ETHR_GCC_ACQB_VERSIONS__ 0
+#  define ETHR_GCC_ACQB_MOD_VERSIONS__ 0
+#  define ETHR_GCC_RELB_VERSIONS__ 0
+#  define ETHR_GCC_RELB_MOD_VERSIONS__ 0
+#endif
+/*
+ * In the general case we do not want any full barrier versions
+ * if we can implement more relaxed ones (using __atomic_* builtins).
+ * This since the implementations normally need extra memory barrier
+ * instructions to implement these. The x86/x86_64 implementations
+ * are an exception see below.
+ */
+#define ETHR_GCC_MB_MOD_VERSIONS__ \
+    (ETHR_GCC_VERSIONS_MASK__ & ~ETHR_HAVE___atomic_compare_exchange_n)
+
+#if ETHR_SIZEOF_PTR == 8
+#  define ETHR_GCC_VOLATILE_BIT_MASK__ 12
+#elif ETHR_SIZEOF_PTR == 4
+#  define ETHR_GCC_VOLATILE_BIT_MASK__ 4
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__sparc__)	\
+    || defined(__powerpc__) || defined(__ppc__) || defined(__mips__)	\
+    || defined(__alpha__) || defined(__ia64__)
+
+/*
+ * Aligned volatile stores and loads of data smaller
+ * than or equal to word size are atomic...
+ */
+#  undef ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE__
+#  define ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE__ ETHR_GCC_VOLATILE_BIT_MASK__
+#  undef ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD__
+#  define ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD__ ETHR_GCC_VOLATILE_BIT_MASK__
+
+#elif defined(__arm__)
+
+/* volatile stores are problematic on some machines... */
+#  undef ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD__
+#  define ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD__ ETHR_GCC_VOLATILE_BIT_MASK__
+
+#endif
+
+#if defined(__ia64__)
+
+/* Volatile stores produce stores with release barriers. */
+#  undef ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE_RELB__
+#  define ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE_RELB__ ETHR_GCC_VOLATILE_BIT_MASK__
+
+/* Volatile loads produce loads with acquire barrier. */
+#  undef ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD_ACQB__
+#  define ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD_ACQB__ ETHR_GCC_VOLATILE_BIT_MASK__
+
+/*
+ * We trust gcc to produce acquire/release barriers on itanium.
+ * Since all atomic ops also have at least acquire or release
+ * barriers (also when passed the relaxed memory model) it
+ * would be very inefficient not to use these as native
+ * barriers on Itanium.
+ */
+#  undef ETHR_GCC_ACQB_VERSIONS__
+#  define ETHR_GCC_ACQB_VERSIONS__ ETHR_GCC_VERSIONS_MASK__
+#  undef ETHR_GCC_ACQB_MOD_VERSIONS__
+#  define ETHR_GCC_ACQB_MOD_VERSIONS__ ETHR_GCC_VERSIONS_MASK__
+#  undef ETHR_GCC_RELB_VERSIONS__
+#  define ETHR_GCC_RELB_VERSIONS__ ETHR_GCC_VERSIONS_MASK__
+#  undef ETHR_GCC_RELB_MOD_VERSIONS__
+#  define ETHR_GCC_RELB_MOD_VERSIONS__ ETHR_GCC_VERSIONS_MASK__
+
+/*
+ * Itanium is not effected by the load acquire
+ * bug since the barrier is part of the instruction
+ * on Itanium (ld.acq), and not a separate instruction
+ * as on most platforms.
+ */
+#  undef ETHR___atomic_load_ACQUIRE_barrier_bug
+#  define ETHR___atomic_load_ACQUIRE_barrier_bug 0
+
+/*
+ * No point exposing relaxed versions since they are
+ * implemended using either acquire or release
+ * barriers.
+ */
+#  undef ETHR_GCC_RELAXED_VERSIONS__
+#  define ETHR_GCC_RELAXED_VERSIONS__ 0
+
+/* #endif defined(__ia64__) */
+#elif defined(__i386__) || defined(__x86_64__)
+
+/*
+ * Want full barrier versions of all modification
+ * operations since all of these are implemented
+ * using locked instructions implying full memory
+ * barriers.
+ */
+#  undef ETHR_GCC_MB_MOD_VERSIONS__
+#  define ETHR_GCC_MB_MOD_VERSIONS__ ETHR_HAVE___sync_val_compare_and_swap
+
+/*
+ * No point exposing acquire/release versions
+ * when we got full memory barrier versions
+ * of modification operations since all of these
+ * are implemented using locked instructions
+ * implying full memory barriers.
+ */
+#  if ETHR_GCC_ACQB_MOD_VERSIONS__
+#    undef ETHR_GCC_ACQB_MOD_VERSIONS__
+#    define ETHR_GCC_ACQB_MOD_VERSIONS__ \
+    (ETHR_GCC_VERSIONS_MASK__ & ~ETHR_HAVE___sync_val_compare_and_swap)
+#  endif
+#  if ETHR_GCC_RELB_MOD_VERSIONS__
+#    undef ETHR_GCC_RELB_MOD_VERSIONS__
+#    define ETHR_GCC_RELB_MOD_VERSIONS__ \
+    (ETHR_GCC_VERSIONS_MASK__ & ~ETHR_HAVE___sync_val_compare_and_swap)
+#  endif
+
+#  ifdef ETHR_X86_OUT_OF_ORDER
+
+/* See above... */
+#    undef ETHR_GCC_RELAXED_MOD_VERSIONS__
+#    define ETHR_GCC_RELAXED_MOD_VERSIONS__ 0
+
+#  else /* !ETHR_X86_OUT_OF_ORDER, i.e., we don't use any x86-OOO instructions... */
+
+/*
+ * Not effected by the load acquire barrier bug,
+ * since no barrier at all is needed for a load
+ * acquire...
+ */
+#    undef ETHR___atomic_load_ACQUIRE_barrier_bug
+#    define ETHR___atomic_load_ACQUIRE_barrier_bug 0
+
+/* Stores imply release barriers semantics. */
+#    undef ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE_RELB__
+#    define ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE_RELB__ ETHR_GCC_VOLATILE_BIT_MASK__
+
+/* Loads imply acquire barrier semantics. */
+#    undef ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD_ACQB__
+#    define ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD_ACQB__ ETHR_GCC_VOLATILE_BIT_MASK__
+
+/*
+ * Trust load acquire and store release for sizes
+ * where volatile operation implies these barrier
+ * semantics since no barriers are needed.
+ */
+#    if !ETHR_GCC_ACQB_VERSIONS__
+#      undef ETHR_GCC_ACQB_VERSIONS__
+#      define ETHR_GCC_ACQB_VERSIONS__ ETHR_GCC_VOLATILE_BIT_MASK__
+#    endif
+#    if !ETHR_GCC_RELB_VERSIONS__
+#      undef ETHR_GCC_RELB_VERSIONS__
+#      define ETHR_GCC_RELB_VERSIONS__ ETHR_GCC_VOLATILE_BIT_MASK__
+#    endif
+
+/*
+ * No point exposing relaxed versions at all since
+ * all mod operations are implemented with locked
+ * instructions implying full memory barriers and
+ * volatile store and load imply release and
+ * acquire barrier semantics.
+ */
+#    undef ETHR_GCC_RELAXED_VERSIONS__
+#    define ETHR_GCC_RELAXED_VERSIONS__ 0
+
+#  endif /* !ETHR_X86_OUT_OF_ORDER */
+
+/* #endif defined(__i386__) || defined(__x86_64__) */
+#elif defined(__powerpc__) || defined(__ppc__)
+
+#  if !defined(ETHR_PPC_HAVE_LWSYNC)
+/*
+ * Release barriers are typically implemented using
+ * the lwsync instruction. We want our runtime
+ * configure test to determine if the lwsync
+ * instruction is available on the system or not
+ * before we use it. Therefore, do not implement any
+ * native ops using the __ATOMIC_RELEASE model.
+ */
+#    undef ETHR_GCC_RELB_VERSIONS__
+#    define ETHR_GCC_RELB_VERSIONS__ 0
+#    if defined(ETHR_GCC_IMPLEMENT_ACQB_USING_LWSYNC)
+/*
+ * Acquire barriers are usually implemented by other means
+ * than lwsync, but can be implemented using lwsync. Define
+ * ETHR_GCC_IMPLEMENT_ACQB_USING_LWSYNC if acquire barriers
+ * are implemented using lwsync.
+ */
+#      undef ETHR_GCC_ACQB_VERSIONS__
+#      define ETHR_GCC_ACQB_VERSIONS__ 0
+#    endif
+#  endif
+
+#endif /* defined(__powerpc__) || defined(__ppc__) */
+
+#if !ETHR_GCC_RELAXED_VERSIONS__
+#  undef ETHR_GCC_RELAXED_MOD_VERSIONS__
+#  define ETHR_GCC_RELAXED_MOD_VERSIONS__ 0
+#endif
+
+#if !ETHR_GCC_ACQB_VERSIONS__
+#  undef ETHR_GCC_ACQB_MOD_VERSIONS__
+#  define ETHR_GCC_ACQB_MOD_VERSIONS__ 0
+#endif
+
+#if !ETHR_GCC_RELB_VERSIONS__
+#  undef ETHR_GCC_RELB_MOD_VERSIONS__
+#  define ETHR_GCC_RELB_MOD_VERSIONS__ 0
+#endif
+
 #if !defined(ETHR_HAVE_NATIVE_ATOMIC32)
 #  define ETHR_ATOMIC_WANT_32BIT_IMPL__
 #  include "ethr_atomic.h"
@@ -42,12 +314,51 @@
 #  include "ethr_atomic.h"
 #endif
 
+#if defined(__x86_64__)
+/*
+ * No instructions available for native implementation
+ * of these for dw-atomics...
+ */
+#  undef ETHR_GCC_RELAXED_VERSIONS__
+#  define ETHR_GCC_RELAXED_VERSIONS__ 0
+#  undef ETHR_GCC_ACQB_VERSIONS__
+#  define ETHR_GCC_ACQB_VERSIONS__ 0
+#  undef ETHR_GCC_RELB_VERSIONS__
+#  define ETHR_GCC_RELB_VERSIONS__ 0
+#endif
+
+#if !ETHR_GCC_RELAXED_VERSIONS__
+#  undef ETHR_GCC_RELAXED_MOD_VERSIONS__
+#  define ETHR_GCC_RELAXED_MOD_VERSIONS__ 0
+#endif
+
+#if !ETHR_GCC_ACQB_VERSIONS__
+#  undef ETHR_GCC_ACQB_MOD_VERSIONS__
+#  define ETHR_GCC_ACQB_MOD_VERSIONS__ 0
+#endif
+
+#if !ETHR_GCC_RELB_VERSIONS__
+#  undef ETHR_GCC_RELB_MOD_VERSIONS__
+#  define ETHR_GCC_RELB_MOD_VERSIONS__ 0
+#endif
+
 #if (!defined(ETHR_HAVE_NATIVE_DW_ATOMIC) \
      && !(ETHR_SIZEOF_PTR == 4 && defined(ETHR_HAVE_NATIVE_ATOMIC64)) \
      && !(ETHR_SIZEOF_PTR == 8 && defined(ETHR_HAVE_NATIVE_ATOMIC128)))
 #  include "ethr_dw_atomic.h"
 #endif
 
-#endif
+#undef ETHR___atomic_load_ACQUIRE_barrier_bug
+#undef ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE__
+#undef ETHR_GCC_VOLATILE_STORE_IS_ATOMIC_STORE_RELB__
+#undef ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD__
+#undef ETHR_GCC_VOLATILE_LOAD_IS_ATOMIC_LOAD_ACQB__
+#undef ETHR_GCC_RELAXED_VERSIONS__
+#undef ETHR_GCC_RELB_VERSIONS__
+#undef ETHR_GCC_RELB_VERSIONS__
+#undef ETHR_GCC_RELAXED_MOD_VERSIONS__
+#undef ETHR_GCC_ACQB_MOD_VERSIONS__
+#undef ETHR_GCC_RELB_MOD_VERSIONS__
+#undef ETHR_GCC_MB_MOD_VERSIONS__
 
-#endif
+#endif /* ETHREAD_GCC_NATIVE_H__ */