1 files changed, 184 insertions, 43 deletions
diff --git a/erts/include/internal/gcc/ethr_membar.h b/erts/include/internal/gcc/ethr_membar.h
index 7d428fc68e..07960ce040 100644
--- a/erts/include/internal/gcc/ethr_membar.h
+++ b/erts/include/internal/gcc/ethr_membar.h
@@ -1,73 +1,214 @@
 /*
  * %CopyrightBegin%
  *
- * Copyright Ericsson AB 2011. All Rights Reserved.
+ * Copyright Ericsson AB 2011-2015. All Rights Reserved.
  *
- * The contents of this file are subject to the Erlang Public License,
- * Version 1.1, (the "License"); you may not use this file except in
- * compliance with the License. You should have received a copy of the
- * Erlang Public License along with this software. If not, it can be
- * retrieved online at http://www.erlang.org/.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * Software distributed under the License is distributed on an "AS IS"
- * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
- * the License for the specific language governing rights and limitations
- * under the License.
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  *
  * %CopyrightEnd%
  */
 
 /*
- * Description: Memory barriers when using gcc's builtins
+ * Description: Memory barriers when using gcc's __atomic and
+ *              __sync builtins
  * Author: Rickard Green
+ *
+ * Note: The C11 memory model implemented by gcc's __atomic
+ *       builtins does not match the ethread API very well.
+ *
+ *       A function with a barrier postfix in the ethread atomic
+ *       API needs to ensure that all stores and loads are
+ *       ordered around it according to the semantics of the
+ *       barrier specified.
+ *
+ *       The C11 approach is different. The __atomic builtins
+ *       API takes a memory model parameter. Assuming that all
+ *       memory synchronizations using the involved atomic
+ *       variables are made using this API, the synchronizations
+ *       will adhere to the memory models used. That is, you do
+ *       *not* know how loads and stores will be ordered around
+ *       a specific __atomic operation in the general case. You
+ *       only know that the total effect of the combination of
+ *       operations issued will adhere to the model.
+ *
+ *       This limits how we can use the __atomic builtins. What
+ *       we cannot use:
+ *
+ *       1. We cannot rely on __atomic_thread_fence() to issue
+ *          any specific memory barriers at all, regardless of
+ *          the memory model parameter passed. That is, we cannot
+ *          use the __atomic_thread_fence() builtin at all.
+ *
+ *          Why is this? If all __atomic builtins accessing
+ *          memory issue memory barriers, __atomic_thread_fence()
+ *          does not have to issue memory barriers. The
+ *          implementation for the Itanium architecture is an
+ *          example of this. Even using the __ATOMIC_RELAXED
+ *          memory model all __atomic builtins accessing memory
+ *          will issue memory barriers. Due to this no memory
+ *          barriers at all will be issued by
+ *          __atomic_thread_fence() using any one of the
+ *          __ATOMIC_CONSUME, __ATOMIC_ACQUIRE, or
+ *          __ATOMIC_RELEASE memory models.
+ *
+ *       2. We cannot rely on any __atomic builtin with the
+ *          __ATOMIC_SEQ_CST memory model parameter to
+ *          issue any specific memory barriers. That is, we
+ *          cannot use this memory model at all.
+ *
+ *          Why is this? Since all synchronizations are expected
+ *          to be made using the __atomic builtins, memory
+ *          barriers only have to be issued by some of them,
+ *          and you do not know which ones won't issue memory
+ *          barriers.
+ *
+ *          One can easily be fooled into believing that when
+ *          using the __ATOMIC_SEQ_CST memory model on all
+ *          operations, all operations will issue full memory
+ *          barriers. This is however not the case. The
+ *          implementation for the x86_64 architecture is an
+ *          example of this. Since all operations except loads
+ *          issue full memory barriers, no memory barriers at
+ *          all are issued by loads. This could also be
+ *          implemented by issuing a full memory barrier on
+ *          loads, but no barrier at all on stores.
+ *
+ *       What can be used then?
+ *       1. All (legacy) __sync builtins implying full memory
+ *          barriers issued.
+ *       2. All __atomic builtins using the __ATOMIC_RELAXED
+ *          memory model can, of course, be used. This since
+ *          no ordering guarantees at all are made.
+ *       3. All __atomic builtins accessing memory using the
+ *          __ATOMIC_ACQUIRE and __ATOMIC_RELEASE memory
+ *          models. This is since an __atomic builtin memory
+ *          access using the __ATOMIC_ACQUIRE model must at least
+ *          issue an acquire memory barrier and an __atomic
+ *          builtin memory access with the __ATOMIC_RELEASE
+ *          memory model must at least issue a release memory
+ *          barrier. Otherwise the two cannot be paired.
+ *       4. All __atomic builtins accessing memory using the
+ *          __ATOMIC_CONSUME memory model can be used for the
+ *          same reason __ATOMIC_ACQUIRE can be used. The ethread
+ *          atomic framework implementing the ethread API
+ *          using native implementations does not expect the
+ *          native implementations to produce versions with
+ *          data dependent read barriers, so until the
+ *          framework is changed we haven't got any use
+ *          for it.
+ *
+ *       For some architectures we have our own memory barrier
+ *       implementations. We prefer to use these since they
+ *       should be as fine grained as possible. For other
+ *       architectures we use the __sync_synchronize() builtin
+ *       which issues a full memory barrier. For these
+ *       architectures we have to assume that all loads and
+ *       stores can be reordered without limitation. That is,
+ *       unnecessary memory barriers will be issued if such
+ *       reordering actually cannot occur.
  */
 
-#ifndef ETHR_GCC_MEMBAR_H__
-#define ETHR_GCC_MEMBAR_H__
+/*
+ * We prefer to use our own memory barrier implementation if
+ * one exists instead of using __sync_synchronize()...
+ */
+#if defined(__i386__) || defined(__x86_64__)
+#  include "../i386/ethr_membar.h"
+#elif defined(__sparc__)
+#  include "../sparc32/ethr_membar.h"
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc64__)
+#  include "../ppc32/ethr_membar.h"
+#elif !defined(ETHR_GCC_ATOMIC_MEMBAR_H__)			\
+    && (ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION			\
+	|| ETHR_HAVE___sync_synchronize				\
+	|| (ETHR_HAVE___sync_val_compare_and_swap & 12))
+#define ETHR_GCC_ATOMIC_MEMBAR_H__
 
 #define ETHR_LoadLoad	(1 << 0)
 #define ETHR_LoadStore	(1 << 1)
 #define ETHR_StoreLoad	(1 << 2)
 #define ETHR_StoreStore	(1 << 3)
 
+#define ETHR_COMPILER_BARRIER __asm__ __volatile__("" : : : "memory")
+
+#if ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION
+
+static __inline__ __attribute__((__always_inline__)) void
+ethr_full_fence__(void)
+{
+    __asm__ __volatile__("dmb sy" : : : "memory");
+}
+
+static __inline__ __attribute__((__always_inline__)) void
+ethr_store_fence__(void)
+{
+    __asm__ __volatile__("dmb st" : : : "memory");
+}
+
+#define ETHR_MEMBAR(B) \
+ ETHR_CHOOSE_EXPR((B) == ETHR_StoreStore, ethr_store_fence__(), ethr_full_fence__())
+
+#elif ETHR_HAVE___sync_synchronize
+
+static __inline__ __attribute__((__always_inline__)) void
+ethr_full_fence__(void)
+{
+    /*
+     * The compiler barriers are here to fix missing clobbers
+     * in __sync_synchronize() when using a buggy LLVM
+     * implementation of __sync_synchronize(). They
+     * do not introduce any unnecessary overhead when used
+     * here, so we use them for all systems.
+     */
+    ETHR_COMPILER_BARRIER;
+    __sync_synchronize();
+    ETHR_COMPILER_BARRIER;
+}
+
+#else /* !ETHR_HAVE___sync_synchronize */
+
 /*
- * According to the documentation __sync_synchronize() will
- * issue a full memory barrier. However, __sync_synchronize()
- * is known to erroneously be a noop on at least some
- * platforms with some gcc versions. This has suposedly been
- * fixed in some gcc version, but we don't know from which
- * version. Therefore, we only use it when it has been
- * verified to work. Otherwise we use the workaround
- * below.
+ * Buggy __sync_synchronize(); call __sync_val_compare_and_swap()
+ * instead, which implies a full memory barrier (and hope that one
+ * isn't buggy too).
  */
 
-#if defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP32)
+#if (ETHR_HAVE___sync_val_compare_and_swap & 4)
 #  define ETHR_MB_T__ ethr_sint32_t
-#elif defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP64)
+#elif (ETHR_HAVE___sync_val_compare_and_swap & 8)
 #  define ETHR_MB_T__ ethr_sint64_t
-#else
-#  error "No __sync_val_compare_and_swap"
 #endif
-#define ETHR_SYNC_SYNCHRONIZE_WORKAROUND__ \
-do { \
-    volatile ETHR_MB_T__ x___ = 0; \
-    (void) __sync_val_compare_and_swap(&x___, (ETHR_MB_T__) 0, (ETHR_MB_T__) 1); \
-} while (0)
 
-#define ETHR_COMPILER_BARRIER __asm__ __volatile__("" : : : "memory")
+static __inline__ __attribute__((__always_inline__)) void
+ethr_full_fence__(void)
+{
+    volatile ETHR_MB_T__ x = 0;
+    (void) __sync_val_compare_and_swap(&x, (ETHR_MB_T__) 0, (ETHR_MB_T__) 1);
+}
 
-#if defined(__mips__) && ETHR_AT_LEAST_GCC_VSN__(4, 2, 0)
-#  define ETHR_MEMBAR(B) __sync_synchronize()
-#  define ETHR_READ_DEPEND_MEMORY_BARRIER __sync_synchronize()
-#elif ((defined(__powerpc__) || defined(__ppc__)) \
-       && ETHR_AT_LEAST_GCC_VSN__(4, 1, 2))
-#  define ETHR_MEMBAR(B) __sync_synchronize()
-#else /* Use workaround */
-#  define ETHR_MEMBAR(B) \
-     ETHR_SYNC_SYNCHRONIZE_WORKAROUND__
-#  define ETHR_READ_DEPEND_MEMORY_BARRIER \
-     ETHR_SYNC_SYNCHRONIZE_WORKAROUND__
+#endif /* !ETHR_HAVE___sync_synchronize */
+
+#ifndef ETHR_MEMBAR
+#  define ETHR_MEMBAR(B) ethr_full_fence__()
 #endif
 
+/*
+ * Define ETHR_READ_DEPEND_MEMORY_BARRIER for all architectures
+ * not known to order data dependent loads
+ */
+
+#if !defined(__ia64__) && !defined(__arm__)
+#  define ETHR_READ_DEPEND_MEMORY_BARRIER ETHR_MEMBAR(ETHR_LoadLoad)
+#endif
 
-#endif /* ETHR_GCC_MEMBAR_H__ */
+#endif /* ETHR_GCC_ATOMIC_MEMBAR_H__ */