aboutsummaryrefslogtreecommitdiffstats
path: root/erts/include/internal/gcc/ethr_membar.h
diff options
context:
space:
mode:
authorRickard Green <[email protected]>2015-03-20 22:05:10 +0100
committerRickard Green <[email protected]>2015-03-20 22:05:10 +0100
commitb99ce1c3a64f24e68b8a937338546fdf60501c65 (patch)
treef05aca3b1f87413ea9f04cd9b3569597da0f996f /erts/include/internal/gcc/ethr_membar.h
parent6642d62c071f94d3e76254453099e2df01f7ad0e (diff)
parent24fa075b5c0d54f2035a2ff510a82aa19187eda4 (diff)
downloadotp-b99ce1c3a64f24e68b8a937338546fdf60501c65.tar.gz
otp-b99ce1c3a64f24e68b8a937338546fdf60501c65.tar.bz2
otp-b99ce1c3a64f24e68b8a937338546fdf60501c65.zip
Merge branch 'rickard/gcc-atomics/OTP-12383'
* rickard/gcc-atomics/OTP-12383: Improve ethread atomics based on GCC builtins Conflicts: erts/aclocal.m4
Diffstat (limited to 'erts/include/internal/gcc/ethr_membar.h')
-rw-r--r--erts/include/internal/gcc/ethr_membar.h208
1 files changed, 174 insertions, 34 deletions
diff --git a/erts/include/internal/gcc/ethr_membar.h b/erts/include/internal/gcc/ethr_membar.h
index 7d428fc68e..d2d36907f3 100644
--- a/erts/include/internal/gcc/ethr_membar.h
+++ b/erts/include/internal/gcc/ethr_membar.h
@@ -1,7 +1,7 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 2011. All Rights Reserved.
+ * Copyright Ericsson AB 2011-2015. All Rights Reserved.
*
* The contents of this file are subject to the Erlang Public License,
* Version 1.1, (the "License"); you may not use this file except in
@@ -18,56 +18,196 @@
*/
/*
- * Description: Memory barriers when using gcc's builtins
+ * Description: Memory barriers when using gcc's __atomic and
+ * __sync builtins
* Author: Rickard Green
+ *
+ * Note: The C11 memory model implemented by gcc's __atomic
+ * builtins does not match the ethread API very well.
+ *
+ * A function with a barrier postfix in the ethread atomic
+ * API needs to ensure that all stores and loads are
+ * ordered around it according to the semantics of the
+ * barrier specified.
+ *
+ * The C11 aproch is different. The __atomic builtins
+ * API takes a memory model parameter. Assuming that all
+ * memory syncronizations using the involved atomic
+ * variables are made using this API, the synchronizations
+ * will adhere to the memory models used. That is, you do
+ * *not* know how loads and stores will be ordered around
+ * a specific __atomic operation in the general case. You
+ * only know the total effect of the combination of
+ * operations issued will adhere to the model.
+ *
+ * This limits how we can use the __atomic builtins. What
+ * we cannot use:
+ *
+ * 1. We cannot rely on __atomic_thread_fence() to issue
+ * any specific memory barriers at all. This regardless
+ * of memory model parameter passed. That is, we cannot
+ * use the __atomic_thread_fence() builtin at all.
+ *
+ * Why is this? If all __atomic builtins accessing
+ * memory issue memory barriers, __atomic_thread_fence()
+ * does not have to issue memory barriers. The
+ * implementation for the Itanium architecture is an
+ * example of this. Even using the __ATOMIC_RELAXED
+ * memory model all __atomic builtins accessing memory
+ * will issue memory barriers. Due to this no memory
+ * barriers at all will be issued by
+ * __atomic_thread_fence() using either one of the
+ * __ATOMIC_CONSUME, __ATOMIC_ACQUIRE, or
+ * __ATOMIC_RELEASE memory models.
+ *
+ * 2. We cannot rely on any __atomic builtin with the
+ * __ATOMIC_SEQ_CST memory model parameters to
+ * issue any specific memory barriers. That is, we
+ * cannot use these memory models at all.
+ *
+ * Why is this? Since all synchronizations is expected
+ * to be made using the __atomic builtins, memory
+ * barriers only have to be issued by some of them,
+ * and you do not know which ones wont issue memory
+ * barriers.
+ *
+ * One can easily be fooled into believing that when
+ * using the __ATOMIC_SEQ_CST memory model on all
+ * operations, all operations will issue full memory
+ * barriers. This is however not the case. The
+ * implementation for the x86_64 architecture is an
+ * example of this. Since all operations except loads
+ * issue full memory barriers, no memory barriers at
+ * all is issued by loads. This could also be
+ * implemented by issuing a full memory barrier on
+ * loads, but no barrier at all on stores.
+ *
+ * What can be used then?
+ * 1. All (legacy) __sync builtins implying full memory
+ * barriers issued.
+ * 2. All __atomic builtins using the __ATOMIC_RELAXED
+ * memory model can, of course, be used. This since
+ * no ordering guarantees at all are made.
+ * 3. All __atomic builtins accessing memory using the
+ * __ATOMIC_ACQUIRE and __ATOMIC_RELEASE memory
+ * models. This since an __atomic builtin memory
+ * access using the __ATOMIC_ACQUIRE must at least
+ * issue an aquire memory barrier and an __atomic
+ * builtin memory acess with the __ATOMIC_RELEASE
+ * memory model must at least issue a release memory
+ * barrier. Otherwise the two can not be paired.
+ * 4. All __atomic builtins accessing memory using the
+ * __ATOMIC_CONSUME builtin can be used for the same
+ * reason __ATOMIC_ACQUIRE can be used. The ethread
+ * atomic framework implementing the ethread API
+ * using native implementations does not expect the
+ * native implementations to produce versions with
+ * data dependent read barriers, so until the
+ * framework is changed we haven't got any use for
+ * for it.
+ *
+ * For some architectures we have our own memory barrier
+ * implementations. We prefer to use these since they
+ * should be as fine grained as possible. For other
+ * architectures we use the __sync_synchronize() builtin
+ * which issue a full memory barrier. For these
+ * architectures we have to assume that all loads and
+ * stores can be reordered without limitation. That is,
+ * unnecessary memory barriers will be issued if such
+ * reordering actually cannot occur.
*/
-#ifndef ETHR_GCC_MEMBAR_H__
-#define ETHR_GCC_MEMBAR_H__
+/*
+ * We prefer to use our own memory barrier implementation if
+ * such exist instead of using __sync_synchronize()...
+ */
+#if defined(__i386__) || defined(__x86_64__)
+# include "../i386/ethr_membar.h"
+#elif defined(__sparc__)
+# include "../sparc32/ethr_membar.h"
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc64__)
+# include "../ppc32/ethr_membar.h"
+#elif !defined(ETHR_GCC_ATOMIC_MEMBAR_H__) \
+ && (ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION \
+ || ETHR_HAVE___sync_synchronize \
+ || (ETHR_HAVE___sync_val_compare_and_swap & 12))
+#define ETHR_GCC_ATOMIC_MEMBAR_H__
#define ETHR_LoadLoad (1 << 0)
#define ETHR_LoadStore (1 << 1)
#define ETHR_StoreLoad (1 << 2)
#define ETHR_StoreStore (1 << 3)
+#define ETHR_COMPILER_BARRIER __asm__ __volatile__("" : : : "memory")
+
+#if ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION
+
+static __inline__ __attribute__((__always_inline__)) void
+ethr_full_fence__(void)
+{
+ __asm__ __volatile__("dmb sy" : : : "memory");
+}
+
+static __inline__ __attribute__((__always_inline__)) void
+ethr_store_fence__(void)
+{
+ __asm__ __volatile__("dmb st" : : : "memory");
+}
+
+#define ETHR_MEMBAR(B) \
+ ETHR_CHOOSE_EXPR((B) == ETHR_StoreStore, ethr_store_fence__(), ethr_full_fence__())
+
+#elif ETHR_HAVE___sync_synchronize
+
+static __inline__ __attribute__((__always_inline__)) void
+ethr_full_fence__(void)
+{
+ /*
+ * The compiler barriers are here to fix missing clobbers
+ * in __sync_synchronize() when using buggy LLVM
+ * implementation of __sync_synchronize(). They
+ * do not introduce any unnecessary overhead when used
+ * here, so we use them for all systems.
+ */
+ ETHR_COMPILER_BARRIER;
+ __sync_synchronize();
+ ETHR_COMPILER_BARRIER;
+}
+
+#else /* !ETHR_HAVE___sync_synchronize */
+
/*
- * According to the documentation __sync_synchronize() will
- * issue a full memory barrier. However, __sync_synchronize()
- * is known to erroneously be a noop on at least some
- * platforms with some gcc versions. This has suposedly been
- * fixed in some gcc version, but we don't know from which
- * version. Therefore, we only use it when it has been
- * verified to work. Otherwise we use the workaround
- * below.
+ * Buggy __sync_synchronize(); call __sync_val_compare_and_swap()
+ * instead which imply a full memory barrier (and hope that one
+ * isn't buggy too).
*/
-#if defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP32)
+#if (ETHR_HAVE___sync_val_compare_and_swap & 4)
# define ETHR_MB_T__ ethr_sint32_t
-#elif defined(ETHR_HAVE___SYNC_VAL_COMPARE_AND_SWAP64)
+#elif (ETHR_HAVE___sync_val_compare_and_swap & 8)
# define ETHR_MB_T__ ethr_sint64_t
-#else
-# error "No __sync_val_compare_and_swap"
#endif
-#define ETHR_SYNC_SYNCHRONIZE_WORKAROUND__ \
-do { \
- volatile ETHR_MB_T__ x___ = 0; \
- (void) __sync_val_compare_and_swap(&x___, (ETHR_MB_T__) 0, (ETHR_MB_T__) 1); \
-} while (0)
-#define ETHR_COMPILER_BARRIER __asm__ __volatile__("" : : : "memory")
+static __inline__ __attribute__((__always_inline__)) void
+ethr_full_fence__(void)
+{
+ volatile ETHR_MB_T__ x = 0;
+ (void) __sync_val_compare_and_swap(&x, (ETHR_MB_T__) 0, (ETHR_MB_T__) 1);
+}
-#if defined(__mips__) && ETHR_AT_LEAST_GCC_VSN__(4, 2, 0)
-# define ETHR_MEMBAR(B) __sync_synchronize()
-# define ETHR_READ_DEPEND_MEMORY_BARRIER __sync_synchronize()
-#elif ((defined(__powerpc__) || defined(__ppc__)) \
- && ETHR_AT_LEAST_GCC_VSN__(4, 1, 2))
-# define ETHR_MEMBAR(B) __sync_synchronize()
-#else /* Use workaround */
-# define ETHR_MEMBAR(B) \
- ETHR_SYNC_SYNCHRONIZE_WORKAROUND__
-# define ETHR_READ_DEPEND_MEMORY_BARRIER \
- ETHR_SYNC_SYNCHRONIZE_WORKAROUND__
+#endif /* !ETHR_HAVE___sync_synchronize */
+
+#ifndef ETHR_MEMBAR
+# define ETHR_MEMBAR(B) ethr_full_fence__()
#endif
+/*
+ * Define ETHR_READ_DEPEND_MEMORY_BARRIER for all architechtures
+ * not known to order data dependent loads
+ */
+
+#if !defined(__ia64__) && !defined(__arm__)
+# define ETHR_READ_DEPEND_MEMORY_BARRIER ETHR_MEMBAR(ETHR_LoadLoad)
+#endif
-#endif /* ETHR_GCC_MEMBAR_H__ */
+#endif /* ETHR_GCC_ATOMIC_MEMBAR_H__ */