/*
* %CopyrightBegin%
*
* Copyright Ericsson AB 2011-2015. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* %CopyrightEnd%
*/
/*
* Description: Memory barriers when using gcc's __atomic and
* __sync builtins
* Author: Rickard Green
*
* Note: The C11 memory model implemented by gcc's __atomic
* builtins does not match the ethread API very well.
*
* A function with a barrier postfix in the ethread atomic
* API needs to ensure that all stores and loads are
* ordered around it according to the semantics of the
* barrier specified.
*
* The C11 approach is different. The __atomic builtins
* API takes a memory model parameter. Assuming that all
* memory synchronizations using the involved atomic
* variables are made using this API, the synchronizations
* will adhere to the memory models used. That is, you do
* *not* know how loads and stores will be ordered around
* a specific __atomic operation in the general case. You
* only know that the total effect of the combination of
* operations issued will adhere to the model.
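*
* As an illustrative contrast: an ethread operation such
* as "read with full memory barrier" has to order all
* surrounding loads and stores around that specific call,
* whereas
*
*     v = __atomic_load_n(&a, __ATOMIC_SEQ_CST);
*
* is only required to take part in a consistent ordering
* together with the other __atomic operations on the
* involved variables; the barriers needed for that may be
* issued by other operations.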
*
* This limits how we can use the __atomic builtins. What
* we cannot use:
*
* 1. We cannot rely on __atomic_thread_fence() to issue
* any specific memory barriers at all, regardless of the
* memory model parameter passed. That is, we cannot
* use the __atomic_thread_fence() builtin at all.
*
* Why is this? If all __atomic builtins accessing
* memory issue memory barriers, __atomic_thread_fence()
* does not have to issue any memory barriers. The
* implementation for the Itanium architecture is an
* example of this: even with the __ATOMIC_RELAXED
* memory model, all __atomic builtins accessing memory
* issue memory barriers. Due to this, no memory
* barriers at all are issued by
* __atomic_thread_fence() when using any of the
* __ATOMIC_CONSUME, __ATOMIC_ACQUIRE, or
* __ATOMIC_RELEASE memory models.
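*
* That is, a sketch like the following (the macro name
* is hypothetical) cannot be used to implement an
* ethread acquire barrier:
*
*     #define ETHR_MEMBAR_ACQ__() \
*         __atomic_thread_fence(__ATOMIC_ACQUIRE)
*
* since such an implementation is allowed to emit no
* instructions at all and still satisfy the C11 model.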
*
* 2. We cannot rely on any __atomic builtin with the
* __ATOMIC_SEQ_CST memory model parameter to
* issue any specific memory barriers. That is, we
* cannot use this memory model at all.
*
* Why is this? Since all synchronization is expected
* to be made using the __atomic builtins, memory
* barriers only have to be issued by some of them,
* and you do not know which ones won't issue memory
* barriers.
*
* One can easily be fooled into believing that when
* using the __ATOMIC_SEQ_CST memory model on all
* operations, all operations will issue full memory
* barriers. This is however not the case. The
* implementation for the x86_64 architecture is an
* example of this. Since all operations except loads
* issue full memory barriers, no memory barriers at
* all are issued by loads. This could also be
* implemented by issuing a full memory barrier on
* loads, but no barrier at all on stores.
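*
* Concretely, on such an x86_64 implementation a load
* like
*
*     r = __atomic_load_n(&a, __ATOMIC_SEQ_CST);
*
* may compile to a plain load instruction with no
* barrier at all, so it cannot be used where the
* ethread API requires barriers around the read.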
*
* What can be used then?
* 1. All (legacy) __sync builtins, which imply that a
* full memory barrier is issued.
* 2. All __atomic builtins using the __ATOMIC_RELAXED
* memory model can, of course, be used. This since
* no ordering guarantees at all are made.
* 3. All __atomic builtins accessing memory using the
* __ATOMIC_ACQUIRE and __ATOMIC_RELEASE memory
* models. This since an __atomic builtin memory
* access using the __ATOMIC_ACQUIRE model must at
* least issue an acquire memory barrier and an
* __atomic builtin memory access using the
* __ATOMIC_RELEASE memory model must at least
* issue a release memory barrier. Otherwise the
* two could not be paired (see the sketch after
* this list).
* 4. All __atomic builtins accessing memory using the
* __ATOMIC_CONSUME memory model can be used for
* the same reason __ATOMIC_ACQUIRE can be used.
* The ethread atomic framework implementing the
* ethread API using native implementations does
* not expect the native implementations to produce
* versions with data dependent read barriers, so
* until the framework is changed we haven't got
* any use for it.
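*
* A minimal sketch of the acquire/release pairing
* referred to in 3 (variable names are hypothetical):
*
*     // thread 1
*     data = 42;
*     __atomic_store_n(&flag, 1, __ATOMIC_RELEASE);
*
*     // thread 2
*     while (!__atomic_load_n(&flag, __ATOMIC_ACQUIRE))
*         ;
*     r = data; // guaranteed to read 42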
*
* For some architectures we have our own memory barrier
* implementations. We prefer to use these since they
* should be as fine grained as possible. For other
* architectures we use the __sync_synchronize() builtin,
* which issues a full memory barrier. For these
* architectures we have to assume that all loads and
* stores can be reordered without limitation. That is,
* unnecessary memory barriers will be issued if such
* reordering actually cannot occur.
*/
/*
* We prefer to use our own memory barrier implementation, if
* one exists, instead of using __sync_synchronize()...
*/
#if defined(__i386__) || defined(__x86_64__)
# include "../i386/ethr_membar.h"
#elif defined(__sparc__)
# include "../sparc32/ethr_membar.h"
#elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc64__)
# include "../ppc32/ethr_membar.h"
#elif !defined(ETHR_GCC_ATOMIC_MEMBAR_H__) \
&& (ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION \
|| ETHR_HAVE___sync_synchronize \
|| (ETHR_HAVE___sync_val_compare_and_swap & 12))
#define ETHR_GCC_ATOMIC_MEMBAR_H__
#define ETHR_LoadLoad (1 << 0)
#define ETHR_LoadStore (1 << 1)
#define ETHR_StoreLoad (1 << 2)
#define ETHR_StoreStore (1 << 3)
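/*
* ETHR_MEMBAR(B), defined below, expects B to be a bitwise OR of the
* barrier bits above, e.g. ETHR_MEMBAR(ETHR_LoadStore|ETHR_StoreStore)
* for a release style barrier.
*/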
#define ETHR_COMPILER_BARRIER __asm__ __volatile__("" : : : "memory")
#if ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION
static __inline__ __attribute__((__always_inline__)) void
ethr_full_fence__(void)
{
__asm__ __volatile__("dmb sy" : : : "memory");
}
static __inline__ __attribute__((__always_inline__)) void
ethr_store_fence__(void)
{
__asm__ __volatile__("dmb st" : : : "memory");
}
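/*
* "dmb st" only orders stores against stores, so it is sufficient for
* a pure StoreStore barrier; all other combinations of barrier bits
* fall back to the full "dmb sy" barrier.
*/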
#define ETHR_MEMBAR(B) \
ETHR_CHOOSE_EXPR((B) == ETHR_StoreStore, ethr_store_fence__(), ethr_full_fence__())
#elif ETHR_HAVE___sync_synchronize
static __inline__ __attribute__((__always_inline__)) void
ethr_full_fence__(void)
{
/*
* The compiler barriers are here to fix missing clobbers
* in __sync_synchronize() when using a buggy LLVM
* implementation of __sync_synchronize(). They
* do not introduce any unnecessary overhead when used
* here, so we use them for all systems.
*/
ETHR_COMPILER_BARRIER;
__sync_synchronize();
ETHR_COMPILER_BARRIER;
}
#else /* !ETHR_HAVE___sync_synchronize */
/*
* Buggy __sync_synchronize(); call __sync_val_compare_and_swap()
* instead, which implies a full memory barrier (and hope that it
* isn't buggy too).
*/
#if (ETHR_HAVE___sync_val_compare_and_swap & 4)
# define ETHR_MB_T__ ethr_sint32_t
#elif (ETHR_HAVE___sync_val_compare_and_swap & 8)
# define ETHR_MB_T__ ethr_sint64_t
#endif
static __inline__ __attribute__((__always_inline__)) void
ethr_full_fence__(void)
{
volatile ETHR_MB_T__ x = 0;
(void) __sync_val_compare_and_swap(&x, (ETHR_MB_T__) 0, (ETHR_MB_T__) 1);
}
#endif /* !ETHR_HAVE___sync_synchronize */
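/*
* If no finer grained ETHR_MEMBAR() was defined above, map all barrier
* combinations onto the full fence.
*/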
#ifndef ETHR_MEMBAR
# define ETHR_MEMBAR(B) ethr_full_fence__()
#endif
/*
* Define ETHR_READ_DEPEND_MEMORY_BARRIER for all architectures
* not known to order data dependent loads.
*/
#if !defined(__ia64__) && !defined(__arm__)
# define ETHR_READ_DEPEND_MEMORY_BARRIER ETHR_MEMBAR(ETHR_LoadLoad)
#endif
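/*
* Illustrative (hypothetical) use of ETHR_READ_DEPEND_MEMORY_BARRIER;
* the barrier goes between reading a shared pointer and the data
* dependent load made through it:
*
*     struct foo *p = shared_ptr;
*     ETHR_READ_DEPEND_MEMORY_BARRIER;
*     use(p->field);
*
* On architectures known to order data dependent loads (ia64 and arm
* above) the macro is intentionally left undefined by this file.
*/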
#endif /* ETHR_GCC_ATOMIC_MEMBAR_H__ */