/*
 * %CopyrightBegin%
 *
 * Copyright Ericsson AB 2011-2015. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * %CopyrightEnd%
 */

/*
 * Description: Memory barriers when using gcc's __atomic and
 *              __sync builtins
 * Author: Rickard Green
 *
 * Note: The C11 memory model implemented by gcc's __atomic
 *       builtins does not match the ethread API very well.
 *
 *       A function with a barrier postfix in the ethread atomic
 *       API needs to ensure that all stores and loads are
 *       ordered around it according to the semantics of the
 *       barrier specified.
 *
 *       The C11 approach is different. The __atomic builtins
 *       API takes a memory model parameter. Assuming that all
 *       memory synchronizations using the involved atomic
 *       variables are made using this API, the synchronizations
 *       will adhere to the memory models used. That is, you do
 *       *not* know how loads and stores will be ordered around
 *       a specific __atomic operation in the general case. You
 *       only know that the total effect of the combination of
 *       operations issued will adhere to the model.
 *
 *       This limits how we can use the __atomic builtins. What
 *       we cannot use:
 *
 *       1. We cannot rely on __atomic_thread_fence() to issue
 *          any specific memory barriers at all, regardless of
 *          the memory model parameter passed. That is, we
 *          cannot use the __atomic_thread_fence() builtin
 *          at all.
 *
 *          Why is this? If all __atomic builtins accessing
 *          memory issue memory barriers, __atomic_thread_fence()
 *          does not have to issue memory barriers. The
 *          implementation for the Itanium architecture is an
 *          example of this. Even using the __ATOMIC_RELAXED
 *          memory model, all __atomic builtins accessing memory
 *          will issue memory barriers. Due to this, no memory
 *          barriers at all will be issued by
 *          __atomic_thread_fence() using any of the
 *          __ATOMIC_CONSUME, __ATOMIC_ACQUIRE, or
 *          __ATOMIC_RELEASE memory models.
 *
 *       2. We cannot rely on any __atomic builtin with the
 *          __ATOMIC_SEQ_CST memory model parameter to issue
 *          any specific memory barriers. That is, we cannot
 *          use this memory model at all.
 *
 *          Why is this? Since all synchronizations are expected
 *          to be made using the __atomic builtins, memory
 *          barriers only have to be issued by some of them,
 *          and you do not know which ones won't issue memory
 *          barriers.
 *
 *          One can easily be fooled into believing that when
 *          using the __ATOMIC_SEQ_CST memory model on all
 *          operations, all operations will issue full memory
 *          barriers. This is, however, not the case. The
 *          implementation for the x86_64 architecture is an
 *          example of this. Since all operations except loads
 *          issue full memory barriers, no memory barriers at
 *          all are issued by loads. This could also be
 *          implemented by issuing a full memory barrier on
 *          loads, but no barrier at all on stores.
 *
 *       What can be used then?
 *
 *       1. All (legacy) __sync builtins, which imply that full
 *          memory barriers are issued.
 *       2. All __atomic builtins using the __ATOMIC_RELAXED
 *          memory model can, of course, be used, since no
 *          ordering guarantees at all are made.
 *       3. All __atomic builtins accessing memory using the
 *          __ATOMIC_ACQUIRE and __ATOMIC_RELEASE memory
 *          models. This since an __atomic builtin memory
 *          access using the __ATOMIC_ACQUIRE model must at
 *          least issue an acquire memory barrier, and an
 *          __atomic builtin memory access with the
 *          __ATOMIC_RELEASE memory model must at least issue
 *          a release memory barrier. Otherwise the two could
 *          not be paired. (See the sketch following this
 *          comment.)
 *       4. All __atomic builtins accessing memory using the
 *          __ATOMIC_CONSUME memory model can be used for the
 *          same reason __ATOMIC_ACQUIRE can be used. The
 *          ethread atomic framework implementing the ethread
 *          API using native implementations does not expect
 *          the native implementations to produce versions with
 *          data dependent read barriers, so until the
 *          framework is changed we haven't got any use for it.
 *
 *       For some architectures we have our own memory barrier
 *       implementations. We prefer to use these since they
 *       should be as fine grained as possible. For other
 *       architectures we use the __sync_synchronize() builtin,
 *       which issues a full memory barrier. For these
 *       architectures we have to assume that all loads and
 *       stores can be reordered without limitation. That is,
 *       unnecessary memory barriers will be issued if such
 *       reordering actually cannot occur.
 */
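/*
 * Illustration only (not part of the ethread API): a minimal
 * sketch of the acquire/release pairing referred to in point 3
 * above, using hypothetical variables `flag' and `data'. If the
 * consumer observes flag == 1 through the acquire load, the
 * pairing with the release store guarantees that it also
 * observes data == 42.
 */
#if 0 /* example; never compiled */
static int data;
static int flag;

static void
producer(void)
{
    data = 42;                                    /* plain store */
    __atomic_store_n(&flag, 1, __ATOMIC_RELEASE); /* release store */
}

static void
consumer(void)
{
    if (__atomic_load_n(&flag, __ATOMIC_ACQUIRE)) { /* acquire load */
        int seen = data; /* guaranteed to read 42 */
        (void) seen;
    }
}
#endif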
/*
 * We prefer to use our own memory barrier implementations when
 * they exist instead of using __sync_synchronize()...
 */
#if defined(__i386__) || defined(__x86_64__)
#  include "../i386/ethr_membar.h"
#elif defined(__sparc__)
#  include "../sparc32/ethr_membar.h"
#elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc64__)
#  include "../ppc32/ethr_membar.h"
#elif !defined(ETHR_GCC_ATOMIC_MEMBAR_H__) \
    && (ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION \
	|| ETHR_HAVE___sync_synchronize \
	|| (ETHR_HAVE___sync_val_compare_and_swap & 12))
#define ETHR_GCC_ATOMIC_MEMBAR_H__

#define ETHR_LoadLoad	(1 << 0)
#define ETHR_LoadStore	(1 << 1)
#define ETHR_StoreLoad	(1 << 2)
#define ETHR_StoreStore	(1 << 3)

#define ETHR_COMPILER_BARRIER __asm__ __volatile__("" : : : "memory")

#if ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION

static __inline__ __attribute__((__always_inline__)) void
ethr_full_fence__(void)
{
    __asm__ __volatile__("dmb sy" : : : "memory");
}

static __inline__ __attribute__((__always_inline__)) void
ethr_store_fence__(void)
{
    __asm__ __volatile__("dmb st" : : : "memory");
}

#define ETHR_MEMBAR(B) \
  ETHR_CHOOSE_EXPR((B) == ETHR_StoreStore, ethr_store_fence__(), ethr_full_fence__())
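/*
 * Usage note (illustration only): callers request barriers by
 * or:ing the ETHR_* flags together. With the definition above,
 * a request for exactly ETHR_StoreStore maps to the cheaper
 * "dmb st", while any other combination, for example
 * ETHR_MEMBAR(ETHR_LoadLoad|ETHR_StoreStore), falls back to
 * the full "dmb sy".
 */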
#elif ETHR_HAVE___sync_synchronize

static __inline__ __attribute__((__always_inline__)) void
ethr_full_fence__(void)
{
    /*
     * The compiler barriers are here to fix missing clobbers
     * in __sync_synchronize() when using a buggy LLVM
     * implementation of __sync_synchronize(). They do not
     * introduce any unnecessary overhead when used here, so
     * we use them for all systems.
     */
    ETHR_COMPILER_BARRIER;
    __sync_synchronize();
    ETHR_COMPILER_BARRIER;
}

#else /* !ETHR_HAVE___sync_synchronize */

/*
 * Buggy __sync_synchronize(); call __sync_val_compare_and_swap()
 * instead, which implies a full memory barrier (and hope that it
 * isn't buggy too).
 */

#if (ETHR_HAVE___sync_val_compare_and_swap & 4)
#  define ETHR_MB_T__ ethr_sint32_t
#elif (ETHR_HAVE___sync_val_compare_and_swap & 8)
#  define ETHR_MB_T__ ethr_sint64_t
#endif

static __inline__ __attribute__((__always_inline__)) void
ethr_full_fence__(void)
{
    volatile ETHR_MB_T__ x = 0;
    (void) __sync_val_compare_and_swap(&x, (ETHR_MB_T__) 0, (ETHR_MB_T__) 1);
}

#endif /* !ETHR_HAVE___sync_synchronize */

#ifndef ETHR_MEMBAR
#  define ETHR_MEMBAR(B) ethr_full_fence__()
#endif

/*
 * Define ETHR_READ_DEPEND_MEMORY_BARRIER for all architectures
 * not known to order data dependent loads.
 */

#if !defined(__ia64__) && !defined(__arm__)
#  define ETHR_READ_DEPEND_MEMORY_BARRIER ETHR_MEMBAR(ETHR_LoadLoad)
#endif

#endif /* ETHR_GCC_ATOMIC_MEMBAR_H__ */
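/*
 * Illustration only: a sketch, with hypothetical names, of where
 * ETHR_READ_DEPEND_MEMORY_BARRIER is meant to go. On architectures
 * not known to order data dependent loads, a barrier is needed
 * between loading a shared pointer and dereferencing it.
 */
#if 0 /* example; never compiled */
typedef struct { int value; } node_t;
static node_t *shared_node;

static int
read_published(void)
{
    node_t *p = shared_node;            /* load the pointer */
#ifdef ETHR_READ_DEPEND_MEMORY_BARRIER
    ETHR_READ_DEPEND_MEMORY_BARRIER;    /* order the dependent load */
#endif
    return p->value;                    /* data dependent load */
}
#endif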