/*
 * %CopyrightBegin%
 *
 * Copyright Ericsson AB 2011-2016. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * %CopyrightEnd%
 */

/*
 * Description: Native double word atomics for x86/x86_64
 * Author: Rickard Green
 */

#ifndef ETHR_X86_DW_ATOMIC_H__
#define ETHR_X86_DW_ATOMIC_H__

#ifdef ETHR_GCC_HAVE_DW_CMPXCHG_ASM_SUPPORT

#define ETHR_HAVE_NATIVE_DW_ATOMIC
#define ETHR_NATIVE_DW_ATOMIC_IMPL "ethread"

/*
 * If ETHR_RTCHK_USE_NATIVE_DW_ATOMIC_IMPL__ is defined, it will be used
 * at runtime in order to determine whether the native or the fallback
 * implementation should be used.
 */
#define ETHR_RTCHK_USE_NATIVE_DW_ATOMIC_IMPL__ \
  ETHR_X86_RUNTIME_CONF_HAVE_DW_CMPXCHG__

#if ETHR_SIZEOF_PTR == 4
typedef volatile ethr_sint64_t * ethr_native_dw_ptr_t;
#  define ETHR_DW_NATMC_ALIGN_MASK__ 0x7
#  define ETHR_DW_CMPXCHG_SFX__ "8b"
#  define ETHR_NATIVE_SU_DW_SINT_T ethr_sint64_t
#else
#ifdef ETHR_HAVE_INT128_T
#  define ETHR_NATIVE_SU_DW_SINT_T ethr_sint128_t
typedef volatile ethr_sint128_t * ethr_native_dw_ptr_t;
#else
typedef struct {
    ethr_sint64_t sint64[2];
} ethr_native_sint128_t__;
typedef volatile ethr_native_sint128_t__ * ethr_native_dw_ptr_t;
#endif
#  define ETHR_DW_NATMC_ALIGN_MASK__ 0xf
#  define ETHR_DW_CMPXCHG_SFX__ "16b"
#endif

/*
 * We need 16 byte aligned memory in 64-bit mode, and 8 byte aligned
 * memory in 32-bit mode. 16 byte aligned malloc in 64-bit mode is
 * not common, and at least some glibc malloc implementations
 * only 4 byte align in 32-bit mode.
 *
 * This code assumes 8 byte aligned memory in 64-bit mode, and 4 byte
 * aligned memory in 32-bit mode. A malloc implementation that does
 * not adhere to these alignment requirements is seriously broken,
 * and we won't bother trying to work around it.
 *
 * Since memory alignment may be off by one word we need to align at
 * runtime. We therefore need an extra word allocated.
 */
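/*
 * A minimal illustration (never compiled; the helper name below is
 * hypothetical) of the runtime alignment adjustment performed by
 * ETHR_DW_NATMC_MEM__() below. Given the word alignment guaranteed
 * above, the low bits of the base address are either 0 or exactly
 * half of the required alignment, so adding them back yields an
 * aligned address, and the extra word allocated keeps that address
 * inside the allocation.
 */
#if 0
static char *
example_dw_align__(char *base)
{
    /* e.g. 32-bit mode: base == 0x1004, mask == 0x7:
       0x1004 + (0x1004 & 0x7) == 0x1008, which is 8 byte aligned */
    return base + (((ethr_uint_t) base) & ETHR_DW_NATMC_ALIGN_MASK__);
}
#endif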
#define ETHR_DW_NATMC_MEM__(VAR) \
   (&(VAR)->c[(int) ((ethr_uint_t) &(VAR)->c[0]) & ETHR_DW_NATMC_ALIGN_MASK__])

typedef union {
#ifdef ETHR_NATIVE_SU_DW_SINT_T
    volatile ETHR_NATIVE_SU_DW_SINT_T dw_sint;
#endif
    volatile ethr_sint_t sint[3];
    volatile char c[ETHR_SIZEOF_PTR*3];
} ethr_native_dw_atomic_t;

#if (defined(ETHR_TRY_INLINE_FUNCS) \
     || defined(ETHR_ATOMIC_IMPL__) \
     || defined(ETHR_X86_SSE2_ASM_C__)) \
    && ETHR_SIZEOF_PTR == 4 \
    && defined(ETHR_GCC_HAVE_SSE2_ASM_SUPPORT)
ethr_sint64_t
ethr_sse2_native_su_dw_atomic_read(ethr_native_dw_atomic_t *var);
void
ethr_sse2_native_su_dw_atomic_set(ethr_native_dw_atomic_t *var,
                                  ethr_sint64_t val);
#endif

#if (defined(ETHR_TRY_INLINE_FUNCS) \
     || defined(ETHR_ATOMIC_IMPL__) \
     || defined(ETHR_X86_SSE2_ASM_C__))

#  ifdef ETHR_DEBUG
#    define ETHR_DW_DBG_ALIGNED__(PTR) \
       ETHR_ASSERT((((ethr_uint_t) (PTR)) & ETHR_DW_NATMC_ALIGN_MASK__) == 0);
#  else
#    define ETHR_DW_DBG_ALIGNED__(PTR)
#  endif

#endif

#if defined(ETHR_TRY_INLINE_FUNCS) || defined(ETHR_ATOMIC_IMPL__)

#define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_ADDR
static ETHR_INLINE ethr_sint_t *
ethr_native_dw_atomic_addr(ethr_native_dw_atomic_t *var)
{
    return (ethr_sint_t *) ETHR_DW_NATMC_MEM__(var);
}
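/*
 * A minimal usage sketch (never compiled; variable names are
 * hypothetical): the type is over-allocated by one word, and the
 * aligned interior address is what all native operations act on.
 */
#if 0
static void
example_dw_addr__(void)
{
    ethr_native_dw_atomic_t var;
    ethr_sint_t *p = ethr_native_dw_atomic_addr(&var);
    /* p is 8 byte aligned in 32-bit mode and 16 byte aligned in
     * 64-bit mode, regardless of the alignment of &var itself */
    ETHR_DW_DBG_ALIGNED__(p);
}
#endif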
#if defined(ETHR_CMPXCHG8B_PIC_NO_CLOBBER_EBX) && defined(__PIC__) && __PIC__
#if ETHR_SIZEOF_PTR != 4
#  error unexpected pic issue
#endif
/*
 * When position independent code is used in 32-bit mode, the EBX
 * register is used to hold the global offset table address. When
 * compiling with an old gcc (< version 5) we may not use EBX as an
 * input or output operand in inline asm; we then need to save and
 * restore the EBX register explicitly (for some reason old gcc
 * compilers didn't provide this service to us).
 * ETHR_CMPXCHG8B_PIC_NO_CLOBBER_EBX will be defined if we need to
 * explicitly manage EBX ourselves.
 */
#  define ETHR_NO_CLOBBER_EBX__ 1
#else
#  define ETHR_NO_CLOBBER_EBX__ 0
#endif

#if ETHR_NO_CLOBBER_EBX__ && !defined(ETHR_CMPXCHG8B_REGISTER_SHORTAGE)
/* When optimization is turned off, we run into a register shortage */
#  if defined(ETHR_DEBUG) || defined(DEBUG) || defined(VALGRIND) \
      || defined(GCOV) || defined(PURIFY) || defined(PURECOV)
#    define ETHR_CMPXCHG8B_REGISTER_SHORTAGE 1
#  else
#    define ETHR_CMPXCHG8B_REGISTER_SHORTAGE 0
#  endif
#endif

#define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_CMPXCHG_MB

static ETHR_INLINE int
ethr_native_dw_atomic_cmpxchg_mb(ethr_native_dw_atomic_t *var,
                                 ethr_sint_t *new,
                                 ethr_sint_t *xchg)
{
    ethr_native_dw_ptr_t p = (ethr_native_dw_ptr_t) ETHR_DW_NATMC_MEM__(var);
    char xchgd;

    ETHR_DW_DBG_ALIGNED__(p);

#if ETHR_NO_CLOBBER_EBX__ && ETHR_CMPXCHG8B_REGISTER_SHORTAGE
    /*
     * gcc won't let us use EBX as input, and we
     * get a register shortage.
     */
    __asm__ __volatile__(
        "pushl %%ebx\n\t"
        "movl (%7), %%ebx\n\t"
        "movl 4(%7), %%ecx\n\t"
        "lock; cmpxchg8b %0\n\t"
        "setz %3\n\t"
        "popl %%ebx\n\t"
        : "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=c"(xchgd)
        : "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "r"(new)
        : "cc", "memory");
#elif ETHR_NO_CLOBBER_EBX__
    /*
     * gcc won't let us use EBX as input.
     */
    __asm__ __volatile__(
        "pushl %%ebx\n\t"
        "movl %8, %%ebx\n\t"
        "lock; cmpxchg8b %0\n\t"
        "setz %3\n\t"
        "popl %%ebx\n\t"
        : "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=q"(xchgd)
        : "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "c"(new[1]), "r"(new[0])
        : "cc", "memory");
#else
    /*
     * gcc lets us place values in the registers where
     * we want them.
     */
    __asm__ __volatile__(
        "lock; cmpxchg" ETHR_DW_CMPXCHG_SFX__ " %0\n\t"
        "setz %3\n\t"
        : "=m"(*p), "=d"(xchg[1]), "=a"(xchg[0]), "=q"(xchgd)
        : "m"(*p), "1"(xchg[1]), "2"(xchg[0]), "c"(new[1]), "b"(new[0])
        : "cc", "memory");
#endif

    return (int) xchgd;
}

#undef ETHR_NO_CLOBBER_EBX__

#if ETHR_SIZEOF_PTR == 4 && defined(ETHR_GCC_HAVE_SSE2_ASM_SUPPORT)

typedef union {
    ethr_sint64_t sint64;
    ethr_sint_t sint[2];
} ethr_dw_atomic_no_sse2_convert_t;

#define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_READ
static ETHR_INLINE ethr_sint64_t
ethr_native_su_dw_atomic_read(ethr_native_dw_atomic_t *var)
{
    if (ETHR_X86_RUNTIME_CONF_HAVE_SSE2__)
        return ethr_sse2_native_su_dw_atomic_read(var);
    else {
        ethr_sint_t new[2];
        ethr_dw_atomic_no_sse2_convert_t xchg;
        /* an arbitrary guess; if the compare fails, the cmpxchg
           writes the actual value into xchg, and if it succeeds,
           the value stored equals the guess already in xchg */
        new[0] = new[1] = xchg.sint[0] = xchg.sint[1] = 0x83838383;
        (void) ethr_native_dw_atomic_cmpxchg_mb(var, new, xchg.sint);
        return xchg.sint64;
    }
}

#define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_SET
static ETHR_INLINE void
ethr_native_su_dw_atomic_set(ethr_native_dw_atomic_t *var,
                             ethr_sint64_t val)
{
    if (ETHR_X86_RUNTIME_CONF_HAVE_SSE2__)
        ethr_sse2_native_su_dw_atomic_set(var, val);
    else {
        ethr_sint_t xchg[2] = {0, 0};
        ethr_dw_atomic_no_sse2_convert_t new;
        new.sint64 = val;
        /* retry until the cmpxchg succeeds; each failed attempt
           updates xchg with the current value */
        while (!ethr_native_dw_atomic_cmpxchg_mb(var, new.sint, xchg));
    }
}

#endif /* ETHR_SIZEOF_PTR == 4 */

#endif /* ETHR_TRY_INLINE_FUNCS */
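/*
 * A minimal sketch (never compiled; the function name is
 * hypothetical) of how a derived double word operation can be
 * layered on top of ethr_native_dw_atomic_cmpxchg_mb(), using the
 * same retry pattern as the set() fallback above: a failed cmpxchg
 * writes the actual memory contents into xchg, so the loop
 * converges.
 */
#if 0
static void
example_dw_add__(ethr_native_dw_atomic_t *var,
                 ethr_sint_t inc0, ethr_sint_t inc1)
{
    ethr_sint_t xchg[2] = {0, 0}; /* guess of the current value */
    ethr_sint_t new[2];
    do {
        new[0] = xchg[0] + inc0;
        new[1] = xchg[1] + inc1;
    } while (!ethr_native_dw_atomic_cmpxchg_mb(var, new, xchg));
}
#endif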
#if defined(ETHR_X86_SSE2_ASM_C__) \
    && ETHR_SIZEOF_PTR == 4 \
    && defined(ETHR_GCC_HAVE_SSE2_ASM_SUPPORT)

/*
 * 8-byte aligned loads and stores of 64-bit values are atomic from
 * the Pentium and forward. An ordinary volatile load or store in
 * 32-bit mode generates two 32-bit operations (at least with
 * gcc-4.1.2 using -msse2). In order to guarantee a single 64-bit
 * load/store operation from/to memory we load/store via an xmm
 * register using movq.
 *
 * Load/store can be achieved using cmpxchg8b; however, using movq is
 * much faster. Unfortunately we cannot do the same thing in 64-bit
 * mode; instead, we have to do loads and stores via cmpxchg16b.
 *
 * We do not inline these functions, but instead compile them into a
 * separate object file using -msse2, since we do not want to use
 * -msse2 for the whole system. If we detect SSE2 support (Pentium 4
 * and forward) at runtime, we use them; otherwise, we fall back to
 * using cmpxchg8b for loads and stores. This way the binary can be
 * moved between processors with and without SSE2 support.
 */

ethr_sint64_t
ethr_sse2_native_su_dw_atomic_read(ethr_native_dw_atomic_t *var)
{
    ethr_native_dw_ptr_t p = (ethr_native_dw_ptr_t) ETHR_DW_NATMC_MEM__(var);
    ethr_sint64_t val;
    ETHR_DW_DBG_ALIGNED__(p);
    __asm__ __volatile__("movq %1, %0\n\t" : "=x"(val) : "m"(*p) : "memory");
    return val;
}

void
ethr_sse2_native_su_dw_atomic_set(ethr_native_dw_atomic_t *var,
                                  ethr_sint64_t val)
{
    ethr_native_dw_ptr_t p = (ethr_native_dw_ptr_t) ETHR_DW_NATMC_MEM__(var);
    ETHR_DW_DBG_ALIGNED__(p);
    __asm__ __volatile__("movq %1, %0\n\t" : "=m"(*p) : "x"(val) : "memory");
}

#endif /* ETHR_X86_SSE2_ASM_C__ */

#endif /* ETHR_GCC_HAVE_DW_CMPXCHG_ASM_SUPPORT */

#endif /* ETHR_X86_DW_ATOMIC_H__ */
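/*
 * A minimal sketch of how the SSE2 helpers above are intended to be
 * built (the file name and include shown are illustrative, not taken
 * from this header): a separate translation unit defines
 * ETHR_X86_SSE2_ASM_C__ and is the only object compiled with -msse2,
 * so the rest of the system stays free of SSE2 code generation.
 *
 *     // x86_sse2_helpers.c -- compiled with: cc -msse2 -c ...
 *     #define ETHR_X86_SSE2_ASM_C__
 *     #include "ethread.h"
 *
 * At runtime, ethr_native_su_dw_atomic_read()/set() dispatch on
 * ETHR_X86_RUNTIME_CONF_HAVE_SSE2__ to these helpers or to the
 * cmpxchg8b based fallback.
 */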