path: root/erts/include/internal/gcc/ethr_membar.h
/*
 * %CopyrightBegin%
 *
 * Copyright Ericsson AB 2011-2015. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * %CopyrightEnd%
 */

/*
 * Description: Memory barriers when using gcc's __atomic and
 *              __sync builtins
 * Author: Rickard Green
 *
 * Note: The C11 memory model implemented by gcc's __atomic
 *       builtins does not match the ethread API very well.
 *
 *       A function with a barrier postfix in the ethread atomic
 *       API needs to ensure that all stores and loads are
 *       ordered around it according to the semantics of the
 *       barrier specified.
 *
 *       The C11 approach is different. The __atomic builtins
 *       API takes a memory model parameter. Assuming that all
 *       memory synchronizations using the involved atomic
 *       variables are made using this API, the synchronizations
 *       will adhere to the memory models used. That is, you do
 *       *not* know how loads and stores will be ordered around
 *       a specific __atomic operation in the general case. You
 *       only know that the total effect of the combination of
 *       operations issued will adhere to the model.
 *
 *       This limits how we can use the __atomic builtins. What
 *       we cannot use:
 *
 *       1. We cannot rely on __atomic_thread_fence() to issue
 *          any specific memory barriers at all. This holds
 *          regardless of the memory model parameter passed.
 *          That is, we cannot use the __atomic_thread_fence()
 *          builtin at all.
 *
 *          Why is this? If all __atomic builtins accessing
 *          memory issue memory barriers, __atomic_thread_fence()
 *          does not have to issue memory barriers. The
 *          implementation for the Itanium architecture is an
 *          example of this. Even when using the __ATOMIC_RELAXED
 *          memory model, all __atomic builtins accessing memory
 *          will issue memory barriers. Due to this, no memory
 *          barriers at all will be issued by
 *          __atomic_thread_fence() using any of the
 *          __ATOMIC_CONSUME, __ATOMIC_ACQUIRE, or
 *          __ATOMIC_RELEASE memory models.
 *
 *       2. We cannot rely on any __atomic builtin with the
 *          __ATOMIC_SEQ_CST memory model parameter to
 *          issue any specific memory barriers. That is, we
 *          cannot use this memory model at all.
 *
 *          Why is this? Since all synchronizations are expected
 *          to be made using the __atomic builtins, memory
 *          barriers only have to be issued by some of them,
 *          and you do not know which ones won't issue memory
 *          barriers.
 *
 *          One can easily be fooled into believing that when
 *          using the __ATOMIC_SEQ_CST memory model on all
 *          operations, all operations will issue full memory
 *          barriers. This is, however, not the case. The
 *          implementation for the x86_64 architecture is an
 *          example of this. Since all operations except loads
 *          issue full memory barriers, no memory barriers at
 *          all are issued by loads. This could also be
 *          implemented by issuing a full memory barrier on
 *          loads, but no barrier at all on stores.
 *
 *       What can be used then?
 *       1. All (legacy) __sync builtins, which imply that full
 *          memory barriers are issued.
 *       2. All __atomic builtins using the __ATOMIC_RELAXED
 *          memory model can, of course, be used. This since
 *          no ordering guarantees at all are made.
 *       3. All __atomic builtins accessing memory using the
 *          __ATOMIC_ACQUIRE and __ATOMIC_RELEASE memory
 *          models. This since an __atomic builtin memory
 *          access using the __ATOMIC_ACQUIRE memory model
 *          must at least issue an acquire memory barrier,
 *          and an __atomic builtin memory access using the
 *          __ATOMIC_RELEASE memory model must at least issue
 *          a release memory barrier. Otherwise the two cannot
 *          be paired. (A hedged sketch of such a pairing
 *          follows this comment block.)
 *       4. All __atomic builtins accessing memory using the
 *          __ATOMIC_CONSUME memory model can be used for the
 *          same reason __ATOMIC_ACQUIRE can be used. The
 *          ethread atomic framework implementing the ethread
 *          API using native implementations does not expect
 *          the native implementations to produce versions
 *          with data dependent read barriers, so until the
 *          framework is changed we haven't got any use
 *          for it.
 *
 *       For some architectures we have our own memory barrier
 *       implementations. We prefer to use these since they
 *       should be as fine-grained as possible. For other
 *       architectures we use the __sync_synchronize() builtin,
 *       which issues a full memory barrier. For these
 *       architectures we have to assume that all loads and
 *       stores can be reordered without limitation. That is,
 *       unnecessary memory barriers will be issued if such
 *       reordering actually cannot occur.
 */
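
/*
 * Illustration only (guarded out below): a minimal sketch, using the
 * hypothetical names example_data, example_flag, example_publish,
 * example_consume, and example_full_barrier, of how a store using
 * the __ATOMIC_RELEASE memory model pairs with a load using the
 * __ATOMIC_ACQUIRE memory model, in contrast to the full memory
 * barrier implied by the legacy __sync builtins. Nothing below is
 * part of this header's implementation.
 */
#if 0 /* illustration only; never compiled */

static int example_data;
static int example_flag;

static __inline__ void
example_publish(void)
{
    example_data = 42;
    /*
     * The release store orders the store to example_data before
     * the store to example_flag.
     */
    __atomic_store_n(&example_flag, 1, __ATOMIC_RELEASE);
}

static __inline__ int
example_consume(void)
{
    /*
     * The acquire load orders the load of example_flag before the
     * subsequent load of example_data.
     */
    while (!__atomic_load_n(&example_flag, __ATOMIC_ACQUIRE))
        ;
    return example_data; /* observes 42 once example_flag reads 1 */
}

static __inline__ void
example_full_barrier(void)
{
    /*
     * Legacy alternative: the __sync builtins imply full memory
     * barriers, which is what the fallbacks below rely on.
     */
    __sync_synchronize();
}

#endif /* illustration only */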

/*
 * We prefer to use our own memory barrier implementation, if
 * one exists, instead of using __sync_synchronize()...
 */
#if defined(__i386__) || defined(__x86_64__)
#  include "../i386/ethr_membar.h"
#elif defined(__sparc__)
#  include "../sparc32/ethr_membar.h"
#elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc64__)
#  include "../ppc32/ethr_membar.h"
#elif !defined(ETHR_GCC_ATOMIC_MEMBAR_H__)			\
    && (ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION			\
	|| ETHR_HAVE___sync_synchronize				\
	|| (ETHR_HAVE___sync_val_compare_and_swap & 12))
#define ETHR_GCC_ATOMIC_MEMBAR_H__

#define ETHR_LoadLoad	(1 << 0)
#define ETHR_LoadStore	(1 << 1)
#define ETHR_StoreLoad	(1 << 2)
#define ETHR_StoreStore	(1 << 3)

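/*
 * Note: the empty asm with a "memory" clobber below only prevents
 * the compiler from reordering memory accesses across it; it emits
 * no hardware fence instruction.
 */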
#define ETHR_COMPILER_BARRIER __asm__ __volatile__("" : : : "memory")

#if ETHR_HAVE_GCC_ASM_ARM_DMB_INSTRUCTION

static __inline__ __attribute__((__always_inline__)) void
ethr_full_fence__(void)
{
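    /*
     * "dmb sy" is a full data memory barrier; it orders all prior
     * loads and stores against all subsequent ones.
     */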
    __asm__ __volatile__("dmb sy" : : : "memory");
}

static __inline__ __attribute__((__always_inline__)) void
ethr_store_fence__(void)
{
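    /*
     * "dmb st" only orders stores against stores, i.e. it acts as
     * a StoreStore barrier.
     */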
    __asm__ __volatile__("dmb st" : : : "memory");
}

#define ETHR_MEMBAR(B) \
 ETHR_CHOOSE_EXPR((B) == ETHR_StoreStore, ethr_store_fence__(), ethr_full_fence__())

#elif ETHR_HAVE___sync_synchronize

static __inline__ __attribute__((__always_inline__)) void
ethr_full_fence__(void)
{
    /*
     * The compiler barriers are here to fix missing clobbers
     * in __sync_synchronize() when using a buggy LLVM
     * implementation of __sync_synchronize(). They do not
     * introduce any unnecessary overhead when used here, so
     * we use them for all systems.
     */
    ETHR_COMPILER_BARRIER;
    __sync_synchronize();
    ETHR_COMPILER_BARRIER;
}

#else /* !ETHR_HAVE___sync_synchronize */

/*
 * Buggy __sync_synchronize(); call __sync_val_compare_and_swap()
 * instead, which implies a full memory barrier (and hope that
 * one isn't buggy too).
 */

#if (ETHR_HAVE___sync_val_compare_and_swap & 4)
#  define ETHR_MB_T__ ethr_sint32_t
#elif (ETHR_HAVE___sync_val_compare_and_swap & 8)
#  define ETHR_MB_T__ ethr_sint64_t
#endif

static __inline__ __attribute__((__always_inline__)) void
ethr_full_fence__(void)
{
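    /*
     * Dummy compare-and-swap on a local volatile variable; the
     * __sync builtin implies a full memory barrier.
     */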
    volatile ETHR_MB_T__ x = 0;
    (void) __sync_val_compare_and_swap(&x, (ETHR_MB_T__) 0, (ETHR_MB_T__) 1);
}

#endif /* !ETHR_HAVE___sync_synchronize */

#ifndef ETHR_MEMBAR
#  define ETHR_MEMBAR(B) ethr_full_fence__()
#endif
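
/*
 * Usage note (illustrative): callers request a barrier by combining
 * the flags defined above, for example
 *
 *     ETHR_MEMBAR(ETHR_LoadLoad|ETHR_LoadStore);    acquire-style
 *     ETHR_MEMBAR(ETHR_LoadStore|ETHR_StoreStore);  release-style
 *
 * The fallback definition above maps every combination to
 * ethr_full_fence__(); the ARM variant additionally maps a pure
 * ETHR_StoreStore request to the cheaper store fence.
 */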

/*
 * Define ETHR_READ_DEPEND_MEMORY_BARRIER for all architectures
 * not known to order data dependent loads.
 */

#if !defined(__ia64__) && !defined(__arm__)
#  define ETHR_READ_DEPEND_MEMORY_BARRIER ETHR_MEMBAR(ETHR_LoadLoad)
#endif
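
/*
 * Illustrative sketch (hypothetical names p, q, v): a data dependent
 * load is a load whose address depends on the value of an earlier
 * load, e.g.
 *
 *     p = q->next;                       first load
 *     ETHR_READ_DEPEND_MEMORY_BARRIER;   orders the loads where needed
 *     v = p->value;                      load depending on p
 *
 * For the architectures covered by this fallback the macro expands
 * to a LoadLoad barrier; on ia64 and ARM it is intentionally left
 * undefined here.
 */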

#endif /* ETHR_GCC_ATOMIC_MEMBAR_H__ */