Diffstat (limited to 'erts')
-rw-r--r-- | erts/aclocal.m4 | 22
-rw-r--r-- | erts/configure.in | 1
-rw-r--r-- | erts/emulator/beam/erl_alloc_util.c | 311
-rw-r--r-- | erts/emulator/beam/erl_alloc_util.h | 13
-rw-r--r-- | erts/emulator/beam/erl_bif_port.c | 6
-rw-r--r-- | erts/emulator/beam/erl_process.c | 8
-rw-r--r-- | erts/emulator/internal_doc/CarrierMigration.md | 134
-rw-r--r-- | erts/emulator/internal_doc/SuperCarrier.md | 191
-rw-r--r-- | erts/emulator/sys/common/erl_check_io.c | 4
-rw-r--r-- | erts/emulator/sys/unix/erl_unix_sys.h | 2
-rw-r--r-- | erts/emulator/test/port_SUITE.erl | 22
-rwxr-xr-x | erts/emulator/utils/make_compiler_flags | 2
-rw-r--r-- | erts/include/internal/ethread.h | 2
-rw-r--r-- | erts/include/internal/libatomic_ops/ethr_atomic.h | 343
-rw-r--r-- | erts/include/internal/libatomic_ops/ethr_dw_atomic.h | 567
-rw-r--r-- | erts/include/internal/libatomic_ops/ethread.h | 3
-rw-r--r-- | erts/include/internal/ppc32/atomic.h | 146
17 files changed, 1680 insertions, 97 deletions
diff --git a/erts/aclocal.m4 b/erts/aclocal.m4 index ed492d55ff..d78025b0be 100644 --- a/erts/aclocal.m4 +++ b/erts/aclocal.m4 @@ -1421,9 +1421,31 @@ case "$THR_LIB_NAME" in int z; AO_nop_full(); +#if defined(AO_HAVE_store) AO_store(&x, (AO_t) 0); +#elif defined(AO_HAVE_store_release) + AO_store_release(&x, (AO_t) 0); +#else +#error No store +#endif +#if defined(AO_HAVE_load) z = AO_load(&x); +#elif defined(AO_HAVE_load_acquire) + z = AO_load_acquire(&x); +#else +#error No load +#endif +#if defined(AO_HAVE_compare_and_swap_full) z = AO_compare_and_swap_full(&x, (AO_t) 0, (AO_t) 1); +#elif defined(AO_HAVE_compare_and_swap_release) + z = AO_compare_and_swap_release(&x, (AO_t) 0, (AO_t) 1); +#elif defined(AO_HAVE_compare_and_swap_acquire) + z = AO_compare_and_swap_acquire(&x, (AO_t) 0, (AO_t) 1); +#elif defined(AO_HAVE_compare_and_swap) + z = AO_compare_and_swap(&x, (AO_t) 0, (AO_t) 1); +#else +#error No compare_and_swap +#endif ], [ethr_have_native_atomics=yes ethr_have_libatomic_ops=yes]) diff --git a/erts/configure.in b/erts/configure.in index 877e0d4c1c..1676d3d216 100644 --- a/erts/configure.in +++ b/erts/configure.in @@ -3970,6 +3970,7 @@ if test "$enable_dtrace_test" = "yes" ; then DTRACE_ENABLED_2STEP=yes fi], []) + $RM -f foo-dtrace.h AS_IF([test "x$DTRACE_ENABLED_2STEP" = "xyes"], [AC_MSG_RESULT([yes])], [AC_MSG_RESULT([no])]) diff --git a/erts/emulator/beam/erl_alloc_util.c b/erts/emulator/beam/erl_alloc_util.c index 55052430e1..e3172dc4fb 100644 --- a/erts/emulator/beam/erl_alloc_util.c +++ b/erts/emulator/beam/erl_alloc_util.c @@ -205,7 +205,7 @@ MBC after deallocating first block: ASSERT(((UWord)(F) & (~FLG_MASK|THIS_FREE_BLK_HDR_FLG|PREV_FREE_BLK_HDR_FLG)) == THIS_FREE_BLK_HDR_FLG), \ (B)->bhdr = ((Sz) | (F)), \ (B)->u.carrier = (C)) - + # define IS_MBC_FIRST_ABLK(AP,B) \ ((((UWord)(B) & ~ERTS_SACRR_UNIT_MASK) == MBC_HEADER_SIZE(AP)) \ && ((B)->bhdr & MBC_ABLK_OFFSET_MASK) == 0) @@ -378,9 +378,8 @@ do { \ #ifdef ERTS_SMP #define SBC_HEADER_SIZE \ - (UNIT_CEILING(sizeof(Carrier_t) \ - - sizeof(ErtsAlcCPoolData_t) \ - + ABLK_HDR_SZ) \ + (UNIT_CEILING(offsetof(Carrier_t, cpool) \ + + ABLK_HDR_SZ) \ - ABLK_HDR_SZ) #else #define SBC_HEADER_SIZE \ @@ -929,6 +928,88 @@ unlink_carrier(CarrierList_t *cl, Carrier_t *crr) #ifdef ERTS_SMP +#ifdef DEBUG +static int is_in_list(ErtsDoubleLink_t* sentinel, ErtsDoubleLink_t* node) +{ + ErtsDoubleLink_t* p; + + ASSERT(node != sentinel); + for (p = sentinel->next; p != sentinel; p = p->next) { + if (p == node) + return 1; + } + return 0; +} +#endif /* DEBUG */ + +static ERTS_INLINE void +link_edl_after(ErtsDoubleLink_t* after_me, ErtsDoubleLink_t* node) +{ + ErtsDoubleLink_t* before_me = after_me->next; + ASSERT(node != after_me && node != before_me); + node->next = before_me; + node->prev = after_me; + before_me->prev = node; + after_me->next = node; +} + +static ERTS_INLINE void +link_edl_before(ErtsDoubleLink_t* before_me, ErtsDoubleLink_t* node) +{ + ErtsDoubleLink_t* after_me = before_me->prev; + ASSERT(node != before_me && node != after_me); + node->next = before_me; + node->prev = after_me; + before_me->prev = node; + after_me->next = node; +} + +static ERTS_INLINE void +unlink_edl(ErtsDoubleLink_t* node) +{ + node->next->prev = node->prev; + node->prev->next = node->next; +} + +static ERTS_INLINE void +relink_edl_before(ErtsDoubleLink_t* before_me, ErtsDoubleLink_t* node) +{ + if (node != before_me && node != before_me->prev) { + unlink_edl(node); + link_edl_before(before_me, node); + } +} + +static ERTS_INLINE int is_abandoned(Carrier_t 
*crr) +{ + return crr->cpool.abandoned.next != NULL; +} + +static ERTS_INLINE void +link_abandoned_carrier(ErtsDoubleLink_t* list, Carrier_t *crr) +{ + ASSERT(!is_abandoned(crr)); + + link_edl_after(list, &crr->cpool.abandoned); + + ASSERT(crr->cpool.abandoned.next != &crr->cpool.abandoned); + ASSERT(crr->cpool.abandoned.prev != &crr->cpool.abandoned); +} + +static ERTS_INLINE void +unlink_abandoned_carrier(Carrier_t *crr) +{ + ASSERT(is_in_list(&crr->cpool.orig_allctr->cpool.pooled_list, + &crr->cpool.abandoned) || + is_in_list(&crr->cpool.orig_allctr->cpool.traitor_list, + &crr->cpool.abandoned)); + + unlink_edl(&crr->cpool.abandoned); + + crr->cpool.abandoned.next = NULL; + crr->cpool.abandoned.prev = NULL; +} + static ERTS_INLINE void clear_busy_pool_carrier(Allctr_t *allctr, Carrier_t *crr) { @@ -955,7 +1036,7 @@ clear_busy_pool_carrier(Allctr_t *allctr, Carrier_t *crr) } } -#endif +#endif /* ERTS_SMP */ #if 0 #define ERTS_DBG_CHK_FIX_LIST(A, FIX, IX, B) \ @@ -2575,10 +2656,9 @@ mbc_realloc(Allctr_t *allctr, void *p, Uint size, Uint32 alcu_flgs, #ifdef ERTS_SMP #define ERTS_ALC_MAX_DEALLOC_CARRIER 10 -#define ERTS_ALC_CPOOL_MAX_FETCH_INSPECT 10 +#define ERTS_ALC_CPOOL_MAX_FETCH_INSPECT 20 +#define ERTS_ALC_CPOOL_MAX_TRAITOR_INSPECT 10 #define ERTS_ALC_CPOOL_CHECK_LIMIT_COUNT 100 -#define ERTS_ALC_CPOOL_MAX_NO_CARRIERS 5 -#define ERTS_ALC_CPOOL_INSERT_ALLOWED_OFFSET 100 #define ERTS_ALC_CPOOL_MAX_FAILED_STAT_READS 3 #define ERTS_ALC_CPOOL_PTR_MOD_MRK (((erts_aint_t) 1) << 0) @@ -2755,9 +2835,6 @@ cpool_insert(Allctr_t *allctr, Carrier_t *crr) (erts_aint_t) CARRIER_SZ(crr)); erts_atomic_inc_nob(&allctr->cpool.stat.no_carriers); - erts_smp_atomic_set_nob(&crr->allctr, - ((erts_aint_t) allctr)|ERTS_CRR_ALCTR_FLG_IN_POOL); - /* * We search in 'next' direction and begin by passing * one element before trying to insert. This in order to @@ -2816,6 +2893,9 @@ cpool_insert(Allctr_t *allctr, Carrier_t *crr) cpool_set_mod_marked(&cpd2p->prev, (erts_aint_t) &crr->cpool, (erts_aint_t) cpd1p); + + erts_smp_atomic_set_wb(&crr->allctr, + ((erts_aint_t) allctr)|ERTS_CRR_ALCTR_FLG_IN_POOL); } static void @@ -2916,59 +2996,163 @@ cpool_delete(Allctr_t *allctr, Allctr_t *prev_allctr, Carrier_t *crr) static Carrier_t * cpool_fetch(Allctr_t *allctr, UWord size) { - int i; + int i, i_stop, has_passed_sentinel; Carrier_t *crr; ErtsAlcCPoolData_t *cpdp; - ErtsAlcCPoolData_t *sentinel = &carrier_pool[allctr->alloc_no].sentinel; + ErtsAlcCPoolData_t *cpool_entrance; + ErtsAlcCPoolData_t *sentinel; + ErtsDoubleLink_t* dl; + ErtsDoubleLink_t* first_old_traitor; ERTS_ALC_CPOOL_ASSERT(allctr->alloc_no == ERTS_ALC_A_INVALID /* testcase */ || erts_thr_progress_is_managed_thread()); - i = 0; + i = ERTS_ALC_CPOOL_MAX_FETCH_INSPECT; + first_old_traitor = allctr->cpool.traitor_list.next; + cpool_entrance = NULL; - /* First; check our own pending dealloc carrier list... */ - crr = allctr->cpool.dc_list.last; - while (crr && i < ERTS_ALC_CPOOL_MAX_FETCH_INSPECT) { - if (erts_atomic_read_nob(&crr->cpool.max_size) >= size) { - unlink_carrier(&allctr->cpool.dc_list, crr); -#ifdef ERTS_ALC_CPOOL_DEBUG - ERTS_ALC_CPOOL_ASSERT(erts_smp_atomic_xchg_nob(&crr->allctr, - ((erts_aint_t) allctr)) - == (((erts_aint_t) allctr) & ~ERTS_CRR_ALCTR_FLG_MASK)); -#else - erts_smp_atomic_set_nob(&crr->allctr, ((erts_aint_t) allctr)); -#endif - return crr; + /* + * Search my own pooled_list, + * i.e my abandoned carriers that were in the pool last time I checked. 
+ */ + + dl = allctr->cpool.pooled_list.next; + while(dl != &allctr->cpool.pooled_list) { + erts_aint_t exp, act; + crr = (Carrier_t *) (((char *) dl) - offsetof(Carrier_t, cpool.abandoned)); + + ASSERT(!is_in_list(&allctr->cpool.traitor_list, dl)); + ASSERT(crr->cpool.orig_allctr == allctr); + dl = dl->next; + exp = erts_smp_atomic_read_rb(&crr->allctr); + if ((exp & ERTS_CRR_ALCTR_FLG_MASK) == ERTS_CRR_ALCTR_FLG_IN_POOL + && erts_atomic_read_nob(&crr->cpool.max_size) >= size) { + /* Try to fetch it... */ + act = erts_smp_atomic_cmpxchg_mb(&crr->allctr, + (erts_aint_t) allctr, + exp); + if (act == exp) { + cpool_delete(allctr, ((Allctr_t *) (act & ~ERTS_CRR_ALCTR_FLG_MASK)), crr); + unlink_abandoned_carrier(crr); + + /* Move sentinel to continue next search from here */ + relink_edl_before(dl, &allctr->cpool.pooled_list); + return crr; + } + exp = act; + } + if (exp & ERTS_CRR_ALCTR_FLG_IN_POOL) { + if (!cpool_entrance) + cpool_entrance = &crr->cpool; + } + else { /* Not in pool, move to traitor_list */ + unlink_abandoned_carrier(crr); + link_abandoned_carrier(&allctr->cpool.traitor_list, crr); + } + if (--i <= 0) { + /* Move sentinel to continue next search from here */ + relink_edl_before(dl, &allctr->cpool.pooled_list); + return NULL; } - crr = crr->prev; - i++; } - /* ... then the pool ... */ + /* Now search traitor_list. + * i.e carriers employed by other allocators last time I checked. + * They might have been abandoned since then. + */ + + i_stop = (i < ERTS_ALC_CPOOL_MAX_TRAITOR_INSPECT ? + 0 : i - ERTS_ALC_CPOOL_MAX_TRAITOR_INSPECT); + dl = first_old_traitor; + while(dl != &allctr->cpool.traitor_list) { + erts_aint_t exp, act; + crr = (Carrier_t *) (((char *) dl) - offsetof(Carrier_t, cpool.abandoned)); + ASSERT(dl != &allctr->cpool.pooled_list); + ASSERT(crr->cpool.orig_allctr == allctr); + dl = dl->next; + exp = erts_smp_atomic_read_rb(&crr->allctr); + if (exp & ERTS_CRR_ALCTR_FLG_IN_POOL) { + if (!(exp & ERTS_CRR_ALCTR_FLG_BUSY) + && erts_atomic_read_nob(&crr->cpool.max_size) >= size) { + /* Try to fetch it... */ + act = erts_smp_atomic_cmpxchg_mb(&crr->allctr, + (erts_aint_t) allctr, + exp); + if (act == exp) { + cpool_delete(allctr, ((Allctr_t *) (act & ~ERTS_CRR_ALCTR_FLG_MASK)), crr); + unlink_abandoned_carrier(crr); + + /* Move sentinel to continue next search from here */ + relink_edl_before(dl, &allctr->cpool.traitor_list); + return crr; + } + exp = act; + } + if (exp & ERTS_CRR_ALCTR_FLG_IN_POOL) { + if (!cpool_entrance) + cpool_entrance = &crr->cpool; + + /* Move to pooled_list */ + unlink_abandoned_carrier(crr); + link_abandoned_carrier(&allctr->cpool.pooled_list, crr); + } + } + if (--i <= i_stop) { + /* Move sentinel to continue next search from here */ + relink_edl_before(dl, &allctr->cpool.traitor_list); + if (i > 0) + break; + else + return NULL; + } + } /* - * We search in 'prev' direction and begin by passing - * one element before trying to fetch. This in order to - * avoid contention with threads inserting elements. + * Finally search the shared pool and try employ foreign carriers */ - cpdp = cpool_aint2cpd(cpool_read(&sentinel->prev)); - if (cpdp == sentinel) - return NULL; + sentinel = &carrier_pool[allctr->alloc_no].sentinel; + if (cpool_entrance) { + /* We saw a pooled carried above, use it as entrance into the pool + */ + cpdp = cpool_entrance; + } + else { + /* No pooled carried seen above. Start search at cpool sentinel, + * but begin by passing one element before trying to fetch. + * This in order to avoid contention with threads inserting elements. 
+ */ + cpool_entrance = sentinel; + cpdp = cpool_aint2cpd(cpool_read(&cpool_entrance->prev)); + if (cpdp == sentinel) + return NULL; + } - while (i < ERTS_ALC_CPOOL_MAX_FETCH_INSPECT) { + has_passed_sentinel = 0; + while (1) { erts_aint_t exp; cpdp = cpool_aint2cpd(cpool_read(&cpdp->prev)); - if (cpdp == sentinel) { + if (cpdp == cpool_entrance) { + if (cpool_entrance == sentinel) { + cpdp = cpool_aint2cpd(cpool_read(&cpdp->prev)); + if (cpdp == sentinel) + return NULL; + } + i = 0; /* Last one to inspect */ + } + else if (cpdp == sentinel) { + if (has_passed_sentinel) { + /* We been here before. cpool_entrance must have been removed */ + return NULL; + } cpdp = cpool_aint2cpd(cpool_read(&cpdp->prev)); if (cpdp == sentinel) return NULL; - i = ERTS_ALC_CPOOL_MAX_FETCH_INSPECT; /* Last one to inspect */ + has_passed_sentinel = 1; } - crr = (Carrier_t *) (((char *) cpdp) - offsetof(Carrier_t, cpool)); + crr = (Carrier_t *)(((char *)cpdp) - offsetof(Carrier_t, cpool)); exp = erts_smp_atomic_read_rb(&crr->allctr); - if (((exp & (ERTS_CRR_ALCTR_FLG_IN_POOL|ERTS_CRR_ALCTR_FLG_BUSY)) - == ERTS_CRR_ALCTR_FLG_IN_POOL) + if (((exp & (ERTS_CRR_ALCTR_FLG_MASK)) == ERTS_CRR_ALCTR_FLG_IN_POOL) && (erts_atomic_read_nob(&cpdp->max_size) >= size)) { erts_aint_t act; /* Try to fetch it... */ @@ -2977,11 +3161,35 @@ cpool_fetch(Allctr_t *allctr, UWord size) exp); if (act == exp) { cpool_delete(allctr, ((Allctr_t *) (act & ~ERTS_CRR_ALCTR_FLG_MASK)), crr); + if (crr->cpool.orig_allctr == allctr) { + unlink_abandoned_carrier(crr); + } return crr; } } - i++; + if (--i <= 0) + return NULL; } + + /* Last; check our own pending dealloc carrier list... */ + crr = allctr->cpool.dc_list.last; + while (crr) { + if (erts_atomic_read_nob(&crr->cpool.max_size) >= size) { + unlink_carrier(&allctr->cpool.dc_list, crr); +#ifdef ERTS_ALC_CPOOL_DEBUG + ERTS_ALC_CPOOL_ASSERT(erts_smp_atomic_xchg_nob(&crr->allctr, + ((erts_aint_t) allctr)) + == (((erts_aint_t) allctr) & ~ERTS_CRR_ALCTR_FLG_MASK)); +#else + erts_smp_atomic_set_nob(&crr->allctr, ((erts_aint_t) allctr)); +#endif + return crr; + } + crr = crr->prev; + if (--i <= 0) + return NULL; + } + return NULL; } @@ -3078,6 +3286,9 @@ schedule_dealloc_carrier(Allctr_t *allctr, Carrier_t *crr) return; } + if (is_abandoned(crr)) + unlink_abandoned_carrier(crr); + if (crr->cpool.thr_prgr == ERTS_THR_PRGR_INVALID || erts_thr_progress_has_reached(crr->cpool.thr_prgr)) { dealloc_carrier(allctr, crr, 1); @@ -3124,6 +3335,8 @@ cpool_init_carrier_data(Allctr_t *allctr, Carrier_t *crr) limit = (csz/100)*allctr->cpool.util_limit; crr->cpool.abandon_limit = limit; } + crr->cpool.abandoned.next = NULL; + crr->cpool.abandoned.prev = NULL; } static void @@ -3154,6 +3367,9 @@ abandon_carrier(Allctr_t *allctr, Carrier_t *crr) STAT_MBC_CPOOL_INSERT(allctr, crr); unlink_carrier(&allctr->mbc_list, crr); + if (crr->cpool.orig_allctr == allctr) { + link_abandoned_carrier(&allctr->cpool.pooled_list, crr); + } allctr->remove_mbc(allctr, crr); @@ -3661,6 +3877,11 @@ destroy_carrier(Allctr_t *allctr, Block_t *blk, Carrier_t **busy_pcrr_pp) if (busy_pcrr_pp && *busy_pcrr_pp) { ERTS_ALC_CPOOL_ASSERT(*busy_pcrr_pp == crr); *busy_pcrr_pp = NULL; + ERTS_ALC_CPOOL_ASSERT(erts_smp_atomic_read_nob(&crr->allctr) + == (((erts_aint_t) allctr) + | ERTS_CRR_ALCTR_FLG_IN_POOL + | ERTS_CRR_ALCTR_FLG_BUSY)); + erts_smp_atomic_set_nob(&crr->allctr, ((erts_aint_t) allctr)); cpool_delete(allctr, allctr, crr); } else @@ -5540,6 +5761,10 @@ erts_alcu_start(Allctr_t *allctr, AllctrInit_t *init) allctr->min_block_size = sz; } 
+ allctr->cpool.pooled_list.next = &allctr->cpool.pooled_list; + allctr->cpool.pooled_list.prev = &allctr->cpool.pooled_list; + allctr->cpool.traitor_list.next = &allctr->cpool.traitor_list; + allctr->cpool.traitor_list.prev = &allctr->cpool.traitor_list; allctr->cpool.dc_list.first = NULL; allctr->cpool.dc_list.last = NULL; allctr->cpool.abandon_limit = 0; diff --git a/erts/emulator/beam/erl_alloc_util.h b/erts/emulator/beam/erl_alloc_util.h index 7be6b1ed9d..eee920e66c 100644 --- a/erts/emulator/beam/erl_alloc_util.h +++ b/erts/emulator/beam/erl_alloc_util.h @@ -268,6 +268,11 @@ typedef union {char c[ERTS_ALLOC_ALIGN_BYTES]; long l; double d;} Unit_t; #ifdef ERTS_SMP +typedef struct ErtsDoubleLink_t_ { + struct ErtsDoubleLink_t_ *next; + struct ErtsDoubleLink_t_ *prev; +}ErtsDoubleLink_t; + typedef struct { erts_atomic_t next; erts_atomic_t prev; @@ -277,6 +282,7 @@ typedef struct { UWord abandon_limit; UWord blocks; UWord blocks_size; + ErtsDoubleLink_t abandoned; /* node in pooled_list or traitor_list */ } ErtsAlcCPoolData_t; #endif @@ -500,7 +506,12 @@ struct Allctr_t_ { CarrierList_t sbc_list; #ifdef ERTS_SMP struct { - CarrierList_t dc_list; + /* pooled_list, traitor list and dc_list contain only + carriers _created_ by this allocator */ + ErtsDoubleLink_t pooled_list; + ErtsDoubleLink_t traitor_list; + CarrierList_t dc_list; + UWord abandon_limit; int disable_abandon; int check_limit_count; diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c index 64bd598ba6..7ce950e090 100644 --- a/erts/emulator/beam/erl_bif_port.c +++ b/erts/emulator/beam/erl_bif_port.c @@ -472,7 +472,7 @@ cleanup_old_port_data(erts_aint_t data) ErtsPortDataHeap *pdhp = (ErtsPortDataHeap *) data; size_t size; ERTS_SMP_DATA_DEPENDENCY_READ_MEMORY_BARRIER; - size = sizeof(ErtsPortDataHeap) + pdhp->hsize*(sizeof(Eterm) - 1); + size = sizeof(ErtsPortDataHeap) + (pdhp->hsize-1)*sizeof(Eterm); erts_schedule_thr_prgr_later_cleanup_op(free_port_data_heap, (void *) pdhp, &pdhp->later_op, @@ -508,7 +508,7 @@ erts_port_data_size(Port *prt) } else { ErtsPortDataHeap *pdhp = (ErtsPortDataHeap *) data; - return (Uint) sizeof(ErtsPortDataHeap) + pdhp->hsize*(sizeof(Eterm)-1); + return (Uint) sizeof(ErtsPortDataHeap) + (pdhp->hsize-1)*sizeof(Eterm); } } @@ -550,7 +550,7 @@ BIF_RETTYPE port_set_data_2(BIF_ALIST_2) hsize = size_object(BIF_ARG_2); pdhp = erts_alloc(ERTS_ALC_T_PORT_DATA_HEAP, - sizeof(ErtsPortDataHeap) + hsize*(sizeof(Eterm)-1)); + sizeof(ErtsPortDataHeap) + (hsize-1)*sizeof(Eterm)); hp = &pdhp->heap[0]; pdhp->off_heap.first = NULL; pdhp->off_heap.overhead = 0; diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c index 7b272885a7..b0e0cf13f8 100644 --- a/erts/emulator/beam/erl_process.c +++ b/erts/emulator/beam/erl_process.c @@ -3244,11 +3244,11 @@ chk_wake_sched(ErtsRunQueue *crq, int ix, int activate) return 0; wrq = ERTS_RUNQ_IX(ix); flags = ERTS_RUNQ_FLGS_GET(wrq); + if (activate && !(flags & ERTS_RUNQ_FLG_SUSPENDED)) { + if (try_inc_no_active_runqs(ix+1)) + (void) ERTS_RUNQ_FLGS_UNSET(wrq, ERTS_RUNQ_FLG_INACTIVE); + } if (!(flags & (ERTS_RUNQ_FLG_SUSPENDED|ERTS_RUNQ_FLG_NONEMPTY))) { - if (activate) { - if (try_inc_no_active_runqs(ix+1)) - (void) ERTS_RUNQ_FLGS_UNSET(wrq, ERTS_RUNQ_FLG_INACTIVE); - } wake_scheduler(wrq); return 1; } diff --git a/erts/emulator/internal_doc/CarrierMigration.md b/erts/emulator/internal_doc/CarrierMigration.md index b93c11c6ec..2a9594db25 100644 --- a/erts/emulator/internal_doc/CarrierMigration.md +++ 
b/erts/emulator/internal_doc/CarrierMigration.md @@ -16,12 +16,12 @@ When a carrier is empty, i.e. contains only one large free block, it is deallocated. Since multiblock carriers can contain both allocated blocks and free blocks at the same time, an allocator instance might be stuck with a large amount of poorly utilized carriers if the memory -load decrease. After a peak in memory usage it is expected that not -all memory can be returned since the blocks still allocated is likely +load decreases. After a peak in memory usage it is expected that not +all memory can be returned since the blocks still allocated are likely to be dispersed over multiple carriers. Such poorly utilized carriers -can usually be reused if the memory load increase again. However, +can usually be reused if the memory load increases again. However, since each scheduler thread manages its own set of allocator -instances, and memory load is not necessarily connected to CPU load we +instances, and memory load is not necessarily correlated to CPU load, we might get into a situation where there are lots of poorly utilized multiblock carriers on some allocator instances while we need to allocate new multiblock carriers on other allocator instances. In @@ -50,13 +50,13 @@ the allocator instance manages. Free blocks in one specific carrier can be referred to from potentially every other carrier that is managed, and the amount of such references can be huge. That is, the work of removing the free blocks of such a carrier from the search -tree will be huge. One way of solving this could be to not migrate +tree will be huge. One way of solving this could be not to migrate carriers that contain lots of free blocks, but this would prevent us -from migrating carriers that potentially needs to be migrated in order +from migrating carriers that potentially need to be migrated in order to solve the problem we set out to solve. By using one data structure of free blocks in each carrier and an -allocator instance wide data structure of carriers managed by the +allocator instance-wide data structure of carriers managed by the allocator instance, the work needed in order to remove and add carriers can be kept to a minimum. When migration of carriers is enabled on a specific allocator type, we require that an allocation @@ -76,9 +76,9 @@ through a pool of carriers. In order for a carrier migration to complete, one scheduler needs to move the carrier into the pool, and another scheduler needs to take the carrier out of the pool. -The pool is implemented as a lock free, circular, double linked, +The pool is implemented as a lock-free, circular, double linked, list. The list contains a sentinel which is used as the starting point -when inserting to, or fetching from the pool. Carriers in the pool are +when inserting to, or fetching from, the pool. Carriers in the pool are elements in this list. The list can be modified by all scheduler threads @@ -108,19 +108,19 @@ all search operations need to read the content of the sentinel. If we were to modify the sentinel, the cache line containing the sentinel would unnecessarily be bounced between processors. -The `prev`, and `next` fields in the elements of the list contains the +The `prev` and `next` fields in the elements of the list contain the value of the pointer, a modification marker, and a deleted marker. Memory operations on these fields are done using atomic memory operations. 
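As a rough illustration of such marker bits (and not the code in `erl_alloc_util.c`), the C11 sketch below shows how a `prev`/`next` word that carries a pointer plus low-bit flags can be read and claimed for modification with a compare-and-swap. The patch defines `ERTS_ALC_CPOOL_PTR_MOD_MRK` as bit 0; the deleted-marker bit and every other name in the sketch are invented.

    #include <stdatomic.h>
    #include <stdint.h>

    /* Illustration only -- not the erl_alloc_util.c code. */
    #define PTR_MOD_MRK  ((uintptr_t) 1 << 0)   /* modification marker */
    #define PTR_DEL_MRK  ((uintptr_t) 1 << 1)   /* deleted marker (assumed bit) */
    #define PTR_MRK_MASK (PTR_MOD_MRK | PTR_DEL_MRK)

    typedef struct cpool_node {
        _Atomic uintptr_t next;   /* pointer value | marker bits */
        _Atomic uintptr_t prev;
    } cpool_node_t;

    /* Strip the marker bits to get the plain pointer. */
    static cpool_node_t *ptr_of(uintptr_t v)
    {
        return (cpool_node_t *) (v & ~PTR_MRK_MASK);
    }

    /* Try to claim a field for modification. Only the thread whose
     * compare-and-swap sets the marker may change the pointer value. */
    static int try_mark_for_modification(_Atomic uintptr_t *field, uintptr_t *seen)
    {
        uintptr_t exp = atomic_load_explicit(field, memory_order_acquire);
        if (exp & PTR_MRK_MASK)
            return 0;               /* already marked or deleted */
        *seen = exp;
        return atomic_compare_exchange_strong(field, &exp, exp | PTR_MOD_MRK);
    }

A thread that loses the compare-and-swap simply retries or gives up; as the next paragraph explains, the fixed marking order (`next` fields before `prev` fields) is what rules out deadlock.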
When a thread has set the modification marker in a field, no-one except the thread that set the marker is allowed to modify the -field. If multiple modification markers needs to be set, we always +field. If multiple modification markers need to be set, we always begin with `next` fields followed by `prev` fields in the order following the actual pointers. This guarantees that no deadlocks will occur. When a carrier is being removed from a pool, we mark it with a thread progress value that needs to be reached before we are allowed to -modify the `next`, and `prev` fields. That is, until we reach this +modify the `next` and `prev` fields. That is, until we reach this thread progress we are not allowed to insert the carrier into the pool again, and we are not allowed to deallocate the carrier. This ensures that threads inspecting the pool always will be able to traverse the @@ -130,12 +130,12 @@ threads may have references to it via the pool. ### Migration ### -There exist one pool for each allocator type enabling migration of +There exists one pool for each allocator type enabling migration of carriers between scheduler specific allocator instances of the same allocator type. Each allocator instance keeps track of the current utilization of its -multiblock carriers. When the utilization falls below the "abandon +multiblock carriers. When the total utilization falls below the "abandon carrier utilization limit" it starts to inspect the utilization of the current carrier when deallocations are made. If also the utilization of the carrier falls below the "abandon carrier utilization limit" it @@ -146,28 +146,53 @@ Since the carrier has been unlinked from the data structure of available free blocks, no more allocations will be made in the carrier. The allocator instance putting the carrier into the pool, however, still has the responsibility of performing deallocations in -it while it remains in the pool. +it while it remains in the pool. The allocator instance with this +deallocation responsibility is here called the **employer**. -Each carrier has a flag field containing information about allocator -instance owning the carrier, a flag indicating if the carrier is in +Each carrier has a flag field containing information about the +employing allocator instance, a flag indicating if the carrier is in the pool or not, and a flag indicating if it is busy or not. When the -carrier is in the pool, the owning allocator instance needs to mark it +carrier is in the pool, the employing allocator instance needs to mark it as busy while operating on it. If another thread inspects it in order -to try to fetch it from the pool, it will abort the fetch if it is -busy. When fetching the carrier from the pool, ownership will changed -and further deallocations in the carrier will be redirected to the new -owner using the delayed dealloc functionality. +to try to fetch it from the pool, it will skip it if it is busy. When +fetching the carrier from the pool, employment will change and further +deallocations in the carrier will be redirected to the new +employer using the delayed dealloc functionality. If a carrier in the pool becomes empty, it will be withdrawn from the pool. All carriers that become empty are also always passed to its -originating allocator instance for deallocation using the delayed +**owning** allocator instance for deallocation using the delayed dealloc functionality. 
Since carriers this way always will be -deallocated by the allocator instance that allocated the carrier the +deallocated by the owner that allocated the carrier, the underlying functionality of allocating and deallocating carriers can remain simple and doesn't have to bother about multiple threads. In a NUMA system we will also not mix carriers originating from multiple NUMA nodes. +In short: + +* The allocator instance that created a carrier **owns** it. +* An empty carrier is always deallocated by its **owner**. +* **Ownership** never changes. +* The allocator instance that uses a carrier **employs** it. +* An **employer** can abandon a carrier into the pool. +* Pooled carriers are not allocated from. +* Deallocation in a pooled carrier is still performed by its **employer**. +* **Employment** can only change when a carrier is fetched from the pool. + +### Searching the pool ### + +To harbor real time characteristics, searching the pool is +limited. We only inspect a limited number of carriers. If none of +those carriers had a free block large enough to satisfy the allocation +request, the search will fail. A carrier in the pool can also be busy +if another thread is currently doing block deallocation work on the +carrier. A busy carrier will also be skipped by the search as it can +not satisfy the request. The pool is lock-free and we do not want to +block, waiting for the other thread to finish. + +#### Before OTP 17.4 #### + When an allocator instance needs more carrier space, it always begins by inspecting its own carriers that are waiting for thread progress before they can be deallocated. If no such carrier could be found, it @@ -176,10 +201,69 @@ it will allocate a new carrier. Regardless of where the allocator instance gets the carrier from it the just links in the carrier into its data structure of free blocks. +#### After OTP 17.4 #### + +The old search algorithm had a problem as the search always started at +the same position in the pool, the sentinel. This could lead to +contention from concurrent searching processes. But even worse, it +could lead to a "bad" state when searches fail with a high rate +leading to new carriers instead being allocated. These new carriers +may later be inserted into the pool due to bad utilization. If the +frequency of insertions into the pool is higher than successful +fetching from the pool, memory will eventually get exhausted. + +This "bad" state consists of a cluster of small and/or highly +fragmented carriers located at the sentinel in the pool. The largest free +block in such a "bad" carrier is rather small, making it unable to satisfy +most allocation requests. As the search always started at the +sentinel, any such "bad" carriers that had been left in the pool would +eventually cluster together at the sentinel. All searches first +have to skip past this cluster of "bad" carriers to reach a "good" +carrier. When the cluster gets to the same size as the search limit, +all searches will essentially fail. + +To counter the "bad cluster" problem and also ease the contention, the +search will now always start by first looking at the allocators **own** +carriers. That is, carriers that were initially created by the +allocator itself and later had been abandoned to the pool. If none of +our own abandoned carrier would do, then the search continues into the +pool, as before, to look for carriers created by other +allocators. 
However, if we have at least one abandoned carrier of our +own that could not satisfy the request, we can use that as entry point +into the pool. + +The result is that we prefer carriers created by the thread itself, +which is good for NUMA performance. And we get more entry points when +searching the pool, which will ease contention and clustering. + +To do the first search among own carriers, every allocator instance +has two new lists: `pooled_list` and `traitor_list`. These lists are only +accessed by the allocator itself and they only contain the allocator's +own carriers. When an owned carrier is abandoned and put in the +pool, it is also linked into `pooled_list`. When we search our +`pooled_list` and find a carrier that is no longer in the pool, we +move that carrier from `pooled_list` to `traitor_list` as it is now +employed by another allocator. If searching `pooled_list` fails, we +also do a limited search of `traitor_list`. When finding an abandoned +carrier in `traitor_list` it is either employed or moved back to +`pooled_list` if it could not satisfy the allocation request. + +When searching `pooled_list` and `traitor_list` we always start at the +point where the last search ended. This to avoid clustering +problems and increase the probability to find a "good" carrier. As +`pooled_list` and `traitor_list` are only accessed by the owning +allocator instance, they need no thread synchronization at all. + +Furthermore, the search for own carriers that are scheduled +for deallocation is now done as the last search option. The idea is +that it is better to reuse a poorly utilized carrier than to +resurrect an empty carrier that was just about to be released back to +the OS. + ### Result ### The use of this strategy of abandoning carriers with poor utilization -and reusing these in allocator instances with an increased carrier +and reusing them in allocator instances with an increased carrier demand is extremely effective and completely eliminates the problems that otherwise sometimes occurred when CPU load dropped while memory load did not. diff --git a/erts/emulator/internal_doc/SuperCarrier.md b/erts/emulator/internal_doc/SuperCarrier.md new file mode 100644 index 0000000000..0ad6af41de --- /dev/null +++ b/erts/emulator/internal_doc/SuperCarrier.md @@ -0,0 +1,191 @@ +Super Carrier +============= + +A super carrier is large memory area, allocated at VM start, which can +be used during runtime to allocate normal carriers from. + +The super carrier feature was introduced in OTP R16B03. It is +enabled with command line option +MMscs <size in Mb> +and can be configured with other options. + +Problem +------- + +The initial motivation for this feature was customers asking for a way +to pre-allocate physcial memory at VM start for it to use. + +Other problems were different experienced limitations of the OS +implementation of mmap: + +* Increasingly bad performance of mmap/munmap as the number of mmap'ed areas grow. +* Fragmentation problem between mmap'ed areas. + +A third problem was management of low memory in the halfword +emulator. The implementation used a naive linear search structure to +hold free segments which would lead to poor performance when +fragmentation increased. + + +Solution +-------- + +Allocate one large continious area of address space at VM start and +then use that area to satisfy our dynamic memory need during +runtime. In other words: implement our own mmap. 
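As a conceptual sketch of that "own mmap" idea (not the actual `erl_mmap.c` implementation), the C fragment below reserves one large contiguous range of address space up front and then commits or releases physical pages on demand. The helper names are invented; the flags are standard POSIX/Linux ones (`MAP_NORESERVE` and `MADV_DONTNEED` are Linux-specific).

    #include <stddef.h>
    #include <sys/mman.h>

    /* Sketch only: reserve a large contiguous address range at start and
     * back it with physical pages on demand. All names are invented. */
    static char  *sc_start;   /* bottom of the reserved area */
    static size_t sc_size;    /* total reserved size */

    static int super_carrier_reserve(size_t size)
    {
        void *p = mmap(NULL, size, PROT_NONE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
        if (p == MAP_FAILED)
            return -1;
        sc_start = p;
        sc_size  = size;
        return 0;
    }

    /* Make [ptr, ptr+len) usable; pages are faulted in on first touch. */
    static int super_carrier_commit(void *ptr, size_t len)
    {
        return mprotect(ptr, len, PROT_READ | PROT_WRITE);
    }

    /* Return the physical pages while keeping the address range reserved. */
    static int super_carrier_uncommit(void *ptr, size_t len)
    {
        if (madvise(ptr, len, MADV_DONTNEED) != 0)
            return -1;
        return mprotect(ptr, len, PROT_NONE);
    }

The next section describes how the +MMscrpm option controls whether physical memory is also reserved up front; the sketch models only the virtual reservation.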
+ +### Use cases ### + +If command line option +MMscrpm (Reserve Physical Memory) is set to +false, only virtual space is allocated for the super carrier from +start. The super carrier then acts as an "alternative mmap" implementation +without changing the consumption of physical memory pages. Physical +pages will be reserved on demand when an allocation is done from the super +carrier and be unreserved when the memory is released back to the +super carrier. + +If +MMscrpm is set to true, which is default, the initial allocation +will reserve physical memory for the entire super carrier. This can be +used by users that want to ensure a certain *minimum* amount of +physical memory for the VM. + +However, what reservation of physical memory actually means highly +depends on the operating system, and how it is configured. For +example, different memory overcommit settings on Linux drastically +change the behaviour. + +A third feature is to have the super carrier limit the *maximum* +amount of memory used by the VM. If +MMsco (Super Carrier Only) is set +to true, which is default, allocations will only be done from the +super carrier. When the super carrier gets full, the VM will fail due +to out of memory. +If +MMsco is false, allocations will use mmap directly if the super +carrier is full. + + + +### Implementation ### + +The entire super carrier implementation is kept in erl_mmap.c. The +name suggest that it can be viewed as our own mmap implementation. + +A super carrier needs to satisfy two slightly different kinds of +allocation requests; multi block carriers (MBC) and single block +carriers (SBC). They are both rather large blocks of continious +memory, but MBCs and SBCs have different demands on alignment and +size. + +SBCs can have arbitrary size and do only need minimum 8-byte +alignment. + +MBCs are more restricted. They can only have a number of fixed +sizes that are powers of 2. The start address need to have a very +large aligment (currently 256 kb, called "super alignment"). This is a +design choice that allows very low overhead per allocated block in the +MBC. + +To reduce fragmentation within the super carrier, it is good to keep SBCs +and MBCs apart. MBCs with their uniform alignment and sizes can be +packed very efficiently together. SBCs without demand for aligment can +also be allocated quite efficiently together. But mixing them can lead +to a lot of memory wasted when we need to create large holes of +padding to the next alignment limit. + +The super carrier thus contains two areas. One area for MBCs growing from +the bottom and up. And one area for SBCs growing from the top and +down. Like a process with a heap and a stack growing towards each +other. + + +### Data structures ### + +The MBC area is called **sa** as in super aligned and the SBC area is +called **sua** as in super un-aligned. + +Note that the "super" in super alignment and the "super" in super +carrier has nothing to do with each other. We could have choosen +another naming to avoid confusion, such as "meta" carrier or "giant" +aligment. + + +-------+ <---- sua.top + | sua | + | | + |-------| <---- sua.bot + | | + | | + | | + |-------| <---- sa.top + | | + | sa | + | | + +-------+ <---- sa.bot + + +When a carrier is deallocated a free memory segment will be created +inside the corresponding area, unless the carrier was at the very top +(in `sa`) or bottom (in `sua`) in which case the area will just shrink +down or up. 
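To make the two growth directions concrete, here is a toy C model (invented names, not the real `erl_mmap.c` logic) that carves super-aligned MBCs from the bottom of the area and SBCs from the top, ignoring the free-segment reuse described next.

    #include <stddef.h>
    #include <stdint.h>

    /* Toy model of the sa/sua growth directions; all names are invented. */
    #define SUPER_ALIGN ((uintptr_t) 256 * 1024)   /* "super alignment" */

    struct area { char *bot, *top; };
    /* sa and sua would be initialised from the reserved range; the free
     * space lies between sa.top and sua.bot. */
    static struct area sa, sua;

    static void *alloc_mbc(size_t size)          /* grow sa upwards */
    {
        char *at = sa.top;   /* stays super-aligned: sa.bot is, and sizes are rounded */
        size = (size + SUPER_ALIGN - 1) & ~(size_t) (SUPER_ALIGN - 1);
        if (at + size > sua.bot)                 /* would run into the SBC area */
            return NULL;
        sa.top = at + size;
        return at;
    }

    static void *alloc_sbc(size_t size)          /* grow sua downwards */
    {
        char *at = (char *) ((uintptr_t) (sua.bot - size) & ~(uintptr_t) 7);
        if (at < sa.top)                         /* would run into the MBC area */
            return NULL;
        sua.bot = at;
        return at;
    }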
+ +We need to keep track of all the free segments in order to reuse them +for new carrier allocations. One initial idea was to use the same +mechanism that is used to keep track of free blocks within MBCs +(alloc_util and the different strategies). However, that would not be +as straight forward as one can think and can also waste quite a lot of +memory as it uses prepended block headers. The granularity of the +super carrier is one memory page (usually 4kb). We want to allocate +and free entire pages and we don't want to waste an entire page just +to hold the block header of the following pages. + +Instead we store the meta information about all the free segments in a +dedicated area apart from the `sa` and `sua` areas. Every free segment is +represented by a descriptor struct (`ErtsFreeSegDesc`). + + typedef struct { + RBTNode snode; /* node in 'stree' */ + RBTNode anode; /* node in 'atree' */ + char* start; + char* end; + }ErtsFreeSegDesc; + +To find the smallest free segment that will satisfy a carrier allocation +(best fit), the free segments are organized in a tree sorted by +size (`stree`). We search in this tree at allocation. If no free segment of +sufficient size was found, the area (`sa` or `sua`) is instead expanded. +If two or more free segments with equal size exist, the one at lowest +address is choosen for `sa` and highest address for `sua`. + +At carrier deallocation, we want to coalesce with any adjacent free +segments, to form one large free segment. To do that, all free +segments are also organized in a tree sorted in address order (`atree`). + +So, in total we keep four trees of free descriptors for the super +carrier; two for `sa` and two for `sua`. They all use the same +red-black-tree implementation that support the different sorting +orders used. + +When allocating a new MBC we first search after a free segment in `sa`, +then try to raise `sa.top`, and then as a fallback try to search after a +free segment in `sua`. When an MBC is allocated in `sua`, a larger segment +is allocated which is then trimmed to obtain the right +alignment. Allocation search for an SBC is done in reverse order. When +an SBC is allocated in `sa`, the size is aligned up to super aligned +size. + +### The free descriptor area ### + +As mentioned above, the descriptors for the free segments are +allocated in a separate area. This area has a constant configurable +size (+MMscrfsd) that defaults to 65536 descriptors. This should be +more than enough in most cases. If the descriptors area should fill up, +new descriptor areas will be allocated first directly from the OS, and +then from `sua` and `sa` in the super carrier, and lastly from the memory +segment itself which is being deallocated. Allocating free descriptor +areas from the super carrier is only a last resort, and should be +avoided, as it creates fragmentation. + +### Halfword emulator ### + +The halfword emulator uses the super carrier implementation to manage +its low memory mappings thar are needed for all term storage. The +super carrier can here not be configured by command line options. One +could imagine a second configurable instance of the super carrier used +by high memory allocation, but that has not been implemented. 
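Relating back to the coalescing step described above: the sketch below keeps free segments in an address-ordered doubly linked list instead of the red-black `atree`, purely to stay short; every identifier is invented and nothing here is taken from `erl_mmap.c`.

    #include <stdlib.h>

    /* Simplified free-segment coalescing; an address-ordered list stands in
     * for the address-sorted tree (atree). All names are invented. */
    typedef struct seg {
        struct seg *prev, *next;    /* neighbours in address order */
        char *start, *end;          /* the free range [start, end) */
    } seg_t;

    static seg_t head = { &head, &head, NULL, NULL };   /* circular sentinel */

    /* Insert a freed range [start, end) and merge with adjacent free segments. */
    static void free_seg(char *start, char *end)
    {
        seg_t *p = head.next;
        while (p != &head && p->start < start)
            p = p->next;                    /* first segment above the range */
        seg_t *lo = p->prev;                /* last segment below the range */

        if (lo != &head && lo->end == start) {
            lo->end = end;                  /* grow the lower neighbour upwards */
            if (p != &head && p->start == end) {
                lo->end = p->end;           /* and swallow the upper neighbour */
                p->prev->next = p->next;
                p->next->prev = p->prev;
                free(p);                    /* real code recycles the descriptor */
            }
            return;
        }
        if (p != &head && p->start == end) {
            p->start = start;               /* grow the upper neighbour downwards */
            return;
        }
        seg_t *n = malloc(sizeof(seg_t));   /* real code takes one from the descriptor area */
        if (!n)
            return;
        n->start = start; n->end = end;
        n->prev = lo; n->next = p;
        lo->next = n; p->prev = n;
    }

In the real implementation a descriptor whose segment grows or shrinks would also need to be repositioned in the size-sorted `stree` so that best-fit searches see the new size.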
diff --git a/erts/emulator/sys/common/erl_check_io.c b/erts/emulator/sys/common/erl_check_io.c index 81cb5dc4bb..0051b45b31 100644 --- a/erts/emulator/sys/common/erl_check_io.c +++ b/erts/emulator/sys/common/erl_check_io.c @@ -268,6 +268,8 @@ free_drv_select_data(ErtsDrvSelectDataState *dsp) erts_free(ERTS_ALC_T_DRV_SEL_D_STATE, dsp); } +#if ERTS_CIO_HAVE_DRV_EVENT + static ERTS_INLINE ErtsDrvEventDataState * alloc_drv_event_data(void) { @@ -290,6 +292,8 @@ free_drv_event_data(ErtsDrvEventDataState *dep) erts_free(ERTS_ALC_T_DRV_EV_D_STATE, dep); } +#endif /* ERTS_CIO_HAVE_DRV_EVENT */ + static ERTS_INLINE void remember_removed(ErtsDrvEventState *state, struct pollset_info* psi) { diff --git a/erts/emulator/sys/unix/erl_unix_sys.h b/erts/emulator/sys/unix/erl_unix_sys.h index f7a6298d5b..26ed2fb558 100644 --- a/erts/emulator/sys/unix/erl_unix_sys.h +++ b/erts/emulator/sys/unix/erl_unix_sys.h @@ -229,7 +229,7 @@ extern void sys_stop_cat(void); #ifdef USE_ISINF_ISNAN /* simulate finite() */ # define isfinite(f) (!isinf(f) && !isnan(f)) # define HAVE_ISFINITE -#elif defined(__GNUC__) && defined(HAVE_FINITE) +#elif (defined(__GNUC__) && !defined(__llvm__)) && defined(HAVE_FINITE) /* We use finite in gcc as it emits assembler instead of the function call that isfinite emits. The assembler is significantly faster. */ diff --git a/erts/emulator/test/port_SUITE.erl b/erts/emulator/test/port_SUITE.erl index 1bb4cb3637..6bbf93b7d7 100644 --- a/erts/emulator/test/port_SUITE.erl +++ b/erts/emulator/test/port_SUITE.erl @@ -2349,8 +2349,10 @@ port_setget_data(Config) when is_list(Config) -> Port = erlang:open_port({spawn_driver, "echo_drv"}, []), NSched = erlang:system_info(schedulers_online), + HeapData = {1,2,3,<<"A heap binary">>,fun()->"This is fun"end, + list_to_binary(lists:seq(1,100))}, PRs = lists:map(fun(I) -> - spawn_opt(fun() -> port_setget_data_hammer(Port,1) end, + spawn_opt(fun() -> port_setget_data_hammer(Port,HeapData,false,1) end, [monitor, {scheduler, I rem NSched}]) end, lists:seq(1,10)), @@ -2368,13 +2370,17 @@ port_setget_data(Config) when is_list(Config) -> PRs), ok. -port_setget_data_hammer(Port, N) -> +port_setget_data_hammer(Port, HeapData, IsSet0, N) -> Rand = random:uniform(3), - try case Rand of - 1 -> true = erlang:port_set_data(Port, atom); - 2 -> true = erlang:port_set_data(Port, {1,2,3}); - 3 -> erlang:port_get_data(Port) - end + IsSet1 = try case Rand of + 1 -> true = erlang:port_set_data(Port, atom), true; + 2 -> true = erlang:port_set_data(Port, HeapData), true; + 3 -> case erlang:port_get_data(Port) of + atom -> true; + HeapData -> true; + undefined -> false=IsSet0 + end + end catch error:badarg -> true = get(prepare_for_close), @@ -2387,7 +2393,7 @@ port_setget_data_hammer(Port, N) -> after 0 -> ok end, - port_setget_data_hammer(Port, N+1). + port_setget_data_hammer(Port, HeapData, IsSet1, N+1). 
wait_until(Fun) -> diff --git a/erts/emulator/utils/make_compiler_flags b/erts/emulator/utils/make_compiler_flags index cebe8cd0c5..ca1bc47113 100755 --- a/erts/emulator/utils/make_compiler_flags +++ b/erts/emulator/utils/make_compiler_flags @@ -70,7 +70,7 @@ my($prog) = $prog[$#prog]; print "/* Warning: Do not edit this file.\n"; print " Auto-generated by '$prog'.*/\n"; -foreach(keys %constants) { +foreach (sort(keys %constants)) { print "const char* erts_build_flags_$_ = \"$constants{$_}\";\n" } diff --git a/erts/include/internal/ethread.h b/erts/include/internal/ethread.h index 72c054b588..ad5d05704c 100644 --- a/erts/include/internal/ethread.h +++ b/erts/include/internal/ethread.h @@ -364,8 +364,8 @@ extern ethr_runtime_t ethr_runtime__; # include "sparc64/ethread.h" # endif # endif -# include "gcc/ethread.h" # include "libatomic_ops/ethread.h" +# include "gcc/ethread.h" # endif # elif defined(ETHR_HAVE_LIBATOMIC_OPS) # include "libatomic_ops/ethread.h" diff --git a/erts/include/internal/libatomic_ops/ethr_atomic.h b/erts/include/internal/libatomic_ops/ethr_atomic.h index fb1288c330..734cdf0890 100644 --- a/erts/include/internal/libatomic_ops/ethr_atomic.h +++ b/erts/include/internal/libatomic_ops/ethr_atomic.h @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2010-2011. All Rights Reserved. + * Copyright Ericsson AB 2010-2014. All Rights Reserved. * * The contents of this file are subject to the Erlang Public License, * Version 1.1, (the "License"); you may not use this file except in @@ -32,22 +32,23 @@ * These operations need to be defined by libatomic_ops; * otherwise, we won't compile: * - AO_nop_full() - * - AO_load() - * - AO_store() - * - AO_compare_and_swap() + * - AO_load() || AO_load_aquire() + * - AO_store() || AO_store_release() + * - AO_compare_and_swap() || AO_compare_and_swap_acquire() + * || AO_compare_and_swap_release() || AO_compare_and_swap_full() * */ #if ETHR_SIZEOF_AO_T == 4 #define ETHR_HAVE_NATIVE_ATOMIC32 1 -#define ETHR_NATIVE_ATOMIC32_IMPL "libatomic_ops" +#define ETHR_NATIVE_ATOMIC32_IMPL ETHR_NATIVE_IMPL__ #define ETHR_NATMC_FUNC__(X) ethr_native_atomic32_ ## X #define ETHR_ATMC_T__ ethr_native_atomic32_t #define ETHR_AINT_T__ ethr_sint32_t #define ETHR_AINT_SUFFIX__ "l" #elif ETHR_SIZEOF_AO_T == 8 #define ETHR_HAVE_NATIVE_ATOMIC64 1 -#define ETHR_NATIVE_ATOMIC64_IMPL "libatomic_ops" +#define ETHR_NATIVE_ATOMIC64_IMPL ETHR_NATIVE_IMPL__ #define ETHR_NATMC_FUNC__(X) ethr_native_atomic64_ ## X #define ETHR_ATMC_T__ ethr_native_atomic64_t #define ETHR_AINT_T__ ethr_sint64_t @@ -74,6 +75,8 @@ ETHR_NATMC_FUNC__(addr)(ETHR_ATMC_T__ *var) return (ETHR_AINT_T__ *) &var->counter; } +#ifdef AO_HAVE_store + #if ETHR_SIZEOF_AO_T == 4 # define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_SET 1 #else @@ -86,6 +89,24 @@ ETHR_NATMC_FUNC__(set)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ value) AO_store(&var->counter, (AO_t) value); } +#endif + +#ifdef AO_HAVE_store_write + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_SET_WB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_SET_WB 1 +#endif + +static ETHR_INLINE void +ETHR_NATMC_FUNC__(set_wb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ value) +{ + AO_store_write(&var->counter, (AO_t) value); +} + +#endif + #ifdef AO_HAVE_store_release #if ETHR_SIZEOF_AO_T == 4 @@ -102,6 +123,24 @@ ETHR_NATMC_FUNC__(set_relb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ value) #endif +#ifdef AO_HAVE_store_full + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_SET_MB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_SET_MB 1 
+#endif + +static ETHR_INLINE void +ETHR_NATMC_FUNC__(set_mb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ value) +{ + AO_store_full(&var->counter, (AO_t) value); +} + +#endif + +#ifdef AO_HAVE_load + #if ETHR_SIZEOF_AO_T == 4 # define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_READ 1 #else @@ -114,6 +153,24 @@ ETHR_NATMC_FUNC__(read)(ETHR_ATMC_T__ *var) return (ETHR_AINT_T__) AO_load(&var->counter); } +#endif + +#ifdef AO_HAVE_load_read + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_READ_RB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_READ_RB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(read_rb)(ETHR_ATMC_T__ *var) +{ + return (ETHR_AINT_T__) AO_load_read(&var->counter); +} + +#endif + #ifdef AO_HAVE_load_acquire #if ETHR_SIZEOF_AO_T == 4 @@ -130,6 +187,22 @@ ETHR_NATMC_FUNC__(read_acqb)(ETHR_ATMC_T__ *var) #endif +#ifdef AO_HAVE_load_full + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_READ_MB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_READ_MB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(read_mb)(ETHR_ATMC_T__ *var) +{ + return (ETHR_AINT_T__) AO_load_full(&var->counter); +} + +#endif + #ifdef AO_HAVE_fetch_and_add #if ETHR_SIZEOF_AO_T == 4 @@ -146,6 +219,54 @@ ETHR_NATMC_FUNC__(add_return)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ incr) #endif +#ifdef AO_HAVE_fetch_and_add_acquire + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_ADD_RETURN_ACQB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_ADD_RETURN_ACQB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(add_return_acqb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ incr) +{ + return ((ETHR_AINT_T__) AO_fetch_and_add_acquire(&var->counter, (AO_t) incr)) + incr; +} + +#endif + +#ifdef AO_HAVE_fetch_and_add_release + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_ADD_RETURN_RELB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_ADD_RETURN_RELB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(add_return_relb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ incr) +{ + return ((ETHR_AINT_T__) AO_fetch_and_add_release(&var->counter, (AO_t) incr)) + incr; +} + +#endif + +#ifdef AO_HAVE_fetch_and_add_full + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_ADD_RETURN_MB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_ADD_RETURN_MB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(add_return_mb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ incr) +{ + return ((ETHR_AINT_T__) AO_fetch_and_add_full(&var->counter, (AO_t) incr)) + incr; +} + +#endif + #ifdef AO_HAVE_fetch_and_add1 #if ETHR_SIZEOF_AO_T == 4 @@ -178,6 +299,38 @@ ETHR_NATMC_FUNC__(inc_return_acqb)(ETHR_ATMC_T__ *var) #endif +#ifdef AO_HAVE_fetch_and_add1_release + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_INC_RETURN_RELB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_INC_RETURN_RELB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(inc_return_relb)(ETHR_ATMC_T__ *var) +{ + return ((ETHR_AINT_T__) AO_fetch_and_add1_release(&var->counter)) + 1; +} + +#endif + +#ifdef AO_HAVE_fetch_and_add1_full + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_INC_RETURN_MB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_INC_RETURN_MB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(inc_return_mb)(ETHR_ATMC_T__ *var) +{ + return ((ETHR_AINT_T__) AO_fetch_and_add1_full(&var->counter)) + 1; +} + +#endif + #ifdef AO_HAVE_fetch_and_sub1 #if ETHR_SIZEOF_AO_T == 4 @@ -194,6 +347,22 @@ 
ETHR_NATMC_FUNC__(dec_return)(ETHR_ATMC_T__ *var) #endif +#ifdef AO_HAVE_fetch_and_sub1_acquire + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_DEC_RETURN_ACQB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_DEC_RETURN_ACQB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(dec_return_acqb)(ETHR_ATMC_T__ *var) +{ + return ((ETHR_AINT_T__) AO_fetch_and_sub1_acquire(&var->counter)) - 1; +} + +#endif + #ifdef AO_HAVE_fetch_and_sub1_release #if ETHR_SIZEOF_AO_T == 4 @@ -210,7 +379,60 @@ ETHR_NATMC_FUNC__(dec_return_relb)(ETHR_ATMC_T__ *var) #endif -#ifdef AO_HAVE_compare_and_swap +#ifdef AO_HAVE_fetch_and_sub1_full + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_DEC_RETURN_MB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_DEC_RETURN_MB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(dec_return_mb)(ETHR_ATMC_T__ *var) +{ + return ((ETHR_AINT_T__) AO_fetch_and_sub1_full(&var->counter)) - 1; +} + +#endif + +#if defined(AO_HAVE_compare_and_swap_full) || defined(AO_HAVE_fetch_compare_and_swap_full) + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG_MB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_CMPXCHG_MB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(cmpxchg_mb)(ETHR_ATMC_T__ *var, + ETHR_AINT_T__ new, + ETHR_AINT_T__ exp) +{ +#if defined(AO_HAVE_fetch_compare_and_swap_full) + return (ETHR_AINT_T__) AO_fetch_compare_and_swap_full(&var->counter, + (AO_t) exp, + (AO_t) new); +#else + ETHR_AINT_T__ act; + do { + if (AO_compare_and_swap_full(&var->counter, (AO_t) exp, (AO_t) new)) + return exp; +#ifdef AO_HAVE_load_acquire + act = (ETHR_AINT_T__) AO_load_acquire(&var->counter); +#else + act = (ETHR_AINT_T__) AO_load(&var->counter); +#endif + } while (act == exp); +#ifndef AO_HAVE_load_acquire + AO_nop_full(); +#endif + return act; +#endif +} + +#endif + +#if defined(AO_HAVE_compare_and_swap) || defined(AO_HAVE_fetch_compare_and_swap) #if ETHR_SIZEOF_AO_T == 4 # define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG 1 @@ -223,18 +445,28 @@ ETHR_NATMC_FUNC__(cmpxchg)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ new, ETHR_AINT_T__ exp) { +#if defined(AO_HAVE_fetch_compare_and_swap) + return (ETHR_AINT_T__) AO_fetch_compare_and_swap(&var->counter, + (AO_t) exp, + (AO_t) new); +#else ETHR_AINT_T__ act; do { if (AO_compare_and_swap(&var->counter, (AO_t) exp, (AO_t) new)) return exp; +#ifdef AO_HAVE_load act = (ETHR_AINT_T__) AO_load(&var->counter); +#else + act = (ETHR_AINT_T__) AO_load_aquire(&var->counter); +#endif } while (act == exp); return act; +#endif } #endif -#ifdef AO_HAVE_compare_and_swap_acquire +#if defined(AO_HAVE_compare_and_swap_acquire) || defined(AO_HAVE_fetch_compare_and_swap_acquire) #if ETHR_SIZEOF_AO_T == 4 # define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG_ACQB 1 @@ -247,6 +479,11 @@ ETHR_NATMC_FUNC__(cmpxchg_acqb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ new, ETHR_AINT_T__ exp) { +#if defined(AO_HAVE_fetch_compare_and_swap_acquire) + return (ETHR_AINT_T__) AO_fetch_compare_and_swap_acquire(&var->counter, + (AO_t) exp, + (AO_t) new); +#else ETHR_AINT_T__ act; do { if (AO_compare_and_swap_acquire(&var->counter, (AO_t) exp, (AO_t) new)) @@ -261,11 +498,55 @@ ETHR_NATMC_FUNC__(cmpxchg_acqb)(ETHR_ATMC_T__ *var, AO_nop_full(); #endif return act; +#endif +} + +#endif + +#if defined(AO_HAVE_compare_and_swap_read) || defined(AO_HAVE_fetch_compare_and_swap_read) + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG_RB 1 +#else +# define 
ETHR_HAVE_ETHR_NATIVE_ATOMIC64_CMPXCHG_RB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(cmpxchg_rb)(ETHR_ATMC_T__ *var, + ETHR_AINT_T__ new, + ETHR_AINT_T__ exp) +{ +#if defined(AO_HAVE_fetch_compare_and_swap_read) + return (ETHR_AINT_T__) AO_fetch_compare_and_swap_read(&var->counter, + (AO_t) exp, + (AO_t) new); +#else + ETHR_AINT_T__ act; + do { + if (AO_compare_and_swap_read(&var->counter, (AO_t) exp, (AO_t) new)) + return exp; +#if defined(AO_HAVE_load_read) + act = (ETHR_AINT_T__) AO_load_read(&var->counter); +#elif defined(AO_HAVE_load) + act = (ETHR_AINT_T__) AO_load(&var->counter); +#else + act = (ETHR_AINT_T__) AO_load_acquire(&var->counter); +#endif + } while (act == exp); +#ifndef AO_HAVE_load_read +#ifdef AO_HAVE_nop_read + AO_nop_read(); +#else + AO_nop_full(); +#endif +#endif + return act; +#endif } #endif -#ifdef AO_HAVE_compare_and_swap_release +#if defined(AO_HAVE_compare_and_swap_release) || defined(AO_HAVE_fetch_compare_and_swap_release) #if ETHR_SIZEOF_AO_T == 4 # define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG_RELB 1 @@ -278,13 +559,57 @@ ETHR_NATMC_FUNC__(cmpxchg_relb)(ETHR_ATMC_T__ *var, ETHR_AINT_T__ new, ETHR_AINT_T__ exp) { +#if defined(AO_HAVE_fetch_compare_and_swap_release) + return (ETHR_AINT_T__) AO_fetch_compare_and_swap_release(&var->counter, + (AO_t) exp, + (AO_t) new); +#else ETHR_AINT_T__ act; do { if (AO_compare_and_swap_release(&var->counter, (AO_t) exp, (AO_t) new)) return exp; +#ifdef AO_HAVE_load act = (ETHR_AINT_T__) AO_load(&var->counter); +#else + act = (ETHR_AINT_T__) AO_load_acquire(&var->counter); +#endif + } while (act == exp); + return act; +#endif +} + +#endif + +#if defined(AO_HAVE_compare_and_swap_write) || defined(AO_HAVE_fetch_compare_and_swap_write) + +#if ETHR_SIZEOF_AO_T == 4 +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG_WB 1 +#else +# define ETHR_HAVE_ETHR_NATIVE_ATOMIC64_CMPXCHG_WB 1 +#endif + +static ETHR_INLINE ETHR_AINT_T__ +ETHR_NATMC_FUNC__(cmpxchg_wb)(ETHR_ATMC_T__ *var, + ETHR_AINT_T__ new, + ETHR_AINT_T__ exp) +{ +#if defined(AO_HAVE_fetch_compare_and_swap_write) + return (ETHR_AINT_T__) AO_fetch_compare_and_swap_write(&var->counter, + (AO_t) exp, + (AO_t) new); +#else + ETHR_AINT_T__ act; + do { + if (AO_compare_and_swap_write(&var->counter, (AO_t) exp, (AO_t) new)) + return exp; +#ifdef AO_HAVE_load + act = (ETHR_AINT_T__) AO_load(&var->counter); +#else + act = (ETHR_AINT_T__) AO_load_acquire(&var->counter); +#endif } while (act == exp); return act; +#endif } #endif diff --git a/erts/include/internal/libatomic_ops/ethr_dw_atomic.h b/erts/include/internal/libatomic_ops/ethr_dw_atomic.h new file mode 100644 index 0000000000..4dd9f41e96 --- /dev/null +++ b/erts/include/internal/libatomic_ops/ethr_dw_atomic.h @@ -0,0 +1,567 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2014. All Rights Reserved. + * + * The contents of this file are subject to the Erlang Public License, + * Version 1.1, (the "License"); you may not use this file except in + * compliance with the License. You should have received a copy of the + * Erlang Public License along with this software. If not, it can be + * retrieved online at http://www.erlang.org/. + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. 
+ * + * %CopyrightEnd% + */ + +/* + * Description: Native double word atomics using libatomic_ops + * Author: Rickard Green + */ + +#ifndef ETHR_LIBATOMIC_OPS_DW_ATOMIC_H__ +#define ETHR_LIBATOMIC_OPS_DW_ATOMIC_H__ + +#if defined(AO_HAVE_double_t) \ + && (defined(AO_HAVE_double_load_acquire) \ + || defined(AO_HAVE_double_load)) \ + && (defined(AO_HAVE_compare_double_and_swap_double) \ + || defined(AO_HAVE_compare_double_and_swap_double_full) \ + || defined(AO_HAVE_compare_double_and_swap_double_acquire) \ + || defined(AO_HAVE_compare_double_and_swap_double_release) \ + || defined(AO_HAVE_double_compare_and_swap) \ + || defined(AO_HAVE_double_compare_and_swap_full) \ + || defined(AO_HAVE_double_compare_and_swap_acquire) \ + || defined(AO_HAVE_double_compare_and_swap_release)) + +#if ETHR_SIZEOF_PTR == 4 +# define ETHR_NATIVE_SU_DW_SINT_T ethr_sint64_t +#elif ETHR_SIZEOF_PTR == 8 && defined(ETHR_HAVE_INT128_T) +# define ETHR_NATIVE_SU_DW_SINT_T ethr_sint128_t +#endif + +typedef union { + volatile AO_double_t dw_mem; +#if defined(ETHR_NATIVE_SU_DW_SINT_T) + ETHR_NATIVE_SU_DW_SINT_T su_dw_sint; +#endif +} ethr_native_dw_atomic_t; + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_NATIVE_SU_DW_ATOMIC +#else +# define ETHR_HAVE_NATIVE_DW_ATOMIC +#endif + +#define ETHR_NATIVE_DW_ATOMIC_IMPL ETHR_NATIVE_IMPL__ + +#if defined(ETHR_TRY_INLINE_FUNCS) || defined(ETHR_ATOMIC_IMPL__) + + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_NDWA_FUNC__(Func) ethr_native_su_dw_atomic_ ## Func +# define ETHR_NDWA_RET_3_TYPE__ ETHR_NATIVE_SU_DW_SINT_T +# define ETHR_NDWA_RET_2_TYPE__ ETHR_NATIVE_SU_DW_SINT_T +# define ETHR_NDWA_VAL_ARG_TYPE__ ETHR_NATIVE_SU_DW_SINT_T +# define ETHR_NDWA_DECL_ARG__(Arg) +# if defined(AO_HAVE_DOUBLE_PTR_STORAGE) +# define ETHR_NDWA_VAL2AOVAL__(AOV, V) \ + ((AOV).AO_whole = (double_ptr_storage) (V)) +# define ETHR_NDWA_AOVAL2VAL__(AOV, V) \ + ((V) = (ETHR_NATIVE_SU_DW_SINT_T) (AOV).AO_whole) +# define ETHR_NDWA_RETURN_VAL_3__(SUCCESS, AOVAL, VAL) \ + do { \ + return (ETHR_NATIVE_SU_DW_SINT_T) (AOVAL).AO_whole; \ + } while (0) +# define ETHR_NDWA_RETURN_VAL_2__(AOVAL, VAL) \ + do { \ + return (ETHR_NATIVE_SU_DW_SINT_T) (AOVAL).AO_whole; \ + } while (0) +# define ETHR_NDWA_AOVAL_EQ__(AOV1, AOV2) \ + ((AOV1).AO_whole == (AOV2).AO_whole) +# else +typedef union { + ethr_sint_t sint[2]; + ETHR_NATIVE_SU_DW_SINT_T dw_sint; +} ethr_dw_splitter_t; +# define ETHR_NDWA_VAL2AOVAL__(AOV, V) \ + do { \ + ethr_dw_splitter_t tmp__; \ + tmp__.dw_sint = (V); \ + (AOV).AO_val1 = (AO_t) tmp__.sint[0]; \ + (AOV).AO_val2 = (AO_t) tmp__.sint[1]; \ + } while (0) +# define ETHR_NDWA_AOVAL2VAL__(AOV, V) \ + do { \ + ethr_dw_splitter_t tmp__; \ + tmp__.sint[0] = (ethr_sint_t) (AOV).AO_val1; \ + tmp__.sint[1] = (ethr_sint_t) (AOV).AO_val2; \ + (V) = tmp__.dw_sint; \ + } while (0) +# define ETHR_NDWA_RETURN_VAL_3__(SUCCESS, AOVAL, VAL) \ + do { \ + ethr_dw_splitter_t tmp__; \ + tmp__.sint[0] = (ethr_sint_t) (AOVAL).AO_val1; \ + tmp__.sint[1] = (ethr_sint_t) (AOVAL).AO_val2; \ + return tmp__.dw_sint; \ + } while (0) +# define ETHR_NDWA_AOVAL_EQ__(AOV1, AOV2) \ + ((AOV1).AO_val1 == (AOV2).AO_val1 \ + && (AOV1).AO_val2 == (AOV2).AO_val2) +# endif +#else +# define ETHR_NDWA_FUNC__(Func) ethr_native_dw_atomic_ ## Func +# define ETHR_NDWA_RET_3_TYPE__ int +# define ETHR_NDWA_RET_2_TYPE__ void +# define ETHR_NDWA_VAL_ARG_TYPE__ ethr_sint_t * +# define ETHR_NDWA_DECL_ARG__(Arg) , ETHR_NDWA_VAL_ARG_TYPE__ Arg +# define ETHR_NDWA_VAL2AOVAL__(AOV, V) \ + do { \ + (AOV).AO_val1 = (AO_t) (V)[0]; \ + 
(AOV).AO_val2 = (AO_t) (V)[1]; \ + } while (0) +# define ETHR_NDWA_AOVAL2VAL__(AOV, V) \ + do { \ + ethr_dw_splitter_t tmp__; \ + (V)[0] = (ethr_sint_t) (AOV).AO_val1; \ + (V)[1] = (ethr_sint_t) (AOV).AO_val2; \ + } while (0) +# define ETHR_NDWA_RETURN_VAL_3__(SUCCESS, AOVAL, VAL) \ + do { \ + (VAL)[0] = (ethr_sint_t) (AOVAL).AO_val1; \ + (VAL)[1] = (ethr_sint_t) (AOVAL).AO_val2; \ + return (SUCCESS); \ + } while (0) +# define ETHR_NDWA_RETURN_VAL_2__(AOVAL, VAL) \ + do { \ + (VAL)[0] = (ethr_sint_t) (AOVAL).AO_val1; \ + (VAL)[1] = (ethr_sint_t) (AOVAL).AO_val2; \ + return; \ + } while (0) +# if defined(AO_HAVE_DOUBLE_PTR_STORAGE) +# define ETHR_NDWA_AOVAL_EQ__(AOV1, AOV2) \ + ((AOV1).AO_whole == (AOV2).AO_whole) +# else +# define ETHR_NDWA_AOVAL_EQ__(AOV1, AOV2) \ + ((AOV1).AO_val1 == (AOV2).AO_val1 \ + && (AOV1).AO_val2 == (AOV2).AO_val2) +# endif +#endif + +#define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_ADDR +static ETHR_INLINE ethr_sint_t * +ethr_native_dw_atomic_addr(ethr_native_dw_atomic_t *var) +{ + return (ethr_sint_t *) &var->dw_mem; +} + +#ifdef AO_HAVE_double_load + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_READ +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_READ +#endif + +static ETHR_INLINE ETHR_NDWA_RET_2_TYPE__ +ETHR_NDWA_FUNC__(read)(ethr_native_dw_atomic_t *var + ETHR_NDWA_DECL_ARG__(val)) +{ + AO_double_t act = AO_double_load(&var->dw_mem); + ETHR_NDWA_RETURN_VAL_2__(act, val); +} + +#endif + +#ifdef AO_HAVE_double_load_read + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_READ_RB +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_READ_RB +#endif + +static ETHR_INLINE ETHR_NDWA_RET_2_TYPE__ +ETHR_NDWA_FUNC__(read_rb)(ethr_native_dw_atomic_t *var + ETHR_NDWA_DECL_ARG__(val)) +{ + AO_double_t act = AO_double_load_read(&var->dw_mem); + ETHR_NDWA_RETURN_VAL_2__(act, val); +} + +#endif + +#ifdef AO_HAVE_double_load_acquire + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_READ_ACQB +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_READ_ACQB +#endif + +static ETHR_INLINE ETHR_NDWA_RET_2_TYPE__ +ETHR_NDWA_FUNC__(read_acqb)(ethr_native_dw_atomic_t *var + ETHR_NDWA_DECL_ARG__(val)) +{ + AO_double_t act = AO_double_load_acquire(&var->dw_mem); + ETHR_NDWA_RETURN_VAL_2__(act, val); +} + +#endif + +#ifdef AO_HAVE_double_store + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_SET +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_SET +#endif + +static ETHR_INLINE void +ETHR_NDWA_FUNC__(set)(ethr_native_dw_atomic_t *var, + ETHR_NDWA_VAL_ARG_TYPE__ val) +{ + AO_double_t new; + ETHR_NDWA_VAL2AOVAL__(new, val); + AO_double_store(&var->dw_mem, new); +} + +#endif + +#ifdef AO_HAVE_double_store_write + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_SET_WB +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_SET_WB +#endif + +static ETHR_INLINE void +ETHR_NDWA_FUNC__(set_wb)(ethr_native_dw_atomic_t *var, + ETHR_NDWA_VAL_ARG_TYPE__ val) +{ + AO_double_t new; + ETHR_NDWA_VAL2AOVAL__(new, val); + AO_double_store_write(&var->dw_mem, new); +} + +#endif + +#ifdef AO_HAVE_double_store_release + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_SET_RELB +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_SET_RELB +#endif + +static ETHR_INLINE void +ETHR_NDWA_FUNC__(set_relb)(ethr_native_dw_atomic_t *var, + ETHR_NDWA_VAL_ARG_TYPE__ val) +{ + AO_double_t new; + ETHR_NDWA_VAL2AOVAL__(new, val); + 
AO_double_store_release(&var->dw_mem, new); +} + +#endif + +#if defined(AO_HAVE_double_compare_and_swap_full) || defined(AO_HAVE_compare_double_and_swap_double_full) + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_CMPXCHG_MB +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_CMPXCHG_MB +#endif + +static ETHR_INLINE ETHR_NDWA_RET_3_TYPE__ +ETHR_NDWA_FUNC__(cmpxchg_mb)(ethr_native_dw_atomic_t *var, + ETHR_NDWA_VAL_ARG_TYPE__ new, + ETHR_NDWA_VAL_ARG_TYPE__ exp) +{ + AO_double_t ao_act, ao_new, ao_exp; + + ETHR_NDWA_VAL2AOVAL__(ao_exp, exp); + ETHR_NDWA_VAL2AOVAL__(ao_new, new); + + do { + int xchgd; +#if defined(AO_HAVE_double_compare_and_swap_full) + xchgd = AO_double_compare_and_swap_full(&var->dw_mem, ao_exp, ao_new); +#elif defined(AO_HAVE_compare_double_and_swap_double_full) + xchgd = AO_compare_double_and_swap_double_full(&var->dw_mem, + ao_exp.AO_val1, + ao_exp.AO_val2, + ao_new.AO_val1, + ao_new.AO_val2); +#endif + + if (xchgd) + ETHR_NDWA_RETURN_VAL_3__(1, ao_exp, exp); + +#ifdef AO_HAVE_double_load_acquire + ao_act = AO_double_load_acquire(&var->dw_mem); +#else + ao_act = AO_double_load(&var->dw_mem); +#endif + + } while (ETHR_NDWA_AOVAL_EQ__(ao_exp, ao_act)); + +#ifndef AO_HAVE_double_load_acquire + AO_nop_full(); +#endif + + ETHR_NDWA_RETURN_VAL_3__(1, ao_act, exp); +} + +#endif + +#if defined(AO_HAVE_double_compare_and_swap) || defined(AO_HAVE_compare_double_and_swap_double) + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_CMPXCHG +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_CMPXCHG +#endif + +static ETHR_INLINE ETHR_NDWA_RET_3_TYPE__ +ETHR_NDWA_FUNC__(cmpxchg)(ethr_native_dw_atomic_t *var, + ETHR_NDWA_VAL_ARG_TYPE__ new, + ETHR_NDWA_VAL_ARG_TYPE__ exp) +{ + AO_double_t ao_act, ao_new, ao_exp; + + ETHR_NDWA_VAL2AOVAL__(ao_exp, exp); + ETHR_NDWA_VAL2AOVAL__(ao_new, new); + + do { + int xchgd; +#if defined(AO_HAVE_double_compare_and_swap) + xchgd = AO_double_compare_and_swap(&var->dw_mem, ao_exp, ao_new); +#elif defined(AO_HAVE_compare_double_and_swap_double) + xchgd = AO_compare_double_and_swap_double(&var->dw_mem, + ao_exp.AO_val1, + ao_exp.AO_val2, + ao_new.AO_val1, + ao_new.AO_val2); +#endif + + if (xchgd) + ETHR_NDWA_RETURN_VAL_3__(1, ao_exp, exp); + +#ifdef AO_HAVE_double_load + ao_act = AO_double_load(&var->dw_mem); +#else + ao_act = AO_double_load_acquire(&var->dw_mem); +#endif + + } while (ETHR_NDWA_AOVAL_EQ__(ao_exp, ao_act)); + + ETHR_NDWA_RETURN_VAL_3__(1, ao_act, exp); +} + +#endif + +#if defined(AO_HAVE_double_compare_and_swap_read) || defined(AO_HAVE_compare_double_and_swap_double_read) + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_CMPXCHG_RB +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_CMPXCHG_RB +#endif + +static ETHR_INLINE ETHR_NDWA_RET_3_TYPE__ +ETHR_NDWA_FUNC__(cmpxchg_rb)(ethr_native_dw_atomic_t *var, + ETHR_NDWA_VAL_ARG_TYPE__ new, + ETHR_NDWA_VAL_ARG_TYPE__ exp) +{ + AO_double_t ao_act, ao_new, ao_exp; + + ETHR_NDWA_VAL2AOVAL__(ao_exp, exp); + ETHR_NDWA_VAL2AOVAL__(ao_new, new); + + do { + int xchgd; +#if defined(AO_HAVE_double_compare_and_swap_read) + xchgd = AO_double_compare_and_swap_read(&var->dw_mem, ao_exp, ao_new); +#elif defined(AO_HAVE_compare_double_and_swap_double_read) + xchgd = AO_compare_double_and_swap_double_read(&var->dw_mem, + ao_exp.AO_val1, + ao_exp.AO_val2, + ao_new.AO_val1, + ao_new.AO_val2); +#endif + + if (xchgd) + ETHR_NDWA_RETURN_VAL_3__(1, ao_exp, exp); + +#if defined(AO_HAVE_double_load_read) + ao_act = 
AO_double_load_read(&var->dw_mem); +#elif defined(AO_HAVE_double_load) + ao_act = AO_double_load(&var->dw_mem); +#else + ao_act = AO_double_load_acquire(&var->dw_mem); +#endif + + } while (ETHR_NDWA_AOVAL_EQ__(ao_exp, ao_act)); + +#ifndef AO_HAVE_double_load_read +#ifdef AO_HAVE_nop_read + AO_nop_read(); +#else + AO_nop_full(); +#endif +#endif + + ETHR_NDWA_RETURN_VAL_3__(1, ao_act, exp); +} + +#endif + +#if defined(AO_HAVE_double_compare_and_swap_acquire) || defined(AO_HAVE_compare_double_and_swap_double_acquire) + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_CMPXCHG_ACQB +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_CMPXCHG_ACQB +#endif + +static ETHR_INLINE ETHR_NDWA_RET_3_TYPE__ +ETHR_NDWA_FUNC__(cmpxchg_acqb)(ethr_native_dw_atomic_t *var, + ETHR_NDWA_VAL_ARG_TYPE__ new, + ETHR_NDWA_VAL_ARG_TYPE__ exp) +{ + AO_double_t ao_act, ao_new, ao_exp; + + ETHR_NDWA_VAL2AOVAL__(ao_exp, exp); + ETHR_NDWA_VAL2AOVAL__(ao_new, new); + + do { + int xchgd; +#if defined(AO_HAVE_double_compare_and_swap_acquire) + xchgd = AO_double_compare_and_swap_acquire(&var->dw_mem, ao_exp, ao_new); +#elif defined(AO_HAVE_compare_double_and_swap_double_acquire) + xchgd = AO_compare_double_and_swap_double_acquire(&var->dw_mem, + ao_exp.AO_val1, + ao_exp.AO_val2, + ao_new.AO_val1, + ao_new.AO_val2); +#endif + + if (xchgd) + ETHR_NDWA_RETURN_VAL_3__(1, ao_exp, exp); + +#ifdef AO_HAVE_double_load_acquire + ao_act = AO_double_load_acquire(&var->dw_mem); +#else + ao_act = AO_double_load(&var->dw_mem); +#endif + + } while (ETHR_NDWA_AOVAL_EQ__(ao_exp, ao_act)); + +#ifndef AO_HAVE_double_load_acquire + AO_nop_full(); +#endif + + ETHR_NDWA_RETURN_VAL_3__(1, ao_act, exp); +} + +#endif + +#if defined(AO_HAVE_double_compare_and_swap_write) || defined(AO_HAVE_compare_double_and_swap_double_write) + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_CMPXCHG_WB +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_CMPXCHG_WB +#endif + +static ETHR_INLINE ETHR_NDWA_RET_3_TYPE__ +ETHR_NDWA_FUNC__(cmpxchg_wb)(ethr_native_dw_atomic_t *var, + ETHR_NDWA_VAL_ARG_TYPE__ new, + ETHR_NDWA_VAL_ARG_TYPE__ exp) +{ + AO_double_t ao_act, ao_new, ao_exp; + + ETHR_NDWA_VAL2AOVAL__(ao_exp, exp); + ETHR_NDWA_VAL2AOVAL__(ao_new, new); + + do { + int xchgd; +#if defined(AO_HAVE_double_compare_and_swap_write) + xchgd = AO_double_compare_and_swap_write(&var->dw_mem, ao_exp, ao_new); +#elif defined(AO_HAVE_compare_double_and_swap_double_write) + xchgd = AO_compare_double_and_swap_double_write(&var->dw_mem, + ao_exp.AO_val1, + ao_exp.AO_val2, + ao_new.AO_val1, + ao_new.AO_val2); +#endif + + if (xchgd) + ETHR_NDWA_RETURN_VAL_3__(1, ao_exp, exp); + +#ifdef AO_HAVE_double_load + ao_act = AO_double_load(&var->dw_mem); +#else + ao_act = AO_double_load_acquire(&var->dw_mem); +#endif + + } while (ETHR_NDWA_AOVAL_EQ__(ao_exp, ao_act)); + + ETHR_NDWA_RETURN_VAL_3__(1, ao_act, exp); +} + +#endif + +#if defined(AO_HAVE_double_compare_and_swap_release) || defined(AO_HAVE_compare_double_and_swap_double_release) + +#if defined(ETHR_NATIVE_SU_DW_SINT_T) +# define ETHR_HAVE_ETHR_NATIVE_SU_DW_ATOMIC_CMPXCHG_RELB +#else +# define ETHR_HAVE_ETHR_NATIVE_DW_ATOMIC_CMPXCHG_RELB +#endif + +static ETHR_INLINE ETHR_NDWA_RET_3_TYPE__ +ETHR_NDWA_FUNC__(cmpxchg_relb)(ethr_native_dw_atomic_t *var, + ETHR_NDWA_VAL_ARG_TYPE__ new, + ETHR_NDWA_VAL_ARG_TYPE__ exp) +{ + AO_double_t ao_act, ao_new, ao_exp; + + ETHR_NDWA_VAL2AOVAL__(ao_exp, exp); + ETHR_NDWA_VAL2AOVAL__(ao_new, new); + + do { + int xchgd; +#if 
defined(AO_HAVE_double_compare_and_swap_release) + xchgd = AO_double_compare_and_swap_release(&var->dw_mem, ao_exp, ao_new); +#elif defined(AO_HAVE_compare_double_and_swap_double_release) + xchgd = AO_compare_double_and_swap_double_release(&var->dw_mem, + ao_exp.AO_val1, + ao_exp.AO_val2, + ao_new.AO_val1, + ao_new.AO_val2); +#endif + + if (xchgd) + ETHR_NDWA_RETURN_VAL_3__(1, ao_exp, exp); + + ao_act = AO_double_load(&var->dw_mem); + + } while (ETHR_NDWA_AOVAL_EQ__(ao_exp, ao_act)); + + ETHR_NDWA_RETURN_VAL_3__(1, ao_act, exp); +} + +#endif + +#endif /* defined(ETHR_TRY_INLINE_FUNCS) || defined(ETHR_ATOMIC_IMPL__) */ + +#endif /* Have AO double functionality ... */ + +#endif /* ETHR_LIBATOMIC_OPS_DW_ATOMIC_H__ */ + diff --git a/erts/include/internal/libatomic_ops/ethread.h b/erts/include/internal/libatomic_ops/ethread.h index e1fdd588bb..d65ee19b04 100644 --- a/erts/include/internal/libatomic_ops/ethread.h +++ b/erts/include/internal/libatomic_ops/ethread.h @@ -33,9 +33,12 @@ #define AO_USE_PENTIUM4_INSTRS #endif +#define ETHR_NATIVE_IMPL__ "libatomic_ops" + #include "atomic_ops.h" #include "ethr_membar.h" #include "ethr_atomic.h" +#include "ethr_dw_atomic.h" #endif diff --git a/erts/include/internal/ppc32/atomic.h b/erts/include/internal/ppc32/atomic.h index 6001620677..b558626b09 100644 --- a/erts/include/internal/ppc32/atomic.h +++ b/erts/include/internal/ppc32/atomic.h @@ -91,6 +91,20 @@ ethr_native_atomic32_add_return_acqb(ethr_native_atomic32_t *var, ethr_sint32_t return res; } + +#ifndef ETHR_PPC_HAVE_NO_LWSYNC + +#define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_ADD_RETURN_RELB 1 + +static ETHR_INLINE ethr_sint32_t +ethr_native_atomic32_add_return_relb(ethr_native_atomic32_t *var, ethr_sint32_t incr) +{ + ethr_lwsync__(); + return ethr_native_atomic32_add_return(var, incr); +} + +#endif + #define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_INC_RETURN 1 static ETHR_INLINE ethr_sint32_t @@ -120,7 +134,19 @@ ethr_native_atomic32_inc_return_acqb(ethr_native_atomic32_t *var) __asm__ __volatile("isync\n\t" : : : "memory"); return res; } - + +#ifndef ETHR_PPC_HAVE_NO_LWSYNC + +#define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_INC_RETURN_RELB 1 + +static ETHR_INLINE ethr_sint32_t +ethr_native_atomic32_inc_return_relb(ethr_native_atomic32_t *var) +{ + ethr_lwsync__(); + return ethr_native_atomic32_inc_return(var); +} + +#endif #define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_DEC_RETURN 1 @@ -152,6 +178,19 @@ ethr_native_atomic32_dec_return_acqb(ethr_native_atomic32_t *var) return res; } +#ifndef ETHR_PPC_HAVE_NO_LWSYNC + +#define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_DEC_RETURN_RELB 1 + +static ETHR_INLINE ethr_sint32_t +ethr_native_atomic32_dec_return_relb(ethr_native_atomic32_t *var) +{ + ethr_lwsync__(); + return ethr_native_atomic32_dec_return(var); +} + +#endif + #define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_AND_RETOLD 1 static ETHR_INLINE ethr_sint32_t @@ -182,6 +221,19 @@ ethr_native_atomic32_and_retold_acqb(ethr_native_atomic32_t *var, ethr_sint32_t return res; } +#ifndef ETHR_PPC_HAVE_NO_LWSYNC + +#define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_AND_RETOLD_RELB 1 + +static ETHR_INLINE ethr_sint32_t +ethr_native_atomic32_and_retold_relb(ethr_native_atomic32_t *var, ethr_sint32_t mask) +{ + ethr_lwsync__(); + return ethr_native_atomic32_and_retold(var, mask); +} + +#endif + #define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_OR_RETOLD 1 static ETHR_INLINE ethr_sint32_t @@ -212,6 +264,18 @@ ethr_native_atomic32_or_retold_acqb(ethr_native_atomic32_t *var, ethr_sint32_t m return res; } +#ifndef ETHR_PPC_HAVE_NO_LWSYNC + +#define 
ETHR_HAVE_ETHR_NATIVE_ATOMIC32_OR_RETOLD_RELB 1 + +static ETHR_INLINE ethr_sint32_t +ethr_native_atomic32_or_retold_relb(ethr_native_atomic32_t *var, ethr_sint32_t mask) +{ + ethr_lwsync__(); + return ethr_native_atomic32_or_retold(var, mask); +} + +#endif #define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_XCHG 1 @@ -242,6 +306,19 @@ ethr_native_atomic32_xchg_acqb(ethr_native_atomic32_t *var, ethr_sint32_t val) return res; } +#ifndef ETHR_PPC_HAVE_NO_LWSYNC + +#define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_XCHG_RELB 1 + +static ETHR_INLINE ethr_sint32_t +ethr_native_atomic32_xchg_relb(ethr_native_atomic32_t *var, ethr_sint32_t val) +{ + ethr_lwsync__(); + return ethr_native_atomic32_xchg(var, val); +} + +#endif + #define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG 1 static ETHR_INLINE ethr_sint32_t @@ -291,6 +368,73 @@ ethr_native_atomic32_cmpxchg_acqb(ethr_native_atomic32_t *var, return old; } +#if !defined(ETHR_DISABLE_LWSYNC_FOR_CMPXCHG_RELB) && !defined(ETHR_PPC_HAVE_NO_LWSYNC) + +#define ETHR_HAVE_ETHR_NATIVE_ATOMIC32_CMPXCHG_RELB 1 + +static ETHR_INLINE ethr_sint32_t +ethr_native_atomic32_cmpxchg_relb(ethr_native_atomic32_t *var, + ethr_sint32_t new, + ethr_sint32_t expected) +{ + ethr_sint32_t actual; + + /* + * We want to implement the release barrier using the + * 'lwsync' instruction instead of using the more + * expensive 'sync' instruction. + * + * cmpxchg looks something like this: + * + * lwarx # Load + * ... + * if (fail) + * goto done; + * stwcx # Store + * if (fail) + * goto done; + * ... + * + * In the case we succeeded, 'lwsync' will have + * ordered all previously issued loads and stores + * against the successful store to this variable. + * That is everything is fine! + * + * In the case we did not succeed, we need to order + * all previously issued loads and stores against + * the load of this variable. 'lwsync' does not + * guarantee this. In order to solve this we issue + * a 'sync' and redo the load. If the value has + * changed to what the user passed as expected value + * we need to try the cmpxchg operation again, since + * this value indicates success. + */ + + ethr_lwsync__(); + + actual = ethr_native_atomic32_cmpxchg(var, new, expected); + +#ifndef ETHR_PPC_HAVE_LWSYNC + /* We checked for lwsync support in runtime... */ + if (ETHR_PPC_RUNTIME_CONF_HAVE_NO_LWSYNC__) + return actual; /* No need to; ethr_lwsync__() issued a sync... */ +#endif + + /* ethr_lwsync__() issued an lwsync... */ + if (actual == expected) + return actual; /* Successful operation */ + + /* Failure... need to issue a sync... */ + ethr_sync__(); + actual = ethr_native_atomic32_read(var); + if (actual != expected) + return actual; /* Fail... */ + /* Try again... */ + return ethr_native_atomic32_cmpxchg(var, new, expected); +} + +#endif + #endif /* ETHR_TRY_INLINE_FUNCS */ #endif /* ETHREAD_PPC_ATOMIC_H */ |
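
Note (illustrative sketch, not part of the patch above): several of the cmpxchg wrappers introduced in ethr_atomic.h and ethr_dw_atomic.h have to bridge a mismatch between the two APIs involved. The libatomic_ops AO_compare_and_swap* primitives only report success or failure, while the ethread native cmpxchg operations must return the value actually observed when the swap fails. The pattern used throughout the patch is therefore: attempt the CAS; on failure reload the variable and retry as long as the reloaded value still equals the expected one (the failure may merely reflect a transient change); once a definitely different value is observed, return it. The standalone C program below shows that pattern in isolation. It uses C11 atomics purely as a stand-in for the AO_* calls, and the function and variable names are invented for the example.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Emulate a "return the old value" cmpxchg on top of a CAS primitive
 * that only reports success/failure, mirroring the retry loops in the
 * AO_compare_and_swap-based wrappers of ethr_atomic.h/ethr_dw_atomic.h. */
static intptr_t
cmpxchg_return_old(_Atomic intptr_t *var, intptr_t new, intptr_t exp)
{
    intptr_t act;
    do {
        intptr_t e = exp;
        /* success/failure-only CAS, analogous to AO_compare_and_swap() */
        if (atomic_compare_exchange_strong(var, &e, new))
            return exp;                 /* swapped; old value was 'exp' */
        act = atomic_load(var);         /* reload, analogous to AO_load() */
    } while (act == exp);               /* looked like 'exp' again: retry */
    return act;                         /* definite failure: report old value */
}

int main(void)
{
    _Atomic intptr_t v = 17;
    /* succeeds: returns the old value 17, variable becomes 42 */
    printf("old=%ld now=%ld\n",
           (long) cmpxchg_return_old(&v, 42, 17), (long) atomic_load(&v));
    /* fails: returns the current value 42, variable is left unchanged */
    printf("old=%ld now=%ld\n",
           (long) cmpxchg_return_old(&v, 7, 17), (long) atomic_load(&v));
    return 0;
}

Built as an ordinary C11 program (e.g. cc -std=c11), the first call swaps and reports 17, the second fails and reports 42 without modifying the variable. The read/acquire/release-barrier variants in the patch follow the same loop, differing only in which AO_* CAS, load and AO_nop_* barrier they pick via the AO_HAVE_* feature-test macros.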