22 files changed, 1563 insertions, 407 deletions
diff --git a/erts/configure.in b/erts/configure.in
index 400d91154a..4a4f478521 100644
--- a/erts/configure.in
+++ b/erts/configure.in
@@ -422,6 +422,56 @@ if test X"$with_ets_write_concurrency_locks" != X""; then
 		      [Define to override the default number of write_concurrency locks])
 fi
 
+AC_ARG_WITH(spectre-mitigation,
+    AS_HELP_STRING([--with-spectre-mitigation={yes|incomplete}],
+                   [enable spectre mitigation, either fully or with mitigations
+                    disabled in a handful places like the interpreter])
+    AS_HELP_STRING([--without-spectre-mitigation],
+                   [build without spectre mitigation]),
+    [],[with_spectre_mitigation=no])
+
+case "$with_spectre_mitigation" in
+    no) ;;
+    yes) ;;
+    incomplete) ;;
+    *) AC_MSG_ERROR([Invalid spectre mitigation setting]) ;;
+esac
+
+i_noretpoline_attr=""
+
+if test X"$with_spectre_mitigation" != X"no"; then
+    CFLAGS="$CFLAGS -mindirect-branch=thunk"
+
+    AC_MSG_CHECKING([for spectre mitigation])
+    AC_COMPILE_IFELSE(
+        [AC_LANG_PROGRAM([],[return 0;])],
+        [AC_MSG_RESULT([yes])],
+        [AC_MSG_ERROR([no])])
+
+    if test X"$with_spectre_mitigation" = X"incomplete"; then
+        # gcc and clang support this attribute if they're recent enough. Note
+        # that we must compile with -Werror to check for actual support as they
+        # warn rather than error out on unsupported attributes.
+
+        i_noretpoline_attr='__attribute__((__indirect_branch__("keep")))'
+        i_preserve_cflags="$CFLAGS"
+        CFLAGS="$CFLAGS -Werror"
+
+        AC_MSG_CHECKING([whether spectre mitigation can be disabled on a per-function basis])
+        AC_COMPILE_IFELSE(
+            [AC_LANG_PROGRAM([$i_noretpoline_attr],[return 0;])],
+            [AC_MSG_RESULT([yes])],
+            [AC_MSG_ERROR([no])])
+
+        CFLAGS="$i_preserve_cflags"
+    fi
+fi
+
+AC_DEFINE_UNQUOTED(ERTS_NO_RETPOLINE, $i_noretpoline_attr,
+                   [Per-function attribute for disabling retpoline. This is
+                    *only* defined when --with-spectre-mitigation=incomplete
+                    and has no effects otherwise])
+
 dnl ----------------------------------------------------------------------
 dnl Checks for programs.
 dnl ----------------------------------------------------------------------
diff --git a/erts/doc/src/erl_dist_protocol.xml b/erts/doc/src/erl_dist_protocol.xml
index 610351db6c..79f703455a 100644
--- a/erts/doc/src/erl_dist_protocol.xml
+++ b/erts/doc/src/erl_dist_protocol.xml
@@ -829,6 +829,14 @@ DiB == gen_digest(ChA, ICA)?
         <item>
           <p>The node understand UTF-8 encoded atoms.</p>
         </item>
+        <tag><c>-define(DFLAG_MAP_TAG, 16#20000).</c></tag>
+        <item>
+          <p>The node understand the map tag.</p>
+        </item>
+        <tag><c>-define(DFLAG_BIG_CREATION, 16#40000).</c></tag>
+        <item>
+          <p>The node understand big node creation.</p>
+        </item>
       </taglist>
     </section>
   </section>
diff --git a/erts/doc/src/erl_ext_dist.xml b/erts/doc/src/erl_ext_dist.xml
index b7090d0472..5813af1d57 100644
--- a/erts/doc/src/erl_ext_dist.xml
+++ b/erts/doc/src/erl_ext_dist.xml
@@ -386,44 +386,6 @@
   </section>
 
     <section>
-      <marker id="REFERENCE_EXT"/>
-      <title>REFERENCE_EXT</title>
-      <table align="left">
-        <row>
-          <cell align="center">1</cell>
-          <cell align="center">N</cell>
-          <cell align="center">4</cell>
-          <cell align="center">1</cell>
-        </row>
-        <row>
-          <cell align="center"><c>101</c></cell>
-          <cell align="center"><c>Node</c></cell>
-          <cell align="center"><c>ID</c></cell>
-          <cell align="center"><c>Creation</c></cell>
-        </row>
-      <tcaption>REFERENCE_EXT</tcaption></table>
-      <p>
-        Encodes a reference object (an object generated with
-        <seealso marker="erlang:make_ref/0">erlang:make_ref/0</seealso>).
-        The <c>Node</c> term is an encoded atom, that is,
-        <seealso marker="#ATOM_UTF8_EXT"><c>ATOM_UTF8_EXT</c></seealso>, 
-        <seealso marker="#SMALL_ATOM_UTF8_EXT"><c>SMALL_ATOM_UTF8_EXT</c></seealso>, or
-        <seealso marker="#ATOM_CACHE_REF"><c>ATOM_CACHE_REF</c></seealso>. 
-        The <c>ID</c> field contains a big-endian unsigned integer,
-        but <em>is to be regarded as uninterpreted data</em>,
-        as this field is node-specific.
-        <c>Creation</c> is a byte containing a node serial number, which
-        makes it possible to separate old (crashed) nodes from a new one.
-      </p>
-      <p>
-        In <c>ID</c>, only 18 bits are significant; the rest are to be 0.
-        In <c>Creation</c>, only two bits are significant; the rest are to be 0.
-        See <seealso marker="#NEW_REFERENCE_EXT">
-        <c>NEW_REFERENCE_EXT</c></seealso>.
-      </p>
-    </section>
-
-    <section>
       <marker id="PORT_EXT"/>
       <title>PORT_EXT</title>
 	<table align="left">
@@ -441,13 +403,46 @@
 	  </row>
 	<tcaption>PORT_EXT</tcaption></table>
 	<p>
-	  Encodes a port object (obtained from
-          <seealso marker="erlang:open_port/2">
-          <c>erlang:open_port/2</c></seealso>).
-	  The <c>ID</c> is a node-specific identifier for a local port.
+	  Same as <seealso marker="#NEW_PORT_EXT"><c>NEW_PORT_EXT</c></seealso>
+	  except the <c>Creation</c> field is only one byte and only two
+	  bits are significant, the rest are to be 0.
+	</p>
+    </section>
+
+    <section>
+      <marker id="NEW_PORT_EXT"/>
+      <title>NEW_PORT_EXT</title>
+	<table align="left">
+	  <row>
+	    <cell align="center">1</cell>
+	    <cell align="center">N</cell>
+	    <cell align="center">4</cell>
+	    <cell align="center">4</cell>
+	  </row>
+	  <row>
+	    <cell align="center"><c>89</c></cell>
+	    <cell align="center"><c>Node</c></cell>
+	    <cell align="center"><c>ID</c></cell>
+	    <cell align="center"><c>Creation</c></cell>
+	  </row>
+	<tcaption>NEW_PORT_EXT</tcaption></table>
+	<p>
+	  Encodes a port identifier (obtained from
+          <seealso marker="erlang#open_port/2"><c>erlang:open_port/2</c></seealso>).
+	  <c>Node</c> is an encoded atom, that is,
+	  <seealso marker="#ATOM_UTF8_EXT"><c>ATOM_UTF8_EXT</c></seealso>,
+	  <seealso marker="#SMALL_ATOM_UTF8_EXT"><c>SMALL_ATOM_UTF8_EXT</c></seealso>
+	  or <seealso marker="#ATOM_CACHE_REF"><c>ATOM_CACHE_REF</c></seealso>.
+	  <c>ID</c> is a 32-bit big endian unsigned integer. Only 28 bits are
+	  significant; the rest are to be 0. The <c>Creation</c> works just like in
+	  <seealso marker="#NEW_PID_EXT"><c>NEW_PID_EXT</c></seealso>.
 	  Port operations are not allowed across node boundaries.
-	  The <c>Creation</c> works just like in
-	  <seealso marker="#REFERENCE_EXT"><c>REFERENCE_EXT</c></seealso>.
+	</p>
+	<p>Introduced in OTP 19, but only to be decoded and echoed back. Not
+	  encoded for local ports. Planned to supersede <seealso marker="#PORT_EXT">
+	  <c>PORT_EXT</c></seealso> in OTP 23 when
+	  <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_BIG_CREATON</c></seealso>
+	  becomes mandatory.
 	</p>
     </section>
 
@@ -471,12 +466,65 @@
 	  </row>
 	<tcaption>PID_EXT</tcaption></table>
 	<p>
-	  Encodes a process identifier object (obtained from
-          <seealso marker="erlang:spawn/3"><c>erlang:spawn/3</c></seealso> or
-	  friends). The <c>ID</c> and <c>Creation</c> fields works just like in
-	  <seealso marker="#REFERENCE_EXT"><c>REFERENCE_EXT</c></seealso>, while
-	  the <c>Serial</c> field is used to improve safety.	  
-	  In <c>ID</c>, only 15 bits are significant; the rest are to be 0.
+	  Same as <seealso marker="#NEW_PID_EXT"><c>NEW_PID_EXT</c></seealso>
+	  except the <c>Creation</c> field is only one byte and only two
+	  bits are significant, the rest are to be 0.
+	</p>
+    </section>
+
+    <section>
+      <marker id="NEW_PID_EXT"/>
+      <title>NEW_PID_EXT</title>
+	<table align="left">
+	  <row>
+	    <cell align="center">1</cell>
+	    <cell align="center">N</cell>
+	    <cell align="center">4</cell>
+	    <cell align="center">4</cell>
+	    <cell align="center">4</cell>
+	  </row>
+	  <row>
+	    <cell align="center"><c>88</c></cell>
+	    <cell align="center"><c>Node</c></cell>
+	    <cell align="center"><c>ID</c></cell>
+	    <cell align="center"><c>Serial</c></cell>
+	    <cell align="center"><c>Creation</c></cell>
+	  </row>
+	<tcaption>NEW_PID_EXT</tcaption></table>
+	<p>
+	  Encodes an Erlang process identifier object.
+	</p>
+	<taglist>
+	  <tag><c>Node</c></tag>
+	  <item><p>The name of the originating node, encoded using
+	    <seealso marker="#ATOM_UTF8_EXT"><c>ATOM_UTF8_EXT</c></seealso>,
+	    <seealso marker="#SMALL_ATOM_UTF8_EXT"><c>SMALL_ATOM_UTF8_EXT</c></seealso>
+	    or <seealso
+	    marker="#ATOM_CACHE_REF"><c>ATOM_CACHE_REF</c></seealso>.</p>
+	  </item>
+	  <tag><c>ID</c></tag>
+	  <item><p>A 32-bit big endian unsigned integer. Only 15 bits are
+	    significant; the rest are to be 0.</p>
+	  </item>
+	  <tag><c>Serial</c></tag>
+	  <item><p>A 32-bit big endian unsigned integer. Only 13 bits are
+	    significant; the rest are to be 0.</p>
+	  </item>
+	  <tag><c>Creation</c></tag>
+	  <item><p>A 32-bit big endian unsigned integer. All identifiers
+	    originating from the same node incarnation must have identical <c>Creation</c>
+	    values. This makes it possible to separate identifiers from old
+	    (crashed) nodes from a new one. The value zero should be avoided for
+	    normal operations as it is used as a wild card for debug purpose
+	    (like a pid returned by <seealso marker="erts:erlang#list_to_pid/1">
+	    erlang:list_to_pid/1</seealso>).</p>
+	  </item>
+	</taglist>
+	<p>Introduced in OTP 19, but only to be decoded and echoed back. Not
+	  encoded for local processes. Planned to supersede <seealso marker="#PID_EXT">
+	  <c>PID_EXT</c></seealso> in OTP 23 when
+	  <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_BIG_CREATON</c></seealso>
+	  becomes mandatory.
 	</p>
     </section>
 
@@ -700,6 +748,30 @@
     </section>
 
     <section>
+      <marker id="REFERENCE_EXT"/>
+      <title>REFERENCE_EXT (deprecated)</title>
+      <table align="left">
+        <row>
+          <cell align="center">1</cell>
+          <cell align="center">N</cell>
+          <cell align="center">4</cell>
+          <cell align="center">1</cell>
+        </row>
+        <row>
+          <cell align="center"><c>101</c></cell>
+          <cell align="center"><c>Node</c></cell>
+          <cell align="center"><c>ID</c></cell>
+          <cell align="center"><c>Creation</c></cell>
+        </row>
+      <tcaption>REFERENCE_EXT</tcaption></table>
+      <p>
+	The same as <seealso marker="#NEW_REFERENCE_EXT">
+	<c>NEW_REFERENCE_EXT</c></seealso> except <c>ID</c> is only one word
+	(<c>Len</c> = 1).
+      </p>
+    </section>
+
+    <section>
       <marker id="NEW_REFERENCE_EXT"/>
       <title>NEW_REFERENCE_EXT</title>
 	<table align="left">
@@ -719,29 +791,68 @@
 	  </row>
 	<tcaption>NEW_REFERENCE_EXT</tcaption></table>
 	<p>
-	  <c>Node</c> and <c>Creation</c> are as in
-	  <seealso marker="#REFERENCE_EXT"><c>REFERENCE_EXT</c></seealso>.
-	</p>
-	<p>
-	  <c>ID</c> contains a sequence of big-endian unsigned integers
-	  (4 bytes each, so <c>N'</c> is a multiple of 4),
-	  but is to be regarded as uninterpreted data.
-	</p>
-	<p>
-	  <c>N'</c> = 4 * <c>Len</c>.
-	</p>
-	<p>
-	  In the first word (4 bytes) of <c>ID</c>, only 18 bits are
-	  significant, the rest are to be 0.
-	  In <c>Creation</c>, only two bits are significant,
-	  the rest are to be 0.
+	  The same as <seealso marker="#NEWER_REFERENCE_EXT">
+	  <c>NEWER_REFERENCE_EXT</c></seealso> <em>except</em>:
 	</p>
+	<taglist>
+	  <tag><c>ID</c></tag>
+	  <item><p>In the first word (4 bytes) of <c>ID</c>, only 18 bits are
+	    significant, the rest must be 0.</p>
+	  </item>
+	  <tag><c>Creation</c></tag>
+	  <item><p>Only one byte long and only two bits are significant, the rest must be 0.</p>
+	  </item>
+	</taglist>
+    </section>
+
+    <section>
+      <marker id="NEWER_REFERENCE_EXT"/>
+      <title>NEWER_REFERENCE_EXT</title>
+	<table align="left">
+	  <row>
+	    <cell align="center">1</cell>
+	    <cell align="center">2</cell>
+	    <cell align="center">N</cell>
+	    <cell align="center">4</cell>
+	    <cell align="center">N'</cell>
+	  </row>
+	  <row>
+	    <cell align="center"><c>90</c></cell>
+	    <cell align="center"><c>Len</c></cell>
+	    <cell align="center"><c>Node</c></cell>
+	    <cell align="center"><c>Creation</c></cell>
+	    <cell align="center"><c>ID ...</c></cell>
+	  </row>
+	<tcaption>NEWER_REFERENCE_EXT</tcaption></table>
 	<p>
-	  <c>NEW_REFERENCE_EXT</c> was introduced with distribution version 4.
-	  In version 4, <c>N'</c> is to be at most 12.
+	  Encodes a reference term generated with
+	  <seealso marker="erts:erlang#make_ref/0">erlang:make_ref/0</seealso>.
 	</p>
-	<p>
-	  See <seealso marker="#REFERENCE_EXT"><c>REFERENCE_EXT</c></seealso>.
+	<taglist>
+	  <tag><c>Node</c></tag>
+	  <item><p>The name of the originating node, encoded using
+	    <seealso marker="#ATOM_UTF8_EXT"><c>ATOM_UTF8_EXT</c></seealso>,
+	    <seealso marker="#SMALL_ATOM_UTF8_EXT"><c>SMALL_ATOM_UTF8_EXT</c></seealso>
+	    or <seealso marker="#ATOM_CACHE_REF"><c>ATOM_CACHE_REF</c></seealso>.</p>
+	  </item>
+	  <tag><c>Len</c></tag>
+	  <item><p>A 16-bit big endian unsigned integer not larger than 3.</p>
+	  </item>
+	  <tag><c>ID</c></tag>
+	  <item><p>A sequence of <c>Len</c> big-endian unsigned integers
+	    (4 bytes each, so <c>N'</c>&nbsp;=&nbsp;4&nbsp;*&nbsp;<c>Len</c>),
+	    but is to be regarded as uninterpreted data.</p>
+	  </item>
+	  <tag><c>Creation</c></tag>
+	  <item><p>Works just like in
+	  <seealso marker="#NEW_PID_EXT"><c>NEW_PID_EXT</c></seealso>.</p>
+	  </item>
+	</taglist>
+	<p>Introduced in OTP 19, but only to be decoded and echoed back. Not
+	  encoded for local references. Planned to supersede <seealso marker="#NEW_REFERENCE_EXT">
+	  <c>NEW_REFERENCE_EXT</c></seealso> in OTP 23 when
+	  <seealso marker="erl_dist_protocol#dflags"><c>DFLAG_BIG_CREATON</c></seealso>
+	  becomes mandatory.
 	</p>
     </section>
 
diff --git a/erts/doc/src/notes.xml b/erts/doc/src/notes.xml
index 5b414853a3..56681c0250 100644
--- a/erts/doc/src/notes.xml
+++ b/erts/doc/src/notes.xml
@@ -31,6 +31,194 @@
   </header>
   <p>This document describes the changes made to the ERTS application.</p>
 
+<section><title>Erts 9.3.3.9</title>
+
+    <section><title>Improvements and New Features</title>
+      <list>
+        <item>
+	    <p>Added an optional <c>./configure</c> flag to compile
+	    the emulator with spectre mitigation:
+	    <c>--with-spectre-mitigation</c></p>
+	    <p>Note that this requires a recent version of GCC with
+	    support for spectre mitigation and the
+	    <c>--mindirect-branch=thunk</c> flag, such as
+	    <c>8.1</c>.</p>
+          <p>
+	    Own Id: OTP-15430 Aux Id: ERIERL-237 </p>
+        </item>
+      </list>
+    </section>
+
+</section>
+
+<section><title>Erts 9.3.3.8</title>
+
+    <section><title>Fixed Bugs and Malfunctions</title>
+      <list>
+        <item>
+          <p>
+	    A bug that could cause dirty schedulers to become
+	    unresponsive has been fixed.</p>
+          <p>
+	    Own Id: OTP-15509 Aux Id: PR-2027, PR-2093 </p>
+        </item>
+      </list>
+    </section>
+
+</section>
+
+<section><title>Erts 9.3.3.7</title>
+
+    <section><title>Fixed Bugs and Malfunctions</title>
+      <list>
+        <item>
+          <p>
+	    Fixed bug in operator <c>band</c> of two negative
+	    operands causing erroneous result if the absolute value
+	    of one of the operands have the lowest <c>N*W</c> bits as
+	    zero and the other absolute value is not larger than
+	    <c>N*W</c> bits. <c>N</c> is an integer of 1 or larger
+	    and <c>W</c> is 32 or 64 depending on word size.</p>
+          <p>
+	    Own Id: OTP-15487 Aux Id: ERL-804 </p>
+        </item>
+      </list>
+    </section>
+
+</section>
+
+<section><title>Erts 9.3.3.6</title>
+
+    <section><title>Improvements and New Features</title>
+      <list>
+        <item>
+	    <p>List subtraction (The <c>--</c> operator) will now
+	    yield properly on large inputs.</p>
+          <p>
+	    Own Id: OTP-15371</p>
+        </item>
+      </list>
+    </section>
+
+</section>
+
+<section><title>Erts 9.3.3.5</title>
+
+    <section><title>Fixed Bugs and Malfunctions</title>
+      <list>
+        <item>
+          <p>
+	    ERTS internal trees of monitor structures could get into
+	    an inconsistent state. This could cause <c>'DOWN'</c>
+	    messages not to be delivered when they should, as well as
+	    delivery of <c>'DOWN'</c> messages that should not be
+	    delivered.</p>
+          <p>
+	    This bug was introduced in ERTS version 9.0 (OTP 20.0)
+	    and was fixed in ERTS version 10.0 (OTP 21.0) due to a
+	    rewrite of the monitor code. That is, this bug only exist
+	    in the OTP 20 release.</p>
+          <p>
+	    Own Id: OTP-15399 Aux Id: ERL-751, ERIERL-262, OTP-14205 </p>
+        </item>
+      </list>
+    </section>
+
+</section>
+
+<section><title>Erts 9.3.3.4</title>
+
+    <section><title>Fixed Bugs and Malfunctions</title>
+      <list>
+        <item>
+          <p>
+	    Fixed bug in <c>ets:select_replace</c> when called with a
+	    fully bound key could cause a following call to
+	    <c>ets:next</c> or <c>ets:prev</c> to crash the emulator
+	    or return invalid result.</p>
+          <p>
+	    Own Id: OTP-15346</p>
+        </item>
+      </list>
+    </section>
+
+</section>
+
+<section><title>Erts 9.3.3.3</title>
+
+    <section><title>Fixed Bugs and Malfunctions</title>
+      <list>
+        <item>
+          <p>
+	    Fixed a bug which caused an emulator crash when
+	    <c>enif_send()</c> was called by a NIF that executed on a
+	    dirty scheduler. The bug was either triggered when the
+	    NIF called <c>enif_send()</c> without a message
+	    environment, or when the process executing the NIF was
+	    <c>send</c> traced.</p>
+          <p>
+	    Own Id: OTP-15223</p>
+        </item>
+        <item>
+          <p>
+	    Fixed a bug causing some Erlang references to be
+	    inconsistently ordered. This could for example cause
+	    failure to look up certain elements with references as
+	    keys in search data structures. This bug was introduced
+	    in R13B02.</p>
+          <p>
+	    Thanks to Simon Cornish for finding the bug and supplying
+	    a fix.</p>
+          <p>
+	    *** POTENTIAL INCOMPATIBILITY ***</p>
+          <p>
+	    Own Id: OTP-15225</p>
+        </item>
+      </list>
+    </section>
+
+</section>
+
+<section><title>Erts 9.3.3.2</title>
+
+    <section><title>Fixed Bugs and Malfunctions</title>
+      <list>
+        <item>
+	    <p>Fixed a race condition in the inet driver that could
+	    cause receive to hang when the emulator was compiled with
+	    gcc 8.</p>
+          <p>
+	    Own Id: OTP-15158 Aux Id: ERL-654 </p>
+        </item>
+        <item>
+          <p>
+	    Fix bug in generation of erl_crash.dump, which could
+	    cause VM to crash.</p>
+          <p>
+	    Bug exist since erts-9.2 (OTP-20.2).</p>
+          <p>
+	    Own Id: OTP-15181</p>
+        </item>
+      </list>
+    </section>
+
+</section>
+
+<section><title>Erts 9.3.3.1</title>
+
+    <section><title>Fixed Bugs and Malfunctions</title>
+      <list>
+        <item>
+	    <p>Fixed a rare bug that could cause processes to be
+	    scheduled after they had been freed.</p>
+          <p>
+	    Own Id: OTP-15067 Aux Id: ERL-573 </p>
+        </item>
+      </list>
+    </section>
+
+</section>
+
 <section><title>Erts 9.3.3</title>
 
     <section><title>Fixed Bugs and Malfunctions</title>
diff --git a/erts/emulator/beam/beam_emu.c b/erts/emulator/beam/beam_emu.c
index b4e6c35579..7db0938b39 100644
--- a/erts/emulator/beam/beam_emu.c
+++ b/erts/emulator/beam/beam_emu.c
@@ -1237,6 +1237,7 @@ init_emulator(void)
  * the instructions' C labels to the loader.
  * The second call starts execution of BEAM code. This call never returns.
  */
+ERTS_NO_RETPOLINE
 void process_main(Eterm * x_reg_array, FloatDef* f_reg_array)
 {
     static int init_done = 0;
diff --git a/erts/emulator/beam/beam_ranges.c b/erts/emulator/beam/beam_ranges.c
index c7b17d58f3..9570fb34db 100644
--- a/erts/emulator/beam/beam_ranges.c
+++ b/erts/emulator/beam/beam_ranges.c
@@ -35,10 +35,8 @@ typedef struct {
 
 /*
  * Used for crash dumping of literals. The size of erts_dump_lit_areas is
- * always twice the number of active ranges (to allow for literals in both
- * current and old code).
+ * always at least the number of active ranges.
  */
-
 ErtsLiteralArea** erts_dump_lit_areas;
 Uint erts_dump_num_lit_areas;
 
@@ -180,8 +178,8 @@ erts_end_staging_ranges(int commit)
 				(erts_aint_t) (r[dst].modules +
 					       r[dst].n / 2));
 
-        if (r[dst].allocated * 2 > erts_dump_num_lit_areas) {
-            erts_dump_num_lit_areas *= 2;
+        if (r[dst].allocated > erts_dump_num_lit_areas) {
+            erts_dump_num_lit_areas = r[dst].allocated * 2;
             erts_dump_lit_areas = (ErtsLiteralArea **)
                 erts_realloc(ERTS_ALC_T_CRASH_DUMP,
                              (void *) erts_dump_lit_areas,
diff --git a/erts/emulator/beam/big.c b/erts/emulator/beam/big.c
index c5cb268f09..60f0ecf42b 100644
--- a/erts/emulator/beam/big.c
+++ b/erts/emulator/beam/big.c
@@ -1159,8 +1159,11 @@ static dsize_t I_band(ErtsDigit* x, dsize_t xl, short xsgn,
 		*r++ = ~c1 & ~c2;
 		x++; y++;
 	    }
-	    while(xl--)
-		*r++ = ~*x++;
+	    while(xl--) {
+                DSUBb(*x,0,b1,c1);
+                *r++ = ~c1;
+		x++;
+            }
 	}
     }
     return I_btrail(r0, r, sign);
diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types
index 252bf1cc7e..af1133b853 100644
--- a/erts/emulator/beam/erl_alloc.types
+++ b/erts/emulator/beam/erl_alloc.types
@@ -322,6 +322,7 @@ type	THR_PRGR_DATA	LONG_LIVED	SYSTEM		thr_prgr_data
 type	T_THR_PRGR_DATA	SHORT_LIVED	SYSTEM		temp_thr_prgr_data
 type	RELEASE_LAREA	SHORT_LIVED	SYSTEM		release_literal_area
 +endif
+type    LIST_TRAP       SHORT_LIVED     PROCESSES       list_bif_trap_state
 
 #
 # Types used for special emulators
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index 4050fb6146..43d99db0b2 100644
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -4459,7 +4459,16 @@ BIF_RETTYPE erts_debug_set_internal_state_2(BIF_ALIST_2)
                                             refbin));
             }
         }
-
+        else if (ERTS_IS_ATOM_STR("mbuf", BIF_ARG_1)) {
+            Uint sz = size_object(BIF_ARG_2);
+            ErlHeapFragment* frag = new_message_buffer(sz);
+            Eterm *hp = frag->mem;
+            Eterm copy = copy_struct(BIF_ARG_2, sz, &hp, &frag->off_heap);
+            frag->next = BIF_P->mbuf;
+            BIF_P->mbuf = frag;
+            BIF_P->mbuf_sz += sz;
+            BIF_RET(copy);
+        }
     }
 
     BIF_ERROR(BIF_P, BADARG);
diff --git a/erts/emulator/beam/erl_bif_lists.c b/erts/emulator/beam/erl_bif_lists.c
index 73d327da3e..94a41c285a 100644
--- a/erts/emulator/beam/erl_bif_lists.c
+++ b/erts/emulator/beam/erl_bif_lists.c
@@ -29,12 +29,13 @@
 #include "sys.h"
 #include "erl_vm.h"
 #include "global.h"
-#include "erl_process.h"
-#include "error.h"
 #include "bif.h"
+#include "erl_binary.h"
+
 
 static Eterm keyfind(int Bif, Process* p, Eterm Key, Eterm Pos, Eterm List);
 
+
 static BIF_RETTYPE append(Process* p, Eterm A, Eterm B)
 {
     Eterm list;
@@ -146,103 +147,725 @@ BIF_RETTYPE append_2(BIF_ALIST_2)
     return append(BIF_P, BIF_ARG_1, BIF_ARG_2);
 }
 
-/*
- * erlang:'--'/2
- */
+/* erlang:'--'/2
+ *
+ * Subtracts a list from another (LHS -- RHS), removing the first occurrence of
+ * each element in LHS from RHS. There is no type coercion so the elements must
+ * match exactly.
+ *
+ * The BIF is broken into several stages that can all trap individually, and it
+ * chooses its algorithm based on input size. If either input is small it will
+ * use a linear scan tuned to which side it's on, and if both inputs are large
+ * enough it will convert RHS into a multiset to provide good asymptotic
+ * behavior. */
 
-#define SMALL_VEC_SIZE 10
-static Eterm subtract(Process* p, Eterm A, Eterm B)
-{
-    Eterm  list;
-    Eterm* hp;
-    Uint  need;
-    Eterm  res;
-    Eterm small_vec[SMALL_VEC_SIZE];	/* Preallocated memory for small lists */
-    Eterm* vec_p;
-    Eterm* vp;
-    Sint i;
-    Sint n;
-    Sint m;
-    
-    if ((n = erts_list_length(A)) < 0) {
-	BIF_ERROR(p, BADARG);
+#define SUBTRACT_LHS_THRESHOLD 16
+#define SUBTRACT_RHS_THRESHOLD 16
+
+typedef enum {
+    SUBTRACT_STAGE_START,
+    SUBTRACT_STAGE_LEN_LHS,
+
+    /* Naive linear scan that's efficient when
+     * LEN_LHS <= SUBTRACT_LHS_THRESHOLD. */
+    SUBTRACT_STAGE_NAIVE_LHS,
+
+    SUBTRACT_STAGE_LEN_RHS,
+
+    /* As SUBTRACT_STAGE_NAIVE_LHS but for RHS. */
+    SUBTRACT_STAGE_NAIVE_RHS,
+
+    /* Creates a multiset from RHS for faster lookups before sweeping through
+     * LHS. The set is implemented as a red-black tree and duplicate elements
+     * are handled by a counter on each node. */
+    SUBTRACT_STAGE_SET_BUILD,
+    SUBTRACT_STAGE_SET_FINISH
+} ErtsSubtractCtxStage;
+
+typedef struct subtract_node__ {
+    struct subtract_node__ *parent;
+    struct subtract_node__ *left;
+    struct subtract_node__ *right;
+    int is_red;
+
+    Eterm key;
+    Uint count;
+} subtract_tree_t;
+
+typedef struct {
+    ErtsSubtractCtxStage stage;
+
+    Eterm lhs_original;
+    Eterm rhs_original;
+
+    Uint lhs_remaining;
+    Uint rhs_remaining;
+
+    Eterm iterator;
+
+    Eterm *result_cdr;
+    Eterm result;
+
+    union {
+        Eterm lhs_elements[SUBTRACT_LHS_THRESHOLD];
+        Eterm rhs_elements[SUBTRACT_RHS_THRESHOLD];
+
+        struct {
+            subtract_tree_t *tree;
+
+            /* A memory area for the tree's nodes, saving us the need to have
+             * one allocation per node. */
+            subtract_tree_t *alloc_start;
+            subtract_tree_t *alloc;
+        } rhs_set;
+    } u;
+} ErtsSubtractContext;
+
+#define ERTS_RBT_PREFIX subtract
+#define ERTS_RBT_T subtract_tree_t
+#define ERTS_RBT_KEY_T Eterm
+#define ERTS_RBT_FLAGS_T int
+#define ERTS_RBT_INIT_EMPTY_TNODE(T) \
+    do { \
+        (T)->parent = NULL; \
+        (T)->left = NULL; \
+        (T)->right = NULL; \
+    } while(0)
+#define ERTS_RBT_IS_RED(T) ((T)->is_red)
+#define ERTS_RBT_SET_RED(T) ((T)->is_red = 1)
+#define ERTS_RBT_IS_BLACK(T) (!ERTS_RBT_IS_RED(T))
+#define ERTS_RBT_SET_BLACK(T) ((T)->is_red = 0)
+#define ERTS_RBT_GET_FLAGS(T) ((T)->is_red)
+#define ERTS_RBT_SET_FLAGS(T, F) ((T)->is_red = F)
+#define ERTS_RBT_GET_PARENT(T) ((T)->parent)
+#define ERTS_RBT_SET_PARENT(T, P) ((T)->parent = P)
+#define ERTS_RBT_GET_RIGHT(T) ((T)->right)
+#define ERTS_RBT_SET_RIGHT(T, R) ((T)->right = (R))
+#define ERTS_RBT_GET_LEFT(T) ((T)->left)
+#define ERTS_RBT_SET_LEFT(T, L) ((T)->left = (L))
+#define ERTS_RBT_GET_KEY(T) ((T)->key)
+#define ERTS_RBT_IS_LT(KX, KY) (CMP_TERM(KX, KY) < 0)
+#define ERTS_RBT_IS_EQ(KX, KY) EQ(KX, KY)
+#define ERTS_RBT_WANT_LOOKUP_INSERT
+#define ERTS_RBT_WANT_LOOKUP
+#define ERTS_RBT_WANT_DELETE
+#define ERTS_RBT_UNDEF
+
+#include "erl_rbtree.h"
+
+static int subtract_continue(Process *p, ErtsSubtractContext *context);
+
+static void subtract_ctx_dtor(ErtsSubtractContext *context) {
+    switch (context->stage) {
+        case SUBTRACT_STAGE_SET_BUILD:
+        case SUBTRACT_STAGE_SET_FINISH:
+            erts_free(ERTS_ALC_T_LIST_TRAP, context->u.rhs_set.alloc_start);
+            break;
+        default:
+            break;
     }
-    if ((m = erts_list_length(B)) < 0) {
-	BIF_ERROR(p, BADARG);
+}
+
+static int subtract_ctx_bin_dtor(Binary *context_bin) {
+    ErtsSubtractContext *context = ERTS_MAGIC_BIN_DATA(context_bin);
+    subtract_ctx_dtor(context);
+    return 1;
+}
+
+static void subtract_ctx_move(ErtsSubtractContext *from,
+                              ErtsSubtractContext *to) {
+    int uses_result_cdr = 0;
+
+    to->stage = from->stage;
+
+    to->lhs_original = from->lhs_original;
+    to->rhs_original = from->rhs_original;
+
+    to->lhs_remaining = from->lhs_remaining;
+    to->rhs_remaining = from->rhs_remaining;
+
+    to->iterator = from->iterator;
+    to->result = from->result;
+
+    switch (to->stage) {
+        case SUBTRACT_STAGE_NAIVE_LHS:
+            sys_memcpy(to->u.lhs_elements,
+                       from->u.lhs_elements,
+                       sizeof(Eterm) * to->lhs_remaining);
+            break;
+        case SUBTRACT_STAGE_NAIVE_RHS:
+            sys_memcpy(to->u.rhs_elements,
+                       from->u.rhs_elements,
+                       sizeof(Eterm) * to->rhs_remaining);
+
+            uses_result_cdr = 1;
+            break;
+        case SUBTRACT_STAGE_SET_FINISH:
+            uses_result_cdr = 1;
+            /* FALL THROUGH */
+        case SUBTRACT_STAGE_SET_BUILD:
+            to->u.rhs_set.alloc_start = from->u.rhs_set.alloc_start;
+            to->u.rhs_set.alloc = from->u.rhs_set.alloc;
+            to->u.rhs_set.tree = from->u.rhs_set.tree;
+            break;
+        default:
+            break;
     }
-    
-    if (n == 0)
-	BIF_RET(NIL);
-    if (m == 0)
-	BIF_RET(A);
-    
-    /* allocate element vector */
-    if (n <= SMALL_VEC_SIZE)
-	vec_p = small_vec;
-    else
-	vec_p = (Eterm*) erts_alloc(ERTS_ALC_T_TMP, n * sizeof(Eterm));
-    
-    /* PUT ALL ELEMENTS IN VP */
-    vp = vec_p;
-    list = A;
-    i = n;
-    while(i--) {
-	Eterm* listp = list_val(list);
-	*vp++ = CAR(listp);
-	list = CDR(listp);
+
+    if (uses_result_cdr) {
+        if (from->result_cdr == &from->result) {
+            to->result_cdr = &to->result;
+        } else {
+            to->result_cdr = from->result_cdr;
+        }
     }
-    
-    /* UNMARK ALL DELETED CELLS */
-    list = B;
-    m = 0;  /* number of deleted elements */
-    while(is_list(list)) {
-	Eterm* listp = list_val(list);
-	Eterm  elem = CAR(listp);
-	i = n;
-	vp = vec_p;
-	while(i--) {
-	    if (is_value(*vp) && eq(*vp, elem)) {
-		*vp = THE_NON_VALUE;
-		m++;
-		break;
-	    }
-	    vp++;
-	}
-	list = CDR(listp);
+}
+
+static Eterm subtract_create_trap_state(Process *p,
+                                        ErtsSubtractContext *context) {
+    Binary *state_bin;
+    Eterm *hp;
+
+    state_bin = erts_create_magic_binary(sizeof(ErtsSubtractContext),
+                                         subtract_ctx_bin_dtor);
+
+    subtract_ctx_move(context, ERTS_MAGIC_BIN_DATA(state_bin));
+
+    hp = HAlloc(p, ERTS_MAGIC_REF_THING_SIZE);
+
+    return erts_mk_magic_ref(&hp, &MSO(p), state_bin);
+}
+
+static int subtract_enter_len_lhs(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_LEN_LHS;
+
+    context->iterator = context->lhs_original;
+    context->lhs_remaining = 0;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_enter_len_rhs(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_LEN_RHS;
+
+    context->iterator = context->rhs_original;
+    context->rhs_remaining = 0;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_get_length(Process *p, Eterm *iterator_p, Uint *count_p) {
+    static const Sint ELEMENTS_PER_RED = 32;
+
+    Sint budget, count;
+    Eterm iterator;
+
+    budget = ELEMENTS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    iterator = *iterator_p;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    for (count = 0; count < budget && is_list(iterator); count++) {
+        iterator = CDR(list_val(iterator));
     }
-    
-    if (m == n)      /* All deleted ? */
-	res = NIL;
-    else if (m == 0)  /* None deleted ? */
-	res = A;
-    else {			/* REBUILD LIST */
-	res = NIL;
-	need = 2*(n - m);
-	hp = HAlloc(p, need);
-	vp = vec_p + n - 1;
-	while(vp >= vec_p) {
-	    if (is_value(*vp)) {
-		res = CONS(hp, *vp, res);
-		hp += 2;
-	    }
-	    vp--;
-	}
+
+    if (!is_list(iterator) && !is_nil(iterator)) {
+        return -1;
+    }
+
+    BUMP_REDS(p, count / ELEMENTS_PER_RED);
+
+    *iterator_p = iterator;
+    *count_p += count;
+
+    if (is_nil(iterator)) {
+        return 1;
     }
-    if (vec_p != small_vec)
-	erts_free(ERTS_ALC_T_TMP, (void *) vec_p);
-    BIF_RET(res);
+
+    return 0;
 }
 
-BIF_RETTYPE ebif_minusminus_2(BIF_ALIST_2)
-{
-    return subtract(BIF_P, BIF_ARG_1, BIF_ARG_2);
+static int subtract_enter_naive_lhs(Process *p, ErtsSubtractContext *context) {
+    Eterm iterator;
+    int i = 0;
+
+    context->stage = SUBTRACT_STAGE_NAIVE_LHS;
+
+    context->iterator = context->rhs_original;
+    context->result = NIL;
+
+    iterator = context->lhs_original;
+
+    while (is_list(iterator)) {
+        const Eterm *cell = list_val(iterator);
+
+        ASSERT(i < SUBTRACT_LHS_THRESHOLD);
+
+        context->u.lhs_elements[i++] = CAR(cell);
+        iterator = CDR(cell);
+    }
+
+    ASSERT(i == context->lhs_remaining);
+
+    return subtract_continue(p, context);
 }
 
-BIF_RETTYPE subtract_2(BIF_ALIST_2)
-{
-    return subtract(BIF_P, BIF_ARG_1, BIF_ARG_2);
+static int subtract_naive_lhs(Process *p, ErtsSubtractContext *context) {
+    const Sint CHECKS_PER_RED = 16;
+    Sint checks, budget;
+
+    budget = CHECKS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    checks = 0;
+
+    while (checks < budget && is_list(context->iterator)) {
+        const Eterm *cell;
+        Eterm value, next;
+        int found_at;
+
+        cell = list_val(context->iterator);
+
+        value = CAR(cell);
+        next = CDR(cell);
+
+        for (found_at = 0; found_at < context->lhs_remaining; found_at++) {
+            if (EQ(value, context->u.lhs_elements[found_at])) {
+                /* We shift the array one step down as we have to preserve
+                 * order.
+                 *
+                 * Note that we can't exit early as that would suppress errors
+                 * in the right-hand side (this runs prior to determining the
+                 * length of RHS). */
+
+                context->lhs_remaining--;
+                sys_memmove(&context->u.lhs_elements[found_at],
+                            &context->u.lhs_elements[found_at + 1],
+                            (context->lhs_remaining - found_at) * sizeof(Eterm));
+                break;
+            }
+        }
+
+        checks += MAX(1, context->lhs_remaining);
+        context->iterator = next;
+    }
+
+    BUMP_REDS(p, MIN(checks, budget) / CHECKS_PER_RED);
+
+    if (is_list(context->iterator)) {
+        return 0;
+    } else if (!is_nil(context->iterator)) {
+        return -1;
+    }
+
+    if (context->lhs_remaining > 0) {
+        Eterm *hp;
+        int i;
+
+        hp = HAlloc(p, context->lhs_remaining * 2);
+
+        for (i = context->lhs_remaining - 1; i >= 0; i--) {
+            Eterm value = context->u.lhs_elements[i];
+
+            context->result = CONS(hp, value, context->result);
+            hp += 2;
+        }
+    }
+
+    ASSERT(context->lhs_remaining > 0 || context->result == NIL);
+
+    return 1;
+}
+
+static int subtract_enter_naive_rhs(Process *p, ErtsSubtractContext *context) {
+    Eterm iterator;
+    int i = 0;
+
+    context->stage = SUBTRACT_STAGE_NAIVE_RHS;
+
+    context->iterator = context->lhs_original;
+    context->result_cdr = &context->result;
+    context->result = NIL;
+
+    iterator = context->rhs_original;
+
+    while (is_list(iterator)) {
+        const Eterm *cell = list_val(iterator);
+
+        ASSERT(i < SUBTRACT_RHS_THRESHOLD);
+
+        context->u.rhs_elements[i++] = CAR(cell);
+        iterator = CDR(cell);
+    }
+
+    ASSERT(i == context->rhs_remaining);
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_naive_rhs(Process *p, ErtsSubtractContext *context) {
+    const Sint CHECKS_PER_RED = 16;
+    Sint checks, budget;
+
+    budget = CHECKS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    checks = 0;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    while (checks < budget && is_list(context->iterator)) {
+        const Eterm *cell;
+        Eterm value, next;
+        int found_at;
+
+        cell = list_val(context->iterator);
+        value = CAR(cell);
+        next = CDR(cell);
+
+        for (found_at = context->rhs_remaining - 1; found_at >= 0; found_at--) {
+            if (EQ(value, context->u.rhs_elements[found_at])) {
+                break;
+            }
+        }
+
+        if (found_at < 0) {
+            /* Destructively add the value to the result. This is safe
+             * since the GC is disabled and the unfinished term is never
+             * leaked to the outside world. */
+            Eterm *hp = HAllocX(p, 2, context->lhs_remaining * 2);
+
+            *context->result_cdr = make_list(hp);
+            context->result_cdr = &CDR(hp);
+
+            CAR(hp) = value;
+        } else if (found_at >= 0) {
+            Eterm swap;
+
+            if (context->rhs_remaining-- == 1) {
+                /* We've run out of items to remove, so the rest of the
+                 * result will be equal to the remainder of the input. We know
+                 * that LHS is well-formed as any errors would've been reported
+                 * during length determination. */
+                *context->result_cdr = next;
+
+                BUMP_REDS(p, MIN(budget, checks) / CHECKS_PER_RED);
+
+                return 1;
+            }
+
+            swap = context->u.rhs_elements[context->rhs_remaining];
+            context->u.rhs_elements[found_at] = swap;
+        }
+
+        checks += context->rhs_remaining;
+        context->iterator = next;
+        context->lhs_remaining--;
+    }
+
+    /* The result only has to be terminated when returning it to the user, but
+     * we're doing it when trapping as well to prevent headaches when
+     * debugging. */
+    *context->result_cdr = NIL;
+
+    BUMP_REDS(p, MIN(budget, checks) / CHECKS_PER_RED);
+
+    if (is_list(context->iterator)) {
+        ASSERT(context->lhs_remaining > 0 && context->rhs_remaining > 0);
+        return 0;
+    }
+
+    return 1;
+}
+
+static int subtract_enter_set_build(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_SET_BUILD;
+
+    context->u.rhs_set.alloc_start =
+        erts_alloc(ERTS_ALC_T_LIST_TRAP,
+                   context->rhs_remaining * sizeof(subtract_tree_t));
+
+    context->u.rhs_set.alloc = context->u.rhs_set.alloc_start;
+    context->u.rhs_set.tree = NULL;
+
+    context->iterator = context->rhs_original;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_set_build(Process *p, ErtsSubtractContext *context) {
+    const static Sint INSERTIONS_PER_RED = 16;
+    Sint budget, insertions;
+
+    budget = INSERTIONS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    insertions = 0;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    while (insertions < budget && is_list(context->iterator)) {
+        subtract_tree_t *existing_node, *new_node;
+        const Eterm *cell;
+        Eterm value, next;
+
+        cell = list_val(context->iterator);
+        value = CAR(cell);
+        next = CDR(cell);
+
+        new_node = context->u.rhs_set.alloc;
+        new_node->key = value;
+        new_node->count = 1;
+
+        existing_node = subtract_rbt_lookup_insert(&context->u.rhs_set.tree,
+                                                   new_node);
+
+        if (existing_node != NULL) {
+            existing_node->count++;
+        } else {
+            context->u.rhs_set.alloc++;
+        }
+
+        context->iterator = next;
+        insertions++;
+    }
+
+    BUMP_REDS(p, insertions / INSERTIONS_PER_RED);
+
+    ASSERT(is_list(context->iterator) || is_nil(context->iterator));
+    ASSERT(context->u.rhs_set.tree != NULL);
+
+    return is_nil(context->iterator);
+}
+
+static int subtract_enter_set_finish(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_SET_FINISH;
+
+    context->result_cdr = &context->result;
+    context->result = NIL;
+
+    context->iterator = context->lhs_original;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_set_finish(Process *p, ErtsSubtractContext *context) {
+    const Sint CHECKS_PER_RED = 8;
+    Sint checks, budget;
+
+    budget = CHECKS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    checks = 0;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    while (checks < budget && is_list(context->iterator)) {
+        subtract_tree_t *node;
+        const Eterm *cell;
+        Eterm value, next;
+
+        cell = list_val(context->iterator);
+        value = CAR(cell);
+        next = CDR(cell);
+
+        ASSERT(context->rhs_remaining > 0);
+
+        node = subtract_rbt_lookup(context->u.rhs_set.tree, value);
+
+        if (node == NULL) {
+            Eterm *hp = HAllocX(p, 2, context->lhs_remaining * 2);
+
+            *context->result_cdr = make_list(hp);
+            context->result_cdr = &CDR(hp);
+
+            CAR(hp) = value;
+        } else {
+            if (context->rhs_remaining-- == 1) {
+                *context->result_cdr = next;
+
+                BUMP_REDS(p, checks / CHECKS_PER_RED);
+
+                return 1;
+            }
+
+            if (node->count-- == 1) {
+                subtract_rbt_delete(&context->u.rhs_set.tree, node);
+            }
+        }
+
+        context->iterator = next;
+        context->lhs_remaining--;
+        checks++;
+    }
+
+    *context->result_cdr = NIL;
+
+    BUMP_REDS(p, checks / CHECKS_PER_RED);
+
+    if (is_list(context->iterator)) {
+        ASSERT(context->lhs_remaining > 0 && context->rhs_remaining > 0);
+        return 0;
+    }
+
+    return 1;
+}
+
+static int subtract_continue(Process *p, ErtsSubtractContext *context) {
+    switch (context->stage) {
+        case SUBTRACT_STAGE_START: {
+            return subtract_enter_len_lhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_LEN_LHS: {
+            int res = subtract_get_length(p,
+                                          &context->iterator,
+                                          &context->lhs_remaining);
+
+            if (res != 1) {
+                return res;
+            }
+
+            if (context->lhs_remaining <= SUBTRACT_LHS_THRESHOLD) {
+                return subtract_enter_naive_lhs(p, context);
+            }
+
+            return subtract_enter_len_rhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_NAIVE_LHS: {
+            return subtract_naive_lhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_LEN_RHS: {
+            int res = subtract_get_length(p,
+                                          &context->iterator,
+                                          &context->rhs_remaining);
+
+            if (res != 1) {
+                return res;
+            }
+
+            /* We've walked through both lists fully now so we no longer need
+             * to check for errors past this point. */
+
+            if (context->rhs_remaining <= SUBTRACT_RHS_THRESHOLD) {
+                return subtract_enter_naive_rhs(p, context);
+            }
+
+            return subtract_enter_set_build(p, context);
+        }
+
+        case SUBTRACT_STAGE_NAIVE_RHS: {
+            return subtract_naive_rhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_SET_BUILD: {
+            int res = subtract_set_build(p, context);
+
+            if (res != 1) {
+                return res;
+            }
+
+            return subtract_enter_set_finish(p, context);
+        }
+
+        case SUBTRACT_STAGE_SET_FINISH: {
+            return subtract_set_finish(p, context);
+        }
+
+        default:
+            ERTS_ASSERT(!"unreachable");
+    }
+}
+
+static int subtract_start(Process *p, Eterm lhs, Eterm rhs,
+                          ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_START;
+
+    context->lhs_original = lhs;
+    context->rhs_original = rhs;
+
+    return subtract_continue(p, context);
 }
 
+/* erlang:'--'/2 */
+static Eterm subtract(Export *bif_entry, BIF_ALIST_2) {
+    Eterm lhs = BIF_ARG_1, rhs = BIF_ARG_2;
+
+    if ((is_list(lhs) || is_nil(lhs)) && (is_list(rhs) || is_nil(rhs))) {
+        /* We start with the context on the stack in the hopes that we won't
+         * have to trap. */
+        ErtsSubtractContext context;
+        int res;
+
+        res = subtract_start(BIF_P, lhs, rhs, &context);
+
+        if (res == 0) {
+            Eterm state_mref;
+
+            state_mref = subtract_create_trap_state(BIF_P, &context);
+            erts_set_gc_state(BIF_P, 0);
+
+            BIF_TRAP2(bif_entry, BIF_P, state_mref, NIL);
+        }
+
+        subtract_ctx_dtor(&context);
+
+        if (res < 0) {
+            BIF_ERROR(BIF_P, BADARG);
+        }
+
+        BIF_RET(context.result);
+    } else if (is_internal_magic_ref(lhs)) {
+        ErtsSubtractContext *context;
+        int (*dtor)(Binary*);
+        Binary *magic_bin;
+
+        int res;
+
+        magic_bin = erts_magic_ref2bin(lhs);
+        dtor = ERTS_MAGIC_BIN_DESTRUCTOR(magic_bin);
+
+        if (dtor != subtract_ctx_bin_dtor) {
+            BIF_ERROR(BIF_P, BADARG);
+        }
+
+        ASSERT(BIF_P->flags & F_DISABLE_GC);
+        ASSERT(rhs == NIL);
+
+        context = ERTS_MAGIC_BIN_DATA(magic_bin);
+        res = subtract_continue(BIF_P, context);
+
+        if (res == 0) {
+            BIF_TRAP2(bif_entry, BIF_P, lhs, NIL);
+        }
+
+        erts_set_gc_state(BIF_P, 1);
+
+        if (res < 0) {
+            ERTS_BIF_ERROR_TRAPPED2(BIF_P, BADARG, bif_entry,
+                                    context->lhs_original,
+                                    context->rhs_original);
+        }
+
+        BIF_RET(context->result);
+    }
+
+    ASSERT(!(BIF_P->flags & F_DISABLE_GC));
+
+    BIF_ERROR(BIF_P, BADARG);
+}
+
+BIF_RETTYPE ebif_minusminus_2(BIF_ALIST_2) {
+    return subtract(bif_export[BIF_ebif_minusminus_2], BIF_CALL_ARGS);
+}
+
+BIF_RETTYPE subtract_2(BIF_ALIST_2) {
+    return subtract(bif_export[BIF_subtract_2], BIF_CALL_ARGS);
+}
+
+
 BIF_RETTYPE lists_member_2(BIF_ALIST_2)
 {
     Eterm term;
diff --git a/erts/emulator/beam/erl_bif_unique.h b/erts/emulator/beam/erl_bif_unique.h
index 9aa631fde9..e3a633080d 100644
--- a/erts/emulator/beam/erl_bif_unique.h
+++ b/erts/emulator/beam/erl_bif_unique.h
@@ -242,11 +242,11 @@ erts_internal_ref_number_cmp(Uint32 num1[ERTS_REF_NUMBERS],
 			     Uint32 num2[ERTS_REF_NUMBERS])
 {
     if (num1[2] != num2[2])
-	return (int) ((Sint64) num1[2] - (Sint64) num2[2]);
+	return num1[2] > num2[2] ? 1 : -1;
     if (num1[1] != num2[1])
-	return (int) ((Sint64) num1[1] - (Sint64) num2[1]);
+	return num1[1] > num2[1] ? 1 : -1;
     if (num1[0] != num2[0])
-	return (int) ((Sint64) num1[0] - (Sint64) num2[0]);
+	return num1[0] > num2[0] ? 1 : -1;
     return 0;
 }
 
diff --git a/erts/emulator/beam/erl_db_hash.c b/erts/emulator/beam/erl_db_hash.c
index cf928a9035..47d304f860 100644
--- a/erts/emulator/beam/erl_db_hash.c
+++ b/erts/emulator/beam/erl_db_hash.c
@@ -1328,11 +1328,7 @@ static int match_traverse(Process* p, DbTableHash* tb,
                 unlock_hash_function(lck);
                 break;
             }
-            if (iterations_left <= 0 || MBUF(p)) {
-                /*
-                 * We have either reached our limit, or just created some heap fragments.
-                 * Since many heap fragments will make the GC slower, trap and GC now.
-                 */
+            if (iterations_left <= 0) {
                 unlock_hash_function(lck);
                 ret_value = on_trap(context_ptr, slot_ix, got, &mpi.mp, ret);
                 goto done;
@@ -1451,11 +1447,7 @@ static int match_traverse_continue(Process* p, DbTableHash* tb,
                 unlock_hash_function(lck);
                 break;
             }
-            if (iterations_left <= 0 || MBUF(p)) {
-                /*
-                 * We have either reached our limit, or just created some heap fragments.
-                 * Since many heap fragments will make the GC slower, trap and GC now.
-                 */
+            if (iterations_left <= 0) {
                 unlock_hash_function(lck);
                 ret_value = on_trap(context_ptr, slot_ix, got, mpp, ret);
                 goto done;
diff --git a/erts/emulator/beam/erl_db_tree.c b/erts/emulator/beam/erl_db_tree.c
index 7c80e92e50..d2b0bf95bd 100644
--- a/erts/emulator/beam/erl_db_tree.c
+++ b/erts/emulator/beam/erl_db_tree.c
@@ -1884,22 +1884,14 @@ static int db_select_replace_tree(Process *p, DbTable *tbl, Eterm tid,
     sc.mp = mpi.mp;
     sc.all_objects = mpi.all_objects;
 
-    stack = get_static_stack(tb);
     if (!mpi.got_partial && mpi.some_limitation &&
             CMP_EQ(mpi.least,mpi.most)) {
-        TreeDbTerm* term = *(mpi.save_term);
         doit_select_replace(tb,mpi.save_term,&sc,0 /* dummy */);
-        if (stack != NULL) {
-            if (TOP_NODE(stack) == term)
-                // throw away potentially invalid reference
-                REPLACE_TOP_NODE(stack, *(mpi.save_term));
-            release_stack(tb, stack);
-        }
+        reset_static_stack(tb); /* may refer replaced term */
         RET_TO_BIF(erts_make_integer(sc.replaced,p),DB_ERROR_NONE);
     }
 
-    if (stack == NULL)
-        stack = get_any_stack(tb);
+    stack = get_any_stack(tb);
 
     if (mpi.some_limitation) {
         if ((this = find_next_from_pb_key(tb, stack, mpi.most)) != NULL) {
@@ -3344,13 +3336,6 @@ static int doit_select(DbTableTree *tb, TreeDbTerm *this, void *ptr,
     if (is_value(ret)) {
 	sc->accum = CONS(hp, ret, sc->accum);
     }
-    if (MBUF(sc->p)) {
-	/*
-	 * Force a trap and GC if a heap fragment was created. Many heap fragments
-	 * make the GC slow.
-	 */
-	sc->max = 0;
-    }
     if (--(sc->max) <= 0) {
 	return 0;
     }
@@ -3407,13 +3392,6 @@ static int doit_select_chunk(DbTableTree *tb, TreeDbTerm *this, void *ptr,
 	++(sc->got);
 	sc->accum = CONS(hp, ret, sc->accum);
     }
-    if (MBUF(sc->p)) {
-	/*
-	 * Force a trap and GC if a heap fragment was created. Many heap fragments
-	 * make the GC slow.
-	 */
-	sc->max = 0;
-    }
     if (--(sc->max) <= 0 || sc->got == sc->chunk_size) {
 	return 0;
     }
diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index 4e479c26ef..f8e964e2cf 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -409,8 +409,18 @@ static void full_flush_env(ErlNifEnv* env)
 static void full_cache_env(ErlNifEnv* env)
 {    
 #ifdef ERTS_DIRTY_SCHEDULERS
-    if (env->proc->static_flags & ERTS_STC_FLG_SHADOW_PROC)
+    if (env->proc->static_flags & ERTS_STC_FLG_SHADOW_PROC) {
 	erts_cache_dirty_shadow_proc(env->proc);
+        /*
+         * If shadow proc had heap fragments when flushed
+         * those have now been moved to the real proc.
+         * Ensure heap pointers do not point into a heap
+         * fragment on real proc...
+         */
+        ASSERT(!env->proc->mbuf);
+	env->hp_end = HEAP_LIMIT(env->proc);
+	env->hp = HEAP_TOP(env->proc);
+    }
 #endif
     cache_env(env);
 }
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index 9f6adb03d0..be94418755 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -88,10 +88,6 @@
 #undef HARDDEBUG
 #endif
 
-#ifdef HARDDEBUG
-#define HARDDEBUG_RUNQS
-#endif
-
 #ifdef HIPE
 #include "hipe_mode_switch.h"	/* for hipe_init_process() */
 #include "hipe_signal.h"	/* for hipe_thread_signal_init() */
@@ -3420,6 +3416,46 @@ static void suspend_scheduler(ErtsSchedulerData *esdp);
 
 #endif /* ERTS_SMP */
 
+#ifdef HARDDEBUG
+#define ERTS_HDBG_CHK_SLEEP_LIST(SL, L, F, FN) \
+     check_sleepers_list((SL), (L), (F), (FN))
+static void check_sleepers_list(ErtsSchedulerSleepList *sl,
+                                int lock,
+                                ErtsSchedulerSleepInfo *find,
+                                ErtsSchedulerSleepInfo *find_not)
+{
+    ErtsSchedulerSleepInfo *last_out;
+    int found = 0;
+
+    if (lock)
+        erts_smp_spin_lock(&sl->lock);
+
+    ERTS_ASSERT(!find_not || (!find_not->next && !find_not->prev));
+    
+    last_out = sl->list;
+    if (last_out) {
+        ErtsSchedulerSleepInfo *tmp = last_out;
+        do {
+            ERTS_ASSERT(tmp->next);
+            ERTS_ASSERT(tmp->prev);
+            ERTS_ASSERT(tmp->next->prev == tmp);
+            ERTS_ASSERT(tmp->prev->next == tmp);
+            ERTS_ASSERT(tmp != find_not);
+            if (tmp == find)
+                found = !0;
+            tmp = tmp->next;
+            
+        } while (tmp != last_out);
+    }
+    ERTS_ASSERT(!find || found);
+
+    if (lock)
+        erts_smp_spin_unlock(&sl->lock);
+}
+#else
+#define ERTS_HDBG_CHK_SLEEP_LIST(SL, L, F, FN) ((void) 0)
+#endif
+
 static void
 scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq)
 {
@@ -3436,32 +3472,12 @@ scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq)
 
     ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(rq));
 
-#ifdef ERTS_DIRTY_SCHEDULERS
-    if (ERTS_RUNQ_IX_IS_DIRTY(rq->ix))
-	erts_smp_spin_lock(&rq->sleepers.lock);
-#endif
     flgs = sched_prep_spin_wait(ssi);
     if (flgs & ERTS_SSI_FLG_SUSPENDED) {
 	/* Go suspend instead... */
-#ifdef ERTS_DIRTY_SCHEDULERS
-	if (ERTS_RUNQ_IX_IS_DIRTY(rq->ix))
-	    erts_smp_spin_unlock(&rq->sleepers.lock);
-#endif
 	return;
     }
 
-#ifdef ERTS_DIRTY_SCHEDULERS
-    if (ERTS_RUNQ_IX_IS_DIRTY(rq->ix)) {
-	ssi->prev = NULL;
-	ssi->next = rq->sleepers.list;
-	if (rq->sleepers.list)
-	    rq->sleepers.list->prev = ssi;
-	rq->sleepers.list = ssi;
-	erts_smp_spin_unlock(&rq->sleepers.lock);
-        dirty_active(esdp, -1);
-    }
-#endif
-
     /*
      * If all schedulers are waiting, one of them *should*
      * be waiting in erl_sys_schedule()
@@ -3471,6 +3487,28 @@ scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq)
 
 	sched_waiting(esdp->no, rq);
 
+#ifdef ERTS_DIRTY_SCHEDULERS
+        if (ERTS_RUNQ_IX_IS_DIRTY(rq->ix)) {
+            erts_smp_spin_lock(&rq->sleepers.lock);
+            ERTS_HDBG_CHK_SLEEP_LIST(&rq->sleepers, 0, NULL, ssi);
+            ASSERT(!ssi->next); /* Not in sleepers list */
+            ASSERT(!ssi->prev);
+            if (!rq->sleepers.list) {
+                ssi->next = ssi->prev = ssi;
+                rq->sleepers.list = ssi;
+            }
+            else {
+                ssi->prev = rq->sleepers.list;
+                ssi->next = rq->sleepers.list->next;
+                ssi->prev->next = ssi;
+                ssi->next->prev = ssi;
+            }
+            ERTS_HDBG_CHK_SLEEP_LIST(&rq->sleepers, 0, ssi, NULL);
+            erts_smp_spin_unlock(&rq->sleepers.lock);
+            dirty_active(esdp, -1);
+        }
+#endif
+
 	erts_smp_runq_unlock(rq);
 
 	spincount = sched_busy_wait.tse;
@@ -3597,7 +3635,31 @@ scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq)
 	    erts_thr_progress_active(esdp, thr_prgr_active = 1);
 	    sched_wall_time_change(esdp, 1);
         }
-
+        
+#ifdef ERTS_DIRTY_SCHEDULERS
+        if (ERTS_RUNQ_IX_IS_DIRTY(rq->ix)) {
+            erts_smp_spin_lock(&rq->sleepers.lock);
+            ERTS_HDBG_CHK_SLEEP_LIST(&rq->sleepers, 0, ssi->next ? ssi : NULL, NULL);
+            if (ssi->next) { /* Still in list... */
+                if (ssi->next == ssi) {
+                    ASSERT(rq->sleepers.list == ssi);
+                    ASSERT(ssi->prev == ssi);
+                    rq->sleepers.list = NULL;
+                }
+                else {
+                    ASSERT(ssi->prev != ssi);
+                    if (rq->sleepers.list == ssi)
+                        rq->sleepers.list = ssi->next;
+                    ssi->prev->next = ssi->next;
+                    ssi->next->prev = ssi->prev;
+                }
+                ssi->next = ssi->prev = NULL;
+            }
+            ERTS_HDBG_CHK_SLEEP_LIST(&rq->sleepers, 0, NULL, ssi);
+            erts_smp_spin_unlock(&rq->sleepers.lock);
+        }
+#endif
+        
 	erts_smp_runq_lock(rq);
 	sched_active(esdp->no, rq);
 
@@ -3883,55 +3945,44 @@ wake_scheduler(ErtsRunQueue *rq)
 
 #ifdef ERTS_DIRTY_SCHEDULERS
 static void
-wake_dirty_schedulers(ErtsRunQueue *rq, int one)
+wake_dirty_scheduler(ErtsRunQueue *rq)
 {
-    ErtsSchedulerSleepInfo *ssi;
+    ErtsSchedulerSleepInfo *lo_ssi, *fo_ssi;
     ErtsSchedulerSleepList *sl;
 
     ASSERT(ERTS_RUNQ_IX_IS_DIRTY(rq->ix));
 
     sl = &rq->sleepers;
     erts_smp_spin_lock(&sl->lock);
-    ssi = sl->list;
-    if (!ssi) {
+    ERTS_HDBG_CHK_SLEEP_LIST(&rq->sleepers, 0, NULL, NULL);
+    lo_ssi = sl->list;
+    if (!lo_ssi) {
 	erts_smp_spin_unlock(&sl->lock);
-	if (one)
-	    wake_scheduler(rq);
-    } else if (one) {
+        wake_scheduler(rq);
+    }
+    else {
 	erts_aint32_t flgs;
-	if (ssi->prev)
-	    ssi->prev->next = ssi->next;
-	else {
-	    ASSERT(sl->list == ssi);
-	    sl->list = ssi->next;
+        fo_ssi = lo_ssi->next;
+        ASSERT(fo_ssi->prev == lo_ssi);
+        if (fo_ssi == lo_ssi) {
+	    ASSERT(lo_ssi->prev == lo_ssi);
+            sl->list = NULL;
+        }
+        else {
+            ASSERT(lo_ssi->prev != lo_ssi);
+	    lo_ssi->next = fo_ssi->next;
+	    fo_ssi->next->prev = fo_ssi->prev;
 	}
-	if (ssi->next)
-	    ssi->next->prev = ssi->prev;
-
-	erts_smp_spin_unlock(&sl->lock);
-
-	ERTS_THR_MEMORY_BARRIER;
-	flgs = ssi_flags_set_wake(ssi);
-	erts_sched_finish_poke(ssi, flgs);
-    } else {
-	sl->list = NULL;
+        fo_ssi->next = fo_ssi->prev = NULL;
+        ERTS_HDBG_CHK_SLEEP_LIST(&rq->sleepers, 0, NULL, fo_ssi);
 	erts_smp_spin_unlock(&sl->lock);
 
 	ERTS_THR_MEMORY_BARRIER;
-	do {
-	    ErtsSchedulerSleepInfo *wake_ssi = ssi;
-	    ssi = ssi->next;
-	    erts_sched_finish_poke(wake_ssi, ssi_flags_set_wake(wake_ssi));
-	} while (ssi);
+	flgs = ssi_flags_set_wake(fo_ssi);
+	erts_sched_finish_poke(fo_ssi, flgs);
     }
 }
 
-static void
-wake_dirty_scheduler(ErtsRunQueue *rq)
-{
-    wake_dirty_schedulers(rq, 1);
-}
-
 #endif
 
 #define ERTS_NO_USED_RUNQS_SHIFT 16
@@ -4176,6 +4227,8 @@ dequeue_process(ErtsRunQueue *runq, int prio_q, erts_aint32_t *statep)
     ERTS_SMP_DATA_DEPENDENCY_READ_MEMORY_BARRIER;
 
     state = erts_smp_atomic32_read_nob(&p->state);
+    ASSERT(state & ERTS_PSFLG_IN_RUNQ);
+
     if (statep)
 	*statep = state;
 
@@ -4183,8 +4236,7 @@ dequeue_process(ErtsRunQueue *runq, int prio_q, erts_aint32_t *statep)
 
     rqi = &runq->procs.prio_info[prio];
 
-    if (p)
-	unqueue_process(runq, rpq, rqi, prio, NULL, p);
+    unqueue_process(runq, rpq, rqi, prio, NULL, p);
 
     return p;
 }
@@ -4512,7 +4564,7 @@ evacuate_run_queue(ErtsRunQueue *rq,
 
 	erts_smp_runq_unlock(to_rq);
 	smp_notify_inc_runq(to_rq);
-	erts_smp_runq_lock(to_rq);
+	erts_smp_runq_lock(rq);
     }
 
     if (rq->ports.start) {
@@ -4593,22 +4645,17 @@ evacuate_run_queue(ErtsRunQueue *rq,
 		    free_proxy_proc(proc);
 		else {
 		    erts_aint32_t clr_bits;
-#ifdef DEBUG
-		    erts_aint32_t old;
-#endif
 
 		    clr_bits = ERTS_PSFLG_IN_RUNQ;
 		    clr_bits |= qbit << ERTS_PSFLGS_IN_PRQ_MASK_OFFSET;
 
-#ifdef DEBUG
-		    old =
-#else
-			(void)
-#endif
-			erts_smp_atomic32_read_band_mb(&proc->state,
-						       ~clr_bits);
-			ASSERT((old & clr_bits) == clr_bits);
+                    state = erts_smp_atomic32_read_band_mb(&proc->state, ~clr_bits);
+                    ASSERT((state & clr_bits) == clr_bits);
 
+                    if (state & ERTS_PSFLG_FREE) {
+                        /* free and not queued by proxy */
+                        erts_proc_dec_refc(proc);
+		    }
 		}
 
 		goto handle_next_proc;
@@ -6371,6 +6418,8 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online
     for (ix = 0; ix < no_dirty_cpu_schedulers; ix++) {
 	ErtsSchedulerSleepInfo *ssi = &aligned_dirty_cpu_sched_sleep_info[ix].ssi;
 	erts_smp_atomic32_init_nob(&ssi->flags, 0);
+        ssi->next = NULL;
+        ssi->prev = NULL;
 	ssi->event = NULL; /* initialized in sched_dirty_cpu_thread_func */
 	erts_atomic32_init_nob(&ssi->aux_work, 0);
     }
@@ -6381,6 +6430,8 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online
     for (ix = 0; ix < no_dirty_io_schedulers; ix++) {
 	ErtsSchedulerSleepInfo *ssi = &aligned_dirty_io_sched_sleep_info[ix].ssi;
 	erts_smp_atomic32_init_nob(&ssi->flags, 0);
+        ssi->next = NULL;
+        ssi->prev = NULL;
 	ssi->event = NULL; /* initialized in sched_dirty_io_thread_func */
 	erts_atomic32_init_nob(&ssi->aux_work, 0);
     }
@@ -6726,13 +6777,14 @@ fin_dirty_enq_s_change(Process *p,
 	/* Already enqueue by someone else... */
 	if (pstruct_reserved) {
 	    /* We reserved process struct for enqueue; clear it... */
-#ifdef DEBUG
-	    erts_aint32_t old =
-#else
-		(void)
-#endif
-		erts_smp_atomic32_read_band_nob(&p->state, ~ERTS_PSFLG_IN_RUNQ);
-	    ASSERT(old & ERTS_PSFLG_IN_RUNQ);
+	    erts_aint32_t state;
+
+            state = erts_smp_atomic32_read_band_nob(&p->state, ~ERTS_PSFLG_IN_RUNQ);
+            ASSERT(state & ERTS_PSFLG_IN_RUNQ);
+
+            if (state & ERTS_PSFLG_FREE) {
+                erts_proc_dec_refc(p);
+            }
 	}
 	return 0;
     }
@@ -6903,8 +6955,9 @@ schedule_out_process(ErtsRunQueue *c_rq, erts_aint32_t state, Process *p,
 	enqueue = ERTS_ENQUEUE_NOT;
 
 	n &= ~running_flgs;
-	if ((a & (ERTS_PSFLG_ACTIVE_SYS|ERTS_PSFLG_DIRTY_ACTIVE_SYS))
-	    || (a & (ERTS_PSFLG_ACTIVE|ERTS_PSFLG_SUSPENDED)) == ERTS_PSFLG_ACTIVE) {
+	if ((!!(a & (ERTS_PSFLG_ACTIVE_SYS|ERTS_PSFLG_DIRTY_ACTIVE_SYS))
+	    | ((a & (ERTS_PSFLG_ACTIVE|ERTS_PSFLG_SUSPENDED)) == ERTS_PSFLG_ACTIVE))
+            & !(a & ERTS_PSFLG_FREE)) {
 	    enqueue = check_enqueue_in_prio_queue(p, &enq_prio, &n, a);
 	}
 	a = erts_smp_atomic32_cmpxchg_mb(&p->state, n, e);
@@ -7158,129 +7211,72 @@ erts_schedule_process(Process *p, erts_aint32_t state, ErtsProcLocks locks)
     schedule_process(p, state, locks);
 }
 
-static int
-schedule_process_sys_task(Process *p, erts_aint32_t prio, ErtsProcSysTask *st,
-			  erts_aint32_t *fail_state_p)
-{
-    int res;
-    int locked;
-    ErtsProcSysTaskQs *stqs, *free_stqs;
-    erts_aint32_t fail_state, state, a, n, enq_prio;
+/* Enqueues the given sys task on the process and schedules it. The task may be
+ * NULL if only scheduling is desired. */
+static ERTS_INLINE erts_aint32_t
+active_sys_enqueue(Process *p, ErtsProcSysTask *sys_task,
+                   erts_aint32_t task_prio, erts_aint32_t enable_flags,
+                   erts_aint32_t state, erts_aint32_t *fail_state_p)
+{
+    int runnable_procs = erts_system_profile_flags.runnable_procs;
+    erts_aint32_t n, a, enq_prio, fail_state;
+    int already_scheduled;
+    int status_locked;
     int enqueue; /* < 0 -> use proxy */
-    unsigned int prof_runnable_procs;
 
+    enable_flags |= ERTS_PSFLG_ACTIVE_SYS;
     fail_state = *fail_state_p;
-
-    res = 1; /* prepare for success */
-    st->next = st->prev = st; /* Prep for empty prio queue */
-    state = erts_smp_atomic32_read_nob(&p->state);
-    prof_runnable_procs = erts_system_profile_flags.runnable_procs;
-    locked = 0;
-    free_stqs = NULL;
-    if (state & ERTS_PSFLG_ACTIVE_SYS)
-	stqs = NULL;
-    else {
-    alloc_qs:
-	stqs = proc_sys_task_queues_alloc();
-	stqs->qmask = 1 << prio;
-	stqs->ncount = 0;
-	stqs->q[PRIORITY_MAX] = NULL;
-	stqs->q[PRIORITY_HIGH] = NULL;
-	stqs->q[PRIORITY_NORMAL] = NULL;
-	stqs->q[PRIORITY_LOW] = NULL;
-	stqs->q[prio] = st;
-    }
-
-    if (!locked) {
-	locked = 1;
-	erts_smp_proc_lock(p, ERTS_PROC_LOCK_STATUS);
-
-	state = erts_smp_atomic32_read_nob(&p->state);
-	if (state & fail_state) {
-	    *fail_state_p = (state & fail_state);
-	    free_stqs = stqs;
-	    res = 0;
-	    goto cleanup;
-	}
-    }
-
-    if (!p->sys_task_qs) {
-	if (stqs)
-	    p->sys_task_qs = stqs;
-	else
-	    goto alloc_qs;
-    }
-    else {
-	free_stqs = stqs;
-	stqs = p->sys_task_qs;
-	if (!stqs->q[prio]) {
-	    stqs->q[prio] = st;
-	    stqs->qmask |= 1 << prio;
-	}
-	else {
-	    st->next = stqs->q[prio];
-	    st->prev = stqs->q[prio]->prev;
-	    st->next->prev = st;
-	    st->prev->next = st;
-	    ASSERT(stqs->qmask & (1 << prio));
-	}
-    }
-
-    if (ERTS_PSFLGS_GET_ACT_PRIO(state) > prio) {
-	erts_aint32_t n, a, e;
-	/* Need to elevate actual prio */
-
-	a = state;
-	do {
-	    if (ERTS_PSFLGS_GET_ACT_PRIO(a) <= prio) {
-		n = a;
-		break;
-	    }
-	    n = e = a;
-	    n &= ~ERTS_PSFLGS_ACT_PRIO_MASK;
-	    n |= (prio << ERTS_PSFLGS_ACT_PRIO_OFFSET);
-	    a = erts_smp_atomic32_cmpxchg_nob(&p->state, n, e);    
-	} while (a != e);
-	state = n;
-    }
-
-
-    a = state;
+    already_scheduled = 0;
+    status_locked = 0;
     enq_prio = -1;
+    a = state;
 
-    /* Status lock prevents out of order "runnable proc" trace msgs */
-    ERTS_SMP_LC_ASSERT(ERTS_PROC_LOCK_STATUS & erts_proc_lc_my_proc_locks(p));
+    ERTS_SMP_LC_ASSERT(!(ERTS_PROC_LOCK_STATUS & erts_proc_lc_my_proc_locks(p)));
+    ASSERT(fail_state & (ERTS_PSFLG_EXITING | ERTS_PSFLG_FREE));
+    ASSERT(!(fail_state & enable_flags));
+    ASSERT(!(state & ERTS_PSFLG_PROXY));
 
-    if (!prof_runnable_procs) {
-	erts_smp_proc_unlock(p, ERTS_PROC_LOCK_STATUS);
-	locked = 0;
+    /* When runnable_procs is enabled, we need to take the status lock to
+     * prevent trace messages from being sent in the wrong order. The lock must
+     * be held over the call to add2runq.
+     *
+     * Otherwise, we only need to take it when we're enqueuing a task and can
+     * safely release it before add2runq. */
+    if (sys_task || runnable_procs) {
+        erts_smp_proc_lock(p, ERTS_PROC_LOCK_STATUS);
+        status_locked = 1;
     }
 
-    ASSERT(!(state & ERTS_PSFLG_PROXY));
-
     while (1) {
 	erts_aint32_t e;
 	n = e = a;
 
-	if (a & ERTS_PSFLG_FREE)
-	    goto cleanup; /* We don't want to schedule free processes... */
+	if (a & fail_state) {
+            *fail_state_p = a & fail_state;
+	    goto cleanup;
+        }
 
 	enqueue = ERTS_ENQUEUE_NOT;
-	n |= ERTS_PSFLG_ACTIVE_SYS;
+	n |= enable_flags;
+
 	if (!(a & (ERTS_PSFLG_RUNNING
 		   | ERTS_PSFLG_RUNNING_SYS
 		   | ERTS_PSFLG_DIRTY_RUNNING
-		   | ERTS_PSFLG_DIRTY_RUNNING_SYS)))
+		   | ERTS_PSFLG_DIRTY_RUNNING_SYS))) {
 	    enqueue = check_enqueue_in_prio_queue(p, &enq_prio, &n, a);
+        }
+
 	a = erts_smp_atomic32_cmpxchg_mb(&p->state, n, e);
-	if (a == e)
+	if (a == e) {
 	    break;
-	if (a == n && enqueue == ERTS_ENQUEUE_NOT)
-	    goto cleanup;
+	}
+        else if (a == n && enqueue == ERTS_ENQUEUE_NOT) {
+	    already_scheduled = 1;
+            break;
+        }
     }
 
-    if (prof_runnable_procs) {
-
+    if (!already_scheduled && runnable_procs) {
 	if (!(a & (ERTS_PSFLG_ACTIVE_SYS
 		   | ERTS_PSFLG_RUNNING
 		   | ERTS_PSFLG_RUNNING_SYS
@@ -7290,24 +7286,89 @@ schedule_process_sys_task(Process *p, erts_aint32_t prio, ErtsProcSysTask *st,
 	    /* We activated a prevously inactive process */
 	    profile_runnable_proc(p, am_active);
 	}
+    }
 
-	erts_smp_proc_unlock(p, ERTS_PROC_LOCK_STATUS);
-	locked = 0;
+    if (sys_task) {
+        ErtsProcSysTaskQs *stqs = p->sys_task_qs;
+
+        if (!stqs) {
+            sys_task->next = sys_task->prev = sys_task;
+
+            stqs = proc_sys_task_queues_alloc();
+
+            stqs->qmask = 1 << task_prio;
+            stqs->ncount = 0;
+            stqs->q[PRIORITY_MAX] = NULL;
+            stqs->q[PRIORITY_HIGH] = NULL;
+            stqs->q[PRIORITY_NORMAL] = NULL;
+            stqs->q[PRIORITY_LOW] = NULL;
+            stqs->q[task_prio] = sys_task;
+
+            p->sys_task_qs = stqs;
+        }
+        else {
+            if (!stqs->q[task_prio]) {
+                sys_task->next = sys_task->prev = sys_task;
+
+                stqs->q[task_prio] = sys_task;
+                stqs->qmask |= 1 << task_prio;
+            }
+            else {
+                sys_task->next = stqs->q[task_prio];
+                sys_task->prev = stqs->q[task_prio]->prev;
+                sys_task->next->prev = sys_task;
+                sys_task->prev->next = sys_task;
+                ASSERT(stqs->qmask & (1 << task_prio));
+            }
+        }
     }
 
-    add2runq(enqueue, enq_prio, p, n, NULL);
+    if (status_locked && !runnable_procs) {
+        erts_smp_proc_unlock(p, ERTS_PROC_LOCK_STATUS);
+        status_locked = 0;
+    }
+
+    if (!already_scheduled) {
+        add2runq(enqueue, enq_prio, p, n, NULL);
+    }
 
 cleanup:
+    if (status_locked) {
+        erts_smp_proc_unlock(p, ERTS_PROC_LOCK_STATUS);
+    }
 
-    if (locked)
-	erts_smp_proc_unlock(p, ERTS_PROC_LOCK_STATUS);
+    return n;
+}
 
-    if (free_stqs)
-	proc_sys_task_queues_free(free_stqs);
+static int
+schedule_process_sys_task(Process *p, erts_aint32_t prio, ErtsProcSysTask *st,
+			  erts_aint32_t *fail_state_p)
+{
+    erts_aint32_t fail_state, state;
 
-    ERTS_SMP_LC_ASSERT(!(ERTS_PROC_LOCK_STATUS & erts_proc_lc_my_proc_locks(p)));
+    /* Elevate priority if needed. */
+    state = erts_smp_atomic32_read_nob(&p->state);
+    if (ERTS_PSFLGS_GET_ACT_PRIO(state) > prio) {
+        erts_aint32_t n, a, e;
 
-    return res;
+        a = state;
+        do {
+            if (ERTS_PSFLGS_GET_ACT_PRIO(a) <= prio) {
+                n = a;
+                break;
+            }
+            n = e = a;
+            n &= ~ERTS_PSFLGS_ACT_PRIO_MASK;
+            n |= (prio << ERTS_PSFLGS_ACT_PRIO_OFFSET);
+            a = erts_smp_atomic32_cmpxchg_nob(&p->state, n, e);
+        } while (a != e);
+
+        state = n;
+    }
+
+    fail_state = *fail_state_p;
+
+    return !(active_sys_enqueue(p, st, prio, 0, state, fail_state_p) & fail_state);
 }
 
 static ERTS_INLINE int
@@ -7895,6 +7956,11 @@ suspend_scheduler(ErtsSchedulerData *esdp)
         return;
     }
 
+#ifdef HARDDEBUG
+    if (sched_type != ERTS_SCHED_NORMAL)
+        ERTS_HDBG_CHK_SLEEP_LIST(&esdp->run_queue->sleepers, !0, NULL, ssi);
+#endif
+    
     if (erts_smp_atomic32_read_nob(&ssi->flags) & ERTS_SSI_FLG_MSB_EXEC) {
         ASSERT(no == 1);
         if (!msb_scheduler_type_switch(sched_type, esdp, no))
@@ -10484,6 +10550,7 @@ Process *erts_schedule(ErtsSchedulerData *esdp, Process *p, int calls)
 
 	if (!is_normal_sched & !!(flags & ERTS_RUNQ_FLG_HALTING)) {
 	    /* Wait for emulator to terminate... */
+	    erts_smp_runq_unlock(rq);
 	    while (1)
 		erts_milli_sleep(1000*1000);
 	}
@@ -10758,6 +10825,7 @@ Process *erts_schedule(ErtsSchedulerData *esdp, Process *p, int calls)
 			}
 			else if (state & ERTS_PSFLG_FREE) {
 			    /* free and not queued by proxy */
+                            ASSERT(state & ERTS_PSFLG_IN_RUNQ);
 			    erts_proc_dec_refc(p);
 			}
                         if (!is_normal_sched)
@@ -14071,7 +14139,9 @@ erts_continue_exit_process(Process *p)
 	    n = e = a;
 	    ASSERT(a & ERTS_PSFLG_EXITING);
 	    n |= ERTS_PSFLG_FREE;
-	    n &= ~ERTS_PSFLG_ACTIVE;
+            n &= ~(ERTS_PSFLG_ACTIVE
+                   | ERTS_PSFLG_ACTIVE_SYS
+                   | ERTS_PSFLG_DIRTY_ACTIVE_SYS);
 	    if ((n & ERTS_PSFLG_IN_RUNQ) && !refc_inced) {
 		erts_proc_inc_refc(p);
 		refc_inced = 1;
@@ -14524,12 +14594,12 @@ void erts_halt(int code)
     if (-1 == erts_smp_atomic32_cmpxchg_acqb(&erts_halt_progress,
 					     erts_no_schedulers,
 					     -1)) {
+        notify_reap_ports_relb();
 #ifdef ERTS_DIRTY_SCHEDULERS
         ERTS_RUNQ_FLGS_SET(ERTS_DIRTY_CPU_RUNQ, ERTS_RUNQ_FLG_HALTING);
         ERTS_RUNQ_FLGS_SET(ERTS_DIRTY_IO_RUNQ, ERTS_RUNQ_FLG_HALTING);
 #endif
 	erts_halt_code = code;
-	notify_reap_ports_relb();
     }
 }
 
diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h
index 5cac939771..b3385b780f 100644
--- a/erts/emulator/beam/erl_process.h
+++ b/erts/emulator/beam/erl_process.h
@@ -368,7 +368,7 @@ typedef struct ErtsSchedulerSleepInfo_ ErtsSchedulerSleepInfo;
 #ifdef ERTS_DIRTY_SCHEDULERS
 typedef struct {
     erts_smp_spinlock_t lock;
-    ErtsSchedulerSleepInfo *list;
+    ErtsSchedulerSleepInfo *list; /* circular lifo list; points to last out */
 } ErtsSchedulerSleepList;
 #endif
 
diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c
index d7116bd2c3..5fc42f231a 100644
--- a/erts/emulator/beam/utils.c
+++ b/erts/emulator/beam/utils.c
@@ -3142,7 +3142,7 @@ tailrecur_ne:
 		ASSERT(alen == blen);
 		for (i = (Sint) alen - 1; i >= 0; i--)
 		    if (anum[i] != bnum[i])
-			RETURN_NEQ((Sint32) (anum[i] - bnum[i]));
+			RETURN_NEQ(anum[i] < bnum[i] ? -1 : 1);
 		goto pop_next;
 	    case (_TAG_HEADER_EXTERNAL_REF >> _TAG_PRIMARY_SIZE):
 		if (is_internal_ref(b)) {
diff --git a/erts/emulator/drivers/common/inet_drv.c b/erts/emulator/drivers/common/inet_drv.c
index 554c48059f..f6de343f72 100644
--- a/erts/emulator/drivers/common/inet_drv.c
+++ b/erts/emulator/drivers/common/inet_drv.c
@@ -1018,6 +1018,7 @@ typedef struct {
     inet_async_op* oph;          /* queue head or NULL */
     inet_async_op* opt;          /* queue tail or NULL */
     inet_async_op  op_queue[INET_MAX_ASYNC];  /* call queue */
+    int op_ref;                 /* queue reference generator  */
 
     int   active;               /* 0 = passive, 1 = active, 2 = active once */
     Sint16 active_count;        /* counter for {active,N} */
@@ -1268,8 +1269,7 @@ static int packet_inet_output(udp_descriptor* udesc, HANDLE event);
 /* convert descriptor pointer to inet_descriptor pointer */
 #define INETP(d) (&(d)->inet)
 
-static int async_ref = 0;          /* async reference id generator */
-#define NEW_ASYNC_ID() ((async_ref++) & 0xffff)
+#define NEW_ASYNC_ID(desc) ((desc)->op_ref++ & 0xffff)
 
 /* check for transition from active to passive */
 #define INET_CHECK_ACTIVE_TO_PASSIVE(inet)                              \
@@ -1921,7 +1921,7 @@ static void enq_multi_op(tcp_descriptor *desc, char *buf, int req,
 			 ErlDrvTermData caller, MultiTimerData *timeout,
 			 ErlDrvMonitor *monitorp)
 {
-    int id = NEW_ASYNC_ID();
+    int id = NEW_ASYNC_ID(INETP(desc));
     enq_old_multi_op(desc,id,req,caller,timeout,monitorp);
     if (buf != NULL)
 	put_int16(id, buf);
@@ -1990,7 +1990,7 @@ static int remove_multi_op(tcp_descriptor *desc, int *id_p, int *req_p,
 static int enq_async_w_tmo(inet_descriptor* desc, char* buf, int req, unsigned timeout,
 			   ErlDrvMonitor *monitorp)
 {
-    int id = NEW_ASYNC_ID();
+    int id = NEW_ASYNC_ID(desc);
     inet_async_op* opp;
 
     if ((opp = desc->oph) == NULL)            /* queue empty */
@@ -8423,6 +8423,7 @@ static ErlDrvData inet_start(ErlDrvPort port, int size, int protocol)
     desc->delimiter    = '\n';         /* line delimiting char */
     desc->oph = NULL;
     desc->opt = NULL;
+    desc->op_ref = 0;
 
     desc->peer_ptr = NULL;
     desc->name_ptr = NULL;
diff --git a/erts/emulator/test/big_SUITE.erl b/erts/emulator/test/big_SUITE.erl
index c308760211..b25868d3cb 100644
--- a/erts/emulator/test/big_SUITE.erl
+++ b/erts/emulator/test/big_SUITE.erl
@@ -24,7 +24,7 @@
 
 -export([t_div/1, eq_28/1, eq_32/1, eq_big/1, eq_math/1, big_literals/1,
 	 borders/1, negative/1, big_float_1/1, big_float_2/1,
-         bxor_2pow/1,
+         bxor_2pow/1, band_2pow/1,
 	 shift_limit_1/1, powmod/1, system_limit/1, toobig/1, otp_6692/1]).
 
 %% Internal exports.
@@ -43,7 +43,7 @@ suite() ->
 all() -> 
     [t_div, eq_28, eq_32, eq_big, eq_math, big_literals,
      borders, negative, {group, big_float}, shift_limit_1,
-     bxor_2pow,
+     bxor_2pow, band_2pow,
      powmod, system_limit, toobig, otp_6692].
 
 groups() -> 
@@ -449,3 +449,38 @@ my_bxor(A, B, N, Acc0) ->
                false -> Acc0 bor (1 bsl N)
           end,
     my_bxor(A bsr 1, B bsr 1, N+1, Acc1).
+
+
+%% ERL-804
+band_2pow(_Config) ->
+    IL = lists:seq(8*3, 8*16, 4),
+    JL = lists:seq(0, 64),
+    [band_2pow_1((1 bsl I), (1 bsl J))
+     || I <- IL, J <- JL],
+    ok.
+
+band_2pow_1(A, B) ->
+    for(-1,1, fun(Ad) ->
+                      for(-1,1, fun(Bd) ->
+                                        band_2pow_2(A+Ad, B+Bd),
+                                        band_2pow_2(-A+Ad, B+Bd),
+                                        band_2pow_2(A+Ad, -B+Bd),
+                                        band_2pow_2(-A+Ad, -B+Bd)
+                                end)
+              end).
+
+band_2pow_2(A, B) ->
+    Correct = my_band(A, B),
+    case A band B of
+        Correct -> ok;
+        Wrong ->
+            io:format("~.16# band ~.16#\n", [A,B]),
+            io:format("Expected ~.16#\n", [Correct]),
+            io:format("Got      ~.16#\n", [Wrong]),
+            ct:fail({failed, 'band'})
+
+    end.
+
+%% Implement band without band
+my_band(A, B) ->
+    bnot ((bnot A) bor (bnot B)).
diff --git a/erts/emulator/test/process_SUITE.erl b/erts/emulator/test/process_SUITE.erl
index a8bcfac84d..899a5c26bd 100644
--- a/erts/emulator/test/process_SUITE.erl
+++ b/erts/emulator/test/process_SUITE.erl
@@ -58,6 +58,7 @@
 	 no_priority_inversion2/1,
 	 system_task_blast/1,
 	 system_task_on_suspended/1,
+         system_task_failed_enqueue/1,
 	 gc_request_when_gc_disabled/1,
 	 gc_request_blast_when_gc_disabled/1]).
 -export([prio_server/2, prio_client/2, init/1, handle_event/2]).
@@ -104,7 +105,7 @@ groups() ->
        otp_7738_resume]},
      {system_task, [],
       [no_priority_inversion, no_priority_inversion2,
-       system_task_blast, system_task_on_suspended,
+       system_task_blast, system_task_on_suspended, system_task_failed_enqueue,
        gc_request_when_gc_disabled, gc_request_blast_when_gc_disabled]}].
 
 init_per_suite(Config) ->
@@ -2531,6 +2532,57 @@ system_task_on_suspended(Config) when is_list(Config) ->
 	    ok
     end.
 
+%% When a system task couldn't be enqueued due to the process being in an
+%% incompatible state, it would linger in the system task list and get executed
+%% anyway the next time the process was scheduled. This would result in a
+%% double-free at best.
+%%
+%% This test continuously purges modules while other processes run dirty code,
+%% which will provoke this error as ERTS_PSTT_CPC can't be enqueued while a
+%% process is running dirty code.
+system_task_failed_enqueue(Config) when is_list(Config) ->
+    case erlang:system_info(dirty_cpu_schedulers) of
+        N when N > 0 ->
+            system_task_failed_enqueue_1(Config);
+        _ ->
+            {skipped, "No dirty scheduler support"}
+    end.
+
+system_task_failed_enqueue_1(Config) ->
+    Priv = proplists:get_value(priv_dir, Config),
+
+    Purgers = [spawn_link(fun() -> purge_loop(Priv, Id) end)
+               || Id <- lists:seq(1, erlang:system_info(schedulers))],
+    Hogs = [spawn_link(fun() -> dirty_loop() end)
+            || _ <- lists:seq(1, erlang:system_info(dirty_cpu_schedulers))],
+
+    ct:sleep(5000),
+
+    [begin
+         unlink(Pid),
+         exit(Pid, kill)
+     end || Pid <- (Purgers ++ Hogs)],
+
+    ok.
+
+purge_loop(PrivDir, Id) ->
+    Mod = "failed_enq_" ++ integer_to_list(Id),
+    Path = PrivDir ++ "/" ++ Mod,
+    file:write_file(Path ++ ".erl",
+                    "-module('" ++ Mod ++ "').\n" ++
+                        "-export([t/0]).\n" ++
+                        "t() -> ok."),
+    purge_loop_1(Path).
+purge_loop_1(Path) ->
+    {ok, Mod} = compile:file(Path, []),
+    erlang:delete_module(Mod),
+    erts_code_purger:purge(Mod),
+    purge_loop_1(Path).
+
+dirty_loop() ->
+    ok = erts_debug:dirty_cpu(reschedule, 10000),
+    dirty_loop().
+
 gc_request_when_gc_disabled(Config) when is_list(Config) ->
     AIS = erts_debug:set_internal_state(available_internal_state, true),
     gc_request_when_gc_disabled_do(ref),
diff --git a/erts/emulator/test/ref_SUITE.erl b/erts/emulator/test/ref_SUITE.erl
index 5f519d522e..74df857c65 100644
--- a/erts/emulator/test/ref_SUITE.erl
+++ b/erts/emulator/test/ref_SUITE.erl
@@ -22,6 +22,7 @@
 
 -export([all/0, suite/0]).
 -export([wrap_1/1]).
+-export([compare_list/1, compare_ets/1]).
 
 -export([loop_ref/1]).
 
@@ -32,7 +33,7 @@ suite() ->
      {timetrap, {minutes, 2}}].
 
 all() -> 
-    [wrap_1].
+    [wrap_1, compare_list, compare_ets].
 
 %% Check that refs don't wrap around easily.
 wrap_1(Config) when is_list(Config) ->
@@ -53,3 +54,28 @@ loop_ref(Parent) ->
 loop_ref(R, R, _) -> ok;
 loop_ref(R0, _, N) ->
     loop_ref(R0, make_ref(), N+1).
+
+%% Check that ref ordering works
+compare_list(Config) when is_list(Config) ->
+    %% Although this test uses external refs, it would apply the same to plain refs
+    ExtRef1 = <<131,114,0,3,100,0,3,110,64,98,3, 0,0,173,156, 0,216,0,4, 0,0,0,0>>,
+    ExtRef2 = <<131,114,0,3,100,0,3,110,64,98,3, 0,1,31,27,   129,4,0,1, 0,0,0,0>>,
+
+    Ref1 = binary_to_term(ExtRef1), %% #Ref<[email protected]>
+    Ref2 = binary_to_term(ExtRef2), %% #Ref<[email protected]>
+    OrderedList = [Ref1, Ref2],
+    OrderedList = lists:sort(OrderedList),
+    ok.
+
+%% This is the scarier case since it makes terms "invisible" in ets or Mnesia
+%% (the underlying fault cause is the same as compare_list/1)
+compare_ets(Config) when is_list(Config) ->
+    W2s = [610350147,899574699,2994196869,686384822,2397690439, 923302211],
+    ExtRefBase = <<131,114,0,3,100,0,3,110,64,98,3>>,
+    ExtRefs = [<<ExtRefBase/binary, 1:32, W2:32, 0:32>> || W2 <- W2s],
+    Refs = [binary_to_term(Bin) || Bin <- ExtRefs],
+
+    Ets = ets:new(refbug, [ordered_set]),
+    ets:insert(Ets, [{Ref,Ref} || Ref <- Refs]),
+    0 = length([R || R <- ets:tab2list(Ets), ets:lookup(Ets, element(1,R)) == []]),
+    ok.
diff --git a/erts/vsn.mk b/erts/vsn.mk
index 9222b74f81..317ce9b800 100644
--- a/erts/vsn.mk
+++ b/erts/vsn.mk
@@ -18,7 +18,7 @@
 # %CopyrightEnd%
 # 
 
-VSN = 9.3.3
+VSN = 9.3.3.9
 
 # Port number 4365 in 4.2
 # Port number 4366 in 4.3