16 files changed, 371 insertions, 59 deletions
diff --git a/erts/configure.in b/erts/configure.in
index bcdc6cd083..4a27b532de 100644
--- a/erts/configure.in
+++ b/erts/configure.in
@@ -388,6 +388,56 @@ if test X"$with_ets_write_concurrency_locks" != X""; then
 		      [Define to override the default number of write_concurrency locks])
 fi
 
+AC_ARG_WITH(spectre-mitigation,
+    AS_HELP_STRING([--with-spectre-mitigation={yes|incomplete}],
+                   [enable spectre mitigation, either fully or with mitigations
+                    disabled in a handful places like the interpreter])
+    AS_HELP_STRING([--without-spectre-mitigation],
+                   [build without spectre mitigation]),
+    [],[with_spectre_mitigation=no])
+
+case "$with_spectre_mitigation" in
+    no) ;;
+    yes) ;;
+    incomplete) ;;
+    *) AC_MSG_ERROR([Invalid spectre mitigation setting]) ;;
+esac
+
+i_noretpoline_attr=""
+
+if test X"$with_spectre_mitigation" != X"no"; then
+    CFLAGS="$CFLAGS -mindirect-branch=thunk"
+
+    AC_MSG_CHECKING([for spectre mitigation])
+    AC_COMPILE_IFELSE(
+        [AC_LANG_PROGRAM([],[return 0;])],
+        [AC_MSG_RESULT([yes])],
+        [AC_MSG_ERROR([no])])
+
+    if test X"$with_spectre_mitigation" = X"incomplete"; then
+        # gcc and clang support this attribute if they're recent enough. Note
+        # that we must compile with -Werror to check for actual support as they
+        # warn rather than error out on unsupported attributes.
+
+        i_noretpoline_attr='__attribute__((__indirect_branch__("keep")))'
+        i_preserve_cflags="$CFLAGS"
+        CFLAGS="$CFLAGS -Werror"
+
+        AC_MSG_CHECKING([whether spectre mitigation can be disabled on a per-function basis])
+        AC_COMPILE_IFELSE(
+            [AC_LANG_PROGRAM([$i_noretpoline_attr],[return 0;])],
+            [AC_MSG_RESULT([yes])],
+            [AC_MSG_ERROR([no])])
+
+        CFLAGS="$i_preserve_cflags"
+    fi
+fi
+
+AC_DEFINE_UNQUOTED(ERTS_NO_RETPOLINE, $i_noretpoline_attr,
+                   [Per-function attribute for disabling retpoline. This is
+                    *only* defined when --with-spectre-mitigation=incomplete
+                    and has no effects otherwise])
+
 dnl ----------------------------------------------------------------------
 dnl Checks for programs.
 dnl ----------------------------------------------------------------------
diff --git a/erts/doc/src/counters.xml b/erts/doc/src/counters.xml
index 85eedfdadc..ba4a22759f 100644
--- a/erts/doc/src/counters.xml
+++ b/erts/doc/src/counters.xml
@@ -85,14 +85,22 @@
 	  </item>
 	  <tag><c>write_concurrency</c></tag>
 	  <item><p>This is an optimization to achieve very efficient concurrent
-	  write operations at the expense of potential read inconsistency and memory
-	  consumption per counter.</p>
+	  <seealso marker="#add/3"><c>add</c></seealso> and <seealso
+	  marker="#sub/3"><c>sub</c></seealso> operations at the expense of potential read
+	  inconsistency and memory consumption per counter.</p>
 	  <p>Read operations may see sequentially inconsistent results with
 	  regard to concurrent write operations. Even if write operation A is done
 	  sequencially before write operation B, a concurrent reader may see any
 	  combination of A and B, including only B. A read operation is only
 	  guaranteed to see all writes done sequentially before the read. No writes
 	  are ever lost, but will eventually all be seen.</p>
+	  <p>The typical use case for <c>write_concurrency</c> is when
+	  concurrent calls to <seealso marker="#add/3"><c>add</c></seealso> and
+	  <seealso marker="#sub/3"><c>sub</c></seealso> toward the same counters
+	  are very frequent, while calls to <seealso marker="#get/2"><c>get</c>
+	  </seealso> and <seealso marker="#put/3"><c>put</c></seealso> are much
+	  less frequent. The lack of absolute read consistency must also be
+	  acceptable.</p>
 	  </item>
 	</taglist>
       </desc>
@@ -110,7 +118,8 @@
       <name name="add" arity="3"/>
       <fsummary>Add to counter</fsummary>
       <desc>
-        <p>Add <c><anno>Incr</anno></c> to counter.</p>
+        <p>Add <c><anno>Incr</anno></c> to counter at index
+	<c><anno>Ix</anno></c>.</p>
       </desc>
     </func>
 
@@ -118,7 +127,26 @@
       <name name="sub" arity="3"/>
       <fsummary>Subtract from counter</fsummary>
       <desc>
-        <p>Subtract <c><anno>Decr</anno></c> from counter.</p>
+        <p>Subtract <c><anno>Decr</anno></c> from counter at index
+	<c><anno>Ix</anno></c>.</p>
+      </desc>
+    </func>
+
+    <func>
+      <name name="put" arity="3"/>
+      <fsummary>Set counter to value</fsummary>
+      <desc>
+        <p>Write <c><anno>Value</anno></c> to counter at index
+	<c><anno>Ix</anno></c>.</p>
+	<note>
+	  <p>Despite its name, the <c>write_concurrency</c> optimization does not
+	  improve <c>put</c>. A call to <c>put</c> is a relatively heavy
+	  operation compared to the very lightweight and scalable <seealso
+	  marker="#add/3"><c>add</c></seealso> and <seealso marker="#sub/3">
+	  <c>sub</c></seealso>. The cost for a <c>put</c> with
+	  <c>write_concurrency</c> is like a <seealso marker="#get/2"><c>get</c>
+	  </seealso> plus a <c>put</c> without <c>write_concurrency</c>.</p>
+	</note>
       </desc>
     </func>
 
diff --git a/erts/doc/src/erl_nif.xml b/erts/doc/src/erl_nif.xml
index 190ec12d0e..bbc12b0a56 100644
--- a/erts/doc/src/erl_nif.xml
+++ b/erts/doc/src/erl_nif.xml
@@ -293,7 +293,7 @@ return term;</code>
           arguments. When you write to a shared state either through
           static variables or <seealso marker="#enif_priv_data">
           <c>enif_priv_data</c></seealso>, you need to supply your own explicit
-          synchronization. This includes terms in process-independent
+          synchronization. This includes terms in process independent
           environments that are shared between threads. Resource objects also
           require synchronization if you treat them as mutable.</p>
         <p>The library initialization callbacks <c>load</c> and
@@ -596,7 +596,7 @@ int writeiovec(ErlNifEnv *env, ERL_NIF_TERM term, ERL_NIF_TERM *tail,
           <c>--enable-static-nifs</c>, you must define <c>STATIC_ERLANG_NIF</c>
            before the <c>ERL_NIF_INIT</c> declaration.</p>
       </item>
-      <tag><marker id="load"/><c>int (*load)(ErlNifEnv* env, void** priv_data,
+      <tag><marker id="load"/><c>int (*load)(ErlNifEnv* caller_env, void** priv_data,
         ERL_NIF_TERM load_info)</c></tag>
       <item>
         <p><c>load</c> is called when the NIF library is loaded
@@ -612,7 +612,7 @@ int writeiovec(ErlNifEnv *env, ERL_NIF_TERM term, ERL_NIF_TERM *tail,
           anything other than <c>0</c>. <c>load</c> can be <c>NULL</c> if
           initialization is not needed.</p> 
       </item>
-      <tag><marker id="upgrade"/><c>int (*upgrade)(ErlNifEnv* env, void**
+      <tag><marker id="upgrade"/><c>int (*upgrade)(ErlNifEnv* caller_env, void**
         priv_data, void** old_priv_data, ERL_NIF_TERM load_info)</c></tag>
       <item>
         <p><c>upgrade</c> is called when the NIF library is loaded
@@ -626,7 +626,7 @@ int writeiovec(ErlNifEnv *env, ERL_NIF_TERM term, ERL_NIF_TERM *tail,
         <p>The library fails to load if <c>upgrade</c> returns
            anything other than <c>0</c> or if <c>upgrade</c> is <c>NULL</c>.</p>
       </item>
-      <tag><marker id="unload"/><c>void (*unload)(ErlNifEnv* env, void*
+      <tag><marker id="unload"/><c>void (*unload)(ErlNifEnv* caller_env, void*
         priv_data)</c></tag>
       <item>
         <p><c>unload</c> is called when the module code that
@@ -654,27 +654,41 @@ int writeiovec(ErlNifEnv *env, ERL_NIF_TERM term, ERL_NIF_TERM *tail,
         <p><c>ErlNifEnv</c> represents an environment that can host Erlang
           terms. All terms in an environment are valid as long as the
           environment is valid. <c>ErlNifEnv</c> is an opaque type; pointers to
-          it can only be passed on to API functions. Two types of environments
+          it can only be passed on to API functions. Three types of environments
           exist:</p>
         <taglist>
-          <tag>Process-bound environment</tag>
+          <tag>Process bound environment</tag>
           <item>
             <p>Passed as the first argument to all NIFs. All function arguments
               passed to a NIF belong to that environment. The return value from
               a NIF must also be a term belonging to the same environment.</p>
-            <p>A process-bound environment contains transient information
+            <p>A process bound environment contains transient information
               about the calling Erlang process. The environment is only valid
               in the thread where it was supplied as argument until the NIF
               returns. It is thus useless and dangerous to store pointers to
-              process-bound environments between NIF calls.</p>
+              process bound environments between NIF calls.</p>
           </item>
-          <tag>Process-independent environment</tag>
+	  <tag>Callback environment</tag>
+          <item>
+            <p>Passed as the first argument to all the non-NIF callback functions
+	    (<seealso marker="#load"><c>load</c></seealso>,
+	    <seealso marker="#upgrade"><c>upgrade</c></seealso>,
+	    <seealso marker="#unload"><c>unload</c></seealso>,
+	    <seealso marker="#ErlNifResourceDtor"><c>dtor</c></seealso>,
+	    <seealso marker="#ErlNifResourceDown"><c>down</c></seealso> and
+	    <seealso marker="#ErlNifResourceStop"><c>stop</c></seealso>).
+	    Works like a process bound environment but with a temporary
+	    pseudo process that "terminates" when the callback has
+	    returned. Terms may be created in this environment but they will
+	    only be accessible during the callback.</p>
+          </item>
+          <tag>Process independent environment</tag>
           <item>
             <p>Created by calling <seealso marker="#enif_alloc_env">
               <c>enif_alloc_env</c></seealso>. This environment can be
               used to store terms between NIF calls and to send terms with
               <seealso marker="#enif_send"><c>enif_send</c></seealso>. A
-              process-independent environment with all its terms is valid until
+              process independent environment with all its terms is valid until
               you explicitly invalidate it with
               <seealso marker="#enif_free_env"><c>enif_free_env</c></seealso>
               or <c>enif_send</c>.</p>
@@ -799,7 +813,7 @@ typedef struct {
       <tag><marker id="ErlNifResourceDtor"/><c>ErlNifResourceDtor</c></tag>
       <item>
         <code type="none">
-typedef void ErlNifResourceDtor(ErlNifEnv* env, void* obj);</code>
+typedef void ErlNifResourceDtor(ErlNifEnv* caller_env, void* obj);</code>
         <p>The function prototype of a resource destructor function.</p>
 	<p>The <c>obj</c> argument is a pointer to the resource. The only
 	allowed use for the resource in the destructor is to access its
@@ -809,7 +823,7 @@ typedef void ErlNifResourceDtor(ErlNifEnv* env, void* obj);</code>
       <tag><marker id="ErlNifResourceDown"/><c>ErlNifResourceDown</c></tag>
       <item>
         <code type="none">
-typedef void ErlNifResourceDown(ErlNifEnv* env, void* obj, ErlNifPid* pid, ErlNifMonitor* mon);</code>
+typedef void ErlNifResourceDown(ErlNifEnv* caller_env, void* obj, ErlNifPid* pid, ErlNifMonitor* mon);</code>
         <p>The function prototype of a resource down function,
 	  called on the behalf of <seealso marker="#enif_monitor_process">
 	  enif_monitor_process</seealso>. <c>obj</c> is the resource, <c>pid</c>
@@ -820,7 +834,7 @@ typedef void ErlNifResourceDown(ErlNifEnv* env, void* obj, ErlNifPid* pid, ErlNi
       <tag><marker id="ErlNifResourceStop"/><c>ErlNifResourceStop</c></tag>
       <item>
         <code type="none">
-typedef void ErlNifResourceStop(ErlNifEnv* env, void* obj, ErlNifEvent event, int is_direct_call);</code>
+typedef void ErlNifResourceStop(ErlNifEnv* caller_env, void* obj, ErlNifEvent event, int is_direct_call);</code>
         <p>The function prototype of a resource stop function,
 	  called on the behalf of <seealso marker="#enif_select">
 	  enif_select</seealso>. <c>obj</c> is the resource, <c>event</c> is OS event,
@@ -987,7 +1001,7 @@ typedef struct {
       <name><ret>ErlNifEnv *</ret><nametext>enif_alloc_env()</nametext></name>
       <fsummary>Create a new environment.</fsummary>
       <desc>
-        <p>Allocates a new process-independent environment. The environment can
+        <p>Allocates a new process independent environment. The environment can
           be used to hold terms that are not bound to any process. Such terms
           can later be copied to a process environment with
           <seealso marker="#enif_make_copy"><c>enif_make_copy</c></seealso> or
@@ -1211,14 +1225,17 @@ typedef struct {
     </func>
 
     <func>
-      <name><ret>int</ret><nametext>enif_demonitor_process(ErlNifEnv* env, void* obj,
+      <name><ret>int</ret><nametext>enif_demonitor_process(ErlNifEnv* caller_env, void* obj,
       const ErlNifMonitor* mon)</nametext></name>
       <fsummary>Cancel a process monitor.</fsummary>
       <desc>
         <marker id="enif_demonitor_process"></marker>
         <p>Cancels a monitor created earlier with <seealso marker="#enif_monitor_process">
 	<c>enif_monitor_process</c></seealso>. Argument <c>obj</c> is a pointer
-	to the resource holding the monitor and	<c>*mon</c> identifies the monitor.</p>
+	to the resource holding the monitor and	<c>*mon</c> identifies the
+	monitor.</p>
+	<p>Argument <c>caller_env</c> is the environment of the calling process
+	or callback. Must only be NULL if calling from a custom thread.</p>
         <p>Returns <c>0</c> if the monitor was successfully identified and removed.
 	Returns	a non-zero value if the monitor could not be identified, which means
 	it was either</p>
@@ -2268,7 +2285,7 @@ enif_inspect_iovec(env, max_elements, term, &amp;tail, &amp;iovec);
 	  between nodes.</p>
 	  <list type="bulleted">
 	    <item>
-	      <p>Two resource terms will compare equal iff they
+	      <p>Two resource terms will compare equal if and only if they
 	      would yield the same resource object pointer when passed to
 	      <seealso marker="#enif_get_resource"><c>enif_get_resource</c></seealso>.</p>
 	    </item>
@@ -2572,7 +2589,7 @@ enif_map_iterator_destroy(env, &amp;iter);</code>
     </func>
 
     <func>
-      <name><ret>int</ret><nametext>enif_monitor_process(ErlNifEnv* env, void* obj,
+      <name><ret>int</ret><nametext>enif_monitor_process(ErlNifEnv* caller_env, void* obj,
       const ErlNifPid* target_pid, ErlNifMonitor* mon)</nametext></name>
       <fsummary>Monitor a process from a resource.</fsummary>
       <desc>
@@ -2593,6 +2610,8 @@ enif_map_iterator_destroy(env, &amp;iter);</code>
 	<seealso marker="#enif_compare_monitors"><c>enif_compare_monitors</c></seealso>.
 	A monitor is automatically removed when it triggers or when
 	the resource is deallocated.</p>
+	<p>Argument <c>caller_env</c> is the environment of the calling process
+	or callback. Must only be NULL if calling from a custom thread.</p>
         <p>Returns <c>0</c> on success, &lt; 0 if no <c>down</c> callback is
           provided, and &gt; 0 if the process is no longer alive.</p>
         <p>This function is only thread-safe when the emulator with SMP support
@@ -2768,7 +2787,7 @@ enif_map_iterator_destroy(env, &amp;iter);</code>
          <item>The port ID of the receiving port. The port ID is to refer to a
            port on the local node.</item>
          <tag><c>msg_env</c></tag>
-         <item>The environment of the message term. Can be a process-independent
+         <item>The environment of the message term. Can be a process independent
            environment allocated with <seealso marker="#enif_alloc_env">
            <c>enif_alloc_env</c></seealso> or <c>NULL</c>.</item>
          <tag><c>msg</c></tag>
@@ -3124,26 +3143,26 @@ if (retval &amp; ERL_NIF_SELECT_STOP_CALLED) {
         <p>Initializes the <seealso marker="#ErlNifPid"><c>ErlNifPid</c></seealso>
 	variable at <c>*pid</c> to represent the calling process.</p>
         <p>Returns <c>pid</c> if successful, or NULL if <c>caller_env</c> is not
-	a <seealso marker="#ErlNifEnv">process-bound environment</seealso>.</p>
+	a <seealso marker="#ErlNifEnv">process bound environment</seealso>.</p>
       </desc>
     </func>
 
     <func>
-      <name><ret>int</ret><nametext>enif_send(ErlNifEnv* env, ErlNifPid* to_pid,
+      <name><ret>int</ret><nametext>enif_send(ErlNifEnv* caller_env, ErlNifPid* to_pid,
         ErlNifEnv* msg_env, ERL_NIF_TERM msg)</nametext></name>
       <fsummary>Send a message to a process.</fsummary>
       <desc>
         <p>Sends a message to a process.</p>
         <taglist>
-          <tag><c>env</c></tag>
-          <item>The environment of the calling process. Must be <c>NULL</c>
-            only if calling from a created thread.</item>
+          <tag><c>caller_env</c></tag>
+          <item>The environment of the calling process or callback. Must be <c>NULL</c>
+            only if calling from a custom thread not spawned by ERTS.</item>
           <tag><c>*to_pid</c></tag>
           <item>The pid of the receiving process. The pid is to refer to a
             process on the local node.</item>
           <tag><c>msg_env</c></tag>
           <item>The environment of the message term. Must be a
-            process-independent environment allocated with
+            process independent environment allocated with
             <seealso marker="#enif_alloc_env"><c>enif_alloc_env</c></seealso>
             or NULL.</item>
           <tag><c>msg</c></tag>
diff --git a/erts/doc/src/notes.xml b/erts/doc/src/notes.xml
index f384adcf52..2a823d9fe7 100644
--- a/erts/doc/src/notes.xml
+++ b/erts/doc/src/notes.xml
@@ -31,6 +31,26 @@
   </header>
   <p>This document describes the changes made to the ERTS application.</p>
 
+<section><title>Erts 10.1.3</title>
+
+    <section><title>Improvements and New Features</title>
+      <list>
+        <item>
+	    <p>Added an optional <c>./configure</c> flag to compile
+	    the emulator with spectre mitigation:
+	    <c>--with-spectre-mitigation</c></p>
+	    <p>Note that this requires a recent version of GCC with
+	    support for spectre mitigation and the
+	    <c>-mindirect-branch=thunk</c> flag, such as
+	    <c>8.1</c>.</p>
+          <p>
+	    Own Id: OTP-15430 Aux Id: ERIERL-237 </p>
+        </item>
+      </list>
+    </section>
+
+</section>
+
 <section><title>Erts 10.1.2</title>
 
     <section><title>Fixed Bugs and Malfunctions</title>
diff --git a/erts/doc/src/persistent_term.xml b/erts/doc/src/persistent_term.xml
index d2a138d65f..29a6c67051 100644
--- a/erts/doc/src/persistent_term.xml
+++ b/erts/doc/src/persistent_term.xml
@@ -71,7 +71,7 @@
       scan of their heaps for the term that has been deleted.  While
       such scan is relatively light-weight, if there are many
       processes, the system can become less responsive until all
-      process have scanned theirs heaps.</p></item>
+      process have scanned their heaps.</p></item>
 
       <item><p>If the deleted term (or any part of it) is still used
       by a process, that process will do a major (fullsweep) garbage
diff --git a/erts/emulator/beam/beam_emu.c b/erts/emulator/beam/beam_emu.c
index ab5920a67e..e909a0b4da 100644
--- a/erts/emulator/beam/beam_emu.c
+++ b/erts/emulator/beam/beam_emu.c
@@ -579,6 +579,7 @@ init_emulator(void)
  * the instructions' C labels to the loader.
  * The second call starts execution of BEAM code. This call never returns.
  */
+ERTS_NO_RETPOLINE
 void process_main(Eterm * x_reg_array, FloatDef* f_reg_array)
 {
     static int init_done = 0;
diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab
index aa3c3acd9f..d4ba90a61a 100644
--- a/erts/emulator/beam/bif.tab
+++ b/erts/emulator/beam/bif.tab
@@ -723,4 +723,5 @@ bif atomics:info/1
 bif erts_internal:counters_new/1
 bif erts_internal:counters_get/2
 bif erts_internal:counters_add/3
+bif erts_internal:counters_put/3
 bif erts_internal:counters_info/1
diff --git a/erts/emulator/beam/erl_bif_counters.c b/erts/emulator/beam/erl_bif_counters.c
index a46b462225..7c8884ba32 100644
--- a/erts/emulator/beam/erl_bif_counters.c
+++ b/erts/emulator/beam/erl_bif_counters.c
@@ -19,7 +19,7 @@
  */
 
 /*
- * Purpose:  High performance atomics.
+ * Purpose:  The implementation for 'counters' with 'write_concurrency'.
  */
 
 #ifdef HAVE_CONFIG_H
@@ -37,8 +37,17 @@
 #include "erl_bif_unique.h"
 #include "erl_map.h"
 
+/*
+ * Each logical counter consists of one 64-bit atomic instance per scheduler
+ * plus one instance for the "base value".
+ *
+ * get() reads all atomics for the counter and returns the sum.
+ * add() reads and writes only its own scheduler specific atomic instance.
+ * put() reads all scheduler specific atomics and writes a new base value.
+ */
+#define ATOMICS_PER_COUNTER (erts_no_schedulers + 1)
 
-#define COUNTERS_PER_CACHE_LINE (ERTS_CACHE_LINE_SIZE / sizeof(erts_atomic64_t))
+#define ATOMICS_PER_CACHE_LINE (ERTS_CACHE_LINE_SIZE / sizeof(erts_atomic64_t))
 
 typedef struct
 {
@@ -47,12 +56,12 @@ typedef struct
     UWord ulen;
 #endif
     union {
-        erts_atomic64_t v[COUNTERS_PER_CACHE_LINE];
+        erts_atomic64_t v[ATOMICS_PER_CACHE_LINE];
         byte cache_line__[ERTS_CACHE_LINE_SIZE];
     } u[1];
 }CountersRef;
 
-static int counters_destructor(Binary *unused)
+static int counters_destructor(Binary *mbin)
 {
     return 1;
 }
@@ -76,10 +85,10 @@ BIF_RETTYPE erts_internal_counters_new_1(BIF_ALIST_1)
         BIF_ERROR(BIF_P, BADARG);
     }
 
-    if (cnt > (ERTS_UWORD_MAX / (sizeof(erts_atomic64_t)*2*erts_no_schedulers)))
+    if (cnt > (ERTS_UWORD_MAX / (sizeof(erts_atomic64_t)*2*ATOMICS_PER_COUNTER)))
         BIF_ERROR(BIF_P, SYSTEM_LIMIT);
 
-    cache_lines = erts_no_schedulers * div_ceil(cnt, COUNTERS_PER_CACHE_LINE);
+    cache_lines = ATOMICS_PER_COUNTER * div_ceil(cnt, ATOMICS_PER_CACHE_LINE);
     bytes = offsetof(CountersRef, u) + cache_lines * ERTS_CACHE_LINE_SIZE;
     mbin = erts_create_magic_binary_x(bytes,
                                       counters_destructor,
@@ -87,12 +96,13 @@ BIF_RETTYPE erts_internal_counters_new_1(BIF_ALIST_1)
                                       0);
     p = ERTS_MAGIC_BIN_DATA(mbin);
     p->arity = cnt;
+
 #ifdef DEBUG
     p->ulen = cache_lines;
 #endif
     ASSERT((byte*)&p->u[cache_lines] <= ((byte*)p + bytes));
     for (ui=0; ui < cache_lines; ui++)
-        for (vi=0; vi < COUNTERS_PER_CACHE_LINE; vi++)
+        for (vi=0; vi < ATOMICS_PER_CACHE_LINE; vi++)
             erts_atomic64_init_nob(&p->u[ui].v[vi], 0);
     hp = HAlloc(BIF_P, ERTS_MAGIC_REF_THING_SIZE);
     return erts_mk_magic_ref(&hp, &MSO(BIF_P), mbin);
@@ -120,8 +130,8 @@ static ERTS_INLINE int get_ref_cnt(Eterm ref, Eterm index,
     UWord ix, ui, vi;
     if (!get_ref(ref, &p) || !term_to_UWord(index, &ix) || --ix >= p->arity)
         return 0;
-    ui = (ix / COUNTERS_PER_CACHE_LINE) * erts_no_schedulers + sched_ix;
-    vi = ix % COUNTERS_PER_CACHE_LINE;
+    ui = (ix / ATOMICS_PER_CACHE_LINE) * ATOMICS_PER_COUNTER + sched_ix;
+    vi = ix % ATOMICS_PER_CACHE_LINE;
     ASSERT(ui < p->ulen);
     *pp = p;
     *app = &p->u[ui].v[vi];
@@ -134,7 +144,8 @@ static ERTS_INLINE int get_ref_my_cnt(Eterm ref, Eterm index,
 {
     ErtsSchedulerData *esdp = erts_get_scheduler_data();
     ASSERT(esdp && !ERTS_SCHEDULER_IS_DIRTY(esdp));
-    return get_ref_cnt(ref, index, pp, app, esdp->no - 1);
+    ASSERT(esdp->no > 0 && esdp->no < ATOMICS_PER_COUNTER);
+    return get_ref_cnt(ref, index, pp, app, esdp->no);
 }
 
 static ERTS_INLINE int get_ref_first_cnt(Eterm ref, Eterm index,
@@ -172,7 +183,7 @@ BIF_RETTYPE erts_internal_counters_get_2(BIF_ALIST_2)
     if (!get_ref_first_cnt(BIF_ARG_1, BIF_ARG_2, &p, &ap)) {
         BIF_ERROR(BIF_P, BADARG);
     }
-    for (j = erts_no_schedulers; j ; --j) {
+    for (j = ATOMICS_PER_COUNTER; j ; --j) {
         acc += erts_atomic64_read_nob(ap);
         ap = (erts_atomic64_t*) ((byte*)ap + ERTS_CACHE_LINE_SIZE);
     }
@@ -194,6 +205,31 @@ BIF_RETTYPE erts_internal_counters_add_3(BIF_ALIST_3)
     return am_ok;
 }
 
+BIF_RETTYPE erts_internal_counters_put_3(BIF_ALIST_3)
+{
+    CountersRef* p;
+    erts_atomic64_t* first_ap;  /* first atomic of the counter; the one this function writes */
+    erts_atomic64_t* ap;        /* cursor over the remaining atomics of the counter */
+    erts_aint64_t acc;          /* sum of every atomic except *first_ap */
+    erts_aint64_t val;          /* requested new counter value (BIF_ARG_3) */
+    int j;
+    /* badarg if the ref/index lookup fails or term_to_Sint64() rejects the value */
+    if (!get_ref_first_cnt(BIF_ARG_1, BIF_ARG_2, &p, &first_ap)
+        || !term_to_Sint64(BIF_ARG_3, &val)) {
+        BIF_ERROR(BIF_P, BADARG);
+    }
+    /* Sum the ATOMICS_PER_COUNTER - 1 atomics after first_ap, one cache line apart. */
+    ap = first_ap;
+    acc = 0;
+    j = ATOMICS_PER_COUNTER - 1;
+    do {
+        ap = (erts_atomic64_t*) ((byte*)ap + ERTS_CACHE_LINE_SIZE); /* step first, so *first_ap is never read */
+        acc += erts_atomic64_read_nob(ap);
+    } while (--j);
+    erts_atomic64_set_nob(first_ap, val-acc); /* makes the sum over all atomics equal val */
+
+    return am_ok;
+}
 
 BIF_RETTYPE erts_internal_counters_info_1(BIF_ALIST_1)
 {
diff --git a/erts/emulator/beam/erl_gc.c b/erts/emulator/beam/erl_gc.c
index 47dd115c82..b4df418cd5 100644
--- a/erts/emulator/beam/erl_gc.c
+++ b/erts/emulator/beam/erl_gc.c
@@ -681,7 +681,7 @@ garbage_collect(Process* p, ErlHeapFragment *live_hf_end,
     ErtsMonotonicTime start_time;
     ErtsSchedulerData *esdp = erts_proc_sched_data(p);
     erts_aint32_t state;
-    ERTS_MSACC_PUSH_STATE_M();
+    ERTS_MSACC_PUSH_STATE();
 #ifdef USE_VM_PROBES
     DTRACE_CHARBUF(pidbuf, DTRACE_TERM_BUF_SIZE);
 #endif
@@ -711,7 +711,7 @@ garbage_collect(Process* p, ErlHeapFragment *live_hf_end,
     else if (p->live_hf_end != ERTS_INVALID_HFRAG_PTR)
 	live_hf_end = p->live_hf_end;
 
-    ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_GC);
+    ERTS_MSACC_SET_STATE_CACHED(ERTS_MSACC_STATE_GC);
 
     erts_atomic32_read_bor_nob(&p->state, ERTS_PSFLG_GC);
     if (erts_system_monitor_long_gc != 0)
@@ -759,7 +759,7 @@ garbage_collect(Process* p, ErlHeapFragment *live_hf_end,
         gc_trace_end_tag = am_gc_minor_end;
     } else {
 do_major_collection:
-        ERTS_MSACC_SET_STATE_CACHED_M_X(ERTS_MSACC_STATE_GC_FULL);
+        ERTS_MSACC_SET_STATE_CACHED_X(ERTS_MSACC_STATE_GC_FULL);
         if (IS_TRACED_FL(p, F_TRACE_GC)) {
             trace_gc(p, am_gc_major_start, need, THE_NON_VALUE);
         }
@@ -770,7 +770,7 @@ do_major_collection:
             p->flags &= ~(F_DIRTY_MAJOR_GC|F_DIRTY_MINOR_GC);
         DTRACE2(gc_major_end, pidbuf, reclaimed_now);
         gc_trace_end_tag = am_gc_major_end;
-        ERTS_MSACC_SET_STATE_CACHED_M_X(ERTS_MSACC_STATE_GC);
+        ERTS_MSACC_SET_STATE_CACHED_X(ERTS_MSACC_STATE_GC);
     }
 
     reset_active_writer(p);
@@ -800,7 +800,7 @@ do_major_collection:
 
         /* We have to make sure that we have space for need on the heap */
         res = delay_garbage_collection(p, live_hf_end, need, fcalls);
-        ERTS_MSACC_POP_STATE_M();
+        ERTS_MSACC_POP_STATE();
         return res;
     }
 
@@ -843,7 +843,7 @@ do_major_collection:
     FLAGS(p) &= ~(F_FORCE_GC|F_HIBERNATED);
     p->live_hf_end = ERTS_INVALID_HFRAG_PTR;
 
-    ERTS_MSACC_POP_STATE_M();
+    ERTS_MSACC_POP_STATE();
 
 #ifdef CHECK_FOR_HOLES
     /*
diff --git a/erts/emulator/beam/erl_node_tables.c b/erts/emulator/beam/erl_node_tables.c
index f4dc60941a..18ed782ae3 100644
--- a/erts/emulator/beam/erl_node_tables.c
+++ b/erts/emulator/beam/erl_node_tables.c
@@ -421,8 +421,25 @@ static void schedule_delete_dist_entry(DistEntry* dep)
      *
      * Note that timeouts do not guarantee thread progress.
      */
-    erts_schedule_thr_prgr_later_op(start_timer_delete_dist_entry,
-                                    dep, &dep->later_op);
+    ErtsSchedulerData *esdp = erts_get_scheduler_data();
+    if (esdp && !ERTS_SCHEDULER_IS_DIRTY(esdp)) {
+        erts_schedule_thr_prgr_later_op(start_timer_delete_dist_entry,
+                                        dep, &dep->later_op);
+    } else {
+        /*
+         * Since OTP 20, it's possible that a destructor is executed on
+         *  a dirty scheduler. Aux work cannot be done on a dirty
+         *  scheduler, and scheduling any aux work on a dirty scheduler
+         *  makes the scheduler loop infinitely.
+         * To avoid this, make a spot jump: schedule this function again
+         *  on the first normal scheduler. It is guaranteed to always be
+         *  online. Since it's a rare event, this should not pose a big
+         *  utilisation hit.
+         */
+        erts_schedule_misc_aux_work(1,
+                                    (void (*)(void *))schedule_delete_dist_entry,
+                                    (void *) dep);
+    }
 }
 
 static void
diff --git a/erts/emulator/test/counters_SUITE.erl b/erts/emulator/test/counters_SUITE.erl
index 7de164096b..b3f0358c1e 100644
--- a/erts/emulator/test/counters_SUITE.erl
+++ b/erts/emulator/test/counters_SUITE.erl
@@ -21,12 +21,13 @@
 
 -include_lib("common_test/include/ct.hrl").
 
--compile(export_all).
+-export([suite/0, all/0]).
+-export([basic/1, bad/1, limits/1, indep/1, write_concurrency/1]).
 
 suite() -> [{ct_hooks,[ts_install_cth]}].
 
 all() ->
-    [basic, bad, limits].
+    [basic, bad, limits, indep, write_concurrency].
 
 basic(Config) when is_list(Config) ->
     Size = 10,
@@ -53,15 +54,21 @@ basic_do(Ref, Ix) ->
     77 = counters:get(Ref, Ix),
     ok = counters:sub(Ref, Ix, -10),
     87 = counters:get(Ref, Ix),
+    ok = counters:put(Ref, Ix, 0),
+    0 = counters:get(Ref, Ix),
+    ok = counters:put(Ref, Ix, 123),
+    123 = counters:get(Ref, Ix),
+    ok = counters:put(Ref, Ix, -321),
+    -321 = counters:get(Ref, Ix),
     ok.
 
 check_memory(atomics, Memory, Size) ->
     {_,true} = {Memory, Memory > Size*8},
     {_,true} = {Memory, Memory < Size*max_atomic_sz() + 100};
 check_memory(write_concurrency, Memory, Size) ->
-    NScheds = erlang:system_info(schedulers),
-    {_,true} = {Memory, Memory > NScheds*Size*8},
-    {_,true} = {Memory, Memory < NScheds*(Size+7)*max_atomic_sz() + 100}.
+    NWords = erlang:system_info(schedulers) + 1,
+    {_,true} = {Memory, Memory > NWords*Size*8},
+    {_,true} = {Memory, Memory < NWords*(Size+7)*max_atomic_sz() + 100}.
 
 max_atomic_sz() ->
     case erlang:system_info({wordsize, external}) of
@@ -90,23 +97,138 @@ bad(Config) when is_list(Config) ->
 
 
 limits(Config) when is_list(Config) ->
+    limits_do(counters:new(1,[atomics])),
+    limits_do(counters:new(1,[write_concurrency])),
+    ok.
+
+limits_do(Ref) ->                       % Ref: 1-element counter array, still 0
     Bits = 64,
     Max = (1 bsl (Bits-1)) - 1,
     Min = -(1 bsl (Bits-1)),
 
-    Ref = counters:new(1,[]),
     0 = counters:get(Ref, 1),
-    ok = counters:add(Ref, 1, Max),
+    ok = counters:put(Ref, 1, Max),     % jump straight to the upper limit
+    Max = counters:get(Ref, 1),
     ok = counters:add(Ref, 1, 1),
     Min = counters:get(Ref, 1),
     ok  = counters:sub(Ref, 1, 1),
     Max = counters:get(Ref, 1),
+    ok = counters:put(Ref, 1, Min),     % ... and to the lower limit
+    Min = counters:get(Ref, 1),
 
     IncrMax = (Max bsl 1) bor 1,
-    ok = counters:sub(Ref, 1, counters:get(Ref, 1)),
+    ok = counters:put(Ref, 1, 0),       % reset before adding the widest increment
     ok = counters:add(Ref, 1, IncrMax),
     -1 = counters:get(Ref, 1),
     {'EXIT',{badarg,_}} = (catch counters:add(Ref, 1, IncrMax+1)),
     {'EXIT',{badarg,_}} = (catch counters:add(Ref, 1, Min-1)),
+    {'EXIT',{badarg,_}} = (catch counters:put(Ref, 1, Max+1)),  % above signed 64-bit range
+    {'EXIT',{badarg,_}} = (catch counters:put(Ref, 1, Min-1)),  % below signed 64-bit range
+    ok.
+
 
+%% Verify that independent workers, one per scheduler and each using its
+%% own counter within the same array, do not interfere with each other.
+indep(Config) when is_list(Config) ->
+    NScheds = erlang:system_info(schedulers),
+    Ref = counters:new(NScheds,[write_concurrency]),  % one counter per worker
+    Rounds = 100,
+    Papa = self(),
+    Pids = [spawn_opt(fun () ->
+                               Val = I*197,
+                               counters:put(Ref, I, Val),
+                               indep_looper(Rounds, Ref, I, Val),
+                               Papa ! {self(), done}
+                       end,
+                      [link, {scheduler, I}])
+            || I <- lists:seq(1, NScheds)],
+    [receive {P,done} -> ok end || P <- Pids],        % wait for every worker
     ok.
+
+indep_looper(0, _, _ , _) ->
+    ok;
+indep_looper(N, Ref, I, Val0) ->
+    %% Counter I must still hold the value this worker stored last.
+    Val0 = counters:get(Ref, I),
+    Val1 = indep_adder(Ref, I, Val0),
+    indep_subber(Ref, I, Val1),
+    Val2 = N*7 + I,
+    counters:put(Ref, I, Val2),
+    indep_looper(N-1, Ref, I, Val2).
+
+indep_adder(Ref, I, Val) when Val < (1 bsl 62) ->
+    %% Add an increment and check the counter against the local sum.
+    Incr = abs(Val div 2) + I + 984735,
+    counters:add(Ref, I, Incr),
+    Res = Val + Incr,
+    Res = counters:get(Ref, I),
+    indep_adder(Ref, I, Res);
+indep_adder(_Ref, _I, Val) ->           % Val >= 1 bsl 62: done, return it
+    Val.
+
+indep_subber(Ref, I, Val) when Val > -(1 bsl 62) ->
+    %% Subtract a decrement and check the counter against the local difference.
+    Decr = (abs(Val div 2) + I + 725634),
+    counters:sub(Ref, I, Decr),
+    Res = Val - Decr,
+    Res = counters:get(Ref, I),
+    indep_subber(Ref, I, Res);
+indep_subber(_Ref, _I, Val) ->          % Val =< -(1 bsl 62): done, return it
+    Val.
+
+
+
+%% Verify write_concurrency: adds made concurrently by all workers must sum up exactly.
+write_concurrency(Config) when is_list(Config) ->
+    rand:seed(exs1024s),
+    io:format("*** SEED: ~p ***\n", [rand:export_seed()]),
+    NScheds = erlang:system_info(schedulers),
+    Size = 100,
+    Ref = counters:new(Size,[write_concurrency]),
+    Rounds = 1000,
+    Papa = self(),
+    Pids = [spawn_opt(fun Worker() ->
+                              receive
+                                  {go, Ix, Incr} ->
+                                      wc_looper(Rounds, Ref, Ix, Incr),
+                                      Papa ! {self(), done, Rounds*Incr},
+                                      Worker();
+                                  stop ->
+                                      ok
+                              end
+                       end,
+                      [link, {scheduler, N}])
+            || N <- lists:seq(1, NScheds)],
+    [begin
+         Base = rand_log64(),
+         counters:put(Ref, Index, Base),
+         SendList = [{P,{go, Index, rand_log64()}} || P <- Pids],
+         [P ! Msg || {P,Msg} <- SendList],
+         Added = lists:sum([receive {P,done,Contrib} -> Contrib end || P <- Pids]),
+         Result = mask_sint64(Base+Added),                 % expected value after 64-bit wrap
+         {_,Result} = {Result, counters:get(Ref, Index)}   % badmatch shows both values
+     end
+     || Index <- lists:seq(1, Size)],
+
+    [begin unlink(P), P ! stop end || P <- Pids],
+    ok.
+
+wc_looper(0, _, _, _) ->
+    ok;
+wc_looper(N, Ref, Ix, Incr) ->          % add Incr to counter Ix, N times
+    counters:add(Ref, Ix, Incr),
+    wc_looper(N-1, Ref, Ix, Incr).
+
+mask_sint64(X) ->                       % wrap integer X into the signed 64-bit range
+    SMask = 1 bsl 63,                   % bit 63, the sign bit
+    UMask = SMask - 1,                  % the low 63 bits
+    (X band UMask) - (X band SMask).
+
+%% A random non-zero signed 64-bit integer
+%% with a uniformly distributed number of significant bits.
+rand_log64() ->
+    Uint = round(math:pow(2, rand:uniform()*63)),   % magnitude, at least 1
+    case rand:uniform(2) of                         % random sign
+        1 -> -Uint;
+        2 -> Uint
+    end.
diff --git a/erts/preloaded/ebin/counters.beam b/erts/preloaded/ebin/counters.beam
index caaa6167e1..4e1a3566f7 100644
--- a/erts/preloaded/ebin/counters.beam
+++ b/erts/preloaded/ebin/counters.beam
diff --git a/erts/preloaded/ebin/erts_internal.beam b/erts/preloaded/ebin/erts_internal.beam
index e174f71966..651d5e9d05 100644
--- a/erts/preloaded/ebin/erts_internal.beam
+++ b/erts/preloaded/ebin/erts_internal.beam
diff --git a/erts/preloaded/src/counters.erl b/erts/preloaded/src/counters.erl
index 67354f648d..a0e3035e0f 100644
--- a/erts/preloaded/src/counters.erl
+++ b/erts/preloaded/src/counters.erl
@@ -26,6 +26,7 @@
          get/2,
          add/3,
          sub/3,
+         put/3,
          info/1]).
 
 -export_type([counters_ref/0]).
@@ -76,6 +77,19 @@ add(_, _, _) ->
 sub(Ref, Ix, Decr) ->
     add(Ref, Ix, -Decr).
 
+%% Write Value to counter Ix of Ref; a Ref of any other shape raises badarg.
+-spec put(Ref, Ix, Value) -> ok when
+      Ref  :: counters_ref(),
+      Ix :: integer(),
+      Value :: integer().
+put({atomics, Ref}, Ix, Value) ->
+    atomics:put(Ref, Ix, Value);
+put({write_concurrency, Ref}, Ix, Value) ->
+    erts_internal:counters_put(Ref, Ix, Value);
+put(_, _, _) ->
+    erlang:error(badarg).
+
+
 -spec info(Ref) -> Info when
       Ref  :: counters_ref(),
       Info :: #{'size':=Size, 'memory':=Memory},
diff --git a/erts/preloaded/src/erts_internal.erl b/erts/preloaded/src/erts_internal.erl
index d491a505c6..8f29a569f2 100644
--- a/erts/preloaded/src/erts_internal.erl
+++ b/erts/preloaded/src/erts_internal.erl
@@ -95,7 +95,7 @@
 -export([atomics_new/2]).
 
 -export([counters_new/1, counters_get/2, counters_add/3,
-         counters_info/1]).
+         counters_put/3, counters_info/1]).
 
 %%
 %% Await result of send to port
@@ -719,6 +719,10 @@ counters_get(_Ref, _Ix) ->
 counters_add(_Ref, _Ix, _Incr) ->
     erlang:nif_error(undef).
 
+-spec counters_put(reference(), pos_integer(), integer()) -> ok.
+counters_put(_Ref, _Ix, _Value) ->
+    erlang:nif_error(undef).            % stub body; presumably implemented natively - confirm
+
 -spec counters_info(reference()) -> #{}.
 counters_info(_Ref) ->
     erlang:nif_error(undef).
diff --git a/erts/vsn.mk b/erts/vsn.mk
index 241d5a3b85..643a8a2e76 100644
--- a/erts/vsn.mk
+++ b/erts/vsn.mk
@@ -18,7 +18,7 @@
 # %CopyrightEnd%
 # 
 
-VSN = 10.1.2
+VSN = 10.1.3
 
 # Port number 4365 in 4.2
 # Port number 4366 in 4.3