15 files changed, 514 insertions, 183 deletions
diff --git a/erts/doc/src/erl.xml b/erts/doc/src/erl.xml
index e1aa5ce76e..8b152b83f5 100644
--- a/erts/doc/src/erl.xml
+++ b/erts/doc/src/erl.xml
@@ -379,6 +379,16 @@
           <c><![CDATA[Host]]></c> is the fully qualified host name of the
           current host. For short names, use flag <c><![CDATA[-sname]]></c>
           instead.</p>
+        <warning>
+          <p>
+            Starting a distributed node without also specifying
+            <seealso marker="#proto_dist"><c>-proto_dist inet_tls</c></seealso>
+            will expose the node to attacks that may give the attacker
+            complete access to the node and, by extension, the cluster.
+            When using insecure distributed nodes, make sure that the
+            network is configured to keep potential attackers out.
+          </p>
+        </warning>
       </item>
       <tag><c><![CDATA[-noinput]]></c></tag>
       <item>
@@ -428,12 +438,17 @@
       </item>
       <tag><c><![CDATA[-proto_dist Proto]]></c></tag>
       <item>
+        <marker id="proto_dist"/>
         <p>Specifies a protocol for Erlang distribution:</p>
         <taglist>
           <tag><c>inet_tcp</c></tag>
           <item>TCP over IPv4 (the default)</item>
           <tag><c>inet_tls</c></tag>
-          <item>Distribution over TLS/SSL</item>
+          <item>Distribution over TLS/SSL. See the
+            <seealso marker="ssl:ssl_distribution">
+              Using SSL for Erlang Distribution</seealso> User's Guide
+              for details on how to set up a secure distributed node.
+          </item>
           <tag><c>inet6_tcp</c></tag>
           <item>TCP over IPv6</item>
         </taglist>
@@ -497,6 +512,16 @@
           exist between nodes running with flag <c><![CDATA[-sname]]></c>
           and those running with flag <c><![CDATA[-name]]></c>, as node
           names must be unique in distributed Erlang systems.</p>
+        <warning>
+          <p>
+            Starting a distributed node without also specifying
+            <seealso marker="#proto_dist"><c>-proto_dist inet_tls</c></seealso>
+            will expose the node to attacks that may give the attacker
+            complete access to the node and, by extension, the cluster.
+            When using insecure distributed nodes, make sure that the
+            network is configured to keep potential attackers out.
+          </p>
+        </warning>
       </item>
       <tag><marker id="start_epmd"/><c>-start_epmd true | false</c></tag>
       <item>
diff --git a/erts/doc/src/erl_dist_protocol.xml b/erts/doc/src/erl_dist_protocol.xml
index ee74983730..8391408a2e 100644
--- a/erts/doc/src/erl_dist_protocol.xml
+++ b/erts/doc/src/erl_dist_protocol.xml
@@ -70,6 +70,17 @@
 
   <p>The integers in all multibyte fields are in big-endian order.</p>
 
+  <warning>
+    <p>
+      The Erlang Distribution protocol is not by itself secure and does not
+      aim to be so. In order to get secure distribution the distributed nodes
+      should be configured to use distribution over TLS.
+      See the <seealso marker="ssl:ssl_distribution">
+      Using SSL for Erlang Distribution</seealso> User's Guide
+      for details on how to set up a secure distributed node.
+    </p>
+  </warning>
+
   <section>
     <title>EPMD Protocol</title>
     <p>The requests served by the EPMD are summarized in the following
diff --git a/erts/doc/src/erl_nif.xml b/erts/doc/src/erl_nif.xml
index 51ba70994a..3eb3e04f33 100644
--- a/erts/doc/src/erl_nif.xml
+++ b/erts/doc/src/erl_nif.xml
@@ -714,6 +714,7 @@ typedef struct {
 typedef struct {
     ErlNifResourceDtor* dtor;
     ErlNifResourceStop* stop;
+    ErlNifResourceDown* down;
 } ErlNifResourceTypeInit;</code>
         <p>Initialization structure read by <seealso marker="#enif_open_resource_type_x">
 	enif_open_resource_type_x</seealso>.</p>
@@ -1395,13 +1396,12 @@ typedef enum {
         <p>Returns <c>true</c> if a pending exception is associated with the
           environment <c>env</c>. If <c>reason</c> is a <c>NULL</c> pointer,
           ignore it. Otherwise, if a pending exception associated with
-          <c>env</c> exists, set <c>ERL_NIF_TERM</c> to which <c>reason</c>
-          points to the value of the exception's term. For example, if
-          <seealso marker="#enif_make_badarg">
+          <c>env</c> exists, set <c>*reason</c> to the value of the exception
+	  term. For example, if <seealso marker="#enif_make_badarg">
           <c>enif_make_badarg</c></seealso> is called to set a pending
           <c>badarg</c> exception, a later call to
           <c>enif_has_pending_exception(env, &amp;reason)</c> sets
-          <c>reason</c> to the atom <c>badarg</c>, then return <c>true</c>.</p>
+          <c>*reason</c> to the atom <c>badarg</c>, then return <c>true</c>.</p>
         <p>See also <seealso marker="#enif_make_badarg">
           <c>enif_make_badarg</c></seealso> and
           <seealso marker="#enif_raise_exception">
@@ -2384,20 +2384,23 @@ enif_map_iterator_destroy(env, &amp;iter);</code>
           called in the two callbacks
           <seealso marker="#load"><c>load</c></seealso> and
           <seealso marker="#upgrade"><c>upgrade</c></seealso>.</p>
+	<p>See also <seealso marker="#enif_open_resource_type_x">
+	  <c>enif_open_resource_type_x</c></seealso>.</p>
       </desc>
     </func>
 
     <func>
       <name><ret>ErlNifResourceType *</ret>
         <nametext>enif_open_resource_type_x(ErlNifEnv* env, const char* name,
-	ErlNifResourceTypeInit* init,
+	const ErlNifResourceTypeInit* init,
         ErlNifResourceFlags flags, ErlNifResourceFlags* tried)</nametext>
       </name>
       <fsummary>Create or takeover a resource type.</fsummary>
       <desc>
 	<p>Same as <seealso marker="#enif_open_resource_type"><c>enif_open_resource_type</c></seealso>
-	  except is also accept a <c>stop</c> callback for resource types that are
-	  used together with <seealso marker="#enif_select"><c>enif_select</c></seealso>.</p>
+	  except it accepts additional callback functions for resource types that are
+	  used together with <seealso marker="#enif_select"><c>enif_select</c></seealso>
+	  and <seealso marker="#enif_monitor_process"><c>enif_monitor_process</c></seealso>.</p>
 	<p>Argument <c>init</c> is a pointer to an
 	  <seealso marker="#ErlNifResourceTypeInit"><c>ErlNifResourceTypeInit</c></seealso>
 	  structure that contains the function pointers for destructor, down and stop callbacks
@@ -2695,6 +2698,21 @@ enif_map_iterator_destroy(env, &amp;iter);</code>
 	  the event object. This safe way of closing event objects must be used
 	  even if all notifications have been received and no further calls to
 	  <c>enif_select</c> have been made.</p>
+	<p>The first call to <c>enif_select</c> for a specific OS <c>event</c> will establish
+	  a relation between the event object and the containing resource. All subsequent calls
+	  for an <c>event</c> must pass its containing resource as argument
+	  <c>obj</c>. The relation is dissolved when <c>enif_select</c> has
+	  been called with <c>mode</c> as <c>ERL_NIF_SELECT_STOP</c> and the
+	  corresponding <c>stop</c> callback has returned. A resource can contain
+	  several event objects but one event object can only be contained within
+	  one resource. A resource will not be destructed until all its contained relations
+	  have been dissolved.</p>
+        <note>
+	  <p>Use <seealso marker="#enif_monitor_process"><c>enif_monitor_process</c></seealso>
+	  together with <c>enif_select</c> to detect failing Erlang
+	  processes and prevent them from causing permanent leakage of resources
+	  and their contained OS event objects.</p>
+	</note>
 	<p>Returns a non-negative value on success where the following bits can be set:</p>
         <taglist>
           <tag><c>ERL_NIF_SELECT_STOP_CALLED</c></tag>
diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml
index 6d165e9eff..d9cc5ef936 100644
--- a/erts/doc/src/erlang.xml
+++ b/erts/doc/src/erlang.xml
@@ -6148,28 +6148,60 @@ true</pre>
       <fsummary>Information about active processes and ports.</fsummary>
       <desc>
         <marker id="statistics_active_tasks"></marker>
+        <p>Returns the same as
+	<seealso marker="#statistics_active_tasks_all">
+        <c>statistics(active_tasks_all)</c></seealso>
+	with the exception that no information about the dirty
+	IO run queue and its associated schedulers is part of
+	the result. That is, only tasks that are expected to be
+	CPU bound are part of the result.</p>
+      </desc>
+    </func>
+
+    <func>
+      <name name="statistics" arity="1" clause_i="2"/>
+      <fsummary>Information about active processes and ports.</fsummary>
+      <desc>
+        <marker id="statistics_active_tasks_all"></marker>
         <p>Returns a list where each element represents the amount
           of active processes and ports on each run queue and its
-          associated scheduler. That is, the number of processes and
-          ports that are ready to run, or are currently running. The
-          element location in the list corresponds to the scheduler
-          and its run queue. The first element corresponds to scheduler
-          number 1 and so on. The information is <em>not</em> gathered
-          atomically. That is, the result is not necessarily a
-          consistent snapshot of the state, but instead quite
-          efficiently gathered.</p>
+          associated schedulers. That is, the number of processes and
+          ports that are ready to run, or are currently running.
+	  Values for normal run queues and their associated schedulers
+	  are located first in the resulting list. The first element
+	  corresponds to scheduler number 1 and so on. If support for
+	  dirty schedulers exists, an element with the value for the
+	  dirty CPU run queue and its associated dirty CPU schedulers
+	  follows, and then, as the last element, the value for the dirty
+	  IO run queue and its associated dirty IO schedulers follows.
+	  The information is <em>not</em> gathered atomically. That is,
+	  the result is not necessarily a consistent snapshot of the
+	  state, but instead quite efficiently gathered.</p>
+	<note><p>Each normal scheduler has one run queue that it
+	manages. If dirty schedulers are supported, all
+	dirty CPU schedulers share one run queue, and all dirty IO
+	schedulers share one run queue. That is, we have multiple
+	normal run queues, one dirty CPU run queue and one dirty
+	IO run queue. Work can <em>not</em> migrate between the
+	different types of run queues. Only work in normal run
+	queues can migrate to other normal run queues. This has
+	to be taken into account when evaluating the result.</p></note>
         <p>See also
           <seealso marker="#statistics_total_active_tasks">
           <c>statistics(total_active_tasks)</c></seealso>,
           <seealso marker="#statistics_run_queue_lengths">
-          <c>statistics(run_queue_lengths)</c></seealso>, and
+          <c>statistics(run_queue_lengths)</c></seealso>,
+          <seealso marker="#statistics_run_queue_lengths_all">
+          <c>statistics(run_queue_lengths_all)</c></seealso>,
           <seealso marker="#statistics_total_run_queue_lengths">
-          <c>statistics(total_run_queue_lengths)</c></seealso>.</p>
+          <c>statistics(total_run_queue_lengths)</c></seealso>, and
+          <seealso marker="#statistics_total_run_queue_lengths_all">
+          <c>statistics(total_run_queue_lengths_all)</c></seealso>.</p>
       </desc>
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="2"/>
+      <name name="statistics" arity="1" clause_i="3"/>
       <fsummary>Information about context switches.</fsummary>
       <desc>
         <p>Returns the total number of context switches since the
@@ -6178,7 +6210,7 @@ true</pre>
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="3"/>
+      <name name="statistics" arity="1" clause_i="4"/>
       <fsummary>Information about exact reductions.</fsummary>
       <desc>
         <marker id="statistics_exact_reductions"></marker>
@@ -6194,7 +6226,7 @@ true</pre>
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="4"/>
+      <name name="statistics" arity="1" clause_i="5"/>
       <fsummary>Information about garbage collection.</fsummary>
       <desc>
         <p>Returns information about garbage collection, for example:</p>
@@ -6206,7 +6238,7 @@ true</pre>
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="5"/>
+      <name name="statistics" arity="1" clause_i="6"/>
       <fsummary>Information about I/O.</fsummary>
       <desc>
         <p>Returns <c><anno>Input</anno></c>,
@@ -6217,7 +6249,7 @@ true</pre>
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="6"/>
+      <name name="statistics" arity="1" clause_i="7"/>
       <fsummary>Information about microstate accounting.</fsummary>
       <desc>
         <marker id="statistics_microstate_accounting"></marker>
@@ -6353,7 +6385,7 @@ lists:map(
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="7"/>
+      <name name="statistics" arity="1" clause_i="8"/>
       <fsummary>Information about reductions.</fsummary>
       <desc>
         <marker id="statistics_reductions"></marker>
@@ -6372,12 +6404,13 @@ lists:map(
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="8"/>
+      <name name="statistics" arity="1" clause_i="9"/>
       <fsummary>Information about the run-queues.</fsummary>
       <desc><marker id="statistics_run_queue"></marker>
-        <p>Returns the total length of the run-queues. That is, the number
+        <p>Returns the total length of all normal run-queues. That is, the number
           of processes and ports that are ready to run on all available
-          run-queues. The information is gathered atomically. That
+          normal run-queues. Dirty run queues are not part of the
+	  result. The information is gathered atomically. That
           is, the result is a consistent snapshot of the state, but
           this operation is much more expensive compared to
           <seealso marker="#statistics_total_run_queue_lengths">
@@ -6387,29 +6420,63 @@ lists:map(
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="9"/>
+      <name name="statistics" arity="1" clause_i="10"/>
       <fsummary>Information about the run-queue lengths.</fsummary>
       <desc><marker id="statistics_run_queue_lengths"></marker>
+        <p>Returns the same as
+	<seealso marker="#statistics_run_queue_lengths_all">
+        <c>statistics(run_queue_lengths_all)</c></seealso>
+	with the exception that no information about the dirty
+	IO run queue is part of the result. That is, only
+	run queues with work that is expected to be CPU bound
+	are part of the result.</p>
+      </desc>
+    </func>
+
+    <func>
+      <name name="statistics" arity="1" clause_i="11"/>
+      <fsummary>Information about the run-queue lengths.</fsummary>
+      <desc><marker id="statistics_run_queue_lengths_all"></marker>
         <p>Returns a list where each element represents the amount
-          of processes and ports ready to run for each run queue. The
-          element location in the list corresponds to the run queue
-          of a scheduler. The first element corresponds to the run
-          queue of scheduler number 1 and so on. The information is
-          <em>not</em> gathered atomically. That is, the result is
-          not necessarily a consistent snapshot of the state, but
-          instead quite efficiently gathered.</p>
+          of processes and ports ready to run for each run queue.
+	  Values for normal run queues are located first in the
+	  resulting list. The first element corresponds to the
+	  normal run queue of scheduler number 1 and so on. If
+	  support for dirty schedulers exists, values for the dirty
+	  CPU run queue and the dirty IO run queue follow (in that
+	  order) at the end. The information is <em>not</em>
+	  gathered atomically. That is, the result is not
+	  necessarily a consistent snapshot of the state, but
+	  instead quite efficiently gathered.</p>
+	<note><p>Each normal scheduler has one run queue that it
+	manages. If dirty schedulers are supported, all
+	dirty CPU schedulers share one run queue, and all dirty IO
+	schedulers share one run queue. That is, we have multiple
+	normal run queues, one dirty CPU run queue and one dirty
+	IO run queue. Work can <em>not</em> migrate between the
+	different types of run queues. Only work in normal run
+	queues can migrate to other normal run queues. This has
+	to be taken into account when evaluating the result.</p></note>
         <p>See also
+          <seealso marker="#statistics_run_queue_lengths">
+          <c>statistics(run_queue_lengths)</c></seealso>,
+          <seealso marker="#statistics_total_run_queue_lengths_all">
+          <c>statistics(total_run_queue_lengths_all)</c></seealso>,
           <seealso marker="#statistics_total_run_queue_lengths">
           <c>statistics(total_run_queue_lengths)</c></seealso>,
           <seealso marker="#statistics_active_tasks">
-          <c>statistics(active_tasks)</c></seealso>, and
+          <c>statistics(active_tasks)</c></seealso>,
+          <seealso marker="#statistics_active_tasks_all">
+          <c>statistics(active_tasks_all)</c></seealso>,
           <seealso marker="#statistics_total_active_tasks">
-          <c>statistics(total_active_tasks)</c></seealso>.</p>
+          <c>statistics(total_active_tasks)</c></seealso>, and
+          <seealso marker="#statistics_total_active_tasks_all">
+          <c>statistics(total_active_tasks_all)</c></seealso>.</p>
       </desc>
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="10"/>
+      <name name="statistics" arity="1" clause_i="12"/>
       <fsummary>Information about runtime.</fsummary>
       <desc>
         <p>Returns information about runtime, in milliseconds.</p>
@@ -6424,7 +6491,7 @@ lists:map(
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="11"/>
+      <name name="statistics" arity="1" clause_i="13"/>
       <fsummary>Information about each schedulers work time.</fsummary>
       <desc>
         <marker id="statistics_scheduler_wall_time"></marker>
@@ -6545,7 +6612,7 @@ ok
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="12"/>
+      <name name="statistics" arity="1" clause_i="14"/>
       <fsummary>Information about each schedulers work time.</fsummary>
       <desc>
         <marker id="statistics_scheduler_wall_time_all"></marker>
@@ -6570,47 +6637,47 @@ ok
       </desc>
     </func>
     <func>
-      <name name="statistics" arity="1" clause_i="13"/>
+      <name name="statistics" arity="1" clause_i="15"/>
       <fsummary>Information about active processes and ports.</fsummary>
       <desc><marker id="statistics_total_active_tasks"></marker>
-        <p>Returns the total amount of active processes and ports in
-          the system. That is, the number of processes and ports that
-          are ready to run, or are currently running. The information
-          is <em>not</em> gathered atomically. That is, the result
-          is not necessarily a consistent snapshot of the state, but
-          instead quite efficiently gathered.</p>
-        <p>See also
-          <seealso marker="#statistics_active_tasks">
-          <c>statistics(active_tasks)</c></seealso>,
-          <seealso marker="#statistics_run_queue_lengths">
-          <c>statistics(run_queue_lengths)</c></seealso>, and
-          <seealso marker="#statistics_total_run_queue_lengths">
-          <c>statistics(total_run_queue_lengths)</c></seealso>.</p>
+        <p>The same as calling
+	<c>lists:sum(</c><seealso marker="#statistics_active_tasks"><c>statistics(active_tasks)</c></seealso><c>)</c>,
+	but more efficient.</p>
       </desc>
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="14"/>
+      <name name="statistics" arity="1" clause_i="16"/>
+      <fsummary>Information about active processes and ports.</fsummary>
+      <desc><marker id="statistics_total_active_tasks_all"></marker>
+        <p>The same as calling
+	<c>lists:sum(</c><seealso marker="#statistics_active_tasks_all"><c>statistics(active_tasks_all)</c></seealso><c>)</c>,
+	but more efficient.</p>
+      </desc>
+    </func>
+
+    <func>
+      <name name="statistics" arity="1" clause_i="17"/>
       <fsummary>Information about the run-queue lengths.</fsummary>
       <desc><marker id="statistics_total_run_queue_lengths"></marker>
-        <p>Returns the total length of the run queues. That is, the number
-          of processes and ports that are ready to run on all available
-          run queues. The information is <em>not</em> gathered atomically.
-          That is, the result is not necessarily a consistent snapshot of
-          the state, but much more efficiently gathered compared to
-          <seealso marker="#statistics_run_queue">
-          <c>statistics(run_queue)</c></seealso>.</p>
-        <p>See also <seealso marker="#statistics_run_queue_lengths">
-          <c>statistics(run_queue_lengths)</c></seealso>,
-          <seealso marker="#statistics_total_active_tasks">
-          <c>statistics(total_active_tasks)</c></seealso>, and
-          <seealso marker="#statistics_active_tasks">
-          <c>statistics(active_tasks)</c></seealso>.</p>
+        <p>The same as calling
+	<c>lists:sum(</c><seealso marker="#statistics_run_queue_lengths"><c>statistics(run_queue_lengths)</c></seealso><c>)</c>,
+	but more efficient.</p>
       </desc>
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="15"/>
+      <name name="statistics" arity="1" clause_i="18"/>
+      <fsummary>Information about the run-queue lengths.</fsummary>
+      <desc><marker id="statistics_total_run_queue_lengths_all"></marker>
+        <p>The same as calling
+	<c>lists:sum(</c><seealso marker="#statistics_run_queue_lengths_all"><c>statistics(run_queue_lengths_all)</c></seealso><c>)</c>,
+	but more efficient.</p>
+      </desc>
+    </func>
+
+    <func>
+      <name name="statistics" arity="1" clause_i="19"/>
       <fsummary>Information about wall clock.</fsummary>
       <desc>
         <p>Returns information about wall clock. <c>wall_clock</c> can
diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names
index 477a7676d6..a44d23b181 100644
--- a/erts/emulator/beam/atom.names
+++ b/erts/emulator/beam/atom.names
@@ -76,6 +76,7 @@ atom ac
 atom accessor
 atom active
 atom active_tasks
+atom active_tasks_all
 atom alive
 atom all
 atom all_but_first
@@ -567,6 +568,7 @@ atom return_to
 atom return_trace
 atom run_queue
 atom run_queue_lengths
+atom run_queue_lengths_all
 atom runnable
 atom runnable_ports
 atom runnable_procs
@@ -656,8 +658,10 @@ atom Times='*'
 atom timestamp
 atom total
 atom total_active_tasks
+atom total_active_tasks_all
 atom total_heap_size
 atom total_run_queue_lengths
+atom total_run_queue_lengths_all
 atom tpkt
 atom trace trace_ts traced
 atom trace_control_word
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index 5fc70dfc02..e2773475b0 100644
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -3450,9 +3450,15 @@ BIF_RETTYPE statistics_1(BIF_ALIST_1)
 	if (is_non_value(res))
 	    BIF_RET(am_undefined);
 	BIF_TRAP1(gather_sched_wall_time_res_trap, BIF_P, res);
-    } else if (BIF_ARG_1 == am_total_active_tasks
-	       || BIF_ARG_1 == am_total_run_queue_lengths) {
-	Uint no = erts_run_queues_len(NULL, 0, BIF_ARG_1 == am_total_active_tasks);
+    } else if ((BIF_ARG_1 == am_total_active_tasks)
+	       | (BIF_ARG_1 == am_total_run_queue_lengths)
+               | (BIF_ARG_1 == am_total_active_tasks_all)
+	       | (BIF_ARG_1 == am_total_run_queue_lengths_all)) {
+	Uint no = erts_run_queues_len(NULL, 0,
+                                      ((BIF_ARG_1 == am_total_active_tasks)
+                                       | (BIF_ARG_1 == am_total_active_tasks_all)),
+                                      ((BIF_ARG_1 == am_total_active_tasks_all)
+                                       | (BIF_ARG_1 == am_total_run_queue_lengths_all)));
 	if (IS_USMALL(0, no))
 	    res = make_small(no);
 	else {
@@ -3460,13 +3466,21 @@ BIF_RETTYPE statistics_1(BIF_ALIST_1)
 	    res = uint_to_big(no, hp);
 	}
 	BIF_RET(res);
-    } else if (BIF_ARG_1 == am_active_tasks
-	       || BIF_ARG_1 == am_run_queue_lengths) {
+    } else if ((BIF_ARG_1 == am_active_tasks)
+           | (BIF_ARG_1 == am_run_queue_lengths)
+           | (BIF_ARG_1 == am_active_tasks_all)
+           | (BIF_ARG_1 == am_run_queue_lengths_all)) {
 	Eterm res, *hp, **hpp;
 	Uint sz, *szp;
-	int no_qs = erts_no_run_queues;
+        int incl_dirty_io = ((BIF_ARG_1 == am_active_tasks_all)
+                             | (BIF_ARG_1 == am_run_queue_lengths_all));
+        int no_qs = (erts_no_run_queues + ERTS_NUM_DIRTY_CPU_RUNQS +
+                     (incl_dirty_io ? ERTS_NUM_DIRTY_IO_RUNQS : 0));
 	Uint *qszs = erts_alloc(ERTS_ALC_T_TMP,sizeof(Uint)*no_qs*2);
-	(void) erts_run_queues_len(qszs, 0, BIF_ARG_1 == am_active_tasks);
+        (void) erts_run_queues_len(qszs, 0,
+                                   ((BIF_ARG_1 == am_active_tasks)
+                                    | (BIF_ARG_1 == am_active_tasks_all)),
+                                   incl_dirty_io);
 	sz = 0;
 	szp = &sz;
 	hpp = NULL;
@@ -3539,7 +3553,7 @@ BIF_RETTYPE statistics_1(BIF_ALIST_1)
 	res = TUPLE2(hp, b1, b2);
 	BIF_RET(res);
     } else if (BIF_ARG_1 ==  am_run_queue) {
-	res = erts_run_queues_len(NULL, 1, 0);
+	res = erts_run_queues_len(NULL, 1, 0, 0);
 	BIF_RET(make_small(res));
     } else if (BIF_ARG_1 == am_wall_clock) {
 	UWord w1, w2;
@@ -3557,9 +3571,9 @@ BIF_RETTYPE statistics_1(BIF_ALIST_1)
     else if (ERTS_IS_ATOM_STR("run_queues", BIF_ARG_1)) {
 	Eterm res, *hp, **hpp;
 	Uint sz, *szp;
-	int no_qs = erts_no_run_queues;
+	int no_qs = erts_no_run_queues + ERTS_NUM_DIRTY_RUNQS;
 	Uint *qszs = erts_alloc(ERTS_ALC_T_TMP,sizeof(Uint)*no_qs*2);
-	(void) erts_run_queues_len(qszs, 0, 0);
+	(void) erts_run_queues_len(qszs, 0, 0, 1);
 	sz = 0;
 	szp = &sz;
 	hpp = NULL;
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index da27c7e7c6..7952e3031d 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -405,13 +405,59 @@ static erts_atomic_t runq_supervisor_sleeping;
 ErtsSchedulerData *erts_scheduler_data;
 #endif
 
-ErtsAlignedRunQueue *erts_aligned_run_queues;
-Uint erts_no_run_queues;
+ErtsAlignedRunQueue * ERTS_WRITE_UNLIKELY(erts_aligned_run_queues);
+Uint ERTS_WRITE_UNLIKELY(erts_no_run_queues);
 
-ErtsAlignedSchedulerData *erts_aligned_scheduler_data;
 #ifdef ERTS_DIRTY_SCHEDULERS
-ErtsAlignedSchedulerData *erts_aligned_dirty_cpu_scheduler_data;
-ErtsAlignedSchedulerData *erts_aligned_dirty_io_scheduler_data;
+
+struct {
+    union {
+        erts_smp_atomic32_t active;
+        char align__[ERTS_CACHE_LINE_SIZE];
+    } cpu;
+    union {
+        erts_smp_atomic32_t active;
+        char align__[ERTS_CACHE_LINE_SIZE];
+    } io;
+} dirty_count erts_align_attribute(ERTS_CACHE_LINE_SIZE);
+
+#endif
+
+static ERTS_INLINE void
+dirty_active(ErtsSchedulerData *esdp, erts_aint32_t add)
+{
+#ifdef ERTS_DIRTY_SCHEDULERS
+    erts_aint32_t val;
+    erts_smp_atomic32_t *ap;
+    switch (esdp->type) {
+    case ERTS_SCHED_DIRTY_CPU:
+        ap = &dirty_count.cpu.active;
+        break;
+    case ERTS_SCHED_DIRTY_IO:
+        ap = &dirty_count.io.active;
+        break;
+    default:
+        ap = NULL;
+        ERTS_INTERNAL_ERROR("Not a dirty scheduler");
+        break;
+    }
+
+    /*
+     * All updates done under run-queue lock, so
+     * no inc or dec needed...
+     */
+    ERTS_SMP_ASSERT(erts_smp_lc_runq_is_locked(esdp->run_queue));
+
+    val = erts_smp_atomic32_read_nob(ap);
+    val += add;
+    erts_smp_atomic32_set_nob(ap, val);
+#endif
+}
+
+ErtsAlignedSchedulerData * ERTS_WRITE_UNLIKELY(erts_aligned_scheduler_data);
+#ifdef ERTS_DIRTY_SCHEDULERS
+ErtsAlignedSchedulerData * ERTS_WRITE_UNLIKELY(erts_aligned_dirty_cpu_scheduler_data);
+ErtsAlignedSchedulerData * ERTS_WRITE_UNLIKELY(erts_aligned_dirty_io_scheduler_data);
 typedef union {
     Process dsp;
     char align[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(Process))];
@@ -539,22 +585,28 @@ do {									\
     }									\
 } while (0)
 
-#define ERTS_ATOMIC_FOREACH_RUNQ_X(RQVAR, DO, DOX)			\
+#define ERTS_ATOMIC_FOREACH_RUNQ_X(RQVAR, NRQS, DO, DOX)		\
 do {									\
     ErtsRunQueue *RQVAR;						\
+    int nrqs = (NRQS);                                                  \
     int ix__;								\
-    for (ix__ = 0; ix__ < erts_no_run_queues; ix__++) {			\
+    for (ix__ = 0; ix__ < nrqs; ix__++) {                               \
 	RQVAR = ERTS_RUNQ_IX(ix__);					\
 	erts_smp_runq_lock(RQVAR);					\
 	{ DO; }								\
     }									\
     { DOX; }								\
-    for (ix__ = 0; ix__ < erts_no_run_queues; ix__++)			\
+    for (ix__ = 0; ix__ < nrqs; ix__++)                                 \
 	erts_smp_runq_unlock(ERTS_RUNQ_IX(ix__));			\
 } while (0)
 
-#define ERTS_ATOMIC_FOREACH_RUNQ(RQVAR, DO) \
-  ERTS_ATOMIC_FOREACH_RUNQ_X(RQVAR, DO, )
+#define ERTS_ATOMIC_FOREACH_RUNQ(RQVAR, DO)                             \
+  ERTS_ATOMIC_FOREACH_RUNQ_X(RQVAR, erts_no_run_queues + ERTS_NUM_DIRTY_RUNQS, DO, )
+
+#define ERTS_ATOMIC_FOREACH_NORMAL_RUNQ(RQVAR, DO)                      \
+    ERTS_ATOMIC_FOREACH_RUNQ_X(RQVAR, erts_no_run_queues, DO, )
+
+
 /*
  * Local functions.
  */
@@ -2971,7 +3023,7 @@ erts_active_schedulers(void)
 {
     Uint as = erts_no_schedulers;
 
-    ERTS_ATOMIC_FOREACH_RUNQ(rq, as -= abs(rq->waiting));
+    ERTS_ATOMIC_FOREACH_NORMAL_RUNQ(rq, as -= abs(rq->waiting));
 
     return as;
 }
@@ -3383,6 +3435,7 @@ scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq)
 	    rq->sleepers.list->prev = ssi;
 	rq->sleepers.list = ssi;
 	erts_smp_spin_unlock(&rq->sleepers.lock);
+        dirty_active(esdp, -1);
     }
 #endif
 
@@ -3724,6 +3777,9 @@ scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq)
 	sched_active_sys(esdp->no, rq);
     }
 
+    if (ERTS_SCHEDULER_IS_DIRTY(esdp))
+        dirty_active(esdp, 1);
+
     ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(rq));
 }
 
@@ -6123,7 +6179,7 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online
 #endif
 		     )
 {
-    int ix, n, no_ssi;
+    int ix, n, no_ssi, tot_rqs;
     char *daww_ptr;
     size_t daww_sz;
     size_t size_runqs;
@@ -6156,26 +6212,19 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online
     /* Create and initialize run queues */
 
     n = no_schedulers;
-    size_runqs = sizeof(ErtsAlignedRunQueue) * (n + ERTS_NUM_DIRTY_RUNQS);
+    tot_rqs = (n + ERTS_NUM_DIRTY_RUNQS);
+    size_runqs = sizeof(ErtsAlignedRunQueue) * tot_rqs;
     erts_aligned_run_queues =
 	erts_alloc_permanent_cache_aligned(ERTS_ALC_T_RUNQS, size_runqs);
 #ifdef ERTS_SMP
-#ifdef ERTS_DIRTY_SCHEDULERS
-    erts_aligned_run_queues += ERTS_NUM_DIRTY_RUNQS;
-#endif
     erts_smp_atomic32_init_nob(&no_empty_run_queues, 0);
 #endif
 
     erts_no_run_queues = n;
 
-    for (ix = -(ERTS_NUM_DIRTY_RUNQS); ix < n; ix++) {
+    for (ix = 0; ix < tot_rqs; ix++) {
 	int pix, rix;
-#ifdef ERTS_DIRTY_SCHEDULERS
-	ErtsRunQueue *rq = ERTS_RUNQ_IX_IS_DIRTY(ix) ?
-	    ERTS_DIRTY_RUNQ_IX(ix) : ERTS_RUNQ_IX(ix);
-#else
 	ErtsRunQueue *rq = ERTS_RUNQ_IX(ix);
-#endif
 
 	rq->ix = ix;
 
@@ -6448,6 +6497,11 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online
 			     ERTS_SCHED_DIRTY_IO,
 			     no_dirty_io_schedulers);
 
+    erts_smp_atomic32_init_nob(&dirty_count.cpu.active,
+                               (erts_aint32_t) no_dirty_cpu_schedulers);
+    erts_smp_atomic32_init_nob(&dirty_count.io.active,
+                               (erts_aint32_t) no_dirty_io_schedulers);
+
 #endif
 
     if (set_schdlr_sspnd_change_flags)
@@ -7822,6 +7876,7 @@ suspend_scheduler(ErtsSchedulerData *esdp)
 #endif
 
     if (sched_type != ERTS_SCHED_NORMAL) {
+        dirty_active(esdp, -1);
 	erts_smp_runq_unlock(esdp->run_queue);
         dirty_sched_wall_time_change(esdp, 0);
     }
@@ -8130,7 +8185,9 @@ suspend_scheduler(ErtsSchedulerData *esdp)
     erts_smp_runq_lock(esdp->run_queue);
     non_empty_runq(esdp->run_queue);
 
-    if (sched_type == ERTS_SCHED_NORMAL) {
+    if (sched_type != ERTS_SCHED_NORMAL)
+        dirty_active(esdp, 1);
+    else {
 	schedule_bound_processes(esdp->run_queue, &sbp);
 
 	erts_sched_check_cpu_bind_post_suspend(esdp);
@@ -9764,38 +9821,69 @@ erts_internal_is_process_executing_dirty_1(BIF_ALIST_1)
     BIF_RET(am_false);
 }
 
+/*
+ * Add the length of run queue 'rq' to '*tot_len'; when 'qlen' is non-NULL
+ * also store it at 'qlen[*ip]' and advance '*ip'. If 'incl_active_sched',
+ * executing schedulers are counted too; 'locked' picks the _dirty read.
+ */
+static ERTS_INLINE void
+run_queues_len_aux(ErtsRunQueue *rq, Uint *tot_len, Uint *qlen, int *ip,
+                   int incl_active_sched, int locked)
+{
+    Sint cnt = (Sint) (locked
+                       ? erts_smp_atomic32_read_dirty(&rq->len)
+                       : erts_smp_atomic32_read_nob(&rq->len));
+    ASSERT(cnt >= 0);
+    if (incl_active_sched) {
+#ifdef ERTS_DIRTY_SCHEDULERS
+        if (ERTS_RUNQ_IX_IS_DIRTY(rq->ix)) {
+            erts_aint32_t act; /* active dirty schedulers of this type */
+            if (!ERTS_RUNQ_IS_DIRTY_CPU_RUNQ(rq)) {
+                ASSERT(ERTS_RUNQ_IS_DIRTY_IO_RUNQ(rq));
+                act = erts_smp_atomic32_read_nob(&dirty_count.io.active);
+                ASSERT(0 <= act && act <= erts_no_dirty_io_schedulers);
+            }
+            else {
+                act = erts_smp_atomic32_read_nob(&dirty_count.cpu.active);
+                ASSERT(0 <= act && act <= erts_no_dirty_cpu_schedulers);
+            }
+            cnt += (Sint) act;
+        }
+        else
+#endif
+        if (ERTS_RUNQ_FLGS_GET_NOB(rq) & ERTS_RUNQ_FLG_EXEC)
+            cnt += 1;
+    }
+    if (qlen)
+        qlen[(*ip)++] = cnt;
+    *tot_len += (Uint) cnt;
+}
 
 Uint
-erts_run_queues_len(Uint *qlen, int atomic_queues_read, int incl_active_sched)
+erts_run_queues_len(Uint *qlen, int atomic_queues_read, int incl_active_sched,
+                    int incl_dirty_io)
 {
-    int i = 0;
+    int i = 0, j = 0;
     Uint len = 0;
-    if (atomic_queues_read)
-	ERTS_ATOMIC_FOREACH_RUNQ(rq,
-	 {
-	     Sint rq_len = (Sint) erts_smp_atomic32_read_dirty(&rq->len);
-	     ASSERT(rq_len >= 0);
-	     if (incl_active_sched
-		 && (ERTS_RUNQ_FLGS_GET_NOB(rq) & ERTS_RUNQ_FLG_EXEC)) {
-		 rq_len++;
-	     }
-	     if (qlen)
-		 qlen[i++] = rq_len;
-	     len += (Uint) rq_len;
-	 }
-	    );
+    int no_rqs = erts_no_run_queues;
+
+#ifdef ERTS_DIRTY_SCHEDULERS
+    if (incl_dirty_io)
+        no_rqs += ERTS_NUM_DIRTY_RUNQS;
+    else
+        no_rqs += ERTS_NUM_DIRTY_CPU_RUNQS;
+#endif
+
+    if (atomic_queues_read) {
+        ERTS_ATOMIC_FOREACH_RUNQ_X(rq, no_rqs,
+                                   run_queues_len_aux(rq, &len, qlen, &j,
+                                                      incl_active_sched, 1),
+                                   /* Nothing... */);
+    }
     else {
-	for (i = 0; i < erts_no_run_queues; i++) {
+	for (i = 0; i < no_rqs; i++) {
 	    ErtsRunQueue *rq = ERTS_RUNQ_IX(i);
-	    Sint rq_len = (Sint) erts_smp_atomic32_read_nob(&rq->len);
-	    ASSERT(rq_len >= 0);
-	     if (incl_active_sched
-		 && (ERTS_RUNQ_FLGS_GET_NOB(rq) & ERTS_RUNQ_FLG_EXEC)) {
-		 rq_len++;
-	     }
-	    if (qlen)
-		qlen[i] = rq_len;
-	    len += (Uint) rq_len;
+            run_queues_len_aux(rq, &len, qlen, &j, incl_active_sched, 0);
 	}
 
     }
@@ -12097,6 +12185,8 @@ erts_get_total_reductions(Uint *redsp, Uint *diffp)
     Uint reds = 0;
     ERTS_ATOMIC_FOREACH_RUNQ_X(rq,
 
+                               erts_no_run_queues + ERTS_NUM_DIRTY_RUNQS,
+
 			       reds += rq->procs.reductions,
 
 			       if (redsp) *redsp = reds;
diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h
index 2b169bb9ce..d44e8c252d 100644
--- a/erts/emulator/beam/erl_process.h
+++ b/erts/emulator/beam/erl_process.h
@@ -1541,24 +1541,29 @@ extern int erts_system_profile_ts_type;
     } while (0)
 
 #if defined(ERTS_DIRTY_SCHEDULERS) && defined(ERTS_SMP)
-#define ERTS_NUM_DIRTY_RUNQS 2
+#define ERTS_NUM_DIRTY_CPU_RUNQS 1
+#define ERTS_NUM_DIRTY_IO_RUNQS 1
 #else
-#define ERTS_NUM_DIRTY_RUNQS 0
+#define ERTS_NUM_DIRTY_CPU_RUNQS 0
+#define ERTS_NUM_DIRTY_IO_RUNQS 0
 #endif
 
+#define ERTS_NUM_DIRTY_RUNQS (ERTS_NUM_DIRTY_CPU_RUNQS+ERTS_NUM_DIRTY_IO_RUNQS)
+
 #define ERTS_RUNQ_IX(IX)						\
-  (ASSERT(0 <= (IX) && (IX) < erts_no_run_queues),			\
+  (ASSERT(0 <= (IX) && (IX) < erts_no_run_queues+ERTS_NUM_DIRTY_RUNQS), \
    &erts_aligned_run_queues[(IX)].runq)
 #ifdef ERTS_DIRTY_SCHEDULERS
 #define ERTS_RUNQ_IX_IS_DIRTY(IX)					\
-  (-(ERTS_NUM_DIRTY_RUNQS) <= (IX) && (IX) < 0)
+  (ASSERT(0 <= (IX) && (IX) < erts_no_run_queues+ERTS_NUM_DIRTY_RUNQS), \
+   (erts_no_run_queues <= (IX)))
 #define ERTS_DIRTY_RUNQ_IX(IX)						\
   (ASSERT(ERTS_RUNQ_IX_IS_DIRTY(IX)),					\
    &erts_aligned_run_queues[(IX)].runq)
-#define ERTS_DIRTY_CPU_RUNQ (&erts_aligned_run_queues[-1].runq)
-#define ERTS_DIRTY_IO_RUNQ  (&erts_aligned_run_queues[-2].runq)
-#define ERTS_RUNQ_IS_DIRTY_CPU_RUNQ(RQ) ((RQ)->ix == -1)
-#define ERTS_RUNQ_IS_DIRTY_IO_RUNQ(RQ) ((RQ)->ix == -2)
+#define ERTS_DIRTY_CPU_RUNQ (&erts_aligned_run_queues[erts_no_run_queues].runq)
+#define ERTS_DIRTY_IO_RUNQ  (&erts_aligned_run_queues[erts_no_run_queues+1].runq)
+#define ERTS_RUNQ_IS_DIRTY_CPU_RUNQ(RQ) ((RQ) == ERTS_DIRTY_CPU_RUNQ)
+#define ERTS_RUNQ_IS_DIRTY_IO_RUNQ(RQ) ((RQ) == ERTS_DIRTY_IO_RUNQ)
 #else
 #define ERTS_RUNQ_IX_IS_DIRTY(IX) 0
 #endif
@@ -1836,7 +1841,7 @@ Uint erts_active_schedulers(void);
 void erts_init_process(int, int, int);
 Eterm erts_process_state2status(erts_aint32_t);
 Eterm erts_process_status(Process *, Eterm);
-Uint erts_run_queues_len(Uint *, int, int);
+Uint erts_run_queues_len(Uint *, int, int, int);
 void erts_add_to_runq(Process *);
 Eterm erts_bound_schedulers_term(Process *c_p);
 Eterm erts_get_cpu_topology_term(Process *c_p, Eterm which);
diff --git a/erts/emulator/test/nif_SUITE.erl b/erts/emulator/test/nif_SUITE.erl
index 1eb58699b2..bcea9e3539 100644
--- a/erts/emulator/test/nif_SUITE.erl
+++ b/erts/emulator/test/nif_SUITE.erl
@@ -488,7 +488,7 @@ select(Config) when is_list(Config) ->
     %% Wait for read
     eagain = read_nif(R, 3),
     0 = select_nif(R,?ERL_NIF_SELECT_READ,R,null,Ref),
-    [] = flush(),
+    [] = flush(0),
     ok = write_nif(W, <<"hej">>),
     [{select, R, Ref, ready_input}] = flush(),
     0 = select_nif(R,?ERL_NIF_SELECT_READ,R,self(),Ref2),
@@ -505,7 +505,7 @@ select(Config) when is_list(Config) ->
     %% Wait for write
     Written = write_full(W, $a),
     0 = select_nif(W,?ERL_NIF_SELECT_WRITE,W,self(),Ref),
-    [] = flush(),
+    [] = flush(0),
     Written = read_nif(R,byte_size(Written)),
     [{select, W, Ref, ready_output}] = flush(),
 
@@ -515,7 +515,7 @@ select(Config) when is_list(Config) ->
     [{fd_resource_stop, W_ptr, _}] = flush(),
     {1, {W_ptr,_}} = last_fd_stop_call(),
     true = is_closed_nif(W),
-    [] = flush(),
+    [] = flush(0),
     0 = select_nif(R,?ERL_NIF_SELECT_READ,R,self(),Ref),
     [{select, R, Ref, ready_input}] = flush(),
     eof = read_nif(R,1),
@@ -540,7 +540,7 @@ select_2(Config) ->
     0 = select_nif(R,?ERL_NIF_SELECT_READ,R,null,Ref1),
     0 = select_nif(R,?ERL_NIF_SELECT_READ,R,self(),Ref2),
 
-    [] = flush(),
+    [] = flush(0),
     ok = write_nif(W, <<"hej">>),
     [{select, R, Ref2, ready_input}] = flush(),
     <<"hej">> = read_nif(R, 3),
@@ -551,7 +551,7 @@ select_2(Config) ->
     Papa = self(),
     spawn_link(fun() ->
                        0 = select_nif(R,?ERL_NIF_SELECT_READ,R,null,Ref1),
-                       [] = flush(),
+                       [] = flush(0),
                        Papa ! sync,
                        [{select, R, Ref1, ready_input}] = flush(),
                        <<"hej">> = read_nif(R, 3),
@@ -560,7 +560,7 @@ select_2(Config) ->
     sync = receive_any(),
     ok = write_nif(W, <<"hej">>),
     done = receive_any(),
-    [] = flush(),
+    [] = flush(0),
 
     check_stop_ret(select_nif(R,?ERL_NIF_SELECT_STOP,R,null,Ref1)),
     [{fd_resource_stop, R_ptr, _}] = flush(),
@@ -629,6 +629,15 @@ monitor_process_a(Config) ->
 monitor_process_b(Config) ->
     ensure_lib_loaded(Config),
 
+    monitor_process_b_do(false),
+    case erlang:system_info(threads) of
+        true ->  monitor_process_b_do(true);
+        false -> ok
+    end,
+    ok.
+
+
+monitor_process_b_do(FromThread) ->
     Pid = spawn_link(fun() ->
                              receive
                                  return -> ok
@@ -637,8 +646,11 @@ monitor_process_b(Config) ->
     R_ptr = alloc_monitor_resource_nif(),
     {0,_} = monitor_process_nif(R_ptr, Pid, true, self()),
     [R_ptr] = monitored_by(Pid),
-    ok = release_resource(R_ptr),
-    [] = flush(),
+    case FromThread of
+        false -> ok = release_resource(R_ptr);
+        true -> ok = release_resource_from_thread(R_ptr)
+    end,
+    [] = flush(0),
     {R_ptr, _, 1} = last_resource_dtor_call(),
     [] = monitored_by(Pid),
     Pid ! return,
@@ -660,7 +672,7 @@ monitor_process_c(Config) ->
                              exit
                      end),
     [{Pid, done, R_ptr, Mon1},
-     {monitor_resource_down, R_ptr, Pid, Mon2}] = flush(),
+     {monitor_resource_down, R_ptr, Pid, Mon2}] = flush(2),
     compare_monitors_nif(Mon1, Mon2),
     {R_ptr, _, 1} = last_resource_dtor_call(),
     ok.
@@ -708,7 +720,7 @@ demonitor_process(Config) ->
     1 = demonitor_process_nif(R_ptr, MonBin2),
 
     ok = release_resource(R_ptr),
-    [] = flush(),
+    [] = flush(0),
     {R_ptr, _, 1} = last_resource_dtor_call(),
     [] = monitored_by(Pid),
     Pid ! return,
@@ -2307,10 +2319,16 @@ receive_any(Timeout) ->
     after Timeout -> timeout end.
 
 flush() ->
-    flush(10).
-flush(Timeout) ->
+    flush(1).  % by default, expect one message
+
+flush(0) ->
+    flush(0, 10);  % don't waste too much time waiting for nothing
+flush(N) ->
+    flush(N, 1000).  % still expecting messages: wait up to 1000 ms for each
+
+flush(N, Timeout) ->
     receive M ->
-            [M | flush(N-1)]
+            [M | flush(N-1)]  % timeout is re-chosen from the remaining count
     after Timeout ->
             []
     end.
@@ -2619,9 +2637,9 @@ nif_snprintf(Config) ->
 nif_internal_hash(Config) ->
     ensure_lib_loaded(Config),
     HashValueBitSize = nif_hash_result_bitsize(internal),
-    Terms = unique([random_term() || _ <- lists:seq(1, 5000)]),
+    Terms = unique([random_term() || _ <- lists:seq(1, 500)]),
     HashValues = [hash_nif(internal, Term, 0) || Term <- Terms],
-    test_bit_distribution_fitness(HashValues, HashValueBitSize, 0.05).
+    test_bit_distribution_fitness(HashValues, HashValueBitSize).
 
 nif_internal_hash_salted(Config) ->
     ensure_lib_loaded(Config),
@@ -2630,7 +2648,7 @@ nif_internal_hash_salted(Config) ->
 nif_phash2(Config) ->
     ensure_lib_loaded(Config),
     HashValueBitSize = nif_hash_result_bitsize(phash2),
-    Terms = unique([random_term() || _ <- lists:seq(1, 5000)]),
+    Terms = unique([random_term() || _ <- lists:seq(1, 500)]),
     HashValues =
         lists:map(
           fun (Term) ->
@@ -2643,12 +2661,12 @@ nif_phash2(Config) ->
                   HashValue
           end,
           Terms),
-    test_bit_distribution_fitness(HashValues, HashValueBitSize, 0.05).
+    test_bit_distribution_fitness(HashValues, HashValueBitSize).
 
 test_salted_nif_hash(HashType) ->
     HashValueBitSize = nif_hash_result_bitsize(HashType),
-    Terms = unique([random_term() || _ <- lists:seq(1, 5000)]),
-    Salts = unique([random_uint32() || _ <- lists:seq(1, 100)]),
+    Terms = unique([random_term() || _ <- lists:seq(1, 500)]),
+    Salts = unique([random_uint32() || _ <- lists:seq(1, 50)]),
     {HashValuesPerSalt, HashValuesPerTerm} =
         lists:mapfoldl(
           fun (Salt, Acc) ->
@@ -2669,22 +2687,20 @@ test_salted_nif_hash(HashType) ->
     % Test per-salt hash distribution of different terms
     lists:foreach(
       fun ({_Salt, HashValues}) ->
-              test_bit_distribution_fitness(HashValues, HashValueBitSize, 0.05)
+              test_bit_distribution_fitness(HashValues, HashValueBitSize)
       end,
       HashValuesPerSalt),
 
     % Test per-term hash distribution of different salts
     dict:fold(
       fun (_Term, HashValues, Acc) ->
-              % Be more tolerant of relative deviation,
-              % as there's fewer hash values here.
-              test_bit_distribution_fitness(HashValues, HashValueBitSize, 0.30),
+              test_bit_distribution_fitness(HashValues, HashValueBitSize),
               Acc
       end,
       ok,
       HashValuesPerTerm).
 
-test_bit_distribution_fitness(Integers, BitSize, MaxRelativeDeviation) ->
+test_bit_distribution_fitness(Integers, BitSize) ->
     MaxInteger = (1 bsl BitSize) - 1,
     OnesPerBit =
         lists:foldl(
@@ -2700,19 +2716,29 @@ test_bit_distribution_fitness(Integers, BitSize, MaxRelativeDeviation) ->
           orddict:new(),
           Integers),
 
-    ExpectedNrOfOnes = length(Integers) div 2,
+    N = length(Integers),
+    ExpectedNrOfOnes = N div 2,
+    %% The number of ones per bit (NrOfOnes) should have a binomial
+    %% distribution, B(N, 1/2), with a standard deviation of:
+    ExpectedStdDev = math:sqrt(N) / 2,
+    %% which can be approximated as a normal distribution
+    %% where we allow a deviation of 6 std.devs
+    %% for a fail probability of 0.000000002:
+    MaxStdDevs = 6,
+
     FailureText =
         orddict:fold(
           fun (BitIndex, NrOfOnes, Acc) ->
-                  RelativeDeviation = abs(NrOfOnes - ExpectedNrOfOnes) / length(Integers),
-                  case RelativeDeviation >= MaxRelativeDeviation of
-                      false -> Acc;
+                  Deviation = abs(NrOfOnes - ExpectedNrOfOnes) / ExpectedStdDev,
+                  case Deviation >= MaxStdDevs of
+                      false ->
+                          Acc;
                       true ->
                           [Acc,
                            io_lib:format(
                              "Unreasonable deviation on number of set bits (i=~p): "
-                             "expected ~p, got ~p (relative dev. ~.3f)~n",
-                             [BitIndex, ExpectedNrOfOnes, NrOfOnes, RelativeDeviation])]
+                             "expected ~p, got ~p (# std.dev ~.3f > ~p)~n",
+                             [BitIndex, ExpectedNrOfOnes, NrOfOnes, Deviation, MaxStdDevs])]
                   end
           end,
           [],
@@ -2789,6 +2815,7 @@ alloc_resource(_,_) -> ?nif_stub.
 make_resource(_) -> ?nif_stub.
 get_resource(_,_) -> ?nif_stub.
 release_resource(_) -> ?nif_stub.
+release_resource_from_thread(_) -> ?nif_stub.
 last_resource_dtor_call() -> ?nif_stub.
 make_new_resource(_,_) -> ?nif_stub.
 check_is(_,_,_,_,_,_,_,_,_,_,_) -> ?nif_stub.
diff --git a/erts/emulator/test/nif_SUITE_data/nif_SUITE.c b/erts/emulator/test/nif_SUITE_data/nif_SUITE.c
index 3747291e7e..15d31162ed 100644
--- a/erts/emulator/test/nif_SUITE_data/nif_SUITE.c
+++ b/erts/emulator/test/nif_SUITE_data/nif_SUITE.c
@@ -972,6 +972,30 @@ static ERL_NIF_TERM release_resource(ErlNifEnv* env, int argc, const ERL_NIF_TER
     return enif_make_atom(env,"ok");
 }
 
+/* Thread entry point: drops one reference to 'resource'. */
+static void* threaded_release_resource(void* resource)
+{
+    enif_release_resource(resource);
+    return NULL;  /* exit value is not read; the joiner passes a NULL result */
+}
+
+/* NIF: release the resource decoded from argv[0] from a separate thread. */
+static ERL_NIF_TERM release_resource_from_thread(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+    void* resource;
+    ErlNifTid tid;
+    int err;
+
+    if (!get_pointer(env, argv[0], &resource)
+        || enif_thread_create("nif_SUITE:release_resource_from_thread", &tid,
+                              threaded_release_resource, resource, NULL) != 0) {
+        return enif_make_badarg(env);
+    }
+    err = enif_thread_join(tid, NULL);  /* returns only after the thread is done */
+    assert(err == 0);
+    return atom_ok;
+}
+
 /*
  * argv[0] an atom
  * argv[1] a binary
@@ -2537,6 +2561,7 @@ static ERL_NIF_TERM monitor_frenzy_nif(ErlNifEnv* env, int argc, const ERL_NIF_T
     static unsigned long spawn_cnt = 0;
     static unsigned long kill_cnt = 0;
     static unsigned long proc_histogram[FRENZY_PROCS_MAX];
+    static int initialized = 0;
 
     static const unsigned int primes[] = {7, 13, 17, 19};
 
@@ -2556,7 +2581,7 @@ static ERL_NIF_TERM monitor_frenzy_nif(ErlNifEnv* env, int argc, const ERL_NIF_T
 
     if (enif_is_atom(env, Op)) {
         if (Op == atom_init) {
-            if (procs_lock || !enif_get_uint(env, Rnd, &frenzy_rand_bits_max))
+            if (initialized || !enif_get_uint(env, Rnd, &frenzy_rand_bits_max))
                 return enif_make_badarg(env);
 
             procs_lock = enif_mutex_create("nif_SUITE:monitor_frenzy.procs");
@@ -2583,6 +2608,7 @@ static ERL_NIF_TERM monitor_frenzy_nif(ErlNifEnv* env, int argc, const ERL_NIF_T
 
             spawn_cnt = 1;
             kill_cnt = 0;
+            initialized = 1;
             return enif_make_uint(env, 0);  /* SelfPix */
         }
         else if (Op == atom_stats) {
@@ -2613,7 +2639,7 @@ static ERL_NIF_TERM monitor_frenzy_nif(ErlNifEnv* env, int argc, const ERL_NIF_T
                                              enif_make_ulong(env, res_dtor_cnt)));
 
         }
-        else if (Op == atom_stop && procs_lock) {  /* stop all */
+        else if (Op == atom_stop && initialized) {  /* stop all */
 
             /* Release all resources */
             for (rix = 0; rix < FRENZY_RESOURCES_MAX; rix++) {
@@ -2903,6 +2929,7 @@ static ErlNifFunc nif_funcs[] =
     {"make_resource", 1, make_resource},
     {"get_resource", 2, get_resource},
     {"release_resource", 1, release_resource},
+    {"release_resource_from_thread", 1, release_resource_from_thread},
     {"last_resource_dtor_call", 0, last_resource_dtor_call},
     {"make_new_resource", 2, make_new_resource},
     {"check_is", 11, check_is},
diff --git a/erts/emulator/test/statistics_SUITE.erl b/erts/emulator/test/statistics_SUITE.erl
index 0cea941687..7690557fda 100644
--- a/erts/emulator/test/statistics_SUITE.erl
+++ b/erts/emulator/test/statistics_SUITE.erl
@@ -502,20 +502,37 @@ run_queues_lengths_active_tasks(_Config) ->
                          end,
                          lists:seq(1,10)),
 
+                   
+
     TRQLs0 = statistics(total_run_queue_lengths),
+    TRQLAs0 = statistics(total_run_queue_lengths_all),
     TATs0 = statistics(total_active_tasks),
+    TATAs0 = statistics(total_active_tasks_all),
     true = is_integer(TRQLs0),
     true = is_integer(TATs0),
     true = TRQLs0 >= 0,
+    true = TRQLAs0 >= 0,
     true = TATs0 >= 11,
+    true = TATAs0 >= 11,
 
     NoScheds = erlang:system_info(schedulers),
+    {DefRqs,
+     AllRqs} = case erlang:system_info(dirty_cpu_schedulers) of
+                   0 -> {NoScheds, NoScheds};
+                   _ -> {NoScheds+1, NoScheds+2}
+               end,
     RQLs0 = statistics(run_queue_lengths),
+    RQLAs0 = statistics(run_queue_lengths_all),
     ATs0 = statistics(active_tasks),
-    NoScheds = length(RQLs0),
-    NoScheds = length(ATs0),
+    ATAs0 = statistics(active_tasks_all),
+    DefRqs = length(RQLs0),
+    AllRqs = length(RQLAs0),
+    DefRqs = length(ATs0),
+    AllRqs = length(ATAs0),
     true = lists:sum(RQLs0) >= 0,
+    true = lists:sum(RQLAs0) >= 0,
     true = lists:sum(ATs0) >= 11,
+    true = lists:sum(ATAs0) >= 11,
 
     SO = erlang:system_flag(schedulers_online, 1),
 
@@ -531,8 +548,8 @@ run_queues_lengths_active_tasks(_Config) ->
 
     RQLs1 = statistics(run_queue_lengths),
     ATs1 = statistics(active_tasks),
-    NoScheds = length(RQLs1),
-    NoScheds = length(ATs1),
+    DefRqs = length(RQLs1),
+    DefRqs = length(ATs1),
     TRQLs2 = lists:sum(RQLs1),
     TATs2 = lists:sum(ATs1),
     true = TRQLs2 >= 10,
diff --git a/erts/etc/common/erlexec.c b/erts/etc/common/erlexec.c
index 70520eea15..51ed2d0dff 100644
--- a/erts/etc/common/erlexec.c
+++ b/erts/etc/common/erlexec.c
@@ -555,7 +555,7 @@ int main(int argc, char **argv)
     if(s) {
         add_Eargs(s);         /* argv[0] = scriptname*/
     } else {
-        add_Eargs(progname);  /* argv[0] = erl or cerl */
+        add_Eargs(emu);       /* argv[0] = erl or cerl */
     }
     /*
      * Add the bindir to the path (unless it is there already).
diff --git a/erts/etc/unix/etp-commands.in b/erts/etc/unix/etp-commands.in
index b7b3a2ae99..8f70f879d5 100644
--- a/erts/etc/unix/etp-commands.in
+++ b/erts/etc/unix/etp-commands.in
@@ -1,3 +1,4 @@
+# -*- gdb-script -*-
 #
 # %CopyrightBegin%
 # 
@@ -2153,13 +2154,22 @@ define etp-processes
     printf "No processes, since system isn't initialized!\n"
   else
     set $proc_ix = 0
-    while $proc_ix < erts_proc.r.o.max
-      set $proc = (Process *) *((UWord *) &erts_proc.r.o.tab[$proc_ix])
-      if ($proc != ((Process *) 0) && $proc != &erts_invalid_process)
+    set $proc_max_ix = erts_proc.r.o.max
+    set $proc_tab = erts_proc.r.o.tab
+    set $invalid_proc = &erts_invalid_process
+    set $proc_decentile = $proc_max_ix / 10
+    set $proc_printile = $proc_decentile
+    while $proc_ix < $proc_max_ix
+      set $proc = (Process *) *((UWord *) ($proc_tab + $proc_ix))
+      if ($proc != ((Process *) 0) && $proc != $invalid_proc)
         printf "---\n"
         printf "  Pix: %d\n", $proc_ix
         etp-process-info $proc
       end
+      if $proc_ix == $proc_printile
+        printf "--- %d%% (%d / %d) searched\n", $proc_printile / $proc_decentile * 10, $proc_ix, $proc_max_ix
+        set $proc_printile += $proc_decentile
+      end
       set $proc_ix++
     end
     printf "---\n",
@@ -2479,15 +2489,19 @@ document etp-port-info
 %---------------------------------------------------------------------------
 end
 
-
 define etp-ports
   if (!erts_initialized)
     printf "No ports, since system isn't initialized!\n"
   else
     set $port_ix = 0
-    while $port_ix < erts_port.r.o.max
-      set $port = (Port *) *((UWord *) &erts_port.r.o.tab[$port_ix])
-      if ($port != ((Port *) 0) && $port != &erts_invalid_port)
+    set $port_max_ix = erts_port.r.o.max
+    set $port_tab = erts_port.r.o.tab
+    set $invalid_port = &erts_invalid_port
+    set $port_decentile = $port_max_ix / 10
+    set $port_printile = $port_decentile
+    while $port_ix < $port_max_ix
+      set $port = (Port *) *((UWord *) ($port_tab + $port_ix))
+      if ($port != ((Port *) 0) && $port != $invalid_port)
         if (*(((Uint32 *) &(((Port *) $port)->state))) & 0x100) == 0
           # I.e, not free
           printf "---\n"
@@ -2495,6 +2509,10 @@ define etp-ports
           etp-port-info $port
         end
       end
+      if $port_ix == $port_printile
+        printf "--- %d%% (%d / %d) searched\n", $port_printile / $port_decentile * 10, $port_ix, $port_max_ix
+        set $port_printile += $port_decentile
+      end
       set $port_ix++
     end
     printf "---\n",
@@ -2810,10 +2828,10 @@ define etp-run-queue-info-internal
   else
     if ($sched_type == 1)
       printf "\n--- Dirty CPU Run Queue ---\n"
-      set $runq = &erts_aligned_run_queues[-1].runq
+      set $runq = &erts_aligned_run_queues[erts_no_run_queues].runq
     else
       printf "\n--- Dirty I/O Run Queue ---\n"
-      set $runq = &erts_aligned_run_queues[-2].runq
+      set $runq = &erts_aligned_run_queues[erts_no_run_queues+1].runq
     end
   end
   printf "  Length: total=%d", *((Uint32 *) &($runq->len))
diff --git a/erts/preloaded/ebin/erlang.beam b/erts/preloaded/ebin/erlang.beam
index 980df873ca..63518ed6e1 100644
--- a/erts/preloaded/ebin/erlang.beam
+++ b/erts/preloaded/ebin/erlang.beam
diff --git a/erts/preloaded/src/erlang.erl b/erts/preloaded/src/erlang.erl
index fcb27ef575..72dd804412 100644
--- a/erts/preloaded/src/erlang.erl
+++ b/erts/preloaded/src/erlang.erl
@@ -2306,6 +2306,8 @@ spawn_opt(_Tuple) ->
 
 -spec statistics(active_tasks) -> [ActiveTasks] when
       ActiveTasks :: non_neg_integer();
+		(active_tasks_all) -> [ActiveTasks] when
+      ActiveTasks :: non_neg_integer();
 		(context_switches) -> {ContextSwitches,0} when
       ContextSwitches :: non_neg_integer();
                 (exact_reductions) -> {Total_Exact_Reductions,
@@ -2335,6 +2337,8 @@ spawn_opt(_Tuple) ->
                 (run_queue) -> non_neg_integer();
                 (run_queue_lengths) -> [RunQueueLength] when
       RunQueueLength :: non_neg_integer();
+                (run_queue_lengths_all) -> [RunQueueLength] when
+      RunQueueLength :: non_neg_integer();
                 (runtime) -> {Total_Run_Time, Time_Since_Last_Call} when
       Total_Run_Time :: non_neg_integer(),
       Time_Since_Last_Call :: non_neg_integer();
@@ -2347,9 +2351,13 @@ spawn_opt(_Tuple) ->
       ActiveTime  :: non_neg_integer(),
       TotalTime   :: non_neg_integer();
 		(total_active_tasks) -> ActiveTasks when
+      ActiveTasks :: non_neg_integer(); 
+		(total_active_tasks_all) -> ActiveTasks when
       ActiveTasks :: non_neg_integer();
                 (total_run_queue_lengths) -> TotalRunQueueLengths when
       TotalRunQueueLengths :: non_neg_integer();
+                (total_run_queue_lengths_all) -> TotalRunQueueLengths when
+      TotalRunQueueLengths :: non_neg_integer();
                 (wall_clock) -> {Total_Wallclock_Time,
                                  Wallclock_Time_Since_Last_Call} when
       Total_Wallclock_Time :: non_neg_integer(),