From f9459092940943876dff040ee997515b96fd5d50 Mon Sep 17 00:00:00 2001
From: Rickard Green <rickard@erlang.org>
Date: Wed, 4 Jan 2017 18:10:26 +0100
Subject: Scheduler wall time support for dirty schedulers

---
 erts/doc/src/erlang.xml                 |  97 +++++++--
 erts/emulator/beam/atom.names           |   3 +-
 erts/emulator/beam/beam_debug.c         |  13 +-
 erts/emulator/beam/bif.c                |   2 +-
 erts/emulator/beam/erl_bif_info.c       |   7 +-
 erts/emulator/beam/erl_process.c        | 366 ++++++++++++++++++++++++++------
 erts/emulator/beam/erl_process.h        |   9 +-
 erts/emulator/test/statistics_SUITE.erl | 131 ++++++++++--
 erts/preloaded/ebin/erlang.beam         | Bin 105976 -> 106168 bytes
 erts/preloaded/src/erlang.erl           |   5 +
 10 files changed, 535 insertions(+), 98 deletions(-)

(limited to 'erts')
diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml
index b3fab3874b..1f64d7be86 100644
--- a/erts/doc/src/erlang.xml
+++ b/erts/doc/src/erlang.xml
@@ -6415,12 +6415,17 @@ lists:map(
           <c><anno>TotalTime</anno></c> is the total time duration since
           <seealso marker="#system_flag_scheduler_wall_time">
           <c>scheduler_wall_time</c></seealso>
-          activation. The time unit is undefined and can be subject
-          to change between releases, OSs, and system restarts.
-          <c>scheduler_wall_time</c> is only to be used to
-          calculate relative values for scheduler-utilization.
-          <c><anno>ActiveTime</anno></c> can never exceed
-          <c><anno>TotalTime</anno></c>.</p>
+          activation for the specific scheduler. Note that
+	  activation time can differ significantly between
+	  schedulers. Currently dirty schedulers are activated
+	  at system start while normal schedulers are activated
+	  some time after the <c>scheduler_wall_time</c>
+	  functionality is enabled. The time unit is undefined
+	  and can be subject to change between releases, OSs,
+	  and system restarts. <c>scheduler_wall_time</c> is only
+	  to be used to calculate relative values for scheduler
+	  utilization. <c><anno>ActiveTime</anno></c> can never
+	  exceed <c><anno>TotalTime</anno></c>.</p>
         <p>The definition of a busy scheduler is when it is not idle
           and is not scheduling (selecting) a process or port,
           that is:</p>
@@ -6438,15 +6443,37 @@ lists:map(
           <c>scheduler_wall_time</c></seealso> is turned off.</p>
         <p>The list of scheduler information is unsorted and can
           appear in different order between calls.</p>
+	<p>As of ERTS version 9.0, also dirty CPU schedulers will
+	be included in the result. That is, all scheduler threads
+	that are expected to handle CPU bound work. If you also
+	want information about dirty I/O schedulers, use
+	<seealso marker="#statistics_scheduler_wall_time_all"><c>statistics(scheduler_wall_time_all)</c></seealso>
+	instead.</p>
+	
+	<p>Normal schedulers will have scheduler identifiers in
+	the range <c>1 =&lt; <anno>SchedulerId</anno> =&lt;
+	</c><seealso marker="#system_info_schedulers"><c>erlang:system_info(schedulers)</c></seealso>.
+	Dirty CPU schedulers will have scheduler identifiers in
+	the range <c>erlang:system_info(schedulers) &lt;
+	<anno>SchedulerId</anno> =&lt; erlang:system_info(schedulers)
+	+
+	</c><seealso marker="#system_info_dirty_cpu_schedulers"><c>erlang:system_info(dirty_cpu_schedulers)</c></seealso>.
+	</p>
+	<note><p>The different types of schedulers handle
+	specific types of jobs. Every job is assigned to a specific
+	scheduler type. Jobs can migrate between different schedulers
+	of the same type, but never between schedulers of different
+	types. This fact has to be taken under consideration when
+	evaluating the result returned.</p></note>
         <p>Using <c>scheduler_wall_time</c> to calculate
-          scheduler-utilization:</p>
+          scheduler utilization:</p>
         <pre>
 > <input>erlang:system_flag(scheduler_wall_time, true).</input>
 false
 > <input>Ts0 = lists:sort(erlang:statistics(scheduler_wall_time)), ok.</input>
 ok</pre>
         <p>Some time later the user takes another snapshot and calculates
-          scheduler-utilization per scheduler, for example:</p>
+          scheduler utilization per scheduler, for example:</p>
         <pre>
 > <input>Ts1 = lists:sort(erlang:statistics(scheduler_wall_time)), ok.</input>
 ok
@@ -6461,11 +6488,32 @@ ok
  {7,0.973237033077876},
  {8,0.9741297293248656}]</pre>
         <p>Using the same snapshots to calculate a total
-          scheduler-utilization:</p>
+          scheduler utilization:</p>
         <pre>
 > <input>{A, T} = lists:foldl(fun({{_, A0, T0}, {_, A1, T1}}, {Ai,Ti}) ->
-	{Ai + (A1 - A0), Ti + (T1 - T0)} end, {0, 0}, lists:zip(Ts0,Ts1)), A/T.</input>
+	{Ai + (A1 - A0), Ti + (T1 - T0)} end, {0, 0}, lists:zip(Ts0,Ts1)),
+	TotalSchedulerUtilization = A/T.</input>
+0.9769136803764825</pre>
+        <p>Total scheduler utilization will equal <c>1.0</c> when
+	all schedulers have been active all the time between the
+	two measurements.</p>
+	<p>Another (probably more) useful value is to calculate
+	total scheduler utilization weighted against maximum amount
+	of available CPU time:</p>
+        <pre>
+> <input>WeightedSchedulerUtilization = (TotalSchedulerUtilization
+                                  * (erlang:system_info(schedulers)
+                                     + erlang:system_info(dirty_cpu_schedulers)))
+                                 / erlang:system_info(logical_processors_available).</input>
 0.9769136803764825</pre>
+        <p>This weighted scheduler utilization will reach <c>1.0</c>
+	when schedulers are active the same amount of time as
+	maximum available CPU time. If more schedulers exist
+	than available logical processors, this value may
+	be greater than <c>1.0</c>.</p>
+	<p>As of ERTS version 9.0, the Erlang runtime system
+	with SMP support will as default have more schedulers
+	than logical processors. This due to the dirty schedulers.</p>
         <note>
           <p><c>scheduler_wall_time</c> is by default disabled. To
             enable it, use
@@ -6476,6 +6524,31 @@ ok
 
     <func>
       <name name="statistics" arity="1" clause_i="12"/>
+      <fsummary>Information about each schedulers work time.</fsummary>
+      <desc>
+        <marker id="statistics_scheduler_wall_time_all"></marker>
+	<p>The same as
+	<seealso marker="#statistics_scheduler_wall_time"><c>statistics(scheduler_wall_time)</c></seealso>,
+	except that it also include information about all dirty I/O
+	schedulers.</p>
+	<p>Dirty IO schedulers will have scheduler identifiers in
+	the range
+	<seealso marker="#system_info_schedulers"><c>erlang:system_info(schedulers)</c></seealso><c>
+	+
+	</c><seealso marker="#system_info_dirty_cpu_schedulers"><c>erlang:system_info(dirty_cpu_schedulers)</c></seealso><c> &lt;
+	<anno>SchedulerId</anno> =&lt; erlang:system_info(schedulers)
+	+ erlang:system_info(dirty_cpu_schedulers)
+	+
+	</c><seealso marker="#system_info_dirty_io_schedulers"><c>erlang:system_info(dirty_io_schedulers)</c></seealso>.</p>
+	<note><p>Note that work executing on dirty I/O schedulers
+	are expected to mainly wait for I/O. That is, when you
+	get high scheduler utilization on dirty I/O schedulers,
+	CPU utilization is <em>not</em> expected to be high due to
+	this work.</p></note>
+      </desc>
+    </func>
+    <func>
+      <name name="statistics" arity="1" clause_i="13"/>
       <fsummary>Information about active processes and ports.</fsummary>
       <desc><marker id="statistics_total_active_tasks"></marker>
         <p>Returns the total amount of active processes and ports in
@@ -6495,7 +6568,7 @@ ok
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="13"/>
+      <name name="statistics" arity="1" clause_i="14"/>
       <fsummary>Information about the run-queue lengths.</fsummary>
       <desc><marker id="statistics_total_run_queue_lengths"></marker>
         <p>Returns the total length of the run queues. That is, the number
@@ -6515,7 +6588,7 @@ ok
     </func>
 
     <func>
-      <name name="statistics" arity="1" clause_i="14"/>
+      <name name="statistics" arity="1" clause_i="15"/>
       <fsummary>Information about wall clock.</fsummary>
       <desc>
         <p>Returns information about wall clock. <c>wall_clock</c> can
diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names
index 7df350116a..dd0a42b5ba 100644
--- a/erts/emulator/beam/atom.names
+++ b/erts/emulator/beam/atom.names
@@ -574,6 +574,8 @@ atom safe
 atom save_calls
 atom scheduler 
 atom scheduler_id
+atom scheduler_wall_time
+atom scheduler_wall_time_all
 atom schedulers_online
 atom scheme
 atom scientific
@@ -678,7 +680,6 @@ atom waiting
 atom wall_clock
 atom warning
 atom warning_msg
-atom scheduler_wall_time
 atom wordsize
 atom write_concurrency
 atom xor
diff --git a/erts/emulator/beam/beam_debug.c b/erts/emulator/beam/beam_debug.c
index 3ffbb0364c..8326d348af 100644
--- a/erts/emulator/beam/beam_debug.c
+++ b/erts/emulator/beam/beam_debug.c
@@ -994,13 +994,20 @@ dirty_test(Process *c_p, Eterm type, Eterm arg1, Eterm arg2, UWord *I)
 	Eterm *hp, *hp2;
 	Uint sz;
 	int i;
+	ErtsSchedulerData *esdp = erts_proc_sched_data(c_p);
+        int dirty_io = esdp->type == ERTS_SCHED_DIRTY_IO;
+
 	if (ERTS_PROC_IS_EXITING(real_c_p))
 	    goto badarg;
 	dirty_send_message(c_p, arg2, am_alive);
-	/* Wait until dead */
 
-	while (!ERTS_PROC_IS_EXITING(real_c_p))
-	    erts_thr_yield();
+	/* Wait until dead */
+	while (!ERTS_PROC_IS_EXITING(real_c_p)) {
+            if (dirty_io)
+                ms_wait(c_p, make_small(100), 0);
+            else
+                erts_thr_yield();
+        }
 
 	ms_wait(c_p, make_small(1000), 0);
 
diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c
index a5b66db35c..65c370c55b 100644
--- a/erts/emulator/beam/bif.c
+++ b/erts/emulator/beam/bif.c
@@ -4601,7 +4601,7 @@ BIF_RETTYPE system_flag_2(BIF_ALIST_2)
 	    erts_aint32_t new = BIF_ARG_2 == am_true ? 1 : 0;
 	    erts_aint32_t old = erts_smp_atomic32_xchg_nob(&sched_wall_time,
 							   new);
-	    Eterm ref = erts_sched_wall_time_request(BIF_P, 1, new);
+	    Eterm ref = erts_sched_wall_time_request(BIF_P, 1, new, 0, 0);
 	    ASSERT(is_value(ref));
 	    BIF_TRAP2(await_sched_wall_time_mod_trap,
 		      BIF_P,
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index 5f2122b969..5ee19aead8 100644
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -3385,7 +3385,12 @@ BIF_RETTYPE statistics_1(BIF_ALIST_1)
     Eterm* hp;
 
     if (BIF_ARG_1 == am_scheduler_wall_time) {
-	res = erts_sched_wall_time_request(BIF_P, 0, 0);
+	res = erts_sched_wall_time_request(BIF_P, 0, 0, 1, 0);
+	if (is_non_value(res))
+	    BIF_RET(am_undefined);
+	BIF_TRAP1(gather_sched_wall_time_res_trap, BIF_P, res);
+    } else if (BIF_ARG_1 == am_scheduler_wall_time_all) {
+	res = erts_sched_wall_time_request(BIF_P, 0, 0, 1, 1);
 	if (is_non_value(res))
 	    BIF_RET(am_undefined);
 	BIF_TRAP1(gather_sched_wall_time_res_trap, BIF_P, res);
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index 084ecb14ec..8c9aa45a81 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -804,15 +804,28 @@ erts_late_init_process(void)
 
 }
 
+#define ERTS_SCHED_WTIME_IDLE ~((Uint64) 0)
+
 static void
-init_sched_wall_time(ErtsSchedWallTime *swtp)
+init_sched_wall_time(ErtsSchedulerData *esdp, Uint64 time_stamp)
 {
-    swtp->need = erts_sched_balance_util;
-    swtp->enabled = 0;
-    swtp->start = 0;
-    swtp->working.total = 0;
-    swtp->working.start = 0;
-    swtp->working.currently = 0;
+#ifdef ERTS_DIRTY_SCHEDULERS
+    if (esdp->type != ERTS_SCHED_NORMAL) {
+        erts_atomic32_init_nob(&esdp->sched_wall_time.u.mod, 0);
+        esdp->sched_wall_time.enabled = 1;
+        esdp->sched_wall_time.start = time_stamp;
+        esdp->sched_wall_time.working.total = 0;
+        esdp->sched_wall_time.working.start = ERTS_SCHED_WTIME_IDLE;
+    }
+    else
+#endif
+    {
+        esdp->sched_wall_time.u.need = erts_sched_balance_util;
+        esdp->sched_wall_time.enabled = 0;
+        esdp->sched_wall_time.start = 0;
+        esdp->sched_wall_time.working.total = 0;
+        esdp->sched_wall_time.working.start = 0;
+    }
 }
 
 static ERTS_INLINE Uint64
@@ -1003,31 +1016,145 @@ init_runq_sched_util(ErtsRunQueueSchedUtil *rqsu, int enabled)
 
 #endif /* ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT */
 
-static ERTS_INLINE void
+#ifdef ERTS_DIRTY_SCHEDULERS
+
+typedef struct {
+    Uint64 working;
+    Uint64 total;
+} ErtsDirtySchedWallTime;
+
+static void
+read_dirty_sched_wall_time(ErtsSchedulerData *esdp, ErtsDirtySchedWallTime *info)
+{
+    erts_aint32_t mod1;
+    Uint64 working, start, ts;
+
+    mod1 = erts_atomic32_read_nob(&esdp->sched_wall_time.u.mod);
+
+    while (1) {
+        erts_aint32_t mod2;
+
+        /* Spin until values are not written... */
+        while (1) {
+            if ((mod1 & 1) == 0)
+                break;
+            ERTS_SPIN_BODY;
+            mod1 = erts_atomic32_read_nob(&esdp->sched_wall_time.u.mod);
+        }
+
+        ERTS_THR_READ_MEMORY_BARRIER;
+
+        working = esdp->sched_wall_time.working.total;
+        start = esdp->sched_wall_time.working.start;
+
+        ERTS_THR_READ_MEMORY_BARRIER;
+
+        mod2 = erts_atomic32_read_nob(&esdp->sched_wall_time.u.mod);
+        if (mod1 == mod2)
+            break;
+        mod1 = mod2;
+    }
+
+    ts = sched_wall_time_ts();
+    ts -= esdp->sched_wall_time.start;
+
+    info->total = ts;
+
+    if (start == ERTS_SCHED_WTIME_IDLE || ts < start)
+        info->working = working;
+    else
+        info->working = working + (ts - start);
+
+    if (info->working > info->total)
+        info->working = info->total;
+}
+
+#endif
+
+#ifdef ERTS_SMP
+
+static void
+dirty_sched_wall_time_change(ErtsSchedulerData *esdp, int working)
+{
+    erts_aint32_t mod;
+    Uint64 ts = sched_wall_time_ts();
+
+    ts -= esdp->sched_wall_time.start;
+
+    /*
+     * This thread is the only thread writing in
+     * this sched_wall_time struct. We set 'mod' to
+     * an odd value while writing...
+     */
+    mod = erts_atomic32_read_dirty(&esdp->sched_wall_time.u.mod);
+    ASSERT((mod & 1) == 0);
+    mod++;
+
+    erts_atomic32_set_nob(&esdp->sched_wall_time.u.mod, mod);
+    ERTS_THR_WRITE_MEMORY_BARRIER;
+
+    if (working) {
+        ASSERT(esdp->sched_wall_time.working.start
+               == ERTS_SCHED_WTIME_IDLE);
+
+        esdp->sched_wall_time.working.start = ts;
+
+    }
+    else {
+        Uint64 total;
+
+        ASSERT(esdp->sched_wall_time.working.start
+               != ERTS_SCHED_WTIME_IDLE);
+
+        total = esdp->sched_wall_time.working.total;
+        total += ts - esdp->sched_wall_time.working.start;
+
+        esdp->sched_wall_time.working.total = total;
+        esdp->sched_wall_time.working.start = ERTS_SCHED_WTIME_IDLE;
+
+
+    }
+
+    ERTS_THR_WRITE_MEMORY_BARRIER;
+    mod++;
+    erts_atomic32_set_nob(&esdp->sched_wall_time.u.mod, mod);
+
+#if 0
+    if (!working) {
+        ERTS_MSACC_SET_STATE_M_X(ERTS_MSACC_STATE_BUSY_WAIT);
+    } else {
+        ERTS_MSACC_SET_STATE_M_X(ERTS_MSACC_STATE_OTHER);
+    }
+#endif
+}
+
+#endif /* ERTS_SMP */
+
+static void
 sched_wall_time_change(ErtsSchedulerData *esdp, int working)
 {
-    if (esdp->sched_wall_time.need) {
+    if (esdp->sched_wall_time.u.need) {
 	Uint64 ts = sched_wall_time_ts();
 #if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
-	update_avg_sched_util(esdp, ts, working);
+        update_avg_sched_util(esdp, ts, working);
 #endif
 	if (esdp->sched_wall_time.enabled) {
 	    if (working) {
-#ifdef DEBUG
-		ASSERT(!esdp->sched_wall_time.working.currently);
-		esdp->sched_wall_time.working.currently = 1;
-#endif
+                ASSERT(esdp->sched_wall_time.working.start
+                       == ERTS_SCHED_WTIME_IDLE);
 		ts -= esdp->sched_wall_time.start;
 		esdp->sched_wall_time.working.start = ts;
 	    }
 	    else {
-#ifdef DEBUG
-		ASSERT(esdp->sched_wall_time.working.currently);
-		esdp->sched_wall_time.working.currently = 0;
-#endif
+                ASSERT(esdp->sched_wall_time.working.start
+                       != ERTS_SCHED_WTIME_IDLE);
 		ts -= esdp->sched_wall_time.start;
 		ts -= esdp->sched_wall_time.working.start;
 		esdp->sched_wall_time.working.total += ts;
+#ifdef DEBUG
+                esdp->sched_wall_time.working.start
+                    = ERTS_SCHED_WTIME_IDLE;
+#endif
 	    }
 	}
     }
@@ -1047,6 +1174,10 @@ typedef struct {
     Eterm ref_heap[REF_THING_SIZE];
     Uint req_sched;
     erts_smp_atomic32_t refc;
+#ifdef ERTS_DIRTY_SCHEDULERS
+    int want_dirty_cpu;
+    int want_dirty_io;
+#endif
 } ErtsSchedWallTimeReq;
 
 typedef struct {
@@ -1068,6 +1199,7 @@ ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(screq,
 				 5,
 				 ERTS_ALC_T_SYS_CHECK_REQ)
 
+
 static void
 reply_sched_wall_time(void *vswtrp)
 {
@@ -1091,23 +1223,23 @@ reply_sched_wall_time(void *vswtrp)
 #endif
     if (swtrp->set) {
 	if (!swtrp->enable && esdp->sched_wall_time.enabled) {
-	    esdp->sched_wall_time.need = erts_sched_balance_util;
+	    esdp->sched_wall_time.u.need = erts_sched_balance_util;
 	    esdp->sched_wall_time.enabled = 0;
 	}
 	else if (swtrp->enable && !esdp->sched_wall_time.enabled) {
 	    Uint64 ts = sched_wall_time_ts();
-	    esdp->sched_wall_time.need = 1;
+	    esdp->sched_wall_time.u.need = 1;
 	    esdp->sched_wall_time.enabled = 1;
 	    esdp->sched_wall_time.start = ts;
 	    esdp->sched_wall_time.working.total = 0;
 	    esdp->sched_wall_time.working.start = 0;
-	    esdp->sched_wall_time.working.currently = 1;
 	}
     }
 
     if (esdp->sched_wall_time.enabled) {
 	Uint64 ts = sched_wall_time_ts();
-	ASSERT(esdp->sched_wall_time.working.currently);
+        ASSERT(esdp->sched_wall_time.working.start
+               != ERTS_SCHED_WTIME_IDLE);
 	ts -= esdp->sched_wall_time.start;
 	total = ts;
 	ts -= esdp->sched_wall_time.working.start;
@@ -1118,30 +1250,117 @@ reply_sched_wall_time(void *vswtrp)
     hpp = NULL;
     szp = &sz;
 
-    while (1) {
-	if (hpp)
-	    ref_copy = STORE_NC(hpp, ohp, swtrp->ref);
-	else
-	    *szp += REF_THING_SIZE;
+#ifdef ERTS_DIRTY_SCHEDULERS
+    if (esdp->sched_wall_time.enabled
+        && swtrp->req_sched == esdp->no
+        && (swtrp->want_dirty_cpu || swtrp->want_dirty_io)) {
+        /* Reply with info about this scheduler and all dirty schedulers... */
+        ErtsDirtySchedWallTime *dswt;
+        int ix, no_dirty_scheds, want_dcpu, want_dio, soffset;
+
+        want_dcpu = swtrp->want_dirty_cpu;
+        want_dio = swtrp->want_dirty_io;
+
+        no_dirty_scheds = 0;
+        if (want_dcpu)
+            no_dirty_scheds += erts_no_dirty_cpu_schedulers;
+        if (want_dio)
+            no_dirty_scheds += erts_no_dirty_io_schedulers;
+
+        ASSERT(no_dirty_scheds);
+
+        dswt = erts_alloc(ERTS_ALC_T_TMP,
+                          sizeof(ErtsDirtySchedWallTime)
+                          * no_dirty_scheds);
+
+        for (ix = 0; ix < no_dirty_scheds; ix++) {
+            ErtsSchedulerData *esdp;
+            if (want_dcpu && ix < erts_no_dirty_cpu_schedulers)
+                esdp = &erts_aligned_dirty_cpu_scheduler_data[ix].esd;
+            else {
+                int dio_ix = ix - erts_no_dirty_cpu_schedulers;
+                esdp = &erts_aligned_dirty_io_scheduler_data[dio_ix].esd;
+            }
+            read_dirty_sched_wall_time(esdp, &dswt[ix]);
+        }
 
-	if (swtrp->set)
-	    msg = ref_copy;
-	else {
-	    msg = (!esdp->sched_wall_time.enabled
-		   ? am_notsup
-		   : erts_bld_tuple(hpp, szp, 3,
-				    make_small(esdp->no),
-				    erts_bld_uint64(hpp, szp, working),
-				    erts_bld_uint64(hpp, szp, total)));
+        soffset = erts_no_schedulers + 1;
 
-	    msg = erts_bld_tuple(hpp, szp, 2, ref_copy, msg);
-	}
-	if (hpp)
-	    break;
+        if (!want_dcpu) {
+            ASSERT(want_dio);
+            soffset += erts_no_dirty_cpu_schedulers;
+        }
+
+        while (1) {
+            if (hpp)
+                ref_copy = STORE_NC(hpp, ohp, swtrp->ref);
+            else
+                *szp += REF_THING_SIZE;
+
+            ASSERT(!swtrp->set);
+
+            /* info about dirty schedulers... */
+            msg = NIL;
+            for (ix = no_dirty_scheds-1; ix >= 0; ix--) {
+                msg = erts_bld_cons(hpp, szp,
+                                    erts_bld_tuple(hpp, szp, 3,
+                                                   make_small(ix+soffset),
+                                                   erts_bld_uint64(hpp, szp,
+                                                                   dswt[ix].working),
+                                                   erts_bld_uint64(hpp, szp,
+                                                                   dswt[ix].total)),
+                                    msg);
+            }
+            /* info about this scheduler... */
+            msg = erts_bld_cons(hpp, szp,
+                                erts_bld_tuple(hpp, szp, 3,
+                                               make_small(esdp->no),
+                                               erts_bld_uint64(hpp, szp, working),
+                                               erts_bld_uint64(hpp, szp, total)),
+                                msg);
+
+            msg = erts_bld_tuple(hpp, szp, 2, ref_copy, msg);
+
+            if (hpp)
+                break;
+
+            mp = erts_alloc_message_heap(rp, &rp_locks, sz, &hp, &ohp);
+            szp = NULL;
+            hpp = &hp;
+        }
 
-	mp = erts_alloc_message_heap(rp, &rp_locks, sz, &hp, &ohp);
-	szp = NULL;
-	hpp = &hp;
+        erts_free(ERTS_ALC_T_TMP, dswt);
+    }
+    else
+#endif
+    {
+        /* Reply with info about this scheduler only... */
+
+        while (1) {
+            if (hpp)
+                ref_copy = STORE_NC(hpp, ohp, swtrp->ref);
+            else
+                *szp += REF_THING_SIZE;
+
+            if (swtrp->set)
+                msg = ref_copy;
+            else {
+                msg = (!esdp->sched_wall_time.enabled
+                       ? am_undefined
+                       : erts_bld_tuple(hpp, szp, 3,
+                                        make_small(esdp->no),
+                                        erts_bld_uint64(hpp, szp, working),
+                                        erts_bld_uint64(hpp, szp, total)));
+
+                msg = erts_bld_tuple(hpp, szp, 2, ref_copy, msg);
+            }
+            if (hpp)
+                break;
+
+            mp = erts_alloc_message_heap(rp, &rp_locks, sz, &hp, &ohp);
+            szp = NULL;
+            hpp = &hp;
+        }
     }
 
     erts_queue_message(rp, rp_locks, mp, msg, am_system);
@@ -1159,7 +1378,8 @@ reply_sched_wall_time(void *vswtrp)
 }
 
 Eterm
-erts_sched_wall_time_request(Process *c_p, int set, int enable)
+erts_sched_wall_time_request(Process *c_p, int set, int enable,
+                             int want_dirty_cpu, int want_dirty_io)
 {
     ErtsSchedulerData *esdp = erts_proc_sched_data(c_p);
     Eterm ref;
@@ -1181,6 +1401,10 @@ erts_sched_wall_time_request(Process *c_p, int set, int enable)
     swtrp->proc = c_p;
     swtrp->ref = STORE_NC(&hp, NULL, ref);
     swtrp->req_sched = esdp->no;
+#ifdef ERTS_DIRTY_SCHEDULERS
+    swtrp->want_dirty_cpu = want_dirty_cpu;
+    swtrp->want_dirty_io = want_dirty_io;
+#endif
     erts_smp_atomic32_init_nob(&swtrp->refc,
 			       (erts_aint32_t) erts_no_schedulers);
 
@@ -3106,8 +3330,10 @@ scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq)
 
     tse_wait:
 
-	if (!ERTS_SCHEDULER_IS_DIRTY(esdp) && thr_prgr_active != working)
-	    sched_wall_time_change(esdp, thr_prgr_active);
+	if (ERTS_SCHEDULER_IS_DIRTY(esdp))
+	    dirty_sched_wall_time_change(esdp, working = 0);
+        else if (thr_prgr_active != working)
+	    sched_wall_time_change(esdp, working = thr_prgr_active);
 
 	while (1) {
 	    ErtsMonotonicTime current_time = 0;
@@ -3218,10 +3444,12 @@ scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq)
                                             (ERTS_SSI_FLG_SUSPENDED
                                              | ERTS_SSI_FLG_MSB_EXEC));
 
-	if (!ERTS_SCHEDULER_IS_DIRTY(esdp) && !thr_prgr_active) {
+	if (ERTS_SCHEDULER_IS_DIRTY(esdp))
+	    dirty_sched_wall_time_change(esdp, working = 1);
+        else if (!thr_prgr_active) {
 	    erts_thr_progress_active(esdp, thr_prgr_active = 1);
 	    sched_wall_time_change(esdp, 1);
-	}
+        }
 
 	erts_smp_runq_lock(rq);
 	sched_active(esdp->no, rq);
@@ -5712,7 +5940,8 @@ init_scheduler_data(ErtsSchedulerData* esdp, int num,
 		    ErtsSchedulerSleepInfo* ssi,
 		    ErtsRunQueue* runq,
 		    char** daww_ptr, size_t daww_sz,
-		    Process *shadow_proc)
+		    Process *shadow_proc,
+                    Uint64 time_stamp)
 {
     esdp->timer_wheel = NULL;
 #ifdef ERTS_SMP
@@ -5794,7 +6023,7 @@ init_scheduler_data(ErtsSchedulerData* esdp, int num,
 
     esdp->reductions = 0;
 
-    init_sched_wall_time(&esdp->sched_wall_time);
+    init_sched_wall_time(esdp, time_stamp);
     erts_port_task_handle_init(&esdp->nosuspend_port_task_handle);
 }
 
@@ -6008,17 +6237,18 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online
 
     erts_aligned_scheduler_data = 
 	erts_alloc_permanent_cache_aligned(ERTS_ALC_T_SCHDLR_DATA,
-					   n*sizeof(ErtsAlignedSchedulerData));					   
+					   n*sizeof(ErtsAlignedSchedulerData));
 
     for (ix = 0; ix < n; ix++) {
 	ErtsSchedulerData *esdp = ERTS_SCHEDULER_IX(ix);
 	init_scheduler_data(esdp, ix+1, ERTS_SCHED_SLEEP_INFO_IX(ix),
 			    ERTS_RUNQ_IX(ix), &daww_ptr, daww_sz,
-			    NULL);
+			    NULL, 0);
     }
 
 #ifdef ERTS_DIRTY_SCHEDULERS
     {
+        Uint64 ts = sched_wall_time_ts();
 	int dirty_scheds = no_dirty_cpu_schedulers + no_dirty_io_schedulers;
 	int adspix = 0;
 	ErtsAlignedDirtyShadowProcess *adsp =
@@ -6038,13 +6268,13 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online
 	    ErtsSchedulerData *esdp = ERTS_DIRTY_CPU_SCHEDULER_IX(ix);
 	    init_scheduler_data(esdp, ix+1, ERTS_DIRTY_CPU_SCHED_SLEEP_INFO_IX(ix),
 				ERTS_DIRTY_CPU_RUNQ, NULL, 0,
-				&adsp[adspix++].dsp);
+				&adsp[adspix++].dsp, ts);
 	}
 	for (ix = 0; ix < no_dirty_io_schedulers; ix++) {
 	    ErtsSchedulerData *esdp = ERTS_DIRTY_IO_SCHEDULER_IX(ix);
 	    init_scheduler_data(esdp, ix+1, ERTS_DIRTY_IO_SCHED_SLEEP_INFO_IX(ix),
 				ERTS_DIRTY_IO_RUNQ, NULL, 0,
-				&adsp[adspix++].dsp);
+				&adsp[adspix++].dsp, ts);
 	}
     }
 #endif
@@ -7506,8 +7736,10 @@ suspend_scheduler(ErtsSchedulerData *esdp)
 
 #endif
 
-    if (sched_type != ERTS_SCHED_NORMAL)
+    if (sched_type != ERTS_SCHED_NORMAL) {
 	erts_smp_runq_unlock(esdp->run_queue);
+        dirty_sched_wall_time_change(esdp, 0);
+    }
     else {
         if (no != 1)
             evacuate_run_queue(esdp->run_queue, &sbp);
@@ -7518,8 +7750,8 @@ suspend_scheduler(ErtsSchedulerData *esdp)
 
 	if (erts_system_profile_flags.scheduler)
 	    profile_scheduler(make_small(esdp->no), am_inactive);
-
     }
+
     erts_smp_mtx_lock(&schdlr_sspnd.mtx);
 
     flgs = sched_prep_spin_suspended(ssi, ERTS_SSI_FLG_SUSPENDED);
@@ -7797,15 +8029,17 @@ suspend_scheduler(ErtsSchedulerData *esdp)
 
     ASSERT(curr_online);
 
-    if (sched_type == ERTS_SCHED_NORMAL) {
-	(void) erts_get_monotonic_time(esdp);
-	if (erts_system_profile_flags.scheduler)
-	    profile_scheduler(make_small(esdp->no), am_active);
+    if (sched_type != ERTS_SCHED_NORMAL)
+        dirty_sched_wall_time_change(esdp, 1);
+    else {
+        (void) erts_get_monotonic_time(esdp);
+        if (erts_system_profile_flags.scheduler)
+            profile_scheduler(make_small(esdp->no), am_active);
 
-	if (!thr_prgr_active) {
-	    erts_thr_progress_active(esdp, thr_prgr_active = 1);
-	    sched_wall_time_change(esdp, 1);
-	}
+        if (!thr_prgr_active) {
+            erts_thr_progress_active(esdp, thr_prgr_active = 1);
+            sched_wall_time_change(esdp, 1);
+        }
     }
 
     erts_smp_runq_lock(esdp->run_queue);
@@ -8518,6 +8752,8 @@ sched_dirty_cpu_thread_func(void *vesdp)
     callbacks.wait = NULL;
     callbacks.finalize_wait = NULL;
 
+    dirty_sched_wall_time_change(esdp, 1);
+
     esdp->thr_id += erts_no_schedulers;
 
     erts_msacc_init_thread("dirty_cpu_scheduler", no, 0);
@@ -8565,6 +8801,8 @@ sched_dirty_io_thread_func(void *vesdp)
     callbacks.wait = NULL;
     callbacks.finalize_wait = NULL;
 
+    dirty_sched_wall_time_change(esdp, 1);
+
     esdp->thr_id += erts_no_schedulers + erts_no_dirty_cpu_schedulers;
 
     erts_msacc_init_thread("dirty_io_scheduler", no, 0);
diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h
index fc2055332b..ce0989883c 100644
--- a/erts/emulator/beam/erl_process.h
+++ b/erts/emulator/beam/erl_process.h
@@ -550,13 +550,15 @@ do {								\
 } while (0)
 
 typedef struct {
-    int need; /* "+sbu true" or scheduler_wall_time enabled */
+    union {
+        erts_atomic32_t mod; /* on dirty schedulers */
+        int need; /* "+sbu true" or scheduler_wall_time enabled */
+    } u;
     int enabled;
     Uint64 start;
     struct {
 	Uint64 total;
 	Uint64 start;
-	int currently;
     } working;
 } ErtsSchedWallTime;
 
@@ -1598,7 +1600,8 @@ void erts_init_scheduling(int, int
 void erts_execute_dirty_system_task(Process *c_p);
 #endif
 int erts_set_gc_state(Process *c_p, int enable);
-Eterm erts_sched_wall_time_request(Process *c_p, int set, int enable);
+Eterm erts_sched_wall_time_request(Process *c_p, int set, int enable,
+                                   int dirty_cpu, int want_dirty_io);
 Eterm erts_system_check_request(Process *c_p);
 Eterm erts_gc_info_request(Process *c_p);
 Uint64 erts_get_proc_interval(void);
diff --git a/erts/emulator/test/statistics_SUITE.erl b/erts/emulator/test/statistics_SUITE.erl
index a1f12ba93c..f51244485b 100644
--- a/erts/emulator/test/statistics_SUITE.erl
+++ b/erts/emulator/test/statistics_SUITE.erl
@@ -28,6 +28,8 @@
 	 runtime_update/1, runtime_diff/1,
 	 run_queue_one/1,
 	 scheduler_wall_time/1,
+         scheduler_wall_time_all/1,
+         msb_scheduler_wall_time/1,
 	 reductions/1, reductions_big/1, garbage_collection/1, io/1,
 	 badarg/1, run_queues_lengths_active_tasks/1, msacc/1]).
 
@@ -43,7 +45,9 @@ suite() ->
 
 all() -> 
     [{group, wall_clock}, {group, runtime}, reductions,
-     reductions_big, {group, run_queue}, scheduler_wall_time,
+     reductions_big, {group, run_queue},
+     scheduler_wall_time, scheduler_wall_time_all,
+     msb_scheduler_wall_time,
      garbage_collection, io, badarg,
      run_queues_lengths_active_tasks,
      msacc].
@@ -271,35 +275,64 @@ hog_iter(0, Mon) ->
 
 %% Tests that statistics(scheduler_wall_time) works as intended
 scheduler_wall_time(Config) when is_list(Config) ->
+    scheduler_wall_time_test(scheduler_wall_time).
+
+%% Tests that statistics(scheduler_wall_time_all) works as intended
+scheduler_wall_time_all(Config) when is_list(Config) ->
+    scheduler_wall_time_test(scheduler_wall_time_all).
+
+scheduler_wall_time_test(Type) ->
     %% Should return undefined if system_flag is not turned on yet
-    undefined = statistics(scheduler_wall_time),
+    undefined = statistics(Type),
     %% Turn on statistics
     false = erlang:system_flag(scheduler_wall_time, true),
     try
         Schedulers = erlang:system_info(schedulers_online),
+        DirtyCPUSchedulers = erlang:system_info(dirty_cpu_schedulers_online),
+        DirtyIOSchedulers = erlang:system_info(dirty_io_schedulers),
+        TotLoadSchedulers = case Type of
+                                scheduler_wall_time_all ->
+                                    Schedulers + DirtyCPUSchedulers + DirtyIOSchedulers;
+                                scheduler_wall_time ->
+                                    Schedulers + DirtyCPUSchedulers
+                        end,
+                            
         %% Let testserver and everyone else finish their work
         timer:sleep(1500),
         %% Empty load
-        EmptyLoad = get_load(),
+        EmptyLoad = get_load(Type),
         {false, _} = {lists:any(fun(Load) -> Load > 50 end, EmptyLoad),EmptyLoad},
         MeMySelfAndI = self(),
         StartHog = fun() ->
-                           Pid = spawn(?MODULE, hog, [self()]),
+                           Pid = spawn_link(?MODULE, hog, [self()]),
                            receive hog_started -> MeMySelfAndI ! go end,
                            Pid
                    end,
+        StartDirtyHog = fun(Func) ->
+                                F = fun () ->
+                                            erts_debug:Func(alive_waitexiting,
+                                                            MeMySelfAndI)
+                                    end,
+                                Pid = spawn_link(F),
+                                receive {alive, Pid} -> ok end,
+                                Pid
+                        end,
         P1 = StartHog(),
         %% Max on one, the other schedulers empty (hopefully)
         %% Be generous the process can jump between schedulers
         %% which is ok and we don't want the test to fail for wrong reasons
-        _L1 = [S1Load|EmptyScheds1] = get_load(),
+        _L1 = [S1Load|EmptyScheds1] = get_load(Type),
         {true,_}  = {S1Load > 50,S1Load},
         {false,_} = {lists:any(fun(Load) -> Load > 50 end, EmptyScheds1),EmptyScheds1},
         {true,_}  = {lists:sum(EmptyScheds1) < 60,EmptyScheds1},
 
         %% 50% load
         HalfHogs = [StartHog() || _ <- lists:seq(1, (Schedulers-1) div 2)],
-        HalfLoad = lists:sum(get_load()) div Schedulers,
+        HalfDirtyCPUHogs = [StartDirtyHog(dirty_cpu)
+                            || _ <- lists:seq(1, DirtyCPUSchedulers div 2)],
+        HalfDirtyIOHogs = [StartDirtyHog(dirty_io)
+                           || _ <- lists:seq(1, DirtyIOSchedulers div 2)],
+        HalfLoad = lists:sum(get_load(Type)) div TotLoadSchedulers,
         if Schedulers < 2, HalfLoad > 80 -> ok; %% Ok only one scheduler online and one hog
            %% We want roughly 50% load
            HalfLoad > 40, HalfLoad < 60 -> ok;
@@ -308,23 +341,30 @@ scheduler_wall_time(Config) when is_list(Config) ->
 
         %% 100% load
         LastHogs = [StartHog() || _ <- lists:seq(1, Schedulers div 2)],
-        FullScheds = get_load(),
+        LastDirtyCPUHogs = [StartDirtyHog(dirty_cpu)
+                            || _ <- lists:seq(1, DirtyCPUSchedulers div 2)],
+        LastDirtyIOHogs = [StartDirtyHog(dirty_io)
+                           || _ <- lists:seq(1, DirtyIOSchedulers div 2)],
+        FullScheds = get_load(Type),
         {false,_} = {lists:any(fun(Load) -> Load < 80 end, FullScheds),FullScheds},
-        FullLoad = lists:sum(FullScheds) div Schedulers,
+        FullLoad = lists:sum(FullScheds) div TotLoadSchedulers,
         if FullLoad > 90 -> ok;
            true -> exit({fullload, FullLoad})
         end,
 
 	KillHog = fun (HP) ->
 			  HPM = erlang:monitor(process, HP),
+                          unlink(HP),
 			  exit(HP, kill),
 			  receive
 			      {'DOWN', HPM, process, HP, killed} ->
 				  ok
 			  end
 		  end,
-        [KillHog(Pid) || Pid <- [P1|HalfHogs++LastHogs]],
-        AfterLoad = get_load(),
+        [KillHog(Pid) || Pid <- [P1|HalfHogs++HalfDirtyCPUHogs++HalfDirtyIOHogs
+                                 ++LastHogs++LastDirtyCPUHogs++LastDirtyIOHogs]],
+        receive after 2000 -> ok end, %% Give dirty schedulers time to complete...
+        AfterLoad = get_load(Type),
 	io:format("AfterLoad=~p~n", [AfterLoad]),
         {false,_} = {lists:any(fun(Load) -> Load > 25 end, AfterLoad),AfterLoad},
         true = erlang:system_flag(scheduler_wall_time, false)
@@ -332,16 +372,81 @@ scheduler_wall_time(Config) when is_list(Config) ->
         erlang:system_flag(scheduler_wall_time, false)
     end.
 
-get_load() ->
-    Start = erlang:statistics(scheduler_wall_time),
+get_load(Type) ->
+    Start = erlang:statistics(Type),
     timer:sleep(1500),
-    End = erlang:statistics(scheduler_wall_time),
+    End = erlang:statistics(Type),
     lists:reverse(lists:sort(load_percentage(lists:sort(Start),lists:sort(End)))).
 
 load_percentage([{Id, WN, TN}|Ss], [{Id, WP, TP}|Ps]) ->
     [100*(WN-WP) div (TN-TP)|load_percentage(Ss, Ps)];
 load_percentage([], []) -> [].
 
+count(0) ->
+    ok;
+count(N) ->
+    count(N-1).
+
+msb_swt_hog(true) ->
+    count(1000000),
+    erts_debug:dirty_cpu(wait, 10),
+    erts_debug:dirty_io(wait, 10),
+    msb_swt_hog(true);
+msb_swt_hog(false) ->
+    count(1000000),
+    msb_swt_hog(false).
+
+msb_scheduler_wall_time(Config) ->
+    erlang:system_flag(scheduler_wall_time, true),
+    Dirty = erlang:system_info(dirty_cpu_schedulers) /= 0,
+    Hogs = lists:map(fun (_) ->
+                             spawn_opt(fun () ->
+                                               msb_swt_hog(Dirty)
+                                       end, [{priority,low}, link, monitor])
+                     end, lists:seq(1,10)),
+    erlang:system_flag(multi_scheduling, block),
+    try
+        SWT1 = lists:sort(statistics(scheduler_wall_time_all)),
+        %% io:format("SWT1 = ~p~n", [SWT1]),
+        receive after 4000 -> ok end,
+        SWT2 = lists:sort(statistics(scheduler_wall_time_all)),
+        %% io:format("SWT2 = ~p~n", [SWT2]),
+        SWT = lists:zip(SWT1, SWT2),
+        io:format("SU = ~p~n", [lists:map(fun({{I, A0, T0}, {I, A1, T1}}) ->
+                                                  {I, (A1 - A0)/(T1 - T0)} end,
+                                         SWT)]),
+        {A, T} = lists:foldl(fun({{_, A0, T0}, {_, A1, T1}}, {Ai,Ti}) ->
+                                     {Ai + (A1 - A0), Ti + (T1 - T0)}
+                             end,
+                             {0, 0},
+                             SWT),
+        TSU = A/T,
+        WSU = ((TSU * (erlang:system_info(schedulers)
+                       + erlang:system_info(dirty_cpu_schedulers)
+                       + erlang:system_info(dirty_io_schedulers)))
+               / 1),
+        %% Weighted scheduler utilization should be
+        %% very close to 1.0, i.e., we execute the
+        %% same time as one thread executing all
+        %% the time...
+        io:format("WSU = ~p~n", [WSU]),
+        true = 0.9 < WSU andalso WSU < 1.1,
+        ok
+    after
+        erlang:system_flag(multi_scheduling, unblock),
+        erlang:system_flag(scheduler_wall_time, false),
+        lists:foreach(fun ({HP, _HM}) ->
+                              unlink(HP),
+                              exit(HP, kill)
+                      end, Hogs),
+        lists:foreach(fun ({HP, HM}) ->
+                              receive
+                                  {'DOWN', HM, process, HP, _} ->
+                                      ok
+                              end
+                      end, Hogs),
+        ok
+    end.
 
 %% Tests that statistics(garbage_collection) is callable.
 %% It is not clear how to test anything more.
diff --git a/erts/preloaded/ebin/erlang.beam b/erts/preloaded/ebin/erlang.beam
index 359d28c0aa..0799546c60 100644
Binary files a/erts/preloaded/ebin/erlang.beam and b/erts/preloaded/ebin/erlang.beam differ
diff --git a/erts/preloaded/src/erlang.erl b/erts/preloaded/src/erlang.erl
index 99ef5c6314..86ebb4dd4b 100644
--- a/erts/preloaded/src/erlang.erl
+++ b/erts/preloaded/src/erlang.erl
@@ -2332,6 +2332,10 @@ spawn_opt(_Tuple) ->
                 (scheduler_wall_time) -> [{SchedulerId, ActiveTime, TotalTime}] | undefined when
       SchedulerId :: pos_integer(),
       ActiveTime  :: non_neg_integer(),
+      TotalTime   :: non_neg_integer();
+                (scheduler_wall_time_all) -> [{SchedulerId, ActiveTime, TotalTime}] | undefined when
+      SchedulerId :: pos_integer(),
+      ActiveTime  :: non_neg_integer(),
       TotalTime   :: non_neg_integer();
 		(total_active_tasks) -> ActiveTasks when
       ActiveTasks :: non_neg_integer();
@@ -4014,6 +4018,7 @@ sched_wall_time(Ref, N, undefined) ->
 sched_wall_time(Ref, N, Acc) ->
     receive
 	{Ref, undefined} -> sched_wall_time(Ref, N-1, undefined);
+	{Ref, SWTL} when erlang:is_list(SWTL) -> sched_wall_time(Ref, N-1, Acc ++ SWTL);
 	{Ref, SWT} -> sched_wall_time(Ref, N-1, [SWT|Acc])
     end.
 
-- 
cgit v1.2.3