-rw-r--r--  erts/doc/src/erl.xml             |  27
-rw-r--r--  erts/emulator/beam/erl_init.c    |  26
-rw-r--r--  erts/emulator/beam/erl_process.c | 424
-rw-r--r--  erts/emulator/beam/erl_process.h | 104
-rw-r--r--  erts/etc/common/erlexec.c        |   1
5 files changed, 525 insertions(+), 57 deletions(-)
diff --git a/erts/doc/src/erl.xml b/erts/doc/src/erl.xml
index bf0d132955..2aa247e293 100644
--- a/erts/doc/src/erl.xml
+++ b/erts/doc/src/erl.xml
@@ -941,6 +941,10 @@
when schedulers frequently run out of work. When disabled,
the frequency with which schedulers run out of work will
not be taken into account by the load balancing logic.
+ <br/>&nbsp;&nbsp;<c>+scl false</c> is similar to
+ <seealso marker="#+sub">+sub true</seealso> with the difference
+ that <c>+sub true</c> will also balance scheduler utilization
+ between schedulers.
</p>
</item>
<tag><marker id="+sct"><c>+sct CpuTopology</c></marker></tag>
@@ -1087,6 +1091,29 @@
documentation of the <seealso marker="#+sbt">+sbt</seealso> flag.
</p>
</item>
+ <tag><marker id="+sub"><c>+sub true|false</c></marker></tag>
+ <item>
+ <p>Enable or disable
+ <seealso marker="erts:erlang#statistics_scheduler_wall_time">scheduler
+ utilization</seealso> balancing of load. By default scheduler
+ utilization balancing is disabled and scheduler compaction of
+ load is instead enabled, which strives for a load distribution
+ that keeps as many scheduler threads as possible fully loaded
+ (i.e., not running out of work). When scheduler utilization
+ balancing is enabled, the system instead tries to balance
+ scheduler utilization between schedulers, that is, to achieve
+ equal scheduler utilization on all schedulers.
+ <br/>&nbsp;&nbsp;&nbsp;<c>+sub true</c> is only supported on
+ systems where the runtime system detects and uses a monotonically
+ increasing high-resolution clock. On other systems, the runtime
+ system will fail to start.
+ <br/>&nbsp;&nbsp;&nbsp;<c>+sub true</c> implies
+ <seealso marker="#+scl">+scl false</seealso>. The difference
+ between <c>+sub true</c> and <c>+scl false</c> is that
+ <c>+scl false</c> will not try to balance the scheduler
+ utilization.
+ </p>
+ </item>
<tag><marker id="+swct"><c>+sws very_eager|eager|medium|lazy|very_lazy</c></marker></tag>
<item>
<p>
diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c
index 1af80dd04b..1d4f617746 100644
--- a/erts/emulator/beam/erl_init.c
+++ b/erts/emulator/beam/erl_init.c
@@ -537,6 +537,12 @@ void erts_usage(void)
erts_fprintf(stderr, " see the erl(1) documentation for more info.\n");
erts_fprintf(stderr, "-sct cput set cpu topology,\n");
erts_fprintf(stderr, " see the erl(1) documentation for more info.\n");
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT
+ erts_fprintf(stderr, "-sub bool enable/disable scheduler utilization balancing,\n");
+#else
+ erts_fprintf(stderr, "-sub false disable scheduler utilization balancing,\n");
+#endif
+ erts_fprintf(stderr, " see the erl(1) documentation for more info.\n");
erts_fprintf(stderr, "-sws val set scheduler wakeup strategy, valid values are:\n");
erts_fprintf(stderr, " default|legacy.\n");
erts_fprintf(stderr, "-swct val set scheduler wake cleanup threshold, valid values are:\n");
@@ -1512,6 +1518,26 @@ erl_start(int argc, char **argv)
erts_usage();
}
}
+ else if (has_prefix("ub", sub_param)) {
+ arg = get_arg(sub_param+2, argv[i+1], &i);
+ if (sys_strcmp("true", arg) == 0) {
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT
+ erts_sched_balance_util = 1;
+#else
+ erts_fprintf(stderr,
+ "scheduler utilization balancing not "
+ "supported on this system\n");
+ erts_usage();
+#endif
+ }
+ else if (sys_strcmp("false", arg) == 0)
+ erts_sched_balance_util = 0;
+ else {
+ erts_fprintf(stderr, "bad scheduler utilization balancing "
+ "value '%s'\n", arg);
+ erts_usage();
+ }
+ }
else if (has_prefix("wct", sub_param)) {
arg = get_arg(sub_param+3, argv[i+1], &i);
if (erts_sched_set_wake_cleanup_threshold(arg) != 0) {
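The new "ub" branch follows the pattern of the surrounding +s* handlers: the value given with +sub must be "true" or "false", and "true" is rejected at startup on builds without scheduler-utilization-balancing support. A minimal standalone sketch of that decision logic, with the ERTS error handling replaced by plain return codes (the names below are illustrative, not ERTS APIs):

    #include <stdio.h>
    #include <string.h>

    /* Stands in for ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT. */
    #define HAVE_UTIL_BALANCING_SUPPORT 1

    /* Parse the value given with "+sub"; on success store the flag in
     * *balance_util and return 0, otherwise print a diagnostic and
     * return -1 (the real code calls erts_usage() instead). */
    static int parse_sub_flag(const char *arg, int *balance_util)
    {
        if (strcmp(arg, "true") == 0) {
            if (!HAVE_UTIL_BALANCING_SUPPORT) {
                fprintf(stderr, "scheduler utilization balancing not "
                        "supported on this system\n");
                return -1;
            }
            *balance_util = 1;
            return 0;
        }
        if (strcmp(arg, "false") == 0) {
            *balance_util = 0;
            return 0;
        }
        fprintf(stderr, "bad scheduler utilization balancing value '%s'\n", arg);
        return -1;
    }

    int main(void)
    {
        int balance_util = 0;
        if (parse_sub_flag("true", &balance_util) == 0)
            printf("erts_sched_balance_util would become %d\n", balance_util);
        return 0;
    }
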
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index 21fd8dd50a..6143c5fa3a 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -144,6 +144,7 @@ extern BeamInstr beam_exit[];
extern BeamInstr beam_continue_exit[];
int erts_sched_compact_load;
+int erts_sched_balance_util = 0;
Uint erts_no_schedulers;
#define ERTS_THR_PRGR_LATER_CLEANUP_OP_THRESHOLD_VERY_LAZY (4*1024*1024)
@@ -608,6 +609,7 @@ erts_late_init_process(void)
static void
init_sched_wall_time(ErtsSchedWallTime *swtp)
{
+ swtp->need = erts_sched_balance_util;
swtp->enabled = 0;
swtp->start = 0;
swtp->working.total = 0;
@@ -630,27 +632,253 @@ sched_wall_time_ts(void)
#endif
}
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+
+#ifdef ARCH_64
+
+static ERTS_INLINE Uint64
+aschedtime_read(ErtsAtomicSchedTime *var)
+{
+ return (Uint64) erts_atomic_read_nob((erts_atomic_t *) var);
+}
+
+static ERTS_INLINE void
+aschedtime_set(ErtsAtomicSchedTime *var, Uint64 val)
+{
+ erts_atomic_set_nob((erts_atomic_t *) var, (erts_aint_t) val);
+}
+
+static ERTS_INLINE void
+aschedtime_init(ErtsAtomicSchedTime *var)
+{
+ erts_atomic_init_nob((erts_atomic_t *) var, (erts_aint_t) 0);
+}
+
+#elif defined(ARCH_32)
+
+static ERTS_INLINE Uint64
+aschedtime_read(ErtsAtomicSchedTime *var)
+{
+ erts_dw_aint_t dw;
+ erts_dw_atomic_read_nob((erts_dw_atomic_t *) var, &dw);
+#ifdef ETHR_SU_DW_NAINT_T__
+ return (Uint64) dw.dw_sint;
+#else
+ {
+ Uint64 res;
+ res = (Uint64) ((Uint32) dw.sint[ERTS_DW_AINT_HIGH_WORD]);
+ res <<= 32;
+ res |= (Uint64) ((Uint32) dw.sint[ERTS_DW_AINT_LOW_WORD]);
+ return res;
+ }
+#endif
+}
+
+static ERTS_INLINE void
+aschedtime_set(ErtsAtomicSchedTime *var, Uint64 val)
+{
+ erts_dw_aint_t dw;
+#ifdef ETHR_SU_DW_NAINT_T__
+ dw.dw_sint = (ETHR_SU_DW_NAINT_T__) val;
+#else
+ dw.sint[ERTS_DW_AINT_LOW_WORD] = (erts_aint_t) (val & 0xffffffff);
+ dw.sint[ERTS_DW_AINT_HIGH_WORD] = (erts_aint_t) ((val >> 32) & 0xffffffff);
+#endif
+ erts_dw_atomic_set_nob((erts_dw_atomic_t *) var, &dw);
+}
+
+static ERTS_INLINE void
+aschedtime_init(ErtsAtomicSchedTime *var)
+{
+ erts_dw_aint_t dw;
+ dw.sint[ERTS_DW_AINT_LOW_WORD] = (erts_aint_t) 0;
+ dw.sint[ERTS_DW_AINT_HIGH_WORD] = (erts_aint_t) 0;
+ erts_dw_atomic_init_nob((erts_dw_atomic_t *) var, &dw);
+}
+
+#else
+# error "Unsupported machine word size"
+#endif
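On 32-bit builds a 64-bit scheduler timestamp cannot be handled with a single-word atomic, so it is stored in a double-word atomic and split into low and high 32-bit halves. A self-contained sketch of that split using plain stdint types instead of the ERTS double-word atomics (so only the packing is shown here, not the atomicity):

    #include <assert.h>
    #include <stdint.h>

    /* Pack a 64-bit timestamp into two 32-bit words, low word first,
     * mirroring the ERTS_DW_AINT_LOW_WORD/HIGH_WORD split above. */
    static void pack_u64(uint64_t val, uint32_t out[2])
    {
        out[0] = (uint32_t) (val & 0xffffffff);         /* low word  */
        out[1] = (uint32_t) ((val >> 32) & 0xffffffff); /* high word */
    }

    /* Reassemble the 64-bit value from the two words. */
    static uint64_t unpack_u64(const uint32_t in[2])
    {
        uint64_t res = (uint64_t) in[1];
        res <<= 32;
        res |= (uint64_t) in[0];
        return res;
    }

    int main(void)
    {
        uint32_t w[2];
        uint64_t ts = 1234567890123ULL;
        pack_u64(ts, w);
        assert(unpack_u64(w) == ts);
        return 0;
    }

In the patch the two words are read and written through erts_dw_atomic_read_nob()/erts_dw_atomic_set_nob(), so both halves always move as one atomic unit.
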
+
+#define ERTS_GET_AVG_MAX_UNLOCKED_TRY 50
+#define ERTS_SCHED_AVG_UTIL_WRITE_MARKER (~((Uint64) 0))
+
+/* Intervals in nanoseconds */
+#define ERTS_SCHED_UTIL_SHORT_INTERVAL ((Uint64) 1*1000*1000*1000)
+#define ERTS_SCHED_UTIL_LONG_INTERVAL ((Uint64) 10*1000*1000*1000)
+
+
+#define ERTS_SCHED_UTIL_IGNORE_IMBALANCE_DIFF 5000 /* ppm */
+
+static ERTS_INLINE Uint64
+calc_sched_worktime(int is_working, Uint64 now, Uint64 last,
+ Uint64 interval, Uint64 old_worktime)
+{
+ Uint64 worktime;
+ Uint64 new;
+
+ if (now <= last)
+ return old_worktime;
+
+ new = now - last;
+
+ if (new >= interval)
+ return is_working ? interval : (Uint64) 0;
+
+
+ /*
+ * Division by 1000 in order to avoid
+ * overflow. If changed update assertions
+ * in init_runq_sched_util().
+ */
+ worktime = old_worktime;
+ worktime *= (interval - new)/1000;
+ worktime /= (interval/1000);
+ if (is_working)
+ worktime += new;
+
+ ASSERT(0 <= worktime && worktime <= interval);
+
+ return worktime;
+}
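calc_sched_worktime() maintains a decaying amount of work observed within a sliding interval: the previously accumulated worktime is scaled by the fraction of the interval that has not yet elapsed, and the newly elapsed time is added when the scheduler was working. A rough standalone sketch of the same arithmetic, including the division by 1000 that keeps the intermediate product within 64 bits (illustrative only, not the ERTS code itself):

    #include <stdint.h>
    #include <stdio.h>

    #define SHORT_INTERVAL ((uint64_t) 1*1000*1000*1000)  /* 1 s in ns */

    /* Decay old_worktime over the time elapsed since 'last', then add
     * the elapsed time if the scheduler was working during it. */
    static uint64_t calc_worktime(int is_working, uint64_t now, uint64_t last,
                                  uint64_t interval, uint64_t old_worktime)
    {
        uint64_t elapsed, worktime;

        if (now <= last)
            return old_worktime;

        elapsed = now - last;
        if (elapsed >= interval)              /* whole window replaced */
            return is_working ? interval : 0;

        /* old_worktime <= interval <= 10^10 ns, so scaling the interval
         * terms down by 1000 keeps the product around 10^17, far below
         * 2^64; this is what the DEBUG asserts in init_runq_sched_util()
         * guard against. */
        worktime = old_worktime;
        worktime *= (interval - elapsed) / 1000;
        worktime /= interval / 1000;
        if (is_working)
            worktime += elapsed;
        return worktime;
    }

    int main(void)
    {
        /* 600 ms of work carried over, 250 ms just elapsed while working:
         * 600 ms * 0.75 + 250 ms = 700 ms within the 1 s window. */
        uint64_t wt = calc_worktime(1, 1250*1000*1000ULL, 1000*1000*1000ULL,
                                    SHORT_INTERVAL, 600*1000*1000ULL);
        printf("worktime = %llu ns\n", (unsigned long long) wt);
        return 0;
    }
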
+
+static ERTS_INLINE void
+update_avg_sched_util(ErtsSchedulerData *esdp, Uint64 now, int is_working)
+{
+ ErtsRunQueue *rq;
+ int worked;
+ Uint64 swt, lwt, last;
+
+ rq = esdp->run_queue;
+ last = aschedtime_read(&rq->sched_util.last);
+
+ if (now <= last) {
+ ASSERT(last == ERTS_SCHED_AVG_UTIL_WRITE_MARKER);
+ return;
+ }
+
+ ASSERT(now >= last);
+
+ worked = rq->sched_util.is_working;
+
+ swt = calc_sched_worktime(worked, now, last, ERTS_SCHED_UTIL_SHORT_INTERVAL,
+ rq->sched_util.worktime.short_interval);
+ lwt = calc_sched_worktime(worked, now, last, ERTS_SCHED_UTIL_LONG_INTERVAL,
+ rq->sched_util.worktime.long_interval);
+
+ aschedtime_set(&rq->sched_util.last, ERTS_SCHED_AVG_UTIL_WRITE_MARKER);
+ ERTS_THR_WRITE_MEMORY_BARRIER;
+ rq->sched_util.is_working = is_working;
+ rq->sched_util.worktime.short_interval = swt;
+ rq->sched_util.worktime.long_interval = lwt;
+ ERTS_THR_WRITE_MEMORY_BARRIER;
+ aschedtime_set(&rq->sched_util.last, now);
+}
+
+int
+erts_get_sched_util(ErtsRunQueue *rq, int initially_locked, int short_interval)
+{
+ /* Average scheduler utilization in ppm */
+ int util, is_working, try = 0, locked = initially_locked;
+ Uint64 worktime, old_worktime, now, last, interval, *old_worktimep;
+
+ if (short_interval) {
+ old_worktimep = &rq->sched_util.worktime.short_interval;
+ interval = ERTS_SCHED_UTIL_SHORT_INTERVAL;
+ }
+ else {
+ old_worktimep = &rq->sched_util.worktime.long_interval;
+ interval = ERTS_SCHED_UTIL_LONG_INTERVAL;
+ }
+
+ while (1) {
+ Uint64 chk_last;
+ last = aschedtime_read(&rq->sched_util.last);
+ ERTS_THR_READ_MEMORY_BARRIER;
+ is_working = rq->sched_util.is_working;
+ old_worktime = *old_worktimep;
+ ERTS_THR_READ_MEMORY_BARRIER;
+ chk_last = aschedtime_read(&rq->sched_util.last);
+ if (chk_last == last)
+ break;
+ if (!locked) {
+ if (++try >= ERTS_GET_AVG_MAX_UNLOCKED_TRY) {
+ /* Writer will eventually block on runq-lock */
+ erts_smp_runq_lock(rq);
+ locked = 1;
+ }
+ }
+ }
+
+ if (!initially_locked && locked)
+ erts_smp_runq_unlock(rq);
+
+ now = sched_wall_time_ts();
+ worktime = calc_sched_worktime(is_working, now, last, interval, old_worktime);
+
+ util = (int) ((worktime * 1000000)/interval);
+
+ ASSERT(0 <= util && util <= 1000000);
+
+ return util;
+}
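erts_get_sched_util() reads the utilization record optimistically: it samples 'last', reads the fields, samples 'last' again, and retries if the two samples differ; a writer first overwrites 'last' with ERTS_SCHED_AVG_UTIL_WRITE_MARKER, so a read that overlaps an update is detected. After ERTS_GET_AVG_MAX_UNLOCKED_TRY failed rounds the reader takes the run-queue lock, which the writer will eventually block on. Below is a hedged, self-contained sketch of the same seqlock-style pattern using C11 atomics and a pthread mutex in place of the ERTS primitives; it also re-checks the marker explicitly and keeps the writer under the lock, so treat it as an illustration of the idea rather than a copy of the implementation:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define WRITE_MARKER UINT64_MAX   /* "update in progress" */
    #define MAX_UNLOCKED_TRY 50

    struct util_rec {
        _Atomic uint64_t last;        /* timestamp of last update or WRITE_MARKER */
        _Atomic uint64_t worktime;    /* data protected by the protocol */
        _Atomic int is_working;
        pthread_mutex_t lock;         /* stands in for the run-queue lock */
    };

    /* Writer: mark the record, update the data, then publish the new
     * timestamp.  The seq_cst fences keep the three phases ordered; the
     * real code uses lighter write barriers for the same purpose. */
    static void writer_update(struct util_rec *r, uint64_t now,
                              uint64_t new_worktime, int is_working)
    {
        pthread_mutex_lock(&r->lock);
        atomic_store_explicit(&r->last, WRITE_MARKER, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        atomic_store_explicit(&r->worktime, new_worktime, memory_order_relaxed);
        atomic_store_explicit(&r->is_working, is_working, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        atomic_store_explicit(&r->last, now, memory_order_relaxed);
        pthread_mutex_unlock(&r->lock);
    }

    /* Reader: loop until two reads of 'last' agree and no write marker
     * is seen; after too many rounds, grab the lock so a busy writer
     * cannot starve us. */
    static void reader_snapshot(struct util_rec *r, uint64_t *worktime,
                                int *is_working, uint64_t *last)
    {
        int tries = 0, locked = 0;
        for (;;) {
            uint64_t l1 = atomic_load_explicit(&r->last, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst);
            *worktime = atomic_load_explicit(&r->worktime, memory_order_relaxed);
            *is_working = atomic_load_explicit(&r->is_working, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst);
            uint64_t l2 = atomic_load_explicit(&r->last, memory_order_relaxed);
            if (l1 == l2 && l1 != WRITE_MARKER) {
                *last = l1;
                break;
            }
            if (!locked && ++tries >= MAX_UNLOCKED_TRY) {
                pthread_mutex_lock(&r->lock);
                locked = 1;
            }
        }
        if (locked)
            pthread_mutex_unlock(&r->lock);
    }

    int main(void)
    {
        struct util_rec r = { .last = 0, .worktime = 0, .is_working = 0,
                              .lock = PTHREAD_MUTEX_INITIALIZER };
        uint64_t wt, last;
        int working;

        writer_update(&r, 1000, 250, 1);
        reader_snapshot(&r, &wt, &working, &last);
        printf("worktime=%llu working=%d last=%llu\n",
               (unsigned long long) wt, working, (unsigned long long) last);
        return 0;
    }

The value returned by erts_get_sched_util() is then simply the snapshotted worktime scaled to parts per million of the interval (worktime*1000000/interval), e.g. 700 ms of work within a 1 s window gives 700000 ppm.
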
+
+static void
+init_runq_sched_util(ErtsRunQueueSchedUtil *rqsu, int enabled)
+{
+ aschedtime_init(&rqsu->last);
+ if (!enabled)
+ aschedtime_set(&rqsu->last, ERTS_SCHED_AVG_UTIL_WRITE_MARKER);
+ rqsu->is_working = 0;
+ rqsu->worktime.short_interval = (Uint64) 0;
+ rqsu->worktime.long_interval = (Uint64) 0;
+
+#ifdef DEBUG
+ {
+ Uint64 intrvl;
+ /*
+ * If one of these asserts fails, we may get
+ * overflow in calc_sched_worktime(). That has
+ * to be fixed either by shrinking the interval
+ * size or by fixing the worktime calculation
+ * in calc_sched_worktime().
+ */
+ intrvl = ERTS_SCHED_UTIL_SHORT_INTERVAL;
+ ASSERT(intrvl*(intrvl/1000) > intrvl);
+ intrvl = ERTS_SCHED_UTIL_LONG_INTERVAL;
+ ASSERT(intrvl*(intrvl/1000) > intrvl);
+ }
+#endif
+}
+
+#endif /* ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT */
+
static ERTS_INLINE void
sched_wall_time_change(ErtsSchedulerData *esdp, int working)
{
- if (esdp->sched_wall_time.enabled) {
+ if (esdp->sched_wall_time.need) {
Uint64 ts = sched_wall_time_ts();
- if (working) {
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ update_avg_sched_util(esdp, ts, working);
+#endif
+ if (esdp->sched_wall_time.enabled) {
+ if (working) {
#ifdef DEBUG
- ASSERT(!esdp->sched_wall_time.working.currently);
- esdp->sched_wall_time.working.currently = 1;
+ ASSERT(!esdp->sched_wall_time.working.currently);
+ esdp->sched_wall_time.working.currently = 1;
#endif
- ts -= esdp->sched_wall_time.start;
- esdp->sched_wall_time.working.start = ts;
- }
- else {
+ ts -= esdp->sched_wall_time.start;
+ esdp->sched_wall_time.working.start = ts;
+ }
+ else {
#ifdef DEBUG
- ASSERT(esdp->sched_wall_time.working.currently);
- esdp->sched_wall_time.working.currently = 0;
+ ASSERT(esdp->sched_wall_time.working.currently);
+ esdp->sched_wall_time.working.currently = 0;
#endif
- ts -= esdp->sched_wall_time.start;
- ts -= esdp->sched_wall_time.working.start;
- esdp->sched_wall_time.working.total += ts;
+ ts -= esdp->sched_wall_time.start;
+ ts -= esdp->sched_wall_time.working.start;
+ esdp->sched_wall_time.working.total += ts;
+ }
}
}
}
@@ -705,10 +933,13 @@ reply_sched_wall_time(void *vswtrp)
ASSERT(esdp);
if (swtrp->set) {
- if (!swtrp->enable && esdp->sched_wall_time.enabled)
+ if (!swtrp->enable && esdp->sched_wall_time.enabled) {
+ esdp->sched_wall_time.need = erts_sched_balance_util;
esdp->sched_wall_time.enabled = 0;
+ }
else if (swtrp->enable && !esdp->sched_wall_time.enabled) {
Uint64 ts = sched_wall_time_ts();
+ esdp->sched_wall_time.need = 1;
esdp->sched_wall_time.enabled = 1;
esdp->sched_wall_time.start = ts;
esdp->sched_wall_time.working.total = 0;
@@ -2084,9 +2315,8 @@ ongoing_multi_scheduling_block(void)
}
static ERTS_INLINE void
-empty_runq(ErtsRunQueue *rq)
+empty_runq_aux(ErtsRunQueue *rq, Uint32 old_flags)
{
- Uint32 old_flags = ERTS_RUNQ_FLGS_UNSET(rq, ERTS_RUNQ_FLG_NONEMPTY|ERTS_RUNQ_FLG_PROTECTED);
if (old_flags & ERTS_RUNQ_FLG_NONEMPTY) {
#ifdef DEBUG
erts_aint32_t empty = erts_smp_atomic32_read_nob(&no_empty_run_queues);
@@ -2107,6 +2337,23 @@ empty_runq(ErtsRunQueue *rq)
}
static ERTS_INLINE void
+empty_runq(ErtsRunQueue *rq)
+{
+ Uint32 old_flags = ERTS_RUNQ_FLGS_UNSET(rq, ERTS_RUNQ_FLG_NONEMPTY|ERTS_RUNQ_FLG_PROTECTED);
+ empty_runq_aux(rq, old_flags);
+}
+
+static ERTS_INLINE Uint32
+empty_protected_runq(ErtsRunQueue *rq)
+{
+ Uint32 old_flags = ERTS_RUNQ_FLGS_BSET(rq,
+ ERTS_RUNQ_FLG_NONEMPTY|ERTS_RUNQ_FLG_PROTECTED,
+ ERTS_RUNQ_FLG_PROTECTED);
+ empty_runq_aux(rq, old_flags);
+ return old_flags;
+}
+
+static ERTS_INLINE void
non_empty_runq(ErtsRunQueue *rq)
{
Uint32 old_flags = ERTS_RUNQ_FLGS_SET(rq, ERTS_RUNQ_FLG_NONEMPTY);
@@ -2130,6 +2377,18 @@ non_empty_runq(ErtsRunQueue *rq)
}
}
+void
+erts_empty_runq(ErtsRunQueue *rq)
+{
+ empty_runq(rq);
+}
+
+void
+erts_non_empty_runq(ErtsRunQueue *rq)
+{
+ non_empty_runq(rq);
+}
+
static erts_aint32_t
sched_prep_spin_wait(ErtsSchedulerSleepInfo *ssi)
{
@@ -2632,7 +2891,7 @@ ssi_flags_set_wake(ErtsSchedulerSleepInfo *ssi)
}
static void
-wake_scheduler(ErtsRunQueue *rq, int incq)
+wake_scheduler(ErtsRunQueue *rq)
{
ErtsSchedulerSleepInfo *ssi;
erts_aint32_t flgs;
@@ -2651,9 +2910,6 @@ wake_scheduler(ErtsRunQueue *rq, int incq)
flgs = ssi_flags_set_wake(ssi);
erts_sched_finish_poke(ssi, flgs);
-
- if (incq && (flgs & ERTS_SSI_FLG_WAITING))
- non_empty_runq(rq);
}
#define ERTS_NO_USED_RUNQS_SHIFT 16
@@ -2744,7 +3000,7 @@ chk_wake_sched(ErtsRunQueue *crq, int ix, int activate)
if (try_inc_no_active_runqs(ix+1))
(void) ERTS_RUNQ_FLGS_UNSET(wrq, ERTS_RUNQ_FLG_INACTIVE);
}
- wake_scheduler(wrq, 0);
+ wake_scheduler(wrq);
return 1;
}
return 0;
@@ -2792,7 +3048,7 @@ smp_notify_inc_runq(ErtsRunQueue *runq)
{
#ifdef ERTS_SMP
if (runq)
- wake_scheduler(runq, 1);
+ wake_scheduler(runq);
#endif
}
@@ -2810,7 +3066,7 @@ erts_sched_notify_check_cpu_bind(void)
for (ix = 0; ix < erts_no_run_queues; ix++) {
ErtsRunQueue *rq = ERTS_RUNQ_IX(ix);
(void) ERTS_RUNQ_FLGS_SET(rq, ERTS_RUNQ_FLG_CHK_CPU_BIND);
- wake_scheduler(rq, 0);
+ wake_scheduler(rq);
}
#else
erts_sched_check_cpu_bind(erts_get_scheduler_data());
@@ -2938,6 +3194,11 @@ check_immigration_need(ErtsRunQueue *c_rq, ErtsMigrationPath *mp, int prio)
if (!f_rq)
return NULL;
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (mp->sched_util)
+ return NULL;
+#endif
+
f_rq_flags = ERTS_RUNQ_FLGS_GET(f_rq);
if (f_rq_flags & ERTS_RUNQ_FLG_PROTECTED)
return NULL;
@@ -3077,7 +3338,7 @@ suspend_run_queue(ErtsRunQueue *rq)
ERTS_SSI_FLG_SUSPENDED);
(void) ERTS_RUNQ_FLGS_SET(rq, ERTS_RUNQ_FLG_SUSPENDED);
- wake_scheduler(rq, 0);
+ wake_scheduler(rq);
}
static void scheduler_ix_resume_wake(Uint ix);
@@ -3169,6 +3430,9 @@ evacuate_run_queue(ErtsRunQueue *rq,
to_rq->misc.start = start;
to_rq->misc.end = end;
+
+ non_empty_runq(to_rq);
+
erts_smp_runq_unlock(to_rq);
smp_notify_inc_runq(to_rq);
erts_smp_runq_lock(to_rq);
@@ -3381,7 +3645,7 @@ try_steal_task(ErtsRunQueue *rq)
Uint32 flags;
/* Protect jobs we steal from getting stolen from us... */
- flags = ERTS_RUNQ_FLGS_SET(rq, ERTS_RUNQ_FLG_PROTECTED);
+ flags = empty_protected_runq(rq);
if (flags & ERTS_RUNQ_FLG_SUSPENDED)
return 0; /* go suspend instead... */
@@ -3460,6 +3724,9 @@ typedef struct {
int full_reds_history_change;
int oowc;
int max_len;
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ int sched_util;
+#endif
} ErtsRunQueueBalance;
static ErtsRunQueueBalance *run_queue_info;
@@ -3623,6 +3890,9 @@ check_balance(ErtsRunQueue *c_rq)
Sint64 scheds_reds, full_scheds_reds;
int forced, active, current_active, oowc, half_full_scheds, full_scheds,
mmax_len, blnc_no_rqs, qix, pix, freds_hist_ix;
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ int sched_util_balancing;
+#endif
if (erts_smp_atomic32_xchg_nob(&balance_info.checking_balance, 1)) {
c_rq->check_balance_reds = INT_MAX;
@@ -3678,6 +3948,10 @@ check_balance(ErtsRunQueue *c_rq)
return;
}
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ sched_util_balancing = 0;
+#endif
+
freds_hist_ix = balance_info.full_reds_history_index;
balance_info.full_reds_history_index++;
if (balance_info.full_reds_history_index >= ERTS_FULL_REDS_HISTORY_SIZE)
@@ -3708,7 +3982,12 @@ check_balance(ErtsRunQueue *c_rq)
run_queue_info[qix].oowc = rq->out_of_work_count;
run_queue_info[qix].max_len = rq->max_len;
rq->check_balance_reds = INT_MAX;
-
+
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (erts_sched_balance_util)
+ run_queue_info[qix].sched_util = erts_get_sched_util(rq, 1, 0);
+#endif
+
erts_smp_runq_unlock(rq);
}
@@ -3778,8 +4057,38 @@ check_balance(ErtsRunQueue *c_rq)
mmax_len = run_queue_info[qix].max_len;
}
- if (!erts_sched_compact_load)
+ if (!erts_sched_compact_load) {
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (erts_sched_balance_util && full_scheds < blnc_no_rqs) {
+ int avg_util = 0;
+
+ for (qix = 0; qix < blnc_no_rqs; qix++)
+ avg_util += run_queue_info[qix].sched_util;
+
+ avg_util /= blnc_no_rqs; /* in ppm */
+
+ sched_util_balancing = 1;
+ /*
+ * In order to avoid renaming a large number of fields
+ * we write utilization values instead of length values
+ * in the 'max_len' and 'migration_limit' fields...
+ */
+ for (qix = 0; qix < blnc_no_rqs; qix++) {
+ run_queue_info[qix].flags = 0; /* Reset for later use... */
+ for (pix = 0; pix < ERTS_NO_PRIO_LEVELS; pix++) {
+ run_queue_info[qix].prio[pix].emigrate_to = -1;
+ run_queue_info[qix].prio[pix].immigrate_from = -1;
+ run_queue_info[qix].prio[pix].avail = 100;
+ run_queue_info[qix].prio[pix].max_len = run_queue_info[qix].sched_util;
+ run_queue_info[qix].prio[pix].migration_limit = avg_util;
+ }
+ }
+ active = blnc_no_rqs;
+ goto setup_migration_paths;
+ }
+#endif
goto all_active;
+ }
if (!forced && half_full_scheds != blnc_no_rqs) {
int min = 1;
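When utilization balancing is active and not every run queue reports as fully loaded, check_balance() averages the per-queue short-term utilization (in ppm) and then, rather than renaming fields, stores each queue's utilization in max_len and the average in migration_limit; the later migration-path setup works on the difference between the two and ignores differences within ±ERTS_SCHED_UTIL_IGNORE_IMBALANCE_DIFF (5000 ppm). A small worked example of that arithmetic with made-up utilization figures:

    #include <stdio.h>

    #define IGNORE_IMBALANCE_DIFF 5000   /* ppm, as in the patch */

    int main(void)
    {
        /* Hypothetical short-term utilization of four run queues, in ppm. */
        int util[] = { 950000, 650000, 400000, 200000 };
        int n = sizeof(util) / sizeof(util[0]);
        int qix, avg = 0;

        for (qix = 0; qix < n; qix++)
            avg += util[qix];
        avg /= n;                                    /* 550000 ppm here */

        for (qix = 0; qix < n; qix++) {
            /* Corresponds to max_len - migration_limit in the patch. */
            int diff = util[qix] - avg;
            if (-IGNORE_IMBALANCE_DIFF <= diff && diff <= IGNORE_IMBALANCE_DIFF)
                diff = 0;                            /* ignore minor imbalance */
            printf("queue %d: util %7d ppm, diff to avg %+7d%s\n",
                   qix, util[qix], diff,
                   diff > 0 ? " (candidate to emigrate work)"
                            : diff < 0 ? " (candidate to receive work)" : "");
        }
        return 0;
    }
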
@@ -3896,15 +4205,30 @@ check_balance(ErtsRunQueue *c_rq)
}
}
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ setup_migration_paths:
+#endif
+
/* Setup migration paths for all priorities */
for (pix = 0; pix < ERTS_NO_PRIO_LEVELS; pix++) {
int low = 0, high = 0;
for (qix = 0; qix < blnc_no_rqs; qix++) {
int len_diff = run_queue_info[qix].prio[pix].max_len;
len_diff -= run_queue_info[qix].prio[pix].migration_limit;
+
#ifdef DBG_PRINT
if (pix == 2) erts_fprintf(stderr, "%d ", len_diff);
#endif
+
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (sched_util_balancing
+ && -ERTS_SCHED_UTIL_IGNORE_IMBALANCE_DIFF <= len_diff
+ && len_diff <= ERTS_SCHED_UTIL_IGNORE_IMBALANCE_DIFF) {
+ /* ignore minor imbalance */
+ len_diff = 0;
+ }
+#endif
+
run_queue_compare[qix].qix = qix;
run_queue_compare[qix].len = len_diff;
if (len_diff != 0) {
@@ -4031,6 +4355,9 @@ erts_fprintf(stderr, "--------------------------------\n");
Uint32 flags = run_queue_info[qix].flags;
ErtsMigrationPath *mp = &new_mpaths->mpath[qix];
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ mp->sched_util = sched_util_balancing;
+#endif
mp->flags = flags;
mp->misc_evac_runq = NULL;
@@ -4628,6 +4955,11 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online)
set_wakeup_other_data();
#endif
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (erts_sched_balance_util)
+ erts_sched_compact_load = 0;
+#endif
+
ASSERT(no_schedulers_online <= no_schedulers);
ASSERT(no_schedulers_online >= 1);
ASSERT(no_schedulers >= 1);
@@ -4696,6 +5028,11 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online)
rq->ports.info.reds = 0;
rq->ports.start = NULL;
rq->ports.end = NULL;
+
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ init_runq_sched_util(&rq->sched_util, erts_sched_balance_util);
+#endif
+
}
#ifdef ERTS_SMP
@@ -4794,6 +5131,7 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online)
esdp->reductions = 0;
init_sched_wall_time(&esdp->sched_wall_time);
+
erts_port_task_handle_init(&esdp->nosuspend_port_task_handle);
}
@@ -5761,7 +6099,7 @@ erts_set_schedulers_online(Process *p,
for (ix = no; ix < online; ix++) {
ErtsRunQueue *rq = ERTS_RUNQ_IX(ix);
- wake_scheduler(rq, 0);
+ wake_scheduler(rq);
}
}
}
@@ -5860,7 +6198,7 @@ erts_block_multi_scheduling(Process *p, ErtsProcLocks plocks, int on, int all)
for (ix = 1; ix < online; ix++) {
ErtsRunQueue *rq = ERTS_RUNQ_IX(ix);
- wake_scheduler(rq, 0);
+ wake_scheduler(rq);
}
if (erts_smp_atomic32_read_nob(&schdlr_sspnd.active)
@@ -7294,7 +7632,7 @@ Process *schedule(Process *p, int calls)
continue_check_activities_to_run:
flags = ERTS_RUNQ_FLGS_GET_NOB(rq);
continue_check_activities_to_run_known_flags:
-
+ ASSERT(flags & ERTS_RUNQ_FLG_NONEMPTY);
if (flags & (ERTS_RUNQ_FLG_CHK_CPU_BIND|ERTS_RUNQ_FLG_SUSPENDED)) {
@@ -7346,20 +7684,16 @@ Process *schedule(Process *p, int calls)
rq->wakeup_other = 0;
rq->wakeup_other_reds = 0;
- empty_runq(rq);
-
flags = ERTS_RUNQ_FLGS_GET_NOB(rq);
- if (flags & ERTS_RUNQ_FLG_SUSPENDED) {
- non_empty_runq(rq);
+ if (flags & ERTS_RUNQ_FLG_SUSPENDED)
goto continue_check_activities_to_run_known_flags;
- }
- else if (!(flags & ERTS_RUNQ_FLG_INACTIVE)) {
- if (try_steal_task(rq)) {
- non_empty_runq(rq);
+ if (flags & ERTS_RUNQ_FLG_INACTIVE)
+ empty_runq(rq);
+ else {
+ if (try_steal_task(rq))
goto continue_check_activities_to_run;
- }
- (void) ERTS_RUNQ_FLGS_UNSET(rq, ERTS_RUNQ_FLG_PROTECTED);
+ empty_runq(rq);
/*
* Check for ERTS_RUNQ_FLG_SUSPENDED has to be done
@@ -7371,7 +7705,6 @@ Process *schedule(Process *p, int calls)
goto continue_check_activities_to_run_known_flags;
}
}
-
#endif
scheduler_wait(&fcalls, esdp, rq);
@@ -8486,6 +8819,10 @@ erts_schedule_misc_op(void (*func)(void *), void *arg)
rq->misc.start = molp;
rq->misc.end = molp;
+#ifdef ERTS_SMP
+ non_empty_runq(rq);
+#endif
+
erts_smp_runq_unlock(rq);
smp_notify_inc_runq(rq);
@@ -9372,8 +9709,11 @@ save_pending_exiter(Process *p)
erts_proclist_store_last(&rq->procs.pending_exiters, plp);
+ non_empty_runq(rq);
+
erts_smp_runq_unlock(rq);
- wake_scheduler(rq, 1);
+
+ wake_scheduler(rq);
}
#endif
diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h
index 043621125c..c2ef55f2d5 100644
--- a/erts/emulator/beam/erl_process.h
+++ b/erts/emulator/beam/erl_process.h
@@ -70,6 +70,9 @@ typedef struct process Process;
struct ErtsNodesMonitor_;
+#define ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT 0
+#define ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT 0
+
#define ERTS_MAX_NO_OF_SCHEDULERS 1024
#define ERTS_DEFAULT_MAX_PROCESSES (1 << 18)
@@ -98,6 +101,7 @@ struct saved_calls {
extern Export exp_send, exp_receive, exp_timeout;
extern int erts_sched_compact_load;
+extern int erts_sched_balance_util;
extern Uint erts_no_schedulers;
extern Uint erts_no_run_queues;
extern int erts_sched_thread_suggested_stack_size;
@@ -198,6 +202,10 @@ extern int erts_sched_thread_suggested_stack_size;
#define ERTS_RUNQ_FLGS_SET(RQ, FLGS) \
((Uint32) erts_smp_atomic32_read_bor_relb(&(RQ)->flags, \
(erts_aint32_t) (FLGS)))
+#define ERTS_RUNQ_FLGS_BSET(RQ, MSK, FLGS) \
+ ((Uint32) erts_smp_atomic32_read_bset_relb(&(RQ)->flags, \
+ (erts_aint32_t) (MSK), \
+ (erts_aint32_t) (FLGS)))
#define ERTS_RUNQ_FLGS_UNSET(RQ, FLGS) \
((Uint32) erts_smp_atomic32_read_band_relb(&(RQ)->flags, \
(erts_aint32_t) ~(FLGS)))
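ERTS_RUNQ_FLGS_BSET() atomically replaces only the flag bits selected by a mask, which empty_protected_runq() uses to clear NONEMPTY and set PROTECTED in one operation while still getting the old flag word back. A hedged sketch of such a masked bit-set built from a compare-and-swap loop with C11 atomics and made-up bit positions (the ERTS version is erts_smp_atomic32_read_bset_relb(); this only illustrates the semantics):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FLG_NONEMPTY  (1u << 0)   /* illustrative bit positions */
    #define FLG_PROTECTED (1u << 1)

    /* Atomically set the bits of *var selected by 'mask' to the
     * corresponding bits of 'bits', leave all other bits untouched,
     * and return the previous value of *var. */
    static uint32_t atomic_read_bset(_Atomic uint32_t *var,
                                     uint32_t mask, uint32_t bits)
    {
        uint32_t old = atomic_load_explicit(var, memory_order_relaxed);
        for (;;) {
            uint32_t desired = (old & ~mask) | (bits & mask);
            if (atomic_compare_exchange_weak_explicit(var, &old, desired,
                                                      memory_order_acq_rel,
                                                      memory_order_relaxed))
                return old;   /* 'old' holds the value we replaced */
        }
    }

    int main(void)
    {
        _Atomic uint32_t flags = FLG_NONEMPTY;   /* NONEMPTY set, PROTECTED clear */
        /* Clear NONEMPTY and set PROTECTED in one atomic step,
         * as empty_protected_runq() does. */
        uint32_t old = atomic_read_bset(&flags,
                                        FLG_NONEMPTY | FLG_PROTECTED,
                                        FLG_PROTECTED);
        printf("old=0x%x new=0x%x\n", (unsigned) old,
               (unsigned) atomic_load(&flags));
        return 0;
    }
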
@@ -316,9 +324,40 @@ typedef struct {
int reds;
} ErtsRunQueueInfo;
+
+#ifdef HAVE_GETHRTIME
+# undef ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT
+# define ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT 1
+#endif
+
#ifdef ERTS_SMP
+#undef ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+#define ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT
+
+#ifdef ARCH_64
+typedef erts_atomic_t ErtsAtomicSchedTime;
+#elif defined(ARCH_32)
+typedef erts_dw_atomic_t ErtsAtomicSchedTime;
+#else
+# error "Unsupported machine word size"
+#endif
+
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+typedef struct {
+ ErtsAtomicSchedTime last;
+ struct {
+ Uint64 short_interval;
+ Uint64 long_interval;
+ } worktime;
+ int is_working;
+} ErtsRunQueueSchedUtil;
+#endif
+
typedef struct {
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ int sched_util;
+#endif
Uint32 flags;
ErtsRunQueue *misc_evac_runq;
struct {
@@ -385,6 +424,9 @@ struct ErtsRunQueue_ {
Port *start;
Port *end;
} ports;
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ ErtsRunQueueSchedUtil sched_util;
+#endif
};
#ifdef ERTS_SMP
@@ -414,6 +456,7 @@ do { \
} while (0)
typedef struct {
+ int need; /* "+sub true" or scheduler_wall_time enabled */
int enabled;
Uint64 start;
struct {
@@ -542,6 +585,12 @@ int erts_smp_lc_runq_is_locked(ErtsRunQueue *);
#ifdef ERTS_INCLUDE_SCHEDULER_INTERNALS
+#ifdef ERTS_SMP
+void erts_empty_runq(ErtsRunQueue *rq);
+void erts_non_empty_runq(ErtsRunQueue *rq);
+#endif
+
+
/*
* Run queue locked during modifications. We use atomic ops since
* other threads peek at values without run queue lock.
@@ -574,6 +623,10 @@ erts_smp_inc_runq_len(ErtsRunQueue *rq, ErtsRunQueueInfo *rqi, int prio)
erts_smp_atomic32_set_relb(&rqi->len, len);
+#ifdef ERTS_SMP
+ if (rq->len == 0)
+ erts_non_empty_runq(rq);
+#endif
rq->len++;
if (rq->max_len < rq->len)
rq->max_len = len;
@@ -1695,6 +1748,13 @@ erts_proc_set_error_handler(Process *p, ErtsProcLocks plocks, Eterm handler)
extern erts_atomic_t erts_migration_paths;
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+int erts_get_sched_util(ErtsRunQueue *rq,
+ int initially_locked,
+ int short_interval);
+#endif
+
+
ERTS_GLB_INLINE ErtsMigrationPaths *erts_get_migration_paths_managed(void);
ERTS_GLB_INLINE ErtsMigrationPaths *erts_get_migration_paths(void);
ERTS_GLB_INLINE ErtsRunQueue *erts_check_emigration_need(ErtsRunQueue *c_rq,
@@ -1746,22 +1806,36 @@ erts_check_emigration_need(ErtsRunQueue *c_rq, int prio)
return mp->prio[prio].runq;
}
-
- if (prio == ERTS_PORT_PRIO_LEVEL)
- len = RUNQ_READ_LEN(&c_rq->ports.info.len);
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (mp->sched_util) {
+ ErtsRunQueue *rq = mp->prio[prio].runq;
+ /* No migration if other is non-empty */
+ if (!(ERTS_RUNQ_FLGS_GET(rq) & ERTS_RUNQ_FLG_NONEMPTY)
+ && erts_get_sched_util(rq, 0, 1) < mp->prio[prio].limit.other
+ && erts_get_sched_util(c_rq, 0, 1) > mp->prio[prio].limit.this) {
+ return rq;
+ }
+ }
else
- len = RUNQ_READ_LEN(&c_rq->procs.prio_info[prio].len);
-
- if (len > mp->prio[prio].limit.this) {
- ErtsRunQueue *n_rq = mp->prio[prio].runq;
- if (n_rq) {
- if (prio == ERTS_PORT_PRIO_LEVEL)
- len = RUNQ_READ_LEN(&n_rq->ports.info.len);
- else
- len = RUNQ_READ_LEN(&n_rq->procs.prio_info[prio].len);
-
- if (len < mp->prio[prio].limit.other)
- return n_rq;
+#endif
+ {
+
+ if (prio == ERTS_PORT_PRIO_LEVEL)
+ len = RUNQ_READ_LEN(&c_rq->ports.info.len);
+ else
+ len = RUNQ_READ_LEN(&c_rq->procs.prio_info[prio].len);
+
+ if (len > mp->prio[prio].limit.this) {
+ ErtsRunQueue *n_rq = mp->prio[prio].runq;
+ if (n_rq) {
+ if (prio == ERTS_PORT_PRIO_LEVEL)
+ len = RUNQ_READ_LEN(&n_rq->ports.info.len);
+ else
+ len = RUNQ_READ_LEN(&n_rq->procs.prio_info[prio].len);
+
+ if (len < mp->prio[prio].limit.other)
+ return n_rq;
+ }
}
}
}
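With mp->sched_util set, erts_check_emigration_need() stops comparing queue lengths and instead emigrates work only when the candidate queue currently looks empty and the short-interval utilizations fall on opposite sides of the limits computed by check_balance(). A tiny sketch of that predicate with made-up numbers (all utilization values in ppm; the helper name is hypothetical):

    #include <stdio.h>

    /* Emigrate from the current queue to the target queue only if the
     * target looks empty right now, its utilization is below its limit,
     * and the current queue's utilization is above its own limit. */
    static int should_emigrate(int target_nonempty,
                               int cur_util, int cur_limit,
                               int target_util, int target_limit)
    {
        return !target_nonempty
            && target_util < target_limit
            && cur_util > cur_limit;
    }

    int main(void)
    {
        /* Hypothetical figures: current queue 90% busy with a 55% limit,
         * target queue nearly idle at 20% with a 55% limit. */
        printf("emigrate? %d\n",
               should_emigrate(0, 900000, 550000, 200000, 550000));
        return 0;
    }
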
diff --git a/erts/etc/common/erlexec.c b/erts/etc/common/erlexec.c
index c30203c632..78a50744ef 100644
--- a/erts/etc/common/erlexec.c
+++ b/erts/etc/common/erlexec.c
@@ -135,6 +135,7 @@ static char *pluss_val_switches[] = {
"ws",
"ss",
"pp",
+ "ub",
NULL
};
/* +h arguments with values */
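Adding "ub" to pluss_val_switches tells erlexec that "+sub" belongs to the "+s" family of switches that take one value and are forwarded to the emulator (as "-sub <value>"). A rough sketch of that table lookup, abridged to the entries visible in this diff and with a hypothetical helper name; the real erlexec also carries the value argument along:

    #include <stdio.h>
    #include <string.h>

    /* Abridged version of erlexec's table of "+s" switches that take a
     * value; only the entries visible in this diff are listed. */
    static const char *pluss_val_switches[] = { "ws", "ss", "pp", "ub", NULL };

    /* Return nonzero if 'arg' (e.g. "+sub") is "+s" followed by one of
     * the listed suffixes, meaning it should be passed on to the
     * emulator together with its value. Hypothetical helper. */
    static int is_pluss_val_switch(const char *arg)
    {
        int i;
        if (strncmp(arg, "+s", 2) != 0)
            return 0;
        for (i = 0; pluss_val_switches[i]; i++)
            if (strcmp(arg + 2, pluss_val_switches[i]) == 0)
                return 1;
        return 0;
    }

    int main(void)
    {
        printf("+sub recognized: %d\n", is_pluss_val_switch("+sub"));
        return 0;
    }
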