From f9459092940943876dff040ee997515b96fd5d50 Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Wed, 4 Jan 2017 18:10:26 +0100 Subject: Scheduler wall time support for dirty schedulers --- erts/doc/src/erlang.xml | 97 +++++++-- erts/emulator/beam/atom.names | 3 +- erts/emulator/beam/beam_debug.c | 13 +- erts/emulator/beam/bif.c | 2 +- erts/emulator/beam/erl_bif_info.c | 7 +- erts/emulator/beam/erl_process.c | 366 ++++++++++++++++++++++++++------ erts/emulator/beam/erl_process.h | 9 +- erts/emulator/test/statistics_SUITE.erl | 131 ++++++++++-- erts/preloaded/ebin/erlang.beam | Bin 105976 -> 106168 bytes erts/preloaded/src/erlang.erl | 5 + 10 files changed, 535 insertions(+), 98 deletions(-) (limited to 'erts') diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml index b3fab3874b..1f64d7be86 100644 --- a/erts/doc/src/erlang.xml +++ b/erts/doc/src/erlang.xml @@ -6415,12 +6415,17 @@ lists:map( TotalTime is the total time duration since scheduler_wall_time - activation. The time unit is undefined and can be subject - to change between releases, OSs, and system restarts. - scheduler_wall_time is only to be used to - calculate relative values for scheduler-utilization. - ActiveTime can never exceed - TotalTime.

+ activation for the specific scheduler. Note that + activation time can differ significantly between + schedulers. Currently dirty schedulers are activated + at system start while normal schedulers are activated + some time after the scheduler_wall_time + functionality is enabled. The time unit is undefined + and can be subject to change between releases, OSs, + and system restarts. scheduler_wall_time is only + to be used to calculate relative values for scheduler + utilization. ActiveTime can never + exceed TotalTime.

The definition of a busy scheduler is when it is not idle and is not scheduling (selecting) a process or port, that is:

@@ -6438,15 +6443,37 @@ lists:map( scheduler_wall_time is turned off.

The list of scheduler information is unsorted and can appear in different order between calls.

+

As of ERTS version 9.0, also dirty CPU schedulers will + be included in the result. That is, all scheduler threads + that are expected to handle CPU bound work. If you also + want information about dirty I/O schedulers, use + statistics(scheduler_wall_time_all) + instead.

+ +

Normal schedulers will have scheduler identifiers in + the range 1 =< SchedulerId =< + erlang:system_info(schedulers). + Dirty CPU schedulers will have scheduler identifiers in + the range erlang:system_info(schedulers) < + SchedulerId =< erlang:system_info(schedulers) + + + erlang:system_info(dirty_cpu_schedulers). +

+

The different types of schedulers handle + specific types of jobs. Every job is assigned to a specific + scheduler type. Jobs can migrate between different schedulers + of the same type, but never between schedulers of different + types. This fact has to be taken under consideration when + evaluating the result returned.

Using scheduler_wall_time to calculate - scheduler-utilization:

+ scheduler utilization:

 > erlang:system_flag(scheduler_wall_time, true).
 false
 > Ts0 = lists:sort(erlang:statistics(scheduler_wall_time)), ok.
 ok

Some time later the user takes another snapshot and calculates - scheduler-utilization per scheduler, for example:

+ scheduler utilization per scheduler, for example:

 > Ts1 = lists:sort(erlang:statistics(scheduler_wall_time)), ok.
 ok
@@ -6461,11 +6488,32 @@ ok
  {7,0.973237033077876},
  {8,0.9741297293248656}]

Using the same snapshots to calculate a total - scheduler-utilization:

+ scheduler utilization:

 > {A, T} = lists:foldl(fun({{_, A0, T0}, {_, A1, T1}}, {Ai,Ti}) ->
-	{Ai + (A1 - A0), Ti + (T1 - T0)} end, {0, 0}, lists:zip(Ts0,Ts1)), A/T.
+	{Ai + (A1 - A0), Ti + (T1 - T0)} end, {0, 0}, lists:zip(Ts0,Ts1)),
+	TotalSchedulerUtilization = A/T.
+0.9769136803764825
+

Total scheduler utilization will equal 1.0 when + all schedulers have been active all the time between the + two measurements.

+

Another (probably more) useful value is to calculate + total scheduler utilization weighted against maximum amount + of available CPU time:

+
+> WeightedSchedulerUtilization = (TotalSchedulerUtilization
+                                  * (erlang:system_info(schedulers)
+                                     + erlang:system_info(dirty_cpu_schedulers)))
+                                 / erlang:system_info(logical_processors_available).
 0.9769136803764825
+

This weighted scheduler utilization will reach 1.0 + when schedulers are active the same amount of time as + maximum available CPU time. If more schedulers exist + than available logical processors, this value may + be greater than 1.0.

+

As of ERTS version 9.0, the Erlang runtime system + with SMP support will as default have more schedulers + than logical processors. This due to the dirty schedulers.

scheduler_wall_time is by default disabled. To enable it, use @@ -6476,6 +6524,31 @@ ok + Information about each schedulers work time. + + +

The same as + statistics(scheduler_wall_time), + except that it also include information about all dirty I/O + schedulers.

+

Dirty IO schedulers will have scheduler identifiers in + the range + erlang:system_info(schedulers) + + + erlang:system_info(dirty_cpu_schedulers) < + SchedulerId =< erlang:system_info(schedulers) + + erlang:system_info(dirty_cpu_schedulers) + + + erlang:system_info(dirty_io_schedulers).

+

Note that work executing on dirty I/O schedulers + are expected to mainly wait for I/O. That is, when you + get high scheduler utilization on dirty I/O schedulers, + CPU utilization is not expected to be high due to + this work.

+ + + + Information about active processes and ports.

Returns the total amount of active processes and ports in @@ -6495,7 +6568,7 @@ ok - + Information about the run-queue lengths.

Returns the total length of the run queues. That is, the number @@ -6515,7 +6588,7 @@ ok - + Information about wall clock.

Returns information about wall clock. wall_clock can diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index 7df350116a..dd0a42b5ba 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -574,6 +574,8 @@ atom safe atom save_calls atom scheduler atom scheduler_id +atom scheduler_wall_time +atom scheduler_wall_time_all atom schedulers_online atom scheme atom scientific @@ -678,7 +680,6 @@ atom waiting atom wall_clock atom warning atom warning_msg -atom scheduler_wall_time atom wordsize atom write_concurrency atom xor diff --git a/erts/emulator/beam/beam_debug.c b/erts/emulator/beam/beam_debug.c index 3ffbb0364c..8326d348af 100644 --- a/erts/emulator/beam/beam_debug.c +++ b/erts/emulator/beam/beam_debug.c @@ -994,13 +994,20 @@ dirty_test(Process *c_p, Eterm type, Eterm arg1, Eterm arg2, UWord *I) Eterm *hp, *hp2; Uint sz; int i; + ErtsSchedulerData *esdp = erts_proc_sched_data(c_p); + int dirty_io = esdp->type == ERTS_SCHED_DIRTY_IO; + if (ERTS_PROC_IS_EXITING(real_c_p)) goto badarg; dirty_send_message(c_p, arg2, am_alive); - /* Wait until dead */ - while (!ERTS_PROC_IS_EXITING(real_c_p)) - erts_thr_yield(); + /* Wait until dead */ + while (!ERTS_PROC_IS_EXITING(real_c_p)) { + if (dirty_io) + ms_wait(c_p, make_small(100), 0); + else + erts_thr_yield(); + } ms_wait(c_p, make_small(1000), 0); diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c index a5b66db35c..65c370c55b 100644 --- a/erts/emulator/beam/bif.c +++ b/erts/emulator/beam/bif.c @@ -4601,7 +4601,7 @@ BIF_RETTYPE system_flag_2(BIF_ALIST_2) erts_aint32_t new = BIF_ARG_2 == am_true ? 1 : 0; erts_aint32_t old = erts_smp_atomic32_xchg_nob(&sched_wall_time, new); - Eterm ref = erts_sched_wall_time_request(BIF_P, 1, new); + Eterm ref = erts_sched_wall_time_request(BIF_P, 1, new, 0, 0); ASSERT(is_value(ref)); BIF_TRAP2(await_sched_wall_time_mod_trap, BIF_P, diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c index 5f2122b969..5ee19aead8 100644 --- a/erts/emulator/beam/erl_bif_info.c +++ b/erts/emulator/beam/erl_bif_info.c @@ -3385,7 +3385,12 @@ BIF_RETTYPE statistics_1(BIF_ALIST_1) Eterm* hp; if (BIF_ARG_1 == am_scheduler_wall_time) { - res = erts_sched_wall_time_request(BIF_P, 0, 0); + res = erts_sched_wall_time_request(BIF_P, 0, 0, 1, 0); + if (is_non_value(res)) + BIF_RET(am_undefined); + BIF_TRAP1(gather_sched_wall_time_res_trap, BIF_P, res); + } else if (BIF_ARG_1 == am_scheduler_wall_time_all) { + res = erts_sched_wall_time_request(BIF_P, 0, 0, 1, 1); if (is_non_value(res)) BIF_RET(am_undefined); BIF_TRAP1(gather_sched_wall_time_res_trap, BIF_P, res); diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c index 084ecb14ec..8c9aa45a81 100644 --- a/erts/emulator/beam/erl_process.c +++ b/erts/emulator/beam/erl_process.c @@ -804,15 +804,28 @@ erts_late_init_process(void) } +#define ERTS_SCHED_WTIME_IDLE ~((Uint64) 0) + static void -init_sched_wall_time(ErtsSchedWallTime *swtp) +init_sched_wall_time(ErtsSchedulerData *esdp, Uint64 time_stamp) { - swtp->need = erts_sched_balance_util; - swtp->enabled = 0; - swtp->start = 0; - swtp->working.total = 0; - swtp->working.start = 0; - swtp->working.currently = 0; +#ifdef ERTS_DIRTY_SCHEDULERS + if (esdp->type != ERTS_SCHED_NORMAL) { + erts_atomic32_init_nob(&esdp->sched_wall_time.u.mod, 0); + esdp->sched_wall_time.enabled = 1; + esdp->sched_wall_time.start = time_stamp; + esdp->sched_wall_time.working.total = 0; + esdp->sched_wall_time.working.start = ERTS_SCHED_WTIME_IDLE; + } + else +#endif + { + esdp->sched_wall_time.u.need = erts_sched_balance_util; + esdp->sched_wall_time.enabled = 0; + esdp->sched_wall_time.start = 0; + esdp->sched_wall_time.working.total = 0; + esdp->sched_wall_time.working.start = 0; + } } static ERTS_INLINE Uint64 @@ -1003,31 +1016,145 @@ init_runq_sched_util(ErtsRunQueueSchedUtil *rqsu, int enabled) #endif /* ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT */ -static ERTS_INLINE void +#ifdef ERTS_DIRTY_SCHEDULERS + +typedef struct { + Uint64 working; + Uint64 total; +} ErtsDirtySchedWallTime; + +static void +read_dirty_sched_wall_time(ErtsSchedulerData *esdp, ErtsDirtySchedWallTime *info) +{ + erts_aint32_t mod1; + Uint64 working, start, ts; + + mod1 = erts_atomic32_read_nob(&esdp->sched_wall_time.u.mod); + + while (1) { + erts_aint32_t mod2; + + /* Spin until values are not written... */ + while (1) { + if ((mod1 & 1) == 0) + break; + ERTS_SPIN_BODY; + mod1 = erts_atomic32_read_nob(&esdp->sched_wall_time.u.mod); + } + + ERTS_THR_READ_MEMORY_BARRIER; + + working = esdp->sched_wall_time.working.total; + start = esdp->sched_wall_time.working.start; + + ERTS_THR_READ_MEMORY_BARRIER; + + mod2 = erts_atomic32_read_nob(&esdp->sched_wall_time.u.mod); + if (mod1 == mod2) + break; + mod1 = mod2; + } + + ts = sched_wall_time_ts(); + ts -= esdp->sched_wall_time.start; + + info->total = ts; + + if (start == ERTS_SCHED_WTIME_IDLE || ts < start) + info->working = working; + else + info->working = working + (ts - start); + + if (info->working > info->total) + info->working = info->total; +} + +#endif + +#ifdef ERTS_SMP + +static void +dirty_sched_wall_time_change(ErtsSchedulerData *esdp, int working) +{ + erts_aint32_t mod; + Uint64 ts = sched_wall_time_ts(); + + ts -= esdp->sched_wall_time.start; + + /* + * This thread is the only thread writing in + * this sched_wall_time struct. We set 'mod' to + * an odd value while writing... + */ + mod = erts_atomic32_read_dirty(&esdp->sched_wall_time.u.mod); + ASSERT((mod & 1) == 0); + mod++; + + erts_atomic32_set_nob(&esdp->sched_wall_time.u.mod, mod); + ERTS_THR_WRITE_MEMORY_BARRIER; + + if (working) { + ASSERT(esdp->sched_wall_time.working.start + == ERTS_SCHED_WTIME_IDLE); + + esdp->sched_wall_time.working.start = ts; + + } + else { + Uint64 total; + + ASSERT(esdp->sched_wall_time.working.start + != ERTS_SCHED_WTIME_IDLE); + + total = esdp->sched_wall_time.working.total; + total += ts - esdp->sched_wall_time.working.start; + + esdp->sched_wall_time.working.total = total; + esdp->sched_wall_time.working.start = ERTS_SCHED_WTIME_IDLE; + + + } + + ERTS_THR_WRITE_MEMORY_BARRIER; + mod++; + erts_atomic32_set_nob(&esdp->sched_wall_time.u.mod, mod); + +#if 0 + if (!working) { + ERTS_MSACC_SET_STATE_M_X(ERTS_MSACC_STATE_BUSY_WAIT); + } else { + ERTS_MSACC_SET_STATE_M_X(ERTS_MSACC_STATE_OTHER); + } +#endif +} + +#endif /* ERTS_SMP */ + +static void sched_wall_time_change(ErtsSchedulerData *esdp, int working) { - if (esdp->sched_wall_time.need) { + if (esdp->sched_wall_time.u.need) { Uint64 ts = sched_wall_time_ts(); #if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT - update_avg_sched_util(esdp, ts, working); + update_avg_sched_util(esdp, ts, working); #endif if (esdp->sched_wall_time.enabled) { if (working) { -#ifdef DEBUG - ASSERT(!esdp->sched_wall_time.working.currently); - esdp->sched_wall_time.working.currently = 1; -#endif + ASSERT(esdp->sched_wall_time.working.start + == ERTS_SCHED_WTIME_IDLE); ts -= esdp->sched_wall_time.start; esdp->sched_wall_time.working.start = ts; } else { -#ifdef DEBUG - ASSERT(esdp->sched_wall_time.working.currently); - esdp->sched_wall_time.working.currently = 0; -#endif + ASSERT(esdp->sched_wall_time.working.start + != ERTS_SCHED_WTIME_IDLE); ts -= esdp->sched_wall_time.start; ts -= esdp->sched_wall_time.working.start; esdp->sched_wall_time.working.total += ts; +#ifdef DEBUG + esdp->sched_wall_time.working.start + = ERTS_SCHED_WTIME_IDLE; +#endif } } } @@ -1047,6 +1174,10 @@ typedef struct { Eterm ref_heap[REF_THING_SIZE]; Uint req_sched; erts_smp_atomic32_t refc; +#ifdef ERTS_DIRTY_SCHEDULERS + int want_dirty_cpu; + int want_dirty_io; +#endif } ErtsSchedWallTimeReq; typedef struct { @@ -1068,6 +1199,7 @@ ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(screq, 5, ERTS_ALC_T_SYS_CHECK_REQ) + static void reply_sched_wall_time(void *vswtrp) { @@ -1091,23 +1223,23 @@ reply_sched_wall_time(void *vswtrp) #endif if (swtrp->set) { if (!swtrp->enable && esdp->sched_wall_time.enabled) { - esdp->sched_wall_time.need = erts_sched_balance_util; + esdp->sched_wall_time.u.need = erts_sched_balance_util; esdp->sched_wall_time.enabled = 0; } else if (swtrp->enable && !esdp->sched_wall_time.enabled) { Uint64 ts = sched_wall_time_ts(); - esdp->sched_wall_time.need = 1; + esdp->sched_wall_time.u.need = 1; esdp->sched_wall_time.enabled = 1; esdp->sched_wall_time.start = ts; esdp->sched_wall_time.working.total = 0; esdp->sched_wall_time.working.start = 0; - esdp->sched_wall_time.working.currently = 1; } } if (esdp->sched_wall_time.enabled) { Uint64 ts = sched_wall_time_ts(); - ASSERT(esdp->sched_wall_time.working.currently); + ASSERT(esdp->sched_wall_time.working.start + != ERTS_SCHED_WTIME_IDLE); ts -= esdp->sched_wall_time.start; total = ts; ts -= esdp->sched_wall_time.working.start; @@ -1118,30 +1250,117 @@ reply_sched_wall_time(void *vswtrp) hpp = NULL; szp = &sz; - while (1) { - if (hpp) - ref_copy = STORE_NC(hpp, ohp, swtrp->ref); - else - *szp += REF_THING_SIZE; +#ifdef ERTS_DIRTY_SCHEDULERS + if (esdp->sched_wall_time.enabled + && swtrp->req_sched == esdp->no + && (swtrp->want_dirty_cpu || swtrp->want_dirty_io)) { + /* Reply with info about this scheduler and all dirty schedulers... */ + ErtsDirtySchedWallTime *dswt; + int ix, no_dirty_scheds, want_dcpu, want_dio, soffset; + + want_dcpu = swtrp->want_dirty_cpu; + want_dio = swtrp->want_dirty_io; + + no_dirty_scheds = 0; + if (want_dcpu) + no_dirty_scheds += erts_no_dirty_cpu_schedulers; + if (want_dio) + no_dirty_scheds += erts_no_dirty_io_schedulers; + + ASSERT(no_dirty_scheds); + + dswt = erts_alloc(ERTS_ALC_T_TMP, + sizeof(ErtsDirtySchedWallTime) + * no_dirty_scheds); + + for (ix = 0; ix < no_dirty_scheds; ix++) { + ErtsSchedulerData *esdp; + if (want_dcpu && ix < erts_no_dirty_cpu_schedulers) + esdp = &erts_aligned_dirty_cpu_scheduler_data[ix].esd; + else { + int dio_ix = ix - erts_no_dirty_cpu_schedulers; + esdp = &erts_aligned_dirty_io_scheduler_data[dio_ix].esd; + } + read_dirty_sched_wall_time(esdp, &dswt[ix]); + } - if (swtrp->set) - msg = ref_copy; - else { - msg = (!esdp->sched_wall_time.enabled - ? am_notsup - : erts_bld_tuple(hpp, szp, 3, - make_small(esdp->no), - erts_bld_uint64(hpp, szp, working), - erts_bld_uint64(hpp, szp, total))); + soffset = erts_no_schedulers + 1; - msg = erts_bld_tuple(hpp, szp, 2, ref_copy, msg); - } - if (hpp) - break; + if (!want_dcpu) { + ASSERT(want_dio); + soffset += erts_no_dirty_cpu_schedulers; + } + + while (1) { + if (hpp) + ref_copy = STORE_NC(hpp, ohp, swtrp->ref); + else + *szp += REF_THING_SIZE; + + ASSERT(!swtrp->set); + + /* info about dirty schedulers... */ + msg = NIL; + for (ix = no_dirty_scheds-1; ix >= 0; ix--) { + msg = erts_bld_cons(hpp, szp, + erts_bld_tuple(hpp, szp, 3, + make_small(ix+soffset), + erts_bld_uint64(hpp, szp, + dswt[ix].working), + erts_bld_uint64(hpp, szp, + dswt[ix].total)), + msg); + } + /* info about this scheduler... */ + msg = erts_bld_cons(hpp, szp, + erts_bld_tuple(hpp, szp, 3, + make_small(esdp->no), + erts_bld_uint64(hpp, szp, working), + erts_bld_uint64(hpp, szp, total)), + msg); + + msg = erts_bld_tuple(hpp, szp, 2, ref_copy, msg); + + if (hpp) + break; + + mp = erts_alloc_message_heap(rp, &rp_locks, sz, &hp, &ohp); + szp = NULL; + hpp = &hp; + } - mp = erts_alloc_message_heap(rp, &rp_locks, sz, &hp, &ohp); - szp = NULL; - hpp = &hp; + erts_free(ERTS_ALC_T_TMP, dswt); + } + else +#endif + { + /* Reply with info about this scheduler only... */ + + while (1) { + if (hpp) + ref_copy = STORE_NC(hpp, ohp, swtrp->ref); + else + *szp += REF_THING_SIZE; + + if (swtrp->set) + msg = ref_copy; + else { + msg = (!esdp->sched_wall_time.enabled + ? am_undefined + : erts_bld_tuple(hpp, szp, 3, + make_small(esdp->no), + erts_bld_uint64(hpp, szp, working), + erts_bld_uint64(hpp, szp, total))); + + msg = erts_bld_tuple(hpp, szp, 2, ref_copy, msg); + } + if (hpp) + break; + + mp = erts_alloc_message_heap(rp, &rp_locks, sz, &hp, &ohp); + szp = NULL; + hpp = &hp; + } } erts_queue_message(rp, rp_locks, mp, msg, am_system); @@ -1159,7 +1378,8 @@ reply_sched_wall_time(void *vswtrp) } Eterm -erts_sched_wall_time_request(Process *c_p, int set, int enable) +erts_sched_wall_time_request(Process *c_p, int set, int enable, + int want_dirty_cpu, int want_dirty_io) { ErtsSchedulerData *esdp = erts_proc_sched_data(c_p); Eterm ref; @@ -1181,6 +1401,10 @@ erts_sched_wall_time_request(Process *c_p, int set, int enable) swtrp->proc = c_p; swtrp->ref = STORE_NC(&hp, NULL, ref); swtrp->req_sched = esdp->no; +#ifdef ERTS_DIRTY_SCHEDULERS + swtrp->want_dirty_cpu = want_dirty_cpu; + swtrp->want_dirty_io = want_dirty_io; +#endif erts_smp_atomic32_init_nob(&swtrp->refc, (erts_aint32_t) erts_no_schedulers); @@ -3106,8 +3330,10 @@ scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq) tse_wait: - if (!ERTS_SCHEDULER_IS_DIRTY(esdp) && thr_prgr_active != working) - sched_wall_time_change(esdp, thr_prgr_active); + if (ERTS_SCHEDULER_IS_DIRTY(esdp)) + dirty_sched_wall_time_change(esdp, working = 0); + else if (thr_prgr_active != working) + sched_wall_time_change(esdp, working = thr_prgr_active); while (1) { ErtsMonotonicTime current_time = 0; @@ -3218,10 +3444,12 @@ scheduler_wait(int *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq) (ERTS_SSI_FLG_SUSPENDED | ERTS_SSI_FLG_MSB_EXEC)); - if (!ERTS_SCHEDULER_IS_DIRTY(esdp) && !thr_prgr_active) { + if (ERTS_SCHEDULER_IS_DIRTY(esdp)) + dirty_sched_wall_time_change(esdp, working = 1); + else if (!thr_prgr_active) { erts_thr_progress_active(esdp, thr_prgr_active = 1); sched_wall_time_change(esdp, 1); - } + } erts_smp_runq_lock(rq); sched_active(esdp->no, rq); @@ -5712,7 +5940,8 @@ init_scheduler_data(ErtsSchedulerData* esdp, int num, ErtsSchedulerSleepInfo* ssi, ErtsRunQueue* runq, char** daww_ptr, size_t daww_sz, - Process *shadow_proc) + Process *shadow_proc, + Uint64 time_stamp) { esdp->timer_wheel = NULL; #ifdef ERTS_SMP @@ -5794,7 +6023,7 @@ init_scheduler_data(ErtsSchedulerData* esdp, int num, esdp->reductions = 0; - init_sched_wall_time(&esdp->sched_wall_time); + init_sched_wall_time(esdp, time_stamp); erts_port_task_handle_init(&esdp->nosuspend_port_task_handle); } @@ -6008,17 +6237,18 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online erts_aligned_scheduler_data = erts_alloc_permanent_cache_aligned(ERTS_ALC_T_SCHDLR_DATA, - n*sizeof(ErtsAlignedSchedulerData)); + n*sizeof(ErtsAlignedSchedulerData)); for (ix = 0; ix < n; ix++) { ErtsSchedulerData *esdp = ERTS_SCHEDULER_IX(ix); init_scheduler_data(esdp, ix+1, ERTS_SCHED_SLEEP_INFO_IX(ix), ERTS_RUNQ_IX(ix), &daww_ptr, daww_sz, - NULL); + NULL, 0); } #ifdef ERTS_DIRTY_SCHEDULERS { + Uint64 ts = sched_wall_time_ts(); int dirty_scheds = no_dirty_cpu_schedulers + no_dirty_io_schedulers; int adspix = 0; ErtsAlignedDirtyShadowProcess *adsp = @@ -6038,13 +6268,13 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online ErtsSchedulerData *esdp = ERTS_DIRTY_CPU_SCHEDULER_IX(ix); init_scheduler_data(esdp, ix+1, ERTS_DIRTY_CPU_SCHED_SLEEP_INFO_IX(ix), ERTS_DIRTY_CPU_RUNQ, NULL, 0, - &adsp[adspix++].dsp); + &adsp[adspix++].dsp, ts); } for (ix = 0; ix < no_dirty_io_schedulers; ix++) { ErtsSchedulerData *esdp = ERTS_DIRTY_IO_SCHEDULER_IX(ix); init_scheduler_data(esdp, ix+1, ERTS_DIRTY_IO_SCHED_SLEEP_INFO_IX(ix), ERTS_DIRTY_IO_RUNQ, NULL, 0, - &adsp[adspix++].dsp); + &adsp[adspix++].dsp, ts); } } #endif @@ -7506,8 +7736,10 @@ suspend_scheduler(ErtsSchedulerData *esdp) #endif - if (sched_type != ERTS_SCHED_NORMAL) + if (sched_type != ERTS_SCHED_NORMAL) { erts_smp_runq_unlock(esdp->run_queue); + dirty_sched_wall_time_change(esdp, 0); + } else { if (no != 1) evacuate_run_queue(esdp->run_queue, &sbp); @@ -7518,8 +7750,8 @@ suspend_scheduler(ErtsSchedulerData *esdp) if (erts_system_profile_flags.scheduler) profile_scheduler(make_small(esdp->no), am_inactive); - } + erts_smp_mtx_lock(&schdlr_sspnd.mtx); flgs = sched_prep_spin_suspended(ssi, ERTS_SSI_FLG_SUSPENDED); @@ -7797,15 +8029,17 @@ suspend_scheduler(ErtsSchedulerData *esdp) ASSERT(curr_online); - if (sched_type == ERTS_SCHED_NORMAL) { - (void) erts_get_monotonic_time(esdp); - if (erts_system_profile_flags.scheduler) - profile_scheduler(make_small(esdp->no), am_active); + if (sched_type != ERTS_SCHED_NORMAL) + dirty_sched_wall_time_change(esdp, 1); + else { + (void) erts_get_monotonic_time(esdp); + if (erts_system_profile_flags.scheduler) + profile_scheduler(make_small(esdp->no), am_active); - if (!thr_prgr_active) { - erts_thr_progress_active(esdp, thr_prgr_active = 1); - sched_wall_time_change(esdp, 1); - } + if (!thr_prgr_active) { + erts_thr_progress_active(esdp, thr_prgr_active = 1); + sched_wall_time_change(esdp, 1); + } } erts_smp_runq_lock(esdp->run_queue); @@ -8518,6 +8752,8 @@ sched_dirty_cpu_thread_func(void *vesdp) callbacks.wait = NULL; callbacks.finalize_wait = NULL; + dirty_sched_wall_time_change(esdp, 1); + esdp->thr_id += erts_no_schedulers; erts_msacc_init_thread("dirty_cpu_scheduler", no, 0); @@ -8565,6 +8801,8 @@ sched_dirty_io_thread_func(void *vesdp) callbacks.wait = NULL; callbacks.finalize_wait = NULL; + dirty_sched_wall_time_change(esdp, 1); + esdp->thr_id += erts_no_schedulers + erts_no_dirty_cpu_schedulers; erts_msacc_init_thread("dirty_io_scheduler", no, 0); diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h index fc2055332b..ce0989883c 100644 --- a/erts/emulator/beam/erl_process.h +++ b/erts/emulator/beam/erl_process.h @@ -550,13 +550,15 @@ do { \ } while (0) typedef struct { - int need; /* "+sbu true" or scheduler_wall_time enabled */ + union { + erts_atomic32_t mod; /* on dirty schedulers */ + int need; /* "+sbu true" or scheduler_wall_time enabled */ + } u; int enabled; Uint64 start; struct { Uint64 total; Uint64 start; - int currently; } working; } ErtsSchedWallTime; @@ -1598,7 +1600,8 @@ void erts_init_scheduling(int, int void erts_execute_dirty_system_task(Process *c_p); #endif int erts_set_gc_state(Process *c_p, int enable); -Eterm erts_sched_wall_time_request(Process *c_p, int set, int enable); +Eterm erts_sched_wall_time_request(Process *c_p, int set, int enable, + int dirty_cpu, int want_dirty_io); Eterm erts_system_check_request(Process *c_p); Eterm erts_gc_info_request(Process *c_p); Uint64 erts_get_proc_interval(void); diff --git a/erts/emulator/test/statistics_SUITE.erl b/erts/emulator/test/statistics_SUITE.erl index a1f12ba93c..f51244485b 100644 --- a/erts/emulator/test/statistics_SUITE.erl +++ b/erts/emulator/test/statistics_SUITE.erl @@ -28,6 +28,8 @@ runtime_update/1, runtime_diff/1, run_queue_one/1, scheduler_wall_time/1, + scheduler_wall_time_all/1, + msb_scheduler_wall_time/1, reductions/1, reductions_big/1, garbage_collection/1, io/1, badarg/1, run_queues_lengths_active_tasks/1, msacc/1]). @@ -43,7 +45,9 @@ suite() -> all() -> [{group, wall_clock}, {group, runtime}, reductions, - reductions_big, {group, run_queue}, scheduler_wall_time, + reductions_big, {group, run_queue}, + scheduler_wall_time, scheduler_wall_time_all, + msb_scheduler_wall_time, garbage_collection, io, badarg, run_queues_lengths_active_tasks, msacc]. @@ -271,35 +275,64 @@ hog_iter(0, Mon) -> %% Tests that statistics(scheduler_wall_time) works as intended scheduler_wall_time(Config) when is_list(Config) -> + scheduler_wall_time_test(scheduler_wall_time). + +%% Tests that statistics(scheduler_wall_time_all) works as intended +scheduler_wall_time_all(Config) when is_list(Config) -> + scheduler_wall_time_test(scheduler_wall_time_all). + +scheduler_wall_time_test(Type) -> %% Should return undefined if system_flag is not turned on yet - undefined = statistics(scheduler_wall_time), + undefined = statistics(Type), %% Turn on statistics false = erlang:system_flag(scheduler_wall_time, true), try Schedulers = erlang:system_info(schedulers_online), + DirtyCPUSchedulers = erlang:system_info(dirty_cpu_schedulers_online), + DirtyIOSchedulers = erlang:system_info(dirty_io_schedulers), + TotLoadSchedulers = case Type of + scheduler_wall_time_all -> + Schedulers + DirtyCPUSchedulers + DirtyIOSchedulers; + scheduler_wall_time -> + Schedulers + DirtyCPUSchedulers + end, + %% Let testserver and everyone else finish their work timer:sleep(1500), %% Empty load - EmptyLoad = get_load(), + EmptyLoad = get_load(Type), {false, _} = {lists:any(fun(Load) -> Load > 50 end, EmptyLoad),EmptyLoad}, MeMySelfAndI = self(), StartHog = fun() -> - Pid = spawn(?MODULE, hog, [self()]), + Pid = spawn_link(?MODULE, hog, [self()]), receive hog_started -> MeMySelfAndI ! go end, Pid end, + StartDirtyHog = fun(Func) -> + F = fun () -> + erts_debug:Func(alive_waitexiting, + MeMySelfAndI) + end, + Pid = spawn_link(F), + receive {alive, Pid} -> ok end, + Pid + end, P1 = StartHog(), %% Max on one, the other schedulers empty (hopefully) %% Be generous the process can jump between schedulers %% which is ok and we don't want the test to fail for wrong reasons - _L1 = [S1Load|EmptyScheds1] = get_load(), + _L1 = [S1Load|EmptyScheds1] = get_load(Type), {true,_} = {S1Load > 50,S1Load}, {false,_} = {lists:any(fun(Load) -> Load > 50 end, EmptyScheds1),EmptyScheds1}, {true,_} = {lists:sum(EmptyScheds1) < 60,EmptyScheds1}, %% 50% load HalfHogs = [StartHog() || _ <- lists:seq(1, (Schedulers-1) div 2)], - HalfLoad = lists:sum(get_load()) div Schedulers, + HalfDirtyCPUHogs = [StartDirtyHog(dirty_cpu) + || _ <- lists:seq(1, DirtyCPUSchedulers div 2)], + HalfDirtyIOHogs = [StartDirtyHog(dirty_io) + || _ <- lists:seq(1, DirtyIOSchedulers div 2)], + HalfLoad = lists:sum(get_load(Type)) div TotLoadSchedulers, if Schedulers < 2, HalfLoad > 80 -> ok; %% Ok only one scheduler online and one hog %% We want roughly 50% load HalfLoad > 40, HalfLoad < 60 -> ok; @@ -308,23 +341,30 @@ scheduler_wall_time(Config) when is_list(Config) -> %% 100% load LastHogs = [StartHog() || _ <- lists:seq(1, Schedulers div 2)], - FullScheds = get_load(), + LastDirtyCPUHogs = [StartDirtyHog(dirty_cpu) + || _ <- lists:seq(1, DirtyCPUSchedulers div 2)], + LastDirtyIOHogs = [StartDirtyHog(dirty_io) + || _ <- lists:seq(1, DirtyIOSchedulers div 2)], + FullScheds = get_load(Type), {false,_} = {lists:any(fun(Load) -> Load < 80 end, FullScheds),FullScheds}, - FullLoad = lists:sum(FullScheds) div Schedulers, + FullLoad = lists:sum(FullScheds) div TotLoadSchedulers, if FullLoad > 90 -> ok; true -> exit({fullload, FullLoad}) end, KillHog = fun (HP) -> HPM = erlang:monitor(process, HP), + unlink(HP), exit(HP, kill), receive {'DOWN', HPM, process, HP, killed} -> ok end end, - [KillHog(Pid) || Pid <- [P1|HalfHogs++LastHogs]], - AfterLoad = get_load(), + [KillHog(Pid) || Pid <- [P1|HalfHogs++HalfDirtyCPUHogs++HalfDirtyIOHogs + ++LastHogs++LastDirtyCPUHogs++LastDirtyIOHogs]], + receive after 2000 -> ok end, %% Give dirty schedulers time to complete... + AfterLoad = get_load(Type), io:format("AfterLoad=~p~n", [AfterLoad]), {false,_} = {lists:any(fun(Load) -> Load > 25 end, AfterLoad),AfterLoad}, true = erlang:system_flag(scheduler_wall_time, false) @@ -332,16 +372,81 @@ scheduler_wall_time(Config) when is_list(Config) -> erlang:system_flag(scheduler_wall_time, false) end. -get_load() -> - Start = erlang:statistics(scheduler_wall_time), +get_load(Type) -> + Start = erlang:statistics(Type), timer:sleep(1500), - End = erlang:statistics(scheduler_wall_time), + End = erlang:statistics(Type), lists:reverse(lists:sort(load_percentage(lists:sort(Start),lists:sort(End)))). load_percentage([{Id, WN, TN}|Ss], [{Id, WP, TP}|Ps]) -> [100*(WN-WP) div (TN-TP)|load_percentage(Ss, Ps)]; load_percentage([], []) -> []. +count(0) -> + ok; +count(N) -> + count(N-1). + +msb_swt_hog(true) -> + count(1000000), + erts_debug:dirty_cpu(wait, 10), + erts_debug:dirty_io(wait, 10), + msb_swt_hog(true); +msb_swt_hog(false) -> + count(1000000), + msb_swt_hog(false). + +msb_scheduler_wall_time(Config) -> + erlang:system_flag(scheduler_wall_time, true), + Dirty = erlang:system_info(dirty_cpu_schedulers) /= 0, + Hogs = lists:map(fun (_) -> + spawn_opt(fun () -> + msb_swt_hog(Dirty) + end, [{priority,low}, link, monitor]) + end, lists:seq(1,10)), + erlang:system_flag(multi_scheduling, block), + try + SWT1 = lists:sort(statistics(scheduler_wall_time_all)), + %% io:format("SWT1 = ~p~n", [SWT1]), + receive after 4000 -> ok end, + SWT2 = lists:sort(statistics(scheduler_wall_time_all)), + %% io:format("SWT2 = ~p~n", [SWT2]), + SWT = lists:zip(SWT1, SWT2), + io:format("SU = ~p~n", [lists:map(fun({{I, A0, T0}, {I, A1, T1}}) -> + {I, (A1 - A0)/(T1 - T0)} end, + SWT)]), + {A, T} = lists:foldl(fun({{_, A0, T0}, {_, A1, T1}}, {Ai,Ti}) -> + {Ai + (A1 - A0), Ti + (T1 - T0)} + end, + {0, 0}, + SWT), + TSU = A/T, + WSU = ((TSU * (erlang:system_info(schedulers) + + erlang:system_info(dirty_cpu_schedulers) + + erlang:system_info(dirty_io_schedulers))) + / 1), + %% Weighted scheduler utilization should be + %% very close to 1.0, i.e., we execute the + %% same time as one thread executing all + %% the time... + io:format("WSU = ~p~n", [WSU]), + true = 0.9 < WSU andalso WSU < 1.1, + ok + after + erlang:system_flag(multi_scheduling, unblock), + erlang:system_flag(scheduler_wall_time, false), + lists:foreach(fun ({HP, _HM}) -> + unlink(HP), + exit(HP, kill) + end, Hogs), + lists:foreach(fun ({HP, HM}) -> + receive + {'DOWN', HM, process, HP, _} -> + ok + end + end, Hogs), + ok + end. %% Tests that statistics(garbage_collection) is callable. %% It is not clear how to test anything more. diff --git a/erts/preloaded/ebin/erlang.beam b/erts/preloaded/ebin/erlang.beam index 359d28c0aa..0799546c60 100644 Binary files a/erts/preloaded/ebin/erlang.beam and b/erts/preloaded/ebin/erlang.beam differ diff --git a/erts/preloaded/src/erlang.erl b/erts/preloaded/src/erlang.erl index 99ef5c6314..86ebb4dd4b 100644 --- a/erts/preloaded/src/erlang.erl +++ b/erts/preloaded/src/erlang.erl @@ -2332,6 +2332,10 @@ spawn_opt(_Tuple) -> (scheduler_wall_time) -> [{SchedulerId, ActiveTime, TotalTime}] | undefined when SchedulerId :: pos_integer(), ActiveTime :: non_neg_integer(), + TotalTime :: non_neg_integer(); + (scheduler_wall_time_all) -> [{SchedulerId, ActiveTime, TotalTime}] | undefined when + SchedulerId :: pos_integer(), + ActiveTime :: non_neg_integer(), TotalTime :: non_neg_integer(); (total_active_tasks) -> ActiveTasks when ActiveTasks :: non_neg_integer(); @@ -4014,6 +4018,7 @@ sched_wall_time(Ref, N, undefined) -> sched_wall_time(Ref, N, Acc) -> receive {Ref, undefined} -> sched_wall_time(Ref, N-1, undefined); + {Ref, SWTL} when erlang:is_list(SWTL) -> sched_wall_time(Ref, N-1, Acc ++ SWTL); {Ref, SWT} -> sched_wall_time(Ref, N-1, [SWT|Acc]) end. -- cgit v1.2.3