From d8ce99ba8dd966e87a61e50b70d01e7f1f381bf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Mon, 15 Feb 2016 15:28:38 +0100 Subject: kernel: Refactor heart code --- lib/kernel/src/heart.erl | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/lib/kernel/src/heart.erl b/lib/kernel/src/heart.erl index 464b6919f1..97f384b1d8 100644 --- a/lib/kernel/src/heart.erl +++ b/lib/kernel/src/heart.erl @@ -49,6 +49,9 @@ -define(CYCLE_TIMEOUT, 10000). -define(HEART_PORT_NAME, heart_port). +-record(state,{port :: port(), + cmd :: [] | binary()}). + %%--------------------------------------------------------------------- -spec start() -> 'ignore' | {'error', term()} | {'ok', pid()}. @@ -85,7 +88,7 @@ init(Starter, Parent) -> case catch start_portprogram() of {ok, Port} -> Starter ! {ok, self()}, - loop(Parent, Port, ""); + loop(Parent, #state{port = Port, cmd = []}); no_heart -> Starter ! {no_heart, self()}; error -> @@ -182,7 +185,7 @@ wait_ack(Port) -> {error, Reason} end. -loop(Parent, Port, Cmd) -> +loop(Parent, #state{port=Port}=S) -> _ = send_heart_beat(Port), receive {From, set_cmd, NewCmd0} -> @@ -192,36 +195,36 @@ loop(Parent, Port, Cmd) -> _ = send_heart_cmd(Port, NewCmd), _ = wait_ack(Port), From ! {heart, ok}, - loop(Parent, Port, NewCmd); + loop(Parent, S#state{cmd=NewCmd}); _ -> From ! {heart, {error, {bad_cmd, NewCmd0}}}, - loop(Parent, Port, Cmd) + loop(Parent, S) end; {From, clear_cmd} -> From ! {heart, ok}, - _ = send_heart_cmd(Port, ""), + _ = send_heart_cmd(Port, []), _ = wait_ack(Port), - loop(Parent, Port, ""); + loop(Parent, S#state{cmd = []}); {From, get_cmd} -> From ! {heart, get_heart_cmd(Port)}, - loop(Parent, Port, Cmd); + loop(Parent, S); {From, cycle} -> %% Calls back to loop - do_cycle_port_program(From, Parent, Port, Cmd); + do_cycle_port_program(From, Parent, S); {'EXIT', Parent, shutdown} -> no_reboot_shutdown(Port); {'EXIT', Parent, Reason} -> exit(Port, Reason), exit(Reason); {'EXIT', Port, badsig} -> % we can ignore badsig-messages! - loop(Parent, Port, Cmd); + loop(Parent, S); {'EXIT', Port, _Reason} -> - exit({port_terminated, {heart, loop, [Parent, Port, Cmd]}}); + exit({port_terminated, {heart, loop, [Parent, S]}}); _ -> - loop(Parent, Port, Cmd) + loop(Parent, S) after ?TIMEOUT -> - loop(Parent, Port, Cmd) + loop(Parent, S) end. -spec no_reboot_shutdown(port()) -> no_return(). @@ -233,31 +236,31 @@ no_reboot_shutdown(Port) -> exit(normal) end. -do_cycle_port_program(Caller, Parent, Port, Cmd) -> +do_cycle_port_program(Caller, Parent, #state{port=Port} = S) -> unregister(?HEART_PORT_NAME), case catch start_portprogram() of {ok, NewPort} -> _ = send_shutdown(Port), receive {'EXIT', Port, _Reason} -> - _ = send_heart_cmd(NewPort, Cmd), + _ = send_heart_cmd(NewPort, S#state.cmd), Caller ! {heart, ok}, - loop(Parent, NewPort, Cmd) + loop(Parent, S#state{port=NewPort}) after ?CYCLE_TIMEOUT -> %% Huh! Two heart port programs running... %% well, the old one has to be sick not to respond %% so we'll settle for the new one... - _ = send_heart_cmd(NewPort, Cmd), + _ = send_heart_cmd(NewPort, S#state.cmd), Caller ! {heart, {error, stop_error}}, - loop(Parent, NewPort, Cmd) + loop(Parent, S#state{port=NewPort}) end; no_heart -> Caller ! {heart, {error, no_heart}}, - loop(Parent, Port, Cmd); + loop(Parent, S); error -> Caller ! {heart, {error, start_error}}, - loop(Parent, Port, Cmd) + loop(Parent, S) end. -- cgit v1.2.3 From f5a6e1c353e2a7fb536dcac4523e4f6e3d99039b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Mon, 15 Feb 2016 16:18:20 +0100 Subject: kernel: Add validation callback for heart * heart:set_callback/2 * heart:get_callback/0 * heart:clear_callback/0 The callback is called before every heartbeat to the heart port. The callback needs to return 'ok' if the validation is correct. --- lib/kernel/src/heart.erl | 97 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 19 deletions(-) diff --git a/lib/kernel/src/heart.erl b/lib/kernel/src/heart.erl index 97f384b1d8..ad3bbf2f7a 100644 --- a/lib/kernel/src/heart.erl +++ b/lib/kernel/src/heart.erl @@ -34,7 +34,10 @@ %%% %%% It recognizes the flag '-heart' %%%-------------------------------------------------------------------- --export([start/0, init/2, set_cmd/1, clear_cmd/0, get_cmd/0, cycle/0]). +-export([start/0, init/2, + set_cmd/1, clear_cmd/0, get_cmd/0, + set_callback/2, clear_callback/0, get_callback/0, + cycle/0]). -define(START_ACK, 1). -define(HEART_BEAT, 2). @@ -50,7 +53,8 @@ -define(HEART_PORT_NAME, heart_port). -record(state,{port :: port(), - cmd :: [] | binary()}). + cmd :: [] | binary(), + callback :: 'undefined' | {module(), function()}}). %%--------------------------------------------------------------------- @@ -84,7 +88,7 @@ wait_for_init_ack(From) -> init(Starter, Parent) -> process_flag(trap_exit, true), process_flag(priority, max), - register(heart, self()), + register(?MODULE, self()), case catch start_portprogram() of {ok, Port} -> Starter ! {ok, self()}, @@ -99,20 +103,41 @@ init(Starter, Parent) -> Cmd :: string(). set_cmd(Cmd) -> - heart ! {self(), set_cmd, Cmd}, + ?MODULE ! {self(), set_cmd, Cmd}, wait(). -spec get_cmd() -> {ok, Cmd} when Cmd :: string(). get_cmd() -> - heart ! {self(), get_cmd}, + ?MODULE ! {self(), get_cmd}, wait(). -spec clear_cmd() -> ok. clear_cmd() -> - heart ! {self(), clear_cmd}, + ?MODULE ! {self(), clear_cmd}, + wait(). + +-spec set_callback(Module,Function) -> 'ok' | {'error', {'bad_callback', {Module, Function}}} when + Module :: module(), + Function :: function(). + +set_callback(Module, Function) -> + ?MODULE ! {self(), set_callback, {Module,Function}}, + wait(). + +-spec get_callback() -> {'ok', Callback} | 'none' when + Callback :: {module(), function()}. + +get_callback() -> + ?MODULE ! {self(), get_callback}, + wait(). + +-spec clear_callback() -> ok. + +clear_callback() -> + ?MODULE ! {self(), clear_callback}, wait(). @@ -120,12 +145,12 @@ clear_cmd() -> -spec cycle() -> 'ok' | {'error', term()}. cycle() -> - heart ! {self(), cycle}, + ?MODULE ! {self(), cycle}, wait(). wait() -> receive - {heart, Res} -> + {?MODULE, Res} -> Res end. @@ -186,7 +211,7 @@ wait_ack(Port) -> end. loop(Parent, #state{port=Port}=S) -> - _ = send_heart_beat(Port), + _ = send_heart_beat(S), receive {From, set_cmd, NewCmd0} -> Enc = file:native_name_encoding(), @@ -194,20 +219,39 @@ loop(Parent, #state{port=Port}=S) -> NewCmd when is_binary(NewCmd), byte_size(NewCmd) < 2047 -> _ = send_heart_cmd(Port, NewCmd), _ = wait_ack(Port), - From ! {heart, ok}, + From ! {?MODULE, ok}, loop(Parent, S#state{cmd=NewCmd}); _ -> - From ! {heart, {error, {bad_cmd, NewCmd0}}}, + From ! {?MODULE, {error, {bad_cmd, NewCmd0}}}, loop(Parent, S) end; {From, clear_cmd} -> - From ! {heart, ok}, + From ! {?MODULE, ok}, _ = send_heart_cmd(Port, []), _ = wait_ack(Port), loop(Parent, S#state{cmd = []}); {From, get_cmd} -> - From ! {heart, get_heart_cmd(Port)}, + From ! {?MODULE, get_heart_cmd(Port)}, loop(Parent, S); + {From, set_callback, Callback} -> + case Callback of + {M,F} when is_atom(M), is_atom(F) -> + From ! {?MODULE, ok}, + loop(Parent, S#state{callback=Callback}); + _ -> + From ! {?MODULE, {error, {bad_callback, Callback}}}, + loop(Parent, S) + end; + {From, get_callback} -> + Res = case S#state.callback of + undefined -> none; + Cb -> {ok, Cb} + end, + From ! {?MODULE, Res}, + loop(Parent, S); + {From, clear_callback} -> + From ! {?MODULE, ok}, + loop(Parent, S#state{callback=undefined}); {From, cycle} -> %% Calls back to loop do_cycle_port_program(From, Parent, S); @@ -219,7 +263,7 @@ loop(Parent, #state{port=Port}=S) -> {'EXIT', Port, badsig} -> % we can ignore badsig-messages! loop(Parent, S); {'EXIT', Port, _Reason} -> - exit({port_terminated, {heart, loop, [Parent, S]}}); + exit({port_terminated, {?MODULE, loop, [Parent, S]}}); _ -> loop(Parent, S) after @@ -244,7 +288,7 @@ do_cycle_port_program(Caller, Parent, #state{port=Port} = S) -> receive {'EXIT', Port, _Reason} -> _ = send_heart_cmd(NewPort, S#state.cmd), - Caller ! {heart, ok}, + Caller ! {?MODULE, ok}, loop(Parent, S#state{port=NewPort}) after ?CYCLE_TIMEOUT -> @@ -252,20 +296,22 @@ do_cycle_port_program(Caller, Parent, #state{port=Port} = S) -> %% well, the old one has to be sick not to respond %% so we'll settle for the new one... _ = send_heart_cmd(NewPort, S#state.cmd), - Caller ! {heart, {error, stop_error}}, + Caller ! {?MODULE, {error, stop_error}}, loop(Parent, S#state{port=NewPort}) end; no_heart -> - Caller ! {heart, {error, no_heart}}, + Caller ! {?MODULE, {error, no_heart}}, loop(Parent, S); error -> - Caller ! {heart, {error, start_error}}, + Caller ! {?MODULE, {error, start_error}}, loop(Parent, S) end. %% "Beates" the heart once. -send_heart_beat(Port) -> Port ! {self(), {command, [?HEART_BEAT]}}. +send_heart_beat(#state{port=Port, callback=Cb}) -> + ok = check_callback(Cb), + Port ! {self(), {command, [?HEART_BEAT]}}. %% Set a new HEART_COMMAND. send_heart_cmd(Port, []) -> @@ -280,6 +326,19 @@ get_heart_cmd(Port) -> {ok, Cmd} end. +%% validate system by performing a check before the heartbeat +%% return 'ok' if everything is alright. +%% Terminate if with reason if something is a miss. +%% It is fine to timeout in the callback, in fact that is the intention +%% if something goes wront -> no heartbeat. + +check_callback(Callback) -> + case Callback of + undefined -> ok; + {M,F} -> + erlang:apply(M,F,[]) + end. + %% Sends shutdown command to the port. send_shutdown(Port) -> Port ! {self(), {command, [?SHUT_DOWN]}}. -- cgit v1.2.3 From 4ea67ffdac2629255b1b0ed4e9423823f62c0947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Mon, 15 Feb 2016 18:12:48 +0100 Subject: erts: Add BIF erts_internal:system_check/1 This commit implements erts_internal:system_check(schedulers) with the intent of a basic responsiveness test check of the schedulers. --- erts/emulator/beam/atom.names | 1 + erts/emulator/beam/bif.tab | 2 + erts/emulator/beam/erl_alloc.types | 2 + erts/emulator/beam/erl_bif_info.c | 15 ++++++ erts/emulator/beam/erl_process.c | 97 ++++++++++++++++++++++++++++++++++++ erts/emulator/beam/erl_process.h | 1 + erts/preloaded/src/erts_internal.erl | 20 ++++++++ 7 files changed, 138 insertions(+) diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index 07f6492948..cb6d294a41 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -269,6 +269,7 @@ atom getenv atom gather_gc_info_result atom gather_io_bytes atom gather_sched_wall_time_result +atom gather_system_check_result atom getting_linked atom getting_unlinked atom global diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab index 4f0656d174..3177d5dae7 100644 --- a/erts/emulator/beam/bif.tab +++ b/erts/emulator/beam/bif.tab @@ -174,6 +174,8 @@ bif erts_internal:time_unit/0 bif erts_internal:is_system_process/1 +bif erts_internal:system_check/1 + # inet_db support bif erlang:port_set_data/2 bif erlang:port_get_data/1 diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types index 1ecebdeb07..7738531142 100644 --- a/erts/emulator/beam/erl_alloc.types +++ b/erts/emulator/beam/erl_alloc.types @@ -369,6 +369,7 @@ type AINFO_REQ STANDARD_LOW SYSTEM alloc_info_request type SCHED_WTIME_REQ STANDARD_LOW SYSTEM sched_wall_time_request type GC_INFO_REQ STANDARD_LOW SYSTEM gc_info_request type PORT_DATA_HEAP STANDARD_LOW SYSTEM port_data_heap +type SYS_CHECK_REQ STANDARD_LOW SYSTEM system_check_request +else # "fullword" @@ -389,6 +390,7 @@ type AINFO_REQ SHORT_LIVED SYSTEM alloc_info_request type SCHED_WTIME_REQ SHORT_LIVED SYSTEM sched_wall_time_request type GC_INFO_REQ SHORT_LIVED SYSTEM gc_info_request type PORT_DATA_HEAP STANDARD SYSTEM port_data_heap +type SYS_CHECK_REQ SHORT_LIVED SYSTEM system_check_request +endif diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c index 414ff6711a..8bf6877bea 100644 --- a/erts/emulator/beam/erl_bif_info.c +++ b/erts/emulator/beam/erl_bif_info.c @@ -65,6 +65,7 @@ static Export* gather_io_bytes_trap = NULL; static Export *gather_sched_wall_time_res_trap; static Export *gather_gc_info_res_trap; +static Export *gather_system_check_res_trap; #define DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1) @@ -3784,6 +3785,18 @@ BIF_RETTYPE erts_internal_is_system_process_1(BIF_ALIST_1) BIF_ERROR(BIF_P, BADARG); } +BIF_RETTYPE erts_internal_system_check_1(BIF_ALIST_1) +{ + Eterm res; + if (ERTS_IS_ATOM_STR("schedulers", BIF_ARG_1)) { + res = erts_system_check_request(BIF_P); + if (is_non_value(res)) + BIF_RET(am_undefined); + BIF_TRAP1(gather_system_check_res_trap, BIF_P, res); + } + + BIF_ERROR(BIF_P, BADARG); +} static erts_smp_atomic_t hipe_test_reschedule_flag; @@ -4391,6 +4404,8 @@ erts_bif_info_init(void) = erts_export_put(am_erlang, am_gather_gc_info_result, 1); gather_io_bytes_trap = erts_export_put(am_erts_internal, am_gather_io_bytes, 2); + gather_system_check_res_trap + = erts_export_put(am_erts_internal, am_gather_system_check_result, 1); process_info_init(); os_info_init(); } diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c index b24f26138c..8e4a42fbe4 100644 --- a/erts/emulator/beam/erl_process.c +++ b/erts/emulator/beam/erl_process.c @@ -967,11 +967,25 @@ typedef struct { erts_smp_atomic32_t refc; } ErtsSchedWallTimeReq; +typedef struct { + Process *proc; + Eterm ref; + Eterm ref_heap[REF_THING_SIZE]; + Uint req_sched; + erts_smp_atomic32_t refc; +} ErtsSystemCheckReq; + + #if !HALFWORD_HEAP ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(swtreq, ErtsSchedWallTimeReq, 5, ERTS_ALC_T_SCHED_WTIME_REQ) + +ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(screq, + ErtsSystemCheckReq, + 5, + ERTS_ALC_T_SYS_CHECK_REQ) #else static ERTS_INLINE ErtsSchedWallTimeReq * swtreq_alloc(void) @@ -985,6 +999,19 @@ swtreq_free(ErtsSchedWallTimeReq *ptr) { erts_free(ERTS_ALC_T_SCHED_WTIME_REQ, ptr); } + +static ERTS_INLINE ErtsSystemCheckReq * +screq_alloc(void) +{ + return erts_alloc(ERTS_ALC_T_SYS_CHECK_REQ, + sizeof(ErtsSystemCheckReq)); +} + +static ERTS_INLINE void +screq_free(ErtsSystemCheckReq *ptr) +{ + erts_free(ERTS_ALC_T_SYS_CHECK_REQ, ptr); +} #endif static void @@ -1118,6 +1145,75 @@ erts_sched_wall_time_request(Process *c_p, int set, int enable) return ref; } +static void +reply_system_check(void *vscrp) +{ + ErtsSchedulerData *esdp = erts_get_scheduler_data(); + ErtsSystemCheckReq *scrp = (ErtsSystemCheckReq *) vscrp; + ErtsProcLocks rp_locks = (scrp->req_sched == esdp->no ? ERTS_PROC_LOCK_MAIN : 0); + Process *rp = scrp->proc; + Eterm msg; + Eterm *hp = NULL; + Eterm **hpp; + Uint sz; + ErlOffHeap *ohp = NULL; + ErlHeapFragment *bp = NULL; + + ASSERT(esdp); +#ifdef ERTS_DIRTY_SCHEDULERS + ASSERT(!ERTS_SCHEDULER_IS_DIRTY(esdp)); +#endif + + sz = REF_THING_SIZE; + hp = erts_alloc_message_heap(sz, &bp, &ohp, rp, &rp_locks); + hpp = &hp; + msg = STORE_NC(hpp, ohp, scrp->ref); + + erts_queue_message(rp, &rp_locks, bp, msg, NIL); + + if (scrp->req_sched == esdp->no) + rp_locks &= ~ERTS_PROC_LOCK_MAIN; + + if (rp_locks) + erts_smp_proc_unlock(rp, rp_locks); + + erts_proc_dec_refc(rp); + + if (erts_smp_atomic32_dec_read_nob(&scrp->refc) == 0) + screq_free(vscrp); +} + + +Eterm erts_system_check_request(Process *c_p) { + ErtsSchedulerData *esdp = ERTS_PROC_GET_SCHDATA(c_p); + Eterm ref; + ErtsSystemCheckReq *scrp; + Eterm *hp; + + scrp = screq_alloc(); + ref = erts_make_ref(c_p); + hp = &scrp->ref_heap[0]; + + scrp->proc = c_p; + scrp->ref = STORE_NC(&hp, NULL, ref); + scrp->req_sched = esdp->no; + erts_smp_atomic32_init_nob(&scrp->refc, (erts_aint32_t) erts_no_schedulers); + + erts_proc_add_refc(c_p, (Sint) erts_no_schedulers); + +#ifdef ERTS_SMP + if (erts_no_schedulers > 1) + erts_schedule_multi_misc_aux_work(1, + erts_no_schedulers, + reply_system_check, + (void *) scrp); +#endif + + reply_system_check((void *) scrp); + + return ref; +} + static ERTS_INLINE ErtsProcList * proclist_create(Process *p) { @@ -5792,6 +5888,7 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online init_misc_aux_work(); #if !HALFWORD_HEAP init_swtreq_alloc(); + init_screq_alloc(); #endif erts_atomic32_init_nob(&debug_wait_completed_count, 0); /* debug only */ diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h index 799e49005c..f1570a835c 100644 --- a/erts/emulator/beam/erl_process.h +++ b/erts/emulator/beam/erl_process.h @@ -1483,6 +1483,7 @@ void erts_init_scheduling(int, int int erts_set_gc_state(Process *c_p, int enable); Eterm erts_sched_wall_time_request(Process *c_p, int set, int enable); +Eterm erts_system_check_request(Process *c_p); Eterm erts_gc_info_request(Process *c_p); Uint64 erts_get_proc_interval(void); Uint64 erts_ensure_later_proc_interval(Uint64); diff --git a/erts/preloaded/src/erts_internal.erl b/erts/preloaded/src/erts_internal.erl index 7ed4efea4b..6db77a8482 100644 --- a/erts/preloaded/src/erts_internal.erl +++ b/erts/preloaded/src/erts_internal.erl @@ -35,6 +35,9 @@ -export([port_command/3, port_connect/2, port_close/1, port_control/3, port_call/3, port_info/1, port_info/2]). +-export([system_check/1, + gather_system_check_result/1]). + -export([request_system_task/3]). -export([check_process_code/2]). @@ -197,6 +200,23 @@ request_system_task(_Pid, _Prio, _Request) -> check_process_code(_Module, _OptionList) -> erlang:nif_error(undefined). +-spec system_check(Type) -> 'ok' when + Type :: 'schedulers'. + +system_check(_Type) -> + erlang:nif_error(undefined). + +gather_system_check_result(Ref) when is_reference(Ref) -> + gather_system_check_result(Ref, erlang:system_info(schedulers)). + +gather_system_check_result(_Ref, 0) -> + ok; +gather_system_check_result(Ref, N) -> + receive + Ref -> + gather_system_check_result(Ref, N - 1) + end. + %% term compare where integer() < float() = true -spec cmp_term(A,B) -> Result when -- cgit v1.2.3 From f58e74b08380758d97c4d8a1d9ef53217f73fe6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Mon, 15 Feb 2016 18:16:18 +0100 Subject: kernel: Add basic system check of schedulers on heartbeat Before a heartbeat to the port program a responsiveness check of the schedulers is performed. If the responsiveness check fails, stalls, the heartbeat will not be performed (as intended). --- lib/kernel/src/heart.erl | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/kernel/src/heart.erl b/lib/kernel/src/heart.erl index ad3bbf2f7a..617ae2f91b 100644 --- a/lib/kernel/src/heart.erl +++ b/lib/kernel/src/heart.erl @@ -333,6 +333,7 @@ get_heart_cmd(Port) -> %% if something goes wront -> no heartbeat. check_callback(Callback) -> + ok = erts_internal:system_check(schedulers), case Callback of undefined -> ok; {M,F} -> -- cgit v1.2.3 From 69dc6221cdca3c1ea55fdd55fad2a0de0b882d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Tue, 16 Feb 2016 14:59:44 +0100 Subject: kernel: Add heart callback test --- lib/kernel/test/heart_SUITE.erl | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/lib/kernel/test/heart_SUITE.erl b/lib/kernel/test/heart_SUITE.erl index 83efbb4c35..b942d24bf9 100644 --- a/lib/kernel/test/heart_SUITE.erl +++ b/lib/kernel/test/heart_SUITE.erl @@ -27,6 +27,7 @@ node_start_immediately_after_crash/1, node_start_soon_after_crash/1, set_cmd/1, clear_cmd/1, get_cmd/1, + callback_api/1, dont_drop/1, kill_pid/1]). -export([init_per_testcase/2, end_per_testcase/2]). @@ -66,6 +67,7 @@ all() -> [ node_start_immediately_after_crash, node_start_soon_after_crash, set_cmd, clear_cmd, get_cmd, + callback_api, kill_pid ]. @@ -358,6 +360,41 @@ get_cmd(Config) when is_list(Config) -> stop_node(Node), ok. +callback_api(Config) when is_list(Config) -> + {ok, Node} = start_check(slave, heart_test), + none = rpc:call(Node, heart, get_callback, []), + M0 = self(), + F0 = ok, + {error, {bad_callback, {M0,F0}}} = rpc:call(Node, heart, set_callback, [M0,F0]), + none = rpc:call(Node, heart, get_callback, []), + M1 = lists:duplicate(28, $a), + F1 = lists:duplicate(28, $b), + {error, {bad_callback, {M1,F1}}} = rpc:call(Node, heart, set_callback, [M1,F1]), + none = rpc:call(Node, heart, get_callback, []), + + M2 = heart_check_module, + F2 = cb_ok, + F3 = cb_error, + Code0 = generate(M2, [], [ + atom_to_list(F2) ++ "() -> ok.", + atom_to_list(F3) ++ "() -> exit(\"callback_error (as intended)\")." + ]), + {module, M2} = rpc:call(Node, erlang, load_module, [M2, Code0]), + ok = rpc:call(Node, M2, F2, []), + ok = rpc:call(Node, heart, set_callback, [M2,F2]), + {ok, {M2,F2}} = rpc:call(Node, heart, get_callback, []), + ok = rpc:call(Node, heart, clear_callback, []), + none = rpc:call(Node, heart, get_callback, []), + ok = rpc:call(Node, heart, set_callback, [M2,F2]), + {ok, {M2,F2}} = rpc:call(Node, heart, get_callback, []), + ok = rpc:call(Node, heart, set_callback, [M2,F3]), + receive {nodedown, Node} -> ok + after 5000 -> test_server:fail(node_not_killed) + end, + stop_node(Node), + ok. + + dont_drop(suite) -> %%% Removed as it may crash epmd/distribution in colourful %%% ways. While we ARE finding out WHY, it would -- cgit v1.2.3 From a50a75e4ff842b600ba526a37e64ac8006fa8249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Tue, 16 Feb 2016 10:43:51 +0100 Subject: heart: Remove dead code --- erts/etc/common/heart.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/erts/etc/common/heart.c b/erts/etc/common/heart.c index 9571b83ffd..1a826221fb 100644 --- a/erts/etc/common/heart.c +++ b/erts/etc/common/heart.c @@ -472,10 +472,6 @@ message_loop(erlin_fd, erlout_fd) switch (mp->op) { case HEART_BEAT: timestamp(&last_received); -#ifdef USE_WATCHDOG - /* reset the hardware watchdog timer */ - wd_reset(); -#endif break; case SHUT_DOWN: return R_SHUT_DOWN; -- cgit v1.2.3 From b727eb1d46cc4238e4fe7e6070f41dbfe2d21691 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Tue, 16 Feb 2016 10:09:04 +0100 Subject: Update preloaded erts_internal.beam --- erts/preloaded/ebin/erts_internal.beam | Bin 5960 -> 6516 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/erts/preloaded/ebin/erts_internal.beam b/erts/preloaded/ebin/erts_internal.beam index 32d5d70122..21dde7f257 100644 Binary files a/erts/preloaded/ebin/erts_internal.beam and b/erts/preloaded/ebin/erts_internal.beam differ -- cgit v1.2.3 From a3ed00b212fedb7dbe9dc5daf4c3beaf2c4dbe9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Tue, 16 Feb 2016 16:02:34 +0100 Subject: kernel: Fix heart dialyzer types --- lib/kernel/src/heart.erl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/kernel/src/heart.erl b/lib/kernel/src/heart.erl index 617ae2f91b..686e2e03ff 100644 --- a/lib/kernel/src/heart.erl +++ b/lib/kernel/src/heart.erl @@ -54,7 +54,7 @@ -record(state,{port :: port(), cmd :: [] | binary(), - callback :: 'undefined' | {module(), function()}}). + callback :: 'undefined' | {atom(), atom()}}). %%--------------------------------------------------------------------- @@ -120,15 +120,16 @@ clear_cmd() -> wait(). -spec set_callback(Module,Function) -> 'ok' | {'error', {'bad_callback', {Module, Function}}} when - Module :: module(), - Function :: function(). + Module :: atom(), + Function :: atom(). set_callback(Module, Function) -> ?MODULE ! {self(), set_callback, {Module,Function}}, wait(). --spec get_callback() -> {'ok', Callback} | 'none' when - Callback :: {module(), function()}. +-spec get_callback() -> {'ok', {Module, Function}} | 'none' when + Module :: atom(), + Function :: atom(). get_callback() -> ?MODULE ! {self(), get_callback}, -- cgit v1.2.3 From bfdc47eca9d322680ccda988e4a1fae57f433e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Tue, 16 Feb 2016 16:09:52 +0100 Subject: kernel: Add heart validation callback documentation --- lib/kernel/doc/src/heart.xml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/lib/kernel/doc/src/heart.xml b/lib/kernel/doc/src/heart.xml index b9fad17ce1..f32b74dc24 100644 --- a/lib/kernel/doc/src/heart.xml +++ b/lib/kernel/doc/src/heart.xml @@ -154,6 +154,34 @@ the empty string will be returned.

+ + + + Set a validation callback + +

This validation callback will be executed before any heartbeat sent + to the port program. For the validation to succeed it needs to return + with the value ok. +

+

An exception within the callback will be treated as a validation failure.

+

The callback will be removed if the system reboots.

+
+
+ + + Clear the validation callback + +

Removes the validation callback call before heartbeats.

+
+
+ + + Get the validation callback + +

Get the validation callback. If the callback is cleared, none will be returned.

+
+
+ -- cgit v1.2.3 From f623b7430ec8f7bc48770953b32a486816ad6eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Mon, 22 Feb 2016 16:59:49 +0100 Subject: kernel: Add builtin scheduler check for heart In addition, the heart API is extended with the following functions: * heart:set_options/1 * heart:get_options/0 If heart:set_options([scheduler]) is set, heart will check scheduler responsiveness before every heartbeat to the heart port. --- lib/kernel/src/heart.erl | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/lib/kernel/src/heart.erl b/lib/kernel/src/heart.erl index 686e2e03ff..273bdcb352 100644 --- a/lib/kernel/src/heart.erl +++ b/lib/kernel/src/heart.erl @@ -37,6 +37,7 @@ -export([start/0, init/2, set_cmd/1, clear_cmd/0, get_cmd/0, set_callback/2, clear_callback/0, get_callback/0, + set_options/1, get_options/0, cycle/0]). -define(START_ACK, 1). @@ -54,6 +55,7 @@ -record(state,{port :: port(), cmd :: [] | binary(), + options :: [atom()], callback :: 'undefined' | {atom(), atom()}}). %%--------------------------------------------------------------------- @@ -92,7 +94,7 @@ init(Starter, Parent) -> case catch start_portprogram() of {ok, Port} -> Starter ! {ok, self()}, - loop(Parent, #state{port = Port, cmd = []}); + loop(Parent, #state{port=Port, cmd=[], options=[]}); no_heart -> Starter ! {no_heart, self()}; error -> @@ -141,6 +143,19 @@ clear_callback() -> ?MODULE ! {self(), clear_callback}, wait(). +-spec set_options(Options) -> 'ok' | {'error', {'bad_options', Options}} when + Options :: [atom()]. + +set_options(Options) -> + ?MODULE ! {self(), set_options, Options}, + wait(). + +-spec get_options() -> {'ok', Options} | 'none' when + Options :: [atom()]. + +get_options() -> + ?MODULE ! {self(), get_options}, + wait(). %%% Should be used solely by the release handler!!!!!!! -spec cycle() -> 'ok' | {'error', term()}. @@ -253,6 +268,22 @@ loop(Parent, #state{port=Port}=S) -> {From, clear_callback} -> From ! {?MODULE, ok}, loop(Parent, S#state{callback=undefined}); + {From, set_options, Options} -> + case validate_options(Options) of + Validated when is_list(Validated) -> + From ! {?MODULE, ok}, + loop(Parent, S#state{options=Validated}); + _ -> + From ! {?MODULE, {error, {bad_options, Options}}}, + loop(Parent, S) + end; + {From, get_options} -> + Res = case S#state.options of + [] -> none; + Cb -> {ok, Cb} + end, + From ! {?MODULE, Res}, + loop(Parent, S); {From, cycle} -> %% Calls back to loop do_cycle_port_program(From, Parent, S); @@ -281,6 +312,11 @@ no_reboot_shutdown(Port) -> exit(normal) end. +validate_options(Opts) -> validate_options(Opts,[]). +validate_options([],Res) -> Res; +validate_options([scheduler=Opt|Opts],Res) -> validate_options(Opts,[Opt|Res]); +validate_options(_,_) -> error. + do_cycle_port_program(Caller, Parent, #state{port=Port} = S) -> unregister(?HEART_PORT_NAME), case catch start_portprogram() of @@ -310,7 +346,8 @@ do_cycle_port_program(Caller, Parent, #state{port=Port} = S) -> %% "Beates" the heart once. -send_heart_beat(#state{port=Port, callback=Cb}) -> +send_heart_beat(#state{port=Port, callback=Cb, options=Opts}) -> + ok = check_system(Opts), ok = check_callback(Cb), Port ! {self(), {command, [?HEART_BEAT]}}. @@ -327,6 +364,11 @@ get_heart_cmd(Port) -> {ok, Cmd} end. +check_system([]) -> ok; +check_system([scheduler|Opts]) -> + ok = erts_internal:system_check(schedulers), + check_system(Opts). + %% validate system by performing a check before the heartbeat %% return 'ok' if everything is alright. %% Terminate if with reason if something is a miss. @@ -334,7 +376,6 @@ get_heart_cmd(Port) -> %% if something goes wront -> no heartbeat. check_callback(Callback) -> - ok = erts_internal:system_check(schedulers), case Callback of undefined -> ok; {M,F} -> -- cgit v1.2.3 From 4603e0d2b49e7ab488698a22b7831d3f3f113166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Mon, 22 Feb 2016 17:45:21 +0100 Subject: kernel: Add heart options documentation --- lib/kernel/doc/src/heart.xml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/lib/kernel/doc/src/heart.xml b/lib/kernel/doc/src/heart.xml index f32b74dc24..a4569b5fb8 100644 --- a/lib/kernel/doc/src/heart.xml +++ b/lib/kernel/doc/src/heart.xml @@ -182,6 +182,34 @@ + + + Set a list of options + +

Valid options set_options are:

+ + scheduler + +

If enabled, a message will be sent to each scheduler to check its + responsiveness. The system check occurs before any heartbeat sent + to the port program. If any scheduler is not responsive enough the + heart program will not receive its heartpeat and terminate the node. +

+
+
+

Returns with the value ok if the options are valid.

+
+
+ + + Get the temporary reboot command + +

Returns {ok, Options} where Options is a list of current options enabled for heart. + If the callback is cleared, none will be returned.

+
+
+ + -- cgit v1.2.3 From bedde8c59e568b15bee4b36dc4e4eaa93d00f63a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Tue, 23 Feb 2016 16:15:10 +0100 Subject: kernel: Add heart options test --- lib/kernel/test/heart_SUITE.erl | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/lib/kernel/test/heart_SUITE.erl b/lib/kernel/test/heart_SUITE.erl index b942d24bf9..6cf451b10b 100644 --- a/lib/kernel/test/heart_SUITE.erl +++ b/lib/kernel/test/heart_SUITE.erl @@ -28,6 +28,7 @@ node_start_soon_after_crash/1, set_cmd/1, clear_cmd/1, get_cmd/1, callback_api/1, + options_api/1, dont_drop/1, kill_pid/1]). -export([init_per_testcase/2, end_per_testcase/2]). @@ -68,6 +69,7 @@ all() -> [ node_start_soon_after_crash, set_cmd, clear_cmd, get_cmd, callback_api, + options_api, kill_pid ]. @@ -394,6 +396,34 @@ callback_api(Config) when is_list(Config) -> stop_node(Node), ok. +options_api(Config) when is_list(Config) -> + {ok, Node} = start_check(slave, heart_test), + none = rpc:call(Node, heart, get_options, []), + M0 = self(), + F0 = ok, + {error, {bad_options, {M0,F0}}} = rpc:call(Node, heart, set_options, [{M0,F0}]), + none = rpc:call(Node, heart, get_options, []), + Ls = lists:duplicate(28, $b), + {error, {bad_options, Ls}} = rpc:call(Node, heart, set_options, [Ls]), + none = rpc:call(Node, heart, get_options, []), + + ok = rpc:call(Node, heart, set_options, [[scheduler]]), + {ok, [scheduler]} = rpc:call(Node, heart, get_options, []), + ok = rpc:call(Node, heart, set_options, [[]]), + none = rpc:call(Node, heart, get_options, []), + + ok = rpc:call(Node, heart, set_options, [[scheduler]]), + {ok, [scheduler]} = rpc:call(Node, heart, get_options, []), + {error, {bad_options, Ls}} = rpc:call(Node, heart, set_options, [Ls]), + {ok, [scheduler]} = rpc:call(Node, heart, get_options, []), + + receive after 3000 -> ok end, %% wait 3 secs + + ok = rpc:call(Node, heart, set_options, [[]]), + none = rpc:call(Node, heart, get_options, []), + stop_node(Node), + ok. + dont_drop(suite) -> %%% Removed as it may crash epmd/distribution in colourful -- cgit v1.2.3 From 0ca09ee90d6384e74d1b18ab0e05f2c05fc03905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn-Egil=20Dahlberg?= Date: Thu, 25 Feb 2016 15:13:32 +0100 Subject: kernel: Clarify heart option Change scheduler responsiveness to 'check_schedulers'. --- lib/kernel/doc/src/heart.xml | 13 ++++++++++--- lib/kernel/src/heart.erl | 13 +++++++++---- lib/kernel/test/heart_SUITE.erl | 10 +++++----- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/lib/kernel/doc/src/heart.xml b/lib/kernel/doc/src/heart.xml index a4569b5fb8..9da4773f2d 100644 --- a/lib/kernel/doc/src/heart.xml +++ b/lib/kernel/doc/src/heart.xml @@ -118,6 +118,13 @@

In the following descriptions, all function fails with reason badarg if heart is not started.

+ + + + + + + @@ -188,12 +195,12 @@

Valid options set_options are:

- scheduler + check_schedulers -

If enabled, a message will be sent to each scheduler to check its +

If enabled, a signal will be sent to each scheduler to check its responsiveness. The system check occurs before any heartbeat sent to the port program. If any scheduler is not responsive enough the - heart program will not receive its heartpeat and terminate the node. + heart program will not receive its heartbeat and thus eventually terminate the node.

diff --git a/lib/kernel/src/heart.erl b/lib/kernel/src/heart.erl index 273bdcb352..d14598cd05 100644 --- a/lib/kernel/src/heart.erl +++ b/lib/kernel/src/heart.erl @@ -53,9 +53,14 @@ -define(CYCLE_TIMEOUT, 10000). -define(HEART_PORT_NAME, heart_port). +%% valid heart options +-define(SCHEDULER_CHECK_OPT, check_schedulers). + +-type heart_option() :: ?SCHEDULER_CHECK_OPT. + -record(state,{port :: port(), cmd :: [] | binary(), - options :: [atom()], + options :: [heart_option()], callback :: 'undefined' | {atom(), atom()}}). %%--------------------------------------------------------------------- @@ -144,7 +149,7 @@ clear_callback() -> wait(). -spec set_options(Options) -> 'ok' | {'error', {'bad_options', Options}} when - Options :: [atom()]. + Options :: [heart_option()]. set_options(Options) -> ?MODULE ! {self(), set_options, Options}, @@ -314,7 +319,7 @@ no_reboot_shutdown(Port) -> validate_options(Opts) -> validate_options(Opts,[]). validate_options([],Res) -> Res; -validate_options([scheduler=Opt|Opts],Res) -> validate_options(Opts,[Opt|Res]); +validate_options([?SCHEDULER_CHECK_OPT=Opt|Opts],Res) -> validate_options(Opts,[Opt|Res]); validate_options(_,_) -> error. do_cycle_port_program(Caller, Parent, #state{port=Port} = S) -> @@ -365,7 +370,7 @@ get_heart_cmd(Port) -> end. check_system([]) -> ok; -check_system([scheduler|Opts]) -> +check_system([?SCHEDULER_CHECK_OPT|Opts]) -> ok = erts_internal:system_check(schedulers), check_system(Opts). diff --git a/lib/kernel/test/heart_SUITE.erl b/lib/kernel/test/heart_SUITE.erl index 6cf451b10b..39cd29cea0 100644 --- a/lib/kernel/test/heart_SUITE.erl +++ b/lib/kernel/test/heart_SUITE.erl @@ -407,15 +407,15 @@ options_api(Config) when is_list(Config) -> {error, {bad_options, Ls}} = rpc:call(Node, heart, set_options, [Ls]), none = rpc:call(Node, heart, get_options, []), - ok = rpc:call(Node, heart, set_options, [[scheduler]]), - {ok, [scheduler]} = rpc:call(Node, heart, get_options, []), + ok = rpc:call(Node, heart, set_options, [[check_schedulers]]), + {ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []), ok = rpc:call(Node, heart, set_options, [[]]), none = rpc:call(Node, heart, get_options, []), - ok = rpc:call(Node, heart, set_options, [[scheduler]]), - {ok, [scheduler]} = rpc:call(Node, heart, get_options, []), + ok = rpc:call(Node, heart, set_options, [[check_schedulers]]), + {ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []), {error, {bad_options, Ls}} = rpc:call(Node, heart, set_options, [Ls]), - {ok, [scheduler]} = rpc:call(Node, heart, get_options, []), + {ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []), receive after 3000 -> ok end, %% wait 3 secs -- cgit v1.2.3