diff options
Diffstat (limited to 'lib/kernel')
-rw-r--r-- | lib/kernel/doc/src/heart.xml | 63 | ||||
-rw-r--r-- | lib/kernel/src/heart.erl | 182 | ||||
-rw-r--r-- | lib/kernel/test/heart_SUITE.erl | 67 |
3 files changed, 276 insertions, 36 deletions
diff --git a/lib/kernel/doc/src/heart.xml b/lib/kernel/doc/src/heart.xml index b9fad17ce1..9da4773f2d 100644 --- a/lib/kernel/doc/src/heart.xml +++ b/lib/kernel/doc/src/heart.xml @@ -118,6 +118,13 @@ <p>In the following descriptions, all function fails with reason <c>badarg</c> if <c>heart</c> is not started.</p> </description> + + <datatypes> + <datatype> + <name name="heart_option"/> + </datatype> + </datatypes> + <funcs> <func> <name name="set_cmd" arity="1"/> @@ -154,6 +161,62 @@ the empty string will be returned.</p> </desc> </func> + + <func> + <name name="set_callback" arity="2"/> + <fsummary>Set a validation callback</fsummary> + <desc> + <p> This validation callback will be executed before any heartbeat sent + to the port program. For the validation to succeed it needs to return + with the value <c>ok</c>. + </p> + <p> An exception within the callback will be treated as a validation failure. </p> + <p> The callback will be removed if the system reboots. </p> + </desc> + </func> + <func> + <name name="clear_callback" arity="0"/> + <fsummary>Clear the validation callback</fsummary> + <desc> + <p>Removes the validation callback call before heartbeats.</p> + </desc> + </func> + <func> + <name name="get_callback" arity="0"/> + <fsummary>Get the validation callback</fsummary> + <desc> + <p>Get the validation callback. If the callback is cleared, <c>none</c> will be returned.</p> + </desc> + </func> + + <func> + <name name="set_options" arity="1"/> + <fsummary>Set a list of options</fsummary> + <desc> + <p> Valid options <c>set_options</c> are: </p> + <taglist> + <tag><c>check_schedulers</c></tag> + <item> + <p>If enabled, a signal will be sent to each scheduler to check its + responsiveness. The system check occurs before any heartbeat sent + to the port program. If any scheduler is not responsive enough the + heart program will not receive its heartbeat and thus eventually terminate the node. + </p> + </item> + </taglist> + <p> Returns with the value <c>ok</c> if the options are valid.</p> + </desc> + </func> + <func> + <name name="get_options" arity="0"/> + <fsummary>Get the temporary reboot command</fsummary> + <desc> + <p>Returns <c>{ok, Options}</c> where <c>Options</c> is a list of current options enabled for heart. + If the callback is cleared, <c>none</c> will be returned.</p> + </desc> + </func> + + </funcs> </erlref> diff --git a/lib/kernel/src/heart.erl b/lib/kernel/src/heart.erl index 137fad706f..eea78aabdf 100644 --- a/lib/kernel/src/heart.erl +++ b/lib/kernel/src/heart.erl @@ -34,7 +34,11 @@ %%% %%% It recognizes the flag '-heart' %%%-------------------------------------------------------------------- --export([start/0, init/2, set_cmd/1, clear_cmd/0, get_cmd/0, cycle/0]). +-export([start/0, init/2, + set_cmd/1, clear_cmd/0, get_cmd/0, + set_callback/2, clear_callback/0, get_callback/0, + set_options/1, get_options/0, + cycle/0]). -define(START_ACK, 1). -define(HEART_BEAT, 2). @@ -49,6 +53,16 @@ -define(CYCLE_TIMEOUT, 10000). -define(HEART_PORT_NAME, heart_port). +%% valid heart options +-define(SCHEDULER_CHECK_OPT, check_schedulers). + +-type heart_option() :: ?SCHEDULER_CHECK_OPT. + +-record(state,{port :: port(), + cmd :: [] | binary(), + options :: [heart_option()], + callback :: 'undefined' | {atom(), atom()}}). + %%--------------------------------------------------------------------- -spec start() -> 'ignore' | {'error', term()} | {'ok', pid()}. @@ -81,11 +95,11 @@ wait_for_init_ack(From) -> init(Starter, Parent) -> process_flag(trap_exit, true), process_flag(priority, max), - register(heart, self()), + register(?MODULE, self()), case catch start_portprogram() of {ok, Port} -> Starter ! {ok, self()}, - loop(Parent, Port, ""); + loop(Parent, #state{port=Port, cmd=[], options=[]}); no_heart -> Starter ! {no_heart, self()}; error -> @@ -96,33 +110,68 @@ init(Starter, Parent) -> Cmd :: string(). set_cmd(Cmd) -> - heart ! {self(), set_cmd, Cmd}, + ?MODULE ! {self(), set_cmd, Cmd}, wait(). -spec get_cmd() -> {ok, Cmd} when Cmd :: string(). get_cmd() -> - heart ! {self(), get_cmd}, + ?MODULE ! {self(), get_cmd}, wait(). -spec clear_cmd() -> ok. clear_cmd() -> - heart ! {self(), clear_cmd}, + ?MODULE ! {self(), clear_cmd}, + wait(). + +-spec set_callback(Module,Function) -> 'ok' | {'error', {'bad_callback', {Module, Function}}} when + Module :: atom(), + Function :: atom(). + +set_callback(Module, Function) -> + ?MODULE ! {self(), set_callback, {Module,Function}}, + wait(). + +-spec get_callback() -> {'ok', {Module, Function}} | 'none' when + Module :: atom(), + Function :: atom(). + +get_callback() -> + ?MODULE ! {self(), get_callback}, + wait(). + +-spec clear_callback() -> ok. + +clear_callback() -> + ?MODULE ! {self(), clear_callback}, + wait(). + +-spec set_options(Options) -> 'ok' | {'error', {'bad_options', Options}} when + Options :: [heart_option()]. + +set_options(Options) -> + ?MODULE ! {self(), set_options, Options}, wait(). +-spec get_options() -> {'ok', Options} | 'none' when + Options :: [atom()]. + +get_options() -> + ?MODULE ! {self(), get_options}, + wait(). %%% Should be used solely by the release handler!!!!!!! -spec cycle() -> 'ok' | {'error', term()}. cycle() -> - heart ! {self(), cycle}, + ?MODULE ! {self(), cycle}, wait(). wait() -> receive - {heart, Res} -> + {?MODULE, Res} -> Res end. @@ -182,8 +231,8 @@ wait_ack(Port) -> {error, Reason} end. -loop(Parent, Port, Cmd) -> - _ = send_heart_beat(Port), +loop(Parent, #state{port=Port}=S) -> + _ = send_heart_beat(S), receive {From, set_cmd, NewCmd0} -> Enc = file:native_name_encoding(), @@ -191,37 +240,72 @@ loop(Parent, Port, Cmd) -> NewCmd when is_binary(NewCmd), byte_size(NewCmd) < 2047 -> _ = send_heart_cmd(Port, NewCmd), _ = wait_ack(Port), - From ! {heart, ok}, - loop(Parent, Port, NewCmd); + From ! {?MODULE, ok}, + loop(Parent, S#state{cmd=NewCmd}); _ -> - From ! {heart, {error, {bad_cmd, NewCmd0}}}, - loop(Parent, Port, Cmd) + From ! {?MODULE, {error, {bad_cmd, NewCmd0}}}, + loop(Parent, S) end; {From, clear_cmd} -> - From ! {heart, ok}, - _ = send_heart_cmd(Port, ""), + From ! {?MODULE, ok}, + _ = send_heart_cmd(Port, []), _ = wait_ack(Port), - loop(Parent, Port, ""); + loop(Parent, S#state{cmd = []}); {From, get_cmd} -> - From ! {heart, get_heart_cmd(Port)}, - loop(Parent, Port, Cmd); + From ! {?MODULE, get_heart_cmd(Port)}, + loop(Parent, S); + {From, set_callback, Callback} -> + case Callback of + {M,F} when is_atom(M), is_atom(F) -> + From ! {?MODULE, ok}, + loop(Parent, S#state{callback=Callback}); + _ -> + From ! {?MODULE, {error, {bad_callback, Callback}}}, + loop(Parent, S) + end; + {From, get_callback} -> + Res = case S#state.callback of + undefined -> none; + Cb -> {ok, Cb} + end, + From ! {?MODULE, Res}, + loop(Parent, S); + {From, clear_callback} -> + From ! {?MODULE, ok}, + loop(Parent, S#state{callback=undefined}); + {From, set_options, Options} -> + case validate_options(Options) of + Validated when is_list(Validated) -> + From ! {?MODULE, ok}, + loop(Parent, S#state{options=Validated}); + _ -> + From ! {?MODULE, {error, {bad_options, Options}}}, + loop(Parent, S) + end; + {From, get_options} -> + Res = case S#state.options of + [] -> none; + Cb -> {ok, Cb} + end, + From ! {?MODULE, Res}, + loop(Parent, S); {From, cycle} -> %% Calls back to loop - do_cycle_port_program(From, Parent, Port, Cmd); + do_cycle_port_program(From, Parent, S); {'EXIT', Parent, shutdown} -> no_reboot_shutdown(Port); {'EXIT', Parent, Reason} -> exit(Port, Reason), exit(Reason); {'EXIT', Port, badsig} -> % we can ignore badsig-messages! - loop(Parent, Port, Cmd); + loop(Parent, S); {'EXIT', Port, _Reason} -> - exit({port_terminated, {heart, loop, [Parent, Port, Cmd]}}); + exit({port_terminated, {?MODULE, loop, [Parent, S]}}); _ -> - loop(Parent, Port, Cmd) + loop(Parent, S) after ?TIMEOUT -> - loop(Parent, Port, Cmd) + loop(Parent, S) end. -spec no_reboot_shutdown(port()) -> no_return(). @@ -233,36 +317,44 @@ no_reboot_shutdown(Port) -> exit(normal) end. -do_cycle_port_program(Caller, Parent, Port, Cmd) -> +validate_options(Opts) -> validate_options(Opts,[]). +validate_options([],Res) -> Res; +validate_options([?SCHEDULER_CHECK_OPT=Opt|Opts],Res) -> validate_options(Opts,[Opt|Res]); +validate_options(_,_) -> error. + +do_cycle_port_program(Caller, Parent, #state{port=Port} = S) -> unregister(?HEART_PORT_NAME), case catch start_portprogram() of {ok, NewPort} -> _ = send_shutdown(Port), receive {'EXIT', Port, _Reason} -> - _ = send_heart_cmd(NewPort, Cmd), - Caller ! {heart, ok}, - loop(Parent, NewPort, Cmd) + _ = send_heart_cmd(NewPort, S#state.cmd), + Caller ! {?MODULE, ok}, + loop(Parent, S#state{port=NewPort}) after ?CYCLE_TIMEOUT -> %% Huh! Two heart port programs running... %% well, the old one has to be sick not to respond %% so we'll settle for the new one... - _ = send_heart_cmd(NewPort, Cmd), - Caller ! {heart, {error, stop_error}}, - loop(Parent, NewPort, Cmd) + _ = send_heart_cmd(NewPort, S#state.cmd), + Caller ! {?MODULE, {error, stop_error}}, + loop(Parent, S#state{port=NewPort}) end; no_heart -> - Caller ! {heart, {error, no_heart}}, - loop(Parent, Port, Cmd); + Caller ! {?MODULE, {error, no_heart}}, + loop(Parent, S); error -> - Caller ! {heart, {error, start_error}}, - loop(Parent, Port, Cmd) + Caller ! {?MODULE, {error, start_error}}, + loop(Parent, S) end. %% "Beates" the heart once. -send_heart_beat(Port) -> Port ! {self(), {command, [?HEART_BEAT]}}. +send_heart_beat(#state{port=Port, callback=Cb, options=Opts}) -> + ok = check_system(Opts), + ok = check_callback(Cb), + Port ! {self(), {command, [?HEART_BEAT]}}. %% Set a new HEART_COMMAND. -dialyzer({no_improper_lists, send_heart_cmd/2}). @@ -278,6 +370,24 @@ get_heart_cmd(Port) -> {ok, Cmd} end. +check_system([]) -> ok; +check_system([?SCHEDULER_CHECK_OPT|Opts]) -> + ok = erts_internal:system_check(schedulers), + check_system(Opts). + +%% validate system by performing a check before the heartbeat +%% return 'ok' if everything is alright. +%% Terminate if with reason if something is a miss. +%% It is fine to timeout in the callback, in fact that is the intention +%% if something goes wront -> no heartbeat. + +check_callback(Callback) -> + case Callback of + undefined -> ok; + {M,F} -> + erlang:apply(M,F,[]) + end. + %% Sends shutdown command to the port. send_shutdown(Port) -> Port ! {self(), {command, [?SHUT_DOWN]}}. diff --git a/lib/kernel/test/heart_SUITE.erl b/lib/kernel/test/heart_SUITE.erl index 83efbb4c35..39cd29cea0 100644 --- a/lib/kernel/test/heart_SUITE.erl +++ b/lib/kernel/test/heart_SUITE.erl @@ -27,6 +27,8 @@ node_start_immediately_after_crash/1, node_start_soon_after_crash/1, set_cmd/1, clear_cmd/1, get_cmd/1, + callback_api/1, + options_api/1, dont_drop/1, kill_pid/1]). -export([init_per_testcase/2, end_per_testcase/2]). @@ -66,6 +68,8 @@ all() -> [ node_start_immediately_after_crash, node_start_soon_after_crash, set_cmd, clear_cmd, get_cmd, + callback_api, + options_api, kill_pid ]. @@ -358,6 +362,69 @@ get_cmd(Config) when is_list(Config) -> stop_node(Node), ok. +callback_api(Config) when is_list(Config) -> + {ok, Node} = start_check(slave, heart_test), + none = rpc:call(Node, heart, get_callback, []), + M0 = self(), + F0 = ok, + {error, {bad_callback, {M0,F0}}} = rpc:call(Node, heart, set_callback, [M0,F0]), + none = rpc:call(Node, heart, get_callback, []), + M1 = lists:duplicate(28, $a), + F1 = lists:duplicate(28, $b), + {error, {bad_callback, {M1,F1}}} = rpc:call(Node, heart, set_callback, [M1,F1]), + none = rpc:call(Node, heart, get_callback, []), + + M2 = heart_check_module, + F2 = cb_ok, + F3 = cb_error, + Code0 = generate(M2, [], [ + atom_to_list(F2) ++ "() -> ok.", + atom_to_list(F3) ++ "() -> exit(\"callback_error (as intended)\")." + ]), + {module, M2} = rpc:call(Node, erlang, load_module, [M2, Code0]), + ok = rpc:call(Node, M2, F2, []), + ok = rpc:call(Node, heart, set_callback, [M2,F2]), + {ok, {M2,F2}} = rpc:call(Node, heart, get_callback, []), + ok = rpc:call(Node, heart, clear_callback, []), + none = rpc:call(Node, heart, get_callback, []), + ok = rpc:call(Node, heart, set_callback, [M2,F2]), + {ok, {M2,F2}} = rpc:call(Node, heart, get_callback, []), + ok = rpc:call(Node, heart, set_callback, [M2,F3]), + receive {nodedown, Node} -> ok + after 5000 -> test_server:fail(node_not_killed) + end, + stop_node(Node), + ok. + +options_api(Config) when is_list(Config) -> + {ok, Node} = start_check(slave, heart_test), + none = rpc:call(Node, heart, get_options, []), + M0 = self(), + F0 = ok, + {error, {bad_options, {M0,F0}}} = rpc:call(Node, heart, set_options, [{M0,F0}]), + none = rpc:call(Node, heart, get_options, []), + Ls = lists:duplicate(28, $b), + {error, {bad_options, Ls}} = rpc:call(Node, heart, set_options, [Ls]), + none = rpc:call(Node, heart, get_options, []), + + ok = rpc:call(Node, heart, set_options, [[check_schedulers]]), + {ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []), + ok = rpc:call(Node, heart, set_options, [[]]), + none = rpc:call(Node, heart, get_options, []), + + ok = rpc:call(Node, heart, set_options, [[check_schedulers]]), + {ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []), + {error, {bad_options, Ls}} = rpc:call(Node, heart, set_options, [Ls]), + {ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []), + + receive after 3000 -> ok end, %% wait 3 secs + + ok = rpc:call(Node, heart, set_options, [[]]), + none = rpc:call(Node, heart, get_options, []), + stop_node(Node), + ok. + + dont_drop(suite) -> %%% Removed as it may crash epmd/distribution in colourful %%% ways. While we ARE finding out WHY, it would |