aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/kernel/doc/src/heart.xml63
-rw-r--r--lib/kernel/src/heart.erl182
-rw-r--r--lib/kernel/test/heart_SUITE.erl67
3 files changed, 276 insertions, 36 deletions
diff --git a/lib/kernel/doc/src/heart.xml b/lib/kernel/doc/src/heart.xml
index b9fad17ce1..9da4773f2d 100644
--- a/lib/kernel/doc/src/heart.xml
+++ b/lib/kernel/doc/src/heart.xml
@@ -118,6 +118,13 @@
<p>In the following descriptions, all function fails with reason
<c>badarg</c> if <c>heart</c> is not started.</p>
</description>
+
+ <datatypes>
+ <datatype>
+ <name name="heart_option"/>
+ </datatype>
+ </datatypes>
+
<funcs>
<func>
<name name="set_cmd" arity="1"/>
@@ -154,6 +161,62 @@
the empty string will be returned.</p>
</desc>
</func>
+
+ <func>
+ <name name="set_callback" arity="2"/>
+ <fsummary>Set a validation callback</fsummary>
+ <desc>
+ <p> This validation callback will be executed before any heartbeat sent
+ to the port program. For the validation to succeed it needs to return
+ with the value <c>ok</c>.
+ </p>
+ <p> An exception within the callback will be treated as a validation failure. </p>
+ <p> The callback will be removed if the system reboots. </p>
+ </desc>
+ </func>
+ <func>
+ <name name="clear_callback" arity="0"/>
+ <fsummary>Clear the validation callback</fsummary>
+ <desc>
+ <p>Removes the validation callback call before heartbeats.</p>
+ </desc>
+ </func>
+ <func>
+ <name name="get_callback" arity="0"/>
+ <fsummary>Get the validation callback</fsummary>
+ <desc>
+ <p>Get the validation callback. If the callback is cleared, <c>none</c> will be returned.</p>
+ </desc>
+ </func>
+
+ <func>
+ <name name="set_options" arity="1"/>
+ <fsummary>Set a list of options</fsummary>
+ <desc>
+ <p> Valid options <c>set_options</c> are: </p>
+ <taglist>
+ <tag><c>check_schedulers</c></tag>
+ <item>
+ <p>If enabled, a signal will be sent to each scheduler to check its
+ responsiveness. The system check occurs before any heartbeat sent
+ to the port program. If any scheduler is not responsive enough the
+ heart program will not receive its heartbeat and thus eventually terminate the node.
+ </p>
+ </item>
+ </taglist>
+ <p> Returns with the value <c>ok</c> if the options are valid.</p>
+ </desc>
+ </func>
+ <func>
+ <name name="get_options" arity="0"/>
+ <fsummary>Get the temporary reboot command</fsummary>
+ <desc>
+ <p>Returns <c>{ok, Options}</c> where <c>Options</c> is a list of current options enabled for heart.
+ If the callback is cleared, <c>none</c> will be returned.</p>
+ </desc>
+ </func>
+
+
</funcs>
</erlref>
diff --git a/lib/kernel/src/heart.erl b/lib/kernel/src/heart.erl
index 137fad706f..eea78aabdf 100644
--- a/lib/kernel/src/heart.erl
+++ b/lib/kernel/src/heart.erl
@@ -34,7 +34,11 @@
%%%
%%% It recognizes the flag '-heart'
%%%--------------------------------------------------------------------
--export([start/0, init/2, set_cmd/1, clear_cmd/0, get_cmd/0, cycle/0]).
+-export([start/0, init/2,
+ set_cmd/1, clear_cmd/0, get_cmd/0,
+ set_callback/2, clear_callback/0, get_callback/0,
+ set_options/1, get_options/0,
+ cycle/0]).
-define(START_ACK, 1).
-define(HEART_BEAT, 2).
@@ -49,6 +53,16 @@
-define(CYCLE_TIMEOUT, 10000).
-define(HEART_PORT_NAME, heart_port).
+%% valid heart options
+-define(SCHEDULER_CHECK_OPT, check_schedulers).
+
+-type heart_option() :: ?SCHEDULER_CHECK_OPT.
+
+-record(state,{port :: port(),
+ cmd :: [] | binary(),
+ options :: [heart_option()],
+ callback :: 'undefined' | {atom(), atom()}}).
+
%%---------------------------------------------------------------------
-spec start() -> 'ignore' | {'error', term()} | {'ok', pid()}.
@@ -81,11 +95,11 @@ wait_for_init_ack(From) ->
init(Starter, Parent) ->
process_flag(trap_exit, true),
process_flag(priority, max),
- register(heart, self()),
+ register(?MODULE, self()),
case catch start_portprogram() of
{ok, Port} ->
Starter ! {ok, self()},
- loop(Parent, Port, "");
+ loop(Parent, #state{port=Port, cmd=[], options=[]});
no_heart ->
Starter ! {no_heart, self()};
error ->
@@ -96,33 +110,68 @@ init(Starter, Parent) ->
Cmd :: string().
set_cmd(Cmd) ->
- heart ! {self(), set_cmd, Cmd},
+ ?MODULE ! {self(), set_cmd, Cmd},
wait().
-spec get_cmd() -> {ok, Cmd} when
Cmd :: string().
get_cmd() ->
- heart ! {self(), get_cmd},
+ ?MODULE ! {self(), get_cmd},
wait().
-spec clear_cmd() -> ok.
clear_cmd() ->
- heart ! {self(), clear_cmd},
+ ?MODULE ! {self(), clear_cmd},
+ wait().
+
+-spec set_callback(Module,Function) -> 'ok' | {'error', {'bad_callback', {Module, Function}}} when
+ Module :: atom(),
+ Function :: atom().
+
+set_callback(Module, Function) ->
+ ?MODULE ! {self(), set_callback, {Module,Function}},
+ wait().
+
+-spec get_callback() -> {'ok', {Module, Function}} | 'none' when
+ Module :: atom(),
+ Function :: atom().
+
+get_callback() ->
+ ?MODULE ! {self(), get_callback},
+ wait().
+
+-spec clear_callback() -> ok.
+
+clear_callback() ->
+ ?MODULE ! {self(), clear_callback},
+ wait().
+
+-spec set_options(Options) -> 'ok' | {'error', {'bad_options', Options}} when
+ Options :: [heart_option()].
+
+set_options(Options) ->
+ ?MODULE ! {self(), set_options, Options},
wait().
+-spec get_options() -> {'ok', Options} | 'none' when
+ Options :: [atom()].
+
+get_options() ->
+ ?MODULE ! {self(), get_options},
+ wait().
%%% Should be used solely by the release handler!!!!!!!
-spec cycle() -> 'ok' | {'error', term()}.
cycle() ->
- heart ! {self(), cycle},
+ ?MODULE ! {self(), cycle},
wait().
wait() ->
receive
- {heart, Res} ->
+ {?MODULE, Res} ->
Res
end.
@@ -182,8 +231,8 @@ wait_ack(Port) ->
{error, Reason}
end.
-loop(Parent, Port, Cmd) ->
- _ = send_heart_beat(Port),
+loop(Parent, #state{port=Port}=S) ->
+ _ = send_heart_beat(S),
receive
{From, set_cmd, NewCmd0} ->
Enc = file:native_name_encoding(),
@@ -191,37 +240,72 @@ loop(Parent, Port, Cmd) ->
NewCmd when is_binary(NewCmd), byte_size(NewCmd) < 2047 ->
_ = send_heart_cmd(Port, NewCmd),
_ = wait_ack(Port),
- From ! {heart, ok},
- loop(Parent, Port, NewCmd);
+ From ! {?MODULE, ok},
+ loop(Parent, S#state{cmd=NewCmd});
_ ->
- From ! {heart, {error, {bad_cmd, NewCmd0}}},
- loop(Parent, Port, Cmd)
+ From ! {?MODULE, {error, {bad_cmd, NewCmd0}}},
+ loop(Parent, S)
end;
{From, clear_cmd} ->
- From ! {heart, ok},
- _ = send_heart_cmd(Port, ""),
+ From ! {?MODULE, ok},
+ _ = send_heart_cmd(Port, []),
_ = wait_ack(Port),
- loop(Parent, Port, "");
+ loop(Parent, S#state{cmd = []});
{From, get_cmd} ->
- From ! {heart, get_heart_cmd(Port)},
- loop(Parent, Port, Cmd);
+ From ! {?MODULE, get_heart_cmd(Port)},
+ loop(Parent, S);
+ {From, set_callback, Callback} ->
+ case Callback of
+ {M,F} when is_atom(M), is_atom(F) ->
+ From ! {?MODULE, ok},
+ loop(Parent, S#state{callback=Callback});
+ _ ->
+ From ! {?MODULE, {error, {bad_callback, Callback}}},
+ loop(Parent, S)
+ end;
+ {From, get_callback} ->
+ Res = case S#state.callback of
+ undefined -> none;
+ Cb -> {ok, Cb}
+ end,
+ From ! {?MODULE, Res},
+ loop(Parent, S);
+ {From, clear_callback} ->
+ From ! {?MODULE, ok},
+ loop(Parent, S#state{callback=undefined});
+ {From, set_options, Options} ->
+ case validate_options(Options) of
+ Validated when is_list(Validated) ->
+ From ! {?MODULE, ok},
+ loop(Parent, S#state{options=Validated});
+ _ ->
+ From ! {?MODULE, {error, {bad_options, Options}}},
+ loop(Parent, S)
+ end;
+ {From, get_options} ->
+ Res = case S#state.options of
+ [] -> none;
+ Cb -> {ok, Cb}
+ end,
+ From ! {?MODULE, Res},
+ loop(Parent, S);
{From, cycle} ->
%% Calls back to loop
- do_cycle_port_program(From, Parent, Port, Cmd);
+ do_cycle_port_program(From, Parent, S);
{'EXIT', Parent, shutdown} ->
no_reboot_shutdown(Port);
{'EXIT', Parent, Reason} ->
exit(Port, Reason),
exit(Reason);
{'EXIT', Port, badsig} -> % we can ignore badsig-messages!
- loop(Parent, Port, Cmd);
+ loop(Parent, S);
{'EXIT', Port, _Reason} ->
- exit({port_terminated, {heart, loop, [Parent, Port, Cmd]}});
+ exit({port_terminated, {?MODULE, loop, [Parent, S]}});
_ ->
- loop(Parent, Port, Cmd)
+ loop(Parent, S)
after
?TIMEOUT ->
- loop(Parent, Port, Cmd)
+ loop(Parent, S)
end.
-spec no_reboot_shutdown(port()) -> no_return().
@@ -233,36 +317,44 @@ no_reboot_shutdown(Port) ->
exit(normal)
end.
-do_cycle_port_program(Caller, Parent, Port, Cmd) ->
+validate_options(Opts) -> validate_options(Opts,[]).
+validate_options([],Res) -> Res;
+validate_options([?SCHEDULER_CHECK_OPT=Opt|Opts],Res) -> validate_options(Opts,[Opt|Res]);
+validate_options(_,_) -> error.
+
+do_cycle_port_program(Caller, Parent, #state{port=Port} = S) ->
unregister(?HEART_PORT_NAME),
case catch start_portprogram() of
{ok, NewPort} ->
_ = send_shutdown(Port),
receive
{'EXIT', Port, _Reason} ->
- _ = send_heart_cmd(NewPort, Cmd),
- Caller ! {heart, ok},
- loop(Parent, NewPort, Cmd)
+ _ = send_heart_cmd(NewPort, S#state.cmd),
+ Caller ! {?MODULE, ok},
+ loop(Parent, S#state{port=NewPort})
after
?CYCLE_TIMEOUT ->
%% Huh! Two heart port programs running...
%% well, the old one has to be sick not to respond
%% so we'll settle for the new one...
- _ = send_heart_cmd(NewPort, Cmd),
- Caller ! {heart, {error, stop_error}},
- loop(Parent, NewPort, Cmd)
+ _ = send_heart_cmd(NewPort, S#state.cmd),
+ Caller ! {?MODULE, {error, stop_error}},
+ loop(Parent, S#state{port=NewPort})
end;
no_heart ->
- Caller ! {heart, {error, no_heart}},
- loop(Parent, Port, Cmd);
+ Caller ! {?MODULE, {error, no_heart}},
+ loop(Parent, S);
error ->
- Caller ! {heart, {error, start_error}},
- loop(Parent, Port, Cmd)
+ Caller ! {?MODULE, {error, start_error}},
+ loop(Parent, S)
end.
%% "Beates" the heart once.
-send_heart_beat(Port) -> Port ! {self(), {command, [?HEART_BEAT]}}.
+send_heart_beat(#state{port=Port, callback=Cb, options=Opts}) ->
+ ok = check_system(Opts),
+ ok = check_callback(Cb),
+ Port ! {self(), {command, [?HEART_BEAT]}}.
%% Set a new HEART_COMMAND.
-dialyzer({no_improper_lists, send_heart_cmd/2}).
@@ -278,6 +370,24 @@ get_heart_cmd(Port) ->
{ok, Cmd}
end.
+check_system([]) -> ok;
+check_system([?SCHEDULER_CHECK_OPT|Opts]) ->
+ ok = erts_internal:system_check(schedulers),
+ check_system(Opts).
+
+%% validate system by performing a check before the heartbeat
+%% return 'ok' if everything is alright.
+%% Terminate if with reason if something is a miss.
+%% It is fine to timeout in the callback, in fact that is the intention
+%% if something goes wront -> no heartbeat.
+
+check_callback(Callback) ->
+ case Callback of
+ undefined -> ok;
+ {M,F} ->
+ erlang:apply(M,F,[])
+ end.
+
%% Sends shutdown command to the port.
send_shutdown(Port) -> Port ! {self(), {command, [?SHUT_DOWN]}}.
diff --git a/lib/kernel/test/heart_SUITE.erl b/lib/kernel/test/heart_SUITE.erl
index 4447475833..eb6cb06622 100644
--- a/lib/kernel/test/heart_SUITE.erl
+++ b/lib/kernel/test/heart_SUITE.erl
@@ -27,6 +27,8 @@
node_start_immediately_after_crash/1,
node_start_soon_after_crash/1,
set_cmd/1, clear_cmd/1, get_cmd/1,
+ callback_api/1,
+ options_api/1,
dont_drop/1, kill_pid/1]).
-export([init_per_testcase/2, end_per_testcase/2]).
@@ -66,6 +68,8 @@ all() -> [
node_start_immediately_after_crash,
node_start_soon_after_crash,
set_cmd, clear_cmd, get_cmd,
+ callback_api,
+ options_api,
kill_pid
].
@@ -358,6 +362,69 @@ get_cmd(Config) when is_list(Config) ->
stop_node(Node),
ok.
+callback_api(Config) when is_list(Config) ->
+ {ok, Node} = start_check(slave, heart_test),
+ none = rpc:call(Node, heart, get_callback, []),
+ M0 = self(),
+ F0 = ok,
+ {error, {bad_callback, {M0,F0}}} = rpc:call(Node, heart, set_callback, [M0,F0]),
+ none = rpc:call(Node, heart, get_callback, []),
+ M1 = lists:duplicate(28, $a),
+ F1 = lists:duplicate(28, $b),
+ {error, {bad_callback, {M1,F1}}} = rpc:call(Node, heart, set_callback, [M1,F1]),
+ none = rpc:call(Node, heart, get_callback, []),
+
+ M2 = heart_check_module,
+ F2 = cb_ok,
+ F3 = cb_error,
+ Code0 = generate(M2, [], [
+ atom_to_list(F2) ++ "() -> ok.",
+ atom_to_list(F3) ++ "() -> exit(\"callback_error (as intended)\")."
+ ]),
+ {module, M2} = rpc:call(Node, erlang, load_module, [M2, Code0]),
+ ok = rpc:call(Node, M2, F2, []),
+ ok = rpc:call(Node, heart, set_callback, [M2,F2]),
+ {ok, {M2,F2}} = rpc:call(Node, heart, get_callback, []),
+ ok = rpc:call(Node, heart, clear_callback, []),
+ none = rpc:call(Node, heart, get_callback, []),
+ ok = rpc:call(Node, heart, set_callback, [M2,F2]),
+ {ok, {M2,F2}} = rpc:call(Node, heart, get_callback, []),
+ ok = rpc:call(Node, heart, set_callback, [M2,F3]),
+ receive {nodedown, Node} -> ok
+ after 5000 -> test_server:fail(node_not_killed)
+ end,
+ stop_node(Node),
+ ok.
+
+options_api(Config) when is_list(Config) ->
+ {ok, Node} = start_check(slave, heart_test),
+ none = rpc:call(Node, heart, get_options, []),
+ M0 = self(),
+ F0 = ok,
+ {error, {bad_options, {M0,F0}}} = rpc:call(Node, heart, set_options, [{M0,F0}]),
+ none = rpc:call(Node, heart, get_options, []),
+ Ls = lists:duplicate(28, $b),
+ {error, {bad_options, Ls}} = rpc:call(Node, heart, set_options, [Ls]),
+ none = rpc:call(Node, heart, get_options, []),
+
+ ok = rpc:call(Node, heart, set_options, [[check_schedulers]]),
+ {ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []),
+ ok = rpc:call(Node, heart, set_options, [[]]),
+ none = rpc:call(Node, heart, get_options, []),
+
+ ok = rpc:call(Node, heart, set_options, [[check_schedulers]]),
+ {ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []),
+ {error, {bad_options, Ls}} = rpc:call(Node, heart, set_options, [Ls]),
+ {ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []),
+
+ receive after 3000 -> ok end, %% wait 3 secs
+
+ ok = rpc:call(Node, heart, set_options, [[]]),
+ none = rpc:call(Node, heart, get_options, []),
+ stop_node(Node),
+ ok.
+
+
dont_drop(suite) ->
%%% Removed as it may crash epmd/distribution in colourful
%%% ways. While we ARE finding out WHY, it would