%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2014. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
-module(heart_SUITE).
-include_lib("test_server/include/test_server.hrl").
-export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1,
init_per_group/2,end_per_group/2, start/1, restart/1,
reboot/1,
node_start_immediately_after_crash/1,
node_start_soon_after_crash/1,
set_cmd/1, clear_cmd/1, get_cmd/1,
callback_api/1,
options_api/1,
dont_drop/1, kill_pid/1, heart_no_kill/1]).
-export([init_per_testcase/2, end_per_testcase/2]).
-export([start_heart_stress/1, mangle/1, suicide_by_heart/0, non_suicide_by_heart/0]).
-define(DEFAULT_TIMEOUT_SECS, 120).
init_per_testcase(_Func, Config) ->
Dog=test_server:timetrap(test_server:seconds(?DEFAULT_TIMEOUT_SECS)),
[{watchdog, Dog}|Config].
end_per_testcase(_Func, Config) ->
Nodes = nodes(),
lists:foreach(fun(X) ->
NNam = list_to_atom(hd(string:tokens(atom_to_list(X),"@"))),
case NNam of
heart_test ->
?t:format(1, "WARNING: Killed ~p~n", [X]),
rpc:cast(X, erlang, halt, []);
_ ->
ok
end
end, Nodes),
Dog=?config(watchdog, Config),
test_server:timetrap_cancel(Dog).
%%-----------------------------------------------------------------
%% Test suite for heart.
%% Should be started in a CC view with:
%% erl -sname master -rsh ctrsh
%%-----------------------------------------------------------------
suite() -> [{ct_hooks,[ts_install_cth]}].
all() -> [
start, restart, reboot,
node_start_immediately_after_crash,
node_start_soon_after_crash,
set_cmd, clear_cmd, get_cmd,
callback_api,
options_api,
kill_pid,
heart_no_kill
].
groups() ->
[].
init_per_group(_GroupName, Config) ->
Config.
end_per_group(_GroupName, Config) ->
Config.
init_per_suite(Config) when is_list(Config) ->
case os:type() of
{win32, windows} ->
{skipped, "No use to run on Windows 95/98"};
_ ->
ignore_cores:init(Config)
end.
end_per_suite(Config) when is_list(Config) ->
ignore_cores:fini(Config).
start_check(Type, Name) ->
start_check(Type, Name, []).
start_check(Type, Name, Envs) ->
Args = case ?t:os_type() of
{win32,_} ->
"+t50000 -heart " ++ env_encode([{"HEART_COMMAND", no_reboot}|Envs]);
_ ->
"+t50000 -heart " ++ env_encode(Envs)
end,
{ok, Node} = case Type of
loose ->
loose_node:start(Name, Args, ?DEFAULT_TIMEOUT_SECS);
_ ->
?t:start_node(Name, Type, [{args, Args}])
end,
erlang:monitor_node(Node, true),
case rpc:call(Node, erlang, whereis, [heart]) of
Pid when is_pid(Pid) ->
ok;
_ ->
test_server:fail(heart_not_started)
end,
{ok, Node}.
start(doc) -> [];
start(suite) -> {req, [{time, 10}]};
start(Config) when is_list(Config) ->
{ok, Node} = start_check(slave, heart_test),
rpc:call(Node, init, reboot, []),
receive
{nodedown, Node} -> ok
after 2000 -> test_server:fail(node_not_closed)
end,
test_server:sleep(5000),
case net_adm:ping(Node) of
pang ->
ok;
_ ->
test_server:fail(node_rebooted)
end,
test_server:stop_node(Node).
%% Also test fixed bug in R1B (it was not possible to
%% do init:stop/0 on a restarted system before)
%% Slave executes erlang:halt() on master nodedown.
%% Therefore the slave process has to be killed
%% before restart.
%% restart
%% Purpose:
%% Check that a node is up and running after a init:restart/0
restart(doc) -> [];
restart(suite) ->
case ?t:os_type() of
{Fam, _} when Fam == unix; Fam == win32 ->
{req, [{time,10}]};
_ ->
{skip, "Only run on unix and win32"}
end;
restart(Config) when is_list(Config) ->
{ok, Node} = start_check(loose, heart_test),
rpc:call(Node, init, restart, []),
receive
{nodedown, Node} ->
ok
after 2000 ->
test_server:fail(node_not_closed)
end,
test_server:sleep(5000),
node_check_up_down(Node, 2000),
loose_node:stop(Node).
%% reboot
%% Purpose:
%% Check that a node is up and running after a init:reboot/0
reboot(doc) -> [];
reboot(suite) -> {req, [{time, 10}]};
reboot(Config) when is_list(Config) ->
{ok, Node} = start_check(slave, heart_test),
ok = rpc:call(Node, heart, set_cmd,
[atom_to_list(lib:progname()) ++
" -noshell -heart " ++ name(Node) ++ "&"]),
rpc:call(Node, init, reboot, []),
receive
{nodedown, Node} ->
ok
after 2000 ->
test_server:fail(node_not_closed)
end,
test_server:sleep(5000),
node_check_up_down(Node, 2000),
ok.
%% node_start_immediately_after_crash
%% Purpose:
%% Check that a node is up and running after a crash.
%% This test exhausts the atom table on the remote node.
%% ERL_CRASH_DUMP_SECONDS=0 will force beam not to dump an erl_crash.dump.
%% May currently dump core in beam debug build due to lock-order violation
%% This should be removed when a non-lockad information retriever is implemented
%% for crash dumps
node_start_immediately_after_crash(suite) -> {req, [{time, 10}]};
node_start_immediately_after_crash(Config) when is_list(Config) ->
Config2 = ignore_cores:setup(?MODULE, node_start_immediately_after_crash, Config, true),
try
node_start_immediately_after_crash_test(Config2)
after
ignore_cores:restore(Config2)
end.
node_start_immediately_after_crash_test(Config) when is_list(Config) ->
{ok, Node} = start_check(loose, heart_test_imm, [{"ERL_CRASH_DUMP_SECONDS", "0"}]),
ok = rpc:call(Node, heart, set_cmd,
[atom_to_list(lib:progname()) ++
" -noshell -heart " ++ name(Node) ++ "&"]),
Mod = exhaust_atoms,
Code = generate(Mod, [], [
"do() -> "
" Set = lists:seq($a,$z), "
" [ list_to_atom([A,B,C,D,E]) || "
" A <- Set, B <- Set, C <- Set, E <- Set, D <- Set ]."
]),
%% crash it with atom exhaustion
rpc:call(Node, erlang, load_module, [Mod, Code]),
rpc:cast(Node, Mod, do, []),
T0 = now(),
receive {nodedown, Node} ->
test_server:format("Took ~.2f s. for node to go down~n", [timer:now_diff(now(), T0)/1000000]),
ok
%% timeout is very liberal here. nodedown is received in about 1 s. on linux (palantir)
%% and in about 10 s. on solaris (carcharoth)
after (15000*test_server:timetrap_scale_factor()) -> test_server:fail(node_not_closed)
end,
test_server:sleep(3000),
node_check_up_down(Node, 2000),
loose_node:stop(Node).
%% node_start_soon_after_crash
%% Purpose:
%% Check that a node is up and running after a crash.
%% This test exhausts the atom table on the remote node.
%% ERL_CRASH_DUMP_SECONDS=10 will force beam
%% to only dump an erl_crash.dump for 10 seconds.
%% May currently dump core in beam debug build due to lock-order violation
%% This should be removed when a non-lockad information retriever is implemented
%% for crash dumps
node_start_soon_after_crash(suite) -> {req, [{time, 10}]};
node_start_soon_after_crash(Config) when is_list(Config) ->
Config2 = ignore_cores:setup(?MODULE, node_start_soon_after_crash, Config, true),
try
node_start_soon_after_crash_test(Config2)
after
ignore_cores:restore(Config2)
end.
node_start_soon_after_crash_test(Config) when is_list(Config) ->
{ok, Node} = start_check(loose, heart_test_soon, [{"ERL_CRASH_DUMP_SECONDS", "10"}]),
ok = rpc:call(Node, heart, set_cmd,
[atom_to_list(lib:progname()) ++
" -noshell -heart " ++ name(Node) ++ "&"]),
Mod = exhaust_atoms,
Code = generate(Mod, [], [
"do() -> "
" Set = lists:seq($a,$z), "
" [ list_to_atom([A,B,C,D,E]) || "
" A <- Set, B <- Set, C <- Set, E <- Set, D <- Set ]."
]),
%% crash it with atom exhaustion
rpc:call(Node, erlang, load_module, [Mod, Code]),
rpc:cast(Node, Mod, do, []),
receive {nodedown, Node} -> ok
after (15000*test_server:timetrap_scale_factor()) -> test_server:fail(node_not_closed)
end,
test_server:sleep(20000),
node_check_up_down(Node, 15000),
loose_node:stop(Node).
node_check_up_down(Node, Tmo) ->
case net_adm:ping(Node) of
pong ->
erlang:monitor_node(Node, true),
rpc:call(Node, init, reboot, []),
receive
{nodedown, Node} -> ok
after Tmo ->
test_server:fail(node_not_closed2)
end;
_ ->
test_server:fail(node_not_rebooted)
end.
%% Only tests bad command, correct behaviour is tested in reboot/1.
set_cmd(suite) -> [];
set_cmd(Config) when is_list(Config) ->
{ok, Node} = start_check(slave, heart_test),
Cmd = wrong_atom,
{error, {bad_cmd, Cmd}} = rpc:call(Node, heart, set_cmd, [Cmd]),
Cmd1 = lists:duplicate(2047, $a),
{error, {bad_cmd, Cmd1}} = rpc:call(Node, heart, set_cmd, [Cmd1]),
Cmd2 = lists:duplicate(28, $a),
ok = rpc:call(Node, heart, set_cmd, [Cmd2]),
Cmd3 = lists:duplicate(2000, $a),
ok = rpc:call(Node, heart, set_cmd, [Cmd3]),
stop_node(Node),
ok.
clear_cmd(suite) -> {req,[{time,15}]};
clear_cmd(Config) when is_list(Config) ->
{ok, Node} = start_check(slave, heart_test),
ok = rpc:call(Node, heart, set_cmd,
[atom_to_list(lib:progname()) ++
" -noshell -heart " ++ name(Node) ++ "&"]),
rpc:call(Node, init, reboot, []),
receive
{nodedown, Node} ->
ok
after 2000 ->
test_server:fail(node_not_closed)
end,
test_server:sleep(5000),
case net_adm:ping(Node) of
pong ->
erlang:monitor_node(Node, true);
_ ->
test_server:fail(node_not_rebooted)
end,
ok = rpc:call(Node, heart, set_cmd,
["erl -noshell -heart " ++ name(Node) ++ "&"]),
ok = rpc:call(Node, heart, clear_cmd, []),
rpc:call(Node, init, reboot, []),
receive
{nodedown, Node} ->
ok
after 2000 ->
test_server:fail(node_not_closed)
end,
test_server:sleep(5000),
case net_adm:ping(Node) of
pang ->
ok;
_ ->
test_server:fail(node_rebooted)
end,
ok.
get_cmd(suite) -> [];
get_cmd(Config) when is_list(Config) ->
{ok, Node} = start_check(slave, heart_test),
Cmd = "test",
ok = rpc:call(Node, heart, set_cmd, [Cmd]),
{ok, Cmd} = rpc:call(Node, heart, get_cmd, []),
stop_node(Node),
ok.
callback_api(Config) when is_list(Config) ->
{ok, Node} = start_check(slave, heart_test),
none = rpc:call(Node, heart, get_callback, []),
M0 = self(),
F0 = ok,
{error, {bad_callback, {M0,F0}}} = rpc:call(Node, heart, set_callback, [M0,F0]),
none = rpc:call(Node, heart, get_callback, []),
M1 = lists:duplicate(28, $a),
F1 = lists:duplicate(28, $b),
{error, {bad_callback, {M1,F1}}} = rpc:call(Node, heart, set_callback, [M1,F1]),
none = rpc:call(Node, heart, get_callback, []),
M2 = heart_check_module,
F2 = cb_ok,
F3 = cb_error,
Code0 = generate(M2, [], [
atom_to_list(F2) ++ "() -> ok.",
atom_to_list(F3) ++ "() -> exit(\"callback_error (as intended)\")."
]),
{module, M2} = rpc:call(Node, erlang, load_module, [M2, Code0]),
ok = rpc:call(Node, M2, F2, []),
ok = rpc:call(Node, heart, set_callback, [M2,F2]),
{ok, {M2,F2}} = rpc:call(Node, heart, get_callback, []),
ok = rpc:call(Node, heart, clear_callback, []),
none = rpc:call(Node, heart, get_callback, []),
ok = rpc:call(Node, heart, set_callback, [M2,F2]),
{ok, {M2,F2}} = rpc:call(Node, heart, get_callback, []),
ok = rpc:call(Node, heart, set_callback, [M2,F3]),
receive {nodedown, Node} -> ok
after 5000 -> test_server:fail(node_not_killed)
end,
stop_node(Node),
ok.
options_api(Config) when is_list(Config) ->
{ok, Node} = start_check(slave, heart_test),
none = rpc:call(Node, heart, get_options, []),
M0 = self(),
F0 = ok,
{error, {bad_options, {M0,F0}}} = rpc:call(Node, heart, set_options, [{M0,F0}]),
none = rpc:call(Node, heart, get_options, []),
Ls = lists:duplicate(28, $b),
{error, {bad_options, Ls}} = rpc:call(Node, heart, set_options, [Ls]),
none = rpc:call(Node, heart, get_options, []),
ok = rpc:call(Node, heart, set_options, [[check_schedulers]]),
{ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []),
ok = rpc:call(Node, heart, set_options, [[]]),
none = rpc:call(Node, heart, get_options, []),
ok = rpc:call(Node, heart, set_options, [[check_schedulers]]),
{ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []),
{error, {bad_options, Ls}} = rpc:call(Node, heart, set_options, [Ls]),
{ok, [check_schedulers]} = rpc:call(Node, heart, get_options, []),
receive after 3000 -> ok end, %% wait 3 secs
ok = rpc:call(Node, heart, set_options, [[]]),
none = rpc:call(Node, heart, get_options, []),
stop_node(Node),
ok.
dont_drop(suite) ->
%%% Removed as it may crash epmd/distribution in colourful
%%% ways. While we ARE finding out WHY, it would
%%% be nice for others to be able to run the kernel test suite
%%% without "exploding machines", so thats why I removed it for now.
[];
dont_drop(doc) ->
["Tests that the heart command does not get dropped when ",
"set just before halt on very high I/O load."];
dont_drop(Config) when is_list(Config) ->
%%% Have to do it some times to make it happen...
[ok,ok,ok,ok,ok,ok,ok,ok,ok,ok] = do_dont_drop(Config,10),
ok.
do_dont_drop(_,0) -> [];
do_dont_drop(Config,N) ->
%% Name of first slave node
NN1 = atom_to_list(?MODULE) ++ "slave_1",
%% Name of node started by heart on failure
NN2 = atom_to_list(?MODULE) ++ "slave_2",
%% Name of node started by heart on success
NN3 = atom_to_list(?MODULE) ++ "slave_3",
Host = hd(tl(string:tokens(atom_to_list(node()),"@"))),
%% The initial heart command
FirstCmd = erl() ++ name(NN2 ++ "@" ++ Host),
%% Separated the parameters to start_node_run for clarity...
Name = list_to_atom(NN1),
Env = [{"HEART_COMMAND", FirstCmd}],
Func = "start_heart_stress",
Arg = NN3 ++ "@" ++ Host ++ " " ++
filename:join(?config(data_dir, Config), "simple_echo"),
start_node_run(Name,Env,Func,Arg),
case wait_for_any_of(list_to_atom(NN2 ++ "@" ++ Host),
list_to_atom(NN3 ++ "@" ++ Host)) of
2 ->
[ok | do_dont_drop(Config,N-1)];
_ ->
false
end.
wait_for_any_of(N1,N2) ->
wait_for_any_of(N1,N2,45).
wait_for_any_of(_N1,_N2,0) ->
false;
wait_for_any_of(N1,N2,Times) ->
receive after 1000 -> ok end,
case net_adm:ping(N1) of
pang ->
case net_adm:ping(N2) of
pang ->
wait_for_any_of(N1,N2,Times - 1);
pong ->
rpc:call(N2,init,stop,[]),
2
end;
pong ->
rpc:call(N1,init,stop,[]),
1
end.
kill_pid(suite) ->
[];
kill_pid(doc) ->
["Tests that heart kills the old erlang node before executing ",
"heart command."];
kill_pid(Config) when is_list(Config) ->
ok = do_kill_pid(Config).
do_kill_pid(_Config) ->
Name = heart_test,
Env = [{"HEART_COMMAND", "nickeNyfikenFarEttJobb"}],
{ok,Node} = start_node_run(Name,Env,suicide_by_heart,[]),
ok = wait_for_node(Node,15),
erlang:monitor_node(Node, true),
receive {nodedown,Node} -> ok
after 30000 ->
false
end.
heart_no_kill(suite) ->
[];
heart_no_kill(doc) ->
["Tests that heart doesn't kill the old erlang node when ",
"HEART_NO_KILL is set."];
heart_no_kill(Config) when is_list(Config) ->
ok = do_no_kill(Config).
do_no_kill(_Config) ->
Name = heart_test,
{ok,Node} = start_node_run(Name,[],non_suicide_by_heart,[]),
io:format("Node is ~p~n", [Node]),
ok = wait_for_node(Node,15),
io:format("wait_for_node is ~p~n", [ok]),
erlang:monitor_node(Node, true),
receive {nodedown,Node} -> false
after 30000 ->
io:format("Node didn't die..\n"),
rpc:call(Node,init,stop,[]),
io:format("done init:stop..\n"),
ok
end.
wait_for_node(_,0) ->
false;
wait_for_node(Node,N) ->
receive after 1000 -> ok end,
case net_adm:ping(Node) of
pong -> ok;
pang -> wait_for_node(Node,N-1)
end.
erl() ->
case os:type() of
{win32,_} -> "werl ";
_ -> "erl "
end.
name(Node) when is_list(Node) -> name(Node,[]);
name(Node) when is_atom(Node) -> name(atom_to_list(Node),[]).
name([$@|Node], Name) ->
case lists:member($., Node) of
true ->
"-name " ++ lists:reverse(Name);
_ ->
"-sname " ++ lists:reverse(Name)
end;
name([H|T], Name) ->
name(T, [H|Name]).
enc(A) when is_atom(A) -> atom_to_list(A);
enc(A) when is_binary(A) -> binary_to_list(A);
enc(A) when is_list(A) -> A.
env_encode([]) -> [];
env_encode([{X,Y}|T]) ->
"-env " ++ enc(X) ++ " \"" ++ enc(Y) ++ "\" " ++ env_encode(T).
%%%
%%% Starts a node and runs a function in this
%%% module.
%%% Name is the node name as either atom or string,
%%% Env is a list of Tuples containing name-value pairs.
%%% Function is the function to run in this module
%%% Argument is the argument(s) to send through erl -s
%%%
start_node_run(Name, Env, Function, Argument) ->
PA = filename:dirname(code:which(?MODULE)),
Params = "-heart " ++ env_encode(Env) ++ " -pa " ++ PA ++
" -s " ++
enc(?MODULE) ++ " " ++ enc(Function) ++ " " ++
enc(Argument),
start_node(Name, Params).
start_node(Name, Param) ->
test_server:start_node(Name, slave, [{args, Param}]).
stop_node(Node) ->
test_server:stop_node(Node).
%%% This code is run in a slave node to ensure that
%%% A heart command really gets set syncronously
%%% and cannot get "dropped".
send_to(_,_,0) ->
ok;
send_to(Port,D,N) ->
Port ! {self(),{command,D}},
send_to(Port,D,N-1).
receive_from(_,_,0) ->
ok;
receive_from(Port,D,N) ->
receive
{Port, {data,{eol,_Data}}} ->
receive_from(Port,D,N-1);
X ->
io:format("Got garbage ~p~n",[X])
end.
mangle(PP) when is_list(PP) ->
Port = open_port({spawn,PP},[{line,100}]),
mangle(Port);
mangle(Port) ->
send_to(Port, "ABCDEFGHIJ" ++ io_lib:nl(),1),
receive_from(Port,"ABCDEFGHIJ",1),
mangle(Port).
explode(0,_) ->
ok;
explode(N,PP) ->
spawn(?MODULE,mangle,[PP]),
explode(N-1,PP).
start_heart_stress([NewName,PortProgram]) ->
explode(10,atom_to_list(PortProgram)),
NewCmd = erl() ++ name(NewName),
%%io:format("~p~n",[NewCmd]),
receive
after 10000 ->
heart:set_cmd(NewCmd),
halt()
end.
suicide_by_heart() ->
%%io:format("Suicide starting...~n"),
open_port({spawn,"heart -ht 11 -pid "++os:getpid()},[{packet,2}]),
receive X -> X end,
%% Just hang and wait for heart to timeout
receive
{makaronipudding} ->
sallad
end.
non_suicide_by_heart() ->
P = open_port({spawn,"heart -ht 11 -pid "++os:getpid()},
[exit_status, {env, [{"HEART_NO_KILL", "TRUE"}]},
{packet,2}]),
receive X -> X end,
%% Just hang and wait for heart to timeout
receive
{P,{exit_status,_}} ->
ok
after
20000 ->
exit(timeout)
end.
%% generate a module from binary
generate(Module, Attributes, FunStrings) ->
FunForms = function_forms(FunStrings),
Forms = [
{attribute,a(1),module,Module},
{attribute,a(2),export,[FA || {FA,_} <- FunForms]}
] ++ [{attribute, a(3), A, V}|| {A, V} <- Attributes] ++
[ Function || {_, Function} <- FunForms],
{ok, Module, Bin} = compile:forms(Forms),
Bin.
a(L) ->
erl_anno:new(L).
function_forms([]) -> [];
function_forms([S|Ss]) ->
{ok, Ts,_} = erl_scan:string(S),
{ok, Form} = erl_parse:parse_form(Ts),
Fun = element(3, Form),
Arity = element(4, Form),
[{{Fun,Arity}, Form}|function_forms(Ss)].