From d9f6409a994ae74c8b46fc1425b07134de9d0546 Mon Sep 17 00:00:00 2001 From: Micael Karlberg Date: Fri, 7 Jun 2019 14:29:47 +0200 Subject: [snmp|test] Use system monitor to discover slow VM issues We have experienced intermittent failures of random test cases on slow VMs. In order to get more info about the system, we use system monitor to collect info. We have one global monitor system, and one local monitor on each node. Each local monitor report to the global, which in turn prints a message. --- lib/snmp/test/modules.mk | 2 + lib/snmp/test/snmp_agent_test.erl | 6 +- lib/snmp/test/snmp_agent_test_lib.erl | 7 +- lib/snmp/test/snmp_test_global_sys_monitor.erl | 159 +++++++++++++++++++++++++ lib/snmp/test/snmp_test_sys_monitor.erl | 86 +++++++++++++ 5 files changed, 257 insertions(+), 3 deletions(-) create mode 100644 lib/snmp/test/snmp_test_global_sys_monitor.erl create mode 100644 lib/snmp/test/snmp_test_sys_monitor.erl diff --git a/lib/snmp/test/modules.mk b/lib/snmp/test/modules.mk index 8b6547f9a9..ec3870dbd8 100644 --- a/lib/snmp/test/modules.mk +++ b/lib/snmp/test/modules.mk @@ -42,6 +42,8 @@ SUITE_MODULES = \ snmp_manager_test TEST_UTIL_MODULES = \ + snmp_test_global_sys_monitor \ + snmp_test_sys_monitor \ snmp_test_lib \ snmp_test_manager \ snmp_test_mgr \ diff --git a/lib/snmp/test/snmp_agent_test.erl b/lib/snmp/test/snmp_agent_test.erl index 860ca17cdb..4a797efbe5 100644 --- a/lib/snmp/test/snmp_agent_test.erl +++ b/lib/snmp/test/snmp_agent_test.erl @@ -557,6 +557,8 @@ init_per_suite(Config0) when is_list(Config0) -> Config3 = [{mib_dir, MibDir}, {std_mib_dir, StdMibDir} | Config2], + snmp_test_global_sys_monitor:start(), + snmp_test_sys_monitor:start(), % We need one on this node also snmp_test_mgr_counter_server:start(), p("init_per_suite -> end when" @@ -580,6 +582,8 @@ end_per_suite(Config) when is_list(Config) -> p("end_per_suite -> failed stopping counter server" "~n Reason: ~p", [Reason]) end, + snmp_test_sys_monitor:stop(), + snmp_test_global_sys_monitor:stop(), p("end_per_suite -> end when" "~n Nodes: ~p", [erlang:nodes()]), @@ -1654,7 +1658,7 @@ create_local_db_dir(Config) when is_list(Config) -> Name = list_to_atom(atom_to_list(create_local_db_dir) ++"-"++As++"-"++Bs++"-"++Cs), Pa = filename:dirname(code:which(?MODULE)), - {ok,Node} = ?t:start_node(Name, slave, [{args, "-pa "++Pa}]), + {ok,Node} = ?t:start_node(Name, slave, [{args, "-pa " ++ Pa}]), %% first start with a nonexisting DbDir Fun1 = fun() -> diff --git a/lib/snmp/test/snmp_agent_test_lib.erl b/lib/snmp/test/snmp_agent_test_lib.erl index 1128fc8a8c..ddc901216e 100644 --- a/lib/snmp/test/snmp_agent_test_lib.erl +++ b/lib/snmp/test/snmp_agent_test_lib.erl @@ -1453,12 +1453,15 @@ start_node(Name) -> "" end, %% Do not use start_link!!! (the proc that calls this one is tmp) - ?DBG("start_node -> Args: ~p~n",[Args]), - A = Args ++ " -pa " ++ Pa, + ?DBG("start_node -> Args: ~p~n", [Args]), + A = Args ++ " -pa " ++ Pa ++ + " -s " ++ atom_to_list(snmp_test_sys_monitor) ++ " start" ++ + " -s global sync", case (catch ?START_NODE(Name, A)) of {ok, Node} -> %% Tell the test_server to not clean up things it never started. ?DBG("start_node -> Node: ~p",[Node]), + global:sync(), {ok, Node}; Else -> ?ERR("start_node -> failed with(other): Else: ~p",[Else]), diff --git a/lib/snmp/test/snmp_test_global_sys_monitor.erl b/lib/snmp/test/snmp_test_global_sys_monitor.erl new file mode 100644 index 0000000000..04dbc72133 --- /dev/null +++ b/lib/snmp/test/snmp_test_global_sys_monitor.erl @@ -0,0 +1,159 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2019-2019. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% + +-module(snmp_test_global_sys_monitor). + +-export([start/0, stop/0, log/1, + init/1]). + +-define(NAME, ?MODULE). + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +start() -> + Parent = self(), + proc_lib:start(?MODULE, init, [Parent]). + +stop() -> + global:send(?NAME, {?MODULE, stop}). + + +log(Event) -> + global:send(?NAME, {?MODULE, node(), Event}). + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +init(Parent) -> + process_flag(priority, high), + case global:register_name(?NAME, self()) of + yes -> + info_msg("Starting", []), + proc_lib:init_ack(Parent, {ok, self()}), + loop(#{parent => Parent}); + no -> + warning_msg("Already started", []), + proc_lib:init_ack(Parent, {error, already_started}), + exit(normal) + end. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +loop(State) -> + receive + {?MODULE, stop} -> + warning_msg("Stopping", []), + exit(normal); + + {?MODULE, Node, Event} -> + State2 = process_event(State, Node, Event), + loop(State2); + + {nodedown = Event, Node} -> + State2 = process_event(State, Node, Event), + loop(State2); + + _ -> + loop(State) + end. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +process_event(State, Node, {Pid, TS, Tag, Info}) -> + process_system_event(State, Node, Pid, TS, Tag, Info); + +process_event(State, Node, {TS, starting}) -> + FTS = snmp_misc:format_timestamp(TS), + info_msg("System Monitor on node ~p starting at ~s", [Node, FTS]), + if + (Node =/= node()) -> + erlang:monitor_node(Node, true); + true -> + ok + end, + State; + +process_event(State, Node, {TS, already_started}) -> + FTS = snmp_misc:format_timestamp(TS), + info_msg("System Monitor on node ~p already started", [Node, FTS]), + State; + +process_event(State, Node, nodedown) -> + info_msg("Node ~p down", [Node]), + State; + +process_event(State, Node, Event) -> + warning_msg("Received unknown event from node ~p:" + "~n ~p", [Node, Event]), + State. + + +%% System Monitor events +process_system_event(State, Node, Pid, TS, long_gc, Info) -> + print_system_event("Long GC", Node, Pid, TS, Info), + State; +process_system_event(State, Node, Pid, TS, long_schedule, Info) -> + print_system_event("Long Schedule", Node, Pid, TS, Info), + State; +process_system_event(State, Node, Pid, TS, large_heap, Info) -> + print_system_event("Large Heap", Node, Pid, TS, Info), + State; +process_system_event(State, Node, Pid, TS, busy_port, Info) -> + print_system_event("Busy port", Node, Pid, TS, Info), + State; +process_system_event(State, Node, Pid, TS, busy_dist_port, Info) -> + print_system_event("Busy dist port", Node, Pid, TS, Info), + State; + +%% And everything else +process_system_event(State, Node, Pid, TS, Tag, Info) -> + Pre = f("Unknown Event '~p'", [Tag]), + print_system_event(Pre, Node, Pid, TS, Info), + State. + + +print_system_event(Pre, Node, Pid, TS, Info) -> + FTS = snmp_misc:format_timestamp(TS), + warning_msg("~s from ~p (~p) at ~s:" + "~n ~p", [Pre, Node, Pid, FTS, Info]). + +f(F, A) -> + lists:flatten(io_lib:format(F, A)). + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +info_msg(F, A) -> + error_logger:info_msg(format_msg(F, A), []). + +warning_msg(F, A) -> + error_logger:warning_msg(format_msg(F, A), []). + + +format_msg(F, A) -> + lists:flatten(io_lib:format( + "~n" ++ + "****** SNMP TEST GLOBAL SYSTEM MONITOR ******~n~n" ++ + F ++ "~n~n", + A)). + diff --git a/lib/snmp/test/snmp_test_sys_monitor.erl b/lib/snmp/test/snmp_test_sys_monitor.erl new file mode 100644 index 0000000000..2291c6ca97 --- /dev/null +++ b/lib/snmp/test/snmp_test_sys_monitor.erl @@ -0,0 +1,86 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2019-2019. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% + +-module(snmp_test_sys_monitor). + +-export([start/0, stop/0, + init/1]). + +-define(NAME, ?MODULE). +-define(GSM, snmp_test_global_sys_monitor). + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +start() -> + Parent = self(), + proc_lib:start(?MODULE, init, [Parent]). + +stop() -> + case whereis(?NAME) of + Pid when is_pid(Pid) -> + Pid ! {?MODULE, stop}, + ok; + _ -> + ok + end. + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +init(Parent) -> + process_flag(priority, high), + try register(?NAME, self()) of + true -> + global:sync(), + MonSettings = [ + busy_port, + busy_dist_port, + {long_gc, 1000}, + {long_schedule, 1000}, + {large_heap, 8*1024*1024} % 8 MB + ], + erlang:system_monitor(self(), MonSettings), + ?GSM:log({erlang:timestamp(), starting}), + proc_lib:init_ack(Parent, {ok, self()}), + loop(#{parent => Parent}) + catch + _:_:_ -> + ?GSM:log({erlang:timestamp(), already_started}), + proc_lib:init_ack(Parent, {error, already_started}), + exit(normal) + end. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +loop(State) -> + receive + {monitor, Pid, Tag, Info} -> + ?GSM:log({Pid, erlang:timestamp(), Tag, Info}), + loop(State); + + _ -> + loop(State) + end. + + + -- cgit v1.2.3 From 447c50b21b35fa0fd35261d5add4f10b50cbda12 Mon Sep 17 00:00:00 2001 From: Micael Karlberg Date: Wed, 12 Jun 2019 12:02:59 +0200 Subject: [snmp|agent|test] Attempt to skip when system events happen When a manager request timeout occurs (the manager sends a request to the agent but does not receive a reply in time), we now check if there has been any system monitor events, and if so we instead skip's the test case. --- lib/snmp/test/snmp_agent_test.erl | 5 ++ lib/snmp/test/snmp_agent_test_lib.erl | 23 ++++-- lib/snmp/test/snmp_test_global_sys_monitor.erl | 99 ++++++++++++++++++++------ 3 files changed, 101 insertions(+), 26 deletions(-) diff --git a/lib/snmp/test/snmp_agent_test.erl b/lib/snmp/test/snmp_agent_test.erl index 4a797efbe5..1ff8c2e057 100644 --- a/lib/snmp/test/snmp_agent_test.erl +++ b/lib/snmp/test/snmp_agent_test.erl @@ -772,6 +772,8 @@ init_per_testcase(Case, Config) when is_list(Config) -> Result = init_per_testcase1(Case, Config), + snmp_test_global_sys_monitor:reset_events(), + p("init_per_testcase -> done when" "~n Result: ~p" "~n Nodes: ~p", [Result, erlang:nodes()]), @@ -821,6 +823,9 @@ end_per_testcase(Case, Config) when is_list(Config) -> "~n Nodes: ~p", [Config, erlang:nodes()]), display_log(Config), + + p("system events during test: " + "~n ~p", [snmp_test_global_sys_monitor:events()]), Result = end_per_testcase1(Case, Config), diff --git a/lib/snmp/test/snmp_agent_test_lib.erl b/lib/snmp/test/snmp_agent_test_lib.erl index ddc901216e..b1c29c4235 100644 --- a/lib/snmp/test/snmp_agent_test_lib.erl +++ b/lib/snmp/test/snmp_agent_test_lib.erl @@ -332,7 +332,7 @@ await_tc_runner_started(Runner, OldFlag) -> {tc_runner_started, Runner} -> ?PRINT2("TC runner start acknowledged~n"), ok - after 10000 -> + after 10000 -> %% We should *really* not have to wait this long, but... trap_exit(OldFlag), unlink_and_flush_exit(Runner), RunnerInfo = process_info(Runner), @@ -367,6 +367,8 @@ await_tc_runner_done(Runner, OldFlag) -> case Ret of {error, Reason} -> exit(Reason); + {skip, _} = SKIP -> + exit(SKIP); OK -> OK end @@ -1332,10 +1334,23 @@ do_expect2(Check, Type, Err, Idx, ExpVBs, To) {Type2, Err2, Idx2, VBs2}, ReqId}}; - Error -> - io_format_expect("received error (16): " + + {error, timeout} = Error -> + SysEvs = snmp_test_global_sys_monitor:events(), + io_format_expect("got timeout (16) when system events:" + "~n ~p", [SysEvs]), + if + (SysEvs =/= []) -> + Error; + true -> + {skip, {system_events, SysEvs}} + end; + + + Error -> + io_format_expect("received error (17): " "~n Error: ~p", [Error]), - Error + Error end. diff --git a/lib/snmp/test/snmp_test_global_sys_monitor.erl b/lib/snmp/test/snmp_test_global_sys_monitor.erl index 04dbc72133..eafb96621a 100644 --- a/lib/snmp/test/snmp_test_global_sys_monitor.erl +++ b/lib/snmp/test/snmp_test_global_sys_monitor.erl @@ -20,8 +20,11 @@ -module(snmp_test_global_sys_monitor). --export([start/0, stop/0, log/1, - init/1]). +-export([start/0, stop/0, + reset_events/0, + events/0, + log/1]). +-export([init/1]). -define(NAME, ?MODULE). @@ -33,11 +36,18 @@ start() -> proc_lib:start(?MODULE, init, [Parent]). stop() -> - global:send(?NAME, {?MODULE, stop}). + cast(stop). +%% This does not reset the global counter but the "collector" +%% See events for more info. +reset_events() -> + cast(reset_events). + +events() -> + call(events). log(Event) -> - global:send(?NAME, {?MODULE, node(), Event}). + cast({node(), Event}). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -48,7 +58,7 @@ init(Parent) -> yes -> info_msg("Starting", []), proc_lib:init_ack(Parent, {ok, self()}), - loop(#{parent => Parent}); + loop(#{parent => Parent, ev_cnt => 0, evs => []}); no -> warning_msg("Already started", []), proc_lib:init_ack(Parent, {error, already_started}), @@ -61,10 +71,19 @@ init(Parent) -> loop(State) -> receive {?MODULE, stop} -> - warning_msg("Stopping", []), + warning_msg("Stopping with ~w events counted", + [maps:get(ev_cnt, State)]), exit(normal); - {?MODULE, Node, Event} -> + {?MODULE, reset_events} -> + loop(State#{evs => []}); + + {?MODULE, Ref, From, events} -> + Evs = maps:get(evs, State), + From ! {?MODULE, Ref, lists:reverse(Evs)}, + loop(State); + + {?MODULE, {Node, Event}} -> State2 = process_event(State, Node, Event), loop(State2); @@ -109,21 +128,27 @@ process_event(State, Node, Event) -> %% System Monitor events -process_system_event(State, Node, Pid, TS, long_gc, Info) -> +%% We only *count* system events +process_system_event(#{ev_cnt := Cnt, evs := Evs} = State, + Node, Pid, TS, long_gc = Ev, Info) -> print_system_event("Long GC", Node, Pid, TS, Info), - State; -process_system_event(State, Node, Pid, TS, long_schedule, Info) -> + State#{ev_cnt => Cnt + 1, evs => [{Node, Ev} | Evs]}; +process_system_event(#{ev_cnt := Cnt, evs := Evs} = State, + Node, Pid, TS, long_schedule = Ev, Info) -> print_system_event("Long Schedule", Node, Pid, TS, Info), - State; -process_system_event(State, Node, Pid, TS, large_heap, Info) -> + State#{ev_cnt => Cnt + 1, evs => [{Node, Ev} | Evs]}; +process_system_event(#{ev_cnt := Cnt, evs := Evs} = State, + Node, Pid, TS, large_heap = Ev, Info) -> print_system_event("Large Heap", Node, Pid, TS, Info), - State; -process_system_event(State, Node, Pid, TS, busy_port, Info) -> + State#{ev_cnt => Cnt + 1, evs => [{Node, Ev} | Evs]}; +process_system_event(#{ev_cnt := Cnt, evs := Evs} = State, + Node, Pid, TS, busy_port = Ev, Info) -> print_system_event("Busy port", Node, Pid, TS, Info), - State; -process_system_event(State, Node, Pid, TS, busy_dist_port, Info) -> + State#{ev_cnt => Cnt + 1, evs => [{Node, Ev} | Evs]}; +process_system_event(#{ev_cnt := Cnt, evs := Evs} = State, + Node, Pid, TS, busy_dist_port = Ev, Info) -> print_system_event("Busy dist port", Node, Pid, TS, Info), - State; + State#{ev_cnt => Cnt + 1, evs => [{Node, Ev} | Evs]}; %% And everything else process_system_event(State, Node, Pid, TS, Tag, Info) -> @@ -141,6 +166,36 @@ f(F, A) -> lists:flatten(io_lib:format(F, A)). +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +cast(Msg) -> + try global:send(?NAME, {?MODULE, Msg}) of + Pid when is_pid(Pid) -> + ok + catch + C:E:_ -> + {error, {catched, C, E}} + end. + +call(Req) -> + call(Req, infinity). + +call(Req, Timeout) -> + Ref = make_ref(), + try global:send(?NAME, {?MODULE, Ref, self(), Req}) of + Pid when is_pid(Pid) -> + receive + {?MODULE, Ref, Rep} -> + Rep + after Timeout -> + {error, timeout} + end + catch + C:E:_ -> + {error, {catched, C, E}} + end. + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% info_msg(F, A) -> @@ -151,9 +206,9 @@ warning_msg(F, A) -> format_msg(F, A) -> - lists:flatten(io_lib:format( - "~n" ++ - "****** SNMP TEST GLOBAL SYSTEM MONITOR ******~n~n" ++ - F ++ "~n~n", - A)). + f("~n" ++ + "****** SNMP TEST GLOBAL SYSTEM MONITOR ******~n~n" ++ + F ++ + "~n~n", + A). -- cgit v1.2.3 From a601b7989b90649acd4deb3e42a2105d4e145ca5 Mon Sep 17 00:00:00 2001 From: Micael Karlberg Date: Thu, 13 Jun 2019 11:51:42 +0200 Subject: [snmp|manager|test] Problems starting (slave) nodes When we fail to start a (slave) node with reason timeout, its possible that it actually succeeded, which will cause the following test case(s) init to fail (with 'already started', since it which also try to start this node). But since we don't know if we get this because for this reason or because the previous test case failed to clean up after itself, we cannot assume that the state of the node is ok (and therefor use it). So, we attempt to stop it, and try start it again (*one* time). This has been observed on one (slooow) VM host. --- lib/snmp/test/Makefile | 2 +- lib/snmp/test/snmp_agent_test_lib.erl | 2 +- lib/snmp/test/snmp_manager_test.erl | 76 ++++++++++++++++++++++------------- lib/snmp/test/snmp_test_lib.erl | 8 +++- lib/snmp/test/snmp_test_lib.hrl | 1 + 5 files changed, 58 insertions(+), 31 deletions(-) diff --git a/lib/snmp/test/Makefile b/lib/snmp/test/Makefile index a9142d911d..d9b01536ea 100644 --- a/lib/snmp/test/Makefile +++ b/lib/snmp/test/Makefile @@ -180,7 +180,7 @@ emakebuild: $(EMAKEFILE) targets: mib $(EMAKEFILE) erl -make -old_targets: $(TARGET_FILES) $(TEST_SERVER_TARGETS) +old_targets: mib $(TARGET_FILES) $(TEST_SERVER_TARGETS) $(EMAKEFILE): Makefile $(MAKE_EMAKE) $(ERL_COMPILE_FLAGS) -o$(EBIN) '*_SUITE_make' > $(EMAKEFILE) diff --git a/lib/snmp/test/snmp_agent_test_lib.erl b/lib/snmp/test/snmp_agent_test_lib.erl index b1c29c4235..8e54bcf3b8 100644 --- a/lib/snmp/test/snmp_agent_test_lib.erl +++ b/lib/snmp/test/snmp_agent_test_lib.erl @@ -1340,7 +1340,7 @@ do_expect2(Check, Type, Err, Idx, ExpVBs, To) io_format_expect("got timeout (16) when system events:" "~n ~p", [SysEvs]), if - (SysEvs =/= []) -> + (SysEvs =:= []) -> Error; true -> {skip, {system_events, SysEvs}} diff --git a/lib/snmp/test/snmp_manager_test.erl b/lib/snmp/test/snmp_manager_test.erl index d959d9e09b..7cd3eae0c7 100644 --- a/lib/snmp/test/snmp_manager_test.erl +++ b/lib/snmp/test/snmp_manager_test.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2003-2017. All Rights Reserved. +%% Copyright Ericsson AB 2003-2019. All Rights Reserved. %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. @@ -4841,7 +4841,7 @@ inform2(Config) when is_list(Config) -> "~n ~p", [Addr]), ok; {snmp_notification, inform2_tag1, {no_response, Addr}} -> - p(" received expected \"no response\" " + e("Received unexpected \"no response\" " "notification from: " "~n ~p", [Addr]), {error, no_response} @@ -4975,7 +4975,7 @@ inform3(Config) when is_list(Config) -> "~n ~p", [Addr]), ok; {snmp_notification, inform3_tag1, {got_response, Addr}} -> - p(" received unexpected \"got response\" " + e("Received unexpected \"got response\" " "notification from: " "~n ~p", [Addr]), @@ -5262,7 +5262,7 @@ inform_swarm_collector(N, SentAckCnt, RecvCnt, RespCnt, Timeout) -> inform_swarm_collector(N, SentAckCnt, RecvCnt+1, RespCnt, Timeout); {Err, Idx, VBs} -> - p(" unexpected error status: " + e("Unexpected error status: " "~n Err: ~p" "~n Idx: ~p" "~n VBs: ~p", [Err, Idx, VBs]), @@ -5281,7 +5281,7 @@ inform_swarm_collector(N, SentAckCnt, RecvCnt, RespCnt, Timeout) -> %% The agent did not received ack from the manager in time {snmp_notification, inform2_tag1, {no_response, Addr}} -> - p(" received expected \"no response\" notification " + e("Received expected \"no response\" notification " "from: " "~n ~p", [Addr]), Reason = {no_response, Addr, {N, SentAckCnt, RecvCnt, RespCnt}}, @@ -5458,10 +5458,10 @@ command_handler([{No, Desc, Cmd}|Cmds]) -> p("command_handler -> ~w: ok",[No]), command_handler(Cmds); {error, Reason} -> - p(" command_handler -> ~w error: ~n~p",[No, Reason]), + e("Command_handler -> ~w error: ~n~p",[No, Reason]), ?line ?FAIL({command_failed, No, Reason}); Error -> - p(" command_handler -> ~w unexpected: ~n~p",[No, Error]), + e("Command_handler -> ~w unexpected: ~n~p",[No, Error]), ?line ?FAIL({unexpected_command_result, No, Error}) end. @@ -6327,6 +6327,8 @@ start_manager_node() -> start_node(snmp_manager). start_node(Name) -> + start_node(Name, true). +start_node(Name, Retry) -> Pa = filename:dirname(code:which(?MODULE)), Args = case init:get_argument('CC_TEST') of {ok, [[]]} -> @@ -6337,30 +6339,47 @@ start_node(Name) -> "" end, A = Args ++ " -pa " ++ Pa, - case (catch ?START_NODE(Name, A)) of + try ?START_NODE(Name, A) of {ok, Node} -> Node; - Else -> - ?line ?FAIL(Else) + {error, timeout} -> + e("Failed starting node ~p: timeout", [Name]), + ?line ?FAIL({error_starting_node, Name, timeout}); + {error, {already_running, Node}} when (Retry =:= true) -> + %% Ouch + %% Either we previously failed to (properly) stop the node + %% or it was a failed start, that reported failure (for instance + %% timeout) but actually succeeded. Regardless, we don't know + %% the state of this node, so (try) stop it and then (re-) try + %% start again. + e("Failed starting node ~p: Already Running - try stop", [Node]), + case ?STOP_NODE(Node) of + true -> + p("Successfully stopped old node ~p", [Node]), + start_node(Name, false); + false -> + e("Failed stop old node ~p", [Node]), + ?line ?FAIL({error_starting_node, Node, Retry, already_running}) + end; + {error, {already_running, Node}} -> + e("Failed starting node ~p: Already Running", [Node]), + ?line ?FAIL({error_starting_node, Node, Retry, already_running}); + {error, Reason} -> + e("Failed starting node ~p: ~p", [Name, Reason]), + ?line ?FAIL({error_starting_node, Name, Reason}) + catch + exit:{suite_failed, Reason} -> + e("(suite) Failed starting node ~p: ~p", [Name, Reason]), + ?line ?FAIL({failed_starting_node, Name, Reason}) end. -stop_node(Node) -> - rpc:cast(Node, erlang, halt, []), - await_stopped(Node, 5). -await_stopped(Node, 0) -> - p("await_stopped -> ~p still exist: giving up", [Node]), - ok; -await_stopped(Node, N) -> - Nodes = erlang:nodes(), - case lists:member(Node, Nodes) of - true -> - p("await_stopped -> ~p still exist: ~w", [Node, N]), - ?SLEEP(1000), - await_stopped(Node, N-1); - false -> - p("await_stopped -> ~p gone: ~w", [Node, N]), - ok +stop_node(Node) -> + case ?STOP_NODE(Node) of + true -> + ok; + false -> + ?line ?FAIL({failed_stop_node, Node}) end. @@ -6605,12 +6624,15 @@ rcall(Node, Mod, Func, Args) -> %% ------ +e(F, A) -> + p(" " ++ F, A). + p(F) -> p(F, []). p(F, A) -> p(get(tname), F, A). - + p(TName, F, A) -> io:format("*** [~w][~s] ***" "~n " ++ F ++ "~n", [TName, formated_timestamp()|A]). diff --git a/lib/snmp/test/snmp_test_lib.erl b/lib/snmp/test/snmp_test_lib.erl index 42f710e4cd..35bc535a80 100644 --- a/lib/snmp/test/snmp_test_lib.erl +++ b/lib/snmp/test/snmp_test_lib.erl @@ -36,7 +36,7 @@ -export([hours/1, minutes/1, seconds/1, sleep/1]). -export([flush_mqueue/0, trap_exit/0, trap_exit/1]). -export([ping/1, local_nodes/0, nodes_on/1]). --export([start_node/2]). +-export([start_node/2, stop_node/1]). -export([is_app_running/1, is_crypto_running/0, is_mnesia_running/0, is_snmp_running/0]). -export([crypto_start/0, crypto_support/0]). @@ -512,10 +512,14 @@ nodes_on(Host) when is_list(Host) -> start_node(Name, Args) -> - Opts = [{cleanup,false}, {args,Args}], + Opts = [{cleanup, false}, {args, Args}], test_server:start_node(Name, slave, Opts). +stop_node(Node) -> + test_server:stop_node(Node). + + %% ---------------------------------------------------------------- %% Application and Crypto utility functions %% diff --git a/lib/snmp/test/snmp_test_lib.hrl b/lib/snmp/test/snmp_test_lib.hrl index c66602b779..99b86a928c 100644 --- a/lib/snmp/test/snmp_test_lib.hrl +++ b/lib/snmp/test/snmp_test_lib.hrl @@ -92,6 +92,7 @@ -define(LNODES(), snmp_test_lib:local_nodes()). -define(NODES(H), snmp_test_lib:nodes_on(H)). -define(START_NODE(N,A), snmp_test_lib:start_node(N,A)). +-define(STOP_NODE(N), snmp_test_lib:stop_node(N)). %% - Application and Crypto utility macros - -- cgit v1.2.3 From 7be6efd4ca4865c5be76dc35d880832ac02eda5b Mon Sep 17 00:00:00 2001 From: Micael Karlberg Date: Wed, 19 Jun 2019 14:44:07 +0200 Subject: [snmp|agent|test] (maybe) Needs to throw the skip We maybe needs to throw the skip when we detect system events. --- lib/snmp/test/snmp_agent_test.erl | 1 - lib/snmp/test/snmp_agent_test_lib.erl | 16 ++++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/snmp/test/snmp_agent_test.erl b/lib/snmp/test/snmp_agent_test.erl index 1ff8c2e057..a45cfa9e98 100644 --- a/lib/snmp/test/snmp_agent_test.erl +++ b/lib/snmp/test/snmp_agent_test.erl @@ -6593,7 +6593,6 @@ otp_4394_test() -> gn([[1,1]]), Res = case snmp_test_mgr:expect(1, [{[sysDescr,0], "Erlang SNMP agent"}]) of - %% {error, 1, {"?",[]}, {"~w",[timeout]}} {error, 1, _, {_, [timeout]}} -> ?DBG("otp_4394_test -> expected result: timeout", []), ok; diff --git a/lib/snmp/test/snmp_agent_test_lib.erl b/lib/snmp/test/snmp_agent_test_lib.erl index 8e54bcf3b8..931616edf9 100644 --- a/lib/snmp/test/snmp_agent_test_lib.erl +++ b/lib/snmp/test/snmp_agent_test_lib.erl @@ -1160,6 +1160,18 @@ do_expect(trap, Enterp, Generic, Specific, ExpVBs, To) -> {PureE, Generic, Specific, ExpVBs}, {Ent2, G2, Spec2, VBs}}}; + {error, timeout} = Error -> + SysEvs = snmp_test_global_sys_monitor:events(), + io_format_expect("[expecting trap] got timeout when system events:" + "~n ~p", [SysEvs]), + if + (SysEvs =:= []) -> + Error; + true -> + throw({skip, {system_events, SysEvs}}) + end; + + Error -> Error end. @@ -1261,7 +1273,7 @@ do_expect2(Check, Type, Err, Idx, ExpVBs, To) io_format_expect("received unexpected pdu with (11) " "~n Type: ~p" "~n ReqId: ~p" - "~n Errot status: ~p" + "~n Error status: ~p" "~n Error index: ~p", [Type2, ReqId, Err2, Idx2]), {error, @@ -1324,7 +1336,7 @@ do_expect2(Check, Type, Err, Idx, ExpVBs, To) io_format_expect("received unexpected pdu with (15) " "~n Type: ~p" "~n ReqId: ~p" - "~n Errot status: ~p" + "~n Error status: ~p" "~n Error index: ~p" "~n Varbinds: ~p", [Type2, ReqId, Err2, Idx2, VBs2]), -- cgit v1.2.3 From 0781183c6802e0993dd954a03ee4631b4270301c Mon Sep 17 00:00:00 2001 From: Micael Karlberg Date: Wed, 19 Jun 2019 15:49:29 +0200 Subject: [snmp|agent|test] Try to figure out how "fast" a machine is A very simple way to calculate how long the request timeout shall be. Previously it was simply a constant (3.5 seconds). We assume that the number of schedulers matters... --- lib/snmp/test/snmp_agent_test_lib.erl | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/lib/snmp/test/snmp_agent_test_lib.erl b/lib/snmp/test/snmp_agent_test_lib.erl index 931616edf9..783bb3b4bf 100644 --- a/lib/snmp/test/snmp_agent_test_lib.erl +++ b/lib/snmp/test/snmp_agent_test_lib.erl @@ -984,10 +984,22 @@ expect2(Mod, Line, F) -> %% ---------------------------------------------------------------------- -get_timeout() -> - get_timeout(os:type()). +-define(BASE_REQ_TIMEOUT, 3500). -get_timeout(_) -> 3500. +get_timeout() -> + %% Try to figure out how "fast" a machine is. + %% We assume that the number of schedulers + %% (which depends on the number of core:s) + %% effect the performance of the host... + %% This is obviously not enough. The network + %% also matterns, clock freq or the CPU, ... + %% But its better than what we had before... + case erlang:system_info(schedulers) of + N when is_integer(N) -> + ?BASE_REQ_TIMEOUT + timer:seconds(10 div N); + _ -> + ?BASE_REQ_TIMEOUT + end. receive_pdu(To) -> receive -- cgit v1.2.3 From 9ef3a76da1eef43d2c466c0aa68c18a0863bdbc2 Mon Sep 17 00:00:00 2001 From: Micael Karlberg Date: Mon, 24 Jun 2019 12:22:45 +0200 Subject: [snmp|agent|test] Add another "skip point" A test run where the node died after a series of "busy dist port" events and then the node died which caused the process that ran the test case to crash with reason = noconnection. And since this was a "permanent" node, the remaining test cases then crashed. Again, this has nothing to do with snmp or its test suites. --- lib/snmp/test/snmp_agent_test_lib.erl | 48 +++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/lib/snmp/test/snmp_agent_test_lib.erl b/lib/snmp/test/snmp_agent_test_lib.erl index 783bb3b4bf..5ff79cb3a9 100644 --- a/lib/snmp/test/snmp_agent_test_lib.erl +++ b/lib/snmp/test/snmp_agent_test_lib.erl @@ -319,7 +319,7 @@ tc_try(N, M, F, A) -> await_tc_runner_done(Runner, OldFlag); pang -> ?EPRINT2("tc_try -> ~p *not* running~n", [N]), - exit({node_not_running, N}) + skip({node_not_running, N}) end. await_tc_runner_started(Runner, OldFlag) -> @@ -346,9 +346,22 @@ await_tc_runner_started(Runner, OldFlag) -> await_tc_runner_done(Runner, OldFlag) -> receive {'EXIT', Runner, Reason} -> - ?EPRINT2("TC runner failed: " - "~n ~p~n", [Reason]), - exit({tx_runner_failed, Reason}); + %% This is not a normal (tc) failure (that is the clause below). + %% Instead the tc runner process crashed, for some reason. So + %% check if have got any system events, and if so, skip. + SysEvs = snmp_test_global_sys_monitor:events(), + if + (SysEvs =:= []) -> + ?EPRINT2("TC runner failed: " + "~n ~p~n", [Reason]), + exit({tx_runner_failed, Reason}); + true -> + ?EPRINT2("TC runner failed when we got system events: " + "~n Reason: ~p" + "~n Sys Events: ~p" + "~n", [Reason, SysEvs]), + skip([{reason, Reason}, {system_events, SysEvs}]) + end; {tc_runner_done, Runner, {'EXIT', Rn}, Loc} -> ?PRINT2("call -> done with exit: " "~n Rn: ~p" @@ -367,8 +380,8 @@ await_tc_runner_done(Runner, OldFlag) -> case Ret of {error, Reason} -> exit(Reason); - {skip, _} = SKIP -> - exit(SKIP); + {skip, Reason} -> + skip(Reason); OK -> OK end @@ -460,6 +473,21 @@ tc_run(Mod, Func, Args, Opts) -> (catch snmp_test_mgr:stop()), Res end; + %% try apply(Mod, Func, Args) of + %% Res -> + %% (catch snmp_test_mgr:stop()), + %% Res + %% catch + %% throw:Res -> + %% Res; + %% C:E:S -> + %% ?EPRINT2("Failed (tc) apply: " + %% "~n Class: ~p" + %% "~n Error: ~p" + %% "~n Stack: ~p", [C, E, S]), + %% (catch snmp_test_mgr:stop()), + %% ?FAIL({apply_failed, {Mod, Func, Args}, {C, E, S}}) + %% end; {error, Reason} -> ?EPRINT2("Failed starting (test) manager: " @@ -1180,7 +1208,7 @@ do_expect(trap, Enterp, Generic, Specific, ExpVBs, To) -> (SysEvs =:= []) -> Error; true -> - throw({skip, {system_events, SysEvs}}) + skip({system_events, SysEvs}) end; @@ -1367,7 +1395,7 @@ do_expect2(Check, Type, Err, Idx, ExpVBs, To) (SysEvs =:= []) -> Error; true -> - {skip, {system_events, SysEvs}} + skip({system_events, SysEvs}) end; @@ -1818,6 +1846,10 @@ rpc(Node, F, A) -> join(Dir, File) -> filename:join(Dir, File). + +skip(R) -> + exit({skip, R}). + %% await_pdu(To) -> %% await_response(To, pdu). %% -- cgit v1.2.3 From 56f91a0f4b2307cd4ffe8d921b038dcb7ec85359 Mon Sep 17 00:00:00 2001 From: Micael Karlberg Date: Thu, 27 Jun 2019 18:29:38 +0200 Subject: [snmp|agent|test] Add another "skip point" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Try to detect if the test case function skip's, end re-exit so that "eventually" the test case runner detects it in turn re-exit...#@!!£##!... --- lib/snmp/test/snmp_agent_test_lib.erl | 31 ++++++++++++++++--------------- lib/snmp/test/snmp_test_server.erl | 4 ++-- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/lib/snmp/test/snmp_agent_test_lib.erl b/lib/snmp/test/snmp_agent_test_lib.erl index 5ff79cb3a9..07600bbaf7 100644 --- a/lib/snmp/test/snmp_agent_test_lib.erl +++ b/lib/snmp/test/snmp_agent_test_lib.erl @@ -362,6 +362,15 @@ await_tc_runner_done(Runner, OldFlag) -> "~n", [Reason, SysEvs]), skip([{reason, Reason}, {system_events, SysEvs}]) end; + {tc_runner_done, Runner, {'EXIT', {skip, Reason}}, Loc} -> + ?PRINT2("call -> done with skip: " + "~n Reason: ~p" + "~n Loc: ~p" + "~n", [Reason, Loc]), + trap_exit(OldFlag), + unlink_and_flush_exit(Runner), + put(test_server_loc, Loc), + skip(Reason); {tc_runner_done, Runner, {'EXIT', Rn}, Loc} -> ?PRINT2("call -> done with exit: " "~n Rn: ~p" @@ -466,28 +475,20 @@ tc_run(Mod, Func, Args, Opts) -> {mibs, mibs(StdM, M)}]) of {ok, _Pid} -> case (catch apply(Mod, Func, Args)) of + {'EXIT', {skip, Reason}} -> + ?EPRINT2("apply skip detected: " + "~n ~p", [Reason]), + (catch snmp_test_mgr:stop()), + ?SKIP(Reason); {'EXIT', Reason} -> + ?EPRINT2("apply exit catched: " + "~n ~p", [Reason]), (catch snmp_test_mgr:stop()), ?FAIL({apply_failed, {Mod, Func, Args}, Reason}); Res -> (catch snmp_test_mgr:stop()), Res end; - %% try apply(Mod, Func, Args) of - %% Res -> - %% (catch snmp_test_mgr:stop()), - %% Res - %% catch - %% throw:Res -> - %% Res; - %% C:E:S -> - %% ?EPRINT2("Failed (tc) apply: " - %% "~n Class: ~p" - %% "~n Error: ~p" - %% "~n Stack: ~p", [C, E, S]), - %% (catch snmp_test_mgr:stop()), - %% ?FAIL({apply_failed, {Mod, Func, Args}, {C, E, S}}) - %% end; {error, Reason} -> ?EPRINT2("Failed starting (test) manager: " diff --git a/lib/snmp/test/snmp_test_server.erl b/lib/snmp/test/snmp_test_server.erl index a77bdc142c..ab7dbbbaa0 100644 --- a/lib/snmp/test/snmp_test_server.erl +++ b/lib/snmp/test/snmp_test_server.erl @@ -207,7 +207,7 @@ do_subcases(Mod, Fun, [{conf, Init, Cases, Finish}|SubCases], Config, Acc) [{failed, {Mod, Fun}, Error}] end, do_subcases(Mod, Fun, SubCases, Config, [R|Acc]); -do_subcases(Mod, Fun, [SubCase|SubCases], Config, Acc) when atom(SubCase) -> +do_subcases(Mod, Fun, [SubCase|SubCases], Config, Acc) when is_atom(SubCase) -> R = do_case(Mod, SubCase, Config), do_subcases(Mod, Fun, SubCases,Config, [R|Acc]). @@ -407,7 +407,7 @@ d(_, _, _, _) -> ok. timestamp() -> - {Date, Time} = calendar:now_to_datetime( now() ), + {Date, Time} = calendar:now_to_datetime( erlang:timestamp() ), {YYYY, MM, DD} = Date, {Hour, Min, Sec} = Time, FormatDate = -- cgit v1.2.3 From 1df2cdb7d3e234bebd7ed6fb88081b324b69952d Mon Sep 17 00:00:00 2001 From: Micael Karlberg Date: Tue, 2 Jul 2019 16:16:22 +0200 Subject: [snmp|agent|test] Add another "skip point" We have hosts (mostly *very* slooow VMs) that can timeout anything. Since we are basically testing communication, we therefor must check for system events at every failure. Grrr! --- lib/snmp/test/snmp_agent_test_lib.erl | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lib/snmp/test/snmp_agent_test_lib.erl b/lib/snmp/test/snmp_agent_test_lib.erl index 07600bbaf7..615d6774b9 100644 --- a/lib/snmp/test/snmp_agent_test_lib.erl +++ b/lib/snmp/test/snmp_agent_test_lib.erl @@ -481,10 +481,24 @@ tc_run(Mod, Func, Args, Opts) -> (catch snmp_test_mgr:stop()), ?SKIP(Reason); {'EXIT', Reason} -> - ?EPRINT2("apply exit catched: " - "~n ~p", [Reason]), + %% We have hosts (mostly *very* slooow VMs) that + %% can timeout anything. Since we are basically + %% testing communication, we therefor must check + %% for system events at every failure. Grrr! + SysEvs = snmp_test_global_sys_monitor:events(), (catch snmp_test_mgr:stop()), - ?FAIL({apply_failed, {Mod, Func, Args}, Reason}); + if + (SysEvs =:= []) -> + ?EPRINT2("TC runner failed: " + "~n ~p~n", [Reason]), + ?FAIL({apply_failed, {Mod, Func, Args}, Reason}); + true -> + ?EPRINT2("apply exit catched when we got system events: " + "~n Reason: ~p" + "~n Sys Events: ~p" + "~n", [Reason, SysEvs]), + ?SKIP([{reason, Reason}, {system_events, SysEvs}]) + end; Res -> (catch snmp_test_mgr:stop()), Res -- cgit v1.2.3