From d9f6409a994ae74c8b46fc1425b07134de9d0546 Mon Sep 17 00:00:00 2001 From: Micael Karlberg Date: Fri, 7 Jun 2019 14:29:47 +0200 Subject: [snmp|test] Use system monitor to discover slow VM issues We have experienced intermittent failures of random test cases on slow VMs. In order to get more info about the system, we use system monitor to collect info. We have one global monitor system, and one local monitor on each node. Each local monitor report to the global, which in turn prints a message. --- lib/snmp/test/modules.mk | 2 + lib/snmp/test/snmp_agent_test.erl | 6 +- lib/snmp/test/snmp_agent_test_lib.erl | 7 +- lib/snmp/test/snmp_test_global_sys_monitor.erl | 159 +++++++++++++++++++++++++ lib/snmp/test/snmp_test_sys_monitor.erl | 86 +++++++++++++ 5 files changed, 257 insertions(+), 3 deletions(-) create mode 100644 lib/snmp/test/snmp_test_global_sys_monitor.erl create mode 100644 lib/snmp/test/snmp_test_sys_monitor.erl (limited to 'lib/snmp/test') diff --git a/lib/snmp/test/modules.mk b/lib/snmp/test/modules.mk index 8b6547f9a9..ec3870dbd8 100644 --- a/lib/snmp/test/modules.mk +++ b/lib/snmp/test/modules.mk @@ -42,6 +42,8 @@ SUITE_MODULES = \ snmp_manager_test TEST_UTIL_MODULES = \ + snmp_test_global_sys_monitor \ + snmp_test_sys_monitor \ snmp_test_lib \ snmp_test_manager \ snmp_test_mgr \ diff --git a/lib/snmp/test/snmp_agent_test.erl b/lib/snmp/test/snmp_agent_test.erl index 860ca17cdb..4a797efbe5 100644 --- a/lib/snmp/test/snmp_agent_test.erl +++ b/lib/snmp/test/snmp_agent_test.erl @@ -557,6 +557,8 @@ init_per_suite(Config0) when is_list(Config0) -> Config3 = [{mib_dir, MibDir}, {std_mib_dir, StdMibDir} | Config2], + snmp_test_global_sys_monitor:start(), + snmp_test_sys_monitor:start(), % We need one on this node also snmp_test_mgr_counter_server:start(), p("init_per_suite -> end when" @@ -580,6 +582,8 @@ end_per_suite(Config) when is_list(Config) -> p("end_per_suite -> failed stopping counter server" "~n Reason: ~p", [Reason]) end, + snmp_test_sys_monitor:stop(), + snmp_test_global_sys_monitor:stop(), p("end_per_suite -> end when" "~n Nodes: ~p", [erlang:nodes()]), @@ -1654,7 +1658,7 @@ create_local_db_dir(Config) when is_list(Config) -> Name = list_to_atom(atom_to_list(create_local_db_dir) ++"-"++As++"-"++Bs++"-"++Cs), Pa = filename:dirname(code:which(?MODULE)), - {ok,Node} = ?t:start_node(Name, slave, [{args, "-pa "++Pa}]), + {ok,Node} = ?t:start_node(Name, slave, [{args, "-pa " ++ Pa}]), %% first start with a nonexisting DbDir Fun1 = fun() -> diff --git a/lib/snmp/test/snmp_agent_test_lib.erl b/lib/snmp/test/snmp_agent_test_lib.erl index 1128fc8a8c..ddc901216e 100644 --- a/lib/snmp/test/snmp_agent_test_lib.erl +++ b/lib/snmp/test/snmp_agent_test_lib.erl @@ -1453,12 +1453,15 @@ start_node(Name) -> "" end, %% Do not use start_link!!! (the proc that calls this one is tmp) - ?DBG("start_node -> Args: ~p~n",[Args]), - A = Args ++ " -pa " ++ Pa, + ?DBG("start_node -> Args: ~p~n", [Args]), + A = Args ++ " -pa " ++ Pa ++ + " -s " ++ atom_to_list(snmp_test_sys_monitor) ++ " start" ++ + " -s global sync", case (catch ?START_NODE(Name, A)) of {ok, Node} -> %% Tell the test_server to not clean up things it never started. ?DBG("start_node -> Node: ~p",[Node]), + global:sync(), {ok, Node}; Else -> ?ERR("start_node -> failed with(other): Else: ~p",[Else]), diff --git a/lib/snmp/test/snmp_test_global_sys_monitor.erl b/lib/snmp/test/snmp_test_global_sys_monitor.erl new file mode 100644 index 0000000000..04dbc72133 --- /dev/null +++ b/lib/snmp/test/snmp_test_global_sys_monitor.erl @@ -0,0 +1,159 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2019-2019. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% + +-module(snmp_test_global_sys_monitor). + +-export([start/0, stop/0, log/1, + init/1]). + +-define(NAME, ?MODULE). + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +start() -> + Parent = self(), + proc_lib:start(?MODULE, init, [Parent]). + +stop() -> + global:send(?NAME, {?MODULE, stop}). + + +log(Event) -> + global:send(?NAME, {?MODULE, node(), Event}). + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +init(Parent) -> + process_flag(priority, high), + case global:register_name(?NAME, self()) of + yes -> + info_msg("Starting", []), + proc_lib:init_ack(Parent, {ok, self()}), + loop(#{parent => Parent}); + no -> + warning_msg("Already started", []), + proc_lib:init_ack(Parent, {error, already_started}), + exit(normal) + end. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +loop(State) -> + receive + {?MODULE, stop} -> + warning_msg("Stopping", []), + exit(normal); + + {?MODULE, Node, Event} -> + State2 = process_event(State, Node, Event), + loop(State2); + + {nodedown = Event, Node} -> + State2 = process_event(State, Node, Event), + loop(State2); + + _ -> + loop(State) + end. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +process_event(State, Node, {Pid, TS, Tag, Info}) -> + process_system_event(State, Node, Pid, TS, Tag, Info); + +process_event(State, Node, {TS, starting}) -> + FTS = snmp_misc:format_timestamp(TS), + info_msg("System Monitor on node ~p starting at ~s", [Node, FTS]), + if + (Node =/= node()) -> + erlang:monitor_node(Node, true); + true -> + ok + end, + State; + +process_event(State, Node, {TS, already_started}) -> + FTS = snmp_misc:format_timestamp(TS), + info_msg("System Monitor on node ~p already started", [Node, FTS]), + State; + +process_event(State, Node, nodedown) -> + info_msg("Node ~p down", [Node]), + State; + +process_event(State, Node, Event) -> + warning_msg("Received unknown event from node ~p:" + "~n ~p", [Node, Event]), + State. + + +%% System Monitor events +process_system_event(State, Node, Pid, TS, long_gc, Info) -> + print_system_event("Long GC", Node, Pid, TS, Info), + State; +process_system_event(State, Node, Pid, TS, long_schedule, Info) -> + print_system_event("Long Schedule", Node, Pid, TS, Info), + State; +process_system_event(State, Node, Pid, TS, large_heap, Info) -> + print_system_event("Large Heap", Node, Pid, TS, Info), + State; +process_system_event(State, Node, Pid, TS, busy_port, Info) -> + print_system_event("Busy port", Node, Pid, TS, Info), + State; +process_system_event(State, Node, Pid, TS, busy_dist_port, Info) -> + print_system_event("Busy dist port", Node, Pid, TS, Info), + State; + +%% And everything else +process_system_event(State, Node, Pid, TS, Tag, Info) -> + Pre = f("Unknown Event '~p'", [Tag]), + print_system_event(Pre, Node, Pid, TS, Info), + State. + + +print_system_event(Pre, Node, Pid, TS, Info) -> + FTS = snmp_misc:format_timestamp(TS), + warning_msg("~s from ~p (~p) at ~s:" + "~n ~p", [Pre, Node, Pid, FTS, Info]). + +f(F, A) -> + lists:flatten(io_lib:format(F, A)). + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +info_msg(F, A) -> + error_logger:info_msg(format_msg(F, A), []). + +warning_msg(F, A) -> + error_logger:warning_msg(format_msg(F, A), []). + + +format_msg(F, A) -> + lists:flatten(io_lib:format( + "~n" ++ + "****** SNMP TEST GLOBAL SYSTEM MONITOR ******~n~n" ++ + F ++ "~n~n", + A)). + diff --git a/lib/snmp/test/snmp_test_sys_monitor.erl b/lib/snmp/test/snmp_test_sys_monitor.erl new file mode 100644 index 0000000000..2291c6ca97 --- /dev/null +++ b/lib/snmp/test/snmp_test_sys_monitor.erl @@ -0,0 +1,86 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2019-2019. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% + +-module(snmp_test_sys_monitor). + +-export([start/0, stop/0, + init/1]). + +-define(NAME, ?MODULE). +-define(GSM, snmp_test_global_sys_monitor). + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +start() -> + Parent = self(), + proc_lib:start(?MODULE, init, [Parent]). + +stop() -> + case whereis(?NAME) of + Pid when is_pid(Pid) -> + Pid ! {?MODULE, stop}, + ok; + _ -> + ok + end. + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +init(Parent) -> + process_flag(priority, high), + try register(?NAME, self()) of + true -> + global:sync(), + MonSettings = [ + busy_port, + busy_dist_port, + {long_gc, 1000}, + {long_schedule, 1000}, + {large_heap, 8*1024*1024} % 8 MB + ], + erlang:system_monitor(self(), MonSettings), + ?GSM:log({erlang:timestamp(), starting}), + proc_lib:init_ack(Parent, {ok, self()}), + loop(#{parent => Parent}) + catch + _:_:_ -> + ?GSM:log({erlang:timestamp(), already_started}), + proc_lib:init_ack(Parent, {error, already_started}), + exit(normal) + end. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +loop(State) -> + receive + {monitor, Pid, Tag, Info} -> + ?GSM:log({Pid, erlang:timestamp(), Tag, Info}), + loop(State); + + _ -> + loop(State) + end. + + + -- cgit v1.2.3