1 files changed, 256 insertions, 81 deletions
diff --git a/lib/snmp/test/snmp_agent_test_lib.erl b/lib/snmp/test/snmp_agent_test_lib.erl
index 6defdadb5a..c0da47dc4c 100644
--- a/lib/snmp/test/snmp_agent_test_lib.erl
+++ b/lib/snmp/test/snmp_agent_test_lib.erl
@@ -66,7 +66,7 @@
 	]).
 
 %% Internal exports
--export([wait/5, run/4]).
+-export([tc_wait/5, tc_run/4]).
 
 -include_lib("kernel/include/file.hrl").
 -include_lib("common_test/include/ct.hrl").
@@ -276,87 +276,197 @@ init_case(Config) when is_list(Config) ->
 %%% configuration.
 %%%--------------------------------------------------
 
-try_test(Mod, Func) ->
-    call(get(mgr_node), ?MODULE, run, [Mod, Func, [], []]).
-
-try_test(Mod, Func, A) ->
-    call(get(mgr_node), ?MODULE, run, [Mod, Func, A, []]).
-
-try_test(Mod, Func, A, Opts) ->
-    call(get(mgr_node), ?MODULE, run, [Mod, Func, A, Opts]).
-
-call(N,M,F,A) ->
-    ?DBG("call -> entry with~n"
-	   "    N:     ~p~n"
-	   "    M:     ~p~n"
-	   "    F:     ~p~n"
-	   "    A:     ~p~n"
-	   "  when~n"
-	   "    get(): ~p",
-	   [N,M,F,A,get()]),
-    spawn(N, ?MODULE, wait, [self(),get(),M,F,A]),
+%% try_test/2,3: convenience wrappers, defaulting the arguments and/or
+%% the options of the test case function to the empty list.
+try_test(TcRunMod, TcRunFunc) -> try_test(TcRunMod, TcRunFunc, [], []).
+
+try_test(TcRunMod, TcRunFunc, TcRunArgs) ->
+    try_test(TcRunMod, TcRunFunc, TcRunArgs, []).
+
+%% Run tc_run/4 (of this module) for the test case function on the
+%% manager node, as found in the process dictionary (mgr_node).
+try_test(TcRunMod, TcRunFunc, TcRunArgs, TcRunOpts) ->
+    RunnerArgs = [TcRunMod, TcRunFunc, TcRunArgs, TcRunOpts],
+    tc_try(get(mgr_node), ?MODULE, tc_run, RunnerArgs).
+
+%% We spawn a test case runner process on the manager node.
+%% The assumption is that the manager shall do something, but
+%% not all test cases have the manager perform actions.
+%% In some cases we make a rpc call back to the agent node directly
+%% and call something in the agent... (for example the info_test
+%% test case).
+%% We use link (instead of monitor) so that the test case
+%% timeout cleanup (kills) also takes effect on the test case runner
+%% process.
+
+tc_try(N, M, F, A) ->
+    ?PRINT2("tc_try -> entry with"
+            "~n      N:     ~p"
+            "~n      M:     ~p"
+            "~n      F:     ~p"
+            "~n      A:     ~p"
+            "~n   when"
+            "~n      get(): ~p"
+            "~n", [N,
+                   M, F, A,
+                   get()]),
+    case net_adm:ping(N) of
+        pong ->
+            ?PRINT2("tc_try -> ~p still running - start runner~n", [N]),
+            OldFlag = trap_exit(true), % Make sure we catch it
+            Runner  = spawn_link(N, ?MODULE, tc_wait, [self(), get(), M, F, A]),
+            await_tc_runner_started(Runner, OldFlag),
+            await_tc_runner_done(Runner, OldFlag);
+        pang ->
+            ?EPRINT2("tc_try -> ~p *not* running~n", [N]),
+            skip({node_not_running, N})
+    end.
+
+%% Wait (at most 10 seconds) for the runner's {tc_runner_started, Runner} ack.
+await_tc_runner_started(Runner, OldFlag) ->
+    ?PRINT2("await tc-runner (~p) start ack~n", [Runner]),
     receive
-	{done, {'EXIT', Rn}, Loc} ->
-	    ?DBG("call -> done with exit: "
-		 "~n   Rn:  ~p"
-		 "~n   Loc: ~p", [Rn, Loc]),
+        {'EXIT', Runner, Reason} ->
+            ?EPRINT2("TC runner start failed: ~n   ~p~n", [Reason]),
+            exit({tx_runner_start_failed, Reason});
+        {tc_runner_started, Runner} ->
+            ?PRINT2("TC runner start acknowledged~n"),
+            ok
+    after 10000 -> %% We should *really* not have to wait this long, but...
+            trap_exit(OldFlag),
+            unlink_and_flush_exit(Runner),
+            %% Runner may be on another node; process_info/1 needs a local pid
+            RunnerInfo = rpc:call(node(Runner), erlang, process_info, [Runner], 5000),
+            ?EPRINT2("TC runner start timeout: "
+                     "~n   ~p", [RunnerInfo]),
+            exit(Runner, kill), % No start ack within 10 seconds: give up on it
+            exit({tc_runner_start, timeout, RunnerInfo})
+    end.
+
+await_tc_runner_done(Runner, OldFlag) -> % Wait (no timeout) for the runner's result or crash
+    receive
+        {'EXIT', Runner, Reason} ->
+            %% This is not a normal (tc) failure (that is the clause below).
+            %% Instead the tc runner process crashed, for some reason. So
+            %% check if we have got any system events, and if so, skip.
+            SysEvs = snmp_test_global_sys_monitor:events(),
+            if
+                (SysEvs =:= []) ->
+                    ?EPRINT2("TC runner failed: "
+                             "~n   ~p~n", [Reason]),
+                    exit({tx_runner_failed, Reason});
+                true ->
+                    ?EPRINT2("TC runner failed when we got system events: "
+                             "~n   Reason:     ~p"
+                             "~n   Sys Events: ~p"
+                             "~n", [Reason, SysEvs]),
+                    skip([{reason, Reason}, {system_events, SysEvs}])
+            end;
+	{tc_runner_done, Runner, {'EXIT', {skip, Reason}}, Loc} ->
+	    ?PRINT2("call -> done with skip: "
+                    "~n   Reason: ~p"
+                    "~n   Loc:    ~p"
+                    "~n", [Reason, Loc]),
+            trap_exit(OldFlag),
+            unlink_and_flush_exit(Runner),
+	    put(test_server_loc, Loc),
+	    skip(Reason);
+	{tc_runner_done, Runner, {'EXIT', Rn}, Loc} ->
+	    ?PRINT2("call -> done with exit: "
+                    "~n   Rn:  ~p"
+                    "~n   Loc: ~p"
+                    "~n", [Rn, Loc]),
+            trap_exit(OldFlag),
+            unlink_and_flush_exit(Runner),
 	    put(test_server_loc, Loc),
 	    exit(Rn);
-	{done, Ret, _Zed} -> 
+	{tc_runner_done, Runner, Ret, _Zed} -> 
 	    ?DBG("call -> done:"
 		 "~n   Ret: ~p"
 		 "~n   Zed: ~p", [Ret, _Zed]),
+            trap_exit(OldFlag),
+            unlink_and_flush_exit(Runner),
 	    case Ret of
 		{error, Reason} ->
 		    exit(Reason);
+		{skip, Reason} ->
+		    skip(Reason);
 		OK ->
 		    OK
 	    end
     end.
 
-wait(From, Env, M, F, A) ->
-    ?DBG("wait -> entry with"
-	 "~n   From: ~p"
-	 "~n   Env:  ~p"
-	 "~n   M:    ~p"
-	 "~n   F:    ~p"
-	 "~n   A:    ~p", [From, Env, M, F, A]),
+trap_exit(Flag) when is_boolean(Flag) ->    % Returns the previous trap_exit value
+    erlang:process_flag(trap_exit, Flag).
+
+%% Remove the link to Pid and discard an 'EXIT' message from it, if one
+%% is already in the mailbox (never blocks, thanks to 'after 0').
+unlink_and_flush_exit(Pid) ->
+    unlink(Pid),
+    receive
+        {'EXIT', Pid, _Reason} -> ok
+    after 0 -> ok
+    end.
+
+tc_wait(From, Env, M, F, A) -> % Body of the runner process spawned by tc_try/4
+    ?PRINT2("tc_wait -> entry with"
+            "~n   From: ~p"
+            "~n   Env:  ~p"
+            "~n   M:    ~p"
+            "~n   F:    ~p"
+            "~n   A:    ~p", [From, Env, M, F, A]),
+    From ! {tc_runner_started, self()}, % Start ack, awaited by the spawner
     lists:foreach(fun({K,V}) -> put(K,V) end, Env),
-    Rn = (catch apply(M, F, A)),
-    ?DBG("wait -> Rn: ~n~p", [Rn]),
-    From ! {done, Rn, get(test_server_loc)},
-    exit(Rn).
-
-run(Mod, Func, Args, Opts) ->
-    ?DBG("run -> entry with"
-	 "~n   Mod:  ~p"
-	 "~n   Func: ~p"
-	 "~n   Args: ~p"
-	 "~n   Opts: ~p", [Mod, Func, Args, Opts]),
-    M = get(mib_dir),
-    Dir = get(mgr_dir),
-    User = snmp_misc:get_option(user, Opts, "all-rights"),
-    SecLevel = snmp_misc:get_option(sec_level, Opts, noAuthNoPriv),
-    EngineID = snmp_misc:get_option(engine_id, Opts, "agentEngine"),
+    ?PRINT2("tc_wait -> env set - now run tc~n"),
+    Res = (catch apply(M, F, A)),
+    ?PRINT2("tc_wait -> tc run done: "
+            "~n   ~p"
+            "~n", [Res]),
+    From ! {tc_runner_done, self(), Res, get(test_server_loc)},
+    %% The point of this is that in some cases we have seen the
+    %% exit signal being "passed on" to CT, which considers any
+    %% exit a failure (even if it is {'EXIT', ok}).
+    %% So, just to be on the safe side, convert an 'ok' to a 'normal'.
+    case Res of
+        ok ->
+            exit(normal);
+        {ok, _} ->
+            exit(normal);
+        _ ->
+            exit(Res)
+    end.
+
+tc_run(Mod, Func, Args, Opts) ->
+    ?PRINT2("tc_run -> entry with"
+            "~n   Mod:  ~p"
+            "~n   Func: ~p"
+            "~n   Args: ~p"
+            "~n   Opts: ~p"
+            "~n", [Mod, Func, Args, Opts]),
+    (catch snmp_test_mgr:stop()), % If we had a running mgr from a failed case
+    M           = get(mib_dir),
+    Dir         = get(mgr_dir),
+    User        = snmp_misc:get_option(user, Opts, "all-rights"),
+    SecLevel    = snmp_misc:get_option(sec_level, Opts, noAuthNoPriv),
+    EngineID    = snmp_misc:get_option(engine_id, Opts, "agentEngine"),
     CtxEngineID = snmp_misc:get_option(context_engine_id, Opts, EngineID),
-    Community = snmp_misc:get_option(community, Opts, "all-rights"),
-    ?DBG("run -> start crypto app",[]),
-    _CryptoRes = ?CRYPTO_START(),
-    ?DBG("run -> Crypto: ~p", [_CryptoRes]),
-    catch snmp_test_mgr:stop(), % If we had a running mgr from a failed case
-    StdM = join(code:priv_dir(snmp), "mibs") ++ "/",
-    Vsn = get(vsn), 
-    ?DBG("run -> config:"
-	   "~n   M:           ~p"
-	   "~n   Vsn:         ~p"
-	   "~n   Dir:         ~p"
-	   "~n   User:        ~p"
-	   "~n   SecLevel:    ~p"
-	   "~n   EngineID:    ~p"
-	   "~n   CtxEngineID: ~p"
-	   "~n   Community:   ~p"
-	   "~n   StdM:        ~p",
-	   [M,Vsn,Dir,User,SecLevel,EngineID,CtxEngineID,Community,StdM]),
+    Community   = snmp_misc:get_option(community, Opts, "all-rights"),
+    ?DBG("tc_run -> start crypto app",[]),
+    _CryptoRes  = ?CRYPTO_START(),
+    ?DBG("tc_run -> Crypto: ~p", [_CryptoRes]),
+    StdM        = join(code:priv_dir(snmp), "mibs") ++ "/",
+    Vsn         = get(vsn), 
+    ?PRINT2("tc_run -> config:"
+            "~n   M:           ~p"
+            "~n   Vsn:         ~p"
+            "~n   Dir:         ~p"
+            "~n   User:        ~p"
+            "~n   SecLevel:    ~p"
+            "~n   EngineID:    ~p"
+            "~n   CtxEngineID: ~p"
+            "~n   Community:   ~p"
+            "~n   StdM:        ~p"
+            "~n", [M,Vsn,Dir,User,SecLevel,EngineID,CtxEngineID,Community,StdM]),
     case snmp_test_mgr:start([%% {agent, snmp_test_lib:hostname()},
 			      {packet_server_debug, true},
 			      {debug,               true},
@@ -376,24 +486,45 @@ run(Mod, Func, Args, Opts) ->
 			      {mibs,                mibs(StdM, M)}]) of
 	{ok, _Pid} ->
 	    case (catch apply(Mod, Func, Args)) of
+		{'EXIT', {skip, Reason}} ->
+                    ?EPRINT2("apply skip detected: "
+                             "~n   ~p", [Reason]),
+		    (catch snmp_test_mgr:stop()),
+		    ?SKIP(Reason);
 		{'EXIT', Reason} ->
-		    catch snmp_test_mgr:stop(),
-		    ?FAIL({apply_failed, {Mod, Func, Args}, Reason});
+                    %% We have hosts (mostly *very* slooow VMs) that
+                    %% can time out on anything. Since we are basically
+                    %% testing communication, we therefore must check
+                    %% for system events at every failure. Grrr!
+                    SysEvs = snmp_test_global_sys_monitor:events(),
+		    (catch snmp_test_mgr:stop()),
+                    if
+                        (SysEvs =:= []) ->
+                            ?EPRINT2("TC runner failed: "
+                                     "~n   ~p~n", [Reason]),
+                            ?FAIL({apply_failed, {Mod, Func, Args}, Reason});
+                        true ->
+                            ?EPRINT2("apply exit catched when we got system events: "
+                                     "~n   Reason:     ~p"
+                                     "~n   Sys Events: ~p"
+                                     "~n", [Reason, SysEvs]),
+                            ?SKIP([{reason, Reason}, {system_events, SysEvs}])
+                    end;
 		Res ->
-		    catch snmp_test_mgr:stop(),
+		    (catch snmp_test_mgr:stop()),
 		    Res
 	    end;
 
 	{error, Reason} ->
 	    ?EPRINT2("Failed starting (test) manager: "
                      "~n   ~p", [Reason]),
-	    catch snmp_test_mgr:stop(),
+	    (catch snmp_test_mgr:stop()),
 	    ?line ?FAIL({mgr_start_error, Reason});
 
 	Err ->
 	    ?EPRINT2("Failed starting (test) manager: "
                      "~n   ~p", [Err]),
-	    catch snmp_test_mgr:stop(),
+	    (catch snmp_test_mgr:stop()),
 	    ?line ?FAIL({mgr_start_failure, Err})
     end.
 
@@ -907,10 +1038,22 @@ expect2(Mod, Line, F) ->
 	
 %% ----------------------------------------------------------------------
 
-get_timeout() ->
-    get_timeout(os:type()).
+-define(BASE_REQ_TIMEOUT, 3500).
 
-get_timeout(_)       -> 3500.
+get_timeout() ->
+    %% Try to figure out how "fast" a machine is.
+    %% We assume that the number of schedulers
+    %% (which depends on the number of cores)
+    %% affects the performance of the host...
+    %% This is obviously not enough. The network
+    %% also matters, the clock frequency of the CPU, ...
+    %% But it's better than what we had before...
+    case erlang:system_info(schedulers) of
+        N when is_integer(N) ->
+            ?BASE_REQ_TIMEOUT + timer:seconds(10 div N); % Extra: 10 s for N = 1, 0 s for N > 10
+        _ ->
+            ?BASE_REQ_TIMEOUT
+    end.
 
 receive_pdu(To) ->
     receive
@@ -1083,6 +1226,18 @@ do_expect(trap, Enterp, Generic, Specific, ExpVBs, To) ->
 		     {PureE, Generic, Specific, ExpVBs}, 
 		     {Ent2, G2, Spec2, VBs}}};
 
+	{error, timeout} = Error ->
+            SysEvs = snmp_test_global_sys_monitor:events(),
+	    io_format_expect("[expecting trap] got timeout when system events:"
+                             "~n   ~p", [SysEvs]),
+            if
+                (SysEvs =:= []) ->
+                    Error;
+                true ->
+                    skip({system_events, SysEvs})
+            end;
+
+
 	Error ->
 	    Error
     end.
@@ -1184,7 +1339,7 @@ do_expect2(Check, Type, Err, Idx, ExpVBs, To)
 	    io_format_expect("received unexpected pdu with (11) "
                              "~n   Type:         ~p"
                              "~n   ReqId:        ~p"
-                             "~n   Errot status: ~p"
+                             "~n   Error status: ~p"
                              "~n   Error index:  ~p",
                              [Type2, ReqId, Err2, Idx2]),
 	    {error, 
@@ -1247,7 +1402,7 @@ do_expect2(Check, Type, Err, Idx, ExpVBs, To)
 	    io_format_expect("received unexpected pdu with (15) "
                              "~n   Type:         ~p"
                              "~n   ReqId:        ~p"
-                             "~n   Errot status: ~p"
+                             "~n   Error status: ~p"
                              "~n   Error index:  ~p"
                              "~n   Varbinds:     ~p",
                              [Type2, ReqId, Err2, Idx2, VBs2]),
@@ -1257,10 +1412,23 @@ do_expect2(Check, Type, Err, Idx, ExpVBs, To)
 	      {Type2, Err2, Idx2, VBs2}, 
 	      ReqId}};
 	
-	Error ->
-	    io_format_expect("received error (16):  "
+
+	{error, timeout} = Error ->
+            SysEvs = snmp_test_global_sys_monitor:events(),
+	    io_format_expect("got timeout (16) when system events:"
+                             "~n   ~p", [SysEvs]),
+            if
+                (SysEvs =:= []) ->
+                    Error;
+                true ->
+                    skip({system_events, SysEvs})
+            end;
+
+
+        Error ->
+            io_format_expect("received error (17):  "
                              "~n   Error: ~p", [Error]),
-	    Error
+            Error
     end.
 
 
@@ -1378,12 +1546,15 @@ start_node(Name) ->
 		      ""
 	      end,
     %% Do not use start_link!!! (the proc that calls this one is tmp)
-    ?DBG("start_node -> Args: ~p~n",[Args]),
-    A = Args ++ " -pa " ++ Pa,
+    ?DBG("start_node -> Args: ~p~n", [Args]),
+    A = Args ++ " -pa " ++ Pa ++ 
+        " -s " ++ atom_to_list(snmp_test_sys_monitor) ++ " start" ++ 
+        " -s global sync",
     case (catch ?START_NODE(Name, A)) of
 	{ok, Node} ->
 	    %% Tell the test_server to not clean up things it never started.
 	    ?DBG("start_node -> Node: ~p",[Node]),
+            global:sync(),
 	    {ok, Node};
 	Else  -> 
 	    ?ERR("start_node -> failed with(other): Else: ~p",[Else]),
@@ -1701,6 +1872,10 @@ rpc(Node, F, A) ->
 join(Dir, File) ->
     filename:join(Dir, File).
 
+
+skip(R) ->    % Terminate the calling process with exit reason {skip, R}
+    exit({skip, R}).
+
 %% await_pdu(To) ->
 %%     await_response(To, pdu).
 %%