From 8179e41007922c059b8f9441f563edf694b8b315 Mon Sep 17 00:00:00 2001 From: Siri Hansen Date: Fri, 12 Oct 2012 15:49:22 +0200 Subject: [cover] Add support for test_server OTP-10523 * Added cover:flush(Nodes), which will fetch data from remote nodes without stopping cover on those nodes. * Added cover:get_main_node(), which returns the node name of the main node. This is used by test_server to avoid {error,not_main_node} when a slave starts another slave (e.g. in test_server's own tests). --- lib/tools/src/cover.erl | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'lib/tools/src') diff --git a/lib/tools/src/cover.erl b/lib/tools/src/cover.erl index e21bd1b88c..fb65206938 100644 --- a/lib/tools/src/cover.erl +++ b/lib/tools/src/cover.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2001-2011. All Rights Reserved. +%% Copyright Ericsson AB 2001-2012. All Rights Reserved. %% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -81,8 +81,9 @@ export/1, export/2, import/1, modules/0, imported/0, imported_modules/0, which_nodes/0, is_compiled/1, reset/1, reset/0, + flush/1, stop/0, stop/1]). --export([remote_start/1]). +-export([remote_start/1,get_main_node/0]). %-export([bump/5]). -export([transform/4]). % for test purposes @@ -497,6 +498,19 @@ stop(Node) when is_atom(Node) -> stop(Nodes) -> call({stop,remove_myself(Nodes,[])}). +%% flush(Nodes) -> ok | {error,not_main_node} +%% Nodes = [Node] | Node +%% Node = atom() +%% Error = {not_cover_compiled,Module} +flush(Node) when is_atom(Node) -> + flush([Node]); +flush(Nodes) -> + call({flush,remove_myself(Nodes,[])}). + +%% Used by test_server only. Not documented. +get_main_node() -> + call(get_main_node). + %% bump(Module, Function, Arity, Clause, Line) %% Module = Function = atom() %% Arity = Clause = Line = integer() @@ -710,6 +724,11 @@ main_process_loop(State) -> State1 = State#main_state{nodes=State#main_state.nodes--Nodes}, main_process_loop(State1); + {From, {flush,Nodes}} -> + remote_collect('_',Nodes,false), + reply(From, ok), + main_process_loop(State); + {From, stop} -> lists:foreach( fun(Node) -> @@ -796,6 +815,10 @@ main_process_loop(State) -> State1 = State#main_state{nodes=State#main_state.nodes--[node(Pid)]}, main_process_loop(State1); + {From, get_main_node} -> + reply(From, node()), + main_process_loop(State); + get_status -> io:format("~p~n",[State]), main_process_loop(State) @@ -852,6 +875,10 @@ remote_process_loop(State) -> unregister(?SERVER), remote_reply(State#remote_state.main_node, ok); + {From, get_main_node} -> + reply(From, State#remote_state.main_node), + remote_process_loop(State); + get_status -> io:format("~p~n",[State]), remote_process_loop(State); -- cgit v1.2.3 From f0010583fd53174b72341d64be3a481cccc4623c Mon Sep 17 00:00:00 2001 From: Siri Hansen Date: Fri, 26 Oct 2012 16:07:36 +0200 Subject: [cover] Don't kill remote nodes when connection to main node is lost OTP-10523 Since the will probably be cover compiled modules left in the remote node, we want to keep the cover_server and it's ets tables even if connection to remote node is lost. This way it will not crash (due to ets:insert in non existing table) if functions in cover compiled modules are executed. --- lib/tools/src/cover.erl | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'lib/tools/src') diff --git a/lib/tools/src/cover.erl b/lib/tools/src/cover.erl index fb65206938..2154a21ed6 100644 --- a/lib/tools/src/cover.erl +++ b/lib/tools/src/cover.erl @@ -555,7 +555,10 @@ remote_call(Node,Request) -> Return = receive {'DOWN', Ref, _Type, _Object, _Info} -> - {error,node_dead}; + case Request of + {remote,stop} -> ok; + _ -> {error,node_dead} + end; {?SERVER,Reply} -> Reply end, @@ -583,7 +586,6 @@ init_main(Starter) -> ets:new(?BINARY_TABLE, [set, named_table]), ets:new(?COLLECTION_TABLE, [set, public, named_table]), ets:new(?COLLECTION_CLAUSE_TABLE, [set, public, named_table]), - process_flag(trap_exit,true), Starter ! {?SERVER,started}, main_process_loop(#main_state{}). @@ -595,8 +597,8 @@ main_process_loop(State) -> lists:foldl( fun(Node,Acc) -> case rpc:call(Node,cover,remote_start,[ThisNode]) of - {ok,RPid} -> - link(RPid), + {ok,_RPid} -> + erlang:monitor(process,{?SERVER,Node}), [Node|Acc]; Error -> io:format("Could not start cover on ~w: ~p\n", @@ -721,8 +723,8 @@ main_process_loop(State) -> {From, {stop,Nodes}} -> remote_collect('_',Nodes,true), reply(From, ok), - State1 = State#main_state{nodes=State#main_state.nodes--Nodes}, - main_process_loop(State1); + Nodes1 = State#main_state.nodes--Nodes, + main_process_loop(State#main_state{nodes=Nodes1}); {From, {flush,Nodes}} -> remote_collect('_',Nodes,false), @@ -807,13 +809,10 @@ main_process_loop(State) -> end, main_process_loop(S); - {'EXIT',Pid,_Reason} -> - %% Exit is trapped on the main node only, so this will only happen - %% there. I assume that I'm only linked to cover_servers on remote - %% nodes, so this must be one of them crashing. - %% Remove node from list! - State1 = State#main_state{nodes=State#main_state.nodes--[node(Pid)]}, - main_process_loop(State1); + {'DOWN', _MRef, process, {?SERVER,Node}, _Info} -> + %% A remote cover_server is down, remove node from list. + Nodes1 = State#main_state.nodes--[Node], + main_process_loop(State#main_state{nodes=Nodes1}); {From, get_main_node} -> reply(From, node()), @@ -873,7 +872,7 @@ remote_process_loop(State) -> {remote,stop} -> reload_originals(State#remote_state.compiled), unregister(?SERVER), - remote_reply(State#remote_state.main_node, ok); + ok; % not replying since 'DOWN' message will be received anyway {From, get_main_node} -> reply(From, State#remote_state.main_node), @@ -1121,7 +1120,6 @@ remove_myself([Node|Nodes],Acc) -> remove_myself(Nodes,[Node|Acc]); remove_myself([],Acc) -> Acc. - %%%--Handling of modules state data-------------------------------------- @@ -2334,7 +2332,7 @@ pmap(Fun, [E | Rest], Pids, Limit, Cnt, Acc) when Cnt < Limit -> pmap(Fun, Rest, Pids ++ [Pid], Limit, Cnt + 1, Acc); pmap(Fun, List, [Pid | Pids], Limit, Cnt, Acc) -> receive - {'DOWN', _Ref, process, _, _} -> + {'DOWN', _Ref, process, X, _} when is_pid(X) -> pmap(Fun, List, [Pid | Pids], Limit, Cnt - 1, Acc); {res, Pid, Res} -> pmap(Fun, List, Pids, Limit, Cnt, [Res | Acc]) @@ -2343,6 +2341,6 @@ pmap(_Fun, [], [], _Limit, 0, Acc) -> lists:reverse(Acc); pmap(Fun, [], [], Limit, Cnt, Acc) -> receive - {'DOWN', _Ref, process, _, _} -> + {'DOWN', _Ref, process, X, _} when is_pid(X) -> pmap(Fun, [], [], Limit, Cnt - 1, Acc) end. -- cgit v1.2.3 From 0dbae9d6fcdaa9b699e85a00fe44560b89cc24df Mon Sep 17 00:00:00 2001 From: Siri Hansen Date: Mon, 29 Oct 2012 17:37:29 +0100 Subject: [cover] Allow reconnection if node has been disconnected or down OTP-10523 Earlier, if the connection to a remote cover node was lost, all cover data was lost and the cover_server on the remote node would die. This would cause problems if there were cover compiled modules that would still be executed since they would attempt to write to the no longer existing ets tables belonging to the cover_server. This commit changes this behavior so that the cover_server on the remote node will survive connection loss and continue collecting cover data. If the connection is re-established then the main node will sync with the remote node again and cover data will not be lost (unless the node was down). --- lib/tools/src/cover.erl | 139 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 100 insertions(+), 39 deletions(-) (limited to 'lib/tools/src') diff --git a/lib/tools/src/cover.erl b/lib/tools/src/cover.erl index 2154a21ed6..10f14b0a49 100644 --- a/lib/tools/src/cover.erl +++ b/lib/tools/src/cover.erl @@ -26,12 +26,17 @@ %% ARCHITECTURE %% The coverage tool consists of one process on each node involved in %% coverage analysis. The process is registered as 'cover_server' -%% (?SERVER). All cover_servers in the distributed system are linked -%% together. The cover_server on the 'main' node is in charge, and it -%% traps exits so it can detect nodedown or process crashes on the -%% remote nodes. This process is implemented by the functions -%% init_main/1 and main_process_loop/1. The cover_server on the remote -%% nodes are implemented by the functions init_remote/2 and +%% (?SERVER). The cover_server on the 'main' node is in charge, and +%% it monitors the cover_servers on all remote nodes. When it gets a +%% 'DOWN' message for another cover_server, it marks the node as +%% 'lost'. If a nodeup is received for a lost node the main node +%% ensures that the cover compiled modules are loaded again. If the +%% remote node was alive during the disconnected periode, cover data +%% for this periode will also be included in the analysis. +%% +%% The cover_server process on the main node is implemented by the +%% functions init_main/1 and main_process_loop/1. The cover_server on +%% the remote nodes are implemented by the functions init_remote/2 and %% remote_process_loop/1. %% %% TABLES @@ -90,7 +95,8 @@ -record(main_state, {compiled=[], % [{Module,File}] imported=[], % [{Module,File,ImportFile}] stopper, % undefined | pid() - nodes=[]}). % [Node] + nodes=[], % [Node] + lost_nodes=[]}). % [Node] -record(remote_state, {compiled=[], % [{Module,File}] main_node}). % atom() @@ -586,39 +592,14 @@ init_main(Starter) -> ets:new(?BINARY_TABLE, [set, named_table]), ets:new(?COLLECTION_TABLE, [set, public, named_table]), ets:new(?COLLECTION_CLAUSE_TABLE, [set, public, named_table]), + net_kernel:monitor_nodes(true), Starter ! {?SERVER,started}, main_process_loop(#main_state{}). main_process_loop(State) -> receive {From, {start_nodes,Nodes}} -> - ThisNode = node(), - StartedNodes = - lists:foldl( - fun(Node,Acc) -> - case rpc:call(Node,cover,remote_start,[ThisNode]) of - {ok,_RPid} -> - erlang:monitor(process,{?SERVER,Node}), - [Node|Acc]; - Error -> - io:format("Could not start cover on ~w: ~p\n", - [Node,Error]), - Acc - end - end, - [], - Nodes), - - %% In case some of the compiled modules have been unloaded they - %% should not be loaded on the new node. - {_LoadedModules,Compiled} = - get_compiled_still_loaded(State#main_state.nodes, - State#main_state.compiled), - remote_load_compiled(StartedNodes,Compiled), - - State1 = - State#main_state{nodes = State#main_state.nodes ++ StartedNodes, - compiled = Compiled}, + {StartedNodes,State1} = do_start_nodes(Nodes, State), reply(From, {ok,StartedNodes}), main_process_loop(State1); @@ -810,9 +791,24 @@ main_process_loop(State) -> main_process_loop(S); {'DOWN', _MRef, process, {?SERVER,Node}, _Info} -> - %% A remote cover_server is down, remove node from list. - Nodes1 = State#main_state.nodes--[Node], - main_process_loop(State#main_state{nodes=Nodes1}); + %% A remote cover_server is down, mark as lost + Nodes = State#main_state.nodes--[Node], + Lost = [Node|State#main_state.lost_nodes], + main_process_loop(State#main_state{nodes=Nodes,lost_nodes=Lost}); + + {nodeup,Node} -> + State1 = + case lists:member(Node,State#main_state.lost_nodes) of + true -> + sync_compiled(Node,State); + false -> + State + end, + main_process_loop(State1); + + {nodedown,_} -> + %% Will be taken care of when 'DOWN' message arrives + main_process_loop(State); {From, get_main_node} -> reply(From, node()), @@ -874,8 +870,13 @@ remote_process_loop(State) -> unregister(?SERVER), ok; % not replying since 'DOWN' message will be received anyway + {remote,get_compiled} -> + remote_reply(State#remote_state.main_node, + State#remote_state.compiled), + remote_process_loop(State); + {From, get_main_node} -> - reply(From, State#remote_state.main_node), + remote_reply(From, State#remote_state.main_node), remote_process_loop(State); get_status -> @@ -987,6 +988,36 @@ unload([]) -> %%%--Handling of remote nodes-------------------------------------------- +do_start_nodes(Nodes, State) -> + ThisNode = node(), + StartedNodes = + lists:foldl( + fun(Node,Acc) -> + case rpc:call(Node,cover,remote_start,[ThisNode]) of + {ok,_RPid} -> + erlang:monitor(process,{?SERVER,Node}), + [Node|Acc]; + Error -> + io:format("Could not start cover on ~w: ~p\n", + [Node,Error]), + Acc + end + end, + [], + Nodes), + + %% In case some of the compiled modules have been unloaded they + %% should not be loaded on the new node. + {_LoadedModules,Compiled} = + get_compiled_still_loaded(State#main_state.nodes, + State#main_state.compiled), + remote_load_compiled(StartedNodes,Compiled), + + State1 = + State#main_state{nodes = State#main_state.nodes ++ StartedNodes, + compiled = Compiled}, + {StartedNodes, State1}. + %% start the cover_server on a remote node remote_start(MainNode) -> case whereis(?SERVER) of @@ -1010,6 +1041,30 @@ remote_start(MainNode) -> {error,{already_started,Pid}} end. +%% If a lost node comes back, ensure that main and remote node has the +%% same cover compiled modules. Note that no action is taken if the +%% same {Mod,File} eksists on both, i.e. code change is not handled! +sync_compiled(Node,State) -> + #main_state{compiled=Compiled0,nodes=Nodes,lost_nodes=Lost}=State, + State1 = + case remote_call(Node,{remote,get_compiled}) of + {error,node_dead} -> + {_,S} = do_start_nodes([Node],State), + S; + {error,_} -> + State; + RemoteCompiled -> + {_,Compiled} = get_compiled_still_loaded(Nodes,Compiled0), + Unload = [UM || {UM,_}=U <- RemoteCompiled, + false == lists:member(U,Compiled)], + remote_unload([Node],Unload), + Load = [L || L <- Compiled, + false == lists:member(L,RemoteCompiled)], + remote_load_compiled([Node],Load), + State#main_state{compiled=Compiled, nodes=[Node|Nodes]} + end, + State1#main_state{lost_nodes=Lost--[Node]}. + %% Load a set of cover compiled modules on remote nodes, %% We do it ?MAX_MODS modules at a time so that we don't %% run out of memory on the cover_server node. @@ -2279,7 +2334,13 @@ do_reset2([]) -> do_clear(Module) -> ets:match_delete(?COVER_CLAUSE_TABLE, {Module,'_'}), ets:match_delete(?COVER_TABLE, {#bump{module=Module},'_'}), - ets:match_delete(?COLLECTION_TABLE, {#bump{module=Module},'_'}). + case lists:member(?COLLECTION_TABLE, ets:all()) of + true -> + %% We're on the main node + ets:match_delete(?COLLECTION_TABLE, {#bump{module=Module},'_'}); + false -> + ok + end. not_loaded(Module, unloaded, State) -> do_clear(Module), -- cgit v1.2.3