aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSiri Hansen <[email protected]>2012-10-29 17:37:29 +0100
committerSiri Hansen <[email protected]>2012-10-30 11:19:25 +0100
commit0dbae9d6fcdaa9b699e85a00fe44560b89cc24df (patch)
tree49f77cb337627fad4a9aef3511e5ed66ca9629ed
parentf0010583fd53174b72341d64be3a481cccc4623c (diff)
downloadotp-0dbae9d6fcdaa9b699e85a00fe44560b89cc24df.tar.gz
otp-0dbae9d6fcdaa9b699e85a00fe44560b89cc24df.tar.bz2
otp-0dbae9d6fcdaa9b699e85a00fe44560b89cc24df.zip
[cover] Allow reconnection if node has been disconnected or down
OTP-10523 Earlier, if the connection to a remote cover node was lost, all cover data was lost and the cover_server on the remote node would die. This would cause problems if there were cover compiled modules that would still be executed since they would attempt to write to the no longer existing ets tables belonging to the cover_server. This commit changes this behavior so that the cover_server on the remote node will survive connection loss and continue collecting cover data. If the connection is re-established then the main node will sync with the remote node again and cover data will not be lost (unless the node was down).
-rw-r--r--lib/tools/doc/src/cover.xml5
-rw-r--r--lib/tools/src/cover.erl139
-rw-r--r--lib/tools/test/cover_SUITE.erl116
-rw-r--r--lib/tools/test/cover_SUITE_data/f.erl11
4 files changed, 222 insertions, 49 deletions
diff --git a/lib/tools/doc/src/cover.xml b/lib/tools/doc/src/cover.xml
index be0fd5bd47..a2444ec947 100644
--- a/lib/tools/doc/src/cover.xml
+++ b/lib/tools/doc/src/cover.xml
@@ -106,6 +106,11 @@
from all nodes.</p>
<p>To only collect data from remote nodes without stopping
<c>cover</c> on those nodes, use <c>cover:flush/1</c></p>
+ <p>If the connection to a remote node goes down, the main node
+ will mark it as lost. If the node comes back it will be added
+ again. If the remote node was alive during the disconnected
+ periode, cover data from before and during this periode will be
+ included in the analysis.</p>
</description>
<funcs>
<func>
diff --git a/lib/tools/src/cover.erl b/lib/tools/src/cover.erl
index 2154a21ed6..10f14b0a49 100644
--- a/lib/tools/src/cover.erl
+++ b/lib/tools/src/cover.erl
@@ -26,12 +26,17 @@
%% ARCHITECTURE
%% The coverage tool consists of one process on each node involved in
%% coverage analysis. The process is registered as 'cover_server'
-%% (?SERVER). All cover_servers in the distributed system are linked
-%% together. The cover_server on the 'main' node is in charge, and it
-%% traps exits so it can detect nodedown or process crashes on the
-%% remote nodes. This process is implemented by the functions
-%% init_main/1 and main_process_loop/1. The cover_server on the remote
-%% nodes are implemented by the functions init_remote/2 and
+%% (?SERVER). The cover_server on the 'main' node is in charge, and
+%% it monitors the cover_servers on all remote nodes. When it gets a
+%% 'DOWN' message for another cover_server, it marks the node as
+%% 'lost'. If a nodeup is received for a lost node the main node
+%% ensures that the cover compiled modules are loaded again. If the
+%% remote node was alive during the disconnected periode, cover data
+%% for this periode will also be included in the analysis.
+%%
+%% The cover_server process on the main node is implemented by the
+%% functions init_main/1 and main_process_loop/1. The cover_server on
+%% the remote nodes are implemented by the functions init_remote/2 and
%% remote_process_loop/1.
%%
%% TABLES
@@ -90,7 +95,8 @@
-record(main_state, {compiled=[], % [{Module,File}]
imported=[], % [{Module,File,ImportFile}]
stopper, % undefined | pid()
- nodes=[]}). % [Node]
+ nodes=[], % [Node]
+ lost_nodes=[]}). % [Node]
-record(remote_state, {compiled=[], % [{Module,File}]
main_node}). % atom()
@@ -586,39 +592,14 @@ init_main(Starter) ->
ets:new(?BINARY_TABLE, [set, named_table]),
ets:new(?COLLECTION_TABLE, [set, public, named_table]),
ets:new(?COLLECTION_CLAUSE_TABLE, [set, public, named_table]),
+ net_kernel:monitor_nodes(true),
Starter ! {?SERVER,started},
main_process_loop(#main_state{}).
main_process_loop(State) ->
receive
{From, {start_nodes,Nodes}} ->
- ThisNode = node(),
- StartedNodes =
- lists:foldl(
- fun(Node,Acc) ->
- case rpc:call(Node,cover,remote_start,[ThisNode]) of
- {ok,_RPid} ->
- erlang:monitor(process,{?SERVER,Node}),
- [Node|Acc];
- Error ->
- io:format("Could not start cover on ~w: ~p\n",
- [Node,Error]),
- Acc
- end
- end,
- [],
- Nodes),
-
- %% In case some of the compiled modules have been unloaded they
- %% should not be loaded on the new node.
- {_LoadedModules,Compiled} =
- get_compiled_still_loaded(State#main_state.nodes,
- State#main_state.compiled),
- remote_load_compiled(StartedNodes,Compiled),
-
- State1 =
- State#main_state{nodes = State#main_state.nodes ++ StartedNodes,
- compiled = Compiled},
+ {StartedNodes,State1} = do_start_nodes(Nodes, State),
reply(From, {ok,StartedNodes}),
main_process_loop(State1);
@@ -810,9 +791,24 @@ main_process_loop(State) ->
main_process_loop(S);
{'DOWN', _MRef, process, {?SERVER,Node}, _Info} ->
- %% A remote cover_server is down, remove node from list.
- Nodes1 = State#main_state.nodes--[Node],
- main_process_loop(State#main_state{nodes=Nodes1});
+ %% A remote cover_server is down, mark as lost
+ Nodes = State#main_state.nodes--[Node],
+ Lost = [Node|State#main_state.lost_nodes],
+ main_process_loop(State#main_state{nodes=Nodes,lost_nodes=Lost});
+
+ {nodeup,Node} ->
+ State1 =
+ case lists:member(Node,State#main_state.lost_nodes) of
+ true ->
+ sync_compiled(Node,State);
+ false ->
+ State
+ end,
+ main_process_loop(State1);
+
+ {nodedown,_} ->
+ %% Will be taken care of when 'DOWN' message arrives
+ main_process_loop(State);
{From, get_main_node} ->
reply(From, node()),
@@ -874,8 +870,13 @@ remote_process_loop(State) ->
unregister(?SERVER),
ok; % not replying since 'DOWN' message will be received anyway
+ {remote,get_compiled} ->
+ remote_reply(State#remote_state.main_node,
+ State#remote_state.compiled),
+ remote_process_loop(State);
+
{From, get_main_node} ->
- reply(From, State#remote_state.main_node),
+ remote_reply(From, State#remote_state.main_node),
remote_process_loop(State);
get_status ->
@@ -987,6 +988,36 @@ unload([]) ->
%%%--Handling of remote nodes--------------------------------------------
+do_start_nodes(Nodes, State) ->
+ ThisNode = node(),
+ StartedNodes =
+ lists:foldl(
+ fun(Node,Acc) ->
+ case rpc:call(Node,cover,remote_start,[ThisNode]) of
+ {ok,_RPid} ->
+ erlang:monitor(process,{?SERVER,Node}),
+ [Node|Acc];
+ Error ->
+ io:format("Could not start cover on ~w: ~p\n",
+ [Node,Error]),
+ Acc
+ end
+ end,
+ [],
+ Nodes),
+
+ %% In case some of the compiled modules have been unloaded they
+ %% should not be loaded on the new node.
+ {_LoadedModules,Compiled} =
+ get_compiled_still_loaded(State#main_state.nodes,
+ State#main_state.compiled),
+ remote_load_compiled(StartedNodes,Compiled),
+
+ State1 =
+ State#main_state{nodes = State#main_state.nodes ++ StartedNodes,
+ compiled = Compiled},
+ {StartedNodes, State1}.
+
%% start the cover_server on a remote node
remote_start(MainNode) ->
case whereis(?SERVER) of
@@ -1010,6 +1041,30 @@ remote_start(MainNode) ->
{error,{already_started,Pid}}
end.
+%% If a lost node comes back, ensure that main and remote node has the
+%% same cover compiled modules. Note that no action is taken if the
+%% same {Mod,File} eksists on both, i.e. code change is not handled!
+sync_compiled(Node,State) ->
+ #main_state{compiled=Compiled0,nodes=Nodes,lost_nodes=Lost}=State,
+ State1 =
+ case remote_call(Node,{remote,get_compiled}) of
+ {error,node_dead} ->
+ {_,S} = do_start_nodes([Node],State),
+ S;
+ {error,_} ->
+ State;
+ RemoteCompiled ->
+ {_,Compiled} = get_compiled_still_loaded(Nodes,Compiled0),
+ Unload = [UM || {UM,_}=U <- RemoteCompiled,
+ false == lists:member(U,Compiled)],
+ remote_unload([Node],Unload),
+ Load = [L || L <- Compiled,
+ false == lists:member(L,RemoteCompiled)],
+ remote_load_compiled([Node],Load),
+ State#main_state{compiled=Compiled, nodes=[Node|Nodes]}
+ end,
+ State1#main_state{lost_nodes=Lost--[Node]}.
+
%% Load a set of cover compiled modules on remote nodes,
%% We do it ?MAX_MODS modules at a time so that we don't
%% run out of memory on the cover_server node.
@@ -2279,7 +2334,13 @@ do_reset2([]) ->
do_clear(Module) ->
ets:match_delete(?COVER_CLAUSE_TABLE, {Module,'_'}),
ets:match_delete(?COVER_TABLE, {#bump{module=Module},'_'}),
- ets:match_delete(?COLLECTION_TABLE, {#bump{module=Module},'_'}).
+ case lists:member(?COLLECTION_TABLE, ets:all()) of
+ true ->
+ %% We're on the main node
+ ets:match_delete(?COLLECTION_TABLE, {#bump{module=Module},'_'});
+ false ->
+ ok
+ end.
not_loaded(Module, unloaded, State) ->
do_clear(Module),
diff --git a/lib/tools/test/cover_SUITE.erl b/lib/tools/test/cover_SUITE.erl
index 950c4ddbcb..3bf1b44af8 100644
--- a/lib/tools/test/cover_SUITE.erl
+++ b/lib/tools/test/cover_SUITE.erl
@@ -23,7 +23,7 @@
init_per_group/2,end_per_group/2]).
-export([start/1, compile/1, analyse/1, misc/1, stop/1,
- distribution/1, export_import/1,
+ distribution/1, reconnect/1, die_and_reconnect/1, export_import/1,
otp_5031/1, eif/1, otp_5305/1, otp_5418/1, otp_6115/1, otp_7095/1,
otp_8188/1, otp_8270/1, otp_8273/1, otp_8340/1]).
@@ -45,7 +45,8 @@ suite() -> [{ct_hooks,[ts_install_cth]}].
all() ->
case whereis(cover_server) of
undefined ->
- [start, compile, analyse, misc, stop, distribution,
+ [start, compile, analyse, misc, stop,
+ distribution, reconnect, die_and_reconnect,
export_import, otp_5031, eif, otp_5305, otp_5418,
otp_6115, otp_7095, otp_8188, otp_8270, otp_8273,
otp_8340];
@@ -408,20 +409,117 @@ distribution(Config) when is_list(Config) ->
?line true = is_unloaded(LocalBeam),
?line true = is_unloaded(N2Beam),
- %% Check that cover_server on remote node dies if main node dies
+ %% Check that cover_server on remote node does not die if main node dies
?line {ok,[N1]} = cover:start(N1),
- ?line true = is_pid(rpc:call(N1,erlang,whereis,[cover_server])),
+ ?line true = is_pid(N1Server = rpc:call(N1,erlang,whereis,[cover_server])),
?line exit(whereis(cover_server),kill),
- ?line timer:sleep(10),
- ?line undefined = rpc:call(N1,erlang,whereis,[cover_server]),
-
+ ?line timer:sleep(100),
+ ?line N1Server = rpc:call(N1,erlang,whereis,[cover_server]),
+
%% Cleanup
?line Files = lsfiles(),
?line remove(files(Files, ".beam")),
?line ?t:stop_node(N1),
?line ?t:stop_node(N2).
-
+%% Test that a lost node is reconnected
+reconnect(Config) ->
+ DataDir = ?config(data_dir, Config),
+ ok = file:set_cwd(DataDir),
+
+ {ok,a} = compile:file(a),
+ {ok,b} = compile:file(b),
+ {ok,f} = compile:file(f),
+
+ {ok,N1} = ?t:start_node(cover_SUITE_reconnect,peer,
+ [{args," -pa " ++ DataDir},{start_cover,false}]),
+ {ok,a} = cover:compile(a),
+ {ok,f} = cover:compile(f),
+ {ok,[N1]} = cover:start(nodes()),
+
+ %% Some calls to check later
+ rpc:call(N1,f,f1,[]),
+ cover:flush(N1),
+ rpc:call(N1,f,f1,[]),
+
+ %% This will cause a call to f:f2() when nodes()==[] on N1
+ rpc:cast(N1,f,call_f2_when_isolated,[]),
+
+ %% Disconnect and check that node is removed from main cover node
+ net_kernel:disconnect(N1),
+ [] = cover:which_nodes(),
+ timer:sleep(500), % allow some time for the f:f2() call
+
+ %% Do some add one module (b) and remove one module (a)
+ code:purge(a),
+ {module,a} = code:load_file(a),
+ {ok,b} = cover:compile(b),
+ cover_compiled = code:which(b),
+
+ [] = cover:which_nodes(),
+ check_f_calls(1,0), % only the first call - before the flush
+
+ %% Reconnect the node and check that b and f are cover compiled but not a
+ net_kernel:connect_node(N1),
+ timer:sleep(100),
+ [N1] = cover:which_nodes(), % we are reconnected
+ cover_compiled = rpc:call(N1,code,which,[b]),
+ cover_compiled = rpc:call(N1,code,which,[f]),
+ ABeam = rpc:call(N1,code,which,[a]),
+ false = (cover_compiled==ABeam),
+
+ %% Ensure that we have:
+ %% * one f1 call from before the flush,
+ %% * one f1 call from after the flush but before disconnect
+ %% * one f2 call when disconnected
+ check_f_calls(2,1),
+
+ cover:stop(),
+ ?t:stop_node(N1),
+ ok.
+
+%% Test that a lost node is reconnected - also if it has been dead
+die_and_reconnect(Config) ->
+ DataDir = ?config(data_dir, Config),
+ ok = file:set_cwd(DataDir),
+
+ {ok,f} = compile:file(f),
+
+ NodeName = cover_SUITE_die_and_reconnect,
+ {ok,N1} = ?t:start_node(NodeName,peer,
+ [{args," -pa " ++ DataDir},{start_cover,false}]),
+ %% {ok,a} = cover:compile(a),
+ {ok,f} = cover:compile(f),
+ {ok,[N1]} = cover:start(nodes()),
+
+ %% Some calls to check later
+ rpc:call(N1,f,f1,[]),
+ cover:flush(N1),
+ rpc:call(N1,f,f1,[]),
+
+ %% Kill the node
+ rpc:call(N1,erlang,halt,[]),
+ [] = cover:which_nodes(),
+
+ check_f_calls(1,0), % only the first call - before the flush
+
+ %% Restart the node and check that cover reconnects
+ {ok,N1} = ?t:start_node(NodeName,peer,
+ [{args," -pa " ++ DataDir},{start_cover,false}]),
+ timer:sleep(100),
+ [N1] = cover:which_nodes(), % we are reconnected
+ cover_compiled = rpc:call(N1,code,which,[f]),
+
+ %% One more call...
+ rpc:call(N1,f,f1,[]),
+
+ %% Ensure that no more calls are counted
+ check_f_calls(2,0),
+
+ cover:stop(),
+ ?t:stop_node(N1),
+ ok.
+
export_import(suite) -> [];
export_import(Config) when is_list(Config) ->
?line DataDir = ?config(data_dir, Config),
@@ -1253,4 +1351,4 @@ is_unloaded(What) ->
end.
check_f_calls(F1,F2) ->
- {ok,[{{f,f1,0},F1},{{f,f2,0},F2}]} = cover:analyse(f,calls,function).
+ {ok,[{{f,f1,0},F1},{{f,f2,0},F2}|_]} = cover:analyse(f,calls,function).
diff --git a/lib/tools/test/cover_SUITE_data/f.erl b/lib/tools/test/cover_SUITE_data/f.erl
index 1ef8bbdb49..ce2963014a 100644
--- a/lib/tools/test/cover_SUITE_data/f.erl
+++ b/lib/tools/test/cover_SUITE_data/f.erl
@@ -1,5 +1,5 @@
-module(f).
--export([f1/0,f2/0]).
+-export([f1/0,f2/0,call_f2_when_isolated/0]).
f1() ->
f1_line1,
@@ -8,3 +8,12 @@ f1() ->
f2() ->
f2_line1,
f2_line2.
+
+call_f2_when_isolated() ->
+ case nodes() of
+ [] ->
+ f2();
+ _ ->
+ timer:sleep(100),
+ call_f2_when_isolated()
+ end.