aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjoewilliams <[email protected]>2011-06-09 07:58:01 -0700
committerHenrik Nord <[email protected]>2011-09-06 09:51:39 +0200
commitfeecf39417a1a5f2114a5fc18c7e0207fb642eee (patch)
tree1ec80f793d6ffb646197bdb817303bc97234e66f
parent751ec4f918bed2f5455538e6296c6b925bcca002 (diff)
downloadotp-feecf39417a1a5f2114a5fc18c7e0207fb642eee.tar.gz
otp-feecf39417a1a5f2114a5fc18c7e0207fb642eee.tar.bz2
otp-feecf39417a1a5f2114a5fc18c7e0207fb642eee.zip
General improvements to release_handler_1:get_supervised_procs
The core issues this patch attempts to solve is two fold, 1) have release_handler_1 act slightly differently in two corner cases and 2) clean up the code in get_supervised_procs/0 to remove nested cases and etc. Regarding #1, get_supervised_procs/0 will now call functions to test to see if the supervisor is suspended before attempting to ask it for a list of children. It now will print an error message regarding the suspended supervisor and produce an error that will cause the VM to restart. Previously it would timeout attempting the call to which_children and the VM would restart without any details regarding the reason. The second corner case is if in a child specification a supervisor is incorrectly stated to be a worker and get_modules is called against it. A timeout will occur causing a VM restart. Similar to the last corner case in this patch an error message is printed and an error is emitted causing a VM restart. When first looking into the issue it was hard to discover why my upgrades where failing. All I received during the upgrade process was a timeout and a VM restart, no other information. This patch should help users track down issues like these. Regarding #2, due to the above confusion in trying to figure out what had happened I dug into the code and started tracing it through and found that the nested case statements and etc made it confusing. So I started to rework and clean up, hopefully making this code path clearer to future readers.
-rw-r--r--lib/sasl/src/release_handler_1.erl107
-rw-r--r--lib/sasl/test/release_handler_SUITE.erl26
-rw-r--r--lib/sasl/test/release_handler_SUITE_data/Makefile.src16
-rw-r--r--lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app7
-rw-r--r--lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl9
-rw-r--r--lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl56
-rw-r--r--lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl15
-rw-r--r--lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl15
8 files changed, 218 insertions, 33 deletions
diff --git a/lib/sasl/src/release_handler_1.erl b/lib/sasl/src/release_handler_1.erl
index 8d050fb7b0..0cc908fdf7 100644
--- a/lib/sasl/src/release_handler_1.erl
+++ b/lib/sasl/src/release_handler_1.erl
@@ -20,7 +20,7 @@
%% External exports
-export([eval_script/3, eval_script/4, check_script/2]).
--export([get_current_vsn/1]). %% exported because used in a test case
+-export([get_current_vsn/1, get_supervised_procs/0]). %% exported because used in a test case
-record(eval_state, {bins = [], stopped = [], suspended = [], apps = [],
libdirs, unpurged = [], vsns = [], newlibs = [],
@@ -469,6 +469,19 @@ start(Procs) ->
%% supervisor module, we should load the new version, and then
%% delete the old. Then we should perform the start changes
%% manually, by adding/deleting children.
+%%
+%% Recent changes to this code cause the upgrade error out and
+%% log the case where a suspended supervisor has which_children
+%% called against it. This retains the behavior of causing a VM
+%% restart to the *old* version of a release but has the
+%% advantage of logging the pid and supervisor that had the
+%% issue.
+%%
+%% A second case where this can occur is if a child spec is
+%% incorrect and get_modules is called against a process that
+%% can't respond to the gen:call. Again an error is logged,
+%% an error returned and a VM restart is issued.
+%%
%% Returns: [{SuperPid, ChildName, ChildPid, Mods}]
%%-----------------------------------------------------------------
%% OTP-3452. For each application the first item contains the pid
@@ -478,49 +491,81 @@ start(Procs) ->
get_supervised_procs() ->
lists:foldl(
fun(Application, Procs) ->
- case application_controller:get_master(Application) of
- Pid when is_pid(Pid) ->
- {Root, _AppMod} = application_master:get_child(Pid),
- case get_supervisor_module(Root) of
- {ok, SupMod} ->
- get_procs(supervisor:which_children(Root),
- Root) ++
- [{undefined, undefined, Root, [SupMod]} |
- Procs];
- {error, _} ->
- error_logger:error_msg("release_handler: "
- "cannot find top "
- "supervisor for "
- "application ~w~n",
- [Application]),
- get_procs(supervisor:which_children(Root),
- Root) ++ Procs
- end;
- _ -> Procs
- end
+ get_master_procs(Application,
+ Procs,
+ application_controller:get_master(Application))
end,
[],
- lists:map(fun({Application, _Name, _Vsn}) ->
- Application
- end,
- application:which_applications())).
+ get_application_names()).
+
+get_supervised_procs(_, Root, Procs, {ok, SupMod}) ->
+ get_procs(maybe_supervisor_which_children(get_proc_state(Root), SupMod, Root), Root) ++
+ [{undefined, undefined, Root, [SupMod]} | Procs];
+get_supervised_procs(Application, Root, Procs, {error, _}) ->
+ error_logger:error_msg("release_handler: cannot find top supervisor for "
+ "application ~w~n", [Application]),
+ get_procs(maybe_supervisor_which_children(get_proc_state(Root), Application, Root), Root) ++ Procs.
+
+get_application_names() ->
+ lists:map(fun({Application, _Name, _Vsn}) ->
+ Application
+ end,
+ application:which_applications()).
+
+get_master_procs(Application, Procs, Pid) when is_pid(Pid) ->
+ {Root, _AppMod} = application_master:get_child(Pid),
+ get_supervised_procs(Application, Root, Procs, get_supervisor_module(Root));
+get_master_procs(_, Procs, _) ->
+ Procs.
get_procs([{Name, Pid, worker, dynamic} | T], Sup) when is_pid(Pid) ->
- Mods = get_dynamic_mods(Pid),
+ Mods = maybe_get_dynamic_mods(Name, Pid),
[{Sup, Name, Pid, Mods} | get_procs(T, Sup)];
get_procs([{Name, Pid, worker, Mods} | T], Sup) when is_pid(Pid), is_list(Mods) ->
[{Sup, Name, Pid, Mods} | get_procs(T, Sup)];
get_procs([{Name, Pid, supervisor, Mods} | T], Sup) when is_pid(Pid) ->
- [{Sup, Name, Pid, Mods} | get_procs(T, Sup)] ++
- get_procs(supervisor:which_children(Pid), Pid);
+ [{Sup, Name, Pid, Mods} | get_procs(T, Sup)] ++
+ get_procs(maybe_supervisor_which_children(get_proc_state(Pid), Name, Pid), Pid);
get_procs([_H | T], Sup) ->
get_procs(T, Sup);
get_procs(_, _Sup) ->
[].
-get_dynamic_mods(Pid) ->
- {ok,Res} = gen:call(Pid, self(), get_modules),
- Res.
+get_proc_state(Proc) ->
+ {status, _, {module, _}, [_, State, _, _, _]} = sys:get_status(Proc),
+ State.
+
+maybe_supervisor_which_children(suspended, Name, Pid) ->
+ error_logger:error_msg("release_handler: a which_children call"
+ " to ~p (~p) was avoided. This supervisor"
+ " is suspended and should likely be upgraded"
+ " differently. Exiting ...~n", [Name, Pid]),
+ error(suspended_supervisor);
+
+maybe_supervisor_which_children(State, Name, Pid) ->
+ case catch supervisor:which_children(Pid) of
+ Res when is_list(Res) ->
+ Res;
+ Other ->
+ error_logger:error_msg("release_handler: ~p~nerror during"
+ " a which_children call to ~p (~p)."
+ " [State: ~p] Exiting ... ~n",
+ [Other, Name, Pid, State]),
+ error(which_children_failed)
+ end.
+
+maybe_get_dynamic_mods(Name, Pid) ->
+ case catch gen:call(Pid, self(), get_modules) of
+ {ok, Res} ->
+ Res;
+ Other ->
+ error_logger:error_msg("release_handler: ~p~nerror during a"
+ " get_modules call to ~p (~p),"
+ " there may be an error in it's"
+ " childspec. Exiting ...~n",
+ [Other, Name, Pid]),
+ error(get_modules_failed)
+ end.
%% XXXX
%% Note: The following is a terrible hack done in order to resolve the
diff --git a/lib/sasl/test/release_handler_SUITE.erl b/lib/sasl/test/release_handler_SUITE.erl
index efa775f344..c850e339bc 100644
--- a/lib/sasl/test/release_handler_SUITE.erl
+++ b/lib/sasl/test/release_handler_SUITE.erl
@@ -55,7 +55,8 @@ win32_cases() ->
%% Cases that can be run on all platforms
cases() ->
- [otp_2740, otp_2760, otp_5761, instructions, eval_appup].
+ [otp_2740, otp_2760, otp_5761, instructions, eval_appup,
+ supervisor_which_children_timeout].
groups() ->
[{release,[],
@@ -513,6 +514,29 @@ no_cc() ->
%%%-----------------------------------------------------------------
%%-----------------------------------------------------------------
+%% release_handler_1:get_supervised_procs/0 test
+%%-----------------------------------------------------------------
+supervisor_which_children_timeout(Conf) ->
+ PrivDir = priv_dir(Conf),
+ Dir = filename:join(PrivDir,"supervisor_which_children_timeout"),
+ DataDir = ?config(data_dir,Conf),
+ LibDir = filename:join([DataDir,release_handler_timeouts]),
+
+ Rel1 = create_and_install_fake_first_release(Dir,[{dummy,"0.1",LibDir}]),
+
+ {ok, Node} = t_start_node(supervisor_which_children_timeout, Rel1, []),
+ Proc = rpc:call(Node, erlang, whereis, [dummy_sup_2]),
+ ok = rpc:call(Node, sys, suspend, [Proc]),
+ Result = {badrpc, {'EXIT', {suspended_supervisor, _}}} =
+ rpc:call(Node, release_handler_1, get_supervised_procs, []),
+ ?t:format("release_handler_1:get_supervised_procs/0: ~p~n", [Result]),
+
+ ok.
+
+supervisor_which_children_timeout(cleanup, Conf) ->
+ stop_node(node_name(supervisor_which_children_timeout)).
+
+%%-----------------------------------------------------------------
%% Ticket: OTP-2740
%% Slogan: vsn not numeric doesn't work so good in release_handling
%%-----------------------------------------------------------------
diff --git a/lib/sasl/test/release_handler_SUITE_data/Makefile.src b/lib/sasl/test/release_handler_SUITE_data/Makefile.src
index 85e25fdc2f..2dab42b65d 100644
--- a/lib/sasl/test/release_handler_SUITE_data/Makefile.src
+++ b/lib/sasl/test/release_handler_SUITE_data/Makefile.src
@@ -36,8 +36,13 @@ C= \
c/b.@EMULATOR@ \
c/c_sup.@EMULATOR@
+SUP= \
+ release_handler_timeouts/dummy-0.1/ebin/dummy_app.@EMULATOR@ \
+ release_handler_timeouts/dummy-0.1/ebin/dummy_server.@EMULATOR@ \
+ release_handler_timeouts/dummy-0.1/ebin/dummy_sup.@EMULATOR@ \
+ release_handler_timeouts/dummy-0.1/ebin/dummy_sup_2.@EMULATOR@
-all: $(P2B) $(LIB) $(APP) $(OTP2740) $(C)
+all: $(P2B) $(LIB) $(APP) $(OTP2740) $(C) $(SUP)
P2B/a-2.0/ebin/a.@EMULATOR@: P2B/a-2.0/src/a.erl
erlc $(EFLAGS) -oP2B/a-2.0/ebin P2B/a-2.0/src/a.erl
@@ -106,3 +111,12 @@ c/b.@EMULATOR@: c/b.erl
erlc $(EFLAGS) -oc c/b.erl
c/c_sup.@EMULATOR@: c/c_sup.erl
erlc $(EFLAGS) -oc c/c_sup.erl
+
+release_handler_timeouts/dummy-0.1/ebin/dummy_app.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_app.erl
+ erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_app.erl
+release_handler_timeouts/dummy-0.1/ebin/dummy_server.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_server.erl
+ erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_server.erl
+release_handler_timeouts/dummy-0.1/ebin/dummy_sup.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_sup.erl
+ erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_sup.erl
+release_handler_timeouts/dummy-0.1/ebin/dummy_sup_2.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl
+ erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl \ No newline at end of file
diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app
new file mode 100644
index 0000000000..9efdc2e5da
--- /dev/null
+++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app
@@ -0,0 +1,7 @@
+{application,dummy,
+ [{description,"a dummy app"},
+ {vsn,"0.1"},
+ {registered,[dummy_app]},
+ {mod,{dummy_app,[]}},
+ {applications,[kernel,stdlib,sasl]},
+ {modules,[dummy_app,dummy_server,dummy_sup,dummy_sup_2]}]}. \ No newline at end of file
diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl
new file mode 100644
index 0000000000..51363b3630
--- /dev/null
+++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl
@@ -0,0 +1,9 @@
+-module(dummy_app).
+-behaviour(application).
+
+-export([start/2, stop/1]).
+
+start(_,_) ->
+ dummy_sup:start_link().
+
+stop(_) -> ok.
diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl
new file mode 100644
index 0000000000..382251eba7
--- /dev/null
+++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl
@@ -0,0 +1,56 @@
+-module(dummy_server).
+-behaviour(gen_server).
+
+-export([start_link/0, set_state/1, get_state/0]).
+
+-export([init/1,
+ handle_call/3,
+ handle_cast/2,
+ handle_info/2,
+ terminate/2,
+ code_change/3]).
+
+%%
+
+start_link() ->
+ gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+set_state(What) ->
+ gen_server:call(?MODULE, {set_state, What}).
+
+get_state() ->
+ gen_server:call(?MODULE, get_state).
+
+
+%%
+
+init([]) ->
+ say("init, setting state to 0", []),
+ {ok, 0}.
+
+
+handle_call({set_state, NewState}, _From, _State) ->
+ {reply, {ok, NewState}, NewState};
+
+handle_call(get_state, _From, State) ->
+ {reply, State, State}.
+
+handle_cast('__not_implemented', State) ->
+ {noreply, State}.
+
+handle_info(_Info, State) ->
+ say("info ~p, ~p.", [_Info, State]),
+ {noreply, State}.
+
+terminate(_Reason, _State) ->
+ say("terminate ~p, ~p", [_Reason, _State]),
+ ok.
+
+code_change(_OldVsn, State, _Extra) ->
+ say("code_change ~p, ~p, ~p", [_OldVsn, State, _Extra]),
+ {ok, State}.
+
+%% Internal
+
+say(Format, Data) ->
+ io:format("~p:~p: ~s~n", [?MODULE, self(), io_lib:format(Format, Data)]).
diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl
new file mode 100644
index 0000000000..3d7b5060df
--- /dev/null
+++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl
@@ -0,0 +1,15 @@
+-module(dummy_sup).
+-behaviour(supervisor).
+
+-export([start_link/0]).
+-export([init/1]).
+
+start_link() ->
+ supervisor:start_link({local, ?MODULE}, ?MODULE, []).
+
+init([]) ->
+ DummySup2 = {dummy_sup_2,
+ {dummy_sup_2, start_link, []},
+ permanent, 5000, supervisor, [dummy_sup_2]},
+
+ {ok, {{one_for_one, 10, 10}, [DummySup2]}}.
diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl
new file mode 100644
index 0000000000..d936cbcbd6
--- /dev/null
+++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl
@@ -0,0 +1,15 @@
+-module(dummy_sup_2).
+-behaviour(supervisor).
+
+-export([start_link/0]).
+-export([init/1]).
+
+start_link() ->
+ supervisor:start_link({local, ?MODULE}, ?MODULE, []).
+
+init([]) ->
+ Dummy = {dummy_server,
+ {dummy_server, start_link, []},
+ permanent, 5000, worker, [dummy_server]},
+
+ {ok, {{one_for_one, 10, 10}, [Dummy]}}.