diff options
author | joewilliams <[email protected]> | 2011-06-09 07:58:01 -0700 |
---|---|---|
committer | Henrik Nord <[email protected]> | 2011-09-06 09:51:39 +0200 |
commit | feecf39417a1a5f2114a5fc18c7e0207fb642eee (patch) | |
tree | 1ec80f793d6ffb646197bdb817303bc97234e66f | |
parent | 751ec4f918bed2f5455538e6296c6b925bcca002 (diff) | |
download | otp-feecf39417a1a5f2114a5fc18c7e0207fb642eee.tar.gz otp-feecf39417a1a5f2114a5fc18c7e0207fb642eee.tar.bz2 otp-feecf39417a1a5f2114a5fc18c7e0207fb642eee.zip |
General improvements to release_handler_1:get_supervised_procs
The core issues this patch attempts to solve is two fold, 1) have
release_handler_1 act slightly differently in two corner cases and 2)
clean up the code in get_supervised_procs/0 to remove nested cases and
etc.
Regarding #1, get_supervised_procs/0 will now call functions to
test to see if the supervisor is suspended before attempting to ask it
for a list of children. It now will print an error message regarding
the suspended supervisor and produce an error that will cause the VM
to restart. Previously it would timeout attempting the call to
which_children and the VM would restart without any details regarding
the reason.
The second corner case is if in a child specification a supervisor is
incorrectly stated to be a worker and get_modules is called against it.
A timeout will occur causing a VM restart. Similar to the last corner
case in this patch an error message is printed and an error is emitted
causing a VM restart.
When first looking into the issue it was hard to discover why my
upgrades where failing. All I received during the upgrade process was
a timeout and a VM restart, no other information. This patch should
help users track down issues like these.
Regarding #2, due to the above confusion in trying to figure out what
had happened I dug into the code and started tracing it through and
found that the nested case statements and etc made it confusing. So I
started to rework and clean up, hopefully making this code path clearer
to future readers.
8 files changed, 218 insertions, 33 deletions
diff --git a/lib/sasl/src/release_handler_1.erl b/lib/sasl/src/release_handler_1.erl index 8d050fb7b0..0cc908fdf7 100644 --- a/lib/sasl/src/release_handler_1.erl +++ b/lib/sasl/src/release_handler_1.erl @@ -20,7 +20,7 @@ %% External exports -export([eval_script/3, eval_script/4, check_script/2]). --export([get_current_vsn/1]). %% exported because used in a test case +-export([get_current_vsn/1, get_supervised_procs/0]). %% exported because used in a test case -record(eval_state, {bins = [], stopped = [], suspended = [], apps = [], libdirs, unpurged = [], vsns = [], newlibs = [], @@ -469,6 +469,19 @@ start(Procs) -> %% supervisor module, we should load the new version, and then %% delete the old. Then we should perform the start changes %% manually, by adding/deleting children. +%% +%% Recent changes to this code cause the upgrade error out and +%% log the case where a suspended supervisor has which_children +%% called against it. This retains the behavior of causing a VM +%% restart to the *old* version of a release but has the +%% advantage of logging the pid and supervisor that had the +%% issue. +%% +%% A second case where this can occur is if a child spec is +%% incorrect and get_modules is called against a process that +%% can't respond to the gen:call. Again an error is logged, +%% an error returned and a VM restart is issued. +%% %% Returns: [{SuperPid, ChildName, ChildPid, Mods}] %%----------------------------------------------------------------- %% OTP-3452. For each application the first item contains the pid @@ -478,49 +491,81 @@ start(Procs) -> get_supervised_procs() -> lists:foldl( fun(Application, Procs) -> - case application_controller:get_master(Application) of - Pid when is_pid(Pid) -> - {Root, _AppMod} = application_master:get_child(Pid), - case get_supervisor_module(Root) of - {ok, SupMod} -> - get_procs(supervisor:which_children(Root), - Root) ++ - [{undefined, undefined, Root, [SupMod]} | - Procs]; - {error, _} -> - error_logger:error_msg("release_handler: " - "cannot find top " - "supervisor for " - "application ~w~n", - [Application]), - get_procs(supervisor:which_children(Root), - Root) ++ Procs - end; - _ -> Procs - end + get_master_procs(Application, + Procs, + application_controller:get_master(Application)) end, [], - lists:map(fun({Application, _Name, _Vsn}) -> - Application - end, - application:which_applications())). + get_application_names()). + +get_supervised_procs(_, Root, Procs, {ok, SupMod}) -> + get_procs(maybe_supervisor_which_children(get_proc_state(Root), SupMod, Root), Root) ++ + [{undefined, undefined, Root, [SupMod]} | Procs]; +get_supervised_procs(Application, Root, Procs, {error, _}) -> + error_logger:error_msg("release_handler: cannot find top supervisor for " + "application ~w~n", [Application]), + get_procs(maybe_supervisor_which_children(get_proc_state(Root), Application, Root), Root) ++ Procs. + +get_application_names() -> + lists:map(fun({Application, _Name, _Vsn}) -> + Application + end, + application:which_applications()). + +get_master_procs(Application, Procs, Pid) when is_pid(Pid) -> + {Root, _AppMod} = application_master:get_child(Pid), + get_supervised_procs(Application, Root, Procs, get_supervisor_module(Root)); +get_master_procs(_, Procs, _) -> + Procs. get_procs([{Name, Pid, worker, dynamic} | T], Sup) when is_pid(Pid) -> - Mods = get_dynamic_mods(Pid), + Mods = maybe_get_dynamic_mods(Name, Pid), [{Sup, Name, Pid, Mods} | get_procs(T, Sup)]; get_procs([{Name, Pid, worker, Mods} | T], Sup) when is_pid(Pid), is_list(Mods) -> [{Sup, Name, Pid, Mods} | get_procs(T, Sup)]; get_procs([{Name, Pid, supervisor, Mods} | T], Sup) when is_pid(Pid) -> - [{Sup, Name, Pid, Mods} | get_procs(T, Sup)] ++ - get_procs(supervisor:which_children(Pid), Pid); + [{Sup, Name, Pid, Mods} | get_procs(T, Sup)] ++ + get_procs(maybe_supervisor_which_children(get_proc_state(Pid), Name, Pid), Pid); get_procs([_H | T], Sup) -> get_procs(T, Sup); get_procs(_, _Sup) -> []. -get_dynamic_mods(Pid) -> - {ok,Res} = gen:call(Pid, self(), get_modules), - Res. +get_proc_state(Proc) -> + {status, _, {module, _}, [_, State, _, _, _]} = sys:get_status(Proc), + State. + +maybe_supervisor_which_children(suspended, Name, Pid) -> + error_logger:error_msg("release_handler: a which_children call" + " to ~p (~p) was avoided. This supervisor" + " is suspended and should likely be upgraded" + " differently. Exiting ...~n", [Name, Pid]), + error(suspended_supervisor); + +maybe_supervisor_which_children(State, Name, Pid) -> + case catch supervisor:which_children(Pid) of + Res when is_list(Res) -> + Res; + Other -> + error_logger:error_msg("release_handler: ~p~nerror during" + " a which_children call to ~p (~p)." + " [State: ~p] Exiting ... ~n", + [Other, Name, Pid, State]), + error(which_children_failed) + end. + +maybe_get_dynamic_mods(Name, Pid) -> + case catch gen:call(Pid, self(), get_modules) of + {ok, Res} -> + Res; + Other -> + error_logger:error_msg("release_handler: ~p~nerror during a" + " get_modules call to ~p (~p)," + " there may be an error in it's" + " childspec. Exiting ...~n", + [Other, Name, Pid]), + error(get_modules_failed) + end. %% XXXX %% Note: The following is a terrible hack done in order to resolve the diff --git a/lib/sasl/test/release_handler_SUITE.erl b/lib/sasl/test/release_handler_SUITE.erl index efa775f344..c850e339bc 100644 --- a/lib/sasl/test/release_handler_SUITE.erl +++ b/lib/sasl/test/release_handler_SUITE.erl @@ -55,7 +55,8 @@ win32_cases() -> %% Cases that can be run on all platforms cases() -> - [otp_2740, otp_2760, otp_5761, instructions, eval_appup]. + [otp_2740, otp_2760, otp_5761, instructions, eval_appup, + supervisor_which_children_timeout]. groups() -> [{release,[], @@ -513,6 +514,29 @@ no_cc() -> %%%----------------------------------------------------------------- %%----------------------------------------------------------------- +%% release_handler_1:get_supervised_procs/0 test +%%----------------------------------------------------------------- +supervisor_which_children_timeout(Conf) -> + PrivDir = priv_dir(Conf), + Dir = filename:join(PrivDir,"supervisor_which_children_timeout"), + DataDir = ?config(data_dir,Conf), + LibDir = filename:join([DataDir,release_handler_timeouts]), + + Rel1 = create_and_install_fake_first_release(Dir,[{dummy,"0.1",LibDir}]), + + {ok, Node} = t_start_node(supervisor_which_children_timeout, Rel1, []), + Proc = rpc:call(Node, erlang, whereis, [dummy_sup_2]), + ok = rpc:call(Node, sys, suspend, [Proc]), + Result = {badrpc, {'EXIT', {suspended_supervisor, _}}} = + rpc:call(Node, release_handler_1, get_supervised_procs, []), + ?t:format("release_handler_1:get_supervised_procs/0: ~p~n", [Result]), + + ok. + +supervisor_which_children_timeout(cleanup, Conf) -> + stop_node(node_name(supervisor_which_children_timeout)). + +%%----------------------------------------------------------------- %% Ticket: OTP-2740 %% Slogan: vsn not numeric doesn't work so good in release_handling %%----------------------------------------------------------------- diff --git a/lib/sasl/test/release_handler_SUITE_data/Makefile.src b/lib/sasl/test/release_handler_SUITE_data/Makefile.src index 85e25fdc2f..2dab42b65d 100644 --- a/lib/sasl/test/release_handler_SUITE_data/Makefile.src +++ b/lib/sasl/test/release_handler_SUITE_data/Makefile.src @@ -36,8 +36,13 @@ C= \ c/b.@EMULATOR@ \ c/c_sup.@EMULATOR@ +SUP= \ + release_handler_timeouts/dummy-0.1/ebin/dummy_app.@EMULATOR@ \ + release_handler_timeouts/dummy-0.1/ebin/dummy_server.@EMULATOR@ \ + release_handler_timeouts/dummy-0.1/ebin/dummy_sup.@EMULATOR@ \ + release_handler_timeouts/dummy-0.1/ebin/dummy_sup_2.@EMULATOR@ -all: $(P2B) $(LIB) $(APP) $(OTP2740) $(C) +all: $(P2B) $(LIB) $(APP) $(OTP2740) $(C) $(SUP) P2B/a-2.0/ebin/a.@EMULATOR@: P2B/a-2.0/src/a.erl erlc $(EFLAGS) -oP2B/a-2.0/ebin P2B/a-2.0/src/a.erl @@ -106,3 +111,12 @@ c/b.@EMULATOR@: c/b.erl erlc $(EFLAGS) -oc c/b.erl c/c_sup.@EMULATOR@: c/c_sup.erl erlc $(EFLAGS) -oc c/c_sup.erl + +release_handler_timeouts/dummy-0.1/ebin/dummy_app.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_app.erl + erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_app.erl +release_handler_timeouts/dummy-0.1/ebin/dummy_server.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_server.erl + erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_server.erl +release_handler_timeouts/dummy-0.1/ebin/dummy_sup.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_sup.erl + erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_sup.erl +release_handler_timeouts/dummy-0.1/ebin/dummy_sup_2.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl + erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl
\ No newline at end of file diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app new file mode 100644 index 0000000000..9efdc2e5da --- /dev/null +++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app @@ -0,0 +1,7 @@ +{application,dummy, + [{description,"a dummy app"}, + {vsn,"0.1"}, + {registered,[dummy_app]}, + {mod,{dummy_app,[]}}, + {applications,[kernel,stdlib,sasl]}, + {modules,[dummy_app,dummy_server,dummy_sup,dummy_sup_2]}]}.
\ No newline at end of file diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl new file mode 100644 index 0000000000..51363b3630 --- /dev/null +++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl @@ -0,0 +1,9 @@ +-module(dummy_app). +-behaviour(application). + +-export([start/2, stop/1]). + +start(_,_) -> + dummy_sup:start_link(). + +stop(_) -> ok. diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl new file mode 100644 index 0000000000..382251eba7 --- /dev/null +++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl @@ -0,0 +1,56 @@ +-module(dummy_server). +-behaviour(gen_server). + +-export([start_link/0, set_state/1, get_state/0]). + +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3]). + +%% + +start_link() -> + gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + +set_state(What) -> + gen_server:call(?MODULE, {set_state, What}). + +get_state() -> + gen_server:call(?MODULE, get_state). + + +%% + +init([]) -> + say("init, setting state to 0", []), + {ok, 0}. + + +handle_call({set_state, NewState}, _From, _State) -> + {reply, {ok, NewState}, NewState}; + +handle_call(get_state, _From, State) -> + {reply, State, State}. + +handle_cast('__not_implemented', State) -> + {noreply, State}. + +handle_info(_Info, State) -> + say("info ~p, ~p.", [_Info, State]), + {noreply, State}. + +terminate(_Reason, _State) -> + say("terminate ~p, ~p", [_Reason, _State]), + ok. + +code_change(_OldVsn, State, _Extra) -> + say("code_change ~p, ~p, ~p", [_OldVsn, State, _Extra]), + {ok, State}. + +%% Internal + +say(Format, Data) -> + io:format("~p:~p: ~s~n", [?MODULE, self(), io_lib:format(Format, Data)]). diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl new file mode 100644 index 0000000000..3d7b5060df --- /dev/null +++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl @@ -0,0 +1,15 @@ +-module(dummy_sup). +-behaviour(supervisor). + +-export([start_link/0]). +-export([init/1]). + +start_link() -> + supervisor:start_link({local, ?MODULE}, ?MODULE, []). + +init([]) -> + DummySup2 = {dummy_sup_2, + {dummy_sup_2, start_link, []}, + permanent, 5000, supervisor, [dummy_sup_2]}, + + {ok, {{one_for_one, 10, 10}, [DummySup2]}}. diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl new file mode 100644 index 0000000000..d936cbcbd6 --- /dev/null +++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl @@ -0,0 +1,15 @@ +-module(dummy_sup_2). +-behaviour(supervisor). + +-export([start_link/0]). +-export([init/1]). + +start_link() -> + supervisor:start_link({local, ?MODULE}, ?MODULE, []). + +init([]) -> + Dummy = {dummy_server, + {dummy_server, start_link, []}, + permanent, 5000, worker, [dummy_server]}, + + {ok, {{one_for_one, 10, 10}, [Dummy]}}. |