From feecf39417a1a5f2114a5fc18c7e0207fb642eee Mon Sep 17 00:00:00 2001 From: joewilliams Date: Thu, 9 Jun 2011 07:58:01 -0700 Subject: General improvements to release_handler_1:get_supervised_procs The core issues this patch attempts to solve is two fold, 1) have release_handler_1 act slightly differently in two corner cases and 2) clean up the code in get_supervised_procs/0 to remove nested cases and etc. Regarding #1, get_supervised_procs/0 will now call functions to test to see if the supervisor is suspended before attempting to ask it for a list of children. It now will print an error message regarding the suspended supervisor and produce an error that will cause the VM to restart. Previously it would timeout attempting the call to which_children and the VM would restart without any details regarding the reason. The second corner case is if in a child specification a supervisor is incorrectly stated to be a worker and get_modules is called against it. A timeout will occur causing a VM restart. Similar to the last corner case in this patch an error message is printed and an error is emitted causing a VM restart. When first looking into the issue it was hard to discover why my upgrades where failing. All I received during the upgrade process was a timeout and a VM restart, no other information. This patch should help users track down issues like these. Regarding #2, due to the above confusion in trying to figure out what had happened I dug into the code and started tracing it through and found that the nested case statements and etc made it confusing. So I started to rework and clean up, hopefully making this code path clearer to future readers. --- lib/sasl/src/release_handler_1.erl | 107 +++++++++++++++------ lib/sasl/test/release_handler_SUITE.erl | 26 ++++- .../test/release_handler_SUITE_data/Makefile.src | 16 ++- .../dummy-0.1/ebin/dummy.app | 7 ++ .../dummy-0.1/src/dummy_app.erl | 9 ++ .../dummy-0.1/src/dummy_server.erl | 56 +++++++++++ .../dummy-0.1/src/dummy_sup.erl | 15 +++ .../dummy-0.1/src/dummy_sup_2.erl | 15 +++ 8 files changed, 218 insertions(+), 33 deletions(-) create mode 100644 lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app create mode 100644 lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl create mode 100644 lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl create mode 100644 lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl create mode 100644 lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl (limited to 'lib/sasl') diff --git a/lib/sasl/src/release_handler_1.erl b/lib/sasl/src/release_handler_1.erl index 8d050fb7b0..0cc908fdf7 100644 --- a/lib/sasl/src/release_handler_1.erl +++ b/lib/sasl/src/release_handler_1.erl @@ -20,7 +20,7 @@ %% External exports -export([eval_script/3, eval_script/4, check_script/2]). --export([get_current_vsn/1]). %% exported because used in a test case +-export([get_current_vsn/1, get_supervised_procs/0]). %% exported because used in a test case -record(eval_state, {bins = [], stopped = [], suspended = [], apps = [], libdirs, unpurged = [], vsns = [], newlibs = [], @@ -469,6 +469,19 @@ start(Procs) -> %% supervisor module, we should load the new version, and then %% delete the old. Then we should perform the start changes %% manually, by adding/deleting children. +%% +%% Recent changes to this code cause the upgrade error out and +%% log the case where a suspended supervisor has which_children +%% called against it. This retains the behavior of causing a VM +%% restart to the *old* version of a release but has the +%% advantage of logging the pid and supervisor that had the +%% issue. +%% +%% A second case where this can occur is if a child spec is +%% incorrect and get_modules is called against a process that +%% can't respond to the gen:call. Again an error is logged, +%% an error returned and a VM restart is issued. +%% %% Returns: [{SuperPid, ChildName, ChildPid, Mods}] %%----------------------------------------------------------------- %% OTP-3452. For each application the first item contains the pid @@ -478,49 +491,81 @@ start(Procs) -> get_supervised_procs() -> lists:foldl( fun(Application, Procs) -> - case application_controller:get_master(Application) of - Pid when is_pid(Pid) -> - {Root, _AppMod} = application_master:get_child(Pid), - case get_supervisor_module(Root) of - {ok, SupMod} -> - get_procs(supervisor:which_children(Root), - Root) ++ - [{undefined, undefined, Root, [SupMod]} | - Procs]; - {error, _} -> - error_logger:error_msg("release_handler: " - "cannot find top " - "supervisor for " - "application ~w~n", - [Application]), - get_procs(supervisor:which_children(Root), - Root) ++ Procs - end; - _ -> Procs - end + get_master_procs(Application, + Procs, + application_controller:get_master(Application)) end, [], - lists:map(fun({Application, _Name, _Vsn}) -> - Application - end, - application:which_applications())). + get_application_names()). + +get_supervised_procs(_, Root, Procs, {ok, SupMod}) -> + get_procs(maybe_supervisor_which_children(get_proc_state(Root), SupMod, Root), Root) ++ + [{undefined, undefined, Root, [SupMod]} | Procs]; +get_supervised_procs(Application, Root, Procs, {error, _}) -> + error_logger:error_msg("release_handler: cannot find top supervisor for " + "application ~w~n", [Application]), + get_procs(maybe_supervisor_which_children(get_proc_state(Root), Application, Root), Root) ++ Procs. + +get_application_names() -> + lists:map(fun({Application, _Name, _Vsn}) -> + Application + end, + application:which_applications()). + +get_master_procs(Application, Procs, Pid) when is_pid(Pid) -> + {Root, _AppMod} = application_master:get_child(Pid), + get_supervised_procs(Application, Root, Procs, get_supervisor_module(Root)); +get_master_procs(_, Procs, _) -> + Procs. get_procs([{Name, Pid, worker, dynamic} | T], Sup) when is_pid(Pid) -> - Mods = get_dynamic_mods(Pid), + Mods = maybe_get_dynamic_mods(Name, Pid), [{Sup, Name, Pid, Mods} | get_procs(T, Sup)]; get_procs([{Name, Pid, worker, Mods} | T], Sup) when is_pid(Pid), is_list(Mods) -> [{Sup, Name, Pid, Mods} | get_procs(T, Sup)]; get_procs([{Name, Pid, supervisor, Mods} | T], Sup) when is_pid(Pid) -> - [{Sup, Name, Pid, Mods} | get_procs(T, Sup)] ++ - get_procs(supervisor:which_children(Pid), Pid); + [{Sup, Name, Pid, Mods} | get_procs(T, Sup)] ++ + get_procs(maybe_supervisor_which_children(get_proc_state(Pid), Name, Pid), Pid); get_procs([_H | T], Sup) -> get_procs(T, Sup); get_procs(_, _Sup) -> []. -get_dynamic_mods(Pid) -> - {ok,Res} = gen:call(Pid, self(), get_modules), - Res. +get_proc_state(Proc) -> + {status, _, {module, _}, [_, State, _, _, _]} = sys:get_status(Proc), + State. + +maybe_supervisor_which_children(suspended, Name, Pid) -> + error_logger:error_msg("release_handler: a which_children call" + " to ~p (~p) was avoided. This supervisor" + " is suspended and should likely be upgraded" + " differently. Exiting ...~n", [Name, Pid]), + error(suspended_supervisor); + +maybe_supervisor_which_children(State, Name, Pid) -> + case catch supervisor:which_children(Pid) of + Res when is_list(Res) -> + Res; + Other -> + error_logger:error_msg("release_handler: ~p~nerror during" + " a which_children call to ~p (~p)." + " [State: ~p] Exiting ... ~n", + [Other, Name, Pid, State]), + error(which_children_failed) + end. + +maybe_get_dynamic_mods(Name, Pid) -> + case catch gen:call(Pid, self(), get_modules) of + {ok, Res} -> + Res; + Other -> + error_logger:error_msg("release_handler: ~p~nerror during a" + " get_modules call to ~p (~p)," + " there may be an error in it's" + " childspec. Exiting ...~n", + [Other, Name, Pid]), + error(get_modules_failed) + end. %% XXXX %% Note: The following is a terrible hack done in order to resolve the diff --git a/lib/sasl/test/release_handler_SUITE.erl b/lib/sasl/test/release_handler_SUITE.erl index efa775f344..c850e339bc 100644 --- a/lib/sasl/test/release_handler_SUITE.erl +++ b/lib/sasl/test/release_handler_SUITE.erl @@ -55,7 +55,8 @@ win32_cases() -> %% Cases that can be run on all platforms cases() -> - [otp_2740, otp_2760, otp_5761, instructions, eval_appup]. + [otp_2740, otp_2760, otp_5761, instructions, eval_appup, + supervisor_which_children_timeout]. groups() -> [{release,[], @@ -512,6 +513,29 @@ no_cc() -> %%% Testing of reported bugs and other tickets. %%%----------------------------------------------------------------- +%%----------------------------------------------------------------- +%% release_handler_1:get_supervised_procs/0 test +%%----------------------------------------------------------------- +supervisor_which_children_timeout(Conf) -> + PrivDir = priv_dir(Conf), + Dir = filename:join(PrivDir,"supervisor_which_children_timeout"), + DataDir = ?config(data_dir,Conf), + LibDir = filename:join([DataDir,release_handler_timeouts]), + + Rel1 = create_and_install_fake_first_release(Dir,[{dummy,"0.1",LibDir}]), + + {ok, Node} = t_start_node(supervisor_which_children_timeout, Rel1, []), + Proc = rpc:call(Node, erlang, whereis, [dummy_sup_2]), + ok = rpc:call(Node, sys, suspend, [Proc]), + Result = {badrpc, {'EXIT', {suspended_supervisor, _}}} = + rpc:call(Node, release_handler_1, get_supervised_procs, []), + ?t:format("release_handler_1:get_supervised_procs/0: ~p~n", [Result]), + + ok. + +supervisor_which_children_timeout(cleanup, Conf) -> + stop_node(node_name(supervisor_which_children_timeout)). + %%----------------------------------------------------------------- %% Ticket: OTP-2740 %% Slogan: vsn not numeric doesn't work so good in release_handling diff --git a/lib/sasl/test/release_handler_SUITE_data/Makefile.src b/lib/sasl/test/release_handler_SUITE_data/Makefile.src index 85e25fdc2f..2dab42b65d 100644 --- a/lib/sasl/test/release_handler_SUITE_data/Makefile.src +++ b/lib/sasl/test/release_handler_SUITE_data/Makefile.src @@ -36,8 +36,13 @@ C= \ c/b.@EMULATOR@ \ c/c_sup.@EMULATOR@ +SUP= \ + release_handler_timeouts/dummy-0.1/ebin/dummy_app.@EMULATOR@ \ + release_handler_timeouts/dummy-0.1/ebin/dummy_server.@EMULATOR@ \ + release_handler_timeouts/dummy-0.1/ebin/dummy_sup.@EMULATOR@ \ + release_handler_timeouts/dummy-0.1/ebin/dummy_sup_2.@EMULATOR@ -all: $(P2B) $(LIB) $(APP) $(OTP2740) $(C) +all: $(P2B) $(LIB) $(APP) $(OTP2740) $(C) $(SUP) P2B/a-2.0/ebin/a.@EMULATOR@: P2B/a-2.0/src/a.erl erlc $(EFLAGS) -oP2B/a-2.0/ebin P2B/a-2.0/src/a.erl @@ -106,3 +111,12 @@ c/b.@EMULATOR@: c/b.erl erlc $(EFLAGS) -oc c/b.erl c/c_sup.@EMULATOR@: c/c_sup.erl erlc $(EFLAGS) -oc c/c_sup.erl + +release_handler_timeouts/dummy-0.1/ebin/dummy_app.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_app.erl + erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_app.erl +release_handler_timeouts/dummy-0.1/ebin/dummy_server.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_server.erl + erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_server.erl +release_handler_timeouts/dummy-0.1/ebin/dummy_sup.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_sup.erl + erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_sup.erl +release_handler_timeouts/dummy-0.1/ebin/dummy_sup_2.@EMULATOR@: release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl + erlc $(EFLAGS) -orelease_handler_timeouts/dummy-0.1/ebin release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl \ No newline at end of file diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app new file mode 100644 index 0000000000..9efdc2e5da --- /dev/null +++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/ebin/dummy.app @@ -0,0 +1,7 @@ +{application,dummy, + [{description,"a dummy app"}, + {vsn,"0.1"}, + {registered,[dummy_app]}, + {mod,{dummy_app,[]}}, + {applications,[kernel,stdlib,sasl]}, + {modules,[dummy_app,dummy_server,dummy_sup,dummy_sup_2]}]}. \ No newline at end of file diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl new file mode 100644 index 0000000000..51363b3630 --- /dev/null +++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_app.erl @@ -0,0 +1,9 @@ +-module(dummy_app). +-behaviour(application). + +-export([start/2, stop/1]). + +start(_,_) -> + dummy_sup:start_link(). + +stop(_) -> ok. diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl new file mode 100644 index 0000000000..382251eba7 --- /dev/null +++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_server.erl @@ -0,0 +1,56 @@ +-module(dummy_server). +-behaviour(gen_server). + +-export([start_link/0, set_state/1, get_state/0]). + +-export([init/1, + handle_call/3, + handle_cast/2, + handle_info/2, + terminate/2, + code_change/3]). + +%% + +start_link() -> + gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + +set_state(What) -> + gen_server:call(?MODULE, {set_state, What}). + +get_state() -> + gen_server:call(?MODULE, get_state). + + +%% + +init([]) -> + say("init, setting state to 0", []), + {ok, 0}. + + +handle_call({set_state, NewState}, _From, _State) -> + {reply, {ok, NewState}, NewState}; + +handle_call(get_state, _From, State) -> + {reply, State, State}. + +handle_cast('__not_implemented', State) -> + {noreply, State}. + +handle_info(_Info, State) -> + say("info ~p, ~p.", [_Info, State]), + {noreply, State}. + +terminate(_Reason, _State) -> + say("terminate ~p, ~p", [_Reason, _State]), + ok. + +code_change(_OldVsn, State, _Extra) -> + say("code_change ~p, ~p, ~p", [_OldVsn, State, _Extra]), + {ok, State}. + +%% Internal + +say(Format, Data) -> + io:format("~p:~p: ~s~n", [?MODULE, self(), io_lib:format(Format, Data)]). diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl new file mode 100644 index 0000000000..3d7b5060df --- /dev/null +++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup.erl @@ -0,0 +1,15 @@ +-module(dummy_sup). +-behaviour(supervisor). + +-export([start_link/0]). +-export([init/1]). + +start_link() -> + supervisor:start_link({local, ?MODULE}, ?MODULE, []). + +init([]) -> + DummySup2 = {dummy_sup_2, + {dummy_sup_2, start_link, []}, + permanent, 5000, supervisor, [dummy_sup_2]}, + + {ok, {{one_for_one, 10, 10}, [DummySup2]}}. diff --git a/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl new file mode 100644 index 0000000000..d936cbcbd6 --- /dev/null +++ b/lib/sasl/test/release_handler_SUITE_data/release_handler_timeouts/dummy-0.1/src/dummy_sup_2.erl @@ -0,0 +1,15 @@ +-module(dummy_sup_2). +-behaviour(supervisor). + +-export([start_link/0]). +-export([init/1]). + +start_link() -> + supervisor:start_link({local, ?MODULE}, ?MODULE, []). + +init([]) -> + Dummy = {dummy_server, + {dummy_server, start_link, []}, + permanent, 5000, worker, [dummy_server]}, + + {ok, {{one_for_one, 10, 10}, [Dummy]}}. -- cgit v1.2.3