From e6e31791abf090fe7e0bd3e5970b44830d087c4a Mon Sep 17 00:00:00 2001 From: Siri Hansen Date: Thu, 15 Dec 2011 11:57:30 +0100 Subject: Leave control back to gen_server during supervisor's restart loop When an attempt to restart a child failed, the supervisor would previously keep the execution flow and try to restart the child over and over again until it either succeeded or the restart frequency limit was reached. If neither of these happened, the supervisor would hang forever in this loop. This commit adds a timer of 0 ms, handing control back to the gen_server that implements the supervisor. This way, any incoming request to the supervisor will be handled - which could help break the infinite loop - e.g. a shutdown request for the supervisor or for the problematic child. This introduces some incompatibilities in stdlib due to new return values from supervisor: * restart_child/2 can now return {error,restarting} * delete_child/2 can now return {error,restarting} * which_children/1 returns a list of {Id,Child,Type,Mods}, where Child, in addition to the old pid() or 'undefined', can now also be 'restarting'. --- lib/stdlib/test/Makefile | 1 + lib/stdlib/test/supervisor_SUITE.erl | 122 ++++++++++++++++++++++++++++---- lib/stdlib/test/supervisor_deadlock.erl | 45 ++++++++++++ 3 files changed, 156 insertions(+), 12 deletions(-) create mode 100644 lib/stdlib/test/supervisor_deadlock.erl (limited to 'lib/stdlib/test') diff --git a/lib/stdlib/test/Makefile b/lib/stdlib/test/Makefile index b36265302c..4de6ea3ee7 100644 --- a/lib/stdlib/test/Makefile +++ b/lib/stdlib/test/Makefile @@ -67,6 +67,7 @@ MODULES= \ string_SUITE \ supervisor_1 \ supervisor_2 \ + supervisor_deadlock \ naughty_child \ shell_SUITE \ supervisor_SUITE \ diff --git a/lib/stdlib/test/supervisor_SUITE.erl b/lib/stdlib/test/supervisor_SUITE.erl index 71b76c093f..767ae3d62c 100644 --- a/lib/stdlib/test/supervisor_SUITE.erl +++ b/lib/stdlib/test/supervisor_SUITE.erl @@ -21,7 +21,7 @@ -module(supervisor_SUITE). 
-include_lib("common_test/include/ct.hrl"). --define(TIMEOUT, 1000). +-define(TIMEOUT, ?t:minutes(1)). %% Testserver specific export -export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1, @@ -62,7 +62,8 @@ do_not_save_start_parameters_for_temporary_children/1, do_not_save_child_specs_for_temporary_children/1, simple_one_for_one_scale_many_temporary_children/1, - simple_global_supervisor/1]). + simple_global_supervisor/1, hanging_restart_loop/1, + hanging_restart_loop_simple/1]). %%------------------------------------------------------------------------- @@ -82,7 +83,7 @@ all() -> count_children_memory, do_not_save_start_parameters_for_temporary_children, do_not_save_child_specs_for_temporary_children, simple_one_for_one_scale_many_temporary_children, temporary_bystander, - simple_global_supervisor]. + simple_global_supervisor, hanging_restart_loop, hanging_restart_loop_simple]. groups() -> [{sup_start, [], @@ -111,10 +112,8 @@ groups() -> {restart_rest_for_one, [], [rest_for_one, rest_for_one_escalation]}]. -init_per_suite(Config0) -> - Config = lists:keydelete(watchdog, 1, Config0), - Dog = test_server:timetrap(?TIMEOUT), - [{watchdog, Dog} | Config]. +init_per_suite(Config) -> + Config. end_per_suite(_Config) -> ok. @@ -129,18 +128,21 @@ init_per_testcase(count_children_memory, Config) -> try erlang:memory() of _ -> erts_debug:set_internal_state(available_internal_state, true), - Config + Dog = ?t:timetrap(?TIMEOUT), + [{watchdog,Dog}|Config] catch error:notsup -> {skip, "+Meamin used during test; erlang:memory/1 not available"} end; init_per_testcase(_Case, Config) -> - erlang:display(_Case), - Config. + Dog = ?t:timetrap(?TIMEOUT), + [{watchdog,Dog}|Config]. 
-end_per_testcase(count_children_memory, _Config) -> +end_per_testcase(count_children_memory, Config) -> catch erts_debug:set_internal_state(available_internal_state, false), + ?t:timetrap_cancel(?config(watchdog,Config)), ok; -end_per_testcase(_Case, _Config) -> +end_per_testcase(_Case, Config) -> + ?t:timetrap_cancel(?config(watchdog,Config)), ok. start_link(InitResult) -> @@ -1454,6 +1456,102 @@ gen_server9212() -> gen_server:start_link({global,server}, ?MODULE, InitResult, []). +%%------------------------------------------------------------------------- +%% Test that child and supervisor can be shutdown while hanging in restart loop. +%% See OTP-9549. +hanging_restart_loop(Config) when is_list(Config) -> + process_flag(trap_exit, true), + {ok, Pid} = start_link({ok, {{one_for_one, 8, 10}, []}}), + Child1 = {child1, {supervisor_deadlock, start_child, []}, + permanent, brutal_kill, worker, []}, + + %% Ets table with state read by supervisor_deadlock.erl + ets:new(supervisor_deadlock,[set,named_table,public]), + ets:insert(supervisor_deadlock,{fail_start,false}), + + {ok, CPid1} = supervisor:start_child(sup_test, Child1), + link(CPid1), + + ets:insert(supervisor_deadlock,{fail_start,true}), + supervisor_deadlock:restart_child(), + timer:sleep(2000), % allow restart to happen before proceeding + + {error, already_present} = supervisor:start_child(sup_test, Child1), + {error, restarting} = supervisor:restart_child(sup_test, child1), + {error, restarting} = supervisor:delete_child(sup_test, child1), + [{child1,restarting,worker,[]}] = supervisor:which_children(sup_test), + [1,0,0,1] = get_child_counts(sup_test), + + ok = supervisor:terminate_child(sup_test, child1), + check_exit_reason(CPid1, error), + [{child1,undefined,worker,[]}] = supervisor:which_children(sup_test), + + ets:insert(supervisor_deadlock,{fail_start,false}), + {ok, CPid2} = supervisor:restart_child(sup_test, child1), + link(CPid2), + + ets:insert(supervisor_deadlock,{fail_start,true}), + 
supervisor_deadlock:restart_child(), + timer:sleep(2000), % allow restart to happen before proceeding + + %% Terminating supervisor. + %% OTP-9549 fixes so this does not give a timetrap timeout - + %% i.e. that supervisor does not hang in restart loop. + terminate(Pid,shutdown), + + %% Check that child died with reason from 'restart' request above + check_exit_reason(CPid2, error), + undefined = whereis(sup_test), + ok. + +%%------------------------------------------------------------------------- +%% Test that child and supervisor can be shutdown while hanging in +%% restart loop, simple_one_for_one. +%% See OTP-9549. +hanging_restart_loop_simple(Config) when is_list(Config) -> + process_flag(trap_exit, true), + Child1 = {child1, {supervisor_deadlock, start_child, []}, + permanent, brutal_kill, worker, []}, + {ok, Pid} = start_link({ok, {{simple_one_for_one, 8, 10}, [Child1]}}), + + %% Ets table with state read by supervisor_deadlock.erl + ets:new(supervisor_deadlock,[set,named_table,public]), + ets:insert(supervisor_deadlock,{fail_start,false}), + + {ok, CPid1} = supervisor:start_child(sup_test, []), + link(CPid1), + + ets:insert(supervisor_deadlock,{fail_start,true}), + supervisor_deadlock:restart_child(), + timer:sleep(2000), % allow restart to happen before proceeding + + {error, simple_one_for_one} = supervisor:restart_child(sup_test, child1), + {error, simple_one_for_one} = supervisor:delete_child(sup_test, child1), + [{undefined,restarting,worker,[]}] = supervisor:which_children(sup_test), + [1,0,0,1] = get_child_counts(sup_test), + + ok = supervisor:terminate_child(sup_test, CPid1), + check_exit_reason(CPid1, error), + [] = supervisor:which_children(sup_test), + + ets:insert(supervisor_deadlock,{fail_start,false}), + {ok, CPid2} = supervisor:start_child(sup_test, []), + link(CPid2), + + ets:insert(supervisor_deadlock,{fail_start,true}), + supervisor_deadlock:restart_child(), + timer:sleep(2000), % allow restart to happen before proceeding + + %% 
Terminating supervisor. + %% OTP-9549 fixes so this does not give a timetrap timeout - + %% i.e. that supervisor does not hang in restart loop. + terminate(Pid,shutdown), + + %% Check that child died with reason from 'restart' request above + check_exit_reason(CPid2, error), + undefined = whereis(sup_test), + ok. + %%------------------------------------------------------------------------- terminate(Pid, Reason) when Reason =/= supervisor -> terminate(dummy, Pid, dummy, Reason). diff --git a/lib/stdlib/test/supervisor_deadlock.erl b/lib/stdlib/test/supervisor_deadlock.erl new file mode 100644 index 0000000000..288547a972 --- /dev/null +++ b/lib/stdlib/test/supervisor_deadlock.erl @@ -0,0 +1,45 @@ +-module(supervisor_deadlock). +-compile(export_all). + + +%%%----------------------------------------------------------------- +%%% gen_server callbacks +init([child]) -> + case ets:lookup(supervisor_deadlock,fail_start) of + [{fail_start, false}] -> + %% we must not fail on the first init, otherwise supervisor + %% terminates immediately + {ok, []}; + [{fail_start, true}] -> + %% Restart frequency is MaxR=8, MaxT=10, so this will + %% ensure that restart intensity is not reached -> restart + %% loop + timer:sleep(2000), % NOTE: this could be a gen_server call timeout + + {stop, error} + end. + +handle_call(_Req, _From, State) -> + {reply, ok, State}. + +%% Force a restart +handle_cast(restart, State) -> + {stop, error, State}. + +handle_info(_Msg, State) -> + {noreply, State}. + +terminate(_Reason, _State) -> + ok. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + + +%%%----------------------------------------------------------------- +%%% Start child +start_child() -> + gen_server:start_link({local, ?MODULE}, ?MODULE, [child], []). + +restart_child() -> + gen_server:cast(supervisor_deadlock, restart). -- cgit v1.2.3