author     Siri Hansen <[email protected]>  2011-12-15 11:57:30 +0100
committer  Siri Hansen <[email protected]>  2012-03-05 11:22:13 +0100
commit     e6e31791abf090fe7e0bd3e5970b44830d087c4a (patch)
tree       d2a6f3dd000d2a1ce843d5c687498b9d43cc4879
parent     d1e67d51a6d754e5a055c81c9d7447e0d87e982f (diff)
Leave control back to gen_server during supervisor's restart loop
When an attempt to restart a child failed, the supervisor would previously keep the execution flow and try to restart the child over and over again until it either succeeded or the restart frequency limit was reached. If neither happened, the supervisor would hang forever in this loop.

This commit adds a 0 ms timer through which control is handed back to the gen_server that implements the supervisor. This way any incoming request to the supervisor will be handled - which can help break the infinite loop - e.g. a shutdown request for the supervisor or for the problematic child.

This introduces some incompatibilities in stdlib due to new return values from supervisor:

* restart_child/2 can now return {error,restarting}
* delete_child/2 can now return {error,restarting}
* which_children/1 returns a list of {Id,Child,Type,Mods}, where Child, in addition to the old pid() or 'undefined', now also can be 'restarting'.
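For callers of the supervisor API, these new return values mainly mean one extra pattern to match. The following is a minimal caller-side sketch, not part of this commit; the module and helper names (sup_restart_example, restart_when_up/2, alive_children/1) and the 100 ms back-off are illustrative assumptions.

-module(sup_restart_example).
-export([restart_when_up/2, alive_children/1]).

%% Hypothetical helper: retry supervisor:restart_child/2 while the
%% supervisor reports {error,restarting}, i.e. the child is between
%% failed restart attempts (a return value introduced by this commit).
restart_when_up(Sup, Id) ->
    case supervisor:restart_child(Sup, Id) of
        {ok, Child} ->
            {ok, Child};
        {ok, Child, _Info} ->
            {ok, Child};
        {error, running} ->
            ok;                          %% child is already up
        {error, restarting} ->
            timer:sleep(100),            %% back off, then try again
            restart_when_up(Sup, Id);
        {error, Reason} ->
            {error, Reason}
    end.

%% which_children/1 may now report the atom 'restarting' instead of a
%% pid, so callers that only want live children can filter on is_pid/1.
alive_children(Sup) ->
    [Id || {Id, Child, _Type, _Mods} <- supervisor:which_children(Sup),
           is_pid(Child)].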
-rw-r--r--lib/stdlib/doc/src/supervisor.xml12
-rw-r--r--lib/stdlib/src/supervisor.erl170
-rw-r--r--lib/stdlib/test/Makefile1
-rw-r--r--lib/stdlib/test/supervisor_SUITE.erl122
-rw-r--r--lib/stdlib/test/supervisor_deadlock.erl45
5 files changed, 294 insertions, 56 deletions
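The mechanism behind the fix, condensed to a standalone sketch: instead of looping inside the callback when a restart attempt fails, the supervisor schedules a 0 ms timer that casts back to itself, so the gen_server loop can serve any queued requests (for example a shutdown) between attempts. Everything below is illustrative - the module defer_retry, attempt_work/0 and the state shape are not code from this commit; only the timer:apply_after/gen_server:cast deferral pattern mirrors what supervisor.erl does in the diff that follows.

-module(defer_retry).
-behaviour(gen_server).
-export([start_link/0, request_work/0, try_again/1]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
         terminate/2, code_change/3]).

start_link() ->
    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).

request_work() ->
    gen_server:cast(?MODULE, work).

%% Called by timer:apply_after/4; compare supervisor:try_again_restart/2.
try_again(Server) ->
    gen_server:cast(Server, work).

init([]) ->
    {ok, undefined}.

handle_cast(work, State) ->
    case attempt_work() of
        ok ->
            {noreply, State};
        {error, _Reason} ->
            %% Do not retry in a tight loop here. Return to gen_server so
            %% queued requests are handled first, then retry via a 0 ms timer.
            {ok, _TRef} = timer:apply_after(0, ?MODULE, try_again, [self()]),
            {noreply, State}
    end.

handle_call(_Req, _From, State) -> {reply, ok, State}.
handle_info(_Msg, State) -> {noreply, State}.
terminate(_Reason, _State) -> ok.
code_change(_OldVsn, State, _Extra) -> {ok, State}.

%% Placeholder for the real work; it always fails here, so the retry loop
%% keeps going, but the server stays responsive between attempts.
attempt_work() -> {error, not_ready}.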
diff --git a/lib/stdlib/doc/src/supervisor.xml b/lib/stdlib/doc/src/supervisor.xml
index d1e62230bc..35f4f82264 100644
--- a/lib/stdlib/doc/src/supervisor.xml
+++ b/lib/stdlib/doc/src/supervisor.xml
@@ -399,10 +399,11 @@ child_spec() = {Id,StartFunc,Restart,Shutdown,Type,Modules}
<c>SupRef</c>.</p>
<p>If successful, the function returns <c>ok</c>. If the child
specification identified by <c><anno>Id</anno></c> exists but
- the corresponding child process is running, the function
- returns <c>{error,running}</c>. If the child specification
- identified by <c><anno>Id</anno></c> does not exist, the function returns
- <c>{error,not_found}</c>.</p>
+ the corresponding child process is running or about to be restarted,
+ the function returns <c>{error,running}</c> or
+ <c>{error,restarting}</c> respectively. If the child specification
+ identified by <c><anno>Id</anno></c> does not exist, the function
+ returns <c>{error,not_found}</c>.</p>
</desc>
</func>
<func>
@@ -462,7 +463,8 @@ child_spec() = {Id,StartFunc,Restart,Shutdown,Type,Modules}
</item>
<item>
<p><c><anno>Child</anno></c> - the pid of the corresponding child
- process, or <c>undefined</c> if there is no such process.</p>
+ process, the atom <c>restarting</c> if the process is about to be
+ restarted or <c>undefined</c> if there is no such process.</p>
</item>
<item>
<p><c><anno>Type</anno></c> - as defined in the child specification.</p>
diff --git a/lib/stdlib/src/supervisor.erl b/lib/stdlib/src/supervisor.erl
index ac5b078c29..f315064b03 100644
--- a/lib/stdlib/src/supervisor.erl
+++ b/lib/stdlib/src/supervisor.erl
@@ -28,8 +28,9 @@
check_childspecs/1]).
%% Internal exports
--export([init/1, handle_call/3, handle_info/2, terminate/2, code_change/3]).
--export([handle_cast/2]).
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
+ terminate/2, code_change/3]).
+-export([try_again_restart/2]).
%%--------------------------------------------------------------------------
@@ -37,7 +38,7 @@
%%--------------------------------------------------------------------------
--type child() :: 'undefined' | pid() | [pid()].
+-type child() :: 'undefined' | pid().
-type child_id() :: term().
-type mfargs() :: {M :: module(), F :: atom(), A :: [term()] | undefined}.
-type modules() :: [module()] | 'dynamic'.
@@ -62,8 +63,8 @@
%%--------------------------------------------------------------------------
-record(child, {% pid is undefined when child is not running
- pid = undefined :: child(),
- name,
+ pid = undefined :: child() | {restarting,pid()} | [pid()],
+ name :: child_id(),
mfargs :: mfargs(),
restart_type :: restart(),
shutdown :: shutdown(),
@@ -95,6 +96,8 @@
[ChildSpec :: child_spec()]}}
| ignore.
+-define(restarting(_Pid_), {restarting,_Pid_}).
+
%%% ---------------------------------------------------
%%% This is a general process supervisor built upon gen_server.erl.
%%% Servers/processes should/could also be built using gen_server.erl.
@@ -139,7 +142,8 @@ start_child(Supervisor, ChildSpec) ->
Result :: {'ok', Child :: child()}
| {'ok', Child :: child(), Info :: term()}
| {'error', Error},
- Error :: 'running' | 'not_found' | 'simple_one_for_one' | term().
+ Error :: 'running' | 'restarting' | 'not_found' | 'simple_one_for_one' |
+ term().
restart_child(Supervisor, Name) ->
call(Supervisor, {restart_child, Name}).
@@ -147,7 +151,7 @@ restart_child(Supervisor, Name) ->
SupRef :: sup_ref(),
Id :: child_id(),
Result :: 'ok' | {'error', Error},
- Error :: 'running' | 'not_found' | 'simple_one_for_one'.
+ Error :: 'running' | 'restarting' | 'not_found' | 'simple_one_for_one'.
delete_child(Supervisor, Name) ->
call(Supervisor, {delete_child, Name}).
@@ -169,7 +173,7 @@ terminate_child(Supervisor, Name) ->
-spec which_children(SupRef) -> [{Id,Child,Type,Modules}] when
SupRef :: sup_ref(),
Id :: child_id() | undefined,
- Child :: child(),
+ Child :: child() | 'restarting',
Type :: worker(),
Modules :: modules().
which_children(Supervisor) ->
@@ -198,6 +202,17 @@ check_childspecs(ChildSpecs) when is_list(ChildSpecs) ->
end;
check_childspecs(X) -> {error, {badarg, X}}.
+%%%-----------------------------------------------------------------
+%%% Called by timer:apply_after from restart/2
+-spec try_again_restart(SupRef, Child) -> ok when
+ SupRef :: sup_ref(),
+ Child :: child_id() | pid().
+try_again_restart(Supervisor, Child) ->
+ cast(Supervisor, {try_again_restart, Child}).
+
+cast(Supervisor, Req) ->
+ gen_server:cast(Supervisor, Req).
+
%%% ---------------------------------------------------
%%%
%%% Initialize the supervisor.
@@ -384,6 +399,8 @@ handle_call({restart_child, Name}, _From, State) ->
Error ->
{reply, Error, State}
end;
+ {value, #child{pid=?restarting(_)}} ->
+ {reply, {error, restarting}, State};
{value, _} ->
{reply, {error, running}, State};
_ ->
@@ -395,6 +412,8 @@ handle_call({delete_child, Name}, _From, State) ->
{value, Child} when Child#child.pid =:= undefined ->
NState = remove_child(Child, State),
{reply, ok, NState};
+ {value, #child{pid=?restarting(_)}} ->
+ {reply, {error, restarting}, State};
{value, _} ->
{reply, {error, running}, State};
_ ->
@@ -413,13 +432,17 @@ handle_call(which_children, _From, #state{children = [#child{restart_type = RTyp
child_type = CT,
modules = Mods}]} =
State) when ?is_simple(State) ->
- Reply = lists:map(fun({Pid, _}) -> {undefined, Pid, CT, Mods} end,
+ Reply = lists:map(fun({?restarting(_),_}) -> {undefined,restarting,CT,Mods};
+ ({Pid, _}) -> {undefined, Pid, CT, Mods} end,
?DICT:to_list(dynamics_db(RType, State#state.dynamics))),
{reply, Reply, State};
handle_call(which_children, _From, State) ->
Resp =
- lists:map(fun(#child{pid = Pid, name = Name,
+ lists:map(fun(#child{pid = ?restarting(_), name = Name,
+ child_type = ChildType, modules = Mods}) ->
+ {Name, restarting, ChildType, Mods};
+ (#child{pid = Pid, name = Name,
child_type = ChildType, modules = Mods}) ->
{Name, Pid, ChildType, Mods}
end,
@@ -432,8 +455,11 @@ handle_call(count_children, _From, #state{children = [#child{restart_type = temp
when ?is_simple(State) ->
{Active, Count} =
?SETS:fold(fun(Pid, {Alive, Tot}) ->
- if is_pid(Pid) -> {Alive+1, Tot +1};
- true -> {Alive, Tot + 1} end
+ case is_pid(Pid) andalso is_process_alive(Pid) of
+ true ->{Alive+1, Tot +1};
+ false ->
+ {Alive, Tot + 1}
+ end
end, {0, 0}, dynamics_db(temporary, State#state.dynamics)),
Reply = case CT of
supervisor -> [{specs, 1}, {active, Active},
@@ -448,8 +474,12 @@ handle_call(count_children, _From, #state{children = [#child{restart_type = RTy
when ?is_simple(State) ->
{Active, Count} =
?DICT:fold(fun(Pid, _Val, {Alive, Tot}) ->
- if is_pid(Pid) -> {Alive+1, Tot +1};
- true -> {Alive, Tot + 1} end
+ case is_pid(Pid) andalso is_process_alive(Pid) of
+ true ->
+ {Alive+1, Tot +1};
+ false ->
+ {Alive, Tot + 1}
+ end
end, {0, 0}, dynamics_db(RType, State#state.dynamics)),
Reply = case CT of
supervisor -> [{specs, 1}, {active, Active},
@@ -486,14 +516,42 @@ count_child(#child{pid = Pid, child_type = supervisor},
end.
-%%% Hopefully cause a function-clause as there is no API function
-%%% that utilizes cast.
--spec handle_cast('null', state()) -> {'noreply', state()}.
+%%% If a restart attempt failed, this message is sent via
+%%% timer:apply_after(0,...) in order to give gen_server the chance to
+%%% check its inbox before trying again.
+-spec handle_cast({try_again_restart, child_id() | pid()}, state()) ->
+ {'noreply', state()} | {stop, shutdown, state()}.
-handle_cast(null, State) ->
- error_logger:error_msg("ERROR: Supervisor received cast-message 'null'~n",
- []),
- {noreply, State}.
+handle_cast({try_again_restart,Pid}, #state{children=[Child]}=State)
+ when ?is_simple(State) ->
+ RT = Child#child.restart_type,
+ RPid = restarting(Pid),
+ case dynamic_child_args(RPid, dynamics_db(RT, State#state.dynamics)) of
+ {ok, Args} ->
+ {M, F, _} = Child#child.mfargs,
+ NChild = Child#child{pid = RPid, mfargs = {M, F, Args}},
+ case restart(NChild,State) of
+ {ok, State1} ->
+ {noreply, State1};
+ {shutdown, State1} ->
+ {stop, shutdown, State1}
+ end;
+ error ->
+ {noreply, State}
+ end;
+
+handle_cast({try_again_restart,Name}, State) ->
+ case lists:keyfind(Name,#child.name,State#state.children) of
+ Child = #child{pid=?restarting(_)} ->
+ case restart(Child,State) of
+ {ok, State1} ->
+ {noreply, State1};
+ {shutdown, State1} ->
+ {stop, shutdown, State1}
+ end;
+ _ ->
+ {noreply,State}
+ end.
%%
%% Take care of terminated children.
@@ -624,7 +682,7 @@ handle_start_child(Child, State) ->
{error, What} ->
{{error, {What, Child}}, State}
end;
- {value, OldChild} when OldChild#child.pid =/= undefined ->
+ {value, OldChild} when is_pid(OldChild#child.pid) ->
{{error, {already_started, OldChild#child.pid}}, State};
{value, _OldChild} ->
{{error, already_present}, State}
@@ -678,7 +736,21 @@ do_restart(temporary, Reason, Child, State) ->
restart(Child, State) ->
case add_restart(State) of
{ok, NState} ->
- restart(NState#state.strategy, Child, NState);
+ case restart(NState#state.strategy, Child, NState) of
+ {try_again,NState2} ->
+ %% Leaving control back to gen_server before
+ %% trying again. This way other incoming requests
+ %% for the supervisor can be handled - e.g. a
+ %% shutdown request for the supervisor or the
+ %% child.
+ Id = if ?is_simple(State) -> Child#child.pid;
+ true -> Child#child.name
+ end,
+ timer:apply_after(0,?MODULE,try_again_restart,[self(),Id]),
+ {ok,NState2};
+ Other ->
+ Other
+ end;
{terminate, NState} ->
report_error(shutdown, reached_max_restart_intensity,
Child, State#state.name),
@@ -686,9 +758,9 @@ restart(Child, State) ->
end.
restart(simple_one_for_one, Child, State) ->
- #child{mfargs = {M, F, A}} = Child,
- Dynamics = ?DICT:erase(Child#child.pid, dynamics_db(Child#child.restart_type,
- State#state.dynamics)),
+ #child{pid = OldPid, mfargs = {M, F, A}} = Child,
+ Dynamics = ?DICT:erase(OldPid, dynamics_db(Child#child.restart_type,
+ State#state.dynamics)),
case do_start_child_i(M, F, A) of
{ok, Pid} ->
NState = State#state{dynamics = ?DICT:store(Pid, A, Dynamics)},
@@ -697,10 +769,13 @@ restart(simple_one_for_one, Child, State) ->
NState = State#state{dynamics = ?DICT:store(Pid, A, Dynamics)},
{ok, NState};
{error, Error} ->
+ NState = State#state{dynamics = ?DICT:store(restarting(OldPid), A,
+ Dynamics)},
report_error(start_error, Error, Child, State#state.name),
- restart(Child, State)
+ {try_again, NState}
end;
restart(one_for_one, Child, State) ->
+ OldPid = Child#child.pid,
case do_start_child(State#state.name, Child) of
{ok, Pid} ->
NState = replace_child(Child#child{pid = Pid}, State),
@@ -709,8 +784,9 @@ restart(one_for_one, Child, State) ->
NState = replace_child(Child#child{pid = Pid}, State),
{ok, NState};
{error, Reason} ->
+ NState = replace_child(Child#child{pid = restarting(OldPid)}, State),
report_error(start_error, Reason, Child, State#state.name),
- restart(Child, State)
+ {try_again, NState}
end;
restart(rest_for_one, Child, State) ->
{ChAfter, ChBefore} = split_child(Child#child.pid, State#state.children),
@@ -719,7 +795,9 @@ restart(rest_for_one, Child, State) ->
{ok, ChAfter3} ->
{ok, State#state{children = ChAfter3 ++ ChBefore}};
{error, ChAfter3} ->
- restart(Child, State#state{children = ChAfter3 ++ ChBefore})
+ NChild = Child#child{pid=restarting(Child#child.pid)},
+ NState = State#state{children = ChAfter3 ++ ChBefore},
+ {try_again, replace_child(NChild,NState)}
end;
restart(one_for_all, Child, State) ->
Children1 = del_child(Child#child.pid, State#state.children),
@@ -728,9 +806,14 @@ restart(one_for_all, Child, State) ->
{ok, NChs} ->
{ok, State#state{children = NChs}};
{error, NChs} ->
- restart(Child, State#state{children = NChs})
+ NChild = Child#child{pid=restarting(Child#child.pid)},
+ NState = State#state{children = NChs},
+ {try_again, replace_child(NChild,NState)}
end.
+restarting(Pid) when is_pid(Pid) -> ?restarting(Pid);
+restarting(RPid) -> RPid.
+
%%-----------------------------------------------------------------
%% Func: terminate_children/2
%% Args: Children = [child_rec()] in termination order
@@ -754,7 +837,7 @@ terminate_children([Child | Children], SupName, Res) ->
terminate_children([], _SupName, Res) ->
Res.
-do_terminate(Child, SupName) when Child#child.pid =/= undefined ->
+do_terminate(Child, SupName) when is_pid(Child#child.pid) ->
case shutdown(Child#child.pid, Child#child.shutdown) of
ok ->
ok;
@@ -765,7 +848,7 @@ do_terminate(Child, SupName) when Child#child.pid =/= undefined ->
end,
Child#child{pid = undefined};
do_terminate(Child, _SupName) ->
- Child.
+ Child#child{pid = undefined}.
%%-----------------------------------------------------------------
%% Shutdowns a child. We must check the EXIT value
@@ -866,7 +949,7 @@ terminate_dynamic_children(Child, Dynamics, SupName) ->
TRef = erlang:start_timer(Time, self(), kill),
wait_dynamic_children(Child, Pids, Sz, TRef, EStack0)
end,
- %% Unrool stacked errors and report them
+ %% Unroll stacked errors and report them
?DICT:fold(fun(Reason, Ls, _) ->
report_error(shutdown_error, Reason,
Child#child{pid=Ls}, SupName)
@@ -885,7 +968,7 @@ monitor_dynamic_children(#child{restart_type=temporary}, Dynamics) ->
end
end, {?SETS:new(), ?DICT:new()}, Dynamics);
monitor_dynamic_children(#child{restart_type=RType}, Dynamics) ->
- ?DICT:fold(fun(P, _, {Pids, EStack}) ->
+ ?DICT:fold(fun(P, _, {Pids, EStack}) when is_pid(P) ->
case monitor_child(P) of
ok ->
{?SETS:add_element(P, Pids), EStack};
@@ -893,7 +976,9 @@ monitor_dynamic_children(#child{restart_type=RType}, Dynamics) ->
{Pids, EStack};
{error, Reason} ->
{Pids, ?DICT:append(Reason, P, EStack)}
- end
+ end;
+ (?restarting(_), _, {Pids, EStack}) ->
+ {Pids, EStack}
end, {?SETS:new(), ?DICT:new()}, Dynamics).
@@ -1020,13 +1105,20 @@ get_child(Name, State, _) ->
lists:keysearch(Name, #child.name, State#state.children).
get_dynamic_child(Pid, #state{children=[Child], dynamics=Dynamics}) ->
- case is_dynamic_pid(Pid, dynamics_db(Child#child.restart_type, Dynamics)) of
+ DynamicsDb = dynamics_db(Child#child.restart_type, Dynamics),
+ case is_dynamic_pid(Pid, DynamicsDb) of
true ->
{value, Child#child{pid=Pid}};
false ->
- case erlang:is_process_alive(Pid) of
- true -> false;
- false -> {value, Child}
+ RPid = restarting(Pid),
+ case is_dynamic_pid(RPid, DynamicsDb) of
+ true ->
+ {value, Child#child{pid=RPid}};
+ false ->
+ case erlang:is_process_alive(Pid) of
+ true -> false;
+ false -> {value, Child}
+ end
end
end.
diff --git a/lib/stdlib/test/Makefile b/lib/stdlib/test/Makefile
index b36265302c..4de6ea3ee7 100644
--- a/lib/stdlib/test/Makefile
+++ b/lib/stdlib/test/Makefile
@@ -67,6 +67,7 @@ MODULES= \
string_SUITE \
supervisor_1 \
supervisor_2 \
+ supervisor_deadlock \
naughty_child \
shell_SUITE \
supervisor_SUITE \
diff --git a/lib/stdlib/test/supervisor_SUITE.erl b/lib/stdlib/test/supervisor_SUITE.erl
index 71b76c093f..767ae3d62c 100644
--- a/lib/stdlib/test/supervisor_SUITE.erl
+++ b/lib/stdlib/test/supervisor_SUITE.erl
@@ -21,7 +21,7 @@
-module(supervisor_SUITE).
-include_lib("common_test/include/ct.hrl").
--define(TIMEOUT, 1000).
+-define(TIMEOUT, ?t:minutes(1)).
%% Testserver specific export
-export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1,
@@ -62,7 +62,8 @@
do_not_save_start_parameters_for_temporary_children/1,
do_not_save_child_specs_for_temporary_children/1,
simple_one_for_one_scale_many_temporary_children/1,
- simple_global_supervisor/1]).
+ simple_global_supervisor/1, hanging_restart_loop/1,
+ hanging_restart_loop_simple/1]).
%%-------------------------------------------------------------------------
@@ -82,7 +83,7 @@ all() ->
count_children_memory, do_not_save_start_parameters_for_temporary_children,
do_not_save_child_specs_for_temporary_children,
simple_one_for_one_scale_many_temporary_children, temporary_bystander,
- simple_global_supervisor].
+ simple_global_supervisor, hanging_restart_loop, hanging_restart_loop_simple].
groups() ->
[{sup_start, [],
@@ -111,10 +112,8 @@ groups() ->
{restart_rest_for_one, [],
[rest_for_one, rest_for_one_escalation]}].
-init_per_suite(Config0) ->
- Config = lists:keydelete(watchdog, 1, Config0),
- Dog = test_server:timetrap(?TIMEOUT),
- [{watchdog, Dog} | Config].
+init_per_suite(Config) ->
+ Config.
end_per_suite(_Config) ->
ok.
@@ -129,18 +128,21 @@ init_per_testcase(count_children_memory, Config) ->
try erlang:memory() of
_ ->
erts_debug:set_internal_state(available_internal_state, true),
- Config
+ Dog = ?t:timetrap(?TIMEOUT),
+ [{watchdog,Dog}|Config]
catch error:notsup ->
{skip, "+Meamin used during test; erlang:memory/1 not available"}
end;
init_per_testcase(_Case, Config) ->
- erlang:display(_Case),
- Config.
+ Dog = ?t:timetrap(?TIMEOUT),
+ [{watchdog,Dog}|Config].
-end_per_testcase(count_children_memory, _Config) ->
+end_per_testcase(count_children_memory, Config) ->
catch erts_debug:set_internal_state(available_internal_state, false),
+ ?t:timetrap_cancel(?config(watchdog,Config)),
ok;
-end_per_testcase(_Case, _Config) ->
+end_per_testcase(_Case, Config) ->
+ ?t:timetrap_cancel(?config(watchdog,Config)),
ok.
start_link(InitResult) ->
@@ -1455,6 +1457,102 @@ gen_server9212() ->
%%-------------------------------------------------------------------------
+%% Test that child and supervisor can be shutdown while hanging in restart loop.
+%% See OTP-9549.
+hanging_restart_loop(Config) when is_list(Config) ->
+ process_flag(trap_exit, true),
+ {ok, Pid} = start_link({ok, {{one_for_one, 8, 10}, []}}),
+ Child1 = {child1, {supervisor_deadlock, start_child, []},
+ permanent, brutal_kill, worker, []},
+
+ %% Ets table with state read by supervisor_deadlock.erl
+ ets:new(supervisor_deadlock,[set,named_table,public]),
+ ets:insert(supervisor_deadlock,{fail_start,false}),
+
+ {ok, CPid1} = supervisor:start_child(sup_test, Child1),
+ link(CPid1),
+
+ ets:insert(supervisor_deadlock,{fail_start,true}),
+ supervisor_deadlock:restart_child(),
+ timer:sleep(2000), % allow restart to happen before proceeding
+
+ {error, already_present} = supervisor:start_child(sup_test, Child1),
+ {error, restarting} = supervisor:restart_child(sup_test, child1),
+ {error, restarting} = supervisor:delete_child(sup_test, child1),
+ [{child1,restarting,worker,[]}] = supervisor:which_children(sup_test),
+ [1,0,0,1] = get_child_counts(sup_test),
+
+ ok = supervisor:terminate_child(sup_test, child1),
+ check_exit_reason(CPid1, error),
+ [{child1,undefined,worker,[]}] = supervisor:which_children(sup_test),
+
+ ets:insert(supervisor_deadlock,{fail_start,false}),
+ {ok, CPid2} = supervisor:restart_child(sup_test, child1),
+ link(CPid2),
+
+ ets:insert(supervisor_deadlock,{fail_start,true}),
+ supervisor_deadlock:restart_child(),
+ timer:sleep(2000), % allow restart to happen before proceeding
+
+ %% Terminating supervisor.
+ %% OTP-9549 fixes so this does not give a timetrap timeout -
+ %% i.e. that supervisor does not hang in restart loop.
+ terminate(Pid,shutdown),
+
+ %% Check that child died with reason from 'restart' request above
+ check_exit_reason(CPid2, error),
+ undefined = whereis(sup_test),
+ ok.
+
+%%-------------------------------------------------------------------------
+%% Test that child and supervisor can be shutdown while hanging in
+%% restart loop, simple_one_for_one.
+%% See OTP-9549.
+hanging_restart_loop_simple(Config) when is_list(Config) ->
+ process_flag(trap_exit, true),
+ Child1 = {child1, {supervisor_deadlock, start_child, []},
+ permanent, brutal_kill, worker, []},
+ {ok, Pid} = start_link({ok, {{simple_one_for_one, 8, 10}, [Child1]}}),
+
+ %% Ets table with state read by supervisor_deadlock.erl
+ ets:new(supervisor_deadlock,[set,named_table,public]),
+ ets:insert(supervisor_deadlock,{fail_start,false}),
+
+ {ok, CPid1} = supervisor:start_child(sup_test, []),
+ link(CPid1),
+
+ ets:insert(supervisor_deadlock,{fail_start,true}),
+ supervisor_deadlock:restart_child(),
+ timer:sleep(2000), % allow restart to happen before proceeding
+
+ {error, simple_one_for_one} = supervisor:restart_child(sup_test, child1),
+ {error, simple_one_for_one} = supervisor:delete_child(sup_test, child1),
+ [{undefined,restarting,worker,[]}] = supervisor:which_children(sup_test),
+ [1,0,0,1] = get_child_counts(sup_test),
+
+ ok = supervisor:terminate_child(sup_test, CPid1),
+ check_exit_reason(CPid1, error),
+ [] = supervisor:which_children(sup_test),
+
+ ets:insert(supervisor_deadlock,{fail_start,false}),
+ {ok, CPid2} = supervisor:start_child(sup_test, []),
+ link(CPid2),
+
+ ets:insert(supervisor_deadlock,{fail_start,true}),
+ supervisor_deadlock:restart_child(),
+ timer:sleep(2000), % allow restart to happen before proceeding
+
+ %% Terminating supervisor.
+ %% OTP-9549 fixes so this does not give a timetrap timeout -
+ %% i.e. that supervisor does not hang in restart loop.
+ terminate(Pid,shutdown),
+
+ %% Check that child died with reason from 'restart' request above
+ check_exit_reason(CPid2, error),
+ undefined = whereis(sup_test),
+ ok.
+
+%%-------------------------------------------------------------------------
terminate(Pid, Reason) when Reason =/= supervisor ->
terminate(dummy, Pid, dummy, Reason).
diff --git a/lib/stdlib/test/supervisor_deadlock.erl b/lib/stdlib/test/supervisor_deadlock.erl
new file mode 100644
index 0000000000..288547a972
--- /dev/null
+++ b/lib/stdlib/test/supervisor_deadlock.erl
@@ -0,0 +1,45 @@
+-module(supervisor_deadlock).
+-compile(export_all).
+
+
+%%%-----------------------------------------------------------------
+%%% gen_server callbacks
+init([child]) ->
+ case ets:lookup(supervisor_deadlock,fail_start) of
+ [{fail_start, false}] ->
+ %% we must not fail on the first init, otherwise supervisor
+ %% terminates immediately
+ {ok, []};
+ [{fail_start, true}] ->
+ %% Restart frequency is MaxR=8, MaxT=10, so this will
+ %% ensure that restart intensity is not reached -> restart
+ %% loop
+ timer:sleep(2000), % NOTE: this could be a gen_server call timeout
+
+ {stop, error}
+ end.
+
+handle_call(_Req, _From, State) ->
+ {reply, ok, State}.
+
+%% Force a restart
+handle_cast(restart, State) ->
+ {stop, error, State}.
+
+handle_info(_Msg, State) ->
+ {noreply, State}.
+
+terminate(_Reason, _State) ->
+ ok.
+
+code_change(_OldVsn, State, _Extra) ->
+ {ok, State}.
+
+
+%%%-----------------------------------------------------------------
+%%% Start child
+start_child() ->
+ gen_server:start_link({local, ?MODULE}, ?MODULE, [child], []).
+
+restart_child() ->
+ gen_server:cast(supervisor_deadlock, restart).