diff options
author | James Fish <[email protected]> | 2013-04-04 01:59:53 +0100 |
---|---|---|
committer | James Fish <[email protected]> | 2013-04-04 12:22:11 +0100 |
commit | c59c3a6d57b857913ddfa13f96425ba0d95ccb2d (patch) | |
tree | 3df89de54964e1fb9b2bd181ce7990d1c2f5d2a4 /lib | |
parent | 8dba74ac7ff331a2c4870cc64b62dd4f168533eb (diff) | |
download | otp-c59c3a6d57b857913ddfa13f96425ba0d95ccb2d.tar.gz otp-c59c3a6d57b857913ddfa13f96425ba0d95ccb2d.tar.bz2 otp-c59c3a6d57b857913ddfa13f96425ba0d95ccb2d.zip |
Fix rest_for_one and one_for_all restarting a child not terminated
In rest_for_one and one_for_all supervisors one child dying can cause
multiple children to be restarted. Previously if the child that caused
the restart is started successfully but another child fails to start,
the supervisor would not terminate this child with the other
successfully restarted children as no record of the pid was kept. Thus
the supervisor would try to start this child again. This could lead to
multiples of the same child or if the child is registered cause repeated
attempts at starting this child - until the max restart threshold was
reached.
Now the child that failed to start becomes the restarting child, instead
of staying with the same child, for the next restart attempt. This has
the following side effects:
1) In one_for_all the new version of the child that original died is
terminated before a restart attempt is made.
2) In rest_for_one all succesfully restarted children are not terminated
and restarting continues from the child that failed to start.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/stdlib/src/supervisor.erl | 27 | ||||
-rw-r--r-- | lib/stdlib/test/Makefile | 1 | ||||
-rw-r--r-- | lib/stdlib/test/supervisor_3.erl | 45 | ||||
-rw-r--r-- | lib/stdlib/test/supervisor_SUITE.erl | 106 |
4 files changed, 171 insertions, 8 deletions
diff --git a/lib/stdlib/src/supervisor.erl b/lib/stdlib/src/supervisor.erl index 9f93747c3e..54328cd9ff 100644 --- a/lib/stdlib/src/supervisor.erl +++ b/lib/stdlib/src/supervisor.erl @@ -63,7 +63,9 @@ %%-------------------------------------------------------------------------- -record(child, {% pid is undefined when child is not running - pid = undefined :: child() | {restarting,pid()} | [pid()], + pid = undefined :: child() + | {restarting, pid() | undefined} + | [pid()], name :: child_id(), mfargs :: mfargs(), restart_type :: restart(), @@ -752,6 +754,9 @@ restart(Child, State) -> end, timer:apply_after(0,?MODULE,try_again_restart,[self(),Id]), {ok,NState2}; + {try_again, NState2, #child{name=ChName}} -> + timer:apply_after(0,?MODULE,try_again_restart,[self(),ChName]), + {ok,NState2}; Other -> Other end; @@ -798,10 +803,16 @@ restart(rest_for_one, Child, State) -> case start_children(ChAfter2, State#state.name) of {ok, ChAfter3} -> {ok, State#state{children = ChAfter3 ++ ChBefore}}; - {error, ChAfter3, _Reason} -> + {error, ChAfter3, {failed_to_start_child, ChName, _Reason}} + when ChName =:= Child#child.name -> NChild = Child#child{pid=restarting(Child#child.pid)}, NState = State#state{children = ChAfter3 ++ ChBefore}, - {try_again, replace_child(NChild,NState)} + {try_again, replace_child(NChild,NState)}; + {error, ChAfter3, {failed_to_start_child, ChName, _Reason}} -> + NChild = lists:keyfind(ChName, #child.name, ChAfter3), + NChild2 = NChild#child{pid=?restarting(undefined)}, + NState = State#state{children = ChAfter3 ++ ChBefore}, + {try_again, replace_child(NChild2,NState), NChild2} end; restart(one_for_all, Child, State) -> Children1 = del_child(Child#child.pid, State#state.children), @@ -809,10 +820,16 @@ restart(one_for_all, Child, State) -> case start_children(Children2, State#state.name) of {ok, NChs} -> {ok, State#state{children = NChs}}; - {error, NChs, _Reason} -> + {error, NChs, {failed_to_start_child, ChName, _Reason}} + when ChName =:= Child#child.name -> NChild = Child#child{pid=restarting(Child#child.pid)}, NState = State#state{children = NChs}, - {try_again, replace_child(NChild,NState)} + {try_again, replace_child(NChild,NState)}; + {error, NChs, {failed_to_start_child, ChName, _Reason}} -> + NChild = lists:keyfind(ChName, #child.name, NChs), + NChild2 = NChild#child{pid=?restarting(undefined)}, + NState = State#state{children = NChs}, + {try_again, replace_child(NChild2,NState), NChild2} end. restarting(Pid) when is_pid(Pid) -> ?restarting(Pid); diff --git a/lib/stdlib/test/Makefile b/lib/stdlib/test/Makefile index 6aa09d7bd0..af82f22b21 100644 --- a/lib/stdlib/test/Makefile +++ b/lib/stdlib/test/Makefile @@ -66,6 +66,7 @@ MODULES= \ string_SUITE \ supervisor_1 \ supervisor_2 \ + supervisor_3 \ supervisor_deadlock \ naughty_child \ shell_SUITE \ diff --git a/lib/stdlib/test/supervisor_3.erl b/lib/stdlib/test/supervisor_3.erl new file mode 100644 index 0000000000..31b3037d6f --- /dev/null +++ b/lib/stdlib/test/supervisor_3.erl @@ -0,0 +1,45 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 1996-2011. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% +%% Description: Simulates the behaviour that a child process may have. +%% Is used by the supervisor_SUITE test suite. +-module(supervisor_3). + +-export([start_child/2, init/1]). + +-export([handle_call/3, handle_info/2, terminate/2]). + +start_child(Name, Caller) -> + gen_server:start_link(?MODULE, [Name, Caller], []). + +init([Name, Caller]) -> + Caller ! {Name, self()}, + receive + {Result, Caller} -> + Result + end. + +handle_call(Req, _From, State) -> + {reply, Req, State}. + +handle_info(_, State) -> + {noreply, State}. + +terminate(_Reason, Time) -> + timer:sleep(Time), + ok. diff --git a/lib/stdlib/test/supervisor_SUITE.erl b/lib/stdlib/test/supervisor_SUITE.erl index 569c66959e..ff5be6bb95 100644 --- a/lib/stdlib/test/supervisor_SUITE.erl +++ b/lib/stdlib/test/supervisor_SUITE.erl @@ -53,9 +53,10 @@ %% Restart strategy tests -export([ one_for_one/1, one_for_one_escalation/1, one_for_all/1, - one_for_all_escalation/1, + one_for_all_escalation/1, one_for_all_other_child_fails_restart/1, simple_one_for_one/1, simple_one_for_one_escalation/1, rest_for_one/1, rest_for_one_escalation/1, + rest_for_one_other_child_fails_restart/1, simple_one_for_one_extra/1, simple_one_for_one_shutdown/1]). %% Misc tests @@ -107,12 +108,14 @@ groups() -> {restart_one_for_one, [], [one_for_one, one_for_one_escalation]}, {restart_one_for_all, [], - [one_for_all, one_for_all_escalation]}, + [one_for_all, one_for_all_escalation, + one_for_all_other_child_fails_restart]}, {restart_simple_one_for_one, [], [simple_one_for_one, simple_one_for_one_shutdown, simple_one_for_one_extra, simple_one_for_one_escalation]}, {restart_rest_for_one, [], - [rest_for_one, rest_for_one_escalation]}]. + [rest_for_one, rest_for_one_escalation, + rest_for_one_other_child_fails_restart]}]. init_per_suite(Config) -> Config. @@ -879,6 +882,57 @@ one_for_all_escalation(Config) when is_list(Config) -> %%------------------------------------------------------------------------- +%% Test that the supervisor terminates a restarted child when a different +%% child fails to start. +one_for_all_other_child_fails_restart(Config) when is_list(Config) -> + process_flag(trap_exit, true), + Self = self(), + Child1 = {child1, {supervisor_3, start_child, [child1, Self]}, + permanent, 1000, worker, []}, + Child2 = {child2, {supervisor_3, start_child, [child2, Self]}, + permanent, 1000, worker, []}, + Children = [Child1, Child2], + StarterFun = fun() -> + {ok, SupPid} = start_link({ok, {{one_for_all, 3, 3600}, Children}}), + Self ! {sup_pid, SupPid}, + receive {stop, Self} -> ok end + end, + StarterPid = spawn_link(StarterFun), + Ok = {{ok, undefined}, Self}, + %% Let the children start. + Child1Pid = receive {child1, Pid1} -> Pid1 end, + Child1Pid ! Ok, + Child2Pid = receive {child2, Pid2} -> Pid2 end, + Child2Pid ! Ok, + %% Supervisor started. + SupPid = receive {sup_pid, Pid} -> Pid end, + link(SupPid), + exit(Child1Pid, die), + %% Let child1 restart but don't let child2. + Child1Pid2 = receive {child1, Pid3} -> Pid3 end, + Child1Pid2Ref = erlang:monitor(process, Child1Pid2), + Child1Pid2 ! Ok, + Child2Pid2 = receive {child2, Pid4} -> Pid4 end, + Child2Pid2 ! {{stop, normal}, Self}, + %% Check child1 is terminated. + receive + {'DOWN', Child1Pid2Ref, _, _, shutdown} -> + ok; + {_childName, _Pid} -> + exit(SupPid, kill), + check_exit([StarterPid, SupPid]), + test_server:fail({restarting_child_not_terminated, Child1Pid2}) + end, + %% Let the restart complete. + Child1Pid3 = receive {child1, Pid5} -> Pid5 end, + Child1Pid3 ! Ok, + Child2Pid3 = receive {child2, Pid6} -> Pid6 end, + Child2Pid3 ! Ok, + StarterPid ! {stop, Self}, + check_exit([StarterPid, SupPid]). + + +%%------------------------------------------------------------------------- %% Test the simple_one_for_one base case. simple_one_for_one(Config) when is_list(Config) -> process_flag(trap_exit, true), @@ -1044,6 +1098,52 @@ rest_for_one_escalation(Config) when is_list(Config) -> terminate(SupPid, CPid1, child1, abnormal), check_exit([CPid2, SupPid]). + +%%------------------------------------------------------------------------- +%% Test that the supervisor terminates a restarted child when a different +%% child fails to start. +rest_for_one_other_child_fails_restart(Config) when is_list(Config) -> + process_flag(trap_exit, true), + Self = self(), + Child1 = {child1, {supervisor_3, start_child, [child1, Self]}, + permanent, 1000, worker, []}, + Child2 = {child2, {supervisor_3, start_child, [child2, Self]}, + permanent, 1000, worker, []}, + Children = [Child1, Child2], + StarterFun = fun() -> + {ok, SupPid} = start_link({ok, {{rest_for_one, 3, 3600}, Children}}), + Self ! {sup_pid, SupPid}, + receive {stop, Self} -> ok end + end, + StarterPid = spawn_link(StarterFun), + Ok = {{ok, undefined}, Self}, + %% Let the children start. + Child1Pid = receive {child1, Pid1} -> Pid1 end, + Child1Pid ! Ok, + Child2Pid = receive {child2, Pid2} -> Pid2 end, + Child2Pid ! Ok, + %% Supervisor started. + SupPid = receive {sup_pid, Pid} -> Pid end, + link(SupPid), + exit(Child1Pid, die), + %% Let child1 restart but don't let child2. + Child1Pid2 = receive {child1, Pid3} -> Pid3 end, + Child1Pid2 ! Ok, + Child2Pid2 = receive {child2, Pid4} -> Pid4 end, + Child2Pid2 ! {{stop, normal}, Self}, + %% Let child2 restart. + receive + {child2, Child2Pid3} -> + Child2Pid3 ! Ok; + {child1, _Child1Pid3} -> + exit(SupPid, kill), + check_exit([StarterPid, SupPid]), + test_server:fail({restarting_started_child, Child1Pid2}) + end, + StarterPid ! {stop, Self}, + check_exit([StarterPid, SupPid]). + + %%------------------------------------------------------------------------- %% Test that the supervisor does not hang forever if the child unliks %% and then is terminated by the supervisor. |