From c59c3a6d57b857913ddfa13f96425ba0d95ccb2d Mon Sep 17 00:00:00 2001 From: James Fish Date: Thu, 4 Apr 2013 01:59:53 +0100 Subject: Fix rest_for_one and one_for_all restarting a child not terminated In rest_for_one and one_for_all supervisors one child dying can cause multiple children to be restarted. Previously if the child that caused the restart is started successfully but another child fails to start, the supervisor would not terminate this child with the other successfully restarted children as no record of the pid was kept. Thus the supervisor would try to start this child again. This could lead to multiples of the same child or if the child is registered cause repeated attempts at starting this child - until the max restart threshold was reached. Now the child that failed to start becomes the restarting child, instead of staying with the same child, for the next restart attempt. This has the following side effects: 1) In one_for_all the new version of the child that original died is terminated before a restart attempt is made. 2) In rest_for_one all succesfully restarted children are not terminated and restarting continues from the child that failed to start. --- lib/stdlib/src/supervisor.erl | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/supervisor.erl b/lib/stdlib/src/supervisor.erl index 9f93747c3e..54328cd9ff 100644 --- a/lib/stdlib/src/supervisor.erl +++ b/lib/stdlib/src/supervisor.erl @@ -63,7 +63,9 @@ %%-------------------------------------------------------------------------- -record(child, {% pid is undefined when child is not running - pid = undefined :: child() | {restarting,pid()} | [pid()], + pid = undefined :: child() + | {restarting, pid() | undefined} + | [pid()], name :: child_id(), mfargs :: mfargs(), restart_type :: restart(), @@ -752,6 +754,9 @@ restart(Child, State) -> end, timer:apply_after(0,?MODULE,try_again_restart,[self(),Id]), {ok,NState2}; + {try_again, NState2, #child{name=ChName}} -> + timer:apply_after(0,?MODULE,try_again_restart,[self(),ChName]), + {ok,NState2}; Other -> Other end; @@ -798,10 +803,16 @@ restart(rest_for_one, Child, State) -> case start_children(ChAfter2, State#state.name) of {ok, ChAfter3} -> {ok, State#state{children = ChAfter3 ++ ChBefore}}; - {error, ChAfter3, _Reason} -> + {error, ChAfter3, {failed_to_start_child, ChName, _Reason}} + when ChName =:= Child#child.name -> NChild = Child#child{pid=restarting(Child#child.pid)}, NState = State#state{children = ChAfter3 ++ ChBefore}, - {try_again, replace_child(NChild,NState)} + {try_again, replace_child(NChild,NState)}; + {error, ChAfter3, {failed_to_start_child, ChName, _Reason}} -> + NChild = lists:keyfind(ChName, #child.name, ChAfter3), + NChild2 = NChild#child{pid=?restarting(undefined)}, + NState = State#state{children = ChAfter3 ++ ChBefore}, + {try_again, replace_child(NChild2,NState), NChild2} end; restart(one_for_all, Child, State) -> Children1 = del_child(Child#child.pid, State#state.children), @@ -809,10 +820,16 @@ restart(one_for_all, Child, State) -> case start_children(Children2, State#state.name) of {ok, NChs} -> {ok, State#state{children = NChs}}; - {error, NChs, _Reason} -> + {error, NChs, {failed_to_start_child, ChName, _Reason}} + when ChName =:= Child#child.name -> NChild = Child#child{pid=restarting(Child#child.pid)}, NState = State#state{children = NChs}, - {try_again, replace_child(NChild,NState)} + {try_again, replace_child(NChild,NState)}; + {error, NChs, {failed_to_start_child, ChName, _Reason}} -> + NChild = lists:keyfind(ChName, #child.name, NChs), + NChild2 = NChild#child{pid=?restarting(undefined)}, + NState = State#state{children = NChs}, + {try_again, replace_child(NChild2,NState), NChild2} end. restarting(Pid) when is_pid(Pid) -> ?restarting(Pid); -- cgit v1.2.3