author     James Fish <[email protected]>    2013-04-04 01:59:53 +0100
committer  James Fish <[email protected]>    2013-04-04 12:22:11 +0100
commit     c59c3a6d57b857913ddfa13f96425ba0d95ccb2d (patch)
tree       3df89de54964e1fb9b2bd181ce7990d1c2f5d2a4 /lib/stdlib/test/supervisor_SUITE.erl
parent     8dba74ac7ff331a2c4870cc64b62dd4f168533eb (diff)
Fix rest_for_one and one_for_all restarting a child not terminated
In rest_for_one and one_for_all supervisors, one child dying can cause
multiple children to be restarted. Previously, if the child that caused
the restart was itself restarted successfully but another child then
failed to start, the supervisor did not terminate the restarted child
along with the other successfully restarted children, because no record
of its new pid was kept. The supervisor would therefore try to start
that child again on the next attempt. This could lead to multiple
instances of the same child running, or, if the child was registered,
to repeated failed start attempts until the maximum restart intensity
was reached.
Now the child that failed to start becomes the restarting child for the
next restart attempt, instead of the restart staying with the child that
originally died. This has the following side effects:
1) In one_for_all, the new instance of the child that originally died is
terminated before a restart attempt is made.
2) In rest_for_one, the successfully restarted children are not
terminated, and restarting continues from the child that failed to
start. (A minimal sketch of a supervisor that can hit this scenario
follows below.)
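For illustration only (not part of the patch): a minimal sketch of a
rest_for_one supervisor that can run into the scenario above. The module
name and the demo_worker callback are hypothetical; the bug needs only
two permanent children where the second one's start function can return
{error, Reason} during a restart.

-module(demo_sup).
-behaviour(supervisor).
-export([start_link/0, init/1]).

start_link() ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, []).

init([]) ->
    %% Two permanent workers under rest_for_one: if child1 dies, the
    %% supervisor restarts child1 and then child2.
    Child1 = {child1, {demo_worker, start_link, [child1]},
              permanent, 1000, worker, [demo_worker]},
    %% If demo_worker:start_link(child2) returns {error, Reason} while
    %% child2 is being restarted, the pre-fix supervisor kept no record
    %% of the freshly restarted child1 pid, so the next attempt started
    %% a duplicate child1 (or failed repeatedly if child1 was registered)
    %% until the intensity of 3 restarts in 3600 seconds was exceeded.
    Child2 = {child2, {demo_worker, start_link, [child2]},
              permanent, 1000, worker, [demo_worker]},
    {ok, {{rest_for_one, 3, 3600}, [Child1, Child2]}}.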
Diffstat (limited to 'lib/stdlib/test/supervisor_SUITE.erl')
-rw-r--r--  lib/stdlib/test/supervisor_SUITE.erl  106
1 file changed, 103 insertions(+), 3 deletions(-)
diff --git a/lib/stdlib/test/supervisor_SUITE.erl b/lib/stdlib/test/supervisor_SUITE.erl
index 569c66959e..ff5be6bb95 100644
--- a/lib/stdlib/test/supervisor_SUITE.erl
+++ b/lib/stdlib/test/supervisor_SUITE.erl
@@ -53,9 +53,10 @@
 %% Restart strategy tests
 -export([ one_for_one/1, one_for_one_escalation/1, one_for_all/1,
-	  one_for_all_escalation/1,
+	  one_for_all_escalation/1, one_for_all_other_child_fails_restart/1,
 	  simple_one_for_one/1, simple_one_for_one_escalation/1,
 	  rest_for_one/1, rest_for_one_escalation/1,
+	  rest_for_one_other_child_fails_restart/1,
 	  simple_one_for_one_extra/1, simple_one_for_one_shutdown/1]).
 
 %% Misc tests
@@ -107,12 +108,14 @@ groups() ->
     {restart_one_for_one, [], [one_for_one, one_for_one_escalation]},
     {restart_one_for_all, [],
-     [one_for_all, one_for_all_escalation]},
+     [one_for_all, one_for_all_escalation,
+      one_for_all_other_child_fails_restart]},
     {restart_simple_one_for_one, [],
      [simple_one_for_one, simple_one_for_one_shutdown,
       simple_one_for_one_extra, simple_one_for_one_escalation]},
     {restart_rest_for_one, [],
-     [rest_for_one, rest_for_one_escalation]}].
+     [rest_for_one, rest_for_one_escalation,
+      rest_for_one_other_child_fails_restart]}].
 
 init_per_suite(Config) ->
     Config.
@@ -879,6 +882,57 @@ one_for_all_escalation(Config) when is_list(Config) ->
 
 %%-------------------------------------------------------------------------
+%% Test that the supervisor terminates a restarted child when a different
+%% child fails to start.
+one_for_all_other_child_fails_restart(Config) when is_list(Config) ->
+    process_flag(trap_exit, true),
+    Self = self(),
+    Child1 = {child1, {supervisor_3, start_child, [child1, Self]},
+              permanent, 1000, worker, []},
+    Child2 = {child2, {supervisor_3, start_child, [child2, Self]},
+              permanent, 1000, worker, []},
+    Children = [Child1, Child2],
+    StarterFun = fun() ->
+        {ok, SupPid} = start_link({ok, {{one_for_all, 3, 3600}, Children}}),
+        Self ! {sup_pid, SupPid},
+        receive {stop, Self} -> ok end
+    end,
+    StarterPid = spawn_link(StarterFun),
+    Ok = {{ok, undefined}, Self},
+    %% Let the children start.
+    Child1Pid = receive {child1, Pid1} -> Pid1 end,
+    Child1Pid ! Ok,
+    Child2Pid = receive {child2, Pid2} -> Pid2 end,
+    Child2Pid ! Ok,
+    %% Supervisor started.
+    SupPid = receive {sup_pid, Pid} -> Pid end,
+    link(SupPid),
+    exit(Child1Pid, die),
+    %% Let child1 restart but don't let child2.
+    Child1Pid2 = receive {child1, Pid3} -> Pid3 end,
+    Child1Pid2Ref = erlang:monitor(process, Child1Pid2),
+    Child1Pid2 ! Ok,
+    Child2Pid2 = receive {child2, Pid4} -> Pid4 end,
+    Child2Pid2 ! {{stop, normal}, Self},
+    %% Check child1 is terminated.
+    receive
+        {'DOWN', Child1Pid2Ref, _, _, shutdown} ->
+            ok;
+        {_childName, _Pid} ->
+            exit(SupPid, kill),
+            check_exit([StarterPid, SupPid]),
+            test_server:fail({restarting_child_not_terminated, Child1Pid2})
+    end,
+    %% Let the restart complete.
+    Child1Pid3 = receive {child1, Pid5} -> Pid5 end,
+    Child1Pid3 ! Ok,
+    Child2Pid3 = receive {child2, Pid6} -> Pid6 end,
+    Child2Pid3 ! Ok,
+    StarterPid ! {stop, Self},
+    check_exit([StarterPid, SupPid]).
+
+
+%%-------------------------------------------------------------------------
 %% Test the simple_one_for_one base case.
 simple_one_for_one(Config) when is_list(Config) ->
     process_flag(trap_exit, true),
@@ -1044,6 +1098,52 @@ rest_for_one_escalation(Config) when is_list(Config) ->
     terminate(SupPid, CPid1, child1, abnormal),
     check_exit([CPid2, SupPid]).
+
+%%-------------------------------------------------------------------------
+%% Test that the supervisor terminates a restarted child when a different
+%% child fails to start.
+rest_for_one_other_child_fails_restart(Config) when is_list(Config) ->
+    process_flag(trap_exit, true),
+    Self = self(),
+    Child1 = {child1, {supervisor_3, start_child, [child1, Self]},
+              permanent, 1000, worker, []},
+    Child2 = {child2, {supervisor_3, start_child, [child2, Self]},
+              permanent, 1000, worker, []},
+    Children = [Child1, Child2],
+    StarterFun = fun() ->
+        {ok, SupPid} = start_link({ok, {{rest_for_one, 3, 3600}, Children}}),
+        Self ! {sup_pid, SupPid},
+        receive {stop, Self} -> ok end
+    end,
+    StarterPid = spawn_link(StarterFun),
+    Ok = {{ok, undefined}, Self},
+    %% Let the children start.
+    Child1Pid = receive {child1, Pid1} -> Pid1 end,
+    Child1Pid ! Ok,
+    Child2Pid = receive {child2, Pid2} -> Pid2 end,
+    Child2Pid ! Ok,
+    %% Supervisor started.
+    SupPid = receive {sup_pid, Pid} -> Pid end,
+    link(SupPid),
+    exit(Child1Pid, die),
+    %% Let child1 restart but don't let child2.
+    Child1Pid2 = receive {child1, Pid3} -> Pid3 end,
+    Child1Pid2 ! Ok,
+    Child2Pid2 = receive {child2, Pid4} -> Pid4 end,
+    Child2Pid2 ! {{stop, normal}, Self},
+    %% Let child2 restart.
+    receive
+        {child2, Child2Pid3} ->
+            Child2Pid3 ! Ok;
+        {child1, _Child1Pid3} ->
+            exit(SupPid, kill),
+            check_exit([StarterPid, SupPid]),
+            test_server:fail({restarting_started_child, Child1Pid2})
+    end,
+    StarterPid ! {stop, Self},
+    check_exit([StarterPid, SupPid]).
+
+
 %%-------------------------------------------------------------------------
 %% Test that the supervisor does not hang forever if the child unlinks
 %% and then is terminated by the supervisor.
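To clarify the handshake these test cases drive: each child announces its
pid to the test process and then blocks until the test tells it whether
that start attempt should succeed. The suite's real helper is
supervisor_3:start_child/2; the module below is a hypothetical
reconstruction of the same pattern, written only to illustrate the
message flow (its name and internals are assumptions, not the actual
helper).

-module(controlled_child).
-export([start_link/2]).

%% Hypothetical child spec for use with this helper:
%% {child1, {controlled_child, start_link, [child1, TestPid]},
%%  permanent, 1000, worker, []}
start_link(Name, TestPid) ->
    Starter = self(),
    Pid = spawn(fun() -> init(Starter, Name, TestPid) end),
    receive
        {Pid, ok} ->
            link(Pid),
            {ok, Pid};
        {Pid, {error, Reason}} ->
            {error, Reason}
    end.

init(Starter, Name, TestPid) ->
    TestPid ! {Name, self()},               %% announce the new pid
    receive
        {{ok, undefined}, TestPid} ->       %% test lets the start succeed
            Starter ! {self(), ok},
            receive after infinity -> ok end;
        {{stop, Reason}, TestPid} ->        %% test makes the start fail
            Starter ! {self(), {error, Reason}}
    end.

This mirrors the messages the tests exchange: receive {child1, Pid} to
learn the new pid, send {{ok, undefined}, Self} to let a start complete,
or send {{stop, normal}, Self} to force the start to fail and trigger the
code path the commit fixes.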