From 4422cb32637a341eac2dc03172e4aa859be67f34 Mon Sep 17 00:00:00 2001
From: Siri Hansen
Date: Wed, 3 Feb 2016 12:10:06 +0100
Subject: Add supervisor:get_callback_module/1

This function is used by release_handler during upgrade. This was
earlier implemented in the release_handler, but it required a copy of
the definition of the supervisor's internal state, which caused
problems when this state was updated.
---
 lib/sasl/src/release_handler_1.erl | 33 ++-------------------------------
 lib/sasl/src/sasl.app.src          |  2 +-
 lib/stdlib/src/supervisor.erl      | 14 ++++++++++++++
 3 files changed, 17 insertions(+), 32 deletions(-)

diff --git a/lib/sasl/src/release_handler_1.erl b/lib/sasl/src/release_handler_1.erl
index 536ac924d4..86e3bddc1c 100644
--- a/lib/sasl/src/release_handler_1.erl
+++ b/lib/sasl/src/release_handler_1.erl
@@ -655,48 +655,19 @@ maybe_get_dynamic_mods(Name, Pid) ->
             error(get_modules_failed)
     end.
 
-%% XXXX
-%% Note: The following is a terrible hack done in order to resolve the
-%% problem stated in ticket OTP-3452.
-
-%% XXXX NOTE WELL: This record is from supervisor.erl. Also the record
-%% name is really `state'.
--record(supervisor_state, {name,
-                           strategy,
-                           children = [],
-                           dynamics = [],
-                           intensity,
-                           period,
-                           restarts = [],
-                           module,
-                           args}).
-
 %% Return the name of the call-back module that implements the
 %% (top) supervisor SupPid.
 %% Returns: {ok, Module} | {error,undefined}
 %%
 get_supervisor_module(SupPid) ->
-    case catch get_supervisor_module1(SupPid) of
-        {ok, Module} when is_atom(Module) ->
+    case catch supervisor:get_callback_module(SupPid) of
+        Module when is_atom(Module) ->
            {ok, Module};
        _Other ->
            io:format("~w: reason: ~w~n", [SupPid, _Other]),
            {error, undefined}
     end.
 
-get_supervisor_module1(SupPid) ->
-    {status, _Pid, {module, _Mod},
-     [_PDict, _SysState, _Parent, _Dbg, Misc]} = sys:get_status(SupPid),
-    %% supervisor Misc field changed at R13B04, handle old and new variants here
-    State = case Misc of
-                [_Name, State1, _Type, _Time] ->
-                    State1;
-                [_Header, _Data, {data, [{"State", State2}]}] ->
-                    State2
-            end,
-    %% Cannot use #supervisor_state{module = Module} = State.
-    {ok, element(#supervisor_state.module, State)}.
-
 %%-----------------------------------------------------------------
 %% Func: do_soft_purge/3
 %% Args: Mod = atom()
diff --git a/lib/sasl/src/sasl.app.src b/lib/sasl/src/sasl.app.src
index 705bb73fc5..507e2dc229 100644
--- a/lib/sasl/src/sasl.app.src
+++ b/lib/sasl/src/sasl.app.src
@@ -46,6 +46,6 @@
      {env, [{sasl_error_logger, tty},
             {errlog_type, all}]},
      {mod, {sasl, []}},
-     {runtime_dependencies, ["tools-2.6.14","stdlib-2.6","kernel-4.1",
+     {runtime_dependencies, ["tools-2.6.14","stdlib-2.8","kernel-4.1",
                              "erts-6.0"]}]}.
 
diff --git a/lib/stdlib/src/supervisor.erl b/lib/stdlib/src/supervisor.erl
index 92a0c29011..6a5005b4b6 100644
--- a/lib/stdlib/src/supervisor.erl
+++ b/lib/stdlib/src/supervisor.erl
@@ -33,6 +33,9 @@
          terminate/2, code_change/3]).
 -export([try_again_restart/2]).
 
+%% For release_handler only
+-export([get_callback_module/1]).
+
 %%--------------------------------------------------------------------------
 
 -export_type([sup_flags/0, child_spec/0, startchild_ret/0, strategy/0]).
@@ -250,6 +253,17 @@ try_again_restart(Supervisor, Child) ->
 cast(Supervisor, Req) ->
     gen_server:cast(Supervisor, Req).
 
+%%%-----------------------------------------------------------------
+%%% Called by release_handler during upgrade
+-spec get_callback_module(Pid) -> Module when
+      Pid :: pid(),
+      Module :: atom().
+get_callback_module(Pid) ->
+    {status, _Pid, {module, _Mod},
+     [_PDict, _SysState, _Parent, _Dbg, Misc]} = sys:get_status(Pid),
+    [_Header, _Data, {data, [{"State", State}]}] = Misc,
+    State#state.module.
+
 %%% ---------------------------------------------------
 %%%
 %%% Initialize the supervisor.
--
cgit v1.2.3

From 264a5e804d6e23bf6c6916887669760cbf243caa Mon Sep 17 00:00:00 2001
From: Rory Byrne
Date: Thu, 28 May 2015 19:41:25 +0100
Subject: Speed up supervisor:count_children/1; simple_one_for_one

Speed up supervisor:count_children/1 for simple_one_for_one
supervisors. This is achieved by avoiding looping through all the
child processes and verifying that each one is alive.

For a supervisor with 100,000 'temporary' children the count-time will
drop from approx 25ms to about 0.005ms. For a supervisor with 100,000
'permanent' or 'transient' children the count-time will drop from
approx 30ms to about 0.005ms.

This avoids having the supervisor block for an extended period while
the count takes place. Under normal circumstances the accuracy of the
result should also improve, since the duration is too short for many
processes to die during the count.
---
 lib/sasl/test/release_handler_SUITE.erl |  2 +-
 lib/stdlib/doc/src/supervisor.xml       |  5 +++-
 lib/stdlib/src/supervisor.erl           | 49 +++++++++++++++----------------
 lib/stdlib/test/supervisor_SUITE.erl    | 52 ++++++++++++++++++++++++++++++++-
 lib/stdlib/test/supervisor_deadlock.erl | 14 +++++++--
 5 files changed, 90 insertions(+), 32 deletions(-)

diff --git a/lib/sasl/test/release_handler_SUITE.erl b/lib/sasl/test/release_handler_SUITE.erl
index d57de2593a..ee620dcdb4 100644
--- a/lib/sasl/test/release_handler_SUITE.erl
+++ b/lib/sasl/test/release_handler_SUITE.erl
@@ -1363,7 +1363,7 @@ upgrade_supervisor(Conf) when is_list(Conf) ->
     %% Check that the restart strategy and child spec is updated
     {status, _, {module, _}, [_, _, _, _, [_,_,{data,[{"State",State}]}]]} =
         rpc:call(Node,sys,get_status,[a_sup]),
-    {state,_,RestartStrategy,[Child],_,_,_,_,_,_} = State,
+    {state,_,RestartStrategy,[Child],_,_,_,_,_,_,_} = State,
     one_for_all = RestartStrategy, % changed from one_for_one
     {child,_,_,_,_,brutal_kill,_,_} = Child, % changed from timeout 2000
 
diff --git a/lib/stdlib/doc/src/supervisor.xml b/lib/stdlib/doc/src/supervisor.xml
index 24ff251ce3..815bf4a489 100644
--- a/lib/stdlib/doc/src/supervisor.xml
+++ b/lib/stdlib/doc/src/supervisor.xml
@@ -543,7 +543,10 @@

       active - the count of all actively running child processes
-         managed by this supervisor.

+         managed by this supervisor. In the case of simple_one_for_one
+         supervisors, no check is carried out to ensure that each child process
+         is still alive, though the result provided here is likely to be very
+         accurate unless the supervisor is heavily overloaded.

       supervisors - the count of all children marked as
diff --git a/lib/stdlib/src/supervisor.erl b/lib/stdlib/src/supervisor.erl
index 6a5005b4b6..cecdebd0c8 100644
--- a/lib/stdlib/src/supervisor.erl
+++ b/lib/stdlib/src/supervisor.erl
@@ -116,6 +116,7 @@
           intensity            :: non_neg_integer(),
           period               :: pos_integer(),
           restarts = [],
+          dynamic_restarts = 0 :: non_neg_integer(),
           module,
           args}).
 -type state() :: #state{}.
@@ -518,39 +519,26 @@ handle_call(which_children, _From, State) ->
 handle_call(count_children, _From, #state{children = [#child{restart_type = temporary,
                                                              child_type = CT}]} = State)
   when ?is_simple(State) ->
-    {Active, Count} =
-        ?SETS:fold(fun(Pid, {Alive, Tot}) ->
-                           case is_pid(Pid) andalso is_process_alive(Pid) of
-                               true ->{Alive+1, Tot +1};
-                               false ->
-                                   {Alive, Tot + 1}
-                           end
-                   end, {0, 0}, dynamics_db(temporary, State#state.dynamics)),
+    Sz = ?SETS:size(dynamics_db(temporary, State#state.dynamics)),
     Reply = case CT of
-                supervisor -> [{specs, 1}, {active, Active},
-                               {supervisors, Count}, {workers, 0}];
-                worker -> [{specs, 1}, {active, Active},
-                           {supervisors, 0}, {workers, Count}]
+                supervisor -> [{specs, 1}, {active, Sz},
+                               {supervisors, Sz}, {workers, 0}];
+                worker -> [{specs, 1}, {active, Sz},
+                           {supervisors, 0}, {workers, Sz}]
             end,
     {reply, Reply, State};
 
-handle_call(count_children, _From, #state{children = [#child{restart_type = RType,
+handle_call(count_children, _From, #state{dynamic_restarts = Restarts,
+                                           children = [#child{restart_type = RType,
                                                              child_type = CT}]} = State)
   when ?is_simple(State) ->
-    {Active, Count} =
-        ?DICTS:fold(fun(Pid, _Val, {Alive, Tot}) ->
-                            case is_pid(Pid) andalso is_process_alive(Pid) of
-                                true ->
-                                    {Alive+1, Tot +1};
-                                false ->
-                                    {Alive, Tot + 1}
-                            end
-                    end, {0, 0}, dynamics_db(RType, State#state.dynamics)),
+    Sz = ?DICTS:size(dynamics_db(RType, State#state.dynamics)),
+    Active = Sz - Restarts,
     Reply = case CT of
                 supervisor -> [{specs, 1}, {active, Active},
-                               {supervisors, Count}, {workers, 0}];
+                               {supervisors, Sz}, {workers, 0}];
                 worker -> [{specs, 1}, {active, Active},
-                           {supervisors, 0}, {workers, Count}]
+                           {supervisors, 0}, {workers, Sz}]
             end,
     {reply, Reply, State};
 
@@ -820,8 +808,15 @@ restart(Child, State) ->
             {shutdown, remove_child(Child, NState)}
     end.
 
-restart(simple_one_for_one, Child, State) ->
+restart(simple_one_for_one, Child, State0) ->
     #child{pid = OldPid, mfargs = {M, F, A}} = Child,
+    State = case OldPid of
+                ?restarting(_) ->
+                    NRes = State0#state.dynamic_restarts - 1,
+                    State0#state{dynamic_restarts = NRes};
+                _ ->
+                    State0
+            end,
     Dynamics = ?DICTS:erase(OldPid, dynamics_db(Child#child.restart_type,
                                                 State#state.dynamics)),
     case do_start_child_i(M, F, A) of
@@ -832,7 +827,9 @@ restart(simple_one_for_one, Child, State) ->
             NState = State#state{dynamics = ?DICTS:store(Pid, A, Dynamics)},
             {ok, NState};
         {error, Error} ->
-            NState = State#state{dynamics = ?DICTS:store(restarting(OldPid), A,
+            NRestarts = State#state.dynamic_restarts + 1,
+            NState = State#state{dynamic_restarts = NRestarts,
+                                 dynamics = ?DICTS:store(restarting(OldPid), A,
                                                          Dynamics)},
             report_error(start_error, Error, Child, State#state.name),
             {try_again, NState}
diff --git a/lib/stdlib/test/supervisor_SUITE.erl b/lib/stdlib/test/supervisor_SUITE.erl
index 8cb2a5194a..903ca76575 100644
--- a/lib/stdlib/test/supervisor_SUITE.erl
+++ b/lib/stdlib/test/supervisor_SUITE.erl
@@ -67,6 +67,7 @@
 
 %% Misc tests
 -export([child_unlink/1, tree/1, count_children/1,
+         count_restarting_children/1,
         do_not_save_start_parameters_for_temporary_children/1,
         do_not_save_child_specs_for_temporary_children/1,
         simple_one_for_one_scale_many_temporary_children/1,
@@ -90,7 +91,8 @@ all() ->
      {group, normal_termination},
      {group, shutdown_termination},
      {group, abnormal_termination}, child_unlink, tree,
-     count_children, do_not_save_start_parameters_for_temporary_children,
+     count_children, count_restarting_children,
+     do_not_save_start_parameters_for_temporary_children,
      do_not_save_child_specs_for_temporary_children,
      simple_one_for_one_scale_many_temporary_children, temporary_bystander,
      simple_global_supervisor, hanging_restart_loop, hanging_restart_loop_simple,
@@ -1458,6 +1460,54 @@ count_children(Config) when is_list(Config) ->
     [terminate(SupPid, Pid, child, kill) ||
         {undefined, Pid, worker, _Modules} <- Children3],
     [1,0,0,0] = get_child_counts(sup_test).
+%%-------------------------------------------------------------------------
+%% Test count_children when some children are restarting
+count_restarting_children(Config) when is_list(Config) ->
+    process_flag(trap_exit, true),
+    Child = {child, {supervisor_deadlock, start_child_noreg, []},
+             permanent, brutal_kill, worker, []},
+    %% 2 second delay on failing restart (see supervisor_deadlock.erl) ->
+    %% MaxR=20, MaxT=10 should ensure a restart loop when starting and
+    %% restarting 3 instances of the child (as below)
+    {ok, SupPid} = start_link({ok, {{simple_one_for_one, 20, 10}, [Child]}}),
+
+    %% Ets table with state read by supervisor_deadlock.erl
+    ets:new(supervisor_deadlock,[set,named_table,public]),
+    ets:insert(supervisor_deadlock,{fail_start,false}),
+
+    [1,0,0,0] = get_child_counts(SupPid),
+    {ok, Ch1_1} = supervisor:start_child(SupPid, []),
+    [1,1,0,1] = get_child_counts(SupPid),
+    {ok, Ch1_2} = supervisor:start_child(SupPid, []),
+    [1,2,0,2] = get_child_counts(SupPid),
+    {ok, Ch1_3} = supervisor:start_child(SupPid, []),
+    [1,3,0,3] = get_child_counts(SupPid),
+
+    supervisor_deadlock:restart_child(Ch1_1),
+    supervisor_deadlock:restart_child(Ch1_2),
+    supervisor_deadlock:restart_child(Ch1_3),
+    test_server:sleep(400),
+    [1,3,0,3] = get_child_counts(SupPid),
+    [Ch2_1, Ch2_2, Ch2_3] = [C || {_,C,_,_} <- supervisor:which_children(SupPid)],
+
+    ets:insert(supervisor_deadlock,{fail_start,true}),
+    supervisor_deadlock:restart_child(Ch2_1),
+    supervisor_deadlock:restart_child(Ch2_2),
+    test_server:sleep(4000), % allow restart to happen before proceeding
+    [1,1,0,3] = get_child_counts(SupPid),
+
+    ets:insert(supervisor_deadlock,{fail_start,false}),
+    test_server:sleep(4000), % allow restart to happen before proceeding
+    [1,3,0,3] = get_child_counts(SupPid),
+
+    ok = supervisor:terminate_child(SupPid, Ch2_3),
+    [1,2,0,2] = get_child_counts(SupPid),
+    [Ch3_1, Ch3_2] = [C || {_,C,_,_} <- supervisor:which_children(SupPid)],
+    ok = supervisor:terminate_child(SupPid, Ch3_1),
+    [1,1,0,1] = get_child_counts(SupPid),
+    ok = supervisor:terminate_child(SupPid, Ch3_2),
+    [1,0,0,0] = get_child_counts(SupPid).
+
 %%-------------------------------------------------------------------------
 %% Temporary children shall not be restarted so they should not save
 %% start parameters, as it potentially can take up a huge amount of
diff --git a/lib/stdlib/test/supervisor_deadlock.erl b/lib/stdlib/test/supervisor_deadlock.erl
index 288547a972..8d3d1c6f30 100644
--- a/lib/stdlib/test/supervisor_deadlock.erl
+++ b/lib/stdlib/test/supervisor_deadlock.erl
@@ -11,9 +11,11 @@ init([child]) ->
             %% terminates immediately
             {ok, []};
         [{fail_start, true}] ->
-            %% Restart frequency is MaxR=8, MaxT=10, so this will
-            %% ensure that restart intensity is not reached -> restart
-            %% loop
+            %% A restart frequency of MaxR=8, MaxT=10 should ensure
+            %% that restart intensity is not reached -> restart loop.
+            %% (Note that if we use simple_one_for_one, and start
+            %% 'many' child instances, the restart frequency must be
+            %% adjusted accordingly.)
            timer:sleep(2000), % NOTE: this could be a gen_server call timeout
            {stop, error}
@@ -41,5 +43,11 @@ code_change(_OldVsn, State, _Extra) ->
 start_child() ->
     gen_server:start_link({local, ?MODULE}, ?MODULE, [child], []).
 
+start_child_noreg() ->
+    gen_server:start_link(?MODULE, [child], []).
+
 restart_child() ->
     gen_server:cast(supervisor_deadlock, restart).
+
+restart_child(Pid) ->
+    gen_server:cast(Pid, restart).
--
cgit v1.2.3
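
Editorial note, not part of the patches above: a minimal usage sketch of the new
supervisor:get_callback_module/1 introduced by the first patch. It assumes a
stdlib that already contains that patch; the module name demo_sup and the
wrapper callback_module/1 are illustrative only, mirroring how
release_handler_1:get_supervisor_module/1 guards the call with a catch.

%% demo_sup.erl -- illustrative sketch only (not part of the patch).
%% Requires a stdlib that exports supervisor:get_callback_module/1.
-module(demo_sup).
-behaviour(supervisor).
-export([start_link/0, init/1, callback_module/1]).

start_link() ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, []).

init([]) ->
    %% No children are needed to demonstrate the call.
    {ok, {{one_for_one, 1, 5}, []}}.

%% Same guarded pattern as release_handler_1:get_supervisor_module/1:
%% the catch covers a dead pid or a process that is not a supervisor.
callback_module(SupPid) ->
    case catch supervisor:get_callback_module(SupPid) of
        Module when is_atom(Module) -> {ok, Module};
        _Other -> {error, undefined}
    end.

In a shell, {ok, Pid} = demo_sup:start_link() followed by
demo_sup:callback_module(Pid) returns {ok, demo_sup}.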
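
A second editorial sketch: one rough way to observe the count_children/1
speed-up described in the second commit message, by timing the call over many
temporary children of a simple_one_for_one supervisor. The module name
count_demo, the worker fun, and the child count are assumptions; absolute
timings vary with the host, and the 25 ms / 0.005 ms figures in the commit
message are the patch author's own measurements.

%% count_demo.erl -- illustrative benchmark sketch only.
-module(count_demo).
-behaviour(supervisor).
-export([start_link/0, init/1, start_worker/0, run/1]).

start_link() ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, []).

init([]) ->
    %% simple_one_for_one with 'temporary' children: the clause that the
    %% patch reduces to a single size/1 lookup on the dynamics set.
    Child = {worker, {?MODULE, start_worker, []},
             temporary, brutal_kill, worker, [?MODULE]},
    {ok, {{simple_one_for_one, 0, 1}, [Child]}}.

start_worker() ->
    %% A trivial linked child that just waits until it is killed.
    {ok, spawn_link(fun() -> receive stop -> ok end end)}.

run(N) ->
    {ok, Sup} = start_link(),
    %% Start N dynamic children, then time a single count_children/1 call.
    [{ok, _} = supervisor:start_child(Sup, []) || _ <- lists:seq(1, N)],
    {Micros, Counts} = timer:tc(fun() -> supervisor:count_children(Sup) end),
    io:format("count_children/1 over ~p children took ~p us: ~p~n",
              [N, Micros, Counts]),
    Counts.

For example, count_demo:run(100000) prints the elapsed time in microseconds and
returns [{specs,1},{active,100000},{supervisors,0},{workers,100000}].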