diff options
author | Jay Nelson <[email protected]> | 2010-01-25 11:01:32 -0800 |
---|---|---|
committer | Björn Gustavsson <[email protected]> | 2010-02-10 17:14:28 +0100 |
commit | 1686757b304b66050f54db8e0ce5ab758bde96af (patch) | |
tree | 4b23dd691ff970e43bc5bfb6dc87941facee621c | |
parent | ada6afd00530d6569c41741cfd9d63311ff60f25 (diff) | |
download | otp-1686757b304b66050f54db8e0ce5ab758bde96af.tar.gz otp-1686757b304b66050f54db8e0ce5ab758bde96af.tar.bz2 otp-1686757b304b66050f54db8e0ce5ab758bde96af.zip |
Add count_children/1 to supervisor.erl to determine the number of
children being managed without the memory impact of which_children/1
The function which_children/1 returns a list of the child processes
currently being supervised, but it has the penalty of creating a new
list thereby consuming more memory. In low memory situations it is
often desirable to know which supervisor may have generated many
processes, but the act of discovering the culprit should not cause the
node to crash (or worse a different node if the kernel kills one
randomly). The new function count_children/1 can give an indication
of which supervisor is taxing resources the most without adding to the
burden. Rather than creating a new list, it walks the supervisor's
internal children structure using an accumulator function so that any
used memory can be incrementally collected yet the resulting count can
still be obtained.
The return result of count_children/1 is a property list of counts
containing:
- {specs, Total_Num_Child_Specs}
- {active, Num_Active_Child_Processes_Of_Supervisor_Or_Worker_Type}
- {supervisors, Num_Supervisor_Type_Children_Including_Dead_Processes}
- {workers, Num_Worker_Type_Children_Including_Dead_Processes}
This patch was made in response to mailing list discussions of the
problem diagnosing heavily taxed production systems. I cannot find
the original request, but http://www.erlang.org/cgi-bin/ezmlm-cgi/4/35060
is my original post of the patch.
-rw-r--r-- | lib/stdlib/doc/src/supervisor.xml | 38 | ||||
-rw-r--r-- | lib/stdlib/src/supervisor.erl | 49 | ||||
-rw-r--r-- | lib/stdlib/test/supervisor_SUITE.erl | 175 |
3 files changed, 247 insertions, 15 deletions
diff --git a/lib/stdlib/doc/src/supervisor.xml b/lib/stdlib/doc/src/supervisor.xml index adf9d24eae..fb2ec9ba67 100644 --- a/lib/stdlib/doc/src/supervisor.xml +++ b/lib/stdlib/doc/src/supervisor.xml @@ -402,9 +402,12 @@ child_spec() = {Id,StartFunc,Restart,Shutdown,Type,Modules} <v> Module = atom()</v> </type> <desc> - <p>Returns a list with information about all child + <p>Returns a newly created list with information about all child specifications and child processes belonging to the supervisor <c>SupRef</c>.</p> + <p>Note that calling this function when supervising a large + number of children under low memory conditions can cause an + out of memory exception.</p> <p>See <c>start_child/2</c> for a description of <c>SupRef</c>.</p> <p>The information given for each child specification/process is:</p> @@ -428,6 +431,39 @@ child_spec() = {Id,StartFunc,Restart,Shutdown,Type,Modules} </desc> </func> <func> + <name>count_children(SupRef) -> PropListOfCounts</name> + <fsummary>Return counts for the number of childspecs, active children, supervisors and workers.</fsummary> + <type> + <v>SupRef = Name | {Name,Node} | {global,Name} | pid()</v> + <v> Name = Node = atom()</v> + <v>PropListOfCounts = [{specs, ChildSpecCount}, {active, ActiveProcessCount}, {supervisors, ChildSupervisorCount}, {workers, ChildWorkerCount}]</v> + </type> + <desc> + <p>Returns a property list (see <c>proplists</c>) containing the + counts for each of the following elements of the supervisor's + child specifications and managed processes:</p> + <list type="bulleted"> + <item> + <p><c>specs</c> - the total count of children, dead or alive.</p> + </item> + <item> + <p><c>active</c> - the count of all actively running child processes + managed by this supervisor.</p> + </item> + <item> + <p><c>supervisors</c> - the count of all children marked as + child_type = supervisor in the spec list, whether or not the + child process is still alive.</p> + </item> + <item> + <p><c>workers</c> - the count of all children marked as + child_type = worker in the spec list, whether or not the child + process is still alive.</p> + </item> + </list> + </desc> + </func> + <func> <name>check_childspecs([ChildSpec]) -> Result</name> <fsummary>Check if children specifications are syntactically correct.</fsummary> <type> diff --git a/lib/stdlib/src/supervisor.erl b/lib/stdlib/src/supervisor.erl index fb1303d1eb..342b71f2da 100644 --- a/lib/stdlib/src/supervisor.erl +++ b/lib/stdlib/src/supervisor.erl @@ -24,7 +24,7 @@ -export([start_link/2,start_link/3, start_child/2, restart_child/2, delete_child/2, terminate_child/2, - which_children/1, + which_children/1, count_children/1, check_childspecs/1]). -export([behaviour_info/1]). @@ -95,6 +95,9 @@ terminate_child(Supervisor, Name) -> which_children(Supervisor) -> call(Supervisor, which_children). +count_children(Supervisor) -> + call(Supervisor, count_children). + call(Supervisor, Req) -> gen_server:call(Supervisor, Req, infinity). @@ -297,7 +300,49 @@ handle_call(which_children, _From, State) -> {Name, Pid, ChildType, Mods} end, State#state.children), - {reply, Resp, State}. + {reply, Resp, State}; + +handle_call(count_children, _From, State) when ?is_simple(State) -> + [#child{child_type = CT}] = State#state.children, + {Active, Count} = + ?DICT:fold(fun(Pid, _Val, {Alive, Tot}) -> + if is_pid(Pid) -> {Alive+1, Tot +1}; + true -> {Alive, Tot + 1} end + end, {0, 0}, State#state.dynamics), + Reply = case CT of + supervisor -> [{specs, 1}, {active, Active}, + {supervisors, Count}, {workers, 0}]; + worker -> [{specs, 1}, {active, Active}, + {supervisors, 0}, {workers, Count}] + end, + {reply, Reply, State}; + +handle_call(count_children, _From, State) -> + + %% Specs and children are together on the children list... + {Specs, Active, Supers, Workers} = + lists:foldl(fun(Child, Counts) -> + count_child(Child, Counts) + end, {0,0,0,0}, State#state.children), + + %% Reformat counts to a property list. + Reply = [{specs, Specs}, {active, Active}, + {supervisors, Supers}, {workers, Workers}], + {reply, Reply, State}. + + +count_child(#child{pid = Pid, child_type = worker}, + {Specs, Active, Supers, Workers}) -> + case is_pid(Pid) andalso is_process_alive(Pid) of + true -> {Specs+1, Active+1, Supers, Workers+1}; + false -> {Specs+1, Active, Supers, Workers+1} + end; +count_child(#child{pid = Pid, child_type = supervisor}, + {Specs, Active, Supers, Workers}) -> + case is_pid(Pid) andalso is_process_alive(Pid) of + true -> {Specs+1, Active+1, Supers+1, Workers}; + false -> {Specs+1, Active, Supers+1, Workers} + end. %%% Hopefully cause a function-clause as there is no API function diff --git a/lib/stdlib/test/supervisor_SUITE.erl b/lib/stdlib/test/supervisor_SUITE.erl index f25e877289..039ea298c4 100644 --- a/lib/stdlib/test/supervisor_SUITE.erl +++ b/lib/stdlib/test/supervisor_SUITE.erl @@ -50,7 +50,7 @@ simple_one_for_one_extra/1]). %% Misc tests --export([child_unlink/1, tree/1]). +-export([child_unlink/1, tree/1, count_children_memory/1]). %------------------------------------------------------------------------- @@ -60,7 +60,8 @@ all(suite) -> child_adm_simple, extra_return, child_specs, restart_one_for_one, restart_one_for_all, restart_simple_one_for_one, restart_rest_for_one, - normal_termination, abnormal_termination, child_unlink, tree]}. + normal_termination, abnormal_termination, child_unlink, tree, + count_children_memory]}. start(InitResult) -> @@ -72,6 +73,15 @@ init(fail) -> init(InitResult) -> InitResult. +%% Respect proplist return of supervisor:count_children +get_child_counts(Supervisor) -> + Counts = supervisor:count_children(Supervisor), + [proplists:get_value(specs, Counts), + proplists:get_value(active, Counts), + proplists:get_value(supervisors, Counts), + proplists:get_value(workers, Counts)]. + + %------------------------------------------------------------------------- % % Test cases starts here. @@ -140,6 +150,8 @@ sup_start_ignore_child(Config) when is_list(Config) -> ?line [{child2, CPid2, worker, []},{child1, undefined, worker, []}] = supervisor:which_children(sup_test), + ?line [2,1,0,2] = get_child_counts(sup_test), + ok. %------------------------------------------------------------------------- @@ -341,6 +353,8 @@ extra_return(Config) when is_list(Config) -> ?line {error, running} = supervisor:delete_child(sup_test, child1), ?line {error, running} = supervisor:restart_child(sup_test, child1), ?line [{child1, CPid, worker, []}] = supervisor:which_children(sup_test), + ?line [1,1,0,1] = get_child_counts(sup_test), + ?line ok = supervisor:terminate_child(sup_test, child1), receive {'EXIT', CPid, shutdown} -> ok; @@ -350,22 +364,31 @@ extra_return(Config) when is_list(Config) -> ?line test_server:fail(no_child_termination) end, ?line [{child1,undefined,worker,[]}] = supervisor:which_children(sup_test), + ?line [1,0,0,1] = get_child_counts(sup_test), + ?line {ok, CPid2,extra_return} = supervisor:restart_child(sup_test, child1), ?line [{child1, CPid2, worker, []}] = supervisor:which_children(sup_test), + ?line [1,1,0,1] = get_child_counts(sup_test), + ?line ok = supervisor:terminate_child(sup_test, child1), ?line ok = supervisor:terminate_child(sup_test, child1), ?line ok = supervisor:delete_child(sup_test, child1), ?line {error, not_found} = supervisor:restart_child(sup_test, child1), ?line [] = supervisor:which_children(sup_test), + ?line [0,0,0,0] = get_child_counts(sup_test), + ?line {ok, CPid3, extra_return} = supervisor:start_child(sup_test, Child), ?line [{child1, CPid3, worker, []}] = supervisor:which_children(sup_test), + ?line [1,1,0,1] = get_child_counts(sup_test), + ok. %------------------------------------------------------------------------- child_adm(doc)-> ["Test API functions start_child/2, terminate_child/2, delete_child/2 " - "restart_child/2, which_children/1. Only correct childspecs are used, " - "handling of incorrect childspecs is tested in child_specs/1"]; + "restart_child/2, which_children/1, count_children/1. Only correct " + "childspecs are used, handling of incorrect childspecs is tested in " + "child_specs/1"]; child_adm(suite) -> []; child_adm(Config) when is_list(Config) -> process_flag(trap_exit, true), @@ -373,6 +396,7 @@ child_adm(Config) when is_list(Config) -> worker, []}, ?line {ok, _Pid} = start({ok, {{one_for_one, 2, 3600}, [Child]}}), ?line [{child1, CPid, worker, []}] = supervisor:which_children(sup_test), + ?line [1,1,0,1] = get_child_counts(sup_test), link(CPid), %% Start of an already runnig process @@ -392,6 +416,7 @@ child_adm(Config) when is_list(Config) -> ?line test_server:fail(no_child_termination) end, ?line [{child1,undefined,worker,[]}] = supervisor:which_children(sup_test), + ?line [1,0,0,1] = get_child_counts(sup_test), %% Like deleting something that does not exist, it will succeed! ?line ok = supervisor:terminate_child(sup_test, child1), @@ -402,6 +427,7 @@ child_adm(Config) when is_list(Config) -> %% Restart ?line {ok, CPid2} = supervisor:restart_child(sup_test, child1), ?line [{child1, CPid2, worker, []}] = supervisor:which_children(sup_test), + ?line [1,1,0,1] = get_child_counts(sup_test), ?line {error, running} = supervisor:restart_child(sup_test, child1), ?line {error, not_found} = supervisor:restart_child(sup_test, child2), @@ -414,15 +440,19 @@ child_adm(Config) when is_list(Config) -> ?line ok = supervisor:delete_child(sup_test, child1), ?line {error, not_found} = supervisor:restart_child(sup_test, child1), ?line [] = supervisor:which_children(sup_test), + ?line [0,0,0,0] = get_child_counts(sup_test), %% Start ?line {'EXIT',{noproc,{gen_server,call, _}}} = (catch supervisor:start_child(foo, Child)), ?line {ok, CPid3} = supervisor:start_child(sup_test, Child), ?line [{child1, CPid3, worker, []}] = supervisor:which_children(sup_test), + ?line [1,1,0,1] = get_child_counts(sup_test), ?line {'EXIT',{noproc,{gen_server,call,[foo,which_children,infinity]}}} = (catch supervisor:which_children(foo)), + ?line {'EXIT',{noproc,{gen_server,call,[foo,count_children,infinity]}}} + = (catch supervisor:count_children(foo)), ok. %------------------------------------------------------------------------- child_adm_simple(doc) -> @@ -436,6 +466,7 @@ child_adm_simple(Config) when is_list(Config) -> ?line {ok, _Pid} = start({ok, {{simple_one_for_one, 2, 3600}, [Child]}}), %% In simple_one_for_one all children are added dynamically ?line [] = supervisor:which_children(sup_test), + ?line [1,0,0,0] = get_child_counts(sup_test), %% Start ?line {'EXIT',{noproc,{gen_server,call, _}}} = @@ -443,12 +474,14 @@ child_adm_simple(Config) when is_list(Config) -> ?line {ok, CPid1} = supervisor:start_child(sup_test, []), ?line [{undefined, CPid1, worker, []}] = supervisor:which_children(sup_test), + ?line [1,1,0,1] = get_child_counts(sup_test), ?line {ok, CPid2} = supervisor:start_child(sup_test, []), ?line Children = supervisor:which_children(sup_test), ?line 2 = length(Children), ?line true = lists:member({undefined, CPid2, worker, []}, Children), ?line true = lists:member({undefined, CPid1, worker, []}, Children), + ?line [1,2,0,2] = get_child_counts(sup_test), %% Termination ?line {error, simple_one_for_one} = @@ -541,8 +574,10 @@ permanent_normal(Config) when is_list(Config) -> ok; false -> ?line test_server:fail({permanent_child_not_restarted, Child1}) - end. + end, + ?line [1,1,0,1] = get_child_counts(sup_test), + ok. %------------------------------------------------------------------------- transient_normal(doc) -> ["A transient child should not be restarted if it exits with " @@ -558,8 +593,10 @@ transient_normal(Config) when is_list(Config) -> CPid1 ! stop, test_server:sleep(100), - ?line [{child1,undefined,worker,[]}] = supervisor:which_children(sup_test). + ?line [{child1,undefined,worker,[]}] = supervisor:which_children(sup_test), + ?line [1,0,0,1] = get_child_counts(sup_test), + ok. %------------------------------------------------------------------------- temporary_normal(doc) -> ["A temporary process should never be restarted"]; @@ -574,8 +611,10 @@ temporary_normal(Config) when is_list(Config) -> CPid1 ! stop, test_server:sleep(100), - ?line [{child1,undefined,worker,[]}] = supervisor:which_children(sup_test). + ?line [{child1,undefined,worker,[]}] = supervisor:which_children(sup_test), + ?line [1,0,0,1] = get_child_counts(sup_test), + ok. %------------------------------------------------------------------------- abnormal_termination(doc) -> ["Testes the supervisors behaviour if a child dies with reason abnormal"]; @@ -601,8 +640,10 @@ permanent_abnormal(Config) when is_list(Config) -> ok; false -> ?line test_server:fail({permanent_child_not_restarted, Child1}) - end. + end, + ?line [1,1,0,1] = get_child_counts(sup_test), + ok. %------------------------------------------------------------------------- transient_abnormal(doc) -> ["A transient child should be restarted if it exits with " @@ -624,9 +665,10 @@ transient_abnormal(Config) when is_list(Config) -> ok; false -> ?line test_server:fail({transient_child_not_restarted, Child1}) - end. - + end, + ?line [1,1,0,1] = get_child_counts(sup_test), + ok. %------------------------------------------------------------------------- temporary_abnormal(doc) -> ["A temporary process should never be restarted"]; @@ -641,8 +683,10 @@ temporary_abnormal(Config) when is_list(Config) -> CPid1 ! die, test_server:sleep(100), - ?line [{child1,undefined,worker,[]}] = supervisor:which_children(sup_test). + ?line [{child1,undefined,worker,[]}] = supervisor:which_children(sup_test), + ?line [1,0,0,1] = get_child_counts(sup_test), + ok. %------------------------------------------------------------------------- restart_one_for_one(doc) -> ["Test that the one_for_one strategy works."]; @@ -679,6 +723,7 @@ one_for_one(Config) when is_list(Config) -> end; true -> ?line test_server:fail({bad_child_list, Children}) end, + ?line [2,2,0,2] = get_child_counts(sup_test), %% Test restart frequency property CPid2 ! die, @@ -772,6 +817,8 @@ one_for_all(Config) when is_list(Config) -> true -> ?line test_server:fail(bad_child); false -> ok end, + ?line [2,2,0,2] = get_child_counts(sup_test), + %%% Test restart frequency property [{_, Pid3, _, _}|_] = supervisor:which_children(sup_test), Pid3 ! die, @@ -854,6 +901,8 @@ simple_one_for_one(Config) when is_list(Config) -> end; true -> ?line test_server:fail({bad_child_list, Children}) end, + ?line [1,2,0,2] = get_child_counts(sup_test), + %% Test restart frequency property CPid2 ! die, receive @@ -896,6 +945,8 @@ simple_one_for_one_extra(Config) when is_list(Config) -> end; true -> ?line test_server:fail({bad_child_list, Children}) end, + ?line [1,2,0,2] = get_child_counts(sup_test), + CPid2 ! die, receive {'EXIT', CPid2, _} -> ok @@ -962,6 +1013,8 @@ rest_for_one(Config) when is_list(Config) -> link(CPid2), ?line {ok, CPid3} = supervisor:start_child(sup_test, Child3), link(CPid3), + ?line [3,3,0,3] = get_child_counts(sup_test), + CPid2 ! die, receive {'EXIT', CPid2, died} -> ok; @@ -987,6 +1040,8 @@ rest_for_one(Config) when is_list(Config) -> if length(Children) == 3 -> ok; true -> ?line test_server:fail({bad_child_list, Children}) end, + ?line [3,3,0,3] = get_child_counts(sup_test), + %% Test that no old children is still alive SCh = lists:map(fun({_,P,_,_}) -> P end, Children), case lists:member(CPid1, SCh) of @@ -1114,15 +1169,19 @@ tree(Config) when is_list(Config) -> %% Child supervisors ?line {ok, Sup1} = supervisor:start_child(Pid, ChildSup1), ?line {ok, Sup2} = supervisor:start_child(Pid, ChildSup2), + ?line [2,2,2,0] = get_child_counts(Pid), %% Workers - ?line [{_, CPid2, _, _},{_, CPid1, _, _}] = supervisor:which_children(Sup1), + ?line [2,2,0,2] = get_child_counts(Sup1), + ?line [0,0,0,0] = get_child_counts(Sup2), %% Dynamic children ?line {ok, CPid3} = supervisor:start_child(Sup2, Child3), ?line {ok, CPid4} = supervisor:start_child(Sup2, Child4), + ?line [2,2,0,2] = get_child_counts(Sup1), + ?line [2,2,0,2] = get_child_counts(Sup2), link(Sup1), link(Sup2), @@ -1144,9 +1203,11 @@ tree(Config) when is_list(Config) -> ?line [{_, CPid2, _, _},{_, CPid1, _, _}] = supervisor:which_children(Sup1), + ?line [2,2,0,2] = get_child_counts(Sup1), ?line [{_, NewCPid4, _, _},{_, CPid3, _, _}] = supervisor:which_children(Sup2), + ?line [2,2,0,2] = get_child_counts(Sup2), link(NewCPid4), @@ -1195,9 +1256,99 @@ tree(Config) when is_list(Config) -> ?line [{supchild2, NewSup2, _, _},{supchild1, NewSup1, _, _}] = supervisor:which_children(Pid), + ?line [2,2,2,0] = get_child_counts(Pid), ?line [{child2, _, _, _},{child1, _, _, _}] = supervisor:which_children(NewSup1), + ?line [2,2,0,2] = get_child_counts(NewSup1), + ?line [] = supervisor:which_children(NewSup2), + ?line [0,0,0,0] = get_child_counts(NewSup2), ok. +%------------------------------------------------------------------------- +count_children_allocator_test(MemoryState) -> + Allocators = [temp_alloc, eheap_alloc, binary_alloc, ets_alloc, + driver_alloc, sl_alloc, ll_alloc, fix_alloc, std_alloc, + sys_alloc], + MemoryStateList = element(4, MemoryState), + AllocTypes = [lists:keyfind(Alloc, 1, MemoryStateList) + || Alloc <- Allocators], + AllocStates = [lists:keyfind(e, 1, AllocValue) + || {_Type, AllocValue} <- AllocTypes], + lists:all(fun(State) -> State == {e, true} end, AllocStates). + +count_children_memory(doc) -> + ["Test that which_children eats memory, but count_children does not."]; +count_children_memory(suite) -> + MemoryState = erlang:system_info(allocator), + case count_children_allocator_test(MemoryState) of + true -> []; + false -> + {skip, "+Meamin used during test; erlang:memory/1 not available"} + end; +count_children_memory(Config) when is_list(Config) -> + process_flag(trap_exit, true), + Child = {child, {supervisor_1, start_child, []}, temporary, 1000, + worker, []}, + ?line {ok, _Pid} = start({ok, {{simple_one_for_one, 2, 3600}, [Child]}}), + [supervisor:start_child(sup_test, []) || _Ignore <- lists:seq(1,1000)], + + garbage_collect(), + _Size1 = erlang:memory(processes_used), + Children = supervisor:which_children(sup_test), + _Size2 = erlang:memory(processes_used), + ChildCount = get_child_counts(sup_test), + Size3 = erlang:memory(processes_used), + + [supervisor:start_child(sup_test, []) || _Ignore2 <- lists:seq(1,1000)], + + garbage_collect(), + Children2 = supervisor:which_children(sup_test), + Size4 = erlang:memory(processes_used), + ChildCount2 = get_child_counts(sup_test), + Size5 = erlang:memory(processes_used), + + garbage_collect(), + Children3 = supervisor:which_children(sup_test), + Size6 = erlang:memory(processes_used), + ChildCount3 = get_child_counts(sup_test), + Size7 = erlang:memory(processes_used), + + ?line 1000 = length(Children), + ?line [1,1000,0,1000] = ChildCount, + ?line 2000 = length(Children2), + ?line [1,2000,0,2000] = ChildCount2, + ?line Children3 = Children2, + ?line ChildCount3 = ChildCount2, + + %% count_children consumes memory using an accumulator function, + %% but the space can be reclaimed incrementally, whereas + %% which_children generates a return list. + case (Size5 =< Size4) of + true -> ok; + false -> + ?line test_server:fail({count_children, used_more_memory}) + end, + case Size7 =< Size6 of + true -> ok; + false -> + ?line test_server:fail({count_children, used_more_memory}) + end, + + case Size4 > Size3 of + true -> ok; + false -> + ?line test_server:fail({which_children, used_no_memory}) + end, + case Size6 > Size5 of + true -> ok; + false -> + ?line test_server:fail({which_children, used_no_memory}) + end, + + [exit(Pid, kill) || {undefined, Pid, worker, _Modules} <- Children3], + test_server:sleep(100), + ?line [1,0,0,0] = get_child_counts(sup_test), + + ok. |