From 99a29386cb1a62bc7e9c205d897cd20017c598b4 Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Thu, 21 Jul 2016 19:07:16 +0200 Subject: Ignore long time failures during high CPU utilization --- erts/emulator/test/long_timers_test.erl | 188 ++++++++++++++++++++++++++------ erts/emulator/test/z_SUITE.erl | 8 +- 2 files changed, 162 insertions(+), 34 deletions(-) (limited to 'erts') diff --git a/erts/emulator/test/long_timers_test.erl b/erts/emulator/test/long_timers_test.erl index 7c055a31f9..90efe1292c 100644 --- a/erts/emulator/test/long_timers_test.erl +++ b/erts/emulator/test/long_timers_test.erl @@ -27,11 +27,16 @@ %%% Created : 21 Aug 2006 by Rickard Green %%%------------------------------------------------------------------- +-define(HIGH_CPU_INFO, "Ignored due to high CPU utilization."). +-define(MISSING_CPU_INFO, "Ignored due to missing CPU utilization information."). -define(MAX_TIMEOUT, 60). % Minutes --define(MAX_LATE_MS, 15*1000). % Milliseconds +-define(MAX_LATE_MS, 1000). % Milliseconds -define(REG_NAME, '___LONG___TIMERS___TEST___SERVER___'). +-define(HIGH_UTIL, 96.0). +-define(UTIL_INTERVAL, 10000). + -define(DRV_NAME, timer_driver). % First byte in communication with the timer driver @@ -72,52 +77,149 @@ check_result() -> receive {'DOWN', Mon, process, _, Reason} -> {?REG_NAME, 'DOWN', Reason}; - {result, ?REG_NAME, TORs, Start, End} -> + {result, ?REG_NAME, TORs, Start, End, UtilData} -> erlang:demonitor(Mon), receive {'DOWN', Mon, _, _, _} -> ok after 0 -> ok end, stop_node(Node), - check(TORs, ms((End - Start) - max_late()), ok) + Res = check(TORs, Start, End, UtilData, ms((End - Start) - max_late()), ok), + io:format("Start = ~p~n End = ~p~n UtilData = ~p~n", [Start, End, UtilData]), + Res end. +res(New, Old) when New == failed; Old == failed -> + failed; +res(New, Old) when New == missing_cpu_info; Old == missing_cpu_info -> + missing_cpu_info; +res(New, Old) when New == high_cpu; Old == high_cpu -> + high_cpu; +res(New, _Old) -> + New. + check([#timeout_rec{timeout = Timeout, type = Type, timeout_diff = undefined} | TORs], + Start, + End, + UtilData, NeedRes, - _Ok) when Timeout < NeedRes -> - io:format("~p timeout = ~p ms failed! No timeout.~n", - [Type, Timeout]), - check(TORs, NeedRes, failed); + Ok) when Timeout < NeedRes -> + {NewOk, HCPU} = case had_high_cpu_util(Start, + Timeout, + End - Timeout*1000, + UtilData) of + yes -> {res(high_cpu, Ok), ?HIGH_CPU_INFO}; + no -> {res(failed, Ok), ""}; + missing -> {res(missing_cpu_info, Ok), "FAILED", ?MISSING_CPU_INFO} + end, + io:format("~p timeout = ~p ms FAILED! No timeout. ~s~n", + [Type, Timeout, HCPU]), + check(TORs, Start, End, UtilData, NeedRes, NewOk); check([#timeout_rec{timeout_diff = undefined} | TORs], + Start, + End, + UtilData, NeedRes, Ok) -> - check(TORs, NeedRes, Ok); + check(TORs, Start, End, UtilData, NeedRes, Ok); check([#timeout_rec{timeout = Timeout, type = Type, timeout_diff = {error, Reason}} | TORs], + Start, + End, + UtilData, NeedRes, _Ok) -> - io:format("~p timeout = ~p ms failed! exit reason ~p~n", + io:format("~p timeout = ~p ms FAILED! exit reason ~p~n", [Type, Timeout, Reason]), - check(TORs, NeedRes, failed); + check(TORs, Start, End, UtilData, NeedRes, failed); check([#timeout_rec{timeout = Timeout, type = Type, timeout_diff = TimeoutDiff} | TORs], + Start, + End, + UtilData, NeedRes, Ok) -> - {NewOk, SuccessStr} = case ((0 =< TimeoutDiff) - andalso (TimeoutDiff =< max_late())) of - true -> {Ok, "succeeded"}; - false -> {failed, "FAILED"} + {NewOk, SuccessStr, HCPU} = case {(0 =< TimeoutDiff), + (TimeoutDiff =< max_late())} of + {true, true} -> + {res(ok, Ok), "succeeded", ""}; + {false, _} -> + {res(failed, Ok), "FAILED", ""}; + _ -> + case had_high_cpu_util(Start, + Timeout, + TimeoutDiff, + UtilData) of + yes -> {res(high_cpu, Ok), "FAILED", ?HIGH_CPU_INFO}; + no -> {res(failed, Ok), "FAILED", ""}; + missing -> {res(missing_cpu_info, Ok), "FAILED", ?MISSING_CPU_INFO} + end end, - io:format("~s timeout = ~s ms ~s! timeout diff = ~s.~n", + io:format("~s timeout = ~s ms ~s! timeout diff = ~s. ~s~n", [type_str(Type), time_str(Timeout), SuccessStr, - time_str(TimeoutDiff, erlang:convert_time_unit(1, seconds, native))]), - check(TORs, NeedRes, NewOk); -check([], _NeedRes, Ok) -> + time_str(TimeoutDiff, 1000000), + HCPU]), + check(TORs, Start, End, UtilData, NeedRes, NewOk); +check([],_Start,_End,_UtilData,_NeedRes, Ok) -> Ok. +% TargetTimeout in ms, other in us. +had_high_cpu_util(StartTime, + TargetTimeout, + TimeoutDiff, + UtilData) -> + TargetTo = StartTime + TargetTimeout*1000, + ActTo = TargetTo + TimeoutDiff, + hcpu(ActTo, TargetTo, UtilData). + +hcpu(_ActTo, _TargetTo, [{UT, 0} | _] = UD) -> + missing; %% Util is the integer zero when not supported... +%% UT2 =:= UT1 +hcpu(ActTo, TargetTo, [{UT, _}, {UT, _} | _] = UD) -> + hcpu(ActTo, TargetTo, tl(UD)); +%% UT2 > UT1 > ActTo > TargetTo +hcpu(ActTo, TargetTo, [{_UT2, _}, {UT1, _} | _] = UD) when UT1 > ActTo -> + hcpu(ActTo, TargetTo, tl(UD)); +%% UT2 >= ActTo > TargetTo >= UT1 +hcpu(ActTo, TargetTo, + [{UT2, U}, {UT1, _} | _]) when UT2 >= ActTo, + TargetTo >= UT1 -> + case U >= (((ActTo - TargetTo) / (UT2 - UT1)) + * (?HIGH_UTIL/100.0)) of + true -> yes; + false -> no + end; +%% UT2 >= ActTo >= UT1 > TargetTo +hcpu(ActTo, TargetTo, + [{UT2, U}, {UT1, _} | _] = UD) when UT2 >= ActTo, + ActTo >= UT1, + UT1 > TargetTo -> + case U >= (((ActTo - UT1) / (UT2 - UT1)) + * (?HIGH_UTIL/100.0)) of + true -> hcpu(ActTo, TargetTo, tl(UD)); + false -> no + end; +%% ActTo > UT2 >= TargetTo >= UT1 +hcpu(ActTo, TargetTo, + [{UT2, U}, {UT1, _} | _]) when ActTo > UT2, + TargetTo >= UT1 -> + case U >= (((UT2 - TargetTo) / (UT2 - UT1)) + * (?HIGH_UTIL/100.0)) of + true -> yes; + false -> no + end; +%% ActTo > UT2 > UT1 > TargetTo +hcpu(ActTo, TargetTo, + [{UT2, U}, {UT1, _} | _] = UD) when ActTo > UT2, + UT1 > TargetTo -> + case U >= ?HIGH_UTIL of + true -> hcpu(ActTo, TargetTo, tl(UD)); + false -> no + end. + type_str(receive_after) -> "receive ... after"; type_str(bif_timer) -> "BIF timer"; type_str(driver) -> "driver". @@ -142,24 +244,24 @@ unit_str(Res) -> Res. to_diff(Timeout, Start, Stop) -> %% 'Timeout' in milli seconds - %% 'Start', 'Stop', and result in native unit - (Stop - Start) - erlang:convert_time_unit(Timeout, milli_seconds, native). + %% 'Start', 'Stop', and result in micro seconds + (Stop - Start) - Timeout*1000. ms(Time) -> - erlang:convert_time_unit(Time, native, milli_seconds). + erlang:convert_time_unit(Time, micro_seconds, milli_seconds). max_late() -> - erlang:convert_time_unit(?MAX_LATE_MS, milli_seconds, native). + erlang:convert_time_unit(?MAX_LATE_MS, milli_seconds, micro_seconds). receive_after(Timeout) -> - Start = erlang:monotonic_time(), + Start = erlang:monotonic_time(micro_seconds), receive {get_result, ?REG_NAME} -> ?REG_NAME ! #timeout_rec{pid = self(), type = receive_after, timeout = Timeout} after Timeout -> - Stop = erlang:monotonic_time(), + Stop = erlang:monotonic_time(micro_seconds), receive {get_result, ?REG_NAME} -> ?REG_NAME ! #timeout_rec{pid = self(), @@ -174,7 +276,7 @@ receive_after(Timeout) -> driver(Timeout) -> Port = open_port({spawn, ?DRV_NAME},[]), link(Port), - Start = erlang:monotonic_time(), + Start = erlang:monotonic_time(micro_seconds), erlang:port_command(Port, <>), receive {get_result, ?REG_NAME} -> @@ -182,7 +284,7 @@ driver(Timeout) -> type = driver, timeout = Timeout}; {Port,{data,[?TIMER]}} -> - Stop = erlang:monotonic_time(), + Stop = erlang:monotonic_time(micro_seconds), unlink(Port), true = erlang:port_close(Port), receive @@ -197,7 +299,7 @@ driver(Timeout) -> end. bif_timer(Timeout) -> - Start = erlang:monotonic_time(), + Start = erlang:monotonic_time(micro_seconds), Tmr = erlang:start_timer(Timeout, self(), ok), receive {get_result, ?REG_NAME} -> @@ -205,7 +307,7 @@ bif_timer(Timeout) -> type = bif_timer, timeout = Timeout}; {timeout, Tmr, ok} -> - Stop = erlang:monotonic_time(), + Stop = erlang:monotonic_time(micro_seconds), receive {get_result, ?REG_NAME} -> ?REG_NAME ! #timeout_rec{pid = self(), @@ -218,13 +320,22 @@ bif_timer(Timeout) -> end. test(Starter, DrvDir, StartDone) -> + process_flag(priority, high), erl_ddll:start(), ok = load_driver(DrvDir, ?DRV_NAME), process_flag(trap_exit, true), register(?REG_NAME, self()), {group_leader, GL} = process_info(whereis(net_kernel),group_leader), group_leader(GL, self()), - Start = erlang:monotonic_time(), + try + application:start(sasl), + application:start(os_mon) + catch + _ : _ -> + ok + end, + UtilData = new_util(), + Start = erlang:monotonic_time(micro_seconds), TORs = lists:map(fun (Min) -> TO = Min*60*1000, [#timeout_rec{pid = spawn_opt( @@ -252,16 +363,27 @@ test(Starter, DrvDir, StartDone) -> lists:seq(1, ?MAX_TIMEOUT)), FlatTORs = lists:flatten(TORs), Starter ! StartDone, - test_loop(FlatTORs, Start). + test_loop(FlatTORs, Start, UtilData). + +new_util() -> + new_util([]). + +new_util(UtilData) -> + Util = cpu_sup:util(), + Time = erlang:monotonic_time(micro_seconds), + [{Time, Util} | UtilData]. -test_loop(TORs, Start) -> +test_loop(TORs, Start, UtilData) -> receive {get_result, ?REG_NAME, Pid} -> - End = erlang:monotonic_time(), - Pid ! {result, ?REG_NAME, get_test_results(TORs), Start, End}, + End = erlang:monotonic_time(micro_seconds), + EndUtilData = new_util(UtilData), + Pid ! {result, ?REG_NAME, get_test_results(TORs), Start, End, EndUtilData}, erl_ddll:unload_driver(?DRV_NAME), erl_ddll:stop(), exit(bye) + after ?UTIL_INTERVAL -> + test_loop(TORs, Start, new_util(UtilData)) end. get_test_results(TORs) -> diff --git a/erts/emulator/test/z_SUITE.erl b/erts/emulator/test/z_SUITE.erl index d1085c1958..ab56018373 100644 --- a/erts/emulator/test/z_SUITE.erl +++ b/erts/emulator/test/z_SUITE.erl @@ -191,7 +191,13 @@ node_container_refc_check(Config) when is_list(Config) -> ok. long_timers(Config) when is_list(Config) -> - ok = long_timers_test:check_result(). + case long_timers_test:check_result() of + ok -> ok; + high_cpu -> {comment, "Ignored failures due to high CPU utilization"}; + missing_cpu_info -> {comment, "Ignored failures due to missing CPU utilization information"}; + Fail -> ct:fail(Fail) + end. + pollset_size(Config) when is_list(Config) -> Name = pollset_size_testcase_initial_state_holder, -- cgit v1.2.3