diff options
-rw-r--r-- | lib/kernel/doc/src/logger_chapter.xml | 16 | ||||
-rw-r--r-- | lib/kernel/src/logger_disk_log_h.erl | 64 | ||||
-rw-r--r-- | lib/kernel/src/logger_h_common.erl | 16 | ||||
-rw-r--r-- | lib/kernel/src/logger_h_common.hrl | 2 | ||||
-rw-r--r-- | lib/kernel/src/logger_std_h.erl | 22 | ||||
-rw-r--r-- | lib/kernel/test/logger_disk_log_h_SUITE.erl | 185 | ||||
-rw-r--r-- | lib/kernel/test/logger_std_h_SUITE.erl | 262 |
7 files changed, 365 insertions, 202 deletions
diff --git a/lib/kernel/doc/src/logger_chapter.xml b/lib/kernel/doc/src/logger_chapter.xml index 519df2ba48..3150c5adb4 100644 --- a/lib/kernel/doc/src/logger_chapter.xml +++ b/lib/kernel/doc/src/logger_chapter.xml @@ -661,10 +661,20 @@ do_log(Fd,Log,#{formatter:={FModule,FConfig}}) -> </item> </taglist> - <p>For the overload protection algorithm to work properly, it is a - requirement that:</p> + <p>For the overload protection algorithm to work properly, it is + required that:</p> - <p><c>toggle_sync_qlen < drop_new_reqs_qlen < flush_reqs_qlen</c></p> + <p><c>toggle_sync_qlen =< drop_new_reqs_qlen =< flush_reqs_qlen</c></p> + + <p>and that:</p> + + <p><c>drop_new_reqs_qlen > 1</c></p> + + <p>If <c>toggle_sync_qlen</c> is set to <c>0</c>, the handler will handle all + requests synchronously. Setting the value of <c>toggle_sync_qlen</c> to the same + as <c>drop_new_reqs_qlen</c>, disables the synchronous mode. Likewise, setting + the value of <c>drop_new_reqs_qlen</c> to the same as <c>flush_reqs_qlen</c>, + disables the drop mode.</p> <p>During high load scenarios, the length of the handler message queue rarely grows in a linear and predictable way. Instead, whenever the diff --git a/lib/kernel/src/logger_disk_log_h.erl b/lib/kernel/src/logger_disk_log_h.erl index 0150fa781a..57c54ce27e 100644 --- a/lib/kernel/src/logger_disk_log_h.erl +++ b/lib/kernel/src/logger_disk_log_h.erl @@ -278,10 +278,11 @@ init([Name, Config = #{disk_log_opts := LogOpts}, last_log_ts => T0, burst_win_ts => T0, burst_msg_count => 0, + last_op => sync, prev_log_result => ok, prev_sync_result => ok, prev_disk_log_info => undefined}), - gen_server:cast(self(), {repeated_disk_log_sync,T0}), + gen_server:cast(self(), repeated_disk_log_sync), enter_loop(Config, State1); Error -> logger_h_common:error_notify({open_disk_log,Name,Error}), @@ -316,8 +317,7 @@ handle_call(disk_log_sync, _From, State = #{id := Name}) -> {reply, Result, State1}; handle_call({change_config,_OldConfig,NewConfig}, _From, - State = #{filesync_repeat_interval := FSyncInt0, - last_log_ts := LastLogTS}) -> + State = #{filesync_repeat_interval := FSyncInt0}) -> HConfig = maps:get(?MODULE, NewConfig, #{}), State1 = #{toggle_sync_qlen := TSQL, drop_new_reqs_qlen := DNRQL, @@ -338,9 +338,8 @@ handle_call({change_config,_OldConfig,NewConfig}, _From, _ = logger_h_common:cancel_timer(maps:get(rep_sync_tref, State, undefined)), - _ = gen_server:cast(self(), {repeated_disk_log_sync, - LastLogTS}) - end, + _ = gen_server:cast(self(), repeated_disk_log_sync) + end, {reply, ok, State1}; false -> {reply, {error,{invalid_levels,{TSQL,DNRQL,FRQL}}}, State} @@ -370,24 +369,23 @@ handle_cast({log, Bin}, State) -> %% clause gets called repeatedly by the handler. In order to %% guarantee that a filesync *always* happens after the last log %% request, the repeat operation must be active! -handle_cast({repeated_disk_log_sync,LastLogTS0}, +handle_cast(repeated_disk_log_sync, State = #{id := Name, filesync_repeat_interval := FSyncInt, - last_log_ts := LastLogTS1}) -> + last_op := LastOp}) -> State1 = if is_integer(FSyncInt) -> %% only do filesync if something has been %% written since last time we checked - NewState = if LastLogTS1 == LastLogTS0 -> + NewState = if LastOp == sync -> State; true -> disk_log_sync(Name, State) end, {ok,TRef} = timer:apply_after(FSyncInt, gen_server,cast, - [self(), - {repeated_disk_log_sync,LastLogTS1}]), - NewState#{rep_sync_tref => TRef}; + [self(),repeated_disk_log_sync]), + NewState#{rep_sync_tref => TRef, last_op => sync}; true -> State end, @@ -649,10 +647,9 @@ close_disk_log(Name, _) -> ok. disk_log_write(Name, Bin, State) -> - Result = case ?disk_log_blog(Name, Bin) of ok -> - ok; + State#{prev_log_result => ok, last_op => write}; LogError -> _ = case maps:get(prev_log_result, State) of LogError -> @@ -664,29 +661,26 @@ disk_log_write(Name, Bin, State) -> LogOpts, LogError}) end, - LogError - end, - State#{prev_log_result => Result}. + State#{prev_log_result => LogError} + end. disk_log_sync(Name, State) -> - Result = - case ?disk_log_sync(Name) of - ok -> - ok; - SyncError -> - _ = case maps:get(prev_sync_result, State) of - SyncError -> - %% don't report same error twice - ok; - _ -> - LogOpts = maps:get(log_opts, State), - logger_h_common:error_notify({Name,sync, - LogOpts, - SyncError}) - end, - SyncError - end, - State#{prev_sync_result => Result}. + case ?disk_log_sync(Name) of + ok -> + State#{prev_sync_result => ok, last_op => sync}; + SyncError -> + _ = case maps:get(prev_sync_result, State) of + SyncError -> + %% don't report same error twice + ok; + _ -> + LogOpts = maps:get(log_opts, State), + logger_h_common:error_notify({Name,sync, + LogOpts, + SyncError}) + end, + State#{prev_sync_result => SyncError} + end. error_notify_new(Info,Info, _Term) -> ok; diff --git a/lib/kernel/src/logger_h_common.erl b/lib/kernel/src/logger_h_common.erl index 7caad366ae..901c4c0dad 100644 --- a/lib/kernel/src/logger_h_common.erl +++ b/lib/kernel/src/logger_h_common.erl @@ -135,7 +135,8 @@ call_cast_or_drop(Name, Bin) -> _:{timeout,_} -> ?observe(Name,{dropped,1}) end; - drop -> ?observe(Name,{dropped,1}) + drop -> + ?observe(Name,{dropped,1}) catch %% if the ETS table doesn't exist (maybe because of a %% handler restart), we can only drop the request @@ -152,12 +153,15 @@ check_load(State = #{id:=Name, mode := Mode, flush_reqs_qlen := FlushQLen}) -> {_,Mem} = process_info(self(), memory), ?observe(Name,{max_mem,Mem}), - %% make sure the handler process doesn't get scheduled - %% out between the message_queue_len check below and the - %% action that follows (flush or write). {_,QLen} = process_info(self(), message_queue_len), ?observe(Name,{max_qlen,QLen}), - + %% When the handler process gets scheduled in, it's impossible + %% to predict the QLen. We could jump "up" arbitrarily from say + %% async to sync, async to drop, sync to flush, etc. However, when + %% the handler process manages the log requests (without flushing), + %% one after the other, we will move "down" from drop to sync and + %% from sync to async. This way we don't risk getting stuck in + %% drop or sync mode with an empty mailbox. {Mode1,_NewDrops,_NewFlushes} = if QLen >= FlushQLen -> @@ -292,7 +296,7 @@ overload_levels_ok(HandlerConfig) -> TSQL = maps:get(toggle_sync_qlen, HandlerConfig, ?TOGGLE_SYNC_QLEN), DNRQL = maps:get(drop_new_reqs_qlen, HandlerConfig, ?DROP_NEW_REQS_QLEN), FRQL = maps:get(flush_reqs_qlen, HandlerConfig, ?FLUSH_REQS_QLEN), - (TSQL < DNRQL) andalso (DNRQL < FRQL). + (DNRQL > 1) andalso (TSQL =< DNRQL) andalso (DNRQL =< FRQL). error_notify(Term) -> ?internal_log(error, Term). diff --git a/lib/kernel/src/logger_h_common.hrl b/lib/kernel/src/logger_h_common.hrl index 89378dbb10..ed365ce6eb 100644 --- a/lib/kernel/src/logger_h_common.hrl +++ b/lib/kernel/src/logger_h_common.hrl @@ -124,7 +124,7 @@ %%% slow down execution and therefore should not be include in code %%% to be officially released. -%% -define(TEST_HOOKS, true). +-define(TEST_HOOKS, true). -ifdef(TEST_HOOKS). -define(TEST_HOOKS_TAB, logger_h_test_hooks). diff --git a/lib/kernel/src/logger_std_h.erl b/lib/kernel/src/logger_std_h.erl index 31edcfea8b..e5e0febc88 100644 --- a/lib/kernel/src/logger_std_h.erl +++ b/lib/kernel/src/logger_std_h.erl @@ -257,10 +257,11 @@ init([Name, Config, file_ctrl_sync => FileCtrlSyncInt, last_qlen => 0, last_log_ts => T0, + last_op => sync, burst_win_ts => T0, burst_msg_count => 0}), proc_lib:init_ack({ok,self()}), - gen_server:cast(self(), {repeated_filesync,T0}), + gen_server:cast(self(), repeated_filesync), enter_loop(Config, State1); Error -> logger_h_common:error_notify({init_handler,Name,Error}), @@ -310,12 +311,11 @@ handle_call(filesync, _From, State = #{type := Type, if is_atom(Type) -> {reply, ok, State}; true -> - {reply, file_ctrl_filesync_sync(FileCtrlPid), State} + {reply, file_ctrl_filesync_sync(FileCtrlPid), State#{last_op=>sync}} end; handle_call({change_config,_OldConfig,NewConfig}, _From, - State = #{filesync_repeat_interval := FSyncInt0, - last_log_ts := LastLogTS}) -> + State = #{filesync_repeat_interval := FSyncInt0}) -> HConfig = maps:get(?MODULE, NewConfig, #{}), State1 = maps:merge(State, HConfig), case logger_h_common:overload_levels_ok(State1) of @@ -334,8 +334,7 @@ handle_call({change_config,_OldConfig,NewConfig}, _From, _ = logger_h_common:cancel_timer(maps:get(rep_sync_tref, State, undefined)), - gen_server:cast(self(), {repeated_filesync, - LastLogTS}) + gen_server:cast(self(), repeated_filesync) end, {reply, ok, State1}; false -> @@ -365,24 +364,24 @@ handle_cast({log, Bin}, State) -> %% clause gets called repeatedly by the handler. In order to %% guarantee that a filesync *always* happens after the last log %% request, the repeat operation must be active! -handle_cast({repeated_filesync,LastLogTS0}, +handle_cast(repeated_filesync, State = #{type := Type, file_ctrl_pid := FileCtrlPid, filesync_repeat_interval := FSyncInt, - last_log_ts := LastLogTS1}) -> + last_op := LastOp}) -> State1 = if not is_atom(Type), is_integer(FSyncInt) -> %% only do filesync if something has been %% written since last time we checked - if LastLogTS1 == LastLogTS0 -> + if LastOp == sync -> ok; true -> file_ctrl_filesync_async(FileCtrlPid) end, {ok,TRef} = timer:apply_after(FSyncInt, gen_server,cast, - [self(),{repeated_filesync,LastLogTS1}]), - State#{rep_sync_tref => TRef}; + [self(),repeated_filesync]), + State#{rep_sync_tref => TRef, last_op => sync}; true -> State end, @@ -600,6 +599,7 @@ write(Name, Mode, T1, Bin, _CallOrCast, State1#{mode => Mode1, last_qlen := LastQLen1, last_log_ts => T1, + last_op => write, burst_win_ts => BurstWinT, burst_msg_count => BurstMsgCount1, file_ctrl_sync => diff --git a/lib/kernel/test/logger_disk_log_h_SUITE.erl b/lib/kernel/test/logger_disk_log_h_SUITE.erl index 63e5b56021..7c33c9130c 100644 --- a/lib/kernel/test/logger_disk_log_h_SUITE.erl +++ b/lib/kernel/test/logger_disk_log_h_SUITE.erl @@ -369,10 +369,18 @@ config_fail(_Config) -> #{logger_disk_log_h => #{bad => bad}, filter_default=>log, formatter=>{?MODULE,self()}}), - {error,{handler_not_added,{invalid_levels,{42,42,_}}}} = + + {error,{handler_not_added,{invalid_levels,{_,1,_}}}} = + logger:add_handler(?MODULE,logger_disk_log_h, + #{logger_disk_log_h => #{drop_new_reqs_qlen=>1}}), + {error,{handler_not_added,{invalid_levels,{43,42,_}}}} = logger:add_handler(?MODULE,logger_disk_log_h, - #{logger_disk_log_h => #{toggle_sync_qlen=>42, + #{logger_disk_log_h => #{toggle_sync_qlen=>43, drop_new_reqs_qlen=>42}}), + {error,{handler_not_added,{invalid_levels,{_,43,42}}}} = + logger:add_handler(?MODULE,logger_disk_log_h, + #{logger_disk_log_h => #{drop_new_reqs_qlen=>43, + flush_reqs_qlen=>42}}), ok = logger:add_handler(?MODULE,logger_disk_log_h, #{filter_default=>log, @@ -848,62 +856,115 @@ internal_log(Type, Term) -> op_switch_to_sync(Config) -> {Log,HConfig,DLHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), + NumOfReqs = 500, NewHConfig = - HConfig#{logger_disk_log_h => DLHConfig#{toggle_sync_qlen => 3, - drop_new_reqs_qlen => 501, - flush_reqs_qlen => 2000, + HConfig#{logger_disk_log_h => DLHConfig#{toggle_sync_qlen => 2, + drop_new_reqs_qlen => NumOfReqs+1, + flush_reqs_qlen => 2*NumOfReqs, enable_burst_limit => false}}, ok = logger:set_handler_config(?MODULE, NewHConfig), - NumOfReqs = 500, send_burst({n,NumOfReqs}, seq, {chars,79}, info), - NumOfReqs = count_lines(Log), - ok = file:delete(Log). + Lines = count_lines(Log), + ok = file:delete(Log), + NumOfReqs = Lines, + ok. op_switch_to_sync(cleanup, _Config) -> ok = stop_handler(?MODULE). +op_switch_to_drop() -> + [{timetrap,{seconds,180}}]. op_switch_to_drop(Config) -> - {Log,HConfig,DLHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), - - NewHConfig = - HConfig#{logger_disk_log_h => DLHConfig#{toggle_sync_qlen => 2, - drop_new_reqs_qlen => 3, - flush_reqs_qlen => 600, - enable_burst_limit => false}}, - ok = logger:set_handler_config(?MODULE, NewHConfig), - NumOfReqs = 500, - send_burst({n,NumOfReqs}, seq, {chars,79}, info), - Logged = count_lines(Log), - ct:pal("Number of messages dropped = ~w (~w)", - [NumOfReqs-Logged,NumOfReqs]), - true = (Logged < NumOfReqs), - ok = file:delete(Log). + Test = + fun() -> + {Log,HConfig,DLHConfig} = + start_handler(?MODULE, ?FUNCTION_NAME, Config), + NumOfReqs = 300, + Procs = 2, + Bursts = 10, + NewHConfig = + HConfig#{logger_disk_log_h => + DLHConfig#{toggle_sync_qlen => 1, + drop_new_reqs_qlen => 2, + flush_reqs_qlen => Procs*NumOfReqs*Bursts, + enable_burst_limit => false}}, + ok = logger:set_handler_config(?MODULE, NewHConfig), + %% It sometimes happens that the handler either gets + %% the requests in a slow enough pace so that dropping + %% never occurs. Therefore, lets generate a number of + %% bursts to increase the chance of message buildup. + [send_burst({n,NumOfReqs}, {spawn,Procs,0}, {chars,79}, info) || + _ <- lists:seq(1, Bursts)], + Logged = count_lines(Log), + ok= stop_handler(?MODULE), + _ = file:delete(Log), + ct:pal("Number of messages dropped = ~w (~w)", + [Procs*NumOfReqs*Bursts-Logged,Procs*NumOfReqs*Bursts]), + true = (Logged < (Procs*NumOfReqs*Bursts)), + true = (Logged > 0), + ok + end, + %% As it's tricky to get the timing right in only one go, we perform the + %% test repeatedly, hoping that will generate a successful result. + case repeat_until_ok(Test, 10) of + {ok,{Failures,_Result}} -> + ct:log("Failed ~w times before success!", [Failures]); + {fails,Reason} -> + ct:fail(Reason) + end. op_switch_to_drop(cleanup, _Config) -> - ok = stop_handler(?MODULE). + _ = stop_handler(?MODULE). op_switch_to_flush() -> [{timetrap,{minutes,3}}]. op_switch_to_flush(Config) -> - {Log,HConfig,DLHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), - - %% it's important that both async and sync requests have been queued - %% when the flush happens (verify with coverage of flush_log_requests/2) + Test = + fun() -> + {Log,HConfig,DLHConfig} = + start_handler(?MODULE, ?FUNCTION_NAME, Config), + + %% NOTE: it's important that both async and sync + %% requests have been queued when the flush happens + %% (verify with coverage of flush_log_requests/2) - NewHConfig = - HConfig#{logger_disk_log_h => DLHConfig#{toggle_sync_qlen => 2, - drop_new_reqs_qlen => 99, - flush_reqs_qlen => 100, - enable_burst_limit => false}}, - ok = logger:set_handler_config(?MODULE, NewHConfig), - NumOfReqs = 1000, - Procs = 500, - send_burst({n,NumOfReqs}, {spawn,Procs,0}, {chars,79}, info), - Logged = count_lines(Log), - ct:pal("Number of messages flushed/dropped = ~w (~w)", - [(NumOfReqs*Procs)-Logged,NumOfReqs*Procs]), - true = (Logged < (NumOfReqs*Procs)), - ok = file:delete(Log). + NewHConfig = + HConfig#{logger_disk_log_h => + DLHConfig#{toggle_sync_qlen => 2, + %% disable drop mode + drop_new_reqs_qlen => 300, + flush_reqs_qlen => 300, + enable_burst_limit => false}}, + ok = logger:set_handler_config(?MODULE, NewHConfig), + NumOfReqs = 1500, + Procs = 10, + Bursts = 10, + %% It sometimes happens that the handler either gets + %% the requests in a slow enough pace so that flushing + %% never occurs, or it gets all messages at once, + %% causing all messages to get flushed (no dropping of + %% sync messages gets tested). Therefore, lets + %% generate a number of bursts to increase the chance + %% of message buildup in some random fashion. + [send_burst({n,NumOfReqs}, {spawn,Procs,0}, {chars,79}, info) || + _ <- lists:seq(1,Bursts)], + Logged = count_lines(Log), + ok= stop_handler(?MODULE), + _ = file:delete(Log), + ct:pal("Number of messages flushed/dropped = ~w (~w)", + [NumOfReqs*Procs*Bursts-Logged,NumOfReqs*Procs*Bursts]), + true = (Logged < (NumOfReqs*Procs*Bursts)), + true = (Logged > 0), + ok + end, + %% As it's tricky to get the timing right in only one go, we perform the + %% test repeatedly, hoping that will generate a successful result. + case repeat_until_ok(Test, 10) of + {ok,{Failures,_Result}} -> + ct:log("Failed ~w times before success!", [Failures]); + {fails,Reason} -> + ct:fail(Reason) + end. op_switch_to_flush(cleanup, _Config) -> - ok = stop_handler(?MODULE). + _ = stop_handler(?MODULE). limit_burst_disabled(Config) -> @@ -987,7 +1048,7 @@ qlen_kill_new(Config) -> {_Log,HConfig,DLHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), Pid0 = whereis(?MODULE), {_,Mem0} = process_info(Pid0, memory), - RestartAfter = 2000, + RestartAfter = ?HANDLER_RESTART_AFTER, NewHConfig = HConfig#{logger_disk_log_h => DLHConfig#{enable_kill_overloaded=>true, @@ -1008,7 +1069,7 @@ qlen_kill_new(Config) -> killed -> ct:pal("Slow shutdown, handler process was killed!", []) end, - timer:sleep(RestartAfter + 1000), + timer:sleep(RestartAfter + 2000), true = is_pid(whereis(?MODULE)), ok after @@ -1024,7 +1085,7 @@ mem_kill_new(Config) -> {_Log,HConfig,DLHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), Pid0 = whereis(?MODULE), {_,Mem0} = process_info(Pid0, memory), - RestartAfter = 2000, + RestartAfter = ?HANDLER_RESTART_AFTER, NewHConfig = HConfig#{logger_disk_log_h => DLHConfig#{enable_kill_overloaded=>true, @@ -1045,7 +1106,7 @@ mem_kill_new(Config) -> killed -> ct:pal("Slow shutdown, handler process was killed!", []) end, - timer:sleep(RestartAfter * 2), + timer:sleep(RestartAfter + 2000), true = is_pid(whereis(?MODULE)), ok after @@ -1078,7 +1139,7 @@ restart_after(Config) -> end, {Log,_,_} = start_handler(?MODULE, ?FUNCTION_NAME, Config), - RestartAfter = 2000, + RestartAfter = ?HANDLER_RESTART_AFTER, NewHConfig2 = HConfig#{logger_disk_log_h=>DLHConfig#{enable_kill_overloaded=>true, handler_overloaded_qlen=>10, @@ -1090,7 +1151,7 @@ restart_after(Config) -> send_burst({n,100}, {spawn,2,0}, {chars,79}, info), receive {'DOWN', MRef2, _, _, _Info2} -> - timer:sleep(RestartAfter + 1000), + timer:sleep(RestartAfter + 2000), Pid1 = whereis(?MODULE), true = is_pid(Pid1), false = (Pid1 == Pid0), @@ -1361,6 +1422,29 @@ count_lines1(File) -> file:close(Dev), Lines. +repeat_until_ok(Fun, N) -> + repeat_until_ok(Fun, 0, N, undefined). + +repeat_until_ok(_Fun, Stop, Stop, Reason) -> + {fails,Reason}; + +repeat_until_ok(Fun, C, Stop, FirstReason) -> + if C > 0 -> timer:sleep(5000); + true -> ok + end, + try Fun() of + Result -> + {ok,{C,Result}} + catch + _:Reason:Stack -> + ct:pal("Test fails: ~p (~p)~n", [Reason,hd(Stack)]), + if FirstReason == undefined -> + repeat_until_ok(Fun, C+1, Stop, {Reason,Stack}); + true -> + repeat_until_ok(Fun, C+1, Stop, FirstReason) + end + end. + start_tracer(Trace,Expected) -> Pid = self(), dbg:tracer(process,{fun tracer/2,{Pid,Expected}}), @@ -1382,7 +1466,8 @@ tpl([{M,F,A}|Trace]) -> tpl([]) -> ok. -tracer({trace,_,call,{logger_disk_log_h,handle_cast,[{Op,_}|_]}}, {Pid,[{Mod,Func,Op}|Expected]}) -> +tracer({trace,_,call,{logger_disk_log_h,handle_cast,[Op|_]}}, + {Pid,[{Mod,Func,Op}|Expected]}) -> maybe_tracer_done(Pid,Expected,{Mod,Func,Op}); tracer({trace,_,call,{Mod=disk_log,Func=blog,[_,Data]}}, {Pid,[{Mod,Func,Data}|Expected]}) -> maybe_tracer_done(Pid,Expected,{Mod,Func,Data}); diff --git a/lib/kernel/test/logger_std_h_SUITE.erl b/lib/kernel/test/logger_std_h_SUITE.erl index 7c8d63cbbd..34c3167960 100644 --- a/lib/kernel/test/logger_std_h_SUITE.erl +++ b/lib/kernel/test/logger_std_h_SUITE.erl @@ -289,10 +289,17 @@ config_fail(_Config) -> #{logger_std_h => #{restart_type => bad}, filter_default=>log, formatter=>{?MODULE,self()}}), - {error,{handler_not_added,{invalid_levels,{42,42,_}}}} = + {error,{handler_not_added,{invalid_levels,{_,1,_}}}} = logger:add_handler(?MODULE,logger_std_h, - #{logger_std_h => #{toggle_sync_qlen=>42, + #{logger_std_h => #{drop_new_reqs_qlen=>1}}), + {error,{handler_not_added,{invalid_levels,{43,42,_}}}} = + logger:add_handler(?MODULE,logger_std_h, + #{logger_std_h => #{toggle_sync_qlen=>43, drop_new_reqs_qlen=>42}}), + {error,{handler_not_added,{invalid_levels,{_,43,42}}}} = + logger:add_handler(?MODULE,logger_std_h, + #{logger_std_h => #{drop_new_reqs_qlen=>43, + flush_reqs_qlen=>42}}), ok = logger:add_handler(?MODULE,logger_std_h, #{filter_default=>log, @@ -691,16 +698,17 @@ internal_log(Type, Term) -> op_switch_to_sync_file(Config) -> {Log,HConfig,StdHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), + NumOfReqs = 500, NewHConfig = - HConfig#{logger_std_h => StdHConfig#{toggle_sync_qlen => 3, - drop_new_reqs_qlen => 501, - flush_reqs_qlen => 2000, + HConfig#{logger_std_h => StdHConfig#{toggle_sync_qlen => 2, + drop_new_reqs_qlen => NumOfReqs+1, + flush_reqs_qlen => 2*NumOfReqs, enable_burst_limit => false}}, ok = logger:set_handler_config(?MODULE, NewHConfig), %% TRecvPid = start_op_trace(), - NumOfReqs = 500, send_burst({n,NumOfReqs}, seq, {chars,79}, info), - NumOfReqs = count_lines(Log), + Lines = count_lines(Log), + ok = file:delete(Log), %% true = analyse_trace(TRecvPid, %% fun(Events) -> find_mode(async,Events) end), %% true = analyse_trace(TRecvPid, @@ -711,68 +719,82 @@ op_switch_to_sync_file(Config) -> %% fun(Events) -> find_mode(drop,Events) end), %% false = analyse_trace(TRecvPid, %% fun(Events) -> find_mode(flush,Events) end), - ok = file:delete(Log), %% stop_op_trace(TRecvPid), + NumOfReqs = Lines, ok. op_switch_to_sync_file(cleanup, _Config) -> ok = stop_handler(?MODULE). op_switch_to_sync_tty(Config) -> {HConfig,StdHConfig} = start_handler(?MODULE, standard_io, Config), + NumOfReqs = 500, NewHConfig = HConfig#{logger_std_h => StdHConfig#{toggle_sync_qlen => 3, - drop_new_reqs_qlen => 501, - flush_reqs_qlen => 2000, + drop_new_reqs_qlen => NumOfReqs+1, + flush_reqs_qlen => 2*NumOfReqs, enable_burst_limit => false}}, ok = logger:set_handler_config(?MODULE, NewHConfig), - NumOfReqs = 500, send_burst({n,NumOfReqs}, seq, {chars,79}, info), ok. op_switch_to_sync_tty(cleanup, _Config) -> ok = stop_handler(?MODULE). +op_switch_to_drop_file() -> + [{timetrap,{seconds,180}}]. op_switch_to_drop_file(Config) -> - {Log,HConfig,StdHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), - - NewHConfig = - HConfig#{logger_std_h => StdHConfig#{toggle_sync_qlen => 2, - drop_new_reqs_qlen => 3, - flush_reqs_qlen => 600, + Test = + fun() -> + {Log,HConfig,StdHConfig} = + start_handler(?MODULE, ?FUNCTION_NAME, Config), + NumOfReqs = 300, + Procs = 2, + Bursts = 10, + NewHConfig = + HConfig#{logger_std_h => + StdHConfig#{toggle_sync_qlen => 1, + drop_new_reqs_qlen => 2, + flush_reqs_qlen => + Procs*NumOfReqs*Bursts, enable_burst_limit => false}}, - ok = logger:set_handler_config(?MODULE, NewHConfig), - %% TRecvPid = start_op_trace(), - NumOfReqs = 500, - send_burst({n,NumOfReqs}, seq, {chars,79}, info), - Logged = count_lines(Log), - ct:pal("Number of messages dropped = ~w (~w)", - [NumOfReqs-Logged,NumOfReqs]), - true = (Logged < NumOfReqs), - %% true = analyse_trace(TRecvPid, - %% fun(Events) -> find_mode(async,Events) end), - %% true = analyse_trace(TRecvPid, - %% fun(Events) -> find_mode(drop,Events) end), - %% false = analyse_trace(TRecvPid, - %% fun(Events) -> find_mode(flush,Events) end), - %% true = analyse_trace(TRecvPid, - %% fun(Events) -> find_switch(async,drop,Events) - %% orelse find_switch(sync,drop,Events) - %% end), - ok = file:delete(Log), - %% stop_op_trace(TRecvPid), - ok. + ok = logger:set_handler_config(?MODULE, NewHConfig), + %% It sometimes happens that the handler gets the + %% requests in a slow enough pace so that dropping + %% never occurs. Therefore, lets generate a number of + %% bursts to increase the chance of message buildup. + [send_burst({n,NumOfReqs}, {spawn,Procs,0}, {chars,79}, info) || + _ <- lists:seq(1, Bursts)], + Logged = count_lines(Log), + ok = stop_handler(?MODULE), + _ = file:delete(Log), + ct:pal("Number of messages dropped = ~w (~w)", + [Procs*NumOfReqs*Bursts-Logged,Procs*NumOfReqs*Bursts]), + true = (Logged < (Procs*NumOfReqs*Bursts)), + true = (Logged > 0), + ok + end, + %% As it's tricky to get the timing right in only one go, we perform the + %% test repeatedly, hoping that will generate a successful result. + case repeat_until_ok(Test, 10) of + {ok,{Failures,_Result}} -> + ct:log("Failed ~w times before success!", [Failures]); + {fails,Reason} -> + ct:fail(Reason) + end. op_switch_to_drop_file(cleanup, _Config) -> - ok = stop_handler(?MODULE). + _ = stop_handler(?MODULE). op_switch_to_drop_tty(Config) -> {HConfig,StdHConfig} = start_handler(?MODULE, standard_io, Config), + NumOfReqs = 300, + Procs = 2, NewHConfig = - HConfig#{logger_std_h => StdHConfig#{toggle_sync_qlen => 2, - drop_new_reqs_qlen => 3, - flush_reqs_qlen => 600, + HConfig#{logger_std_h => StdHConfig#{toggle_sync_qlen => 1, + drop_new_reqs_qlen => 2, + flush_reqs_qlen => + Procs*NumOfReqs+1, enable_burst_limit => false}}, ok = logger:set_handler_config(?MODULE, NewHConfig), - NumOfReqs = 500, - send_burst({n,NumOfReqs}, seq, {chars,79}, info), + send_burst({n,NumOfReqs}, {spawn,Procs,0}, {chars,79}, info), ok. op_switch_to_drop_tty(cleanup, _Config) -> ok = stop_handler(?MODULE). @@ -780,32 +802,54 @@ op_switch_to_drop_tty(cleanup, _Config) -> op_switch_to_flush_file() -> [{timetrap,{minutes,3}}]. op_switch_to_flush_file(Config) -> - {Log,HConfig,StdHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), - - %% it's important that both async and sync requests have been queued - %% when the flush happens (verify with coverage of flush_log_requests/2) - - NewHConfig = - HConfig#{logger_std_h => StdHConfig#{toggle_sync_qlen => 2, - drop_new_reqs_qlen => 99, - flush_reqs_qlen => 100, + Test = + fun() -> + {Log,HConfig,StdHConfig} = + start_handler(?MODULE, ?FUNCTION_NAME, Config), + + %% NOTE: it's important that both async and sync + %% requests have been queued when the flush happens + %% (verify with coverage of flush_log_requests/2) + + NewHConfig = + HConfig#{logger_std_h => + StdHConfig#{toggle_sync_qlen => 2, + %% disable drop mode + drop_new_reqs_qlen => 300, + flush_reqs_qlen => 300, enable_burst_limit => false}}, - ok = logger:set_handler_config(?MODULE, NewHConfig), - NumOfReqs = 10000, - Procs = 100, - send_burst({n,NumOfReqs}, {spawn,Procs,0}, {chars,79}, info), - Logged = count_lines(Log), - ct:pal("Number of messages flushed/dropped = ~w (~w)", - [(NumOfReqs*Procs)-Logged,NumOfReqs*Procs]), - true = (Logged < (NumOfReqs*Procs)), - - %%! --- Thu Apr 12 13:46:00 2018 --- peppe was here! - %%! TODO: Verify that handler has switched to flush mode - - ok = file:delete(Log), - ok. + ok = logger:set_handler_config(?MODULE, NewHConfig), + NumOfReqs = 1500, + Procs = 10, + Bursts = 10, + %% It sometimes happens that the handler either gets + %% the requests in a slow enough pace so that flushing + %% never occurs, or it gets all messages at once, + %% causing all messages to get flushed (no dropping of + %% sync messages gets tested). Therefore, lets + %% generate a number of bursts to increase the chance + %% of message buildup in some random fashion. + [send_burst({n,NumOfReqs}, {spawn,Procs,0}, {chars,79}, info) || + _ <- lists:seq(1,Bursts)], + Logged = count_lines(Log), + ok = stop_handler(?MODULE), + _ = file:delete(Log), + ct:pal("Number of messages flushed/dropped = ~w (~w)", + [NumOfReqs*Procs*Bursts-Logged,NumOfReqs*Procs*Bursts]), + true = (Logged < (NumOfReqs*Procs*Bursts)), + true = (Logged > 0), + ok + end, + %% As it's tricky to get the timing right in only one go, we perform the + %% test repeatedly, hoping that will generate a successful result. + case repeat_until_ok(Test, 10) of + {ok,{Failures,_Result}} -> + ct:log("Failed ~w times before success!", [Failures]); + {fails,Reason} -> + ct:fail(Reason) + end. op_switch_to_flush_file(cleanup, _Config) -> - ok = stop_handler(?MODULE). + _ = stop_handler(?MODULE). op_switch_to_flush_tty(Config) -> {HConfig,StdHConfig} = start_handler(?MODULE, standard_io, Config), @@ -815,12 +859,13 @@ op_switch_to_flush_tty(Config) -> NewHConfig = HConfig#{logger_std_h => StdHConfig#{toggle_sync_qlen => 2, - drop_new_reqs_qlen => 99, + %% disable drop mode + drop_new_reqs_qlen => 100, flush_reqs_qlen => 100, enable_burst_limit => false}}, ok = logger:set_handler_config(?MODULE, NewHConfig), - NumOfReqs = 10000, - Procs = 10, + NumOfReqs = 1000, + Procs = 100, send_burst({n,NumOfReqs}, {spawn,Procs,0}, {chars,79}, info), ok. op_switch_to_flush_tty(cleanup, _Config) -> @@ -904,10 +949,10 @@ kill_disabled(cleanup, _Config) -> ok = stop_handler(?MODULE). qlen_kill_new(Config) -> - {Log,HConfig,StdHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), + {_Log,HConfig,StdHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), Pid0 = whereis(?MODULE), {_,Mem0} = process_info(Pid0, memory), - RestartAfter = 2000, + RestartAfter = ?HANDLER_RESTART_AFTER, NewHConfig = HConfig#{logger_std_h=>StdHConfig#{enable_kill_overloaded=>true, handler_overloaded_qlen=>10, @@ -927,7 +972,7 @@ qlen_kill_new(Config) -> killed -> ct:pal("Slow shutdown, handler process was killed!", []) end, - timer:sleep(RestartAfter + 1000), + timer:sleep(RestartAfter + 2000), true = is_pid(whereis(?MODULE)), ok after @@ -941,7 +986,7 @@ qlen_kill_new(cleanup, _Config) -> %% choke the standard handler on remote node to verify the termination %% works as expected -qlen_kill_std(Config) -> +qlen_kill_std(_Config) -> %%! HERE %% Dir = ?config(priv_dir, Config), %% File = lists:concat([?MODULE,"_",?FUNCTION_NAME,".log"]), @@ -955,10 +1000,10 @@ qlen_kill_std(Config) -> {skip,"Not done yet"}. mem_kill_new(Config) -> - {Log,HConfig,StdHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), + {_Log,HConfig,StdHConfig} = start_handler(?MODULE, ?FUNCTION_NAME, Config), Pid0 = whereis(?MODULE), {_,Mem0} = process_info(Pid0, memory), - RestartAfter = 2000, + RestartAfter = ?HANDLER_RESTART_AFTER, NewHConfig = HConfig#{logger_std_h=>StdHConfig#{enable_kill_overloaded=>true, handler_overloaded_qlen=>50000, @@ -978,7 +1023,7 @@ mem_kill_new(Config) -> killed -> ct:pal("Slow shutdown, handler process was killed!", []) end, - timer:sleep(RestartAfter * 2), + timer:sleep(RestartAfter + 2000), true = is_pid(whereis(?MODULE)), ok after @@ -992,7 +1037,7 @@ mem_kill_new(cleanup, _Config) -> %% choke the standard handler on remote node to verify the termination %% works as expected -mem_kill_std(Config) -> +mem_kill_std(_Config) -> {skip,"Not done yet"}. restart_after(Config) -> @@ -1016,7 +1061,7 @@ restart_after(Config) -> end, {Log,_,_} = start_handler(?MODULE, ?FUNCTION_NAME, Config), - RestartAfter = 2000, + RestartAfter = ?HANDLER_RESTART_AFTER, NewHConfig2 = HConfig#{logger_std_h=>StdHConfig#{enable_kill_overloaded=>true, handler_overloaded_qlen=>10, @@ -1028,7 +1073,7 @@ restart_after(Config) -> send_burst({n,100}, {spawn,2,0}, {chars,79}, info), receive {'DOWN', MRef2, _, _, _Info2} -> - timer:sleep(RestartAfter + 1000), + timer:sleep(RestartAfter + 2000), Pid1 = whereis(?MODULE), true = is_pid(Pid1), false = (Pid1 == Pid0), @@ -1074,7 +1119,7 @@ handler_requests_under_load(Config) -> NoOfReqs = lists:foldl(fun({_,Res}, N) -> N + length(Res) end, 0, ReqResult), ct:pal("~w requests made. Errors: ~n~p", [NoOfReqs,Errors]), ok = file:delete(Log). -handler_requests_under_load(cleanup, Config) -> +handler_requests_under_load(cleanup, _Config) -> ok = stop_handler(?MODULE). send_requests(HName, TO, Reqs = [{Req,Res}|Rs]) -> @@ -1126,8 +1171,9 @@ start_handler(Name, FuncName, Config) -> {Log,HConfig,StdHConfig}. stop_handler(Name) -> - ok = logger:remove_handler(Name), - ct:pal("Handler ~p stopped!", [Name]). + R = logger:remove_handler(Name), + ct:pal("Handler ~p stopped! Result: ~p", [Name,R]), + R. count_lines(File) -> wait_until_written(File, -1), @@ -1299,6 +1345,30 @@ try_match_file(_,Pattern,_,Incorrect) -> [Pattern,Incorrect]), erlang:error({error,not_matching_pattern,Pattern,Incorrect}). +repeat_until_ok(Fun, N) -> + repeat_until_ok(Fun, 0, N, undefined). + +repeat_until_ok(_Fun, Stop, Stop, Reason) -> + {fails,Reason}; + +repeat_until_ok(Fun, C, Stop, FirstReason) -> + if C > 0 -> timer:sleep(5000); + true -> ok + end, + try Fun() of + Result -> + {ok,{C,Result}} + catch + _:Reason:Stack -> + ct:pal("Test fails: ~p (~p)~n", [Reason,hd(Stack)]), + if FirstReason == undefined -> + repeat_until_ok(Fun, C+1, Stop, {Reason,Stack}); + true -> + repeat_until_ok(Fun, C+1, Stop, FirstReason) + end + end. + + %%%----------------------------------------------------------------- %%% start_op_trace() -> @@ -1339,17 +1409,17 @@ find_mode(flush, Events) -> find_mode(Mode, Events) -> lists:keymember([{mode,Mode}], 3, Events). -find_switch(From, To, Events) -> - try lists:foldl(fun({trace_return,check_load,{To,_,_,_}}, - {trace_call,check_load,[#{mode := From}]}) -> - throw(match); - (Event, _) -> - Event - end, undefined, Events) of - _ -> false - catch - throw:match -> true - end. +%% find_switch(_From, To, Events) -> +%% try lists:foldl(fun({trace_return,check_load,{To,_,_,_}}, +%% {trace_call,check_load,[#{mode := From}]}) -> +%% throw(match); +%% (Event, _) -> +%% Event +%% end, undefined, Events) of +%% _ -> false +%% catch +%% throw:match -> true +%% end. analyse_trace(TRecvPid, TestFun) -> TRecvPid ! {test,self(),TestFun}, @@ -1411,7 +1481,7 @@ tpl([{M,F,A}|Trace]) -> tpl([]) -> ok. -tracer({trace,_,call,{logger_std_h,handle_cast,[{Op,_}|_]}}, +tracer({trace,_,call,{logger_std_h,handle_cast,[Op|_]}}, {Pid,[{Mod,Func,Op}|Expected]}) -> maybe_tracer_done(Pid,Expected,{Mod,Func,Op}); tracer({trace,_,call,{Mod=logger_std_h,Func=write_to_dev,[_,Data,_,_,_]}}, |