From b91eceaf71e0009346d119346a33894ed53be7b0 Mon Sep 17 00:00:00 2001
From: Peter Andersson
Date: Thu, 7 Jun 2018 10:59:04 +0200
Subject: [logger] Implement logging of handler overload status

Conflicts:
	lib/kernel/src/logger_disk_log_h.erl
	lib/kernel/src/logger_std_h.erl
---
 lib/kernel/src/logger_disk_log_h.erl |  89 ++++++++++++++++++++++--------
 lib/kernel/src/logger_h_common.erl   |  71 ++++++++++++++++--------
 lib/kernel/src/logger_std_h.erl      | 102 ++++++++++++++++++++++++-----------
 3 files changed, 184 insertions(+), 78 deletions(-)

diff --git a/lib/kernel/src/logger_disk_log_h.erl b/lib/kernel/src/logger_disk_log_h.erl
index d339957ac8..c3d2204630 100644
--- a/lib/kernel/src/logger_disk_log_h.erl
+++ b/lib/kernel/src/logger_disk_log_h.erl
@@ -37,6 +37,9 @@ adding_handler/1, removing_handler/1,
          changing_config/2, swap_buffer/2]).
 
+%% handler internal
+-export([log_handler_info/4]).
+
 %%%===================================================================
 %%% API
 %%%===================================================================
@@ -250,7 +253,7 @@ swap_buffer(Name, Buffer) ->
 log(LogEvent, Config = #{id := Name,
                          config := #{handler_pid := HPid,
                                      mode_tab := ModeTab}}) ->
-    %% if the handler has crashed, we must drop this request
+    %% if the handler has crashed, we must drop this event
     %% and hope the handler restarts so we can try again
    true = is_process_alive(HPid),
    Bin = logger_h_common:log_to_binary(LogEvent, Config),
@@ -295,6 +298,16 @@ init([Name, Config = #{config := HConfig, disk_log_opts := LogOpts},
                               mode_tab => ModeTab}},
         proc_lib:init_ack({ok,self(),Config1}),
         gen_server:cast(self(), repeated_disk_log_sync),
+        case logger_h_common:unset_restart_flag(Name, ?MODULE) of
+            true ->
+                %% inform about restart
+                gen_server:cast(self(), {log_handler_info,
+                                         "Handler ~p restarted",
+                                         [Name]});
+            false ->
+                %% initial start
+                ok
+        end,
         enter_loop(Config1, State1)
     catch
         _:Error ->
@@ -315,15 +328,13 @@ enter_loop(#{wait_for_buffer:=true}=Config,State) ->
                       Bin = logger_h_common:log_to_binary(Log,Config),
                       {_,S1} = do_log(Bin,cast,S),
                       S1
-              end,
-              State,
-              Buffer)
+              end, State, Buffer)
         end,
     gen_server:enter_loop(?MODULE,[],State1);
 enter_loop(_Config,State) ->
     gen_server:enter_loop(?MODULE,[],State).
 
-%% This is the synchronous log request.
+%% This is the synchronous log event.
 handle_call({log, Bin}, _From, State) ->
     {Result,State1} = do_log(Bin, call, State),
     %% Result == ok | dropped
@@ -377,15 +388,19 @@ handle_call(stop, _From, State) ->
     {stop, {shutdown,stopped}, ok, State}.
 
-%% This is the asynchronous log request.
+%% This is the asynchronous log event.
 handle_cast({log, Bin}, State) ->
     {_,State1} = do_log(Bin, cast, State),
     {noreply, State1};
 
+handle_cast({log_handler_info, Format, Args}, State = #{id:=Name}) ->
+    log_handler_info(Name, Format, Args, State),
+    {noreply, State};
+
 %% If FILESYNC_REPEAT_INTERVAL is set to a millisec value, this
 %% clause gets called repeatedly by the handler. In order to
 %% guarantee that a filesync *always* happens after the last log
-%% request, the repeat operation must be active!
+%% event, the repeat operation must be active!
 handle_cast(repeated_disk_log_sync,
             State = #{id := Name,
                       filesync_repeat_interval := FSyncInt,
@@ -526,21 +541,31 @@ stop(Name) ->
     if C == 0 -> Interval;
        true -> C-1 end).
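Note that the restart notice in init/1 above is not logged inline: it is cast to the handler itself, so it passes through the ordinary message queue and is written by the handler's own loop once startup has completed. A minimal, self-contained sketch of that pattern (the module name and the io:format stand-in for log_handler_info/4 are hypothetical, only the message shape matches the patch):

    -module(cast_to_self_sketch).
    -behaviour(gen_server).
    -export([start_link/0]).
    -export([init/1, handle_call/3, handle_cast/2]).

    start_link() ->
        gen_server:start_link(?MODULE, [], []).

    init([]) ->
        %% queued now, but delivered to handle_cast/2 only after init/1
        %% returns, so the notice is emitted by the handler's own loop
        gen_server:cast(self(), {log_handler_info,
                                 "Handler ~p restarted", [?MODULE]}),
        {ok, #{}}.

    handle_call(_Msg, _From, State) ->
        {reply, ok, State}.

    handle_cast({log_handler_info, Format, Args}, State) ->
        io:format("notice: " ++ Format ++ "~n", Args),
        {noreply, State}.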
-%% check for overload between every request (and set Mode to async,
+%% check for overload between every event (and set Mode to async,
 %% sync or drop accordingly), but never flush the whole mailbox
-%% before LogWindowSize requests have been handled
-do_log(Bin, CallOrCast, State = #{id:=Name, mode := _Mode0}) ->
+%% before LogWindowSize events have been handled
+do_log(Bin, CallOrCast, State = #{id:=Name, mode := Mode0}) ->
     T1 = ?timestamp(),
 
     %% check if the handler is getting overloaded, or if it's
     %% recovering from overload (the check must be done for each
-    %% request to react quickly to large bursts of requests and
+    %% event to react quickly to large bursts of events and
     %% to ensure that the handler can never end up in drop mode
     %% with an empty mailbox, which would stop operation)
     {Mode1,QLen,Mem,State1} = logger_h_common:check_load(State),
 
+    if (Mode1 == drop) andalso (Mode0 =/= drop) ->
+            log_handler_info(Name, "Handler ~p switched to drop mode",
+                             [Name], State);
+       (Mode0 == drop) andalso ((Mode1 == async) orelse (Mode1 == sync)) ->
+            log_handler_info(Name, "Handler ~p switched to ~w mode",
+                             [Name,Mode1], State);
+       true ->
+            ok
+    end,
+
     %% kill the handler if it can't keep up with the load
-    logger_h_common:kill_if_choked(Name, QLen, Mem, State),
+    logger_h_common:kill_if_choked(Name, QLen, Mem, ?MODULE, State),
 
     if Mode1 == flush ->
             flush(Name, QLen, T1, State1);
@@ -550,22 +575,26 @@ do_log(Bin, CallOrCast, State = #{id:=Name, mode := _Mode0}) ->
 
 %% this function is called by do_log/3 after an overload check
 %% has been performed, where QLen > FlushQLen
-flush(_Name, _QLen0, T1, State=#{last_log_ts := _T0, mode_tab := ModeTab}) ->
+flush(Name, _QLen0, T1, State=#{last_log_ts := _T0, mode_tab := ModeTab}) ->
     %% flush messages in the mailbox (a limited number in
     %% order to not cause long delays)
-    _NewFlushed = logger_h_common:flush_log_requests(?FLUSH_MAX_N),
+    NewFlushed = logger_h_common:flush_log_events(?FLUSH_MAX_N),
+
+    %% log info about the flushed messages
+    log_handler_info(Name, "Handler ~p flushed ~w log events",
+                     [Name,NewFlushed], State),
 
     %% because of the receive loop when flushing messages, the
     %% handler will be scheduled out often and the mailbox could
     %% grow very large, so we'd better check the queue again here
     {_,_QLen1} = process_info(self(), message_queue_len),
-    ?observe(_Name,{max_qlen,_QLen1}),
+    ?observe(Name,{max_qlen,_QLen1}),
 
-    %% Add 1 for the current log request
-    ?observe(_Name,{flushed,_NewFlushed+1}),
+    %% Add 1 for the current log event
+    ?observe(Name,{flushed,NewFlushed+1}),
 
     State1 = ?update_max_time(?diff_time(T1,_T0),State),
-    {dropped,?update_other(flushed,FLUSHED,_NewFlushed,
+    {dropped,?update_other(flushed,FLUSHED,NewFlushed,
                            State1#{mode => ?set_mode(ModeTab,async),
                                    last_qlen => 0,
                                    last_log_ts => T1})}.
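Both mode-change notices above are edge-triggered: one fires on the transition into drop mode, the other when drop mode is left for async or sync (recovery via flush is reported separately by the "flushed ~w log events" notice). The same decision expressed as a standalone pure function; the helper name and return shape are hypothetical, not part of the patch:

    mode_change_info(Mode0, drop, Name) when Mode0 =/= drop ->
        {notice, "Handler ~p switched to drop mode", [Name]};
    mode_change_info(drop, Mode1, Name) when Mode1 == async; Mode1 == sync ->
        {notice, "Handler ~p switched to ~w mode", [Name, Mode1]};
    mode_change_info(_Mode0, _Mode1, _Name) ->
        no_log.

For example, mode_change_info(async, drop, my_disk_log) yields the drop-mode notice, while mode_change_info(drop, drop, my_disk_log) yields no_log, so staying in drop mode produces no repeated log noise.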
@@ -578,10 +607,10 @@ write(Name, Mode, T1, Bin, _CallOrCast,
                            last_qlen := LastQLen,
                            last_log_ts := T0}) ->
     %% check if we need to limit the number of writes
-    %% during a burst of log requests
+    %% during a burst of log events
     {DoWrite,BurstWinT,BurstMsgCount} = logger_h_common:limit_burst(State),
 
-    %% only send a synhrounous request to the disk_log process
+    %% only send a synchronous event to the disk_log process
     %% every DLSyncInt time, to give the handler time between
     %% writes so it can keep up with incoming messages
     {Status,LastQLen1,State1} =
@@ -599,13 +628,13 @@ write(Name, Mode, T1, Bin, _CallOrCast,
                 {dropped, LastQLen, State}
         end,
 
-    %% Check if the time since the previous log request is long enough -
+    %% Check if the time since the previous log event is long enough -
     %% and the queue length small enough - to assume the mailbox has
     %% been emptied, and if so, do filesync operation and reset mode to
     %% async. Note that this is the best we can do to detect an idle
     %% handler without setting a timer after each log call/cast. If the
-    %% time between two consecutive log requests is fast and no new
-    %% request comes in after the last one, idle state won't be detected!
+    %% time between two consecutive log events is short and no new
+    %% event comes in after the last one, idle state won't be detected!
     Time = ?diff_time(T1,T0),
     {Mode1,BurstMsgCount1,State2} =
         if (LastQLen1 < ?FILESYNC_OK_QLEN) andalso
@@ -628,6 +657,20 @@ write(Name, Mode, T1, Bin, _CallOrCast,
     {Status,State4}.
 
+log_handler_info(Name, Format, Args, State) ->
+    Config =
+        case logger:get_handler_config(Name) of
+            {ok,Conf} -> Conf;
+            _ -> #{formatter=>{?DEFAULT_FORMATTER,?DEFAULT_FORMAT_CONFIG}}
+        end,
+    Meta = #{time=>erlang:system_time(microsecond)},
+    Bin = logger_h_common:log_to_binary(#{level => notice,
+                                          msg => {Format,Args},
+                                          meta => Meta}, Config),
+    _ = disk_log_write(Name, Bin, State),
+    ok.
+
+
 open_disk_log(Name, LogOpts) ->
     #{file := File,
       max_no_bytes := MaxNoBytes,
diff --git a/lib/kernel/src/logger_h_common.erl b/lib/kernel/src/logger_h_common.erl
index 2a935328aa..d2fcc7e115 100644
--- a/lib/kernel/src/logger_h_common.erl
+++ b/lib/kernel/src/logger_h_common.erl
@@ -27,10 +27,12 @@
          call_cast_or_drop/4,
          check_load/1,
          limit_burst/1,
-         kill_if_choked/4,
-         flush_log_requests/0,
-         flush_log_requests/1,
+         kill_if_choked/5,
+         flush_log_events/0,
+         flush_log_events/1,
          handler_exit/2,
+         set_restart_flag/2,
+         unset_restart_flag/2,
          cancel_timer/1,
          stop_or_restart/3,
          overload_levels_ok/1,
@@ -52,7 +54,8 @@ log_to_binary(Log,Config) ->
     do_log_to_binary(Log,Config).
 
 do_log_to_binary(Log,Config) ->
-    {Formatter,FormatterConfig} = maps:get(formatter,Config),
+    {Formatter,FormatterConfig} =
+        maps:get(formatter,Config,{?DEFAULT_FORMATTER,?DEFAULT_FORMAT_CONFIG}),
     String = try_format(Log,Formatter,FormatterConfig),
     try unicode:characters_to_binary(String)
     catch _:_ ->
@@ -121,10 +124,10 @@ check_common_config(_) ->
 %%%-----------------------------------------------------------------
 %%% Overload Protection
 call_cast_or_drop(_Name, HandlerPid, ModeTab, Bin) ->
-    %% If the handler process is getting overloaded, the log request
+    %% If the handler process is getting overloaded, the log event
     %% will be synchronous instead of asynchronous (slows down the
     %% logging tempo of a process doing lots of logging. If the
-    %% handler is choked, drop mode is set and no request will be sent.
+    %% handler is choked, drop mode is set and no event will be sent.
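The new default in do_log_to_binary/2 above is what lets these internal notices be formatted even when the handler's configuration cannot be read back. A sketch of the notice path in isolation, assuming ?DEFAULT_FORMATTER is logger_formatter and using an empty formatter config as a stand-in (the real ?DEFAULT_FORMAT_CONFIG value lives in a header file not shown in this patch):

    format_info_event(Format, Args) ->
        Event = #{level => notice,
                  msg => {Format, Args},
                  meta => #{time => erlang:system_time(microsecond)}},
        %% fallback path taken when logger:get_handler_config/1 fails
        {FMod, FConf} = {logger_formatter, #{}},
        unicode:characters_to_binary(FMod:format(Event, FConf)).

Because the meta map is built locally with a fresh timestamp, the notice can be formatted and written even while the handler is half-initialized or recovering from a crash.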
     try ?get_mode(ModeTab) of
         async ->
             gen_server:cast(HandlerPid, {log,Bin});
@@ -143,7 +146,7 @@ call_cast_or_drop(_Name, HandlerPid, ModeTab, Bin) ->
             ?observe(_Name,{dropped,1})
     catch
         %% if the ETS table doesn't exist (maybe because of a
-        %% handler restart), we can only drop the request
+        %% handler restart), we can only drop the event
         _:_ -> ?observe(_Name,{dropped,1})
     end,
     ok.
@@ -151,6 +154,24 @@ call_cast_or_drop(_Name, HandlerPid, ModeTab, Bin) ->
 handler_exit(_Name, Reason) ->
     exit(Reason).
 
+set_restart_flag(Name, Module) ->
+    Flag = list_to_atom(lists:concat([Module,"_",Name,"_restarting"])),
+    spawn(fun() ->
+                  register(Flag, self()),
+                  timer:sleep(infinity)
+          end),
+    ok.
+
+unset_restart_flag(Name, Module) ->
+    Flag = list_to_atom(lists:concat([Module,"_",Name,"_restarting"])),
+    case whereis(Flag) of
+        undefined ->
+            false;
+        Pid ->
+            exit(Pid, kill),
+            true
+    end.
+
 check_load(State = #{id:=_Name, mode_tab := ModeTab, mode := Mode,
                      toggle_sync_qlen := ToggleSyncQLen,
                      drop_new_reqs_qlen := DropNewQLen,
@@ -162,7 +183,7 @@ check_load(State = #{id:=_Name, mode_tab := ModeTab, mode := Mode,
     %% When the handler process gets scheduled in, it's impossible
     %% to predict the QLen. We could jump "up" arbitrarily from say
     %% async to sync, async to drop, sync to flush, etc. However, when
-    %% the handler process manages the log requests (without flushing),
+    %% the handler process manages the log events (without flushing),
     %% one after the other, we will move "down" from drop to sync and
     %% from sync to async. This way we don't risk getting stuck in
     %% drop or sync mode with an empty mailbox.
@@ -171,7 +192,7 @@ check_load(State = #{id:=_Name, mode_tab := ModeTab, mode := Mode,
            QLen >= FlushQLen ->
                {flush, 0,1};
            QLen >= DropNewQLen ->
-               %% Note that drop mode will force log requests to
+               %% Note that drop mode will force log events to
                %% be dropped on the client side (never sent to
                %% the handler).
                IncDrops = if Mode == drop -> 0; true -> 1 end,
@@ -208,38 +229,42 @@ limit_burst(#{burst_win_ts := BurstWinT0,
             {true,BurstWinT0,BurstMsgCount+1}
     end.
 
-kill_if_choked(Name, QLen, Mem,
-               #{enable_kill_overloaded := KillIfOL,
-                 handler_overloaded_qlen := HOLQLen,
-                 handler_overloaded_mem := HOLMem}) ->
+kill_if_choked(Name, QLen, Mem, HandlerMod,
+               State = #{enable_kill_overloaded := KillIfOL,
+                         handler_overloaded_qlen := HOLQLen,
+                         handler_overloaded_mem := HOLMem}) ->
     if KillIfOL andalso
-       ((QLen > HOLQLen) orelse (Mem > HOLMem)) ->
+       ((QLen > HOLQLen) orelse (Mem > HOLMem)) ->
+            HandlerMod:log_handler_info(Name,
+                                        "Handler ~p overloaded and stopping",
+                                        [Name], State),
+            set_restart_flag(Name, HandlerMod),
             handler_exit(Name, {shutdown,{overloaded,Name,QLen,Mem}});
        true ->
             ok
     end.
 
-flush_log_requests() ->
-    flush_log_requests(-1).
+flush_log_events() ->
+    flush_log_events(-1).
 
-flush_log_requests(Limit) ->
+flush_log_events(Limit) ->
     process_flag(priority, high),
-    Flushed = flush_log_requests(0, Limit),
+    Flushed = flush_log_events(0, Limit),
     process_flag(priority, normal),
     Flushed.
 
-flush_log_requests(Limit, Limit) ->
+flush_log_events(Limit, Limit) ->
     Limit;
-flush_log_requests(N, Limit) ->
-    %% flush log requests but leave other requests, such as
+flush_log_events(N, Limit) ->
+    %% flush log events but leave other events, such as
     %% file/disk_log_sync, info and change_config, so that these
     %% have a chance to be processed even under heavy load
     receive
         {'$gen_cast',{log,_}} ->
-            flush_log_requests(N+1, Limit);
+            flush_log_events(N+1, Limit);
         {'$gen_call',{Pid,MRef},{log,_}} ->
             Pid ! {MRef, dropped},
-            flush_log_requests(N+1, Limit)
+            flush_log_events(N+1, Limit)
     after
         0 -> N
     end.
 
diff --git a/lib/kernel/src/logger_std_h.erl b/lib/kernel/src/logger_std_h.erl
index bebd5300b4..77b054a9b7 100644
--- a/lib/kernel/src/logger_std_h.erl
+++ b/lib/kernel/src/logger_std_h.erl
@@ -38,6 +38,9 @@
 -export([log/2, adding_handler/1, removing_handler/1,
          changing_config/2, swap_buffer/2]).
 
+%% handler internal
+-export([log_handler_info/4]).
+
 %%%===================================================================
 %%% API
 %%%===================================================================
@@ -232,7 +235,7 @@ swap_buffer(Name, Buffer) ->
 log(LogEvent, Config = #{id := Name,
                          config := #{handler_pid := HPid,
                                      mode_tab := ModeTab}}) ->
-    %% if the handler has crashed, we must drop this request
+    %% if the handler has crashed, we must drop this event
     %% and hope the handler restarts so we can try again
     true = is_process_alive(HPid),
     Bin = logger_h_common:log_to_binary(LogEvent, Config),
@@ -284,17 +287,20 @@ init([Name, Config = #{config := HConfig},
             proc_lib:init_ack(Error)
     end.
 
-do_init(Name, Std) when Std=:=standard_io; Std=:=standard_error ->
-    case open_log_file(Name, Std) of
-        {ok,FileCtrlPid} ->
-            {ok,#{id=>Name,type=>Std,file_ctrl_pid=>FileCtrlPid}};
-        Error ->
-            Error
-    end;
-do_init(Name, FileInfo) when is_tuple(FileInfo) ->
-    case open_log_file(Name, FileInfo) of
+do_init(Name, Type) ->
+    case open_log_file(Name, Type) of
         {ok,FileCtrlPid} ->
+            case logger_h_common:unset_restart_flag(Name, ?MODULE) of
+                true ->
+                    %% inform about restart
+                    gen_server:cast(self(), {log_handler_info,
+                                             "Handler ~p restarted",
+                                             [Name]});
+                false ->
+                    %% initial start
+                    ok
+            end,
+            {ok,#{id=>Name,type=>Type,file_ctrl_pid=>FileCtrlPid}};
         Error ->
             Error
     end.
@@ -316,7 +322,7 @@ enter_loop(#{wait_for_buffer:=true}=Config,State) ->
 enter_loop(_Config,State) ->
     gen_server:enter_loop(?MODULE,[],State).
 
-%% This is the synchronous log request.
+%% This is the synchronous log event.
 handle_call({log, Bin}, _From, State) ->
     {Result,State1} = do_log(Bin, call, State),
     %% Result == ok | dropped
@@ -371,15 +377,19 @@ handle_call(reset, _From, State) ->
 handle_call(stop, _From, State) ->
     {stop, {shutdown,stopped}, ok, State}.
 
-%% This is the asynchronous log request.
+%% This is the asynchronous log event.
 handle_cast({log, Bin}, State) ->
     {_,State1} = do_log(Bin, cast, State),
     {noreply, State1};
 
+handle_cast({log_handler_info, Format, Args}, State = #{id:=Name}) ->
+    log_handler_info(Name, Format, Args, State),
+    {noreply, State};
+
 %% If FILESYNC_REPEAT_INTERVAL is set to a millisec value, this
 %% clause gets called repeatedly by the handler. In order to
 %% guarantee that a filesync *always* happens after the last log
-%% request, the repeat operation must be active!
+%% event, the repeat operation must be active!
 handle_cast(repeated_filesync,
             State = #{type := Type,
                       file_ctrl_pid := FileCtrlPid,
@@ -514,21 +524,31 @@ stop(Name) ->
     if C == 0 -> Interval;
        true -> C-1 end).
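do_init/2 above can detect a restart because the flag set by logger_h_common:set_restart_flag/2 is a separate, unlinked process registered under a name derived from the handler module and id, so it survives the handler's own death. A self-contained sketch of the round trip, reusing the same function bodies as in logger_h_common (the demo/0 driver and its sleeps are hypothetical):

    set_restart_flag(Name, Module) ->
        Flag = list_to_atom(lists:concat([Module, "_", Name, "_restarting"])),
        spawn(fun() ->
                      register(Flag, self()),
                      timer:sleep(infinity)
              end),
        ok.

    unset_restart_flag(Name, Module) ->
        Flag = list_to_atom(lists:concat([Module, "_", Name, "_restarting"])),
        case whereis(Flag) of
            undefined -> false;            %% initial start
            Pid -> exit(Pid, kill), true   %% coming back after an overload kill
        end.

    demo() ->
        ok = set_restart_flag(my_handler, logger_std_h),
        timer:sleep(100),    %% let the spawned flag process register itself
        true = unset_restart_flag(my_handler, logger_std_h),
        timer:sleep(100),    %% let the kill signal land before re-checking
        false = unset_restart_flag(my_handler, logger_std_h).

Note the small window this design accepts: set_restart_flag/2 returns before the spawned process has necessarily registered, which is harmless here because the flag is only read after a supervisor restart, much later.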
-%% check for overload between every request (and set Mode to async,
+%% check for overload between every event (and set Mode to async,
 %% sync or drop accordingly), but never flush the whole mailbox
-%% before LogWindowSize requests have been handled
-do_log(Bin, CallOrCast, State = #{id:=Name}) ->
+%% before LogWindowSize events have been handled
+do_log(Bin, CallOrCast, State = #{id:=Name, mode:=Mode0}) ->
     T1 = ?timestamp(),
 
     %% check if the handler is getting overloaded, or if it's
     %% recovering from overload (the check must be done for each
-    %% request to react quickly to large bursts of requests and
+    %% event to react quickly to large bursts of events and
     %% to ensure that the handler can never end up in drop mode
     %% with an empty mailbox, which would stop operation)
     {Mode1,QLen,Mem,State1} = logger_h_common:check_load(State),
 
+    if (Mode1 == drop) andalso (Mode0 =/= drop) ->
+            log_handler_info(Name, "Handler ~p switched to drop mode",
+                             [Name], State);
+       (Mode0 == drop) andalso ((Mode1 == async) orelse (Mode1 == sync)) ->
+            log_handler_info(Name, "Handler ~p switched to ~w mode",
+                             [Name,Mode1], State);
+       true ->
+            ok
+    end,
+
     %% kill the handler if it can't keep up with the load
-    logger_h_common:kill_if_choked(Name, QLen, Mem, State),
+    logger_h_common:kill_if_choked(Name, QLen, Mem, ?MODULE, State),
 
     if Mode1 == flush ->
             flush(Name, QLen, T1, State1);
@@ -538,22 +558,26 @@ do_log(Bin, CallOrCast, State = #{id:=Name}) ->
 
 %% this clause is called by do_log/3 after an overload check
 %% has been performed, where QLen > FlushQLen
-flush(_Name, _QLen0, T1, State=#{last_log_ts := _T0, mode_tab := ModeTab}) ->
+flush(Name, _QLen0, T1, State=#{last_log_ts := _T0, mode_tab := ModeTab}) ->
     %% flush messages in the mailbox (a limited number in
     %% order to not cause long delays)
-    _NewFlushed = logger_h_common:flush_log_requests(?FLUSH_MAX_N),
+    NewFlushed = logger_h_common:flush_log_events(?FLUSH_MAX_N),
+
+    %% log info about the flushed messages
+    log_handler_info(Name, "Handler ~p flushed ~w log events",
+                     [Name,NewFlushed], State),
 
     %% because of the receive loop when flushing messages, the
     %% handler will be scheduled out often and the mailbox could
     %% grow very large, so we'd better check the queue again here
     {_,_QLen1} = process_info(self(), message_queue_len),
-    ?observe(_Name,{max_qlen,_QLen1}),
+    ?observe(Name,{max_qlen,_QLen1}),
 
-    %% Add 1 for the current log request
-    ?observe(_Name,{flushed,_NewFlushed+1}),
+    %% Add 1 for the current log event
+    ?observe(Name,{flushed,NewFlushed+1}),
 
     State1 = ?update_max_time(?diff_time(T1,_T0),State),
-    {dropped,?update_other(flushed,FLUSHED,_NewFlushed,
+    {dropped,?update_other(flushed,FLUSHED,NewFlushed,
                            State1#{mode => ?set_mode(ModeTab,async),
                                    last_qlen => 0,
                                    last_log_ts => T1})}.
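The flush_log_events/1 call above drains only log messages. A toy version of its selective receive, runnable in any process, showing why a pending repeated_filesync or change_config message survives the flush (names are illustrative; the real code also raises the process priority around the loop):

    flush_events(Limit, Limit) ->
        Limit;
    flush_events(N, Limit) ->
        receive
            {'$gen_cast', {log, _Bin}} ->
                flush_events(N + 1, Limit);
            {'$gen_call', {Pid, MRef}, {log, _Bin}} ->
                Pid ! {MRef, dropped},      %% caller gets 'dropped', not a timeout
                flush_events(N + 1, Limit)
        after 0 ->
                N
        end.

    demo() ->
        [self() ! {'$gen_cast', {log, integer_to_binary(I)}} || I <- lists:seq(1, 5)],
        self() ! repeated_filesync,         %% non-log message is left in the mailbox
        5 = flush_events(0, -1),
        receive repeated_filesync -> ok end.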
@@ -567,10 +591,10 @@ write(_Name, Mode, T1, Bin, _CallOrCast,
                      last_log_ts := T0,
                      file_ctrl_sync_int := FileCtrlSyncInt}) ->
     %% check if we need to limit the number of writes
-    %% during a burst of log requests
+    %% during a burst of log events
     {DoWrite,BurstWinT,BurstMsgCount} = logger_h_common:limit_burst(State),
 
-    %% only send a synhrounous request to the file controller process
+    %% only send a synchronous event to the file controller process
     %% every FileCtrlSyncInt time, to give the handler time between
     %% file writes so it can keep up with incoming messages
     {Result,LastQLen1} =
@@ -587,13 +611,13 @@ write(_Name, Mode, T1, Bin, _CallOrCast,
                 {dropped,LastQLen}
         end,
 
-    %% Check if the time since the previous log request is long enough -
+    %% Check if the time since the previous log event is long enough -
     %% and the queue length small enough - to assume the mailbox has
     %% been emptied, and if so, do filesync operation and reset mode to
     %% async. Note that this is the best we can do to detect an idle
     %% handler without setting a timer after each log call/cast. If the
-    %% time between two consecutive log requests is fast and no new
-    %% request comes in after the last one, idle state won't be detected!
+    %% time between two consecutive log events is short and no new
+    %% event comes in after the last one, idle state won't be detected!
     Time = ?diff_time(T1,T0),
     {Mode1,BurstMsgCount1} =
         if (LastQLen1 < ?FILESYNC_OK_QLEN) andalso
@@ -654,6 +678,20 @@ close_log_file(Fd) ->
     _ = file:datasync(Fd),
     _ = file:close(Fd).
 
+
+log_handler_info(Name, Format, Args, #{file_ctrl_pid := FileCtrlPid}) ->
+    Config =
+        case logger:get_handler_config(Name) of
+            {ok,Conf} -> Conf;
+            _ -> #{formatter=>{?DEFAULT_FORMATTER,?DEFAULT_FORMAT_CONFIG}}
+        end,
+    Meta = #{time=>erlang:system_time(microsecond)},
+    Bin = logger_h_common:log_to_binary(#{level => notice,
+                                          msg => {Format,Args},
+                                          meta => Meta}, Config),
+    _ = file_write_async(FileCtrlPid, Bin),
+    ok.
+
 %%%-----------------------------------------------------------------
 %%% File control process
 
@@ -726,7 +764,7 @@ file_ctrl_init(HandlerName, StdDev, Starter) ->
 file_ctrl_loop(Fd, Type, DevName, Synced,
                PrevWriteResult, PrevSyncResult, HandlerName) ->
     receive
-        %% asynchronous request
+        %% asynchronous event
         {log,Bin} ->
             Result = if Type == file ->
                              write_to_dev(Fd, Bin, DevName,
@@ -737,7 +775,7 @@ file_ctrl_loop(Fd, Type, DevName, Synced,
             file_ctrl_loop(Fd, Type, DevName, false,
                            Result, PrevSyncResult, HandlerName);
 
-        %% synchronous request
+        %% synchronous event
         {{log,From,Bin,FileSync},MRef} ->
             if Type == file ->
                     %% check that file hasn't been deleted
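The file controller loop distinguishes plain writes from write-plus-filesync calls purely by message shape. The sending side is not part of the hunks shown here; the sketch below reconstructs it from the receive patterns (file_write_async/2 is the function log_handler_info/4 calls above, while file_write_sync/3 and its timeout handling are assumptions):

    file_write_async(FileCtrlPid, Bin) ->
        FileCtrlPid ! {log, Bin},
        ok.

    file_write_sync(FileCtrlPid, Bin, Timeout) ->
        MRef = erlang:monitor(process, FileCtrlPid),
        %% 'true' asks the controller to run a filesync after the write
        FileCtrlPid ! {{log, self(), Bin, true}, MRef},
        receive
            {MRef, Result} ->
                erlang:demonitor(MRef, [flush]),
                Result;
            {'DOWN', MRef, _, _, Reason} ->
                {error, Reason}
        after Timeout ->
                erlang:demonitor(MRef, [flush]),
                {error, timeout}
        end.

Keeping the asynchronous path a bare message send is what lets an overloaded handler keep handing off writes, including its own overload notices, without ever blocking on the controller process.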