diff options
Diffstat (limited to 'lib/kernel/src/logger_olp.erl')
-rw-r--r-- | lib/kernel/src/logger_olp.erl | 627 |
1 files changed, 627 insertions, 0 deletions
diff --git a/lib/kernel/src/logger_olp.erl b/lib/kernel/src/logger_olp.erl new file mode 100644 index 0000000000..8365383fe2 --- /dev/null +++ b/lib/kernel/src/logger_olp.erl @@ -0,0 +1,627 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2017-2018. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +-module(logger_olp). +-behaviour(gen_server). + +-include("logger_olp.hrl"). +-include("logger_internal.hrl"). + +%% API +-export([start_link/4, load/2, info/1, reset/1, stop/1, restart/1, + set_opts/2, get_opts/1, get_default_opts/0, get_pid/1, + call/2, cast/2, get_ref/0, get_ref/1]). + +%% gen_server and proc_lib callbacks +-export([init/1, handle_call/3, handle_cast/2, handle_info/2, + terminate/2, code_change/3]). + +-define(OPT_KEYS,[sync_mode_qlen, + drop_mode_qlen, + flush_qlen, + burst_limit_enable, + burst_limit_max_count, + burst_limit_window_time, + overload_kill_enable, + overload_kill_qlen, + overload_kill_mem_size, + overload_kill_restart_after]). + +-export_type([olp_ref/0, options/0]). + +-opaque olp_ref() :: {atom(),pid(),ets:tid()}. + +-type options() :: logger:olp_config(). + +%%%----------------------------------------------------------------- +%%% API + +-spec start_link(Name,Module,Args,Options) -> {ok,Pid,Olp} | {error,Reason} when + Name :: atom(), + Module :: module(), + Args :: term(), + Options :: options(), + Pid :: pid(), + Olp :: olp_ref(), + Reason :: term(). +start_link(Name,Module,Args,Options0) when is_map(Options0) -> + Options = maps:merge(get_default_opts(),Options0), + case check_opts(Options) of + ok -> + proc_lib:start_link(?MODULE,init,[[Name,Module,Args,Options]]); + Error -> + Error + end. + +-spec load(Olp, Msg) -> ok when + Olp :: olp_ref(), + Msg :: term(). +load({_Name,Pid,ModeRef},Msg) -> + %% If the process is getting overloaded, the message will be + %% synchronous instead of asynchronous (slows down the tempo of a + %% process causing much load). If the process is choked, drop mode + %% is set and no message is sent. + try ?get_mode(ModeRef) of + async -> + gen_server:cast(Pid, {'$olp_load',Msg}); + sync -> + case call(Pid, {'$olp_load',Msg}) of + ok -> + ok; + _Other -> + %% dropped or {error,busy} + ?observe(_Name,{dropped,1}), + ok + end; + drop -> + ?observe(_Name,{dropped,1}) + catch + %% if the ETS table doesn't exist (maybe because of a + %% process restart), we can only drop the event + _:_ -> ?observe(_Name,{dropped,1}) + end, + ok. + +-spec info(Olp) -> map() | {error, busy} when + Olp :: atom() | pid() | olp_ref(). +info(Olp) -> + call(Olp, info). + +-spec reset(Olp) -> ok | {error, busy} when + Olp :: atom() | pid() | olp_ref(). +reset(Olp) -> + call(Olp, reset). + +-spec stop(Olp) -> ok when + Olp :: atom() | pid() | olp_ref(). +stop({_Name,Pid,_ModRef}) -> + stop(Pid); +stop(Pid) -> + _ = gen_server:call(Pid, stop), + ok. + +-spec set_opts(Olp, Opts) -> ok | {error,term()} | {error, busy} when + Olp :: atom() | pid() | olp_ref(), + Opts :: options(). +set_opts(Olp, Opts) -> + call(Olp, {set_opts,Opts}). + +-spec get_opts(Olp) -> options() | {error, busy} when + Olp :: atom() | pid() | olp_ref(). +get_opts(Olp) -> + call(Olp, get_opts). + +-spec get_default_opts() -> options(). +get_default_opts() -> + #{sync_mode_qlen => ?SYNC_MODE_QLEN, + drop_mode_qlen => ?DROP_MODE_QLEN, + flush_qlen => ?FLUSH_QLEN, + burst_limit_enable => ?BURST_LIMIT_ENABLE, + burst_limit_max_count => ?BURST_LIMIT_MAX_COUNT, + burst_limit_window_time => ?BURST_LIMIT_WINDOW_TIME, + overload_kill_enable => ?OVERLOAD_KILL_ENABLE, + overload_kill_qlen => ?OVERLOAD_KILL_QLEN, + overload_kill_mem_size => ?OVERLOAD_KILL_MEM_SIZE, + overload_kill_restart_after => ?OVERLOAD_KILL_RESTART_AFTER}. + +-spec restart(fun(() -> any())) -> ok. +restart(Fun) -> + Result = + try Fun() + catch C:R:S -> + {error,{restart_failed,Fun,C,R,S}} + end, + ?LOG_INTERNAL(debug,[{logger_olp,restart}, + {result,Result}]), + ok. + +-spec get_ref() -> olp_ref(). +get_ref() -> + get(olp_ref). + +-spec get_ref(PidOrName) -> olp_ref() | {error, busy} when + PidOrName :: pid() | atom(). +get_ref(PidOrName) -> + call(PidOrName,get_ref). + +-spec get_pid(olp_ref()) -> pid(). +get_pid({_Name,Pid,_ModeRef}) -> + Pid. + +%%%=================================================================== +%%% gen_server callbacks +%%%=================================================================== + +init([Name,Module,Args,Options]) -> + register(Name, self()), + process_flag(message_queue_data, off_heap), + + ?start_observation(Name), + + try ets:new(Name, [public]) of + ModeRef -> + OlpRef = {Name,self(),ModeRef}, + put(olp_ref,OlpRef), + try Module:init(Args) of + {ok,CBState} -> + ?set_mode(ModeRef, async), + T0 = ?timestamp(), + proc_lib:init_ack({ok,self(),OlpRef}), + %% Storing options in state to avoid copying + %% (sending) the option data with each message + State0 = ?merge_with_stats( + Options#{id => Name, + idle=> true, + module => Module, + mode_ref => ModeRef, + mode => async, + last_qlen => 0, + last_load_ts => T0, + burst_win_ts => T0, + burst_msg_count => 0, + cb_state => CBState}), + State = reset_restart_flag(State0), + gen_server:enter_loop(?MODULE, [], State); + Error -> + _ = ets:delete(ModeRef), + unregister(Name), + proc_lib:init_ack(Error) + catch + _:Error -> + _ = ets:delete(ModeRef), + unregister(Name), + proc_lib:init_ack(Error) + end + catch + _:Error -> + unregister(Name), + proc_lib:init_ack(Error) + end. + +%% This is the synchronous load event. +handle_call({'$olp_load', Msg}, _From, State) -> + {Result,State1} = do_load(Msg, call, State#{idle=>false}), + %% Result == ok | dropped + reply_return(Result,State1); + +handle_call(get_ref,_From,#{id:=Name,mode_ref:=ModeRef}=State) -> + reply_return({Name,self(),ModeRef},State); + +handle_call({set_opts,Opts0},_From,State) -> + Opts = maps:merge(maps:with(?OPT_KEYS,State),Opts0), + case check_opts(Opts) of + ok -> + reply_return(ok, maps:merge(State,Opts)); + Error -> + reply_return(Error, State) + end; + +handle_call(get_opts,_From,State) -> + reply_return(maps:with(?OPT_KEYS,State), State); + +handle_call(info, _From, State) -> + reply_return(State, State); + +handle_call(reset, _From, #{module:=Module,cb_state:=CBState}=State) -> + State1 = ?merge_with_stats(State), + CBState1 = try_callback_call(Module,reset_state,[CBState],CBState), + reply_return(ok, State1#{idle => true, + last_qlen => 0, + last_load_ts => ?timestamp(), + cb_state => CBState1}); + +handle_call(stop, _From, State) -> + {stop, {shutdown,stopped}, ok, State}; + +handle_call(Msg, From, #{module:=Module,cb_state:=CBState}=State) -> + case try_callback_call(Module,handle_call,[Msg, From, CBState]) of + {reply,Reply,CBState1} -> + reply_return(Reply,State#{cb_state=>CBState1}); + {noreply,CBState1} -> + noreply_return(State#{cb_state=>CBState1}); + {stop, Reason, Reply, CBState1} -> + {stop, Reason, Reply, State#{cb_state=>CBState1}}; + {stop, Reason, CBState1} -> + {stop, Reason, State#{cb_state=>CBState1}} + end. + +%% This is the asynchronous load event. +handle_cast({'$olp_load', Msg}, State) -> + {_Result,State1} = do_load(Msg, cast, State#{idle=>false}), + noreply_return(State1); + +handle_cast(Msg, #{module:=Module, cb_state:=CBState} = State) -> + case try_callback_call(Module,handle_cast,[Msg, CBState]) of + {noreply,CBState1} -> + noreply_return(State#{cb_state=>CBState1}); + {stop, Reason, CBState1} -> + {stop, Reason, State#{cb_state=>CBState1}} + end. + +handle_info(timeout, #{mode_ref:=_ModeRef, mode:=Mode} = State) -> + State1 = notify(idle,State), + State2 = maybe_notify_mode_change(async,State1), + {noreply, State2#{idle => true, + mode => ?change_mode(_ModeRef, Mode, async), + burst_msg_count => 0}}; +handle_info(Msg, #{module := Module, cb_state := CBState} = State) -> + case try_callback_call(Module,handle_info,[Msg, CBState]) of + {noreply,CBState1} -> + noreply_return(State#{cb_state=>CBState1}); + {stop, Reason, CBState1} -> + {stop, Reason, State#{cb_state=>CBState1}}; + {load,CBState1} -> + {_,State1} = do_load(Msg, cast, State#{idle=>false, + cb_state=>CBState1}), + noreply_return(State1) + end. + +terminate({shutdown,{overloaded,_QLen,_Mem}}, + #{id:=Name, module := Module, cb_state := CBState, + overload_kill_restart_after := RestartAfter} = State) -> + %% We're terminating because of an overload situation (see + %% kill_if_choked/3). + unregister(Name), %%!!!! to avoid error printout of callback crashed on stop + case try_callback_call(Module,terminate,[overloaded,CBState],ok) of + {ok,Fun} when is_function(Fun,0), is_integer(RestartAfter) -> + set_restart_flag(State), + _ = timer:apply_after(RestartAfter,?MODULE,restart,[Fun]), + ok; + _ -> + ok + end; +terminate(Reason, #{id:=Name, module:=Module, cb_state:=CBState}) -> + _ = try_callback_call(Module,terminate,[Reason,CBState],ok), + unregister(Name), + ok. + +code_change(_OldVsn, State, _Extra) -> + {ok, State}. + + +%%%----------------------------------------------------------------- +%%% Internal functions +-spec call(Olp, term()) -> term() | {error,busy} when + Olp :: atom() | pid() | olp_ref(). +call({_Name, Pid, _ModeRef},Msg) -> + call(Pid, Msg); +call(Server, Msg) -> + try + gen_server:call(Server, Msg) + catch + _:{timeout,_} -> {error,busy} + end. + +-spec cast(olp_ref(),term()) -> ok. +cast({_Name, Pid, _ModeRef},Msg) -> + gen_server:cast(Pid, Msg). + +%% check for overload between every event (and set Mode to async, +%% sync or drop accordingly), but never flush the whole mailbox +%% before LogWindowSize events have been handled +do_load(Msg, CallOrCast, State) -> + T1 = ?timestamp(), + State1 = ?update_time(T1,State), + + %% check if the process is getting overloaded, or if it's + %% recovering from overload (the check must be done for each + %% event to react quickly to large bursts of events and + %% to ensure that the handler can never end up in drop mode + %% with an empty mailbox, which would stop operation) + {Mode1,QLen,Mem,State2} = check_load(State1), + + %% kill the handler if it can't keep up with the load + kill_if_choked(QLen, Mem, State2), + + if Mode1 == flush -> + flush(T1, State2); + true -> + handle_load(Mode1, T1, Msg, CallOrCast, State2) + end. + +%% this function is called by do_load/3 after an overload check +%% has been performed, where QLen > FlushQLen +flush(T1, State=#{id := _Name, mode := Mode, last_load_ts := _T0, mode_ref := ModeRef}) -> + %% flush load messages in the mailbox (a limited number in order + %% to not cause long delays) + NewFlushed = flush_load(?FLUSH_MAX_N), + + %% write info in log about flushed messages + State1=notify({flushed,NewFlushed},State), + + %% because of the receive loop when flushing messages, the + %% handler will be scheduled out often and the mailbox could + %% grow very large, so we'd better check the queue again here + {_,QLen1} = process_info(self(), message_queue_len), + ?observe(_Name,{max_qlen,QLen1}), + + %% Add 1 for the current log event + ?observe(_Name,{flushed,NewFlushed+1}), + + State2 = ?update_max_time(?diff_time(T1,_T0),State1), + State3 = ?update_max_qlen(QLen1,State2), + State4 = maybe_notify_mode_change(async,State3), + {dropped,?update_other(flushed,FLUSHED,NewFlushed, + State4#{mode => ?change_mode(ModeRef,Mode,async), + last_qlen => QLen1, + last_load_ts => T1})}. + +%% this function is called to actually handle the message +handle_load(Mode, T1, Msg, _CallOrCast, + State = #{id := _Name, + module := Module, + cb_state := CBState, + last_qlen := LastQLen, + last_load_ts := _T0}) -> + %% check if we need to limit the number of writes + %% during a burst of log events + {DoWrite,State1} = limit_burst(State), + + {Result,LastQLen1,CBState1} = + if DoWrite -> + ?observe(_Name,{_CallOrCast,1}), + CBS = try_callback_call(Module,handle_load,[Msg,CBState]), + {ok,element(2, process_info(self(), message_queue_len)),CBS}; + true -> + ?observe(_Name,{flushed,1}), + {dropped,LastQLen,CBState} + end, + State2 = State1#{cb_state=>CBState1}, + + State3 = State2#{mode => Mode}, + State4 = ?update_calls_or_casts(_CallOrCast,1,State3), + State5 = ?update_max_qlen(LastQLen1,State4), + State6 = + ?update_max_time(?diff_time(T1,_T0), + State5#{last_qlen := LastQLen1, + last_load_ts => T1}), + State7 = case Result of + ok -> + S = ?update_freq(T1,State6), + ?update_other(writes,WRITES,1,S); + _ -> + State6 + end, + {Result,State7}. + + +%%%----------------------------------------------------------------- +%%% Check that the options are valid +check_opts(Options) when is_map(Options) -> + case do_check_opts(maps:to_list(Options)) of + ok -> + case overload_levels_ok(Options) of + true -> + ok; + false -> + Faulty = maps:with([sync_mode_qlen, + drop_mode_qlen, + flush_qlen],Options), + {error,{invalid_olp_levels,Faulty}} + end; + {error,Key,Value} -> + {error,{invalid_olp_config,#{Key=>Value}}} + end. + +do_check_opts([{sync_mode_qlen,N}|Options]) when is_integer(N) -> + do_check_opts(Options); +do_check_opts([{drop_mode_qlen,N}|Options]) when is_integer(N) -> + do_check_opts(Options); +do_check_opts([{flush_qlen,N}|Options]) when is_integer(N) -> + do_check_opts(Options); +do_check_opts([{burst_limit_enable,Bool}|Options]) when is_boolean(Bool) -> + do_check_opts(Options); +do_check_opts([{burst_limit_max_count,N}|Options]) when is_integer(N) -> + do_check_opts(Options); +do_check_opts([{burst_limit_window_time,N}|Options]) when is_integer(N) -> + do_check_opts(Options); +do_check_opts([{overload_kill_enable,Bool}|Options]) when is_boolean(Bool) -> + do_check_opts(Options); +do_check_opts([{overload_kill_qlen,N}|Options]) when is_integer(N) -> + do_check_opts(Options); +do_check_opts([{overload_kill_mem_size,N}|Options]) when is_integer(N) -> + do_check_opts(Options); +do_check_opts([{overload_kill_restart_after,NorA}|Options]) + when is_integer(NorA); NorA == infinity -> + do_check_opts(Options); +do_check_opts([{Key,Value}|_]) -> + {error,Key,Value}; +do_check_opts([]) -> + ok. + +set_restart_flag(#{id := Name, module := Module}) -> + Flag = list_to_atom(lists:concat([Module,"_",Name,"_restarting"])), + spawn(fun() -> + register(Flag, self()), + timer:sleep(infinity) + end), + ok. + +reset_restart_flag(#{id := Name, module := Module} = State) -> + Flag = list_to_atom(lists:concat([Module,"_",Name,"_restarting"])), + case whereis(Flag) of + undefined -> + State; + Pid -> + exit(Pid, kill), + notify(restart,State) + end. + +check_load(State = #{id:=_Name, mode_ref := ModeRef, mode := Mode, + sync_mode_qlen := SyncModeQLen, + drop_mode_qlen := DropModeQLen, + flush_qlen := FlushQLen}) -> + {_,Mem} = process_info(self(), memory), + ?observe(_Name,{max_mem,Mem}), + {_,QLen} = process_info(self(), message_queue_len), + ?observe(_Name,{max_qlen,QLen}), + %% When the handler process gets scheduled in, it's impossible + %% to predict the QLen. We could jump "up" arbitrarily from say + %% async to sync, async to drop, sync to flush, etc. However, when + %% the handler process manages the log events (without flushing), + %% one after the other, we will move "down" from drop to sync and + %% from sync to async. This way we don't risk getting stuck in + %% drop or sync mode with an empty mailbox. + {Mode1,_NewDrops,_NewFlushes} = + if + QLen >= FlushQLen -> + {flush, 0,1}; + QLen >= DropModeQLen -> + %% Note that drop mode will force load messages to + %% be dropped on the client side (never sent to + %% the olp process). + IncDrops = if Mode == drop -> 0; true -> 1 end, + {?change_mode(ModeRef, Mode, drop), IncDrops,0}; + QLen >= SyncModeQLen -> + {?change_mode(ModeRef, Mode, sync), 0,0}; + true -> + {?change_mode(ModeRef, Mode, async), 0,0} + end, + State1 = ?update_other(drops,DROPS,_NewDrops,State), + State2 = ?update_max_qlen(QLen,State1), + State3 = ?update_max_mem(Mem,State2), + State4 = maybe_notify_mode_change(Mode1,State3), + {Mode1, QLen, Mem, + ?update_other(flushes,FLUSHES,_NewFlushes, + State4#{last_qlen => QLen})}. + +limit_burst(#{burst_limit_enable := false}=State) -> + {true,State}; +limit_burst(#{burst_win_ts := BurstWinT0, + burst_msg_count := BurstMsgCount, + burst_limit_window_time := BurstLimitWinTime, + burst_limit_max_count := BurstLimitMaxCnt} = State) -> + if (BurstMsgCount >= BurstLimitMaxCnt) -> + %% the limit for allowed messages has been reached + BurstWinT1 = ?timestamp(), + case ?diff_time(BurstWinT1,BurstWinT0) of + BurstCheckTime when BurstCheckTime < (BurstLimitWinTime*1000) -> + %% we're still within the burst time frame + {false,?update_other(burst_drops,BURSTS,1,State)}; + _BurstCheckTime -> + %% burst time frame passed, reset counters + {true,State#{burst_win_ts => BurstWinT1, + burst_msg_count => 0}} + end; + true -> + %% the limit for allowed messages not yet reached + {true,State#{burst_win_ts => BurstWinT0, + burst_msg_count => BurstMsgCount+1}} + end. + +kill_if_choked(QLen, Mem, #{overload_kill_enable := KillIfOL, + overload_kill_qlen := OLKillQLen, + overload_kill_mem_size := OLKillMem}) -> + if KillIfOL andalso + ((QLen > OLKillQLen) orelse (Mem > OLKillMem)) -> + exit({shutdown,{overloaded,QLen,Mem}}); + true -> + ok + end. + +flush_load(Limit) -> + process_flag(priority, high), + Flushed = flush_load(0, Limit), + process_flag(priority, normal), + Flushed. + +flush_load(Limit, Limit) -> + Limit; +flush_load(N, Limit) -> + %% flush log events but leave other events, such as info, reset + %% and stop, so that these have a chance to be processed even + %% under heavy load + receive + {'$gen_cast',{'$olp_load',_}} -> + flush_load(N+1, Limit); + {'$gen_call',{Pid,MRef},{'$olp_load',_}} -> + Pid ! {MRef, dropped}, + flush_load(N+1, Limit); + {log,_,_,_,_} -> + flush_load(N+1, Limit); + {log,_,_,_} -> + flush_load(N+1, Limit) + after + 0 -> N + end. + +overload_levels_ok(Options) -> + SMQL = maps:get(sync_mode_qlen, Options, ?SYNC_MODE_QLEN), + DMQL = maps:get(drop_mode_qlen, Options, ?DROP_MODE_QLEN), + FQL = maps:get(flush_qlen, Options, ?FLUSH_QLEN), + (DMQL > 1) andalso (SMQL =< DMQL) andalso (DMQL =< FQL). + +maybe_notify_mode_change(drop,#{mode:=Mode0}=State) + when Mode0=/=drop -> + notify({mode_change,Mode0,drop},State); +maybe_notify_mode_change(Mode1,#{mode:=drop}=State) + when Mode1==async; Mode1==sync -> + notify({mode_change,drop,Mode1},State); +maybe_notify_mode_change(_,State) -> + State. + +notify(Note,#{module:=Module,cb_state:=CBState}=State) -> + CBState1 = try_callback_call(Module,notify,[Note,CBState],CBState), + State#{cb_state=>CBState1}. + +try_callback_call(Module, Function, Args) -> + try_callback_call(Module, Function, Args, '$no_default_return'). + +try_callback_call(Module, Function, Args, DefRet) -> + try apply(Module, Function, Args) + catch + throw:R -> R; + error:undef:S when DefRet=/='$no_default_return' -> + case S of + [{Module,Function,Args,_}|_] -> + DefRet; + _ -> + erlang:raise(error,undef,S) + end + end. + +noreply_return(#{idle:=true}=State) -> + {noreply,State}; +noreply_return(#{idle:=false}=State) -> + {noreply,State,?IDLE_DETECT_TIME}. + +reply_return(Reply,#{idle:=true}=State) -> + {reply,Reply,State}; +reply_return(Reply,#{idle:=false}=State) -> + {reply,Reply,State,?IDLE_DETECT_TIME}. |