aboutsummaryrefslogblamecommitdiffstats
path: root/lib/kernel/test/logger_stress_SUITE.erl
blob: 1a278fb1b2e8516478f1bedc4ca1c392adb00aa4 (plain) (tree)




































































                                                                           
                      
                          
                           
                               
































                                                                    
                                                            





















                                                                                







                                                                            





                                                                                
















                                                                             







                                                                         





                                                                     
















































































                                                                                 
                                           






                                                                               





                                               

                                                                     





                                                                            

                                                                                
                                            

                                                                                                              

                                            





                                                                    
                                                                                  
                                   

                                                   
                                                                     
 


                                              









                                    







                                                                      







                                                                      
                                                                     




















                                                            


                                                                        

                                                            

                                                                    



































































































































































































                                                                                                      
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2018. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
-module(logger_stress_SUITE).

-compile(export_all).

-include_lib("common_test/include/ct_event.hrl").
-include_lib("kernel/include/logger.hrl").
-include_lib("kernel/src/logger_h_common.hrl").

-ifdef(SAVE_STATS).
  -define(COLLECT_STATS(_All_,_Procs_),
          ct:pal("~p",[stats(_All_,_Procs_)])).
-else.
  -define(COLLECT_STATS(_All_,_Procs__), ok).
-endif.

-define(TEST_DURATION,120). % seconds

suite() ->
    [{timetrap,{minutes,3}},
     {ct_hooks,[logger_test_lib]}].

init_per_suite(Config) ->
    Config.

end_per_suite(_Config) ->
    ok.

init_per_group(_Group, Config) ->
    Config.

end_per_group(_Group, _Config) ->
    ok.

init_per_testcase(_TestCase, Config) ->
    Config.

end_per_testcase(Case, Config) ->
    try apply(?MODULE,Case,[cleanup,Config])
    catch error:undef -> ok
    end,
    ok.

groups() ->
    [].

all() -> 
    [allow_events,
     reject_events,
     std_handler,
     disk_log_handler,
     std_handler_time,
     std_handler_time_big,
     disk_log_handler_time,
     disk_log_handler_time_big,
     emulator_events,
     remote_events,
     remote_to_disk_log,
     remote_emulator_events,
     remote_emulator_to_disk_log].

%%%-----------------------------------------------------------------
%%% Test cases
%%%-----------------------------------------------------------------
%% Time from log macro call to handler callback
allow_events(Config) ->
    {ok,_,Node} =
        logger_test_lib:setup(Config,
                              [{logger,
                                [{handler,default,?MODULE,#{}}]},
                               {logger_level,notice}]),
    N = 100000,
    {T,_} = timer:tc(fun() -> rpc:call(Node,?MODULE,nlogs,[N]) end),
    IOPS = N * 1000/T, % log events allowed per millisecond
    ct_event:notify(#event{name = benchmark_data,
                           data = [{value,IOPS}]}),
    {comment,io_lib:format("~.2f accepted events pr millisecond",
                           [IOPS])}.

%% Time from log macro call to reject (log level)
reject_events(Config) ->
    {ok,_,Node} =
        logger_test_lib:setup(Config,
                              [{logger,
                                [{handler,default,?MODULE,#{}}]},
                               {logger_level,error}]),
    N = 1000000,
    {T,_} = timer:tc(fun() -> rpc:call(Node,?MODULE,nlogs,[N]) end),
    IOPS = N * 1000/T, % log events rejected per millisecond
    ct_event:notify(#event{name = benchmark_data,
                           data = [{value,IOPS}]}),
    {comment,io_lib:format("~.2f rejected events pr millisecond",
                           [IOPS])}.

%% Cascading failure that produce gen_server and proc_lib reports -
%% how many of the produced log events are actually written to a log
%% with logger_std_h file handler.
std_handler(Config) ->
    {ok,_,Node} =
        logger_test_lib:setup(Config,
                              [{logger,
                                [{handler,default,logger_std_h,
                                  #{config=>#{type=>{file,"default.log"}}}}]}]),

    cascade({Node,{logger_backend,log_allowed,2},[]},
            {Node,{logger_std_h,write,4},[{default,logger_std_h_default}]},
            fun otp_cascading/0).
std_handler(cleanup,_Config) ->
    _ = file:delete("default.log"),
    ok.

%% Disable overload protection and just print a lot - measure time.
%% The IOPS reported is the number of log events written per millisecond.
std_handler_time(Config) ->
    measure_handler_time(logger_std_h,#{type=>{file,"default.log"}},Config).
std_handler_time(cleanup,_Config) ->
    _ = file:delete("default.log"),
    ok.

std_handler_time_big(Config) ->
    measure_handler_time_big(logger_std_h,#{type=>{file,"default.log"}},Config).
std_handler_time_big(cleanup,_Config) ->
    _ = file:delete("default.log"),
    ok.

%% Cascading failure that produce gen_server and proc_lib reports -
%% how many of the produced log events are actually written to a log
%% with logger_disk_log_h wrap file handler.
disk_log_handler(Config) ->
    {ok,_,Node} =
        logger_test_lib:setup(Config,
                              [{logger,
                                [{handler,default,logger_disk_log_h,#{}}]}]),
    cascade({Node,{logger_backend,log_allowed,2},[]},
            {Node,{logger_disk_log_h,write,4},
             [{default,logger_disk_log_h_default}]},
            fun otp_cascading/0).
disk_log_handler(cleanup,_Config) ->
    Files = filelib:wildcard("default.log.*"),
    [_ = file:delete(F) || F <- Files],
    ok.

%% Disable overload protection and just print a lot - measure time.
%% The IOPS reported is the number of log events written per millisecond.
disk_log_handler_time(Config) ->
    measure_handler_time(logger_disk_log_h,#{type=>halt},Config).
disk_log_handler_time(cleanup,_Config) ->
    _ = file:delete("default"),
    ok.

disk_log_handler_time_big(Config) ->
    measure_handler_time_big(logger_disk_log_h,#{type=>halt},Config).
disk_log_handler_time_big(cleanup,_Config) ->
    _ = file:delete("default"),
    ok.

%% Cascading failure that produce log events from the emulator - how
%% many of the produced log events pass through the proxy.
emulator_events(Config) ->
    {ok,_,Node} =
        logger_test_lib:setup(Config,
                              [{logger,
                                [{handler,default,?MODULE,#{}}]}]),
    cascade({Node,{?MODULE,producer,0},[]},
            {Node,{?MODULE,log,2},[{proxy,logger_proxy}]},
            fun em_cascading/0).

%% Cascading failure that produce gen_server and proc_lib reports on
%% remote node - how many of the produced log events pass through the
%% proxy.
remote_events(Config) ->
    {ok,_,Node1} =
        logger_test_lib:setup([{postfix,1}|Config],
                              [{logger,
                                [{handler,default,?MODULE,#{}}]}]),
    {ok,_,Node2} =
        logger_test_lib:setup([{postfix,2}|Config],[]),
    cascade({Node2,{logger_backend,log_allowed,2},[{remote_proxy,logger_proxy}]},
            {Node1,{?MODULE,log,2},[{local_proxy,logger_proxy}]},
            fun otp_cascading/0).

%% Cascading failure that produce gen_server and proc_lib reports on
%% remote node - how many of the produced log events are actually
%% written to a log with logger_disk_log_h wrap file handler.
remote_to_disk_log(Config) ->
    {ok,_,Node1} =
        logger_test_lib:setup([{postfix,1}|Config],
                              [{logger,
                                [{handler,default,logger_disk_log_h,#{}}]}]),
    {ok,_,Node2} =
        logger_test_lib:setup([{postfix,2}|Config],[]),
    cascade({Node2,{logger_backend,log_allowed,2},[{remote_proxy,logger_proxy}]},
            {Node1,{logger_disk_log_h,write,4},
             [{local_proxy,logger_proxy},
              {local_default,logger_disk_log_h_default}]},
            fun otp_cascading/0).
remote_to_disk_log(cleanup,_Config) ->
    Files = filelib:wildcard("default.log.*"),
    [_ = file:delete(F) || F <- Files],
    ok.

%% Cascading failure that produce log events from the emulator on
%% remote node - how many of the produced log events pass through the
%% proxy.
remote_emulator_events(Config) ->
    {ok,_,Node1} =
        logger_test_lib:setup([{postfix,1}|Config],
                              [{logger,
                                [{handler,default,?MODULE,#{}}]}]),
    {ok,_,Node2} =
        logger_test_lib:setup([{postfix,2}|Config],[]),
    cascade({Node2,{?MODULE,producer,0},[{remote_proxy,logger_proxy}]},
            {Node1,{?MODULE,log,2},[{local_proxy,logger_proxy}]},
            fun em_cascading/0).

%% Cascading failure that produce log events from the emulator on
%% remote node - how many of the produced log events are actually
%% written to a log with logger_disk_log_h wrap file handler.
remote_emulator_to_disk_log(Config) ->
    {ok,_,Node1} =
        logger_test_lib:setup([{postfix,1}|Config],
                              [{logger,
                                [{handler,default,logger_disk_log_h,#{}}]}]),
    {ok,_,Node2} =
        logger_test_lib:setup([{postfix,2}|Config],[]),
    cascade({Node2,{?MODULE,producer,0},[{remote_proxy,logger_proxy}]},
            {Node1,{logger_disk_log_h,write,4},
             [{local_proxy,logger_proxy},
              {local_default,logger_disk_log_h_default}]},
            fun em_cascading/0).
remote_emulator_to_disk_log(cleanup,_Config) ->
    Files = filelib:wildcard("default.log.*"),
    [_ = file:delete(F) || F <- Files],
    ok.

%%%-----------------------------------------------------------------
%%% Internal functions
measure_handler_time(Module,HCfg,Config) ->
    measure_handler_time(Module,100000,small_fa(),millisecond,HCfg,#{},Config).

measure_handler_time_big(Module,HCfg,Config) ->
    FCfg = #{chars_limit=>4096, max_size=>1024},
    measure_handler_time(Module,100,big_fa(),second,HCfg,FCfg,Config).

measure_handler_time(Module,N,FA,Unit,HCfg,FCfg,Config) ->
    {ok,_,Node} =
        logger_test_lib:setup(
          Config,
          [{logger,
            [{handler,default,Module,
              #{formatter => {logger_formatter,
                              maps:merge(#{legacy_header=>false,
                                           single_line=>true},FCfg)},
                config=>maps:merge(#{sync_mode_qlen => N+1,
                                     drop_mode_qlen => N+1,
                                     flush_qlen => N+1,
                                     burst_limit_enable => false,
                                     filesync_repeat_interval => no_repeat},
                                   HCfg)}}]}]),
    %% HPid = rpc:call(Node,erlang,whereis,[?name_to_reg_name(Module,default)]),
    %% {links,[_,FCPid]} = rpc:call(Node,erlang,process_info,[HPid,links]),
    T0 = erlang:monotonic_time(millisecond),
    ok = rpc:call(Node,?MODULE,nlogs_wait,[N,FA,Module]),
    %% ok = rpc:call(Node,fprof,apply,[fun ?MODULE:nlogs_wait/2,[N div 10,FA,Module],[{procs,[HPid,FCPid]}]]),
    T1 = erlang:monotonic_time(millisecond),
    T = T1-T0,
    M = case Unit of
            millisecond -> 1;
            second -> 1000
        end,
    IOPS = M*N/T,
    ct:pal("N: ~p~nT: ~p~nIOPS: ~.2f events pr ~w",[N,T,IOPS,Unit]),
    %% Stats = rpc:call(Node,logger_olp,info,[?name_to_reg_name(Module,default)]),
    %% ct:pal("Stats: ~p",[Stats]),
    ct_event:notify(#event{name = benchmark_data,
                           data = [{value,IOPS}]}),
    {comment,io_lib:format("~.2f events written pr ~w",[IOPS,Unit])}.

nlogs_wait(N,{F,A},Module) ->
    group_leader(whereis(user),self()),
    [?LOG_NOTICE(F,A) || _ <- lists:seq(1,N)],
    wait(Module).

wait(Module) ->
    case Module:filesync(default) of
        {error,handler_busy} ->
            wait(Module);
        ok ->
            ok
    end.

small_fa() ->
    Str = "\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "[\\]^_`abcdefghijklmnopqr",
    {"~ts",[Str]}.

big_fa() ->
    {"~p",[lists:duplicate(1048576,"a")]}.

nlogs(N) ->
    group_leader(whereis(user),self()),
    Str = "\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "[\\]^_`abcdefghijklmnopqr",
    [?LOG_NOTICE(Str) || _ <- lists:seq(1,N)],
    ok.

%% cascade(ProducerInfo,ConsumerInfo,TestFun)
cascade({PNode,PMFA,_PStatProcs},{CNode,CMFA,_CStatProcs},TestFun) ->
    Tab  = ets:new(counter,[set,public]),
    ets:insert(Tab,{producer,0}),
    ets:insert(Tab,{consumer,0}),
    dbg:tracer(process,{fun tracer/2,{Tab,PNode,CNode}}),
    dbg:n(PNode),
    dbg:n(CNode),
    dbg:cn(node()),
    dbg:p(all,[call,arity]),
    dbg:tpl(PMFA,[]),
    dbg:tpl(CMFA,[]),

    Pid = rpc:call(CNode,?MODULE,wrap_test,[PNode,TestFun]),
    MRef = erlang:monitor(process,Pid),
    TO = ?TEST_DURATION*1000,
    receive {'DOWN',MRef,_,_,Reason} ->
            ct:fail({remote_pid_down,Reason})
    after TO ->
            All = ets:lookup_element(Tab,producer,2),
            Written = ets:lookup_element(Tab,consumer,2),
            dbg:stop_clear(),
            ?COLLECT_STATS(All,
                           [{PNode,P,Id} || {Id,P} <- _PStatProcs] ++
                               [{CNode,P,Id} || {Id,P} <- _CStatProcs]),
            Ratio = Written/All * 100,
            ct_event:notify(#event{name = benchmark_data,
                                   data = [{value,Ratio}]}),
            {comment,io_lib:format("~p % (~p written, ~p produced)",
                                   [round(Ratio),Written,All])}
    end.

wrap_test(Fun) ->
    wrap_test(node(),Fun).
wrap_test(Node,Fun) ->
    reset(),
    group_leader(whereis(user),self()),
    rpc:call(Node,?MODULE,do_fun,[Fun]).

do_fun(Fun) ->
    reset(),
    Fun().

reset() ->
    reset([logger_std_h_default, logger_disk_log_h_default, logger_proxy]).
reset([P|Ps]) ->
    is_pid(whereis(P)) andalso logger_olp:reset(P),
    reset(Ps);
reset([]) ->
    ok.


tracer({trace,_,call,{?MODULE,producer,_}},{Tab,_PNode,_CNode}=S) ->
    ets:update_counter(Tab,producer,1),
    S;
tracer({trace,Pid,call,{logger_backend,log_allowed,_}},{Tab,PNode,_CNode}=S) when node(Pid)=:=PNode ->
    ets:update_counter(Tab,producer,1),
    S;
tracer({trace,_,call,{?MODULE,log,_}},{Tab,_PNode,_CNode}=S) ->
    ets:update_counter(Tab,consumer,1),
    S;
tracer({trace,_,call,{_,write,_}},{Tab,_PNode,_CNode}=S) ->
    ets:update_counter(Tab,consumer,1),
    S;
tracer(_,S) ->
    S.


%%%-----------------------------------------------------------------
%%% Collect statistics
-define(STAT_KEYS,
        [burst_drops,
         calls,
         casts,
         drops,
         flushed,
         flushes,
         freq,
         last_qlen,
         max_qlen,
         time,
         writes]).
-define(EVENT_KEYS,
        [calls,casts,flushed]).

stats(All,Procs) ->
    NI = [{Id,rpc:call(N,logger_olp,info,[P])} || {N,P,Id}<-Procs],
    [{all,All}|[stats(Id,I,All) || {Id,I} <- NI]].

stats(Id,Info,All) ->
    S = maps:with(?STAT_KEYS,Info),
    AllOnProc = lists:sum(maps:values(maps:with(?EVENT_KEYS,S))),
    if All>0 ->
            Writes = maps:get(writes,S),
            {_,ActiveTime} = maps:get(time,S),
            Rate = round(100*Writes/All),
            RateOnProc =
                if AllOnProc>0 ->
                        round(100*Writes/AllOnProc);
                   true ->
                        0
                end,
            AvFreq =
                if ActiveTime>0 ->
                        round(Writes/ActiveTime);
                   true ->
                        0
                end,
            {Id,
             {stats,S},
             {rate,Rate},
             {rate_on_proc,RateOnProc},
             {av_freq,AvFreq}};
       true ->
            {Id,none}
    end.

%%%-----------------------------------------------------------------
%%% Spawn a lot of processes that crash repeatedly, causing a lot of
%%% error reports from the emulator.
em_cascading() ->
    spawn(fun() -> super() end).

super() ->
    process_flag(trap_exit,true),
    spawn_link(fun server/0),
    [spawn_link(fun client/0) || _<-lists:seq(1,10000)],
    super_loop().

super_loop() ->
    receive
        {'EXIT',_,server} ->
            spawn_link(fun server/0),
            super_loop();
        {'EXIT',_,_} ->
            _L = lists:sum(lists:seq(1,10000)),
            spawn_link(fun client/0),
            super_loop()
    end.

client() ->
    receive
    after 1 ->
            case whereis(server) of
                Pid when is_pid(Pid) ->
                    ok;
                undefined ->
                    producer(),
                    erlang:error(some_exception)
            end
    end,
    client().

server() ->
    register(server,self()),
    receive
    after 3000 ->
            exit(server)
    end.
        

%%%-----------------------------------------------------------------
%%% Create a supervisor tree with processes that crash repeatedly,
%%% causing a lot of supervisor reports and crashreports
otp_cascading() ->
    {ok,Pid} = supervisor:start_link({local,otp_super}, ?MODULE, [otp_super]),
    unlink(Pid),
    Pid.

otp_server_sup() ->
    supervisor:start_link({local,otp_server_sup},?MODULE,[otp_server_sup]).

otp_client_sup(N) ->
    supervisor:start_link({local,otp_client_sup},?MODULE,[otp_client_sup,N]).

otp_server() ->
    gen_server:start_link({local,otp_server},?MODULE,[otp_server],[]).

otp_client() ->
    gen_server:start_link(?MODULE,[otp_client],[]).

init([otp_super]) ->
    {ok, {{one_for_one, 200, 10},
            [{client_sup,
                    {?MODULE, otp_client_sup, [10000]},
                    permanent, 1000, supervisor, [?MODULE]},
             {server_sup,
                    {?MODULE, otp_server_sup, []},
                    permanent, 1000, supervisor, [?MODULE]}
            ]}};
init([otp_server_sup]) ->
    {ok, {{one_for_one, 2, 10},
            [{server,
                    {?MODULE, otp_server, []},
                    permanent, 1000, worker, [?MODULE]}
            ]}};
init([otp_client_sup,N]) ->
    spawn(fun() ->
                  [supervisor:start_child(otp_client_sup,[])
                   || _ <- lists:seq(1,N)]
          end),
    {ok, {{simple_one_for_one, N*10, 1},
            [{client,
                    {?MODULE, otp_client, []},
                    permanent, 1000, worker, [?MODULE]}
            ]}};
init([otp_server]) ->
    {ok, server, 10000};
init([otp_client]) ->
    {ok, client,1}.

handle_info(timeout, client) ->
    true = is_pid(whereis(otp_server)),
    {noreply,client,1};
handle_info(timeout, server) ->
    exit(self(), some_error).

%%%-----------------------------------------------------------------
%%% Logger callbacks
log(_LogEvent,_Config) ->
    ok.

%%%-----------------------------------------------------------------
%%% Function to trace on for counting produced emulator messages
producer() ->
    ok.