aboutsummaryrefslogblamecommitdiffstats
path: root/lib/diameter/test/diameter_watchdog_SUITE.erl
blob: f9f72d364a5ceed3d20ae1bd034c4961fdcde8bd (plain) (tree)
1
2
3
4


                   
                                                        



























                                                                         
                                        
 












                                    
 
                         





                                                                              














                                                            











                                                                     

                                          
 



                                                    

             
                                                                 

                                                             
                              

                                                            
                                                                       


                                                      


                              







                                                                              
                                                                               















                                                                              
                                                     
 
            




                                                                           
 


                             
 


                                                                    
 



                                                    
 




























                                                                            
 















                                                               



                                                                 
 





                                                         



                                                                        






                                                                     







                                                                     




                                                                     







                                                                        






                                                                     



                                                                     



                                                                     


                                                                     




                                                                       





                                                                       






                                                                       





                                                                       
 

                                                               
 
                                         
 


                                                                
 
                                          
 
                                     
 









                                                                       
 
                                                               
 






                                                          
 



                                                         
 



                                                         
 
       
 
                
 


                                                            
 

                                                            
 




































                                                                      
 
                                                                              
 


































                                                                            

       


















                                                                       

       



                                                
 

                       

       



                                                            
 


                                                   
 






                                                         

        

                    
 





                                                              
 
                                                                              
 

                              
 

                                
 
                                                                              
 

                         
 


                   
 

        
 
































                                                                   





                             
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2010-2013. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%

%%
%% Tests of the RFC3539 watchdog state machine as implemented by
%% module diameter_watchdog.
%%

-module(diameter_watchdog_SUITE).

-export([suite/0,
         all/0,
         init_per_suite/1,
         end_per_suite/1]).

%% testcases
-export([reopen/1, reopen/4, reopen/7]).

-export([id/1,    %% jitter callback
         run1/1]).

%% diameter_app callbacks
-export([peer_up/3,
         peer_down/3]).

%% gen_tcp-ish interface
-export([listen/2,
         accept/1,
         connect/3,
         send/2,
         setopts/2]).

-include("diameter.hrl").
-include("diameter_ct.hrl").

%% ===========================================================================

%% Module holding the shared test helpers (run/1, lport/3) used below.
-define(util, diameter_util).

%% Dictionary, realm and address shared by both services.
-define(BASE, ?DIAMETER_DICT_COMMON).
-define(REALM, "erlang.org").
-define(ADDR, {127,0,0,1}).

%% Config for diameter:start_service/2. Name is used both in the
%% Origin-Host and as the application alias. Note that send/3 relies
%% on 'Origin-Host' and 'Origin-Realm' being the first two elements.
-define(SERVICE(Name),
        [{'Origin-Host', Name ++ "." ++ ?REALM},
         {'Origin-Realm', ?REALM},
         {'Host-IP-Address', [?ADDR]},
         {'Vendor-Id', 42},
         {'Product-Name', "OTP/diameter"},
         {'Auth-Application-Id', [?DIAMETER_APP_ID_COMMON]},
         {application, [{alias, Name},
                        {dictionary, ?BASE},
                        {module, ?MODULE}]}]).

%% Watchdog timer as a callback: an MFA on id/1, which returns T
%% unchanged. jitter/2 treats such a timer as having no jitter.
-define(WD(T), {?MODULE, id, [T]}).

%% Watchdog timers used by the testcases: a 6 second callback timer
%% plus 10, 20 and 30 seconds, each both as a plain integer and as a
%% callback. Note that a short integer timeout with random jitter is
%% excluded since reopen/1 isn't smart enough to deal with it: see
%% ONE_WD below.
-define(WD_TIMERS, [?WD(6000)
                    | [F_(T_) || T_ <- [10000, 20000, 30000],
                                 F_ <- [fun(T__) -> T__ end,
                                        fun(T__) -> ?WD(T__) end]]]).

%% Watchdog timer of the misbehaving peer.
-define(PEER_WD, 10000).

%% Receive a watchdog event within a specified time (milliseconds or
%% infinity). Evaluates to the now() timestamp taken at reception, or
%% calls ?ERROR (from an included header, presumably raising) on
%% timeout.
-define(EVENT(T, Tmo),
        receive #diameter_event{info = T} -> now()
        after Tmo -> ?ERROR({timeout, Tmo})
        end).

%% Receive an event in a given number of watchdogs, plus or minus
%% half. WdL and WdH are the low and high watchdog timeouts in
%% milliseconds: the event must arrive within N*WdH + WdH/2 and is
%% reported as an error if it arrives no later than N*WdL - WdL/2
%% (the elapsed time is in microseconds, hence the factor 1000).
%% Note that the call to now_diff assumes left to right evaluation
%% order.
-define(EVENT(T, N, WdL, WdH),
        [?ERROR({received, _Elapsed_, _LowerBound_, N, WdL})
         || _UpperBound_ <- [(N)*(WdH) + (WdH) div 2],
            _Elapsed_    <- [now_diff(now(), ?EVENT(T, _UpperBound_))],
            _LowerBound_ <- [(N)*(WdL) - (WdL) div 2],
            _Elapsed_ =< _LowerBound_*1000]).

%% As above for a watchdog timer without jitter: low and high
%% timeouts coincide.
-define(EVENT(T, N, Wd),
        ?EVENT(T, N, Wd, Wd)).

%% A timeout that ensures one watchdog. To ensure only one watchdog
%% requires (Wd + 2000) + 1000 < 2*(Wd - 2000) ==> 7000 < Wd for the
%% case with random jitter.
-define(ONE_WD(Wd), jitter(Wd,2000) + 1000).

%% ===========================================================================

%% suite/0
%%
%% Suite defaults: a ten minute timetrap, which is enough for 17
%% watchdogs @ 30 sec plus jitter.
suite() ->
    Timetrap = {timetrap, {minutes, 10}},
    [Timetrap].

%% all/0
%%
%% The testcases of the suite: only reopen/1.
all() ->
    Testcases = [reopen],
    Testcases.

%% init_per_suite/1
%%
%% Start the diameter application, failing with badmatch if it doesn't
%% return ok, and pass the config through untouched.
init_per_suite(Cfg) ->
    ok = diameter:start(),
    Cfg.

%% end_per_suite/1
%%
%% Stop the diameter application, failing with badmatch if it doesn't
%% return ok.
end_per_suite(_) ->
    ok = diameter:stop().

%% ===========================================================================
%% # reopen/1
%% ===========================================================================

%% Test the watchdog state machine for the required failover, failback
%% and reopen behaviour by examining watchdog events. Every
%% combination of the parameters below is handed to run/1 as a
%% reopen/4 call, and all of them are required to succeed.

reopen(_) ->
    Timers    = ?WD_TIMERS,            %% watchdog_timer value
    Types     = [listen, connect],     %% watchdog to test
    Answered  = [0,1,2],               %% DWR's to answer before ignoring
    Failbacks = ['DWR', 'DWA', 'RAA'], %% how to induce failback
    Calls = [[reopen, T, Wd, N, M] || Wd <- Timers,
                                      T <- Types,
                                      N <- Answered,
                                      M <- Failbacks],
    [] = run(Calls).

%% reopen/4
%%
%% Run one combination: start a server and a client service, connect
%% them over TCP on the loopback address and examine the events of
%% both in parallel through reopen/7.
%%
%% Type = listen | connect, the end whose watchdog is tested.
%% Wd   = watchdog_timer of the tested end.
%% N    = number of DWR's the peer answers before becoming silent.
%% M    = 'DWR' | 'DWA' | 'RAA', the message inducing failback.

reopen(Type, Wd, N, M) ->
    Server = start_service(),
    Client = start_service(),

    %% The tested end gets no extra transport config and the watchdog
    %% timer under test. Its peer gets this module in its transport
    %% config (presumably so that diameter_tcp uses the gen_tcp-ish
    %% functions exported here) and the ?PEER_WD timer as a callback.
    Tested = {[], Wd},
    Faked  = {[{module, ?MODULE}], ?WD(?PEER_WD)},

    {{LH,LW},{CH,CW}} = case Type of
                            listen  -> {Tested, Faked};
                            connect -> {Faked, Tested}
                        end,

    ListenOpts = [{transport_module, diameter_tcp},
                  {transport_config, LH ++ [{ip, ?ADDR}, {port, 0}]},
                  {watchdog_timer, LW}],

    {ok, LRef} = diameter:add_transport(Server, {listen, ListenOpts}),

    %% Port 0 was requested: find out which port is listened on.
    [LPort] = ?util:lport(tcp, LRef, 20),

    ConnectOpts = [{transport_module, diameter_tcp},
                   {transport_config, CH ++ [{ip, ?ADDR}, {port, 0},
                                             {raddr, ?ADDR},
                                             {rport, LPort}]},
                   {watchdog_timer, CW}],

    %% A temporary process that dies on its first message: the
    %% listening side sends to it after subscribing and the connecting
    %% side waits for its death before adding its transport, so that
    %% no listening event is missed.
    Gate = spawn(fun() -> receive _ -> ok end end),

    Calls = [[reopen, Type, T, LRef, Gate, Wd, N, M]
             || T <- [{listen, Server}, {connect, Client, ConnectOpts}]],
    [] = run(Calls).

%% start_service/0
%%
%% Start a diameter service under a freshly generated name, with the
%% calling process as monitor, and return the name.

start_service() ->
    SvcName = hostname(),
    Opts = [{monitor, self()} | ?SERVICE(SvcName)],
    ok = diameter:start_service(SvcName, Opts),
    SvcName.

%% reopen/7
%%
%% One end of a connection: subscribe to the events of the service
%% and hand over to recv/7. The fourth argument is the temporary
%% process from reopen/4 that sequences the two ends.

%% Listening end: the transport already exists. Kill the temporary
%% process once subscribed to let the connecting end proceed.
reopen(Type, {listen = Kind, Svc}, LRef, Gate, Wd, N, M) ->
    diameter:subscribe(Svc),
    Gate ! ok,
    recv(Type, Kind, Svc, LRef, Wd, N, M);

%% Connecting end: add the transport only when the temporary process
%% has died.
reopen(Type, {connect = Kind, Svc, Opts}, _, Gate, Wd, N, M) ->
    diameter:subscribe(Svc),
    MRef = erlang:monitor(process, Gate),
    receive
        {'DOWN', MRef, process, _, _} ->
            ok
    end,
    {ok, CRef} = diameter:add_transport(Svc, {Kind, Opts}),
    recv(Type, Kind, Svc, CRef, Wd, N, M).

%% recv/7
%%
%% Receive the events expected on one end of the connection. The
%% first argument is the type (listen | connect) of the transport
%% whose watchdog is tested, the second the type of the transport
%% whose events are being received: when they agree this is the
%% tested end, otherwise the misbehaving peer. Ref is the transport
%% reference, Wd the tested watchdog_timer, N the number of DWR's
%% answered before the peer goes silent and M the message used to
%% induce failback. The state tables quoted below describe the
%% watchdog state machine transitions being exercised.

%% The watchdog to be tested.
recv(Type, Type, _SvcName, Ref, Wd, N, M) ->
    %% Connection should come up immediately as a consequence of
    %% starting the watchdog process. In the accepting case this
    %% results in a new watchdog on a transport waiting for a new
    %% connection.

    ?EVENT({watchdog, Ref, _, {initial, okay}, _}, 2000),
    ?EVENT({up, Ref, _, _, #diameter_packet{}}, 0),

    %% Low/high watchdog timeouts.
    WdL = jitter(Wd, -2000),
    WdH = jitter(Wd, 2000),

    %%   OKAY          Timer expires &      Failover()
    %%                 Pending              SetWatchdog()        SUSPECT
    %%
    %% The peer replies to N DWR's before becoming silent, we should
    %% go down after N+2 watchdog_timer expirations: that is, after
    %% the first unanswered DWR. Knowing the min/max watchdog timeout
    %% values gives the time interval in which the event is expected.

    ?EVENT({watchdog, Ref, _, {okay, suspect}, _}, N+2, WdL, WdH),
    ?EVENT({down, Ref, _, _}, 0),

    %%   SUSPECT       Receive DWA          Pending = FALSE
    %%                                      Failback()
    %%                                      SetWatchdog()        OKAY
    %%
    %%   SUSPECT       Receive non-DWA      Failback()
    %%                                      SetWatchdog()        OKAY
    %%
    %% The peer sends a message before the expiry of another watchdog
    %% to induce failback.

    ?EVENT({watchdog, Ref, _, {suspect, okay}, _}, WdH + 2000),
    ?EVENT({up, Ref, _, _}, 0),

    %%   OKAY          Timer expires &      SendWatchdog()
    %%                 !Pending             SetWatchdog()
    %%                                      Pending = TRUE       OKAY
    %%
    %%   OKAY          Timer expires &      Failover()
    %%                 Pending              SetWatchdog()        SUSPECT
    %%
    %% The peer is now ignoring all watchdogs so the connection goes
    %% back down after either one or two watchdog expiries, depending
    %% on whether or not DWA restored the connection.

    F = choose(M == 'DWA', 2, 1),
    ?EVENT({watchdog, Ref, _, {okay, suspect}, _}, F, WdL, WdH),
    ?EVENT({down, Ref, _, _}, 0),

    %%   SUSPECT       Timer expires        CloseConnection()
    %%                                      SetWatchdog()        DOWN
    %%
    %% Non-response brings the connection down after another timeout.

    ?EVENT({watchdog, Ref, _, {suspect, down}, _}, 1, WdL, WdH),

    %%   DOWN          Timer expires        AttemptOpen()
    %%                                      SetWatchdog()        DOWN
    %%
    %%   DOWN          Connection up        NumDWA = 0
    %%                                      SendWatchdog()
    %%                                      SetWatchdog()
    %%                                      Pending = TRUE       REOPEN
    %%
    %% The connection is reestablished after another timeout.

    recv_reopen(Type, Ref, WdL, WdH),

    %%   REOPEN        Receive non-DWA      Throwaway()          REOPEN
    %%
    %%   REOPEN        Receive DWA &        Pending = FALSE
    %%                 NumDWA < 2           NumDWA++             REOPEN
    %%
    %%   REOPEN        Receive DWA &        Pending = FALSE
    %%                 NumDWA == 2          NumDWA++
    %%                                      Failback()           OKAY
    %%
    %%   REOPEN        Timer expires &      SendWatchdog()
    %%                 !Pending             SetWatchdog()
    %%                                      Pending = TRUE       REOPEN
    %%
    %% An exchange of 3 watchdogs (the first directly after
    %% capabilities exchange) brings the connection back up.

    ?EVENT({watchdog, Ref, _, {reopen, okay}, _}, 2, WdL, WdH),
    ?EVENT({up, Ref, _, _, #diameter_packet{}}, 0),

    %% Non-response brings it down again.

    ?EVENT({watchdog, Ref, _, {okay, suspect}, _}, 2, WdL, WdH),
    ?EVENT({down, Ref, _, _}, 0),
    ?EVENT({watchdog, Ref, _, {suspect, down}, _}, 1, WdL, WdH),

    %% Reestablish after another watchdog.

    recv_reopen(Type, Ref, WdL, WdH),

    %%   REOPEN        Timer expires &      NumDWA = -1
    %%                 Pending &            SetWatchdog()
    %%                 NumDWA >= 0                               REOPEN
    %%
    %%   REOPEN        Timer expires &      CloseConnection()
    %%                 Pending &            SetWatchdog()
    %%                 NumDWA < 0                                DOWN
    %%
    %% The peer is now ignoring all watchdogs, so the connection goes
    %% down again after 2 timeouts.

    ?EVENT({watchdog, Ref, _, {reopen, down}, _}, 2, WdL, WdH);

%% The misbehaving peer. Each time a transport process comes up, the
%% term that steers its send/3 is published with reg/4: first the
%% service name and the {Wd,N,M} of the testcase, then 3 (answer
%% three more DWR's) and finally 0 (answer none).
recv(_, Type, SvcName, Ref, Wd, N, M) ->
    %% First transport process.
    ?EVENT({watchdog, Ref, _, {initial, okay}, _}, 1000),
    ?EVENT({up, Ref, _, _, #diameter_packet{}}, 0),
    reg(Type, Ref, SvcName, {SvcName, {Wd,N,M}}),
    ?EVENT({watchdog, Ref, _, {okay, down}, _}, infinity),

    %% Second transport process.
    ?EVENT({watchdog, Ref, _, {_, reopen}, _}, infinity),
    reg(Type, Ref, SvcName, 3),
    ?EVENT({watchdog, Ref, _, {_, down}, _}, infinity),

    %% Third transport process.
    ?EVENT({watchdog, Ref, _, {_, reopen}, _}, infinity),
    reg(Type, Ref, SvcName, 0),
    ?EVENT({watchdog, Ref, _, {_, down}, _}, infinity),

    ok.

%% recv_reopen/4
%%
%% Expect the tested watchdog to enter state reopen within roughly one
%% watchdog timeout.

%% A connecting transport reconnects on its own timer, bounded by the
%% given low/high timeouts, and a reconnect event follows directly.
recv_reopen(connect, Ref, Lo, Hi) ->
    ?EVENT({watchdog, Ref, _, {_, reopen}, _}, 1, Lo, Hi),
    ?EVENT({reconnect, Ref, _}, 0);

%% For a listening transport the timing is that of the peer's
%% watchdog timer instead; the timeouts passed in are ignored.
recv_reopen(listen, Ref, _Lo, _Hi) ->
    ?EVENT({watchdog, Ref, _, {_, reopen}, _}, 1, ?PEER_WD).

%% reg/4
%%
%% Find the pid of the current transport process of the service and
%% publish T under it in diameter_reg, where send/3 in that process
%% looks it up. Fails with badmatch if the term is already registered.
reg(Type, Ref, SvcName, T) ->
    Info = diameter:service_info(SvcName, transport),
    Key = {?MODULE, tpid(Type, Ref, Info), T},
    true = diameter_reg:add_new(Key).
    
%% tpid/3
%%
%% Extract the pid of the process owning the transport's port from
%% the transport info of a service. Exactly one transport with the
%% given reference is expected, and the info lists are matched
%% positionally, so any other shape is a badmatch/function_clause.

%% A connecting transport has its connection at the top level.
tpid(connect, Ref, [[{ref, Ref},
                     {type, connect},
                     {options, _},
                     {watchdog, _},
                     {peer, _},
                     {apps, _},
                     {caps, _},
                     {port, [{owner, Owner} | _]}
                     | _]]) ->
    Owner;

%% A listening transport has a list of accepted connections: take the
%% single one whose watchdog is in state okay or reopen.
tpid(listen, Ref, [[{ref, Ref},
                    {type, listen},
                    {options, _},
                    {accept, Accepted}
                    | _]]) ->
    IsUp = fun([{watchdog, {_,_,State}} | _]) ->
                   lists:member(State, [okay, reopen])
           end,
    [[{watchdog, _},
      {peer, _},
      {apps, _},
      {caps, _},
      {port, [{owner, Owner} | _]}
      | _]]
        = lists:filter(IsUp, Accepted),
    Owner.

%% ===========================================================================

%% listen/2
%%
%% gen_tcp-ish interface: listening is delegated to gen_tcp unchanged.
listen(Port, SockOpts) ->
    gen_tcp:listen(Port, SockOpts).

%% accept/1
%%
%% gen_tcp-ish interface: accepting is delegated to gen_tcp unchanged.
accept(ListenSock) ->
    gen_tcp:accept(ListenSock).

%% connect/3
%%
%% gen_tcp-ish interface: connecting is delegated to gen_tcp unchanged.
connect(RemoteAddr, RemotePort, SockOpts) ->
    gen_tcp:connect(RemoteAddr, RemotePort, SockOpts).

%% setopts/2
%%
%% gen_tcp-ish interface: socket options are set through inet.
setopts(Socket, SockOpts) ->
    inet:setopts(Socket, SockOpts).

%% send/2
%%
%% gen_tcp-ish interface: what happens to an outgoing message depends
%% on the 'config' term in the process dictionary of the calling
%% process; send/3 does the work.
send(Sock, Bin) ->
    Config = getr(config),
    send(Config, Sock, Bin).

%% send/3
%%
%% Send, fake or discard an outgoing message depending on the state
%% kept under 'config' in the process dictionary of the calling
%% process. The state progresses as follows.
%%
%%   undefined       Nothing sent yet.
%%   init            CER/CEA sent, no DWA yet.
%%   {SvcName, T}    As published by reg/4 for the first transport
%%                   process, T = {Wd,N,Msg}.
%%   {Wd, N, Msg}    Answer N more DWR's, then induce failback with
%%                   Msg after one watchdog timeout Wd in the peer.
%%   N               Integer: answer N more DWR's, then discard DWA.
%%
%% Returns ok or the return value of gen_tcp:send/2.

%% First outgoing message from a new transport process is CER/CEA.
%% Remaining outgoing messages are either DWR or DWA.
send(undefined, Sock, Bin) ->
    putr(config, init),
    gen_tcp:send(Sock, Bin);

%% Outgoing DWR: fake reception of DWA. Use the fact that AVP values
%% are ignored. This is to ensure that the peer's watchdog state
%% transitions are only induced by responses to messages it sends.
%%
%% The header is matched as version/length (32 bits), the R flag set,
%% command code 280 and Application-ID (32 bits), followed by the
%% Hop-by-Hop and then the End-to-End Identifier, in that order, so
%% that the faked answer carries the identifiers of the request.
send(_, Sock, <<_:32, 1:1, _:7, 280:24, _:32, HId:32, EId:32, _/binary>>) ->
    Pkt = #diameter_packet{header = #diameter_header{version = 1,
                                                     end_to_end_id = EId,
                                                     hop_by_hop_id = HId},
                           msg = ['DWA', {'Result-Code', 2001},
                                         {'Origin-Host', "XXX"},
                                         {'Origin-Realm', ?REALM}]},
    #diameter_packet{bin = Bin} = diameter_codec:encode(?BASE, Pkt),
    %% Deliver the answer to ourselves as if read from the socket.
    self() ! {tcp, Sock, Bin},
    ok;

%% First outgoing DWA: wait for recv/7 to publish our config through
%% reg/4 and start over with it.
send(init, Sock, Bin) ->
    [{{?MODULE, _, T}, _}] = diameter_reg:wait({?MODULE, self(), '_'}),
    putr(config, T),
    send(Sock, Bin);

%% First transport process: remember the Origin-Host/Realm of the
%% service for use in a failback message and continue with the
%% {Wd,N,Msg} tuple.
send({SvcName, {_,_,_} = T}, Sock, Bin) ->
    [{'Origin-Host', _} = OH, {'Origin-Realm', _} = OR | _]
        = ?SERVICE(SvcName),
    putr(origin, [OH, OR]),
    putr(config, T),
    send(Sock, Bin);

%% Discard DWA, failback after another timeout in the peer. All
%% subsequent DWA's are discarded.
send({Wd, 0 = No, Msg}, Sock, Bin) ->
    Origin = getr(origin),
    spawn(fun() -> failback(?ONE_WD(Wd), Msg, Sock, Bin, Origin) end),
    putr(config, No),
    ok;

%% Send DWA while we're in the mood (aka 0 < N).
send({Wd, N, Msg}, Sock, Bin) ->
    putr(config, {Wd, N-1, Msg}),
    gen_tcp:send(Sock, Bin);

%% Discard DWA.
send(0, _Sock, _Bin) ->
    ok;

%% Send DWA: the message must be an answer (R flag clear) with
%% command code 280.
send(N, Sock, <<_:32, 0:1, _:7, 280:24, _/binary>> = Bin) ->
    putr(config, N-1),
    gen_tcp:send(Sock, Bin).

%% failback/5
%%
%% Run in a process of its own: sleep Tmo milliseconds and then send
%% the message chosen by Msg on the socket, to induce failback in the
%% peer. Fails with badmatch if the send doesn't return ok.
failback(Tmo, Msg, Sock, Bin, Origin) ->
    timer:sleep(Tmo),
    Outgoing = msg(Msg, Bin, Origin),
    ok = gen_tcp:send(Sock, Outgoing).

%% msg/3
%%
%% The binary to send in order to induce failback: the discarded DWA
%% itself or a freshly encoded message of the requested type, whose
%% AVPs start with the given origin AVPs.

msg('DWA', Dwa, _Origin) ->
    Dwa;
msg(Name, _Dwa, Origin) ->
    Encoded = diameter_codec:encode(?BASE, msg(Name, Origin)),
    #diameter_packet{bin = Bin} = Encoded,
    Bin.

%% msg/2
%%
%% A message in the list form [Name | Avps] handed to the encoder.

msg('DWR', Avps) ->
    ['DWR' | Avps];

%% An unexpected answer is discarded after passing through the
%% watchdog state machine.
msg('RAA', Avps) ->
    SessionId = diameter:session_id("abc"),
    ['RAA', {'Session-Id', SessionId},
            {'Result-Code', 2001}
          | Avps].

%% ===========================================================================

%% peer_up/3
%%
%% diameter_app callback: the state is returned unchanged.
peer_up(_, _, State) ->
    State.

%% peer_down/3
%%
%% diameter_app callback: the state is returned unchanged.
peer_down(_, _, State) ->
    State.

%% ===========================================================================

%% choose/3
%%
%% Return the second argument if the first is true, the third if it
%% is false.
choose(true, Then, _Else) ->
    Then;
choose(false, _Then, Else) ->
    Else.

%% id/1
%%
%% Jitter callback, as referenced by the WD macro: the identity
%% function, so that the watchdog timeout is exactly the value given.

id(Timeout) ->
    Timeout.

%% run/1
%%
%% Hand each [F|Args] to the test utility's run/1 as a call to run1/1,
%% which gives a more useful badmatch in case of failure.

run(Fs) ->
    Calls = [{?MODULE, [run1, F]} || F <- Fs],
    ?util:run(Calls).

%% run1/1
%%
%% Apply function F of this module to the arguments A. On an
%% exception the arguments, class, reason and stacktrace are printed
%% and the stacktrace becomes the value of the failed match against
%% ok.
run1([F|A]) ->
    ok = try apply(?MODULE, F, A) of
             _ ->
                 ok
         catch
             Class:Reason ->
                 Stack = erlang:get_stacktrace(),
                 io:format("~p~n", [{A, Class, Reason, Stack}]),
                 Stack
         end.

%% now_diff/2
%%
%% Microseconds elapsed from the first timestamp to the second: the
%% arguments of timer:now_diff/2 in chronological order.

now_diff(Earlier, Later) ->
    timer:now_diff(Later, Earlier).

%% jitter/2
%%
%% The watchdog timeout adjusted by a jitter of Delta milliseconds. A
%% timer given as the callback of the WD macro has no jitter: its
%% value is returned as is.

jitter(?WD(Timeout), _) ->
    Timeout;
jitter(Timeout, Delta) ->
    Timeout + Delta.

%% hostname/0
%%
%% Generate a unique name from the three elements of a now()
%% timestamp, joined with dashes. Used as service name and in the
%% Origin-Host of a service.
hostname() ->
    {Mega, Sec, Micro} = now(),
    lists:flatten(io_lib:format("~p-~p-~p", [Mega, Sec, Micro])).

%% putr/2
%%
%% Store a value in the process dictionary under a key private to
%% this module. Returns the previous value, or undefined if none.
putr(Name, Value) ->
    Key = {?MODULE, Name},
    put(Key, Value).

%% getr/1
%%
%% Read a value stored with putr/2 from the process dictionary.
%% Returns undefined if nothing is stored under the key.
getr(Name) ->
    Key = {?MODULE, Name},
    get(Key).