%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2010-2017. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
%%
%% Tests of the RFC3539 watchdog state machine as implemented by
%% module diameter_watchdog.
%%
-module(diameter_watchdog_SUITE).
-export([suite/0,
all/0,
init_per_suite/1,
end_per_suite/1]).
%% testcases
-export([reopen/0, reopen/1, reopen/4, reopen/6,
suspect/1, suspect/4,
okay/1, okay/4]).
-export([id/1, %% jitter callback
run1/1,
abuse/1,
abuse/2]).
%% diameter_app callbacks
-export([peer_up/3,
peer_down/3]).
%% diameter_tcp message_cb
-export([message/3]).
-include("diameter.hrl").
-include("diameter_ct.hrl").
%% ===========================================================================
-define(util, diameter_util).
-define(BASE, ?DIAMETER_DICT_COMMON).
-define(REALM, "erlang.org").
-define(ADDR, {127,0,0,1}).
%% Config for diameter:start_service/2.
-define(SERVICE(Name),
[{'Origin-Host', Name ++ "." ++ ?REALM},
{'Origin-Realm', ?REALM},
{'Host-IP-Address', [?ADDR]},
{'Vendor-Id', 42},
{'Product-Name', "OTP/diameter"},
{'Auth-Application-Id', [0 = ?BASE:id()]},
{application, [{alias, Name},
{dictionary, ?BASE},
{module, ?MODULE}]}]).
%% Watchdog timer as a callback.
-define(WD(T), {?MODULE, id, [T]}).
%% Watchdog timers used by the testcases.
-define(WD_TIMERS, [10000, ?WD(10000)]).
%% Watchdog timer of the misbehaving node.
-define(PEER_WD, 10000).
%% A timeout that ensures one watchdog. To ensure only one watchdog
%% requires (Wd + 2000) + 1000 < 2*(Wd - 2000) ==> 7000 < Wd for the
%% case with random jitter.
-define(ONE_WD(Wd), jitter(Wd,2000) + 1000).
-define(INFO(T), #diameter_event{info = T}).
%% Receive an event message from diameter.
-define(EVENT(T), %% apply to not bind T_
apply(fun() ->
receive ?INFO(T = T_) -> log_event(T_) end
end,
[])).
%% Receive a watchdog event.
-define(WD_EVENT(Ref), log_wd(element(4, ?EVENT({watchdog, Ref, _, _, _})))).
-define(WD_EVENT(Ref, Ms),
apply(fun() ->
receive ?INFO({watchdog, Ref, _, T_, _}) ->
log_wd(T_)
after Ms ->
false
end
end,
[])).
%% Log to make failures identifiable.
-define(LOG(T), ?LOG("~p", [T])).
-define(LOG(F,A), ct:pal("~p: " ++ F, [self() | A])).
-define(WARN(F,A), ct:pal(error, "~p: " ++ F, [self() | A])).
%% ===========================================================================
suite() ->
[{timetrap, {seconds, 90}}].
all() ->
[reopen,
suspect,
okay].
init_per_suite(Config) ->
ok = diameter:start(),
Config.
end_per_suite(_Config) ->
ok = diameter:stop().
%% ===========================================================================
%% # reopen/1
%% ===========================================================================
%% Test the watchdog state machine for the required failover, failback
%% and reopen behaviour by examining watchdog events.
reopen() ->
[{timetrap, {minutes, 5}}]. %% 20 watchdogs @ 15 sec
reopen(_) ->
[] = run([[reopen, T, W, N, M]
|| T <- [listen, connect], %% watchdog to test
W <- ?WD_TIMERS, %% watchdog_timer value
N <- [0,1,2], %% DWR's to answer before ignoring
M <- ['DWR', 'DWA', 'RAA']]). %% how to induce failback
reopen(Test, Wd, N, M) ->
%% Publish a ref ensure the connecting transport is added only
%% once events from the listening transport are subscribed to.
Ref = make_ref(),
[] = run([[reopen, T, Test, Ref, Wd, N, M] || T <- [listen, connect]]).
%% reopen/6
reopen(Type, Test, Ref, Wd, N, M) ->
{SvcName, TRef} = start(Type, Ref, cfg(Type, Test, Wd)),
reopen(Type, Test, SvcName, TRef, Wd, N, M).
cfg(Type, Type, Wd) ->
{Wd, [], false};
cfg(_Type, _Test, _Wd) ->
{?WD(?PEER_WD), [{okay, 0}], true}.
%% reopen/7
%% The watchdog to be tested.
reopen(Type, Type, SvcName, Ref, Wd, N, M) ->
?LOG("node ~p", [[Type, SvcName, Ref, Wd, N, M]]),
%% Connection should come up immediately as a consequence of
%% starting the watchdog process. In the accepting case this
%% results in a new watchdog on a transport waiting for a new
%% connection.
{initial, okay} = ?WD_EVENT(Ref),
?EVENT({up, Ref, _, _, #diameter_packet{}}),
%% OKAY Timer expires & Failover()
%% Pending SetWatchdog() SUSPECT
%%
%% The peer replies to N DWR's before becoming silent, we should
%% go down after N+2 watchdog_timer expirations: that is, after
%% the first unanswered DWR. Knowing the min/max watchdog timeout
%% values gives the time interval in which the event is expected.
[0,0,0,0] = wd_counts(SvcName),
{okay, suspect} = ?WD_EVENT(Ref),
?EVENT({down, Ref, _, _}),
%% N received DWA's
[_,_,_,N] = wd_counts(SvcName),
%% SUSPECT Receive DWA Pending = FALSE
%% Failback()
%% SetWatchdog() OKAY
%%
%% SUSPECT Receive non-DWA Failback()
%% SetWatchdog() OKAY
%%
%% The peer sends a message before the expiry of another watchdog
%% to induce failback.
{suspect, okay} = ?WD_EVENT(Ref),
?EVENT({up, Ref, _, _}),
%% N+1 sent DWR's, N/N+1 received DWA's
R1 = N+1,
A1 = choose(M == 'DWA', R1, N),
[R1,_,_,A1] = wd_counts(SvcName),
%% OKAY Timer expires & SendWatchdog()
%% !Pending SetWatchdog()
%% Pending = TRUE OKAY
%%
%% OKAY Timer expires & Failover()
%% Pending SetWatchdog() SUSPECT
%%
%% The peer is now ignoring all watchdogs so the connection goes
%% back down after either one or two watchdog expiries, depending
%% on whether or not DWA restored the connection.
{okay, suspect} = ?WD_EVENT(Ref),
?EVENT({down, Ref, _, _}),
%% SUSPECT Timer expires CloseConnection()
%% SetWatchdog() DOWN
%%
%% Non-response brings the connection down after another timeout.
{suspect, down} = ?WD_EVENT(Ref),
R2 = R1 + choose(M == 'DWA', 1, 0),
A2 = A1,
[R2,_,_,A2] = wd_counts(SvcName),
%% DOWN Timer expires AttemptOpen()
%% SetWatchdog() DOWN
%%
%% DOWN Connection up NumDWA = 0
%% SendWatchdog()
%% SetWatchdog()
%% Pending = TRUE REOPEN
%%
%% The connection is reestablished after another timeout.
recv_reopen(Type, Ref),
%% REOPEN Receive non-DWA Throwaway() REOPEN
%%
%% REOPEN Receive DWA & Pending = FALSE
%% NumDWA < 2 NumDWA++ REOPEN
%%
%% REOPEN Receive DWA & Pending = FALSE
%% NumDWA == 2 NumDWA++
%% Failback() OKAY
%%
%% REOPEN Timer expires & SendWatchdog()
%% !Pending SetWatchdog()
%% Pending = TRUE REOPEN
%%
%% An exchange of 3 watchdogs (the first directly after
%% capabilities exchange) brings the connection back up.
{reopen, okay} = ?WD_EVENT(Ref),
?EVENT({up, Ref, _, _, #diameter_packet{}}),
%% Three DWR's have been answered.
R3 = R2 + 3,
A3 = A2 + 3,
[R3,_,_,A3] = wd_counts(SvcName),
%% Non-response brings it down again.
{okay, suspect} = ?WD_EVENT(Ref),
?EVENT({down, Ref, _, _}),
{suspect, down} = ?WD_EVENT(Ref),
R4 = R3 + 1,
A4 = A3,
[R4,_,_,A4] = wd_counts(SvcName),
%% Reestablish after another watchdog.
recv_reopen(Type, Ref),
%% REOPEN Timer expires & NumDWA = -1
%% Pending & SetWatchdog()
%% NumDWA >= 0 REOPEN
%%
%% REOPEN Timer expires & CloseConnection()
%% Pending & SetWatchdog()
%% NumDWA < 0 DOWN
%%
%% Peer is now ignoring all watchdogs go down again after 2
%% timeouts.
{reopen, down} = ?WD_EVENT(Ref);
%% The misbehaving peer.
reopen(Type, _, SvcName, Ref, Wd, N, M) ->
?LOG("peer ~p", [[Type, SvcName, Ref, Wd, N, M]]),
%% First transport process.
{initial, okay} = ?WD_EVENT(Ref),
?EVENT({up, Ref, _, _, #diameter_packet{}}),
reg(Ref, SvcName, {SvcName, {Wd,N,M}}),
{okay, down} = ?WD_EVENT(Ref),
%% Second transport process.
?EVENT({watchdog, Ref, _, {_, okay}, _}),
reg(Ref, SvcName, 3), %% answer 3 watchdogs then fall silent
?EVENT({watchdog, Ref, _, {_, down}, _}),
%% Third transport process.
?EVENT({watchdog, Ref, _, {_, okay}, _}),
reg(Ref, SvcName, 0), %% disable outgoing DWA
?EVENT({watchdog, Ref, _, {_, down}, _}),
ok.
log_wd({From, To} = T) ->
?LOG("~p -> ~p", [From, To]),
T.
log_event(E) ->
T = element(1,E),
T == watchdog orelse ?LOG("~p", [T]),
E.
%% recv_reopen/2
recv_reopen(connect, Ref) ->
{down, reopen} = ?WD_EVENT(Ref),
?EVENT({reconnect, Ref, _});
recv_reopen(listen, Ref) ->
{_, reopen} = ?WD_EVENT(Ref).
%% reg/3
%%
%% Lookup the pid of the transport process and publish a term for
%% message/3 to lookup.
reg(TRef, SvcName, T) ->
TPid = tpid(TRef, diameter:service_info(SvcName, transport)),
true = diameter_reg:add_new({?MODULE, TPid, T}).
%% tpid/2
tpid(Ref, [[{ref, Ref},
{type, connect},
{options, _},
{watchdog, _},
{peer, _},
{apps, _},
{caps, _},
{port, [{owner, TPid} | _]}
| _]]) ->
TPid;
tpid(Ref, [[{ref, Ref},
{type, listen},
{options, _},
{accept, As}
| _]]) ->
[[{watchdog, _},
{peer, _},
{apps, _},
{caps, _},
{port, [{owner, TPid} | _]}
| _]]
= lists:filter(fun([{watchdog, {_,_,S}} | _]) ->
S == okay orelse S == reopen
end,
As),
TPid.
%% ===========================================================================
%% # suspect/1
%% ===========================================================================
%% Configure transports to require a set number of watchdog timeouts
%% before moving from OKAY to SUSPECT.
suspect(_) ->
[] = run([[abuse, [suspect, N]] || N <- [0,1,3]]).
suspect(Type, Fake, Ref, N)
when is_reference(Ref) ->
{SvcName, TRef}
= start(Type, Ref, {?WD(10000), [{suspect, N}], Fake}),
{initial, okay} = ?WD_EVENT(TRef),
suspect(TRef, Fake, SvcName, N);
suspect(TRef, true, SvcName, _) ->
reg(TRef, SvcName, 0), %% disable outgoing DWA
{okay, _} = ?WD_EVENT(TRef);
suspect(TRef, false, SvcName, 0) -> %% SUSPECT disabled
%% Wait 2+ watchdogs and see that only one watchdog has been sent.
false = ?WD_EVENT(TRef, 28000),
[1,0,0,0] = wd_counts(SvcName);
suspect(TRef, false, SvcName, N) ->
%% Check that no watchdog transition takes place within N+
%% watchdogs ...
false = ?WD_EVENT(TRef, N*10000+8000),
[1,0,0,0] = wd_counts(SvcName),
%% ... but that the connection then becomes suspect ...
{okay, suspect} = ?WD_EVENT(TRef, 10000),
[1,0,0,0] = wd_counts(SvcName),
%% ... and goes down.
{suspect, down} = ?WD_EVENT(TRef, 18000),
[1,0,0,0] = wd_counts(SvcName).
%% abuse/1
abuse(F) ->
[] = run([[abuse, F, T] || T <- [listen, connect]]).
abuse(F, [_,_,_|_] = Args) ->
?LOG("~p", [Args]),
apply(?MODULE, F, Args);
abuse([F|A], Test) ->
Ref = make_ref(),
[] = run([[abuse, F, [T, T == Test, Ref] ++ A]
|| T <- [listen, connect]]);
abuse(F, Test) ->
abuse([F], Test).
%% ===========================================================================
%% # okay/1
%% ===========================================================================
%% Configure the number of watchdog exchanges before moving from
%% REOPEN to OKAY.
okay(_) ->
[] = run([[abuse, [okay, N]] || N <- [0,2,3]]).
okay(Type, Fake, Ref, N)
when is_reference(Ref) ->
{SvcName, TRef}
= start(Type, Ref, {?WD(10000),
[{okay, choose(Fake, 0, N)}],
Fake}),
{initial, okay} = ?WD_EVENT(TRef),
okay(TRef,
Fake,
SvcName,
choose(Type == listen, initial, down),
N).
okay(TRef, true, SvcName, Down, _) ->
reg(TRef, SvcName, 0), %% disable outgoing DWA
{okay, down} = ?WD_EVENT(TRef),
{Down, okay} = ?WD_EVENT(TRef),
reg(TRef, SvcName, -1), %% enable outgoing DWA
{okay, down} = ?WD_EVENT(TRef);
okay(TRef, false, SvcName, Down, N) ->
{okay, suspect} = ?WD_EVENT(TRef),
[1,0,0,0] = wd_counts(SvcName),
{suspect, down} = ?WD_EVENT(TRef),
ok(TRef, SvcName, Down, N).
ok(TRef, SvcName, Down, 0) ->
%% Connection comes up without watchdog exchange.
{Down, okay} = ?WD_EVENT(TRef),
[1,0,0,0] = wd_counts(SvcName),
%% Wait 2+ watchdog timeouts to see that the connection stays up
%% and two watchdogs are exchanged.
false = ?WD_EVENT(TRef, 28000),
[3,0,0,2] = wd_counts(SvcName);
ok(TRef, SvcName, Down, N) ->
%% Connection required watchdog exchange before reaching OKAY.
{Down, reopen} = ?WD_EVENT(TRef),
{reopen, okay} = ?WD_EVENT(TRef),
%% One DWR was sent in moving to expect, plus N more to reopen the
%% connection.
N1 = N+1,
[N1,0,0,N] = wd_counts(SvcName).
%% ===========================================================================
%% wd_counts/1
wd_counts(SvcName) ->
[Info] = diameter:service_info(SvcName, transport),
{_, Counters} = lists:keyfind(statistics, 1, Info),
[proplists:get_value({{0,280,R}, D}, Counters, 0) || D <- [send,recv],
R <- [1,0]].
%% start/3
start(Type, Ref, T) ->
Name = hostname(),
true = diameter:subscribe(Name),
ok = diameter:start_service(Name, [{monitor, self()} | ?SERVICE(Name)]),
{ok, TRef} = diameter:add_transport(Name, {Type, opts(Type, Ref, T)}),
true = diameter_reg:add_new({Type, Ref, Name}),
{Name, TRef}.
opts(Type, Ref, {Timer, Config, Fake})
when is_boolean(Fake) ->
[{transport_module, diameter_tcp},
{transport_config, mod(Fake) ++ [{ip, ?ADDR}, {port, 0}]
++ cfg(Type, Ref)},
{watchdog_timer, Timer},
{watchdog_config, Config}].
mod(B) ->
[{message_cb, [fun message/3, capx]} || B].
cfg(listen, _) ->
[];
cfg(connect, Ref) ->
[{{_, _, SvcName}, _Pid}] = diameter_reg:wait({listen, Ref, '_'}),
[[{ref, LRef} | _]] = diameter:service_info(SvcName, transport),
[LP] = ?util:lport(tcp, LRef),
[{raddr, ?ADDR}, {rport, LP}].
%% ===========================================================================
%% message/3
message(send, Bin, X) ->
send(Bin, X);
message(recv, Bin, _) ->
[Bin];
message(_, _, _) ->
[].
%% send/2
%% First outgoing message from a new transport process is CER/CEA.
%% Remaining outgoing messages are either DWR or DWA.
send(Bin, capx) ->
<<_:32, _:8, 257:24, _/binary>> = Bin, %% assert on CER/CEA
[Bin, fun message/3, init];
%% Outgoing DWR: fake reception of DWA. Use the fact that AVP values
%% are ignored. This is to ensure that the peer's watchdog state
%% transitions are only induced by responses to messages it sends.
send(<<_:32, 1:1, _:7, 280:24, _:32, EId:32, HId:32, _/binary>>, _) ->
Pkt = #diameter_packet{header = #diameter_header{version = 1,
end_to_end_id = EId,
hop_by_hop_id = HId},
msg = ['DWA', {'Result-Code', 2001},
{'Origin-Host', "XXX"},
{'Origin-Realm', ?REALM}]},
#diameter_packet{bin = Bin} = diameter_codec:encode(?BASE, Pkt),
[recv, Bin];
%% First outgoing DWA.
send(Bin, init) ->
[{{?MODULE, _, T}, _}] = diameter_reg:wait({?MODULE, self(), '_'}),
send(Bin, T);
%% First transport process.
send(Bin, {SvcName, {_,_,_} = T}) ->
[{'Origin-Host', _} = OH, {'Origin-Realm', _} = OR | _]
= ?SERVICE(SvcName),
putr(origin, [OH, OR]),
send(Bin, T);
%% Discard DWA, failback after another timeout in the peer.
send(Bin, {Wd, 0 = No, Msg}) ->
Origin = getr(origin),
[{defer, ?ONE_WD(Wd), [msg(Msg, Bin, Origin)]}, fun message/3, No];
%% Send DWA while we're in the mood (aka 0 < N).
send(Bin, {Wd, N, Msg}) ->
[Bin, fun message/3, {Wd, N-1, Msg}];
%% Discard DWA.
send(_Bin, 0 = No) ->
[fun message/3, No];
%% Send DWA.
send(<<_:32, 0:1, _:7, 280:24, _/binary>> = DWA, N) ->
[DWA, fun message/3, N-1].
%% msg/2
msg('DWA', Bin, _Origin) ->
Bin;
msg(Msg, _Bin, Origin) ->
#diameter_packet{bin = Bin}
= diameter_codec:encode(?BASE, msg(Msg, Origin)),
Bin.
msg('DWR' = M, T) ->
[M | T];
msg('RAA', T) ->
['RAA', {'Session-Id', diameter:session_id("abc")},
{'Result-Code', 2001}
| T].
%% An unexpected answer is discarded after passing through the
%% watchdog state machine.
%% ===========================================================================
peer_up(_SvcName, _Peer, S) ->
S.
peer_down(_SvcName, _Peer, S) ->
S.
%% ===========================================================================
choose(true, X, _) -> X;
choose(false, _, X) -> X.
%% id/1
%%
%% Jitter callback.
id(T) ->
T.
%% run/1
%%
%% A more useful badmatch in case of failure.
run(Fs) ->
?util:run([{?MODULE, [run1, F]} || F <- Fs]).
run1([F|A]) ->
ok = try
apply(?MODULE, F, A),
ok
catch
E:R ->
S = erlang:get_stacktrace(),
?WARN("~p", [{A, E, R, S}]),
S
end.
%% jitter/2
jitter(?WD(T), _) ->
T;
jitter(T,D) ->
T+D.
%% Generate a unique hostname for the faked peer.
hostname() ->
?util:unique_string().
putr(Key, Val) ->
put({?MODULE, Key}, Val).
getr(Key) ->
get({?MODULE, Key}).