From 899120dd73724793c787db3227ecb1ba42c00381 Mon Sep 17 00:00:00 2001
From: Anders Svensson
Date: Wed, 27 Feb 2013 10:30:54 +0100
Subject: Add transport_opt() watchdog_config
To make the number of watchdogs sent before the transitions REOPEN ->
OKAY and OKAY -> SUSPECT configurable. Using anything other than the
default config is non-standard and should only be used for test.
---
lib/diameter/doc/src/diameter.xml | 24 ++++++
lib/diameter/src/base/diameter.erl | 1 +
lib/diameter/src/base/diameter_watchdog.erl | 111 ++++++++++++++++++++++------
3 files changed, 114 insertions(+), 22 deletions(-)
diff --git a/lib/diameter/doc/src/diameter.xml b/lib/diameter/doc/src/diameter.xml
index 379e9f0738..75e95a9a3d 100644
--- a/lib/diameter/doc/src/diameter.xml
+++ b/lib/diameter/doc/src/diameter.xml
@@ -1125,6 +1125,30 @@ modules in order until one establishes a connection within the
corresponding timeout (see below) or all fail.
+
+{watchdog_config, [{okay|suspect, non_neg_integer()}]}
+-
+
+Specifies configuration that alters the behaviour of the watchdog
+state machine.
+On key okay, the non-negative number of answered DWR
+messages required before transitioning from REOPEN to OKAY.
+On key suspect, the positive number of unanswered DWR messages
+before transitioning from OKAY to SUSPECT, or 0 to never make this
+transition.
+
+
+Defaults to [{okay, 3}, {suspect, 1}].
+Not specifying a key is equivalent to specifying
+the default value for that key.
+
+
+The default value is as required by RFC 3539: changing it results
+in non-standard behaviour that should only be used to simulate
+misbehaving nodes during test.
+
+
+
{watchdog_timer, TwInit}
-
diff --git a/lib/diameter/src/base/diameter.erl b/lib/diameter/src/base/diameter.erl
index c67fba5f89..5f06cef020 100644
--- a/lib/diameter/src/base/diameter.erl
+++ b/lib/diameter/src/base/diameter.erl
@@ -336,6 +336,7 @@ call(SvcName, App, Message) ->
| {length_errors, exit | handle | discard}
| {reconnect_timer, 'Unsigned32'()}
| {watchdog_timer, 'Unsigned32'() | {module(), atom(), list()}}
+ | {watchdog_config, [{okay|suspect, non_neg_integer()}]}
| {private, any()}.
%% Predicate passed to remove_transport/2
diff --git a/lib/diameter/src/base/diameter_watchdog.erl b/lib/diameter/src/base/diameter_watchdog.erl
index 073a415d10..0b32974efe 100644
--- a/lib/diameter/src/base/diameter_watchdog.erl
+++ b/lib/diameter/src/base/diameter_watchdog.erl
@@ -47,6 +47,12 @@
-define(BASE, ?DIAMETER_DICT_COMMON).
+-define(IS_NATURAL(N), (is_integer(N) andalso 0 =< N)).
+
+-record(config,
+ {suspect = 1 :: non_neg_integer(), %% OKAY -> SUSPECT after this many unanswered DWRs; 0 = never
+ okay = 3 :: non_neg_integer()}). %% REOPEN -> OKAY after this many DWAs; 0 = skip REOPEN
+
-record(watchdog,
{%% PCB - Peer Control Block; see RFC 3539, Appendix A
status = initial :: initial | okay | suspect | down | reopen,
@@ -54,7 +60,8 @@
tw :: 6000..16#FFFFFFFF | {module(), atom(), list()},
%% {M,F,A} -> integer() >= 0
num_dwa = 0 :: -1 | non_neg_integer(),
- %% number of DWAs received during reopen
+ %% number of DWAs received in reopen, or number of
+ %% unanswered DWRs in okay before moving to suspect
%% end PCB
parent = self() :: pid(), %% service process
transport :: pid() | undefined, %% peer_fsm process
@@ -64,7 +71,8 @@
%% term passed into diameter_service with incoming message
sequence :: diameter:sequence(), %% mask
restrict :: {diameter:restriction(), boolean()},
- shutdown = false :: boolean()}).
+ shutdown = false :: boolean(),
+ config :: #config{}}).
%% ---------------------------------------------------------------------------
%% start/2
@@ -129,7 +137,8 @@ i({Ack, T, Pid, {RecvData,
receive_data = RecvData,
dictionary = Dict0,
sequence = Mask,
- restrict = {Restrict, lists:member(node(), Nodes)}}.
+ restrict = {Restrict, lists:member(node(), Nodes)},
+ config = config(Opts)}.
wait(Ref, Pid) ->
receive
@@ -139,6 +148,27 @@ wait(Ref, Pid) ->
exit({shutdown, D})
end.
+%% config/1
+%%
+%% Map the watchdog_config transport option onto a #config{} record.
+%% Counts for SUSPECT to DOWN and REOPEN to DOWN could be added, but aren't.
+
+config(Opts) ->
+ Config = proplists:get_value(watchdog_config, Opts, []), %% absent option => record defaults
+ is_list(Config) orelse config_error({watchdog_config, Config}), %% config_error/1 is defined outside this hunk
+ lists:foldl(fun config/2, #config{}, Config). %% folded left to right: a later tuple overrides an earlier one
+
+config({suspect, N}, Rec) %% fold step for config/1: apply one {Key, N} tuple to the record
+ when ?IS_NATURAL(N) ->
+ Rec#config{suspect = N}; %% count used for OKAY -> SUSPECT
+
+config({okay, N}, Rec)
+ when ?IS_NATURAL(N) ->
+ Rec#config{okay = N}; %% count used for REOPEN -> OKAY
+
+config(T, _) -> %% any other term: unknown key, or N not a non-negative integer
+ config_error(T).
+
%% start/5
start(T, Opts, Mask, Nodes, Dict0, Svc) ->
@@ -219,6 +249,17 @@ handle_info(T, #watchdog{} = State) ->
?LOG(stop, T),
event(T, State, State#watchdog{status = down}),
{stop, {shutdown, T}, State}
+ end;
+
+handle_info(T, State) -> %% started in old code
+ handle_info(T, upgrade(State)).
+
+upgrade(State) -> %% convert a state tuple that lacks the trailing config field
+ case erlang:append_element(State, #config{}) of %% config is the last #watchdog{} field; use defaults
+ #watchdog{status = okay, config = #config{suspect = OS}} = S ->
+ S#watchdog{num_dwa = OS}; %% in okay, num_dwa starts from the suspect count
+ #watchdog{} = S -> %% other states keep their existing num_dwa
+ S
end.
close({'DOWN', _, process, TPid, {shutdown, Reason}},
@@ -331,11 +372,13 @@ transition({accepted = T, TPid}, #watchdog{transport = TPid,
transition({open, TPid, Hosts, _} = Open,
#watchdog{transport = TPid,
status = initial,
- restrict = {_, R}}
+ restrict = {_,R},
+ config = #config{suspect = OS}}
= S) ->
case okay(getr(restart), Hosts, R) of
okay ->
- set_watchdog(S#watchdog{status = okay});
+ set_watchdog(S#watchdog{status = okay,
+ num_dwa = OS});
reopen ->
transition(Open, S#watchdog{status = down})
end;
@@ -347,15 +390,22 @@ transition({open, TPid, Hosts, _} = Open,
transition({open = Key, TPid, _Hosts, T},
#watchdog{transport = TPid,
- status = down}
+ status = down,
+ config = #config{suspect = OS,
+ okay = RO}}
= S) ->
- %% Store the info we need to notify the parent to reopen the
- %% connection after the requisite DWA's are received, at which
- %% time we eraser(open). The reopen message is a later addition,
- %% to communicate the new capabilities as soon as they're known.
- putr(Key, {TPid, T}),
- set_watchdog(send_watchdog(S#watchdog{status = reopen,
- num_dwa = 0}));
+ case RO of
+ 0 -> %% non-standard: skip REOPEN
+ set_watchdog(S#watchdog{status = okay,
+ num_dwa = OS});
+ _ ->
+ %% Store the info we need to notify the parent to reopen
+ %% the connection after the requisite DWA's are received,
+ %% at which time we eraser(open).
+ putr(Key, {TPid, T}),
+ set_watchdog(send_watchdog(S#watchdog{status = reopen,
+ num_dwa = 0}))
+ end;
%% OKAY Connection down CloseConnection()
%% Failover()
@@ -553,22 +603,27 @@ rcv(_, #watchdog{status = okay} = S) ->
%% SUSPECT Receive non-DWA Failback()
%% SetWatchdog() OKAY
-rcv('DWA', #watchdog{status = suspect} = S) ->
+rcv('DWA', #watchdog{status = suspect, config = #config{suspect = OS}} = S) ->
set_watchdog(S#watchdog{status = okay,
+ num_dwa = OS,
pending = false});
-rcv(_, #watchdog{status = suspect} = S) ->
- set_watchdog(S#watchdog{status = okay});
+rcv(_, #watchdog{status = suspect, config = #config{suspect = OS}} = S) ->
+ set_watchdog(S#watchdog{status = okay,
+ num_dwa = OS});
%% REOPEN Receive DWA & Pending = FALSE
%% NumDWA == 2 NumDWA++
%% Failback() OKAY
rcv('DWA', #watchdog{status = reopen,
- num_dwa = 2 = N}
- = S) ->
+ num_dwa = N,
+ config = #config{suspect = OS,
+ okay = RO}}
+ = S)
+ when N+1 == RO ->
S#watchdog{status = okay,
- num_dwa = N+1,
+ num_dwa = OS,
pending = false};
%% REOPEN Receive DWA & Pending = FALSE
@@ -607,9 +662,16 @@ timeout(#watchdog{status = T,
%% Pending SetWatchdog() SUSPECT
timeout(#watchdog{status = okay,
- pending = true}
- = S) ->
- S#watchdog{status = suspect};
+ pending = true,
+ num_dwa = N}
+ = S) ->
+ case N of
+ 1 ->
+ S#watchdog{status = suspect};
+ _ -> %% non-standard
+ send_watchdog(S#watchdog{pending = false,
+ num_dwa = decr(N)})
+ end;
%% SUSPECT Timer expires CloseConnection()
%% SetWatchdog() DOWN
@@ -663,6 +725,11 @@ timeout(#watchdog{status = T} = S)
T == down ->
restart(S).
+decr(0 = N) -> %% decrement that never goes below zero
+ N; %% 0 stays 0, so the okay timeout clause never sees 1 and never moves to SUSPECT
+decr(N) ->
+ N-1.
+
%% restart/1
restart(#watchdog{transport = undefined} = S) ->
--
cgit v1.2.3