author:    Anders Svensson <[email protected]>   2013-02-12 12:51:56 +0100
committer: Anders Svensson <[email protected]>   2013-02-12 12:51:56 +0100
commit:    1e7b3534eeea4761bb45ff31bcdaf57cd66a7cb7
tree:      0bb4c01bc6c303154814848e4a52b07a16f161bd /lib/diameter
parent:    5bb22560c350baa834196835640a5aab21e97f8a
parent:    551bdf0da65bcb9838ddc613e0507e2871dcd5dc
Merge branch 'anders/diameter/reopen/OTP-10692'
* anders/diameter/reopen/OTP-10692:
Fix testcases in event suite
Fix faulty watchdog transition INITIAL -> DOWN
Fix faulty watchdog transition DOWN -> INITIAL
Comment fixes
Rename records peer/conn -> watchdog/peer in diameter_service
Add testcases to traffic suite
Simplify watchdog transitions in service process
Simplify transport shutdown
Remove upgrade code not needed after application restart
Tweak (make) silent rules support
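
The common thread in several of these commits is a start/acknowledge handshake: the starter monitors the new process before releasing it, and the child blocks until either the ack or the starter's 'DOWN' arrives, so an early death can never go unobserved. Below is a minimal, self-contained sketch of that pattern; the module name ack_start and the trivial loop are invented for illustration, and only the monitor-before-ack ordering mirrors diameter_peer_fsm:start/3 and diameter_watchdog:start/2 in the diff that follows.

    -module(ack_start).
    -export([start/0, init/2]).

    %% Start a child, monitor it, then release it with an ack.
    start() ->
        Ack = make_ref(),
        Pid = spawn(?MODULE, init, [Ack, self()]),
        try
            {erlang:monitor(process, Pid), Pid}  %% monitor first ...
        after
            Pid ! Ack                            %% ... then let the child run
        end.

    %% The child does nothing until the starter's monitor is in place,
    %% exiting with a meaningful reason if the starter dies first.
    init(Ack, Parent) ->
        MRef = erlang:monitor(process, Parent),
        receive
            Ack ->
                loop();
            {'DOWN', MRef, process, Parent, _} = D ->
                exit({shutdown, D})
        end.

    loop() ->
        receive stop -> ok end.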
Diffstat (limited to 'lib/diameter')
-rw-r--r--  lib/diameter/src/Makefile                     |  14
-rw-r--r--  lib/diameter/src/base/diameter_peer_fsm.erl   | 111
-rw-r--r--  lib/diameter/src/base/diameter_reg.erl        |   9
-rw-r--r--  lib/diameter/src/base/diameter_service.erl    | 570
-rw-r--r--  lib/diameter/src/base/diameter_watchdog.erl   | 206
-rw-r--r--  lib/diameter/src/transport/diameter_tcp.erl   |   2
-rw-r--r--  lib/diameter/test/diameter_event_SUITE.erl    |   2
-rw-r--r--  lib/diameter/test/diameter_traffic_SUITE.erl  |  13
-rw-r--r--  lib/diameter/test/diameter_watchdog_SUITE.erl |  10
9 files changed, 411 insertions(+), 526 deletions(-)
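
With the connection_up/connection_down messages gone, watchdog state transitions become the single source of truth, and they remain visible to applications through service events. A hedged sketch of observing them follows, assuming only the documented diameter:subscribe/1 call and the #diameter_event{} record from diameter.hrl; the module name wd_events and the printout are invented for illustration.

    -module(wd_events).
    -export([trace/1]).

    -include_lib("diameter/include/diameter.hrl").

    %% Subscribe to a service and print its watchdog transitions.
    trace(SvcName) ->
        diameter:subscribe(SvcName),
        loop(SvcName).

    loop(SvcName) ->
        receive
            #diameter_event{service = SvcName,
                            info = {watchdog, _Ref, _PeerRef, {From, To}, _}} ->
                io:format("watchdog: ~p -> ~p~n", [From, To]),
                loop(SvcName);
            #diameter_event{service = SvcName} ->
                loop(SvcName)  %% ignore other events (start, closed, ...)
        end.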
diff --git a/lib/diameter/src/Makefile b/lib/diameter/src/Makefile
index a08c204a23..2b02f1de2b 100644
--- a/lib/diameter/src/Makefile
+++ b/lib/diameter/src/Makefile
@@ -119,7 +119,8 @@ ERL_COMPILE_FLAGS += \
 # erl/hrl from dictionary file.
 
 gen/diameter_gen_%.erl gen/diameter_gen_%.hrl: dict/%.dia
-	$(dia_verbose)../bin/diameterc -o gen -i $(EBIN) $<
+	$(dia_verbose) \
+	../bin/diameterc -o gen -i $(EBIN) $<
 
 opt: $(TARGET_FILES)
 
@@ -134,11 +135,13 @@ debug:
 # The dictionary parser.
 gen/$(DICT_YRL).erl: compiler/$(DICT_YRL).yrl
-	$(yecc_verbose)$(ERLC) -Werror -o $(@D) $<
+	$(yecc_verbose) \
+	$(ERLC) -Werror -o $(@D) $<
 
 # Generate the app file.
 $(APP_TARGET): $(APP_SRC) ../vsn.mk modules.mk
-	$(gen_verbose)M=`echo $(notdir $(APP_MODULES)) | tr ' ' ,`; \
+	$(gen_verbose) \
+	M=`echo $(notdir $(APP_MODULES)) | tr ' ' ,`; \
 	R=`echo $(REGISTERED) | tr ' ' ,`; \
 	sed -e 's;%VSN%;$(VSN);' \
 	    -e "s;%MODULES%;$$M;" \
@@ -146,7 +149,8 @@ $(APP_TARGET): $(APP_SRC) ../vsn.mk modules.mk
 	    $< > $@
 
 $(APPUP_TARGET): $(APPUP_SRC) ../vsn.mk
-	$(vsn_verbose)sed -e 's;%VSN%;$(VSN);' $< > $@
+	$(vsn_verbose) \
+	sed -e 's;%VSN%;$(VSN);' $< > $@
 
 app: $(APP_TARGET) $(APPUP_TARGET)
 dict: $(DICT_ERLS)
@@ -262,7 +266,7 @@ depend: depend.mk
 # Generate dependencies makefile.
 depend.mk: depend.sed $(MODULES:%=%.erl) Makefile
-	$(gen_verbose)(for f in $(MODULES); do \
+	(for f in $(MODULES); do \
 	 (echo $$f; cat $$f.erl) | sed -f $<; \
 	done) \
 	> $@
diff --git a/lib/diameter/src/base/diameter_peer_fsm.erl b/lib/diameter/src/base/diameter_peer_fsm.erl
index 5dab6214b1..de341741db 100644
--- a/lib/diameter/src/base/diameter_peer_fsm.erl
+++ b/lib/diameter/src/base/diameter_peer_fsm.erl
@@ -126,24 +126,17 @@
 %% State Machine rather than closer to the transport. This is what we
 %% now do below: connect/accept call diameter_watchdog and return the
 %% pid of the watchdog process, and the watchdog in turn calls start/3
-%% below to start the process implementing the Peer State Machine. The
-%% former is a "peer" in diameter_service while the latter is a
-%% "conn". In a sense, diameter_service sees the watchdog as
-%% implementing the Peer State Machine and the process implemented
-%% here as being the transport, not being aware of the watchdog at
-%% all.
+%% below to start the process implementing the Peer State Machine.
 %%
-%%% ---------------------------------------------------------------------------
-%%% # start({connect|accept, Ref}, Opts, Service)
-%%%
-%%% Output: Pid
-%%% ---------------------------------------------------------------------------
+%% ---------------------------------------------------------------------------
+%% # start/3
+%% ---------------------------------------------------------------------------
 
 -spec start(T, [Opt], {diameter:sequence(),
                        diameter:restriction(),
                        #diameter_service{}})
-   -> pid()
+   -> {reference(), pid()}
  when T :: {connect|accept, diameter:transport_ref()},
       Opt :: diameter:transport_opt().
@@ -152,9 +145,15 @@
 %% specified on the transport in question. Check here that the list is
 %% still non-empty.
 
-start({_,_} = Type, Opts, MS) ->
-    {ok, Pid} = diameter_peer_fsm_sup:start_child({self(), Type, Opts, MS}),
-    Pid.
+start({_,_} = Type, Opts, S) ->
+    Ack = make_ref(),
+    T = {Ack, self(), Type, Opts, S},
+    {ok, Pid} = diameter_peer_fsm_sup:start_child(T),
+    try
+        {erlang:monitor(process, Pid), Pid}
+    after
+        Pid ! Ack
+    end.
 
 start_link(T) ->
     {ok, _} = proc_lib:start_link(?MODULE,
@@ -163,8 +162,8 @@ start_link(T) ->
                                   infinity,
                                   diameter_lib:spawn_opts(server, [])).
-%%% ---------------------------------------------------------------------------
-%%% ---------------------------------------------------------------------------
+%% ---------------------------------------------------------------------------
+%% ---------------------------------------------------------------------------
 
 %% init/1
 
@@ -172,12 +171,13 @@ init(T) ->
     proc_lib:init_ack({ok, self()}),
     gen_server:enter_loop(?MODULE, [], i(T)).
 
-i({WPid, T, Opts, {Mask, Nodes, #diameter_service{applications = Apps,
-                                                  capabilities = LCaps}
-                                = Svc}}) ->
-    [] /= Apps orelse ?ERROR({no_apps, T, Opts}),
+i({Ack, WPid, {M, Ref} = T, Opts, {Mask,
+                                   Nodes,
+                                   #diameter_service{capabilities = LCaps}
+                                   = Svc}}) ->
+    erlang:monitor(process, WPid),
+    wait(Ack, WPid),
     putr(?DWA_KEY, dwa(LCaps)),
-    {M, Ref} = T,
     diameter_stats:reg(Ref),
     {[Cs,Ds], Rest} = proplists:split(Opts, [capabilities_cb, disconnect_cb]),
     putr(?CB_KEY, {Ref, [F || {_,F} <- Cs]}),
@@ -185,7 +185,6 @@ i({WPid, T, Opts, {Mask, Nodes, #diameter_service{applications = Apps,
     putr(?REF_KEY, Ref),
     putr(?SEQUENCE_KEY, Mask),
     putr(?RESTRICT_KEY, Nodes),
-    erlang:monitor(process, WPid),
     {TPid, Addrs} = start_transport(T, Rest, Svc),
     Tmo = proplists:get_value(capx_timeout, Opts, ?EVENT_TIMEOUT),
     ?IS_TIMEOUT(Tmo) orelse ?ERROR({invalid, {capx_timeout, Tmo}}),
@@ -203,6 +202,16 @@ i({WPid, T, Opts, {Mask, Nodes, #diameter_service{applications = Apps,
 %% watchdog start (start/2) succeeds regardless so as not to crash the
 %% service.
 
+%% Wait for the caller to have a monitor to avoid a race with our
+%% death. (Since the exit reason is used in diameter_service.)
+wait(Ref, Pid) ->
+    receive
+        Ref ->
+            ok;
+        {'DOWN', _, process, Pid, _} = D ->
+            exit({shutdown, D})
+    end.
+
 start_transport(T, Opts, #diameter_service{capabilities = LCaps} = Svc) ->
     Addrs0 = LCaps#diameter_caps.host_ip_address,
     start_transport(Addrs0, {T, Opts, Svc}).
@@ -274,13 +283,12 @@ handle_info(T, #state{} = State) ->
             {noreply, S};
         {stop, Reason} ->
             ?LOG(stop, Reason),
-            x(Reason, State);
+            {stop, {shutdown, Reason}, State};
         stop ->
            ?LOG(stop, T),
-            x(T, State)
+            {stop, {shutdown, T}, State}
     catch
         exit: {diameter_codec, encode, _} = Reason ->
-            close_wd(Reason, State#state.parent),
             ?LOG(stop, Reason),
             %% diameter_codec:encode/2 emits an error report. Only
             %% indicate the probable reason here.
@@ -300,10 +308,6 @@ handle_info(T, #state{} = State) ->
 %% succesfully encoded. It's not checked at diameter:add_transport/2
 %% since this can be called before creating the service.
 
-x(Reason, #state{} = S) ->
-    close_wd(Reason, S),
-    {stop, {shutdown, Reason}, S}.
-
 %% terminate/2
 
 terminate(_, _) ->
@@ -314,8 +318,8 @@ terminate(_, _) ->
 code_change(_, State, _) ->
     {ok, State}.
 
-%%% ---------------------------------------------------------------------------
-%%% ---------------------------------------------------------------------------
+%% ---------------------------------------------------------------------------
+%% ---------------------------------------------------------------------------
 
 putr(Key, Val) ->
     put({?MODULE, Key}, Val).
@@ -378,9 +382,8 @@ transition({diameter, {recv, Pkt}}, S) ->
     recv(Pkt, S);
 
 %% Timeout when still in the same state ...
-transition({timeout = T, PS}, #state{state = PS} = S) ->
-    close({capx(PS), T}, S),
-    stop;
+transition({timeout = T, PS}, #state{state = PS}) ->
+    {stop, {capx(PS), T}};
 
 %% ... or not.
 transition({timeout, _}, _) ->
@@ -393,8 +396,6 @@ transition({send, Msg}, #state{transport = TPid}) ->
 %% Request for graceful shutdown at remove_transport, stop_service of
 %% application shutdown.
 
-transition({shutdown = T, Pid}, S) ->
-    transition({T, Pid, transport}, S);
 transition({shutdown, Pid, Reason}, #state{parent = Pid, dpr = false} = S) ->
     dpr(Reason, S);
 transition({shutdown, Pid, _}, #state{parent = Pid}) ->
@@ -459,7 +460,7 @@ send_CER(#state{state = {'Wait-Conn-Ack', Tmo},
     OH = LCaps#diameter_caps.origin_host,
     req_send_CER(OH, Remote)
         orelse
-        close({already_connected, Remote, LCaps}, S),
+        close({already_connected, Remote, LCaps}),
     CER = build_CER(S),
     ?LOG(send, 'CER'),
     #diameter_packet{header = #diameter_header{end_to_end_id = Eid,
@@ -692,17 +693,17 @@ cea(CEA, RC) ->
     CEA#diameter_base_CEA{'Result-Code' = RC}.
 
 post('CER' = T, RC, Pkt, S) ->
-    [fun close/2, {T, caps(S), {RC, Pkt}}];
+    [fun(_) -> close({T, caps(S), {RC, Pkt}}) end];
 post(_, _, _, _) ->
     ok.
 
 rejected({capabilities_cb, _F, Reason}, T, S) ->
     rejected(Reason, T, S);
 
-rejected(discard, T, S) ->
-    close(T, S);
+rejected(discard, T, _) ->
+    close(T);
 rejected({N, Es}, T, S) ->
-    {answer('CER', N, Es, S), [fun close/2, T]};
+    {answer('CER', N, Es, S), [fun(_) -> close(T) end]};
 rejected(N, T, S) ->
     rejected({N, []}, T, S).
@@ -862,7 +863,7 @@ handle_CEA(#diameter_packet{bin = Bin}
     of
         _ -> open(DPkt, SApps, Caps, {connect, hd([_] = IS)}, S)
     catch
-        ?FAILURE(Reason) -> close({'CEA', Reason, Caps, DPkt}, S)
+        ?FAILURE(Reason) -> close({'CEA', Reason, Caps, DPkt})
     end.
 
 %% Check more than the result code since the peer could send success
 %% regardless. If not 2001 then a peer_up callback could do anything
@@ -882,7 +883,7 @@ recv_CEA(#diameter_packet{header = #diameter_header{version
     T;
 
 recv_CEA(Pkt, S) ->
-    close({'CEA', caps(S), Pkt}, S).
+    close({'CEA', caps(S), Pkt}).
 
 caps(#diameter_service{capabilities = Caps}) ->
     Caps;
@@ -935,14 +936,14 @@ open(Pkt, SupportedApps, Caps, {Type, IS}, #state{parent = Pid,
 
 %% We've advertised TLS support: tell the transport the result
 %% and expect a reply when the handshake is complete.
-tls_ack(true, Caps, Type, IS, #state{transport = TPid} = S) ->
+tls_ack(true, Caps, Type, IS, #state{transport = TPid}) ->
     Ref = make_ref(),
     TPid ! {diameter, {tls, Ref, Type, IS == ?TLS}},
     receive
         {diameter, {tls, Ref}} ->
             ok;
         {'DOWN', _, process, TPid, Reason} ->
-            close({tls_ack, Reason, Caps}, S)
+            close({tls_ack, Reason, Caps})
     end;
 
 %% Or not. Don't send anything to the transport so that transports
@@ -955,25 +956,11 @@ capz(#diameter_caps{} = L, #diameter_caps{} = R) ->
         = list_to_tuple([diameter_caps | lists:zip(tl(tuple_to_list(L)),
                                                    tl(tuple_to_list(R)))]).
 
-%% close/2
+%% close/1
 
-%% Tell the watchdog that our death isn't due to transport failure.
-close(Reason, #state{parent = Pid}) ->
-    close_wd(Reason, Pid),
+close(Reason) ->
     throw({?MODULE, close, Reason}).
 
-%% close_wd/2
-
-%% Ensure the watchdog dies if DPR has been sent ...
-close_wd(_, #state{dpr = false}) ->
-    ok;
-close_wd(Reason, #state{parent = Pid}) ->
-    close_wd(Reason, Pid);
-
-%% ... or otherwise
-close_wd(Reason, Pid) ->
-    Pid ! {close, self(), Reason}.
-
 %% dwa/1
 
 dwa(#diameter_caps{origin_host = OH,
diff --git a/lib/diameter/src/base/diameter_reg.erl b/lib/diameter/src/base/diameter_reg.erl
index 619b12ecad..ac58e4bf5b 100644
--- a/lib/diameter/src/base/diameter_reg.erl
+++ b/lib/diameter/src/base/diameter_reg.erl
@@ -1,7 +1,7 @@
 %%
 %% %CopyrightBegin%
 %%
-%% Copyright Ericsson AB 2010-2012. All Rights Reserved.
+%% Copyright Ericsson AB 2010-2013. All Rights Reserved.
 %%
 %% The contents of this file are subject to the Erlang Public License,
 %% Version 1.1, (the "License"); you may not use this file except in
@@ -208,10 +208,6 @@ init(_) ->
 %% # handle_call/3
 %% ----------------------------------------------------------
 
-handle_call(Req, From, S)
-  when not is_record(S, state) ->
-    handle_call(Req, From, upgrade(S));
-
 handle_call({add, Fun, Key, Pid}, _, S) ->
     B = Fun(?TABLE, {Key, Pid}),
     monitor(B andalso no_monitor(Pid), Pid),
@@ -281,9 +277,6 @@ code_change(_OldVsn, State, _Extra) ->
 %% ===========================================================================
 
-upgrade(S) ->
-    #state{} = list_to_tuple(tuple_to_list(S) ++ [[]]).
-
 monitor(true, Pid) ->
     ets:insert(?TABLE, ?MONITOR(Pid, erlang:monitor(process, Pid)));
 monitor(false, _) ->
diff --git a/lib/diameter/src/base/diameter_service.erl b/lib/diameter/src/base/diameter_service.erl
index d2a416166f..3cab914fdb 100644
--- a/lib/diameter/src/base/diameter_service.erl
+++ b/lib/diameter/src/base/diameter_service.erl
@@ -64,15 +64,7 @@
 -include_lib("diameter/include/diameter.hrl").
 -include("diameter_internal.hrl").
 
-%% The states mirrored by peer_up/peer_down callbacks.
--define(STATE_UP, up).
--define(STATE_DOWN, down).
-
--type op_state() :: ?STATE_UP
-                  | ?STATE_DOWN.
-
-%% The RFC 3539 watchdog states that are now maintained, albeit
-%% along with the old up/down. okay = up, else down.
+%% RFC 3539 watchdog states.
 -define(WD_INITIAL, initial).
 -define(WD_OKAY, okay).
 -define(WD_SUSPECT, suspect).
@@ -122,8 +114,10 @@
         {id = now(),
          service_name, %% as passed to start_service/2, key in ?STATE_TABLE
          service :: #diameter_service{},
-         peerT = ets_new(peers) :: ets:tid(),%% #peer{} at start_fsm
-         connT = ets_new(conns) :: ets:tid(),%% #conn{} at connection_up/reopen
+         watchdogT = ets_new(watchdogs) %% #watchdog{} at start
+                   :: ets:tid(),
+         peerT = ets_new(peers)         %% #peer{pid = TPid} at okay/reopen
+                   :: ets:tid(),
         shared_peers = ?Dict:new(),  %% Alias -> [{TPid, Caps}, ...]
         local_peers = ?Dict:new(),   %% Alias -> [{TPid, Caps}, ...]
         monitor = false :: false | pid(),  %% process to die with
@@ -139,36 +133,26 @@
 %% service record is used to determine whether or not we need to call
 %% the process for a pick_peer callback.
 
-%% Record representing a watchdog process as implemented by
-%% diameter_watchdog. The term "peer" here is historical, made
-%% especially confusing by the fact that a peer_ref() in the
-%% documentation is the key of a #conn{} record, not a #peer{} record.
-%% The name is also unfortunate given the meaning of peer in the
-%% Diameter sense.
--record(peer,
+%% Record representing an RFC 3539 watchdog process implemented by
+%% diameter_watchdog.
+-record(watchdog,
        {pid  :: match(pid()),
         type :: match(connect | accept),
         ref  :: match(reference()),  %% key into diameter_config
         options :: match([diameter:transport_opt()]),%% from start_transport
-        op_state = {?STATE_DOWN, ?WD_INITIAL}
-                :: match(op_state() | {op_state(), wd_state()}),
+        state = ?WD_INITIAL :: match(wd_state()),
         started = now(),      %% at process start
-        conn = false :: match(boolean() | pid())}).
-                     %% true at accepted, pid() at connection_up or reopen
-
-%% Record representing a peer process as implemented by
-%% diameter_peer_fsm. The term "conn" is historical. Despite the name
-%% here, comments refer to watchdog and peer processes, that are keys
-%% in #peer{} and #conn{} records respectively. To add to the
-%% confusion, a #request.transport is a peer process = key in a
-%% #conn{} record. The actual transport process (that the peer process
-%% knows about and that has a transport connection) isn't seen here.
--record(conn,
+        peer = false :: match(boolean() | pid())}).
+                     %% true at accepted, pid() at okay/reopen
+
+%% Record representing an Peer State Machine processes implemented by
+%% diameter_peer_fsm.
+-record(peer,
        {pid  :: pid(),
         apps :: [{0..16#FFFFFFFF, diameter:app_alias()}],  %% {Id, Alias}
         caps :: #diameter_caps{},
         started = now(),      %% at process start
-        peer :: pid()}).      %% key into peerT
+        watchdog :: pid()}).  %% key into watchdogT
 
 %% Record stored in diameter_request for each outgoing request.
 -record(request,
@@ -520,67 +504,39 @@ transition({accepted, Pid, TPid}, S) ->
     accepted(Pid, TPid, S),
     ok;
 
-%% Peer process has a new open connection.
-transition({connection_up, Pid, T}, S) ->
-    connection_up(Pid, T, S),
-    ok;
-
-%% Watchdog has a new connection that will be opened after DW[RA]
-%% exchange. This message was added long after connection_up, to
-%% communicate the information as soon as it's available. Leave
-%% connection_up as is it for now, duplicated information and all.
-transition({reopen, Pid, T}, S) ->
-    reopen(Pid, T, S),
-    ok;
-
-%% Watchdog has left state OKAY.
-transition({connection_down, Pid}, S) ->
-    connection_down(Pid, S),
-    ok;
-
-%% Watchdog has returned to state OKAY.
-transition({connection_up, Pid}, S) ->
-    connection_up(Pid, S),
-    ok;
-
-%% Accepting transport has lost connectivity.
-transition({close, Pid}, S) ->
-    close(Pid, S),
-    ok;
-
 %% Connecting transport is being restarted by watchdog.
 transition({reconnect, Pid}, S) ->
     reconnect(Pid, S),
     ok;
 
-%% Watchdog is sending notification of a state transition. Note that
-%% the connection_up/down messages pre-date this message and are still
-%% used. A watchdog message will follow these and communicate the same
-%% state as was set in handling connection_up/down.
-transition({watchdog, Pid, {TPid, From, To}}, #state{service_name = SvcName,
-                                                     peerT = PeerT}) ->
-    #peer{ref = Ref, type = T, options = Opts, op_state = {OS,_}}
-        = P
-        = fetch(PeerT, Pid),
-    insert(PeerT, P#peer{op_state = {OS, To}}),
+%% Watchdog is sending notification of transport death.
+transition({close, Pid, Reason}, #state{service_name = SvcName,
+                                        watchdogT = WatchdogT}) ->
+    #watchdog{state = WS,
+              ref = Ref,
+              type = Type,
+              options = Opts}
+        = fetch(WatchdogT, Pid),
+    WS /= ?WD_OKAY
+        andalso
+        send_event(SvcName, {closed, Ref, Reason, {type(Type), Opts}}),
+    ok;
+
+%% Watchdog is sending notification of a state transition.
+transition({watchdog, Pid, {[TPid | Data], From, To}},
+           #state{service_name = SvcName,
+                  watchdogT = WatchdogT}
+           = S) ->
+    #watchdog{ref = Ref, type = T, options = Opts}
+        = Wd
+        = fetch(WatchdogT, Pid),
+    watchdog(TPid, Data, From, To, Wd, S),
     send_event(SvcName, {watchdog, Ref, TPid, {From, To}, {T, Opts}}),
     ok;
 
-%% Death of a watchdog process (#peer.pid) results in the removal of
-%% it's peer and any associated conn record when 'DOWN' is received
-%% (after this) but the states will be {?STATE_UP, ?WD_DOWN} for a
-%% short time. (No real problem since ?WD_* is only used in
-%% service_info.) We set ?WD_OKAY as a consequence of connection_up
-%% since we know a watchdog is coming. We can't set anything at
-%% connection_down since we don't know if the subsequent watchdog
-%% message will be ?WD_DOWN or ?WD_SUSPECT. We don't (yet) set
We don't (yet) set -%% ?STATE_* as a consequence of a watchdog message since this requires -%% changing some of the matching on ?STATE_*. -%% -%% Death of a peer process process (#conn.pid, #peer.conn) results in -%% connection_down followed by watchdog ?WD_DOWN. The latter doesn't -%% result in the conn record being deleted since 'DOWN' from death of -%% its watchdog doesn't (yet) deal with the record having been -%% removed. +%% Death of a watchdog process (#watchdog.pid) results in the removal of +%% it's peer and any associated conn record when 'DOWN' is received. +%% Death of a peer process process (#peer.pid, #watchdog.peer) results in +%% ?WD_DOWN. %% Monitor process has died. Just die with a reason that tells %% diameter_config about the happening. If a cleaner shutdown is @@ -589,9 +545,9 @@ transition({'DOWN', MRef, process, _, Reason}, #state{monitor = MRef}) -> {stop, {monitor, Reason}}; %% Local watchdog process has died. -transition({'DOWN', _, process, Pid, Reason}, S) +transition({'DOWN', _, process, Pid, _Reason}, S) when node(Pid) == node() -> - peer_down(Pid, Reason, S), + watchdog_down(Pid, S), ok; %% Remote service wants to know about shared peers. @@ -716,49 +672,30 @@ mod_state(Alias, ModS) -> %%% # shutdown/2 %%% --------------------------------------------------------------------------- -%% remove_transport: ask watchdogs to terminate their transport. -shutdown(Refs, #state{peerT = PeerT}) +%% remove_transport +shutdown(Refs, #state{watchdogT = WatchdogT}) when is_list(Refs) -> - ets:foldl(fun(P,ok) -> sp(P, Refs), ok end, ok, PeerT); - -%% application/service shutdown: ask transports to terminate themselves. -shutdown(Reason, #state{peerT = PeerT}) -> - %% A transport might not be alive to receive the shutdown request - %% but give those that are a chance to shutdown gracefully. - shutdown(conn, Reason, PeerT), - %% Kill the watchdogs explicitly in case there was no transport. - shutdown(peer, Reason, PeerT). - -%% sp/2 - -sp(#peer{ref = Ref, pid = Pid}, Refs) -> - lists:member(Ref, Refs) - andalso (Pid ! {shutdown, self()}). %% 'DOWN' cleans up + ets:foldl(fun(P,ok) -> st(P, Refs), ok end, ok, WatchdogT); -%% shutdown/3 - -shutdown(Who, Reason, T) -> - diameter_lib:wait(ets:foldl(fun(X,A) -> shutdown(Who, X, Reason, A) end, +%% application/service shutdown +shutdown(Reason, #state{watchdogT = WatchdogT}) + when Reason == application; + Reason == service -> + diameter_lib:wait(ets:foldl(fun(P,A) -> st(P, Reason, A) end, [], - T)). + WatchdogT)). -shutdown(conn = Who, #peer{op_state = {OS,_}} = P, Reason, Acc) -> - shutdown(Who, P#peer{op_state = OS}, Reason, Acc); +%% st/2 -shutdown(conn, - #peer{pid = Pid, op_state = ?STATE_UP, conn = TPid}, - Reason, - Acc) -> - TPid ! {shutdown, Pid, Reason}, - [TPid | Acc]; +st(#watchdog{ref = Ref, pid = Pid}, Refs) -> + lists:member(Ref, Refs) + andalso (Pid ! {shutdown, self(), transport}). %% 'DOWN' cleans up -shutdown(peer, #peer{pid = Pid}, _Reason, Acc) - when is_pid(Pid) -> - exit(Pid, shutdown), - [Pid | Acc]; +%% st/3 -shutdown(_, #peer{}, _, Acc) -> - Acc. +st(#watchdog{pid = Pid}, Reason, Acc) -> + Pid ! {shutdown, self(), Reason}, + [Pid | Acc]. 
 %%% ---------------------------------------------------------------------------
 %%% # call_service/2
@@ -787,8 +724,6 @@ cs(undefined, _) ->
 %%% ---------------------------------------------------------------------------
 %%% # i/1
-%%%
-%%% Output: #state{}
 %%% ---------------------------------------------------------------------------
 
 %% Intialize the state of a service gen_server.
@@ -891,22 +826,22 @@ type(connect = T) -> T.
 
 %% start/4
 
-start(Ref, Type, Opts, #state{peerT = PeerT,
-                              connT = ConnT,
+start(Ref, Type, Opts, #state{watchdogT = WatchdogT,
+                              peerT = PeerT,
                               options = SvcOpts,
                               service_name = SvcName,
                               service = Svc})
  when Type == connect;
       Type == accept ->
-    Pid = s(Type, Ref, {ConnT,
+    Pid = s(Type, Ref, {PeerT,
                         Opts,
                         SvcName,
                         SvcOpts,
                         merge_service(Opts, Svc)}),
-    insert(PeerT, #peer{pid = Pid,
-                        type = Type,
-                        ref = Ref,
-                        options = Opts}),
+    insert(WatchdogT, #watchdog{pid = Pid,
+                                type = Type,
+                                ref = Ref,
+                                options = Opts}),
     Pid.
 
 %% Note that the service record passed into the watchdog is the merged
@@ -953,23 +888,50 @@ ms(_, Svc) ->
 %%% # accepted/3
 %%% ---------------------------------------------------------------------------
 
-accepted(Pid, _TPid, #state{peerT = PeerT} = S) ->
-    #peer{ref = Ref, type = accept = T, conn = false, options = Opts}
-        = P
-        = fetch(PeerT, Pid),
-    insert(PeerT, P#peer{conn = true}),  %% mark replacement as started
-    start(Ref, T, Opts, S).              %% start new watchdog
+accepted(Pid, _TPid, #state{watchdogT = WatchdogT} = S) ->
+    #watchdog{ref = Ref, type = accept = T, peer = false, options = Opts}
+        = Wd
+        = fetch(WatchdogT, Pid),
+    insert(WatchdogT, Wd#watchdog{peer = true}),%% mark replacement as started
+    start(Ref, T, Opts, S).                     %% start new watchdog
 
 fetch(Tid, Key) ->
     [T] = ets:lookup(Tid, Key),
-    case T of
-        #peer{op_state = ?STATE_UP} = P ->
-            P#peer{op_state = {?STATE_UP, ?WD_OKAY}};
-        #peer{op_state = ?STATE_DOWN} = P ->
-            P#peer{op_state = {?STATE_DOWN, ?WD_DOWN}};
-        _ ->
-            T
-    end.
+    T.
+
+%%% ---------------------------------------------------------------------------
+%%% # watchdog/6
+%%%
+%%% React to a watchdog state transition.
+%%% ---------------------------------------------------------------------------
+
+%% Watchdog has a new open connection.
+watchdog(TPid, [T], _, ?WD_OKAY, Wd, State) ->
+    connection_up({TPid, T}, Wd, State);
+
+%% Watchdog has a new connection that will be opened after DW[RA]
+%% exchange.
+watchdog(TPid, [T], _, ?WD_REOPEN, Wd, State) ->
+    reopen({TPid, T}, Wd, State);
+
+%% Watchdog has recovered a suspect connection.
+watchdog(TPid, [], ?WD_SUSPECT, ?WD_OKAY, Wd, State) ->
+    #watchdog{peer = TPid} = Wd,  %% assert
+    connection_up(Wd, State);
+
+%% Watchdog has an unresponsive connection.
+watchdog(TPid, [], ?WD_OKAY, ?WD_SUSPECT = To, Wd, State) ->
+    #watchdog{peer = TPid} = Wd,  %% assert
+    connection_down(Wd, To, State);
+
+%% Watchdog has lost its connection.
+watchdog(TPid, [], _, ?WD_DOWN = To, Wd, #state{peerT = PeerT} = S) ->
+    close(Wd, S),
+    connection_down(Wd, To, S),
+    ets:delete(PeerT, TPid);
+
+watchdog(_, [], _, _, _, _) ->
+    ok.
 
 %%% ---------------------------------------------------------------------------
 %%% # connection_up/3
@@ -977,72 +939,61 @@ fetch(Tid, Key) ->
 
 %% Watchdog process has reached state OKAY.
 
-connection_up(Pid, {TPid, {Caps, SApps, Pkt}}, #state{peerT = PeerT,
-                                                      connT = ConnT}
-                                               = S) ->
-    P = fetch(PeerT, Pid),
-    C = #conn{pid = TPid,
-              apps = SApps,
-              caps = Caps,
-              peer = Pid},
-
-    insert(ConnT, C),
-    connection_up([Pkt], P#peer{conn = TPid}, C, S).
+connection_up({TPid, {Caps, SApps, Pkt}},
+              #watchdog{pid = Pid}
+              = Wd,
+              #state{peerT = PeerT}
+              = S) ->
+    Pr = #peer{pid = TPid,
+               apps = SApps,
+               caps = Caps,
+               watchdog = Pid},
+    insert(PeerT, Pr),
+    connection_up([Pkt], Wd#watchdog{peer = TPid}, Pr, S).
 
 %%% ---------------------------------------------------------------------------
 %%% # reopen/3
 %%% ---------------------------------------------------------------------------
 
-%% Note that this connection_up/3 rewrites the same #conn{} now
-%% written here. Both do so in case reopen has not happened in old
-%% code.
-
-reopen(Pid, {TPid, {Caps, SApps, _Pkt}}, #state{peerT = PeerT,
-                                                connT = ConnT}) ->
-    P = fetch(PeerT, Pid),
-    C = #conn{pid = TPid,
-              apps = SApps,
-              caps = Caps,
-              peer = Pid},
-
-    insert(ConnT, C),
-    #peer{op_state = {?STATE_DOWN, _}}
-        = P,
-    insert(PeerT, P#peer{op_state = {?STATE_DOWN, ?WD_REOPEN},
-                         conn = TPid}).
+reopen({TPid, {Caps, SApps, _Pkt}},
+       #watchdog{pid = Pid}
+       = Wd,
+       #state{watchdogT = WatchdogT,
+              peerT = PeerT}) ->
+    insert(PeerT, #peer{pid = TPid,
+                        apps = SApps,
+                        caps = Caps,
+                        watchdog = Pid}),
+    insert(WatchdogT, Wd#watchdog{state = ?WD_REOPEN,
+                                  peer = TPid}).
 
 %%% ---------------------------------------------------------------------------
 %%% # connection_up/2
 %%% ---------------------------------------------------------------------------
 
-%% Peer process has transitioned back into the open state. Note that there
-%% has been no new capabilties exchange in this case.
+%% Watchdog has recovered as suspect connection. Note that there has
+%% been no new capabilties exchange in this case.
 
-connection_up(Pid, #state{peerT = PeerT,
-                          connT = ConnT}
-                   = S) ->
-    #peer{conn = TPid} = P = fetch(PeerT, Pid),
-    C = fetch(ConnT, TPid),
-    connection_up([], P, C, S).
+connection_up(#watchdog{peer = TPid} = Wd, #state{peerT = PeerT} = S) ->
+    connection_up([], Wd, fetch(PeerT, TPid), S).
 
 %% connection_up/4
 
-connection_up(T, P, C, #state{peerT = PeerT,
-                              local_peers = LDict,
-                              service_name = SvcName,
-                              service
-                              = #diameter_service{applications = Apps}}
-                       = S) ->
-    #peer{conn = TPid, op_state = {?STATE_DOWN, _}}
-        = P,
-    #conn{apps = SApps, caps = Caps}
-        = C,
-
-    insert(PeerT, P#peer{op_state = {?STATE_UP, ?WD_OKAY}}),
-
+connection_up(Extra,
+              #watchdog{peer = TPid}
+              = Wd,
+              #peer{apps = SApps, caps = Caps}
+              = Pr,
+              #state{watchdogT = WatchdogT,
+                     local_peers = LDict,
+                     service_name = SvcName,
+                     service
+                     = #diameter_service{applications = Apps}}
+              = S) ->
+    insert(WatchdogT, Wd#watchdog{state = ?WD_OKAY}),
     request_peer_up(TPid),
     insert_local_peer(SApps, {{TPid, Caps}, {SvcName, Apps}}, LDict),
-    report_status(up, P, C, S, T).
+    report_status(up, Wd, Pr, S, Extra).
 
 insert_local_peer(SApps, T, LDict) ->
     lists:foldl(fun(A,D) -> ilp(A, T, D) end, LDict, SApps).
@@ -1078,41 +1029,38 @@ peer_cb(MFA, Alias) ->
     end.
 
 %%% ---------------------------------------------------------------------------
-%%% # connection_down/2
+%%% # connection_down/3
 %%% ---------------------------------------------------------------------------
 
-%% Watchdog has transitioned out of state OKAY.
-
-connection_down(Pid, #state{peerT = PeerT,
-                            connT = ConnT}
-                     = S) ->
-    #peer{op_state = {?STATE_UP, WS},  %% assert
-          conn = TPid}
-        = P
-        = fetch(PeerT, Pid),
-
-    C = fetch(ConnT, TPid),
-    insert(PeerT, P#peer{op_state = {?STATE_DOWN, WS}}),
-    connection_down(P,C,S).
-
-%% connection_down/3
-
-connection_down(#peer{op_state = {?STATE_DOWN, _}}, _, _) ->
-    ok;
-
-connection_down(#peer{conn = TPid,
-                      op_state = {?STATE_UP, _}}
-                = P,
-                #conn{caps = Caps,
+connection_down(#watchdog{state = ?WD_OKAY,
+                          peer = TPid}
+                = Wd,
+                #peer{caps = Caps,
                       apps = SApps}
-                = C,
+                = Pr,
                 #state{service_name = SvcName,
                        service = #diameter_service{applications = Apps},
                        local_peers = LDict}
                 = S) ->
-    report_status(down, P, C, S, []),
+    report_status(down, Wd, Pr, S, []),
     remove_local_peer(SApps, {{TPid, Caps}, {SvcName, Apps}}, LDict),
-    request_peer_down(TPid, S).
+    request_peer_down(TPid, S);
+
+connection_down(#watchdog{}, #peer{}, _) ->
+    ok;
+
+connection_down(#watchdog{state = WS,
+                          peer = TPid}
+                = Wd,
+                To,
+                #state{watchdogT = WatchdogT,
+                       peerT = PeerT}
+                = S)
+  when is_atom(To) ->
+    insert(WatchdogT, Wd#watchdog{state = To}),
+    ?WD_OKAY == WS
+        andalso
+        connection_down(Wd, fetch(PeerT, TPid), S).
 
 remove_local_peer(SApps, T, LDict) ->
     lists:foldl(fun(A,D) -> rlp(A, T, D) end, LDict, SApps).
@@ -1130,64 +1078,51 @@ down_conn(Id, Alias, TC, {SvcName, Apps}) ->
     peer_cb({ModX, peer_down, [SvcName, TC]}, Alias).
 
 %%% ---------------------------------------------------------------------------
-%%% # peer_down/3
+%%% # watchdog_down/2
 %%% ---------------------------------------------------------------------------
 
 %% Watchdog process has died.
 
-peer_down(Pid, Reason, #state{peerT = PeerT} = S) ->
-    P = fetch(PeerT, Pid),
-    ets:delete_object(PeerT, P),
-    closed(Reason, P, S),
-    restart(P,S),
-    peer_down(P,S).
-
-%% Send an event at connection establishment failure.
-closed({shutdown, {close, _TPid, Reason}},
-       #peer{op_state = {?STATE_DOWN, _},
-             ref = Ref,
-             type = Type,
-             options = Opts},
-       #state{service_name = SvcName}) ->
-    send_event(SvcName, {closed, Ref, Reason, {type(Type), Opts}});
-closed(_, _, _) ->
-    ok.
+watchdog_down(Pid, #state{watchdogT = WatchdogT} = S) ->
+    Wd = fetch(WatchdogT, Pid),
+    ets:delete_object(WatchdogT, Wd),
+    restart(Wd,S),
+    wd_down(Wd,S).
 
-%% The watchdog has never reached OKAY ...
-peer_down(#peer{conn = B}, _)
+%% Watchdog has never reached OKAY ...
+wd_down(#watchdog{peer = B}, _)
   when is_boolean(B) ->
     ok;
 
 %% ... or maybe it has.
-peer_down(#peer{conn = TPid} = P, #state{connT = ConnT} = S) ->
-    #conn{} = C = fetch(ConnT, TPid),
-    ets:delete_object(ConnT, C),
-    connection_down(P,C,S).
+wd_down(#watchdog{peer = TPid} = Wd, #state{peerT = PeerT} = S) ->
+    connection_down(Wd, ?WD_DOWN, S),
+    ets:delete(PeerT, TPid).
 
 %% restart/2
 
-restart(P,S) ->
-    q_restart(restart(P), S).
+restart(Wd, S) ->
+    q_restart(restart(Wd), S).
 
 %% restart/1
 
 %% Always try to reconnect.
-restart(#peer{ref = Ref,
-              type = connect = T,
-              options = Opts,
-              started = Time}) ->
+restart(#watchdog{ref = Ref,
+                  type = connect = T,
+                  options = Opts,
+                  started = Time}) ->
     {Time, {Ref, T, Opts}};
 
 %% Transport connection hasn't yet been accepted ...
-restart(#peer{ref = Ref,
-              type = accept = T,
-              options = Opts,
-              conn = false,
-              started = Time}) ->
+restart(#watchdog{ref = Ref,
+                  type = accept = T,
+                  options = Opts,
+                  peer = false,
+                  started = Time}) ->
     {Time, {Ref, T, Opts}};
 
 %% ... or it has: a replacement has already been spawned.
-restart(#peer{type = accept}) ->
+restart(#watchdog{type = accept}) ->
     false.
 
 %% q_restart/2
@@ -1248,14 +1183,13 @@ tc(false = No, _, _) ->  %% removed
 %% the accepting watchdog upon reception of a CER from the previously
 %% connected peer, or us after reconnect_timer timeout.
-close(Pid, #state{service_name = SvcName,
-                  peerT = PeerT}) ->
-    #peer{pid = Pid,
-          type = accept,
-          ref = Ref,
-          options = Opts}
-        = fetch(PeerT, Pid),
-
+close(#watchdog{type = connect}, _) ->
+    ok;
+close(#watchdog{type = accept,
+                pid = Pid,
+                ref = Ref,
+                options = Opts},
+      #state{service_name = SvcName}) ->
     c(Pid, diameter_config:have_transport(SvcName, Ref), Opts).
 
 %% Tell watchdog to (maybe) die later ...
@@ -1278,11 +1212,11 @@ c(Pid, false, _Opts) ->
 %%% ---------------------------------------------------------------------------
 
 reconnect(Pid, #state{service_name = SvcName,
-                      peerT = PeerT}) ->
-    #peer{ref = Ref,
-          type = connect,
-          options = Opts}
-        = fetch(PeerT, Pid),
+                      watchdogT = WatchdogT}) ->
+    #watchdog{ref = Ref,
+              type = connect,
+              options = Opts}
+        = fetch(WatchdogT, Pid),
     send_event(SvcName, {reconnect, Ref, Opts}).
 
 %%% ---------------------------------------------------------------------------
@@ -1742,6 +1676,8 @@ have_request(Pkt, TPid) ->
 
 %% request_peer_up/1
 
+%% Insert an element that is used to detect whether or not there has
+%% been a failover when inserting an outgoing request.
 request_peer_up(TPid) ->
     ets:insert(?REQUEST_TABLE, {TPid}).
 
@@ -1760,8 +1696,8 @@ request_peer_down(TPid, S) ->
 %%% recv_request/3
 %%% ---------------------------------------------------------------------------
 
-recv_request(TPid, Pkt, {ConnT, SvcName, Apps, Mask}) ->
-    try ets:lookup(ConnT, TPid) of
+recv_request(TPid, Pkt, {PeerT, SvcName, Apps, Mask}) ->
+    try ets:lookup(PeerT, TPid) of
         [C] ->
             recv_request(C, TPid, Pkt, SvcName, Apps, Mask);
         [] ->  %% transport has gone down
@@ -1773,7 +1709,7 @@ recv_request(TPid, Pkt, {ConnT, SvcName, Apps, Mask}) ->
 
 %% recv_request/5
 
-recv_request(#conn{apps = SApps, caps = Caps},
+recv_request(#peer{apps = SApps, caps = Caps},
              TPid,
             Pkt,
             SvcName,
@@ -2593,11 +2529,11 @@ rt(#request{packet = #diameter_packet{msg = Msg},
 %%% ---------------------------------------------------------------------------
 
 report_status(Status,
-              #peer{ref = Ref,
-                    conn = TPid,
-                    type = Type,
-                    options = Opts},
-              #conn{apps = [_|_] = As,
+              #watchdog{ref = Ref,
+                        peer = TPid,
+                        type = Type,
+                        options = Opts},
+              #peer{apps = [_|_] = As,
                     caps = Caps},
               #state{service_name = SvcName}
               = S,
@@ -2676,7 +2612,7 @@ rpd(Pid, Alias, PDict) ->
 %%% ---------------------------------------------------------------------------
 %%% find_transport/[34]
 %%%
-%%% Output: {TransportPid, #diameter_caps{}, #diameter_app{}}
+%%% Return: {TransportPid, #diameter_caps{}, #diameter_app{}}
 %%%         | false
 %%%         | {error, Reason}
 %%% ---------------------------------------------------------------------------
@@ -2814,7 +2750,7 @@ avp_decode(_, _, #diameter_avp{value = V}) ->
 %%% ---------------------------------------------------------------------------
 %%% # pick_peer(App, [DestRealm, DestHost], Filter, #state{})
 %%%
-%%% Output: {TransportPid, #diameter_caps{}, App}
+%%% Return: {TransportPid, #diameter_caps{}, App}
 %%%         | false
 %%%         | {error, Reason}
 %%% ---------------------------------------------------------------------------
@@ -2966,8 +2902,8 @@ eq(Any, Id, PeerId) ->
 
 %% transports/1
 
-transports(#state{peerT = PeerT}) ->
-    ets:select(PeerT, [{#peer{conn = '$1', _ = '_'},
+transports(#state{watchdogT = WatchdogT}) ->
+    ets:select(WatchdogT, [{#watchdog{peer = '$1', _ = '_'},
                         [{'is_pid', '$1'}],
                         ['$1']}]).
@@ -3021,11 +2957,12 @@ tagged_info(Item, S)
             undefined
     end;
 
-tagged_info(TPid, #state{peerT = PT, connT = CT})
+tagged_info(TPid, #state{watchdogT = WatchdogT, peerT = PeerT})
   when is_pid(TPid) ->
     try
-        [#conn{peer = Pid}] = ets:lookup(CT, TPid),
-        [#peer{ref = Ref, type = Type, options = Opts}] = ets:lookup(PT, Pid),
+        [#peer{watchdog = Pid}] = ets:lookup(PeerT, TPid),
+        [#watchdog{ref = Ref, type = Type, options = Opts}]
+            = ets:lookup(WatchdogT, Pid),
         [{ref, Ref},
          {type, Type},
          {options, Opts}]
@@ -3108,11 +3045,11 @@ complete(Pre) ->
 
 %% info_stats/1
 
-info_stats(#state{peerT = PeerT}) ->
-    MatchSpec = [{#peer{ref = '$1', conn = '$2', _ = '_'},
+info_stats(#state{watchdogT = WatchdogT}) ->
+    MatchSpec = [{#watchdog{ref = '$1', peer = '$2', _ = '_'},
                   [{'is_pid', '$2'}],
                   [['$1', '$2']]}],
-    try ets:select(PeerT, MatchSpec) of
+    try ets:select(WatchdogT, MatchSpec) of
         L ->
             diameter_stats:read(lists:append(L))
     catch
@@ -3122,7 +3059,8 @@ info_stats(#state{peerT = PeerT}) ->
 %% info_transport/1
 %%
 %% One entry per configured transport. Statistics for each entry are
-%% the accumulated values for the ref and associated peer pids.
+%% the accumulated values for the ref and associated watchdog/peer
+%% pids.
 
 info_transport(S) ->
     PeerD = peer_dict(S, config_dict(S)),
@@ -3155,43 +3093,42 @@ transport([[_,_] | L]) ->
 
 %% Possibly many peer entries for a listening transport. Note that all
 %% have the same options by construction, which is not terribly space
-%% efficient. (TODO: all entries for the same Ref should share options.)
+%% efficient.
 
 transport([[{type, accept}, {options, Opts} | _] | _] = Ls) ->
     [{type, listen},
      {options, Opts},
     {accept, [lists:nthtail(2,L) || L <- Ls]}].
 
-peer_dict(#state{peerT = PeerT, connT = ConnT}, Dict0) ->
-    try ets:tab2list(PeerT) of
+peer_dict(#state{watchdogT = WatchdogT, peerT = PeerT}, Dict0) ->
+    try ets:tab2list(WatchdogT) of
         L ->
-            lists:foldl(fun(T,A) -> peer_acc(ConnT, A, T) end, Dict0, L)
+            lists:foldl(fun(T,A) -> peer_acc(PeerT, A, T) end, Dict0, L)
     catch
         error: badarg -> Dict0  %% service has gone down
     end.
 
-peer_acc(ConnT, Acc, #peer{pid = Pid,
-                           type = Type,
-                           ref = Ref,
-                           options = Opts,
-                           op_state = OS,
-                           started = T,
-                           conn = TPid}) ->
-    WS = wd_state(OS),
+peer_acc(PeerT, Acc, #watchdog{pid = Pid,
+                               type = Type,
+                               ref = Ref,
+                               options = Opts,
+                               state = WS,
+                               started = At,
+                               peer = TPid}) ->
     dict:append(Ref,
                 [{type, Type},
                  {options, Opts},
-                 {watchdog, {Pid, T, WS}}
-                 | info_conn(ConnT, TPid, WS /= ?WD_DOWN)],
+                 {watchdog, {Pid, At, WS}}
+                 | info_peer(PeerT, TPid, WS)],
                 Acc).
 
-info_conn(ConnT, TPid, true)
-  when is_pid(TPid) ->
-    try ets:lookup(ConnT, TPid) of
-        T -> info_conn(T)
+info_peer(PeerT, TPid, WS)
+  when is_pid(TPid), WS /= ?WD_DOWN ->
+    try ets:lookup(PeerT, TPid) of
+        T -> info_peer(T)
     catch
         error: badarg -> []  %% service has gone down
     end;
-info_conn(_, _, _) ->
+info_peer(_, _, _) ->
     [].
 
 %% The point of extracting the config here is so that 'transport' info
@@ -3210,19 +3147,12 @@ config_acc({Ref, T, Opts}, Dict)
 config_acc(_, Dict) ->
     Dict.
 
-wd_state({_,S}) ->
-    S;
-wd_state(?STATE_UP) ->
-    ?WD_OKAY;
-wd_state(?STATE_DOWN) ->
-    ?WD_DOWN.
-
-info_conn([#conn{pid = Pid, apps = SApps, caps = Caps, started = T}]) ->
+info_peer([#peer{pid = Pid, apps = SApps, caps = Caps, started = T}]) ->
     [{peer, {Pid, T}},
      {apps, SApps},
     {caps, info_caps(Caps)}
     | try [{port, info_port(Pid)}] catch _:_ -> [] end];
-info_conn([] = No) ->
+info_peer([] = No) ->
     No.
 %% Extract information that the processes involved are expected to
diff --git a/lib/diameter/src/base/diameter_watchdog.erl b/lib/diameter/src/base/diameter_watchdog.erl
index a5429c967c..10ab246b88 100644
--- a/lib/diameter/src/base/diameter_watchdog.erl
+++ b/lib/diameter/src/base/diameter_watchdog.erl
@@ -62,11 +62,13 @@
           restrict :: {diameter:restriction(), boolean()},
           shutdown = false :: boolean()}).
 
+%% ---------------------------------------------------------------------------
 %% start/2
 %%
 %% Start a monitor before the watchdog is allowed to proceed to ensure
 %% that a failed capabilities exchange produces the desired exit
 %% reason.
+%% ---------------------------------------------------------------------------
 
 -spec start(Type, {RecvData, [Opt], SvcName, SvcOpts, #diameter_service{}})
    -> {reference(), pid()}
  when Type :: {connect|accept, diameter:transport_ref()},
       SvcName :: diameter:service_name().
 
 start({_,_} = Type, T) ->
-    Ref = make_ref(),
-    {ok, Pid} = diameter_watchdog_sup:start_child({Ref, {Type, self(), T}}),
+    Ack = make_ref(),
+    {ok, Pid} = diameter_watchdog_sup:start_child({Ack, Type, self(), T}),
     try
         {erlang:monitor(process, Pid), Pid}
     after
-        Pid ! Ref
+        send(Pid, Ack)
     end.
 
 start_link(T) ->
@@ -101,22 +103,15 @@ init(T) ->
     proc_lib:init_ack({ok, self()}),
     gen_server:enter_loop(?MODULE, [], i(T)).
 
-i({Ref, {_, Pid, _} = T}) ->
-    MRef = erlang:monitor(process, Pid),
-    receive
-        Ref ->
-            make_state(T);
-        {'DOWN', MRef, process, _, _} = D ->
-            exit({shutdown, D})
-    end.
-
-make_state({T, Pid, {RecvData,
-                     Opts,
-                     SvcName,
-                     SvcOpts,
-                     #diameter_service{applications = Apps,
-                                       capabilities = Caps}
-                     = Svc}}) ->
+i({Ack, T, Pid, {RecvData,
+                 Opts,
+                 SvcName,
+                 SvcOpts,
+                 #diameter_service{applications = Apps,
+                                   capabilities = Caps}
+                 = Svc}}) ->
+    erlang:monitor(process, Pid),
+    wait(Ack, Pid),
     random:seed(now()),
     putr(restart, {T, Opts, Svc}),  %% save seeing it in trace
     putr(dwr, dwr(Caps)),           %%
@@ -124,9 +119,7 @@ make_state({T, Pid, {RecvData,
     Restrict = proplists:get_value(restrict_connections, SvcOpts),
     Nodes = restrict_nodes(Restrict),
     #watchdog{parent = Pid,
-              transport = monitor(diameter_peer_fsm:start(T,
-                                                          Opts,
-                                                          {Mask, Nodes, Svc})),
+              transport = start(T, Opts, Mask, Nodes, Svc),
               tw = proplists:get_value(watchdog_timer,
                                        Opts,
                                        ?DEFAULT_TW_INIT),
@@ -134,6 +127,21 @@ make_state({T, Pid, {RecvData,
               sequence = Mask,
               restrict = {Restrict, lists:member(node(), Nodes)}}.
 
+wait(Ref, Pid) ->
+    receive
+        Ref ->
+            ok;
+        {'DOWN', _, process, Pid, _} = D ->
+            exit({shutdown, D})
+    end.
+
+%% start/5
+
+start(T, Opts, Mask, Nodes, Svc) ->
+    {_MRef, Pid}
+        = diameter_peer_fsm:start(T, Opts, {Mask, Nodes, Svc}),
+    Pid.
+
 %% handle_call/3
 
 handle_call(_, _, State) ->
@@ -151,41 +159,59 @@ handle_info(T, #watchdog{} = State) ->
         ok ->
             {noreply, State};
         #watchdog{} = S ->
-            event(State, S),
+            close(T, State),     %% service expects 'close' message
+            event(T, State, S),  %%   before 'watchdog'
            {noreply, S};
         stop ->
            ?LOG(stop, T),
-            event(State, State#watchdog{status = down}),
+            event(T, State, State#watchdog{status = down}),
            {stop, {shutdown, T}, State}
-    end;
+    end.
 
-handle_info(T, S) ->
-    handle_info(T, upgrade(S)).
+close({'DOWN', _, process, TPid, {shutdown, Reason}},
+      #watchdog{transport = TPid,
+                parent = Pid}) ->
+    send(Pid, {close, self(), Reason});
 
-upgrade(S) ->
-    #watchdog{} = list_to_tuple(tuple_to_list(S)
-                                ++ [?NOMASK, {nodes, true}, false]).
+close(_, _) ->
+    ok.
-event(#watchdog{status = T}, #watchdog{status = T}) ->
+event(_, #watchdog{status = T}, #watchdog{status = T}) ->
     ok;
 
-event(#watchdog{transport = undefined}, #watchdog{transport = undefined}) ->
+event(_, #watchdog{transport = undefined}, #watchdog{transport = undefined}) ->
     ok;
 
-event(#watchdog{status = From, transport = F, parent = Pid},
+event(Msg,
+      #watchdog{status = From, transport = F, parent = Pid},
       #watchdog{status = To, transport = T}) ->
-    E = {tpid(F,T), From, To},
-    notify(Pid, E),
+    TPid = tpid(F,T),
+    E = {[TPid | data(Msg, TPid, From, To)], From, To},
+    send(Pid, {watchdog, self(), E}),
     ?LOG(transition, {self(), E}).
 
+data(Msg, TPid, reopen, okay) ->
+    {recv, TPid, 'DWA', _Pkt} = Msg,  %% assert
+    {TPid, T} = eraser(open),
+    [T];
+
+data({open, TPid, _Hosts, T}, TPid, _From, To)
+  when To == okay;
+       To == reopen ->
+    [T];
+
+data(_, _, _, _) ->
+    [].
+
 tpid(_, Pid)
   when is_pid(Pid) ->
     Pid;
+
 tpid(Pid, _) ->
     Pid.
 
-notify(Pid, E) ->
-    Pid ! {watchdog, self(), E}.
+send(Pid, T) ->
+    Pid ! T.
 
 %% terminate/2
@@ -215,15 +241,13 @@ transition(close, #watchdog{}) ->
     ok;
 
 %% Service is asking for the peer to be taken down gracefully.
-transition({shutdown, Pid}, #watchdog{parent = Pid,
-                                      transport = undefined,
-                                      status = S}) ->
-    down = S,  %% sanity check
+transition({shutdown, Pid, _}, #watchdog{parent = Pid,
+                                         transport = undefined}) ->
     stop;
-transition({shutdown = T, Pid}, #watchdog{parent = Pid,
-                                          transport = TPid}
-                                = S) ->
-    TPid ! {T, self()},
+transition({shutdown = T, Pid, Reason}, #watchdog{parent = Pid,
+                                                  transport = TPid}
+                                        = S) ->
+    send(TPid, {T, self(), Reason}),
     S#watchdog{shutdown = true};
 
 %% Parent process has died,
@@ -234,13 +258,9 @@ transition({'DOWN', _, process, Pid, _Reason},
 %% Transport has accepted a connection.
 transition({accepted = T, TPid}, #watchdog{transport = TPid,
                                            parent = Pid}) ->
-    Pid ! {T, self(), TPid},
+    send(Pid, {T, self(), TPid}),
     ok;
 
-%% Transport is telling us that its impending death isn't failure.
-transition({close, TPid, _Reason}, #watchdog{transport = TPid}) ->
-    stop;
-
 %% STATE         Event                Actions              New State
 %% =====         ------               -------              ----------
 %% INITIAL       Connection up        SetWatchdog()        OKAY
@@ -255,15 +275,13 @@ transition({close, TPid, _Reason}, #watchdog{transport = TPid}) ->
 %% know the identity of the peer (ie. now) that we know that we're in
 %% state down rather than initial.
 
-transition({open, TPid, Hosts, T} = Open,
+transition({open, TPid, Hosts, _} = Open,
            #watchdog{transport = TPid,
                      status = initial,
-                     parent = Pid,
                      restrict = {_, R}}
           = S) ->
     case okay(getr(restart), Hosts, R) of
         okay ->
-            open(Pid, {TPid, T}),
             set_watchdog(S#watchdog{status = okay});
         reopen ->
             transition(Open, S#watchdog{status = down})
     end;
 
@@ -274,17 +292,15 @@ transition({open, TPid, Hosts, T} = Open,
 %%                                    SetWatchdog()
 %%                                    Pending = TRUE       REOPEN
 
-transition({open = P, TPid, _Hosts, T},
+transition({open = Key, TPid, _Hosts, T},
           #watchdog{transport = TPid,
-                     parent = Pid,
                      status = down}
           = S) ->
    %% Store the info we need to notify the parent to reopen the
    %% connection after the requisite DWA's are received, at which
    %% time we eraser(open). The reopen message is a later addition,
    %% to communicate the new capabilities as soon as they're known.
-    putr(P, {TPid, T}),
-    Pid ! {reopen, self(), {TPid, T}},
+    putr(Key, {TPid, T}),
     set_watchdog(send_watchdog(S#watchdog{status = reopen,
                                           num_dwa = 0}));
 
@@ -296,26 +312,18 @@ transition({open = P, TPid, _Hosts, T},
 %% REOPEN        Connection down      CloseConnection()
 %%                                    SetWatchdog()        DOWN
 
-transition({'DOWN', _, process, TPid, _},
+transition({'DOWN', _, process, TPid, _Reason},
           #watchdog{transport = TPid,
-                     status = S,
-                     shutdown = D})
-  when S == initial;
-       D ->
+                     shutdown = true}) ->
     stop;
 
-transition({'DOWN', _, process, TPid, _},
-           #watchdog{transport = TPid}
+transition({'DOWN', _, process, TPid, _Reason},
+           #watchdog{transport = TPid,
+                     status = T}
           = S) ->
-    failover(S),
-    close(S),
-    set_watchdog(S#watchdog{status = down,
+    set_watchdog(S#watchdog{status = case T of initial -> T; _ -> down end,
                             pending = false,
                             transport = undefined});
 
-%% Any outstanding pending (or other messages from the transport) will
-%% have arrived before 'DOWN' since the message comes from the same
-%% process. Note that we could also get this message in the initial
-%% state.
-
 %% Incoming message.
 transition({recv, TPid, Name, Pkt}, #watchdog{transport = TPid} = S) ->
@@ -331,15 +339,11 @@ transition({timeout, _, tw}, #watchdog{}) ->
 
 %% State query.
 transition({state, Pid}, #watchdog{status = S}) ->
-    Pid ! {self(), S},
+    send(Pid, {self(), S}),
     ok.
 
 %% ===========================================================================
 
-monitor(Pid) ->
-    erlang:monitor(process, Pid),
-    Pid.
-
 putr(Key, Val) ->
     put({?MODULE, Key}, Val).
@@ -386,7 +390,7 @@ okay([{_,P}]) ->
 
 %% ... or it has.
 okay(C) ->
-    [_|_] = [P ! close || {_,P} <- C, self() /= P],
+    [_|_] = [send(P, close) || {_,P} <- C, self() /= P],
     reopen.
 
 %% set_watchdog/1
@@ -408,36 +412,13 @@ tw(T)
 tw({M,F,A}) ->
     apply(M,F,A).
 
-%% open/2
-
-open(Pid, {_,_} = T) ->
-    Pid ! {connection_up, self(), T}.
-
-%% failover/1
-
-failover(#watchdog{status = okay,
-                   parent = Pid}) ->
-    Pid ! {connection_down, self()};
-
-failover(_) ->
-    ok.
-
-%% close/1
-
-close(#watchdog{status = down}) ->
-    ok;
-
-close(#watchdog{parent = Pid}) ->
-    {{T, _}, _, _} = getr(restart),
-    T == accept andalso (Pid ! {close, self()}).
-
 %% send_watchdog/1
 
 send_watchdog(#watchdog{pending = false,
                         transport = TPid,
                         sequence = Mask}
               = S) ->
-    TPid ! {send, encode(getr(dwr), Mask)},
+    send(TPid, {send, encode(getr(dwr), Mask)}),
     ?LOG(send, 'DWR'),
     S#watchdog{pending = true}.
@@ -518,12 +499,10 @@ rcv(_, #watchdog{status = okay} = S) ->
 %%                                    SetWatchdog()        OKAY
 
 rcv('DWA', #watchdog{status = suspect} = S) ->
-    failback(S),
     set_watchdog(S#watchdog{status = okay,
                             pending = false});
 
 rcv(_, #watchdog{status = suspect} = S) ->
-    failback(S),
     set_watchdog(S#watchdog{status = okay});
 
 %% REOPEN        Receive DWA & Pending = FALSE
@@ -531,10 +510,8 @@ rcv(_, #watchdog{status = suspect} = S) ->
 %%                                    Failback()           OKAY
 
 rcv('DWA', #watchdog{status = reopen,
-                     num_dwa = 2 = N,
-                     parent = Pid}
+                     num_dwa = 2 = N}
           = S) ->
-    open(Pid, eraser(open)),
     S#watchdog{status = okay,
               num_dwa = N+1,
               pending = false};
@@ -553,11 +530,6 @@ rcv('DWA', #watchdog{status = reopen,
 rcv(_, #watchdog{status = reopen} = S) ->
     throwaway(S).
 
-%% failback/1
-
-failback(#watchdog{parent = Pid}) ->
-    Pid ! {connection_up, self()}.
-
 %% timeout/1
 %%
 %% The caller sets the watchdog on the return value.
@@ -582,7 +554,6 @@ timeout(#watchdog{status = T,
 
 timeout(#watchdog{status = okay,
                   pending = true}
        = S) ->
-    failover(S),
     S#watchdog{status = suspect};
 
 %% SUSPECT       Timer expires        CloseConnection()
 %%                                    SetWatchdog()        DOWN
 
@@ -599,7 +570,6 @@ timeout(#watchdog{status = T,
  when T == suspect;
       T == reopen, P, N < 0 ->
     exit(TPid, {shutdown, watchdog_timeout}),
-    close(S),
     S#watchdog{status = down};
 
 %% REOPEN        Timer expires &      NumDWA = -1
@@ -633,7 +603,9 @@ timeout(#watchdog{status = reopen,
 %% process has died. We only need to handle state down since we start
 %% the first watchdog when transitioning out of initial.
 
-timeout(#watchdog{status = down} = S) ->
+timeout(#watchdog{status = T} = S)
+  when T == initial;
+       T == down ->
     restart(S).
 
 %% restart/1
@@ -659,11 +631,9 @@ restart({{connect, _} = T, Opts, Svc},
        #watchdog{parent = Pid,
                  sequence = Mask,
                  restrict = {R,_}}
        = S) ->
-    Pid ! {reconnect, self()},
+    send(Pid, {reconnect, self()}),
     Nodes = restrict_nodes(R),
-    S#watchdog{transport = monitor(diameter_peer_fsm:start(T,
-                                                           Opts,
-                                                           {Mask, Nodes, Svc})),
+    S#watchdog{transport = start(T, Opts, Mask, Nodes, Svc),
               restrict = {R, lists:member(node(), Nodes)}};
 
 %% No restriction on the number of connections to the same peer: just
diff --git a/lib/diameter/src/transport/diameter_tcp.erl b/lib/diameter/src/transport/diameter_tcp.erl
index 7ec7b1c5e7..596e582ab0 100644
--- a/lib/diameter/src/transport/diameter_tcp.erl
+++ b/lib/diameter/src/transport/diameter_tcp.erl
@@ -368,8 +368,6 @@ handle_info(T, #monitor{} = S) ->
 %% # code_change/3
 %% ---------------------------------------------------------------------------
 
-code_change(_, {transport, _, _, _, _} = S, _) ->
-    {ok, #transport{} = list_to_tuple(tuple_to_list(S) ++ [false])};
 code_change(_, State, _) ->
     {ok, State}.
 
diff --git a/lib/diameter/test/diameter_event_SUITE.erl b/lib/diameter/test/diameter_event_SUITE.erl
index 7c1c76f22a..18bdcb1f54 100644
--- a/lib/diameter/test/diameter_event_SUITE.erl
+++ b/lib/diameter/test/diameter_event_SUITE.erl
@@ -121,7 +121,6 @@ down(Config) ->
                                 {applications, [?DICT_ACCT]},
                                 {reconnect_timer, 5000}]),
     start = event(Svc),
-    {watchdog, Ref, _, {initial, down}, _} = event(Svc),
     {closed, Ref, {'CEA', ?NO_COMMON_APP, _, #diameter_packet{}}, _}
         = event(Svc),
     {reconnect, Ref, _} = event(Svc).
@@ -132,7 +131,6 @@ cea_timeout(Config) ->
     {Svc, Ref} = connect(Config, [{capx_timeout, ?SERVER_CAPX_TMO div 2},
                                   {reconnect_timer, 2*?SERVER_CAPX_TMO}]),
     start = event(Svc),
-    {watchdog, Ref, _, {initial, down}, _} = event(Svc),
     {closed, Ref, {'CEA', timeout}, _} = event(Svc).
 
 stop(_Config) ->
diff --git a/lib/diameter/test/diameter_traffic_SUITE.erl b/lib/diameter/test/diameter_traffic_SUITE.erl
index b03a9ce4d1..f8db4a6863 100644
--- a/lib/diameter/test/diameter_traffic_SUITE.erl
+++ b/lib/diameter/test/diameter_traffic_SUITE.erl
@@ -80,7 +80,9 @@
          send_multiple_filters_2/1,
          send_multiple_filters_3/1,
          send_anything/1,
+         outstanding/1,
          remove_transports/1,
+         empty/1,
          stop_services/1,
         stop/1]).
@@ -202,7 +204,7 @@ all() ->
                     C <- ?CONTAINERS,
                     A <- ?ENCODINGS,
                     P <- [[], [parallel]]]
-        ++ [remove_transports, stop_services, stop].
+        ++ [outstanding, remove_transports, empty, stop_services, stop].
 
 groups() ->
     Ts = tc(),
@@ -288,6 +290,11 @@ add_transports(Config) ->
           || C <- ?CONNECTIONS],
     ?util:write_priv(Config, "transport", [LRef | Cs]).
 
+%% Ensure there are no outstanding requests in request table.
+outstanding(_Config) ->
+    [] = [T || T <- ets:tab2list(diameter_request),
+               is_atom(element(1,T))].
+
 remove_transports(Config) ->
     [LRef | Cs] = ?util:read_priv(Config, "transport"),
     [?util:disconnect(?CLIENT, C, ?SERVER, LRef) || C <- Cs].
@@ -296,6 +303,10 @@ stop_services(_Config) ->
     ok = diameter:stop_service(?CLIENT),
     ok = diameter:stop_service(?SERVER).
 
+%% Ensure even transports have been removed from request table.
+empty(_Config) ->
+    [] = ets:tab2list(diameter_request).
+
 stop(_Config) ->
     ok = diameter:stop().
 
diff --git a/lib/diameter/test/diameter_watchdog_SUITE.erl b/lib/diameter/test/diameter_watchdog_SUITE.erl
index 7ce09e93ca..f9f72d364a 100644
--- a/lib/diameter/test/diameter_watchdog_SUITE.erl
+++ b/lib/diameter/test/diameter_watchdog_SUITE.erl
@@ -1,7 +1,7 @@
 %%
 %% %CopyrightBegin%
 %%
-%% Copyright Ericsson AB 2010-2012. All Rights Reserved.
+%% Copyright Ericsson AB 2010-2013. All Rights Reserved.
 %%
 %% The contents of this file are subject to the Erlang Public License,
 %% Version 1.1, (the "License"); you may not use this file except in
@@ -127,13 +127,7 @@ end_per_suite(_Config) ->
 %% ===========================================================================
 
 %% Test the watchdog state machine for the required failover, failback
-%% and reopen behaviour. Do this by having the testcase replace
-%% diameter_service and start watchdogs, and having this module
-%% implement a transport process that plays the role of the peer
-%% Diameter node.
-
-%reopen(_) ->
-%    reopen(connect, ?WD(10000), 1, 'DWR');
+%% and reopen behaviour by examining watchdog events.
 
 reopen(_) ->
     [] = run([[reopen, T, Wd, N, M] |
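
For reference, the RFC 3539 behaviour that the suites above now verify through events, paraphrased as a pure transition function. This is an illustrative sketch under stated assumptions (the module name wd_sketch and the event atoms are invented); the real logic lives in diameter_watchdog:rcv/2 and timeout/1 as patched above.

    -module(wd_sketch).
    -export([next/2]).

    %% next(State, Event) -> NewState, over the watchdog states
    %% initial | okay | suspect | reopen | down used in the diff.
    next(initial, connection_up)   -> okay;     %% capabilities exchanged
    next(okay,    timeout_pending) -> suspect;  %% DWR unanswered: Failover()
    next(suspect, dwa)             -> okay;     %% Failback()
    next(suspect, timeout)         -> down;     %% CloseConnection()
    next(down,    connection_up)   -> reopen;   %% reconnected, awaiting DWAs
    next(reopen,  third_dwa)       -> okay;     %% NumDWA reaches 3
    next(State,   _Other)          -> State.    %% everything else: no change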