From 6d087921e7608691ed5cd554484a13b961d3620f Mon Sep 17 00:00:00 2001 From: Anders Svensson Date: Mon, 19 May 2014 09:10:00 +0200 Subject: Fix watchdog table leak Commit ef5fddcb (diameter-1.4.1, R16B) caused the leak in the case of an accepting watchdog with restrict_connections = false. It (correctly) ensured the state remained at INITIAL but a subsequent 'close' message to terminate the process was ignored since the state was not DOWN. In fact, no 'close' was sent since there was no state transition or previous connection: the former triggers the message from diameter_service, the latter from diameter_watchdog. The message is now sent to self() from the watchdog itself. Send 'close' in the same way when multiple connections to the same peer are allowed, to avoid waiting for a watchdog timer expiry for the process to terminate in this case. --- lib/diameter/src/base/diameter_service.erl | 6 ++-- lib/diameter/src/base/diameter_watchdog.erl | 51 +++++++++++++++++++---------- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/lib/diameter/src/base/diameter_service.erl b/lib/diameter/src/base/diameter_service.erl index 70e66537ed..4b2e5b674c 100644 --- a/lib/diameter/src/base/diameter_service.erl +++ b/lib/diameter/src/base/diameter_service.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2010-2013. All Rights Reserved. +%% Copyright Ericsson AB 2010-2014. All Rights Reserved. %% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -1188,7 +1188,7 @@ tc(false = No, _, _) -> %% removed %% another watchdog to be able to detect that it should transition %% from initial into reopen rather than okay. That someone is either %% the accepting watchdog upon reception of a CER from the previously -%% connected peer, or us after connect_timer timeout. +%% connected peer, or us after connect_timer timeout or immediately. close(#watchdog{type = connect}, _) -> ok; @@ -1199,7 +1199,7 @@ close(#watchdog{type = accept, #state{service_name = SvcName}) -> c(Pid, diameter_config:have_transport(SvcName, Ref), Opts). -%% Tell watchdog to (maybe) die later ... +%% Tell watchdog to die later ... c(Pid, true, Opts) -> Tc = connect_timer(Opts, 2*?DEFAULT_TC), erlang:send_after(Tc, Pid, close); diff --git a/lib/diameter/src/base/diameter_watchdog.erl b/lib/diameter/src/base/diameter_watchdog.erl index 9a1c8b6585..62dc32af69 100644 --- a/lib/diameter/src/base/diameter_watchdog.erl +++ b/lib/diameter/src/base/diameter_watchdog.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2010-2013. All Rights Reserved. +%% Copyright Ericsson AB 2010-2014. All Rights Reserved. %% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -49,8 +49,6 @@ -define(IS_NATURAL(N), (is_integer(N) andalso 0 =< N)). --define(CHOOSE(B,T,F), if (B) -> T; true -> F end). - -record(config, {suspect = 1 :: non_neg_integer(), %% OKAY -> SUSPECT okay = 3 :: non_neg_integer()}). %% REOPEN -> OKAY @@ -328,14 +326,13 @@ code_change(_, State, _) -> %% The state transitions documented here are extracted from RFC 3539, %% the commentary is ours. -%% Service or watchdog is telling the watchdog of an accepting -%% transport to die after connect_timer expiry or reestablished -%% connection (in another transport process) respectively. -transition(close, #watchdog{status = down}) -> +%% Service is telling the watchdog of an accepting transport to die +%% following transport death in state INITIAL, or after connect_timer +%% expiry; or another watchdog is saying the same after reestablishing +%% a connection previously had by this one. +transition(close, #watchdog{}) -> {{accept, _}, _, _} = getr(restart), %% assert stop; -transition(close, #watchdog{}) -> - ok; %% Service is asking for the peer to be taken down gracefully. transition({shutdown, Pid, _}, #watchdog{parent = Pid, @@ -425,11 +422,30 @@ transition({'DOWN', _, process, TPid, _Reason}, transition({'DOWN', _, process, TPid, _Reason}, #watchdog{transport = TPid, - status = T} - = S) -> - set_watchdog(S#watchdog{status = ?CHOOSE(initial == T, T, down), - pending = false, - transport = undefined}); + status = T, + restrict = {_,R}} + = S0) -> + S = S0#watchdog{pending = false, + transport = undefined}, + {{M,_}, _, _} = getr(restart), + + %% Close an accepting watchdog immediately if there's no + %% restriction on the number of connections to the same peer: the + %% state machine never enters state REOPEN in this case. The + %% 'close' message (instead of stop) is so as not to bypass the + %% sending of messages to the service process in handle_info/2. + + if T /= initial, M == accept, not R -> + send(self(), close), + S#watchdog{status = down}; + T /= initial -> + set_watchdog(S#watchdog{status = down}); + M == connect -> + set_watchdog(S); + M == accept -> + send(self(), close), + S + end; %% Incoming message. transition({recv, TPid, Name, Pkt}, #watchdog{transport = TPid} = S) -> @@ -755,7 +771,7 @@ timeout(#watchdog{status = T} = S) restart(#watchdog{transport = undefined} = S) -> restart(getr(restart), S); -restart(S) -> +restart(S) -> %% reconnect has won race with timeout S. %% restart/2 @@ -785,9 +801,10 @@ restart({{connect, _} = T, Opts, Svc}, %% die. Note that a state machine never enters state REOPEN in this %% case. restart({{accept, _}, _, _}, #watchdog{restrict = {_, false}}) -> - stop; + stop; %% 'DOWN' was in old code: 'close' was not sent -%% Otherwise hang around until told to die. +%% Otherwise hang around until told to die, either by the service or +%% by another watchdog. restart({{accept, _}, _, _}, S) -> S. -- cgit v1.2.3