From 8e6d005dcd8f62745d28d1820c4b2a5025634f5f Mon Sep 17 00:00:00 2001 From: Alexey Lebedeff Date: Mon, 14 Dec 2015 15:15:52 +0300 Subject: Prevent down nodes going undetected in epmd In the following (rare) case down node will be always registered in epmd: - client connects to epmd and sends ALIVE2 request - epmd reads this request and starts to process it - during that time client socket closes in such way that subsequent write(2) in epmd will result in error - at this point we have node that was registered in database, but as the connection struct has no 'keep' flag set, the do_read() closes connection and removes it from select fdset - and so there is no way for this node to be cleaned up later. We've seen several epmd instances in such state on our production systems. And while I'm not sure what was the exact sequence of events that leads to failed write(2), issue could be easily reproduced using SO_LINGER option for socket. --- erts/epmd/test/epmd_SUITE.erl | 58 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 7 deletions(-) (limited to 'erts/epmd/test') diff --git a/erts/epmd/test/epmd_SUITE.erl b/erts/epmd/test/epmd_SUITE.erl index e8bbfdbb18..4de65500e9 100644 --- a/erts/epmd/test/epmd_SUITE.erl +++ b/erts/epmd/test/epmd_SUITE.erl @@ -76,7 +76,9 @@ buffer_overrun_2/1, no_nonlocal_register/1, no_nonlocal_kill/1, - no_live_killing/1 + no_live_killing/1, + + socket_reset_before_alive2_reply_is_written/1 ]). @@ -123,7 +125,8 @@ all() -> returns_valid_populated_extra_with_nulls, names_stdout, {group, buffer_overrun}, no_nonlocal_register, - no_nonlocal_kill, no_live_killing]. + no_nonlocal_kill, no_live_killing, + socket_reset_before_alive2_reply_is_written]. groups() -> [{buffer_overrun, [], @@ -243,11 +246,7 @@ register_node(Name,Port) -> register_node_v2(Port,$M,0,5,5,Name,""). register_node_v2(Port, NodeType, Prot, HVsn, LVsn, Name, Extra) -> - Utf8Name = unicode:characters_to_binary(Name), - Req = [?EPMD_ALIVE2_REQ, put16(Port), NodeType, Prot, - put16(HVsn), put16(LVsn), - put16(size(Utf8Name)), binary_to_list(Utf8Name), - size16(Extra), Extra], + Req = alive2_req(Port, NodeType, Prot, HVsn, LVsn, Name, Extra), case send_req(Req) of {ok,Sock} -> case recv(Sock,4) of @@ -938,6 +937,42 @@ no_live_killing(Config) when is_list(Config) -> ?line close(Sock3), ok. +socket_reset_before_alive2_reply_is_written(doc) -> + ["Check for regression - don't make zombie from node which " + "sends TCP RST at wrong time"]; +socket_reset_before_alive2_reply_is_written(suite) -> + []; +socket_reset_before_alive2_reply_is_written(Config) when is_list(Config) -> + %% - delay_write for easier triggering of race condition + %% - relaxed_command_check for gracefull shutdown of epmd even if there + %% is stuck node. + ?line ok = epmdrun("-delay_write 1 -relaxed_command_check"), + + %% We can't use send_req/1 directly as we want to do inet:setopts/2 + %% on our socket. + ?line {ok, Sock} = connect(), + + %% Issuing close/1 on such socket will result in immediate RST packet. + ?line ok = inet:setopts(Sock, [{linger, {true, 0}}]), + + Req = alive2_req(4711, 77, 0, 5, 5, "test", []), + ?line ok = send(Sock, [size16(Req), Req]), + + timer:sleep(500), %% Wait for the first 1/2 of delay_write before closing + ?line ok = close(Sock), + + timer:sleep(500 + ?SHORT_PAUSE), %% Wait for the other 1/2 of delay_write + + %% Wait another delay_write interval, due to delay doubling in epmd. + %% Should be removed when this is issue is fixed there. + timer:sleep(1000), + + ?line {ok, SockForNames} = connect_active(), + + %% And there should be no stuck nodes + ?line {ok, []} = do_get_names(SockForNames), + ?line ok = close(SockForNames), + ok. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Terminate all tests with killing epmd. @@ -1200,3 +1235,12 @@ flat_count([_|T], N) -> flat_count(T, N); flat_count([], N) -> N. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +alive2_req(Port, NodeType, Prot, HVsn, LVsn, Name, Extra) -> + Utf8Name = unicode:characters_to_binary(Name), + [?EPMD_ALIVE2_REQ, put16(Port), NodeType, Prot, + put16(HVsn), put16(LVsn), + put16(size(Utf8Name)), binary_to_list(Utf8Name), + size16(Extra), Extra]. -- cgit v1.2.3 From 03fcb7dabf8861e60ffab4121a909b347bccfec9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Mon, 15 Feb 2016 15:33:12 +0100 Subject: Eliminate use of test_server.hrl and test_server_line.hrl As a first step to removing the test_server application as as its own separate application, change the inclusion of test_server.hrl to an inclusion of ct.hrl and remove the inclusion of test_server_line.hrl. --- erts/epmd/test/epmd_SUITE.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'erts/epmd/test') diff --git a/erts/epmd/test/epmd_SUITE.erl b/erts/epmd/test/epmd_SUITE.erl index 4de65500e9..0c8f01ef6b 100644 --- a/erts/epmd/test/epmd_SUITE.erl +++ b/erts/epmd/test/epmd_SUITE.erl @@ -18,7 +18,7 @@ %% %CopyrightEnd% %% -module(epmd_SUITE). --include_lib("test_server/include/test_server.hrl"). +-include_lib("common_test/include/ct.hrl"). -include_lib("kernel/include/file.hrl"). -- cgit v1.2.3