diff options
author | Magnus Henoch <[email protected]> | 2015-10-28 17:25:07 +0000 |
---|---|---|
committer | Magnus Henoch <[email protected]> | 2015-10-28 19:11:03 +0000 |
commit | c4e594710f0e822db06a277b0a763e02d73d6e24 (patch) | |
tree | 82be80a96ba1201d5029c74896020cdf5ea61b44 /lib/ssl/src | |
parent | bd1251dfe4d60f09e569731d36a92e94acbe297a (diff) | |
download | otp-c4e594710f0e822db06a277b0a763e02d73d6e24.tar.gz otp-c4e594710f0e822db06a277b0a763e02d73d6e24.tar.bz2 otp-c4e594710f0e822db06a277b0a763e02d73d6e24.zip |
TLS distribution: wait for code server
As described in the comments in the patch, doing a TLS handshake
requires the crypto module to be loaded. The crypto module needs the
code server to find its NIF library. However, there is a time window
between opening the listening ports for distribution and starting the
code server, and if we get an incoming connection in that time window,
the node would believe that it's alive, but it wouldn't actually
accept any more connections.
Diffstat (limited to 'lib/ssl/src')
-rw-r--r-- | lib/ssl/src/ssl_tls_dist_proxy.erl | 30 |
1 files changed, 30 insertions, 0 deletions
diff --git a/lib/ssl/src/ssl_tls_dist_proxy.erl b/lib/ssl/src/ssl_tls_dist_proxy.erl index a22af6b960..cc4c410520 100644 --- a/lib/ssl/src/ssl_tls_dist_proxy.erl +++ b/lib/ssl/src/ssl_tls_dist_proxy.erl @@ -149,6 +149,7 @@ accept_loop(Proxy, world = Type, Listen, Extra) -> case gen_tcp:accept(Listen) of {ok, Socket} -> Opts = get_ssl_options(server), + wait_for_code_server(), case ssl:ssl_accept(Socket, Opts) of {ok, SslSocket} -> PairHandler = @@ -165,6 +166,35 @@ accept_loop(Proxy, world = Type, Listen, Extra) -> end, accept_loop(Proxy, Type, Listen, Extra). +wait_for_code_server() -> + %% This is an ugly hack. Upgrading a socket to TLS requires the + %% crypto module to be loaded. Loading the crypto module triggers + %% its on_load function, which calls code:priv_dir/1 to find the + %% directory where its NIF library is. However, distribution is + %% started earlier than the code server, so the code server is not + %% necessarily started yet, and code:priv_dir/1 might fail because + %% of that, if we receive an incoming connection on the + %% distribution port early enough. + %% + %% If the on_load function of a module fails, the module is + %% unloaded, and the function call that triggered loading it fails + %% with 'undef', which is rather confusing. + %% + %% Thus, the ssl_tls_dist_proxy process will terminate, and be + %% restarted by ssl_dist_sup. However, it won't have any memory + %% of being asked by net_kernel to listen for incoming + %% connections. Hence, the node will believe that it's open for + %% distribution, but it actually isn't. + %% + %% So let's avoid that by waiting for the code server to start. + case whereis(code_server) of + undefined -> + timer:sleep(10), + wait_for_code_server(); + Pid when is_pid(Pid) -> + ok + end. + try_connect(Port) -> case gen_tcp:connect({127,0,0,1}, Port, [{active, false}, {packet,?PPRE}]) of R = {ok, _S} -> |