aboutsummaryrefslogtreecommitdiffstats
path: root/lib/ssl
diff options
context:
space:
mode:
author: Magnus Henoch <[email protected]> 2015-10-28 17:25:07 +0000
committer: Magnus Henoch <[email protected]> 2015-10-28 19:11:03 +0000
commit: c4e594710f0e822db06a277b0a763e02d73d6e24 (patch)
tree: 82be80a96ba1201d5029c74896020cdf5ea61b44 /lib/ssl
parent: bd1251dfe4d60f09e569731d36a92e94acbe297a (diff)
download: otp-c4e594710f0e822db06a277b0a763e02d73d6e24.tar.gz
otp-c4e594710f0e822db06a277b0a763e02d73d6e24.tar.bz2
otp-c4e594710f0e822db06a277b0a763e02d73d6e24.zip
TLS distribution: wait for code server
As described in the comments in the patch, doing a TLS handshake requires the crypto module to be loaded. The crypto module needs the code server to find its NIF library. However, there is a time window between opening the listening ports for distribution and starting the code server, and if we get an incoming connection in that time window, the node would believe that it's alive, but it wouldn't actually accept any more connections.
Diffstat (limited to 'lib/ssl')
-rw-r--r-- lib/ssl/src/ssl_tls_dist_proxy.erl 30
1 files changed, 30 insertions, 0 deletions
diff --git a/lib/ssl/src/ssl_tls_dist_proxy.erl b/lib/ssl/src/ssl_tls_dist_proxy.erl
index a22af6b960..cc4c410520 100644
--- a/lib/ssl/src/ssl_tls_dist_proxy.erl
+++ b/lib/ssl/src/ssl_tls_dist_proxy.erl
@@ -149,6 +149,7 @@ accept_loop(Proxy, world = Type, Listen, Extra) ->
case gen_tcp:accept(Listen) of
{ok, Socket} ->
Opts = get_ssl_options(server),
+ wait_for_code_server(),
case ssl:ssl_accept(Socket, Opts) of
{ok, SslSocket} ->
PairHandler =
@@ -165,6 +166,35 @@ accept_loop(Proxy, world = Type, Listen, Extra) ->
end,
accept_loop(Proxy, Type, Listen, Extra).
+wait_for_code_server() ->
+ %% Block until a process is registered as code_server, polling
+ %% every 10 ms, then return ok. This is an ugly hack; the reason
+ %% for it is as follows.
+ %%
+ %% Upgrading a socket to TLS requires the crypto module to be
+ %% loaded. Loading crypto runs its on_load function, which calls
+ %% code:priv_dir/1 to locate its NIF library. Distribution is
+ %% started before the code server, so if an incoming connection
+ %% arrives on the distribution port early enough, the code server
+ %% may not be running yet and code:priv_dir/1 may fail.
+ %%
+ %% When a module's on_load function fails, the module is unloaded
+ %% and the call that triggered the load fails with 'undef', which
+ %% is rather confusing.
+ %%
+ %% That failure terminates the ssl_tls_dist_proxy process. It is
+ %% restarted by ssl_dist_sup, but the new process has no memory of
+ %% net_kernel having asked it to listen for incoming connections.
+ %% The node then believes it is open for distribution when it is
+ %% not. So avoid all that by waiting for the code server to start.
+ case whereis(code_server) of
+ undefined ->
+ timer:sleep(10),
+ wait_for_code_server();
+ Pid when is_pid(Pid) ->
+ ok
+ end.
+
try_connect(Port) ->
case gen_tcp:connect({127,0,0,1}, Port, [{active, false}, {packet,?PPRE}]) of
R = {ok, _S} ->