From 93ad08d8c6d6a9875a10b33633aca52de5d3c59b Mon Sep 17 00:00:00 2001 From: Maxim Fedorov Date: Tue, 28 Aug 2018 16:11:09 -0700 Subject: Fix an endless rescheduling loop when a process is executing process_info(self(), ...) It is possible that a process has to yield before completing the process_info BIF when it runs out of reductions. If this BIF is called by the process itself, it does not send a signal but executes in the context of that process. If it has to yield, it turns the F_LOCAL_SIGS_ONLY flag on, which means new signals won't be fetched from the outer message queue. When the same process needs to execute dirty system code (e.g. dirty GC) it has to be run on a dirty scheduler. However, signals enqueued into the outer queue cause it to be rescheduled on a normal scheduler. F_LOCAL_SIGS_ONLY prevents delivery of outer queue signals, creating an endless rescheduling loop. This commit disengages F_LOCAL_SIGS_ONLY if the process needs to execute dirty code, in order to complete signal delivery and allow the process to be moved to a dirty run queue. 
--- erts/emulator/beam/erl_process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c index 7a9ef3c1de..d5bd17ff3e 100644 --- a/erts/emulator/beam/erl_process.c +++ b/erts/emulator/beam/erl_process.c @@ -9641,7 +9641,7 @@ Process *erts_schedule(ErtsSchedulerData *esdp, Process *p, int calls) if (state & ERTS_PSFLG_RUNNING_SYS) { if (state & (ERTS_PSFLG_SIG_Q|ERTS_PSFLG_SIG_IN_Q)) { int local_only = (!!(p->flags & F_LOCAL_SIGS_ONLY) - & !(state & ERTS_PSFLG_SUSPENDED)); + & !(state & (ERTS_PSFLG_SUSPENDED|ERTS_PSFLGS_DIRTY_WORK))); if (!local_only | !!(state & ERTS_PSFLG_SIG_Q)) { int sig_reds; /* -- cgit v1.2.3 From a31216200bdee2c04b3fb3ae5e26607674715c8a Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Wed, 5 Sep 2018 16:04:51 +0200 Subject: Prevent inconsistent node lists If net_kernel "forgets" to abort a connection (as it currently might), the garbage collection of a distribution entry could cause node lists to enter an inconsistent state. 
--- erts/emulator/beam/dist.c | 6 +++++ erts/emulator/beam/dist.h | 2 ++ erts/emulator/beam/erl_node_tables.c | 38 +++++++++++++++++++++++++++++ erts/emulator/test/node_container_SUITE.erl | 28 +++++++++++++++++++-- 4 files changed, 72 insertions(+), 2 deletions(-) diff --git a/erts/emulator/beam/dist.c b/erts/emulator/beam/dist.c index 70474898b2..16c4d689a5 100644 --- a/erts/emulator/beam/dist.c +++ b/erts/emulator/beam/dist.c @@ -3628,6 +3628,12 @@ static Sint abort_connection(DistEntry* dep, Uint32 conn_id) return 0; } +Sint +erts_abort_connection(DistEntry *dep, Uint32 conn_id) +{ + return abort_connection(dep, conn_id); +} + BIF_RETTYPE erts_internal_abort_connection_2(BIF_ALIST_2) { DistEntry* dep; diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h index dda2029a4c..30b4b35c20 100644 --- a/erts/emulator/beam/dist.h +++ b/erts/emulator/beam/dist.h @@ -399,5 +399,7 @@ extern void erts_kill_dist_connection(DistEntry *dep, Uint32); extern Uint erts_dist_cache_size(void); +extern Sint erts_abort_connection(DistEntry *dep, Uint32 conn_id); + #endif diff --git a/erts/emulator/beam/erl_node_tables.c b/erts/emulator/beam/erl_node_tables.c index 1f147011a8..9b34af1480 100644 --- a/erts/emulator/beam/erl_node_tables.c +++ b/erts/emulator/beam/erl_node_tables.c @@ -412,6 +412,44 @@ static void schedule_delete_dist_entry(DistEntry* dep) static void start_timer_delete_dist_entry(void *vdep) { + DistEntry *dep = vdep; + Eterm sysname; + enum dist_entry_state state; + Uint32 connection_id; + + erts_de_rlock(dep); + state = dep->state; + connection_id = dep->connection_id; + sysname = dep->sysname; + erts_de_runlock(dep); + + if (state != ERTS_DE_STATE_IDLE) { + char *state_str; + erts_dsprintf_buf_t *dsbuf = erts_create_logger_dsbuf(); + switch (state) { + case ERTS_DE_STATE_CONNECTED: + state_str = "connected"; + break; + case ERTS_DE_STATE_PENDING: + state_str = "pending connect"; + break; + case ERTS_DE_STATE_EXITING: + state_str = "exiting"; + break; + 
case ERTS_DE_STATE_IDLE: + state_str = "idle"; + break; + default: + state_str = "unknown"; + break; + } + erts_dsprintf(dsbuf, "Garbage collecting distribution " + "entry for node %T in state: %s", + sysname, state_str); + erts_send_error_to_logger_nogl(dsbuf); + erts_abort_connection(dep, connection_id); + } + if (node_tab_delete_delay == 0) { prepare_try_delete_dist_entry(vdep); } diff --git a/erts/emulator/test/node_container_SUITE.erl b/erts/emulator/test/node_container_SUITE.erl index 7df001fec5..55135fbcbc 100644 --- a/erts/emulator/test/node_container_SUITE.erl +++ b/erts/emulator/test/node_container_SUITE.erl @@ -50,7 +50,8 @@ bad_nc/1, unique_pid/1, iter_max_procs/1, - magic_ref/1]). + magic_ref/1, + dist_entry_gc/1]). suite() -> [{ct_hooks,[ts_install_cth]}, @@ -58,7 +59,7 @@ suite() -> all() -> - [term_to_binary_to_term_eq, round_trip_eq, cmp, ref_eq, + [dist_entry_gc, term_to_binary_to_term_eq, round_trip_eq, cmp, ref_eq, node_table_gc, dist_link_refc, dist_monitor_refc, node_controller_refc, ets_refc, match_spec_refc, timer_refc, pid_wrap, port_wrap, bad_nc, @@ -894,6 +895,29 @@ magic_ref(Config) when is_list(Config) -> true = is_reference(MRef2), true = erts_debug:get_internal_state({magic_ref,MRef2}), ok. + + +lost_pending_connection(Node) -> + _ = (catch erts_internal:new_connection(Node)), + ok. + +dist_entry_gc(Config) when is_list(Config) -> + Me = self(), + {ok, Node} = start_node(get_nodefirstname(), "+zdntgc 0"), + P = spawn_link(Node, + fun () -> + LostNode = list_to_atom("lost_pending_connection@" ++ hostname()), + lost_pending_connection(LostNode), + garbage_collect(), %% Could crash... + Me ! {self(), ok} + end), + receive + {P, ok} -> ok + end, + unlink(P), + stop_node(Node), + ok. 
+ %% %% -- Internal utils --------------------------------------------------------- %% -- cgit v1.2.3 From 129ec8648ec42c84e315387538310f0e5eb35d66 Mon Sep 17 00:00:00 2001 From: Sverker Eriksson Date: Wed, 5 Sep 2018 16:54:48 +0200 Subject: kernel: Fix missing abort_connection in net_kernel --- lib/kernel/src/net_kernel.erl | 74 ++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/lib/kernel/src/net_kernel.erl b/lib/kernel/src/net_kernel.erl index c4e1a0ce1e..6b1cef6106 100644 --- a/lib/kernel/src/net_kernel.erl +++ b/lib/kernel/src/net_kernel.erl @@ -279,24 +279,18 @@ passive_connect_monitor(From, Node) -> ok = monitor_nodes(true,[{node_type,all}]), Reply = case lists:member(Node,nodes([connected])) of true -> - io:format("~p: passive_connect_monitor ~p\n", [self(), ?LINE]), true; _ -> receive {nodeup,Node,_} -> - io:format("~p: passive_connect_monitor ~p\n", [self(), ?LINE]), true after connecttime() -> - io:format("~p: passive_connect_monitor ~p\n", [self(), ?LINE]), false end end, ok = monitor_nodes(false,[{node_type,all}]), - io:format("~p: passive_connect_monitor ~p\n", [self(), ?LINE]), {Pid, Tag} = From, - io:format("~p: passive_connect_monitor ~p\n", [self(), ?LINE]), - erlang:send(Pid, {Tag, Reply}), - io:format("~p: passive_connect_monitor ~p\n", [self(), ?LINE]). + erlang:send(Pid, {Tag, Reply}). %% If the net_kernel isn't running we ignore all requests to the @@ -358,20 +352,34 @@ init({Name, LongOrShortNames, TickT, CleanHalt}) -> {stop, Error} end. 
- -do_auto_connect(Type, Node, ConnId, WaitForBarred, From, State) -> - ConnLookup = ets:lookup(sys_dist, Node), - - case ConnLookup of +do_auto_connect_1(Node, ConnId, From, State) -> + case ets:lookup(sys_dist, Node) of [#barred_connection{}] -> - case WaitForBarred of - false -> - {reply, false, State}; - true -> + case ConnId of + passive_cnct -> spawn(?MODULE,passive_connect_monitor,[From,Node]), - {noreply, State} + {noreply, State}; + _ -> + erts_internal:abort_connection(Node, ConnId), + {reply, false, State} end; + ConnLookup -> + do_auto_connect_2(Node, ConnId, From, State, ConnLookup) + end. + +do_auto_connect_2(Node, passive_cnct, From, State, ConnLookup) -> + case (catch erts_internal:new_connection(Node)) of + {Nr,_DHandle}=ConnId when is_integer(Nr) -> + do_auto_connect_2(Node, ConnId, From, State, ConnLookup); + + _Error -> + error_logger:error_msg("~n** Cannot get connection id for node ~w~n", + [Node]), + {reply, false, State} + end; +do_auto_connect_2(Node, ConnId, From, State, ConnLookup) -> + case ConnLookup of [#connection{conn_id=ConnId, state = up}] -> {reply, true, State}; [#connection{conn_id=ConnId, waiting=Waiting}=Conn] -> @@ -385,6 +393,7 @@ do_auto_connect(Type, Node, ConnId, WaitForBarred, From, State) -> case application:get_env(kernel, dist_auto_connect) of {ok, never} -> ?connect_failure(Node,{dist_auto_connect,never}), + erts_internal:abort_connection(Node, ConnId), {reply, false, State}; %% This might happen due to connection close @@ -394,14 +403,16 @@ do_auto_connect(Type, Node, ConnId, WaitForBarred, From, State) -> (hd(ConnLookup))#connection.state =:= up -> ?connect_failure(Node,{barred_connection, ets:lookup(sys_dist, Node)}), + erts_internal:abort_connection(Node, ConnId), {reply, false, State}; _ -> - case setup(ConnLookup, Node,ConnId,Type,From,State) of + case setup(ConnLookup, Node,ConnId,normal,From,State) of {ok, SetupPid} -> Owners = [{SetupPid, Node} | State#state.conn_owners], 
{noreply,State#state{conn_owners=Owners}}; _Error -> ?connect_failure(Node, {setup_call, failed, _Error}), + erts_internal:abort_connection(Node, ConnId), {reply, false, State} end end @@ -454,18 +465,7 @@ handle_call({passive_cnct, Node}, From, State) when Node =:= node() -> async_reply({reply, true, State}, From); handle_call({passive_cnct, Node}, From, State) -> verbose({passive_cnct, Node}, 1, State), - Type = normal, - WaitForBarred = true, - R = case (catch erts_internal:new_connection(Node)) of - {Nr,_DHandle}=ConnId when is_integer(Nr) -> - do_auto_connect(Type, Node, ConnId, WaitForBarred, From, State); - - _Error -> - error_logger:error_msg("~n** Cannot get connection id for node ~w~n", - [Node]), - {reply, false, State} - end, - + R = do_auto_connect_1(Node, passive_cnct, From, State), return_call(R, From); %% @@ -479,7 +479,16 @@ handle_call({connect, Type, Node}, From, State) -> ConnLookup = ets:lookup(sys_dist, Node), R = case (catch erts_internal:new_connection(Node)) of {Nr,_DHandle}=ConnId when is_integer(Nr) -> - do_explicit_connect(ConnLookup, Type, Node, ConnId, From, State); + R1 = do_explicit_connect(ConnLookup, Type, Node, ConnId, From, State), + case R1 of + {reply, true, _S} -> %% already connected + ok; + {noreply, _S} -> %% connection pending + ok; + {reply, false, _S} -> %% connection refused + erts_internal:abort_connection(Node, ConnId) + end, + R1; _Error -> error_logger:error_msg("~n** Cannot get connection id for node ~w~n", @@ -703,7 +712,7 @@ handle_info({auto_connect,Node, Nr, DHandle}, State) -> verbose({auto_connect, Node, Nr, DHandle}, 1, State), ConnId = {Nr, DHandle}, NewState = - case do_auto_connect(normal, Node, ConnId, false, noreply, State) of + case do_auto_connect_1(Node, ConnId, noreply, State) of {noreply, S} -> %% Pending connection S; @@ -711,7 +720,6 @@ handle_info({auto_connect,Node, Nr, DHandle}, State) -> S; {reply, false, S} -> %% Connection refused - erts_internal:abort_connection(Node, ConnId), S end, 
{noreply, NewState}; -- cgit v1.2.3 From 245a3e53b3d8f324b82ab56f06f8df3cf580f860 Mon Sep 17 00:00:00 2001 From: Erlang/OTP Date: Wed, 5 Sep 2018 20:53:24 +0200 Subject: Update version numbers --- erts/vsn.mk | 2 +- lib/kernel/vsn.mk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/erts/vsn.mk b/erts/vsn.mk index feb51e42d2..293b555b18 100644 --- a/erts/vsn.mk +++ b/erts/vsn.mk @@ -18,7 +18,7 @@ # %CopyrightEnd% # -VSN = 10.0.6 +VSN = 10.0.7 # Port number 4365 in 4.2 # Port number 4366 in 4.3 diff --git a/lib/kernel/vsn.mk b/lib/kernel/vsn.mk index aa8e4dc119..fe22e2af98 100644 --- a/lib/kernel/vsn.mk +++ b/lib/kernel/vsn.mk @@ -1 +1 @@ -KERNEL_VSN = 6.0 +KERNEL_VSN = 6.0.1 -- cgit v1.2.3 From 12d2c65ed477e9fde9a411727de4cc67c53b1a1c Mon Sep 17 00:00:00 2001 From: Erlang/OTP Date: Wed, 5 Sep 2018 20:53:36 +0200 Subject: Update release notes --- erts/doc/src/notes.xml | 28 ++++++++++++++++++++++++++++ lib/kernel/doc/src/notes.xml | 17 +++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/erts/doc/src/notes.xml b/erts/doc/src/notes.xml index 5862318ab7..c7491e2741 100644 --- a/erts/doc/src/notes.xml +++ b/erts/doc/src/notes.xml @@ -31,6 +31,34 @@

This document describes the changes made to the ERTS application.

+
Erts 10.0.7 + +
Fixed Bugs and Malfunctions + + +

+ A process could get stuck in an infinite rescheduling + loop between normal and dirty schedulers. This bug was + introduced in ERTS version 10.0.

+

+ Thanks to Maxim Fedorov for finding and fixing this + issue.

+

+ Own Id: OTP-15275 Aux Id: PR-1943

+
+ +

+ Garbage collection of a distribution entry could cause an + emulator crash if net_kernel had not brought + previous connection attempts on it down properly.

+

+ Own Id: OTP-15279 Aux Id: ERIERL-226

+
+
+
+ +
+
Erts 10.0.6
Fixed Bugs and Malfunctions diff --git a/lib/kernel/doc/src/notes.xml b/lib/kernel/doc/src/notes.xml index e1ef8ab387..f3834d1c1c 100644 --- a/lib/kernel/doc/src/notes.xml +++ b/lib/kernel/doc/src/notes.xml @@ -31,6 +31,23 @@

This document describes the changes made to the Kernel application.

+
Kernel 6.0.1 + +
Fixed Bugs and Malfunctions + + +

+ Fixed bug in net_kernel that could cause an + emulator crash if certain connection attempts failed. Bug + exists since kernel-6.0 (OTP-21.0).

+

+ Own Id: OTP-15280 Aux Id: ERIERL-226, OTP-15279

+
+
+
+ +
+
Kernel 6.0
Fixed Bugs and Malfunctions -- cgit v1.2.3 From 30e5321740b96b3a8984611b6f821692716d0555 Mon Sep 17 00:00:00 2001 From: Erlang/OTP Date: Wed, 5 Sep 2018 20:53:39 +0200 Subject: Updated OTP version --- OTP_VERSION | 2 +- otp_versions.table | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/OTP_VERSION b/OTP_VERSION index 25e6ec249c..7f4e82686d 100644 --- a/OTP_VERSION +++ b/OTP_VERSION @@ -1 +1 @@ -21.0.7 +21.0.8 diff --git a/otp_versions.table b/otp_versions.table index 668455c138..7b8beade18 100644 --- a/otp_versions.table +++ b/otp_versions.table @@ -1,3 +1,4 @@ +OTP-21.0.8 : erts-10.0.7 kernel-6.0.1 # asn1-5.0.6 common_test-1.16 compiler-7.2.3 crypto-4.3.2 debugger-4.2.5 dialyzer-3.3 diameter-2.1.5 edoc-0.9.3 eldap-1.2.4 erl_docgen-0.8 erl_interface-3.10.3 et-1.6.2 eunit-2.3.6 ftp-1.0 hipe-3.18 inets-7.0.1 jinterface-1.9 megaco-3.18.3 mnesia-4.15.4 observer-2.8 odbc-2.12.1 os_mon-2.4.5 otp_mibs-1.2 parsetools-2.1.7 public_key-1.6.1 reltool-0.7.6 runtime_tools-1.13 sasl-3.2 snmp-5.2.11 ssh-4.7 ssl-9.0.1 stdlib-3.5.1 syntax_tools-2.1.5 tftp-1.0 tools-3.0 wx-1.8.4 xmerl-1.3.17 : OTP-21.0.7 : erts-10.0.6 # asn1-5.0.6 common_test-1.16 compiler-7.2.3 crypto-4.3.2 debugger-4.2.5 dialyzer-3.3 diameter-2.1.5 edoc-0.9.3 eldap-1.2.4 erl_docgen-0.8 erl_interface-3.10.3 et-1.6.2 eunit-2.3.6 ftp-1.0 hipe-3.18 inets-7.0.1 jinterface-1.9 kernel-6.0 megaco-3.18.3 mnesia-4.15.4 observer-2.8 odbc-2.12.1 os_mon-2.4.5 otp_mibs-1.2 parsetools-2.1.7 public_key-1.6.1 reltool-0.7.6 runtime_tools-1.13 sasl-3.2 snmp-5.2.11 ssh-4.7 ssl-9.0.1 stdlib-3.5.1 syntax_tools-2.1.5 tftp-1.0 tools-3.0 wx-1.8.4 xmerl-1.3.17 : OTP-21.0.6 : crypto-4.3.2 inets-7.0.1 ssl-9.0.1 # asn1-5.0.6 common_test-1.16 compiler-7.2.3 debugger-4.2.5 dialyzer-3.3 diameter-2.1.5 edoc-0.9.3 eldap-1.2.4 erl_docgen-0.8 erl_interface-3.10.3 erts-10.0.5 et-1.6.2 eunit-2.3.6 ftp-1.0 hipe-3.18 jinterface-1.9 kernel-6.0 megaco-3.18.3 mnesia-4.15.4 observer-2.8 odbc-2.12.1 os_mon-2.4.5 otp_mibs-1.2 
parsetools-2.1.7 public_key-1.6.1 reltool-0.7.6 runtime_tools-1.13 sasl-3.2 snmp-5.2.11 ssh-4.7 stdlib-3.5.1 syntax_tools-2.1.5 tftp-1.0 tools-3.0 wx-1.8.4 xmerl-1.3.17 : OTP-21.0.5 : compiler-7.2.3 crypto-4.3.1 erts-10.0.5 # asn1-5.0.6 common_test-1.16 debugger-4.2.5 dialyzer-3.3 diameter-2.1.5 edoc-0.9.3 eldap-1.2.4 erl_docgen-0.8 erl_interface-3.10.3 et-1.6.2 eunit-2.3.6 ftp-1.0 hipe-3.18 inets-7.0 jinterface-1.9 kernel-6.0 megaco-3.18.3 mnesia-4.15.4 observer-2.8 odbc-2.12.1 os_mon-2.4.5 otp_mibs-1.2 parsetools-2.1.7 public_key-1.6.1 reltool-0.7.6 runtime_tools-1.13 sasl-3.2 snmp-5.2.11 ssh-4.7 ssl-9.0 stdlib-3.5.1 syntax_tools-2.1.5 tftp-1.0 tools-3.0 wx-1.8.4 xmerl-1.3.17 : -- cgit v1.2.3