From 2bf7b5b5abbc1097a0b30bea2d390a2a0bec004c Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Fri, 28 Jun 2019 13:13:13 +0200 Subject: mnesia: Fix deadlock caused by add_table_copy If add_table_copy was called when a node was starting it deadlock waiting for mnesia_controller, when schema was not merged. Abort if that is the case. --- lib/mnesia/src/mnesia_controller.erl | 81 +++++++++++++++++-------------- lib/mnesia/src/mnesia_schema.erl | 2 +- lib/mnesia/test/mnesia_isolation_test.erl | 44 ++++++++++++++--- 3 files changed, 82 insertions(+), 45 deletions(-) diff --git a/lib/mnesia/src/mnesia_controller.erl b/lib/mnesia/src/mnesia_controller.erl index 882de0d613..0f221b0c1f 100644 --- a/lib/mnesia/src/mnesia_controller.erl +++ b/lib/mnesia/src/mnesia_controller.erl @@ -331,35 +331,39 @@ release_schema_commit_lock() -> %% Special for preparation of add table copy get_network_copy(Tid, Tab, Cs) -> -% We can't let the controller queue this one -% because that may cause a deadlock between schema_operations -% and initial tableloadings which both takes schema locks. -% But we have to get copier_done msgs when the other side -% goes down. - call({add_other, self()}), - Reason = {dumper,{add_table_copy, Tid}}, - Work = #net_load{table = Tab,reason = Reason,cstruct = Cs}, - %% I'll need this cause it's linked trough the subscriber - %% might be solved by using monitor in subscr instead. - process_flag(trap_exit, true), - Load = load_table_fun(Work), - Res = ?CATCH(Load()), - process_flag(trap_exit, false), - call({del_other, self()}), - case Res of - #loader_done{is_loaded = true} -> - Tab = Res#loader_done.table_name, - case Res#loader_done.needs_announce of - true -> - i_have_tab(Tab); - false -> - ignore - end, - Res#loader_done.reply; - #loader_done{} -> - Res#loader_done.reply; - Else -> - {not_loaded, Else} + %% We can't let the controller queue this one + %% because that may cause a deadlock between schema_operations + %% and initial tableloadings which both takes schema locks. + %% But we have to get copier_done msgs when the other side + %% goes down. + case call({add_other, self()}) of + ok -> + Reason = {dumper,{add_table_copy, Tid}}, + Work = #net_load{table = Tab,reason = Reason,cstruct = Cs}, + %% I'll need this cause it's linked trough the subscriber + %% might be solved by using monitor in subscr instead. + process_flag(trap_exit, true), + Load = load_table_fun(Work), + Res = ?CATCH(Load()), + process_flag(trap_exit, false), + call({del_other, self()}), + case Res of + #loader_done{is_loaded = true} -> + Tab = Res#loader_done.table_name, + case Res#loader_done.needs_announce of + true -> + i_have_tab(Tab); + false -> + ignore + end, + Res#loader_done.reply; + #loader_done{} -> + Res#loader_done.reply; + Else -> + {not_loaded, Else} + end; + {error, Else} -> + {not_loaded, Else} end. %% This functions is invoked from the dumper @@ -772,6 +776,18 @@ handle_call({unannounce_add_table_copy, [Tab, Node], From}, ReplyTo, State) -> noreply(State#state{early_msgs = [{call, Msg, undefined} | Msgs]}) end; +handle_call({add_other, Who}, _From, State = #state{others=Others0, schema_is_merged=SM}) -> + case SM of + true -> + Others = [Who|Others0], + {reply, ok, State#state{others=Others}}; + false -> + {reply, {error, {not_active,schema,node()}}, State} + end; +handle_call({del_other, Who}, _From, State = #state{others=Others0}) -> + Others = lists:delete(Who, Others0), + {reply, ok, State#state{others=Others}}; + handle_call(Msg, From, State) when State#state.schema_is_merged /= true -> %% Buffer early messages Msgs = State#state.early_msgs, @@ -803,13 +819,6 @@ handle_call({block_table, [Tab], From}, _Dummy, State) -> handle_call({check_w2r, _Node, Tab}, _From, State) -> {reply, val({Tab, where_to_read}), State}; -handle_call({add_other, Who}, _From, State = #state{others=Others0}) -> - Others = [Who|Others0], - {reply, ok, State#state{others=Others}}; -handle_call({del_other, Who}, _From, State = #state{others=Others0}) -> - Others = lists:delete(Who, Others0), - {reply, ok, State#state{others=Others}}; - handle_call(Msg, _From, State) -> error("~p got unexpected call: ~tp~n", [?SERVER_NAME, Msg]), noreply(State). diff --git a/lib/mnesia/src/mnesia_schema.erl b/lib/mnesia/src/mnesia_schema.erl index ef38adca1e..4a86aeb375 100644 --- a/lib/mnesia/src/mnesia_schema.erl +++ b/lib/mnesia/src/mnesia_schema.erl @@ -697,7 +697,7 @@ schema_coordinator(Client, _Fun, undefined) -> schema_coordinator(Client, Fun, Controller) when is_pid(Controller) -> %% Do not trap exit in order to automatically die %% when the controller dies - + put(transaction_client, Client), %% debug link(Controller), unlink(Client), diff --git a/lib/mnesia/test/mnesia_isolation_test.erl b/lib/mnesia/test/mnesia_isolation_test.erl index 49bcec14af..98a7840b72 100644 --- a/lib/mnesia/test/mnesia_isolation_test.erl +++ b/lib/mnesia/test/mnesia_isolation_test.erl @@ -1036,29 +1036,57 @@ add_table_copy(Config) when is_list(Config) -> Def = [{ram_copies, [ThisNode]}, {attributes, [key, attr1, attr2]}], ?match({atomic, ok}, mnesia:create_table(Tab, Def)), insert(Tab, 50), - {success, [A]} = ?start_activities([ThisNode]), + {success, [A]} = ?start_activities([ThisNode]), mnesia_test_lib:start_sync_transactions([A], 0), A ! fun() -> mnesia:write({Tab, 1, 1, updated}) end, ?match_receive({A, ok}), %% A is executed - Pid = spawn_link(?MODULE, op, [self(), mnesia, add_table_copy, + Pid = spawn_link(?MODULE, op, [self(), mnesia, add_table_copy, [Tab, Node2, ram_copies]]), - + ?match_receive(timeout), %% op waits for locks occupied by A A ! end_trans, %% Kill A, locks should be released - ?match_receive({A,{atomic,end_trans}}), - - receive + ?match_receive({A,{atomic,end_trans}}), + + receive Msg -> ?match({Pid, {atomic, ok}}, Msg) after timer:seconds(20) -> ?error("Operation timed out", []) end, + ?match_receive({'EXIT', Pid, normal}), sys:get_status(whereis(mnesia_locker)), % Explicit sync, release locks is async - ?match([], mnesia:system_info(held_locks)), - ?match([], mnesia:system_info(lock_queue)), + ?match([], mnesia:system_info(held_locks)), + ?match([], mnesia:system_info(lock_queue)), + + {atomic, ok} = mnesia:del_table_copy(Tab, Node2), + Self = self(), + New = spawn_link(Node2, + fun () -> + application:stop(mnesia), + Self ! {self(), ok}, + io:format(user, "restart mnesia~n", []), + Self ! {self(), catch application:start(mnesia)} + end), + receive {New,ok} -> ok end, + + Add = fun Add() -> + case mnesia:add_table_copy(Tab, Node2, disc_copies) of + {atomic, ok} -> ok; + _R -> io:format(user, "aborted with reason ~p~n", [_R]), + timer:sleep(10), + Add() + end + end, + + ?match(ok, Add()), + ?match_receive({New,ok}), + + sys:get_status(whereis(mnesia_locker)), % Explicit sync, release locks is async + ?match([], mnesia:system_info(held_locks)), + ?match([], mnesia:system_info(lock_queue)), ok. del_table_copy(suite) -> []; -- cgit v1.2.3