From 9237801d22a38d2643ffe94ab626c4d2815012dd Mon Sep 17 00:00:00 2001
From: Dan Gudmundsson <dgud@erlang.org>
Date: Thu, 10 Oct 2013 10:55:46 +0200
Subject: mnesia: Synchronize lock cleanup after mnesia down

Bad timing could lead to hanging transactions after a mnesia down from a
node with sticky locks.

Excellent bug report from janchochol

Situation:
* node A and B have copies of table T
* node A owns a sticky lock on table T
* node A goes down (e.g. crash)
* node B tries to perform transactional operation on table T
  (e.g. mnesia:select)

In this situation there is a possibility that the first (and maybe
subsequent) transaction on node B will hang indefinitely.
This is caused by a race condition: the transaction process sends a lock
request to node A and waits for a reply. Since node A is down
it will never send a reply, so the process on node B will be stuck
forever.
The reason is that the message sent to the mnesia_locker gen_server from
mnesia_locker:mnesia_down can be received after the mnesia_locker gen_server
has already replied to transaction processes with {switch, N, Req} while
node N is down.
Monitoring the remote process when sending a request to another node should
be a safe solution.
---
 lib/mnesia/src/mnesia_locker.erl | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

(limited to 'lib/mnesia/src/mnesia_locker.erl')

diff --git a/lib/mnesia/src/mnesia_locker.erl b/lib/mnesia/src/mnesia_locker.erl
index 14011003d3..c4fe370ec1 100644
--- a/lib/mnesia/src/mnesia_locker.erl
+++ b/lib/mnesia/src/mnesia_locker.erl
@@ -1,7 +1,7 @@
 %%
 %% %CopyrightBegin%
 %%
-%% Copyright Ericsson AB 1996-2012. All Rights Reserved.
+%% Copyright Ericsson AB 1996-2013. All Rights Reserved.
 %%
 %% The contents of this file are subject to the Erlang Public License,
 %% Version 1.1, (the "License"); you may not use this file except in
@@ -26,8 +26,8 @@
 	 global_lock/5,
 	 ixrlock/5,
 	 init/1,
-	 mnesia_down/2,
 	 release_tid/1,
+	 mnesia_down/2,
 	 async_release_tid/2,
 	 send_release_tid/2,
 	 receive_release_tid_acc/2,
@@ -137,6 +137,17 @@ receive_release_tid_acc([Node | Nodes], Tid) ->
 receive_release_tid_acc([], _Tid) ->
     ok.
 
+mnesia_down(Node, Pending) ->
+    case whereis(?MODULE) of
+	undefined -> {error, node_not_running};
+	Pid ->
+	    Ref = make_ref(),
+	    Pid ! {{self(), Ref}, {release_remote_non_pending, Node, Pending}},
+	    receive   %% No need to wait for anything else if process dies we die soon
+		{Ref,ok} -> ok
+	    end
+    end.
+
 loop(State) ->
     receive
 	{From, {write, Tid, Oid}} ->
@@ -213,9 +224,9 @@ loop(State) ->
 	    reply(From, {tid_released, Tid}),
 	    loop(State);
 
-	{release_remote_non_pending, Node, Pending} ->
+	{{From, Ref},{release_remote_non_pending, Node, Pending}} ->
 	    release_remote_non_pending(Node, Pending),
-	    mnesia_monitor:mnesia_down(?MODULE, Node),
+	    From ! {Ref, ok},
 	    loop(State);
 
 	{'EXIT', Pid, _} when Pid == State#state.supervisor ->
@@ -653,19 +664,6 @@ ix_read_res(Tab,IxKey,Pos) ->
 %% ********************* end server code ********************
 %% The following code executes at the client side of a transactions
 
-mnesia_down(N, Pending) ->
-    case whereis(?MODULE) of
-	undefined ->
-	    %% Takes care of mnesia_down's in early startup
-	    mnesia_monitor:mnesia_down(?MODULE, N);
-	Pid ->
-	    %% Syncronously call needed in order to avoid
-	    %% race with mnesia_tm's coordinator processes
-	    %% that may restart and acquire new locks.
-	    %% mnesia_monitor ensures the sync.
-	    Pid ! {release_remote_non_pending, N, Pending}
-    end.
-
 %% Aquire a write lock, but do a read, used by
 %% mnesia:wread/1
 
-- 
cgit v1.2.3