aboutsummaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorUlf Wiger <[email protected]>2010-12-09 15:08:52 +0100
committerHenrik Nord <[email protected]>2011-05-16 11:00:28 +0200
commit6d446fc5d08d56174a79ab546d5aa2e79277af06 (patch)
treeabfde5ca53f3fdf657df524b4bdb03791aaf9bbb /lib
parente3af9123e7ef9291535cafbd0ecb9d3309d674f7 (diff)
downloadotp-6d446fc5d08d56174a79ab546d5aa2e79277af06.tar.gz
otp-6d446fc5d08d56174a79ab546d5aa2e79277af06.tar.bz2
otp-6d446fc5d08d56174a79ab546d5aa2e79277af06.zip
Add {majority, boolean()} per-table option.
With {majority, true} set for a table, write transactions will abort if they cannot commit to a majority of the nodes that have a copy of the table. Currently, the implementation hooks into the prepare_commit, and forces an asymmetric transaction if the commit set affects any table with the majority flag set. In the commit itself, the transaction will abort if it cannot satisfy the majority requirement for all tables involved in the thransaction. A future optimization might be to abort already when a write lock is attempted on such a table (/-object) and the lock cannot be set on enough nodes. This functionality introduces the possibility to automatically "fence off" a table in the presence of failures. This is a first implementation. Only basic tests have been performed.
Diffstat (limited to 'lib')
-rw-r--r--lib/mnesia/src/mnesia.hrl1
-rw-r--r--lib/mnesia/src/mnesia_schema.erl15
-rw-r--r--lib/mnesia/src/mnesia_tm.erl76
3 files changed, 81 insertions, 11 deletions
diff --git a/lib/mnesia/src/mnesia.hrl b/lib/mnesia/src/mnesia.hrl
index d488d9364a..26537815a3 100644
--- a/lib/mnesia/src/mnesia.hrl
+++ b/lib/mnesia/src/mnesia.hrl
@@ -62,6 +62,7 @@
disc_only_copies = [], % [Node]
load_order = 0, % Integer
access_mode = read_write, % read_write | read_only
+ majority = false, % true | false
index = [], % [Integer]
snmp = [], % Snmp Ustruct
local_content = false, % true | false
diff --git a/lib/mnesia/src/mnesia_schema.erl b/lib/mnesia/src/mnesia_schema.erl
index d1d892a387..a6c8ffec01 100644
--- a/lib/mnesia/src/mnesia_schema.erl
+++ b/lib/mnesia/src/mnesia_schema.erl
@@ -178,6 +178,7 @@ do_set_schema(Tab, Cs) ->
set({Tab, disc_only_copies}, Cs#cstruct.disc_only_copies),
set({Tab, load_order}, Cs#cstruct.load_order),
set({Tab, access_mode}, Cs#cstruct.access_mode),
+ set({Tab, majority}, Cs#cstruct.majority),
set({Tab, snmp}, Cs#cstruct.snmp),
set({Tab, user_properties}, Cs#cstruct.user_properties),
[set({Tab, user_property, element(1, P)}, P) || P <- Cs#cstruct.user_properties],
@@ -651,6 +652,7 @@ list2cs(List) when is_list(List) ->
Snmp = pick(Name, snmp, List, []),
LoadOrder = pick(Name, load_order, List, 0),
AccessMode = pick(Name, access_mode, List, read_write),
+ Majority = pick(Name, majority, List, false),
UserProps = pick(Name, user_properties, List, []),
verify({alt, [nil, list]}, mnesia_lib:etype(UserProps),
{bad_type, Name, {user_properties, UserProps}}),
@@ -676,6 +678,7 @@ list2cs(List) when is_list(List) ->
snmp = Snmp,
load_order = LoadOrder,
access_mode = AccessMode,
+ majority = Majority,
local_content = LC,
record_name = RecName,
attributes = Attrs,
@@ -809,7 +812,16 @@ verify_cstruct(Cs) when is_record(Cs, cstruct) ->
Access = Cs#cstruct.access_mode,
verify({alt, [read_write, read_only]}, Access,
{bad_type, Tab, {access_mode, Access}}),
-
+ Majority = Cs#cstruct.majority,
+ verify({alt, [true, false]}, Majority,
+ {bad_type, Tab, {majority, Majority}}),
+ case Majority of
+ true ->
+ verify(false, LC,
+ {combine_error, Tab, [{local_content,true},{majority,true}]});
+ false ->
+ ok
+ end,
Snmp = Cs#cstruct.snmp,
verify(true, mnesia_snmp_hook:check_ustruct(Snmp),
{badarg, Tab, {snmp, Snmp}}),
@@ -2971,6 +2983,7 @@ merge_versions(AnythingNew, Cs, RemoteCs, Force) ->
Cs#cstruct.index == RemoteCs#cstruct.index,
Cs#cstruct.snmp == RemoteCs#cstruct.snmp,
Cs#cstruct.access_mode == RemoteCs#cstruct.access_mode,
+ Cs#cstruct.majority == RemoteCs#cstruct.majority,
Cs#cstruct.load_order == RemoteCs#cstruct.load_order,
Cs#cstruct.user_properties == RemoteCs#cstruct.user_properties ->
do_merge_versions(AnythingNew, Cs, RemoteCs);
diff --git a/lib/mnesia/src/mnesia_tm.erl b/lib/mnesia/src/mnesia_tm.erl
index bb8e788b40..c7a0c28589 100644
--- a/lib/mnesia/src/mnesia_tm.erl
+++ b/lib/mnesia/src/mnesia_tm.erl
@@ -64,7 +64,8 @@
prev_tab = [], % initiate to a non valid table name
prev_types,
prev_snmp,
- types
+ types,
+ majority = []
}).
-record(participant, {tid, pid, commit, disc_nodes = [],
@@ -1100,9 +1101,12 @@ t_commit(Type) ->
case arrange(Tid, Store, Type) of
{N, Prep} when N > 0 ->
multi_commit(Prep#prep.protocol,
+ majority_attr(Prep),
Tid, Prep#prep.records, Store);
{0, Prep} ->
- multi_commit(read_only, Tid, Prep#prep.records, Store)
+ multi_commit(read_only,
+ majority_attr(Prep),
+ Tid, Prep#prep.records, Store)
end;
true ->
%% nested commit
@@ -1117,6 +1121,12 @@ t_commit(Type) ->
do_commit_nested
end.
+majority_attr(#prep{majority = M}) ->
+ M;
+majority_attr(_) ->
+ [].
+
+
%% This function arranges for all objects we shall write in S to be
%% in a list of {Node, CommitRecord}
%% Important function for the performance of mnesia.
@@ -1222,11 +1232,13 @@ prepare_items(Tid, Tab, Key, Items, Prep) ->
{blocked, _} ->
unblocked = req({unblock_me, Tab}),
prepare_items(Tid, Tab, Key, Items, Prep);
- _ ->
+ _ ->
+ Majority = needs_majority(Tab, Prep),
Snmp = val({Tab, snmp}),
Recs2 = do_prepare_items(Tid, Tab, Key, Types,
Snmp, Items, Prep#prep.records),
Prep2 = Prep#prep{records = Recs2, prev_tab = Tab,
+ majority = Majority,
prev_types = Types, prev_snmp = Snmp},
check_prep(Prep2, Types)
end.
@@ -1235,6 +1247,41 @@ do_prepare_items(Tid, Tab, Key, Types, Snmp, Items, Recs) ->
Recs2 = prepare_snmp(Tid, Tab, Key, Types, Snmp, Items, Recs), % May exit
prepare_nodes(Tid, Types, Items, Recs2, normal).
+
+needs_majority(Tab, #prep{majority = M}) ->
+ case lists:keymember(Tab, 1, M) of
+ true ->
+ M;
+ false ->
+ case ?catch_val({Tab, majority}) of
+ {'EXIT', _} ->
+ M;
+ false ->
+ [{Tab, []} | M];
+ true ->
+ CopyHolders = all_copy_holders(Tab),
+ [{Tab, CopyHolders} | M]
+ end
+ end.
+
+all_copy_holders(Tab) ->
+ DC = val({Tab, disc_copies}),
+ DO = val({Tab, disc_only_copies}),
+ RC = val({Tab, ram_copies}),
+ DC ++ DO ++ RC.
+
+have_majority([], _) ->
+ ok;
+have_majority([{Tab, AllNodes} | Rest], Nodes) ->
+ Missing = AllNodes -- Nodes,
+ Present = AllNodes -- Missing,
+ case length(Present) > length(Missing) of
+ true ->
+ have_majority(Rest, Nodes);
+ false ->
+ {error, Tab}
+ end.
+
prepare_snmp(Tab, Key, Items) ->
case val({Tab, snmp}) of
[] ->
@@ -1261,10 +1308,15 @@ prepare_snmp(Tid, Tab, Key, Types, Us, Items, Recs) ->
prepare_nodes(Tid, Types, [{clear_table, Tab}], Recs, snmp)
end.
-check_prep(Prep, Types) when Prep#prep.types == Types ->
+check_prep(#prep{majority = [], types = Types} = Prep, Types) ->
Prep;
-check_prep(Prep, Types) when Prep#prep.types == undefined ->
- Prep#prep{types = Types};
+check_prep(#prep{majority = M, types = undefined} = Prep, Types) ->
+ Protocol = if M == [] ->
+ Prep#prep.protocol;
+ true ->
+ asym_trans
+ end,
+ Prep#prep{protocol = Protocol, types = Types};
check_prep(Prep, _Types) ->
Prep#prep{protocol = asym_trans}.
@@ -1311,7 +1363,7 @@ prepare_node(_Node, _Storage, [], Rec, _Kind) ->
%% multi_commit((Protocol, Tid, CommitRecords, Store)
%% Local work is always performed in users process
-multi_commit(read_only, Tid, CR, _Store) ->
+multi_commit(read_only, _Maj = [], Tid, CR, _Store) ->
%% This featherweight commit protocol is used when no
%% updates has been performed in the transaction.
@@ -1324,7 +1376,7 @@ multi_commit(read_only, Tid, CR, _Store) ->
?MODULE ! {delete_transaction, Tid},
do_commit;
-multi_commit(sym_trans, Tid, CR, Store) ->
+multi_commit(sym_trans, _Maj = [], Tid, CR, Store) ->
%% This lightweight commit protocol is used when all
%% the involved tables are replicated symetrically.
%% Their storage types must match on each node.
@@ -1376,7 +1428,7 @@ multi_commit(sym_trans, Tid, CR, Store) ->
[{tid, Tid}, {outcome, Outcome}]),
Outcome;
-multi_commit(sync_sym_trans, Tid, CR, Store) ->
+multi_commit(sync_sym_trans, _Maj = [], Tid, CR, Store) ->
%% This protocol is the same as sym_trans except that it
%% uses syncronized calls to disk_log and syncronized commits
%% when several nodes are involved.
@@ -1408,7 +1460,7 @@ multi_commit(sync_sym_trans, Tid, CR, Store) ->
[{tid, Tid}, {outcome, Outcome}]),
Outcome;
-multi_commit(asym_trans, Tid, CR, Store) ->
+multi_commit(asym_trans, Majority, Tid, CR, Store) ->
%% This more expensive commit protocol is used when
%% table definitions are changed (schema transactions).
%% It is also used when the involved tables are
@@ -1469,6 +1521,10 @@ multi_commit(asym_trans, Tid, CR, Store) ->
{D2, CR2} = commit_decision(D, CR, [], []),
DiscNs = D2#decision.disc_nodes,
RamNs = D2#decision.ram_nodes,
+ case have_majority(Majority, DiscNs ++ RamNs) of
+ ok -> ok;
+ {error, Tab} -> mnesia:abort({no_majority, Tab})
+ end,
Pending = mnesia_checkpoint:tm_enter_pending(Tid, DiscNs, RamNs),
?ets_insert(Store, Pending),
{WaitFor, Local} = ask_commit(asym_trans, Tid, CR2, DiscNs, RamNs),