diff options
author | Ulf Wiger <[email protected]> | 2010-12-09 15:08:52 +0100 |
---|---|---|
committer | Henrik Nord <[email protected]> | 2011-05-16 11:00:28 +0200 |
commit | 6d446fc5d08d56174a79ab546d5aa2e79277af06 (patch) | |
tree | abfde5ca53f3fdf657df524b4bdb03791aaf9bbb /lib/mnesia | |
parent | e3af9123e7ef9291535cafbd0ecb9d3309d674f7 (diff) | |
download | otp-6d446fc5d08d56174a79ab546d5aa2e79277af06.tar.gz otp-6d446fc5d08d56174a79ab546d5aa2e79277af06.tar.bz2 otp-6d446fc5d08d56174a79ab546d5aa2e79277af06.zip |
Add {majority, boolean()} per-table option.
With {majority, true} set for a table, write transactions will
abort if they cannot commit to a majority of the nodes that
have a copy of the table. Currently, the implementation hooks
into the prepare_commit, and forces an asymmetric transaction
if the commit set affects any table with the majority flag set.
In the commit itself, the transaction will abort if it cannot
satisfy the majority requirement for all tables involved in the
thransaction.
A future optimization might be to abort already when a write
lock is attempted on such a table (/-object) and the lock cannot
be set on enough nodes.
This functionality introduces the possibility to automatically
"fence off" a table in the presence of failures.
This is a first implementation. Only basic tests have been
performed.
Diffstat (limited to 'lib/mnesia')
-rw-r--r-- | lib/mnesia/src/mnesia.hrl | 1 | ||||
-rw-r--r-- | lib/mnesia/src/mnesia_schema.erl | 15 | ||||
-rw-r--r-- | lib/mnesia/src/mnesia_tm.erl | 76 |
3 files changed, 81 insertions, 11 deletions
diff --git a/lib/mnesia/src/mnesia.hrl b/lib/mnesia/src/mnesia.hrl index d488d9364a..26537815a3 100644 --- a/lib/mnesia/src/mnesia.hrl +++ b/lib/mnesia/src/mnesia.hrl @@ -62,6 +62,7 @@ disc_only_copies = [], % [Node] load_order = 0, % Integer access_mode = read_write, % read_write | read_only + majority = false, % true | false index = [], % [Integer] snmp = [], % Snmp Ustruct local_content = false, % true | false diff --git a/lib/mnesia/src/mnesia_schema.erl b/lib/mnesia/src/mnesia_schema.erl index d1d892a387..a6c8ffec01 100644 --- a/lib/mnesia/src/mnesia_schema.erl +++ b/lib/mnesia/src/mnesia_schema.erl @@ -178,6 +178,7 @@ do_set_schema(Tab, Cs) -> set({Tab, disc_only_copies}, Cs#cstruct.disc_only_copies), set({Tab, load_order}, Cs#cstruct.load_order), set({Tab, access_mode}, Cs#cstruct.access_mode), + set({Tab, majority}, Cs#cstruct.majority), set({Tab, snmp}, Cs#cstruct.snmp), set({Tab, user_properties}, Cs#cstruct.user_properties), [set({Tab, user_property, element(1, P)}, P) || P <- Cs#cstruct.user_properties], @@ -651,6 +652,7 @@ list2cs(List) when is_list(List) -> Snmp = pick(Name, snmp, List, []), LoadOrder = pick(Name, load_order, List, 0), AccessMode = pick(Name, access_mode, List, read_write), + Majority = pick(Name, majority, List, false), UserProps = pick(Name, user_properties, List, []), verify({alt, [nil, list]}, mnesia_lib:etype(UserProps), {bad_type, Name, {user_properties, UserProps}}), @@ -676,6 +678,7 @@ list2cs(List) when is_list(List) -> snmp = Snmp, load_order = LoadOrder, access_mode = AccessMode, + majority = Majority, local_content = LC, record_name = RecName, attributes = Attrs, @@ -809,7 +812,16 @@ verify_cstruct(Cs) when is_record(Cs, cstruct) -> Access = Cs#cstruct.access_mode, verify({alt, [read_write, read_only]}, Access, {bad_type, Tab, {access_mode, Access}}), - + Majority = Cs#cstruct.majority, + verify({alt, [true, false]}, Majority, + {bad_type, Tab, {majority, Majority}}), + case Majority of + true -> + verify(false, LC, + {combine_error, Tab, [{local_content,true},{majority,true}]}); + false -> + ok + end, Snmp = Cs#cstruct.snmp, verify(true, mnesia_snmp_hook:check_ustruct(Snmp), {badarg, Tab, {snmp, Snmp}}), @@ -2971,6 +2983,7 @@ merge_versions(AnythingNew, Cs, RemoteCs, Force) -> Cs#cstruct.index == RemoteCs#cstruct.index, Cs#cstruct.snmp == RemoteCs#cstruct.snmp, Cs#cstruct.access_mode == RemoteCs#cstruct.access_mode, + Cs#cstruct.majority == RemoteCs#cstruct.majority, Cs#cstruct.load_order == RemoteCs#cstruct.load_order, Cs#cstruct.user_properties == RemoteCs#cstruct.user_properties -> do_merge_versions(AnythingNew, Cs, RemoteCs); diff --git a/lib/mnesia/src/mnesia_tm.erl b/lib/mnesia/src/mnesia_tm.erl index bb8e788b40..c7a0c28589 100644 --- a/lib/mnesia/src/mnesia_tm.erl +++ b/lib/mnesia/src/mnesia_tm.erl @@ -64,7 +64,8 @@ prev_tab = [], % initiate to a non valid table name prev_types, prev_snmp, - types + types, + majority = [] }). -record(participant, {tid, pid, commit, disc_nodes = [], @@ -1100,9 +1101,12 @@ t_commit(Type) -> case arrange(Tid, Store, Type) of {N, Prep} when N > 0 -> multi_commit(Prep#prep.protocol, + majority_attr(Prep), Tid, Prep#prep.records, Store); {0, Prep} -> - multi_commit(read_only, Tid, Prep#prep.records, Store) + multi_commit(read_only, + majority_attr(Prep), + Tid, Prep#prep.records, Store) end; true -> %% nested commit @@ -1117,6 +1121,12 @@ t_commit(Type) -> do_commit_nested end. +majority_attr(#prep{majority = M}) -> + M; +majority_attr(_) -> + []. + + %% This function arranges for all objects we shall write in S to be %% in a list of {Node, CommitRecord} %% Important function for the performance of mnesia. @@ -1222,11 +1232,13 @@ prepare_items(Tid, Tab, Key, Items, Prep) -> {blocked, _} -> unblocked = req({unblock_me, Tab}), prepare_items(Tid, Tab, Key, Items, Prep); - _ -> + _ -> + Majority = needs_majority(Tab, Prep), Snmp = val({Tab, snmp}), Recs2 = do_prepare_items(Tid, Tab, Key, Types, Snmp, Items, Prep#prep.records), Prep2 = Prep#prep{records = Recs2, prev_tab = Tab, + majority = Majority, prev_types = Types, prev_snmp = Snmp}, check_prep(Prep2, Types) end. @@ -1235,6 +1247,41 @@ do_prepare_items(Tid, Tab, Key, Types, Snmp, Items, Recs) -> Recs2 = prepare_snmp(Tid, Tab, Key, Types, Snmp, Items, Recs), % May exit prepare_nodes(Tid, Types, Items, Recs2, normal). + +needs_majority(Tab, #prep{majority = M}) -> + case lists:keymember(Tab, 1, M) of + true -> + M; + false -> + case ?catch_val({Tab, majority}) of + {'EXIT', _} -> + M; + false -> + [{Tab, []} | M]; + true -> + CopyHolders = all_copy_holders(Tab), + [{Tab, CopyHolders} | M] + end + end. + +all_copy_holders(Tab) -> + DC = val({Tab, disc_copies}), + DO = val({Tab, disc_only_copies}), + RC = val({Tab, ram_copies}), + DC ++ DO ++ RC. + +have_majority([], _) -> + ok; +have_majority([{Tab, AllNodes} | Rest], Nodes) -> + Missing = AllNodes -- Nodes, + Present = AllNodes -- Missing, + case length(Present) > length(Missing) of + true -> + have_majority(Rest, Nodes); + false -> + {error, Tab} + end. + prepare_snmp(Tab, Key, Items) -> case val({Tab, snmp}) of [] -> @@ -1261,10 +1308,15 @@ prepare_snmp(Tid, Tab, Key, Types, Us, Items, Recs) -> prepare_nodes(Tid, Types, [{clear_table, Tab}], Recs, snmp) end. -check_prep(Prep, Types) when Prep#prep.types == Types -> +check_prep(#prep{majority = [], types = Types} = Prep, Types) -> Prep; -check_prep(Prep, Types) when Prep#prep.types == undefined -> - Prep#prep{types = Types}; +check_prep(#prep{majority = M, types = undefined} = Prep, Types) -> + Protocol = if M == [] -> + Prep#prep.protocol; + true -> + asym_trans + end, + Prep#prep{protocol = Protocol, types = Types}; check_prep(Prep, _Types) -> Prep#prep{protocol = asym_trans}. @@ -1311,7 +1363,7 @@ prepare_node(_Node, _Storage, [], Rec, _Kind) -> %% multi_commit((Protocol, Tid, CommitRecords, Store) %% Local work is always performed in users process -multi_commit(read_only, Tid, CR, _Store) -> +multi_commit(read_only, _Maj = [], Tid, CR, _Store) -> %% This featherweight commit protocol is used when no %% updates has been performed in the transaction. @@ -1324,7 +1376,7 @@ multi_commit(read_only, Tid, CR, _Store) -> ?MODULE ! {delete_transaction, Tid}, do_commit; -multi_commit(sym_trans, Tid, CR, Store) -> +multi_commit(sym_trans, _Maj = [], Tid, CR, Store) -> %% This lightweight commit protocol is used when all %% the involved tables are replicated symetrically. %% Their storage types must match on each node. @@ -1376,7 +1428,7 @@ multi_commit(sym_trans, Tid, CR, Store) -> [{tid, Tid}, {outcome, Outcome}]), Outcome; -multi_commit(sync_sym_trans, Tid, CR, Store) -> +multi_commit(sync_sym_trans, _Maj = [], Tid, CR, Store) -> %% This protocol is the same as sym_trans except that it %% uses syncronized calls to disk_log and syncronized commits %% when several nodes are involved. @@ -1408,7 +1460,7 @@ multi_commit(sync_sym_trans, Tid, CR, Store) -> [{tid, Tid}, {outcome, Outcome}]), Outcome; -multi_commit(asym_trans, Tid, CR, Store) -> +multi_commit(asym_trans, Majority, Tid, CR, Store) -> %% This more expensive commit protocol is used when %% table definitions are changed (schema transactions). %% It is also used when the involved tables are @@ -1469,6 +1521,10 @@ multi_commit(asym_trans, Tid, CR, Store) -> {D2, CR2} = commit_decision(D, CR, [], []), DiscNs = D2#decision.disc_nodes, RamNs = D2#decision.ram_nodes, + case have_majority(Majority, DiscNs ++ RamNs) of + ok -> ok; + {error, Tab} -> mnesia:abort({no_majority, Tab}) + end, Pending = mnesia_checkpoint:tm_enter_pending(Tid, DiscNs, RamNs), ?ets_insert(Store, Pending), {WaitFor, Local} = ask_commit(asym_trans, Tid, CR2, DiscNs, RamNs), |